In [5]:
# Load libraries required to do Random Forest
import numpy as np
import pandas as pd
import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

In [16]:
# Mustang

In [17]:
# Import Training CSV
df = pd.read_csv("../Dataset/mustang_release_v1.0beta.csv")

# Data Cleaning Step
print("rows prior to cleaning: " + str(df.size))

# Drop first 1000 rows
df = df.iloc[1000: , :]

# Convert wallclock_limit to be in seconds
df['wallclock_limit'] = pd.to_timedelta(df['wallclock_limit']).dt.total_seconds()

# Add Runtime Attribute
df['start_time'] = pd.to_datetime(df['start_time'])
df['end_time'] = pd.to_datetime(df['end_time'])
df['runtime'] = (df['end_time'] - df['start_time']).dt.total_seconds()

# We only care about jobs that are completed
df = df[df.job_status == 'COMPLETED']

# Filter to only contain rows that have non-zero runtime
df = df.dropna(subset=['start_time'])
df = df.dropna(subset=['end_time'])
df = df[df.runtime != 0]

print('rows after to cleaning: ' + str(df.size))
print(df.head())

rows prior to cleaning: 19018575
rows after to cleaning: 18147350
      user_ID  group_ID                submit_time                 start_time  \
1000      354       357  2011-10-28 13:13:27-06:00  2011-10-28 13:13:45-06:00   
1001      354       357  2011-10-28 13:13:34-06:00  2011-10-28 13:13:45-06:00   
1002      354       357  2011-10-28 13:13:40-06:00  2011-10-28 13:13:45-06:00   
1003      354       357  2011-10-28 13:13:47-06:00  2011-10-28 13:14:16-06:00   
1004      427       435  2011-10-28 13:20:06-06:00  2011-10-28 13:20:28-06:00   

                       end_time  wallclock_limit job_status  node_count  \
1000  2011-10-28 13:19:36-06:00           7200.0  COMPLETED           8   
1001  2011-10-28 13:19:43-06:00           7200.0  COMPLETED           8   
1002  2011-10-28 13:19:45-06:00           7200.0  COMPLETED           8   
1003  2011-10-28 13:20:31-06:00           7200.0  COMPLETED           8   
1004  2011-10-28 13:20:31-06:00           7200.0  COMPLETED           1 

In [18]:
# Split data to Attributes(X) and Labels(y).
X = df[['user_ID', 'group_ID', 'wallclock_limit', 'node_count', 'tasks_requested']]
y = df[['runtime']]

In [19]:
# Split data into training and testing sets:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train random forest regressor
regressor = RandomForestRegressor(n_estimators = 30, random_state = 0)
regressor.fit(X_train, y_train.values.ravel())

# Prediction
y_pred = regressor.predict(X_test)

In [20]:
# metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 840.9898444623071
Mean Squared Error: 12621030.409635426
Root Mean Squared Error: 3552.6089581651718


In [21]:
df_result = pd.DataFrame({'actual': y_test["runtime"], 'Predicted': y_pred})
print(df_result.tail())

         actual   Predicted
2011656     6.0  541.333169
682778     85.0  175.907324
937396    195.0  183.083173
2057438    45.0  207.867417
664626     96.0  116.904426


In [23]:
df_output = X_test
df_output["runtime"] = y_test["runtime"]
df_output["predicted_time"] = y_pred
df_output = df_output.join(df[["submit_time"]])
df_output = df_output.sort_values(by=['submit_time'])

# we do not have a job id, lets make the index be the job id.
df_output['job_id'] = range(0, 0+len(df_output))
df_output = df_output.rename(columns={'user_ID':'user_id','runtime':'wallclock_runtime_sec','wallclock_limit':'wallclock_limit_sec','node_count':'num_cores','submit_time':'time'})

df_output.head()

Unnamed: 0,user_id,group_ID,wallclock_limit_sec,num_cores,tasks_requested,wallclock_runtime_sec,predicted_time,time,job_id
1000,354,357,7200.0,8,192,351.0,1276.475419,2011-10-28 13:13:27-06:00,0
1002,354,357,7200.0,8,192,360.0,1276.475419,2011-10-28 13:13:40-06:00,1
1036,354,357,7200.0,8,192,583.0,1276.475419,2011-10-28 13:17:11-06:00,2
1043,354,357,7200.0,8,192,642.0,1276.475419,2011-10-28 13:17:57-06:00,3
1045,354,357,7200.0,8,192,663.0,1276.475419,2011-10-28 13:18:10-06:00,4


In [24]:
df_output.to_csv("../Dataset/MustangPredictionsRandomForestTest.csv", index=False)

In [6]:
# Trinity

In [8]:
# Import Training CSV
df = pd.read_csv("../Dataset/trinity_formatted_release_v1.0beta.csv")

# Data Cleaning Step
print("rows prior to cleaning: " + str(df.size))

# Drop first 1000 rows
df = df.iloc[1000: , :]

# Convert wallclock_limit to be in seconds
df['wallclock_limit'] = pd.to_timedelta(df['wallclock_limit']).dt.total_seconds()

# Add Runtime Attribute
df['start_time'] = pd.to_datetime(df['start_time'])
df['end_time'] = pd.to_datetime(df['end_time'])
df['runtime'] = (df['end_time'] - df['start_time']).dt.total_seconds()

# We only care about jobs that are completed
df = df[df.job_status == 'JOBEND']

# Filter to only contain rows that have non-zero runtime
df = df.dropna(subset=['start_time'])
df = df.dropna(subset=['end_time'])
df = df[df.runtime != 0]

print('rows after to cleaning: ' + str(df.size))
print(df.head())

rows prior to cleaning: 277607
rows after to cleaning: 192408
      user_ID  group_ID                submit_time                 start_time  \
1000        1         1  2016-02-05 10:15:47-07:00  2016-02-05 10:15:48-07:00   
1001        1         1  2016-02-05 10:15:48-07:00  2016-02-05 10:15:49-07:00   
1002        1         1  2016-02-05 10:15:49-07:00  2016-02-05 10:15:50-07:00   
1003        1         1  2016-02-05 10:15:50-07:00  2016-02-05 10:15:51-07:00   
1004        1         1  2016-02-05 10:15:51-07:00  2016-02-05 10:15:52-07:00   

                  dispatch_time                 queue_time  \
1000  2016-02-05 10:15:48-07:00  2016-02-05 10:15:47-07:00   
1001  2016-02-05 10:15:49-07:00  2016-02-05 10:15:48-07:00   
1002  2016-02-05 10:15:50-07:00  2016-02-05 10:15:49-07:00   
1003  2016-02-05 10:15:51-07:00  2016-02-05 10:15:50-07:00   
1004  2016-02-05 10:15:52-07:00  2016-02-05 10:15:51-07:00   

                       end_time  wallclock_limit job_status  node_count  \
100

In [9]:
# Split data to Attributes(X) and Labels(y).
X = df[['user_ID', 'group_ID', 'wallclock_limit', 'node_count', 'tasks_requested']]
y = df[['runtime']]

In [10]:
# Split data into training and testing sets:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train random forest regressor
regressor = RandomForestRegressor(n_estimators = 30, random_state = 0)
regressor.fit(X_train, y_train.values.ravel())

# Prediction
y_pred = regressor.predict(X_test)

In [11]:
# metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 1012.5562426276107
Mean Squared Error: 12988715.22074734
Root Mean Squared Error: 3603.986018389547


In [12]:
df_result = pd.DataFrame({'actual': y_test["runtime"], 'Predicted': y_pred})
print(df_result.tail())

        actual     Predicted
3961   13879.0  14243.296395
6077     104.0     90.351883
9830    8137.0   4120.634260
15981    427.0    547.112447
5494      25.0     65.846008


In [13]:
df_output = X_test
df_output["runtime"] = y_test["runtime"]
df_output["predicted_time"] = y_pred
df_output = df_output.join(df[["submit_time"]])
df_output = df_output.sort_values(by=['submit_time'])

# we do not have a job id, lets make the index be the job id.
df_output['job_id'] = range(0, 0+len(df_output))
df_output = df_output.rename(columns={'user_ID':'user_id','runtime':'wallclock_runtime_sec','wallclock_limit':'wallclock_limit_sec','node_count':'num_cores','submit_time':'time'})

df_output.head()

Unnamed: 0,user_id,group_ID,wallclock_limit_sec,num_cores,tasks_requested,wallclock_runtime_sec,predicted_time,time,job_id
1007,1,1,20700.0,1,1,14288.0,14243.296395,2016-02-05 10:16:14-07:00,0
1009,1,1,20700.0,1,1,14217.0,14243.296395,2016-02-05 10:16:15-07:00,1
1012,1,1,20700.0,1,1,14249.0,14243.296395,2016-02-05 10:16:18-07:00,2
1016,1,1,20700.0,1,1,14227.0,14243.296395,2016-02-05 10:16:41-07:00,3
1018,1,1,20700.0,1,1,14225.0,14243.296395,2016-02-05 10:16:42-07:00,4


In [14]:
df_output.to_csv("../Dataset/TrinityPredictionsRandomForestTest.csv", index=False)

In [44]:
import time
from datetime import datetime, timedelta
df_new = df_output
df_new['time'] = pd.to_datetime(df_output['time'],utc=True)
df_new['offset'] = df_new.apply(lambda row: row['time'] if (row["wallclock_runtime_sec"] >= 14288) else row['time'] + pd.to_timedelta(30, unit='m'), axis=1)

In [45]:
df_new.head()

Unnamed: 0,user_id,group_ID,wallclock_limit_sec,num_cores,tasks_requested,wallclock_runtime_sec,predicted_time,time,job_id,offset
1007,1,1,20700.0,1,1,14288.0,14243.296395,2016-02-05 17:16:14+00:00,0,2016-02-05 17:16:14+00:00
1009,1,1,20700.0,1,1,14217.0,14243.296395,2016-02-05 17:16:15+00:00,1,2016-02-05 17:46:15+00:00
1012,1,1,20700.0,1,1,14249.0,14243.296395,2016-02-05 17:16:18+00:00,2,2016-02-05 17:46:18+00:00
1016,1,1,20700.0,1,1,14227.0,14243.296395,2016-02-05 17:16:41+00:00,3,2016-02-05 17:46:41+00:00
1018,1,1,20700.0,1,1,14225.0,14243.296395,2016-02-05 17:16:42+00:00,4,2016-02-05 17:46:42+00:00
