In [1]:
# Load libraries required to do Neural Nets
import numpy as np
import pandas as pd
import datetime
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn import metrics

In [2]:
# Import Training CSV
df = pd.read_csv(r"../Dataset/mustang_release_v1.0beta.csv")

# Data Cleaning Step
print("rows prior to cleaning: " + str(df.size))

# Drop first 1000 rows
df = df.iloc[1000: , :]

# Convert wallclock_limit to be in seconds
df['wallclock_limit'] = pd.to_timedelta(df['wallclock_limit']).dt.total_seconds()

# Add Runtime Attribute
df['start_time'] = pd.to_datetime(df['start_time'])
df['end_time'] = pd.to_datetime(df['end_time'])
df['runtime'] = (df['end_time'] - df['start_time']).dt.total_seconds()

# We only care about jobs that are completed
df = df[df.job_status == 'COMPLETED']

# Filter to only contain rows that have non-zero runtime
df = df.dropna(subset=['start_time'])
df = df.dropna(subset=['end_time'])
df = df[df.runtime != 0]

print('rows after to cleaning: ' + str(df.size))
print(df.head())

rows prior to cleaning: 19018575
rows after to cleaning: 18147350
      user_ID  group_ID                submit_time                 start_time  \
1000      354       357  2011-10-28 13:13:27-06:00  2011-10-28 13:13:45-06:00   
1001      354       357  2011-10-28 13:13:34-06:00  2011-10-28 13:13:45-06:00   
1002      354       357  2011-10-28 13:13:40-06:00  2011-10-28 13:13:45-06:00   
1003      354       357  2011-10-28 13:13:47-06:00  2011-10-28 13:14:16-06:00   
1004      427       435  2011-10-28 13:20:06-06:00  2011-10-28 13:20:28-06:00   

                       end_time  wallclock_limit job_status  node_count  \
1000  2011-10-28 13:19:36-06:00           7200.0  COMPLETED           8   
1001  2011-10-28 13:19:43-06:00           7200.0  COMPLETED           8   
1002  2011-10-28 13:19:45-06:00           7200.0  COMPLETED           8   
1003  2011-10-28 13:20:31-06:00           7200.0  COMPLETED           8   
1004  2011-10-28 13:20:31-06:00           7200.0  COMPLETED           1 

In [3]:
# Split data to Attributes(X) and Labels(y).
X = df[['user_ID', 'group_ID', 'wallclock_limit', 'node_count', 'tasks_requested']]
y = df[['runtime']]

In [4]:
# Split data into training and testing sets:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# define the keras model
# Tune hidden_layer_size + max_iter. Monitor training progress.
mlp = MLPRegressor(hidden_layer_sizes=(128,64,32), activation='relu', solver='adam', max_iter=500)
mlp.fit(X_train, y_train.values.ravel())

y_pred = mlp.predict(X_test)

In [5]:
# metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 1633.7130188481042
Mean Squared Error: 31906266.85329935
Root Mean Squared Error: 5648.563255669476


In [6]:
df_result = pd.DataFrame({'actual': y_test["runtime"], 'Predicted': y_pred})
print(df_result.tail())

         actual   Predicted
2011656     6.0  928.871723
682778     85.0  282.542249
937396    195.0  297.935724
2057438    45.0  203.295402
664626     96.0  235.517403
