In [76]:
# Load libraries required to do SVR
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

In [4]:
# Import Training CSV
df = pd.read_csv(r"../Dataset/mustang_release_v1.0beta.csv")

# Data Cleaning Step
print("rows prior to cleaning: " + str(df.size))

# Drop first 1000 rows
df = df.iloc[1000: , :]

# Convert wallclock_limit to be in seconds
df['wallclock_limit'] = pd.to_timedelta(df['wallclock_limit']).dt.total_seconds()

# Add Runtime Attribute
df['start_time'] = pd.to_datetime(df['start_time'])
df['end_time'] = pd.to_datetime(df['end_time'])
df['runtime'] = (df['end_time'] - df['start_time']).dt.total_seconds()

# We only care about jobs that are completed
df = df[df.job_status == 'COMPLETED']

# Filter to only contain rows that have non-zero runtime
df = df.dropna(subset=['start_time'])
df = df.dropna(subset=['end_time'])
df = df[df.runtime != 0]

print('rows after to cleaning: ' + str(df.size))
print(df.head())

rows prior to cleaning: 587889
rows after to cleaning: 423010
   user_ID  group_ID                submit_time                 start_time  \
1      351       354  2011-10-27 11:50:46-06:00  2011-10-27 11:51:06-06:00   
2      287       288  2011-10-27 12:02:50-06:00  2011-10-27 12:03:08-06:00   
3      287       288  2011-10-27 12:02:29-06:00  2011-10-27 12:02:36-06:00   
4      351       354  2011-10-27 12:02:26-06:00  2011-10-27 12:02:36-06:00   
5      351       354  2011-10-27 13:41:52-06:00  2011-10-27 13:41:54-06:00   

                    end_time  wallclock_limit job_status  node_count  \
1  2011-10-27 11:51:13-06:00       31536000.0  COMPLETED           1   
2  2011-10-27 12:03:13-06:00       31536000.0  COMPLETED           1   
3  2011-10-27 12:03:14-06:00       31536000.0  COMPLETED           1   
4  2011-10-27 12:03:18-06:00       31536000.0  COMPLETED         800   
5  2011-10-27 13:42:02-06:00       31536000.0  COMPLETED           1   

   tasks_requested  runtime  
1     

In [67]:
# Split data to Attributes(X) and Labels(y).
X = df[['user_ID', 'group_ID', 'wallclock_limit', 'node_count', 'tasks_requested']]
y = df[['runtime']]

In [81]:
# Split data into training and testing sets:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Create SVR model and train it
regressor = SVR(kernel='sigmoid')
regressor.fit(X_train, y_train.values.ravel())

SVR(kernel='sigmoid')

In [82]:
#Prediction
y_pred = regressor.predict(X_test)

In [83]:
# metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 6552.364566213863
Mean Squared Error: 245528898.86288422
Root Mean Squared Error: 15669.36178862701


In [86]:
df_result = pd.DataFrame({'actual': y_test['runtime'], 'Predicted': y_pred.flatten()})
print(df_result.head())

        actual  Predicted
36641     31.0  67.900298
30659     11.0  68.359817
29614     10.0  68.359860
28667     81.0  67.900333
60007  44169.0  61.541643
