In [1]:
# Load libraries required to do Linear Regression
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [None]:
# Mustang

In [2]:
# Import Training CSV
df = pd.read_csv(r"../Dataset/mustang_release_v1.0beta.csv")

# Data Cleaning Step
print("rows prior to cleaning: " + str(df.size))

# Drop first 1000 rows
df = df.iloc[1000: , :]

# Convert wallclock_limit to be in seconds
df['wallclock_limit'] = pd.to_timedelta(df['wallclock_limit']).dt.total_seconds()

# Add Runtime Attribute
df['start_time'] = pd.to_datetime(df['start_time'])
df['end_time'] = pd.to_datetime(df['end_time'])
df['runtime'] = (df['end_time'] - df['start_time']).dt.total_seconds()

# We only care about jobs that are completed
df = df[df.job_status == 'COMPLETED']

# Filter to only contain rows that have non-zero runtime
df = df.dropna(subset=['start_time'])
df = df.dropna(subset=['end_time'])
df = df[df.runtime != 0]

print('rows after to cleaning: ' + str(df.size))
print(df.head())

rows prior to cleaning: 19018575
rows after to cleaning: 18147350
      user_ID  group_ID                submit_time                 start_time  \
1000      354       357  2011-10-28 13:13:27-06:00  2011-10-28 13:13:45-06:00   
1001      354       357  2011-10-28 13:13:34-06:00  2011-10-28 13:13:45-06:00   
1002      354       357  2011-10-28 13:13:40-06:00  2011-10-28 13:13:45-06:00   
1003      354       357  2011-10-28 13:13:47-06:00  2011-10-28 13:14:16-06:00   
1004      427       435  2011-10-28 13:20:06-06:00  2011-10-28 13:20:28-06:00   

                       end_time  wallclock_limit job_status  node_count  \
1000  2011-10-28 13:19:36-06:00           7200.0  COMPLETED           8   
1001  2011-10-28 13:19:43-06:00           7200.0  COMPLETED           8   
1002  2011-10-28 13:19:45-06:00           7200.0  COMPLETED           8   
1003  2011-10-28 13:20:31-06:00           7200.0  COMPLETED           8   
1004  2011-10-28 13:20:31-06:00           7200.0  COMPLETED           1 

In [3]:
# Split data to Attributes(X) and Labels(y).
X = df[['user_ID', 'group_ID', 'wallclock_limit', 'node_count', 'tasks_requested']]
y = df[['runtime']]

In [4]:
# Split data into training and testing sets:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

regressor = LinearRegression()
  
regressor.fit(X_train, y_train)

LinearRegression()

In [5]:
#Prediction
y_pred = regressor.predict(X_test)

In [6]:
# metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 2546.403997021186
Mean Squared Error: 41671659.063300684
Root Mean Squared Error: 6455.3589414765065


In [7]:
df_result = pd.DataFrame({'actual': y_test["runtime"], 'Predicted': y_pred.flatten()})
print(df_result.tail())

         actual    Predicted
2011656     6.0  5892.981941
682778     85.0  1608.022655
937396    195.0  1389.089923
2057438    45.0   330.165995
664626     96.0  1607.994425


In [8]:
# Trinity

In [9]:
# Import Training CSV
df = pd.read_csv("../Dataset/trinity_formatted_release_v1.0beta.csv")

# Data Cleaning Step
print("rows prior to cleaning: " + str(df.size))

# Drop first 1000 rows
df = df.iloc[1000: , :]

# Convert wallclock_limit to be in seconds
df['wallclock_limit'] = pd.to_timedelta(df['wallclock_limit']).dt.total_seconds()

# Add Runtime Attribute
df['start_time'] = pd.to_datetime(df['start_time'])
df['end_time'] = pd.to_datetime(df['end_time'])
df['runtime'] = (df['end_time'] - df['start_time']).dt.total_seconds()

# We only care about jobs that are completed
df = df[df.job_status == 'JOBEND']

# Filter to only contain rows that have non-zero runtime
df = df.dropna(subset=['start_time'])
df = df.dropna(subset=['end_time'])
df = df[df.runtime != 0]

print('rows after to cleaning: ' + str(df.size))
print(df.head())

rows prior to cleaning: 277607
rows after to cleaning: 192408
      user_ID  group_ID                submit_time                 start_time  \
1000        1         1  2016-02-05 10:15:47-07:00  2016-02-05 10:15:48-07:00   
1001        1         1  2016-02-05 10:15:48-07:00  2016-02-05 10:15:49-07:00   
1002        1         1  2016-02-05 10:15:49-07:00  2016-02-05 10:15:50-07:00   
1003        1         1  2016-02-05 10:15:50-07:00  2016-02-05 10:15:51-07:00   
1004        1         1  2016-02-05 10:15:51-07:00  2016-02-05 10:15:52-07:00   

                  dispatch_time                 queue_time  \
1000  2016-02-05 10:15:48-07:00  2016-02-05 10:15:47-07:00   
1001  2016-02-05 10:15:49-07:00  2016-02-05 10:15:48-07:00   
1002  2016-02-05 10:15:50-07:00  2016-02-05 10:15:49-07:00   
1003  2016-02-05 10:15:51-07:00  2016-02-05 10:15:50-07:00   
1004  2016-02-05 10:15:52-07:00  2016-02-05 10:15:51-07:00   

                       end_time  wallclock_limit job_status  node_count  \
100

In [10]:
# Split data to Attributes(X) and Labels(y).
X = df[['user_ID', 'group_ID', 'wallclock_limit', 'node_count', 'tasks_requested']]
y = df[['runtime']]

In [12]:
# Split data into training and testing sets:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

regressor = LinearRegression()
  
regressor.fit(X_train, y_train)

LinearRegression()

In [13]:
#Prediction
y_pred = regressor.predict(X_test)

In [14]:
# metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 4242.142144032407
Mean Squared Error: 35291747.63569829
Root Mean Squared Error: 5940.685788332715


In [15]:
df_result = pd.DataFrame({'actual': y_test["runtime"], 'Predicted': y_pred.flatten()})
print(df_result.tail())

        actual     Predicted
3961   13879.0  10095.374569
6077     104.0    -20.908626
9830    8137.0   3899.376670
15981    427.0   3737.132852
5494      25.0   8617.256876
