In [1]:
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, TimeSeriesSplit, RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, matthews_corrcoef, accuracy_score, precision_score, recall_score, f1_score, fbeta_score, confusion_matrix, make_scorer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPRegressor
from sklearn.cluster import KMeans
from IPython.display import FileLink, FileLinks, clear_output
from IPython.display import FileLink, FileLinks
from xgboost import XGBRegressor
import timeit

In [2]:
df = pd.read_csv(r"C:\Users\rrava\OneDrive\Documents\01 - Fall 2022\Research Project\Clean Datasets\cleaned_RICC_with_waiting_times_full.csv")

In [3]:
#Runtime models

In [4]:
test_size = 0.1

tscv = TimeSeriesSplit(n_splits = 3)

beta = 0.25
fbeta_scorer = make_scorer(fbeta_score, beta=beta)

In [5]:
threshold_time = 15 * 60
X = df[['user_id', 'group_id', 'submit_time', 'requested_time', 'requested_CPUs', 'requested_memory']]
y = df[['runtime']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, shuffle = False)

In [6]:
# Runtimes - Regression - Random Forest

## Hyperparameter tuning with cross-validation

cv_clf = RandomForestRegressor()
param_search = {'n_estimators': [100, 250], 'max_features': ['sqrt', 'log2'], 'max_depth': [5, 10, 15], 'min_samples_leaf': [50, 100, 250], 'min_samples_split': [100, 200, 500], 'random_state': [0]}
gsearch = GridSearchCV(estimator=cv_clf, cv=tscv, param_grid=param_search, scoring='neg_mean_absolute_error')
gsearch.fit(X_train, y_train.values.ravel())
print(gsearch.best_estimator_)

## Test set evaluation

y_pred = gsearch.predict(X_test)

y_test_binary = y_test['runtime'] > threshold_time
y_pred_binary = y_pred > threshold_time

print("MCC: " + str(matthews_corrcoef(y_test_binary, y_pred_binary)))
print("F1 score: " + str(f1_score(y_test_binary, y_pred_binary)))
print("Precision score: " + str(precision_score(y_test_binary, y_pred_binary)))
print("Recall score: " + str(recall_score(y_test_binary, y_pred_binary)))
print("FBeta score: " + str(fbeta_score(y_test_binary, y_pred_binary, beta=beta)))

RandomForestRegressor(max_depth=5, max_features='sqrt', min_samples_leaf=50,
                      min_samples_split=100, random_state=0)
MCC: 0.0
F1 score: 0.8483398914692796
Precision score: 0.7366234926306386
Recall score: 1.0
FBeta score: 0.7482153960286391


In [7]:
# Runtimes - Regression - Linear Regression

## Hyperparameter tuning with cross-validation

cv_clf = LinearRegression()
param_search = {}
gsearch = GridSearchCV(estimator=cv_clf, cv=tscv, param_grid=param_search, scoring='neg_mean_absolute_error')
gsearch.fit(X_train, y_train.values.ravel())
print(gsearch.best_estimator_)

## Test set evaluation

y_pred = gsearch.predict(X_test)

y_test_binary = y_test['runtime'] > threshold_time
y_pred_binary = y_pred > threshold_time

print("MCC: " + str(matthews_corrcoef(y_test_binary, y_pred_binary)))
print("F1 score: " + str(f1_score(y_test_binary, y_pred_binary)))
print("Precision score: " + str(precision_score(y_test_binary, y_pred_binary)))
print("Recall score: " + str(recall_score(y_test_binary, y_pred_binary)))
print("FBeta score: " + str(fbeta_score(y_test_binary, y_pred_binary, beta=beta)))

LinearRegression()
MCC: 0.44610386981648803
F1 score: 0.8824925091855689
Precision score: 0.7982389443476982
Recall score: 0.986630691808646
FBeta score: 0.80730663833935


In [8]:
# Runtimes - Regression - Gradient Boosting

## Hyperparameter tuning with cross-validation

cv_clf = GradientBoostingRegressor()
param_search = {'n_estimators': [100, 250], 'max_features': ['sqrt', 'log2'], 'min_samples_leaf': [50, 100, 250], 'min_samples_split': [100, 200, 500], 'random_state': [0]}
gsearch = GridSearchCV(estimator=cv_clf, cv=tscv, param_grid=param_search, scoring='neg_mean_absolute_error')
gsearch.fit(X_train, y_train.values.ravel())
print(gsearch.best_estimator_)

## Test set evaluation

y_pred = gsearch.predict(X_test)

y_test_binary = y_test['runtime'] > threshold_time
y_pred_binary = y_pred > threshold_time

print("MCC: " + str(matthews_corrcoef(y_test_binary, y_pred_binary)))
print("F1 score: " + str(f1_score(y_test_binary, y_pred_binary)))
print("Precision score: " + str(precision_score(y_test_binary, y_pred_binary)))
print("Recall score: " + str(recall_score(y_test_binary, y_pred_binary)))
print("FBeta score: " + str(fbeta_score(y_test_binary, y_pred_binary, beta=beta)))

GradientBoostingRegressor(max_features='sqrt', min_samples_leaf=250,
                          min_samples_split=100, random_state=0)
MCC: 0.3576465566520686
F1 score: 0.8702592177297013
Precision score: 0.7703355986828277
Recall score: 0.9999696841084096
FBeta score: 0.7808840117534015


In [9]:
#Classification

In [10]:
df['should_wait_runtime_actual'] = df['runtime'] > threshold_time
y = df[['should_wait_runtime_actual']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, shuffle = False)

In [12]:
# Runtimes - Classification - Random Forest

## Hyperparameter tuning with cross-validation

cv_clf = RandomForestClassifier()
param_search = {'n_estimators': [100], 'max_features': ['sqrt'], 'max_depth': [10], 'min_samples_leaf': [50, 100, 250], 'min_samples_split': [100, 200, 500], 'random_state': [0]}
gsearch = GridSearchCV(estimator=cv_clf, cv=tscv, param_grid=param_search, scoring=fbeta_scorer)
gsearch.fit(X_train, y_train.values.ravel())
print(gsearch.best_estimator_)

## Test set evaluation

y_pred = gsearch.predict(X_test)

print("MCC: " + str(matthews_corrcoef(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred)))
print("Precision score: " + str(precision_score(y_test, y_pred)))
print("Recall score: " + str(recall_score(y_test, y_pred)))
print("FBeta score: " + str(fbeta_score(y_test, y_pred, beta=beta)))

RandomForestClassifier(max_depth=10, min_samples_leaf=250,
                       min_samples_split=100, random_state=0)
MCC: 0.5333896100793349
F1 score: 0.8963528784793866
Precision score: 0.8159721358377908
Recall score: 0.9943006123810101
FBeta score: 0.8246724626020182


In [13]:
# Runtimes - Classification - Gradient Boosting

## Hyperparameter tuning with cross-validation

cv_clf = GradientBoostingClassifier()
param_search = {'n_estimators': [100], 'max_features': ['sqrt'], 'min_samples_leaf': [50, 100, 250], 'min_samples_split': [100, 200, 500], 'random_state': [0]}
gsearch = GridSearchCV(estimator=cv_clf, cv=tscv, param_grid=param_search, scoring=fbeta_scorer)
gsearch.fit(X_train, y_train.values.ravel())
print(gsearch.best_estimator_)

## Test set evaluation

y_pred = gsearch.predict(X_test)

print("MCC: " + str(matthews_corrcoef(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred)))
print("Precision score: " + str(precision_score(y_test, y_pred)))
print("Recall score: " + str(recall_score(y_test, y_pred)))
print("FBeta score: " + str(fbeta_score(y_test, y_pred, beta=beta)))

GradientBoostingClassifier(max_features='sqrt', min_samples_leaf=50,
                           min_samples_split=500, random_state=0)
MCC: 0.5095188393698783
F1 score: 0.8922046037193279
Precision score: 0.80732503620787
Recall score: 0.9970290426241436
FBeta score: 0.8164631457202188


In [14]:
# Runtimes - Classification - Logistic Regression

## Hyperparameter tuning with cross-validation

cv_clf = LogisticRegression()
param_search = {'max_iter': [1000, 10000], 'C': [0.1, 0.01, 0.001, 0.0001], 'random_state': [0]}
gsearch = GridSearchCV(estimator=cv_clf, cv=tscv, param_grid=param_search, scoring=fbeta_scorer)
gsearch.fit(X_train, y_train.values.ravel())
print(gsearch.best_estimator_)

## Test set evaluation

y_pred = gsearch.predict(X_test)

print("MCC: " + str(matthews_corrcoef(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred)))
print("Precision score: " + str(precision_score(y_test, y_pred)))
print("Recall score: " + str(recall_score(y_test, y_pred)))
print("FBeta score: " + str(fbeta_score(y_test, y_pred, beta=beta)))

LogisticRegression(C=0.1, max_iter=1000, random_state=0)
MCC: 0.3517939164684428
F1 score: 0.8498511517673922
Precision score: 0.8119960234176516
Recall score: 0.8914084763232887
FBeta score: 0.816273608784117


In [15]:
# Runtimes - Classification - Nearest Neighbors

## Hyperparameter tuning with cross-validation

cv_clf = KNeighborsClassifier()
param_search = {'n_neighbors': [5, 10, 15, 25, 50]}
gsearch = GridSearchCV(estimator=cv_clf, cv=tscv, param_grid=param_search, scoring=fbeta_scorer)
gsearch.fit(X_train, y_train.values.ravel())
print(gsearch.best_estimator_)

## Test set evaluation

y_pred = gsearch.predict(X_test)

print("MCC: " + str(matthews_corrcoef(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred)))
print("Precision score: " + str(precision_score(y_test, y_pred)))
print("Recall score: " + str(recall_score(y_test, y_pred)))
print("FBeta score: " + str(fbeta_score(y_test, y_pred, beta=beta)))

KNeighborsClassifier(n_neighbors=50)
MCC: 0.02815257410853256
F1 score: 0.848480950788465
Precision score: 0.7368856121537086
Recall score: 0.9999090523252289
FBeta score: 0.7484669210007128


In [16]:
# Runtimes - Classification - Naive Bayes

## Hyperparameter tuning with cross-validation

cv_clf = GaussianNB()
param_search = {'var_smoothing': [1e-11, 1e-10, 1e-9]}
gsearch = GridSearchCV(estimator=cv_clf, cv=tscv, param_grid=param_search, scoring=fbeta_scorer)
gsearch.fit(X_train, y_train.values.ravel())
print(gsearch.best_estimator_)

## Test set evaluation

y_pred = gsearch.predict(X_test)

print("MCC: " + str(matthews_corrcoef(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred)))
print("Precision score: " + str(precision_score(y_test, y_pred)))
print("Recall score: " + str(recall_score(y_test, y_pred)))
print("FBeta score: " + str(fbeta_score(y_test, y_pred, beta=beta)))

GaussianNB(var_smoothing=1e-11)
MCC: 0.20485912106895296
F1 score: 0.8287052714753357
Precision score: 0.7751148181386264
Recall score: 0.8902564724428546
FBeta score: 0.7810570716031302


In [17]:
#Waiting Time Models

In [18]:
df['total_remaining_runtime'] = df['running_job_requested_wallclock_limit'] - df['elapsed_runtime_total']
df['average_remaining_runtime'] = df['running_job_mean_wallclock_limit'] - df['elapsed_runtime_mean']

X = df[['total_remaining_runtime', 'average_remaining_runtime', 'user_id', 'queue_number', 'submit_time', 'requested_time', 'requested_CPUs', 'num_running_jobs', 'num_waiting_jobs', 'running_job_mean_CPU_time', 'running_job_requested_wallclock_limit', 'running_job_mean_wallclock_limit', 'waiting_job_requested_CPU_time', 'waiting_job_mean_CPU_time', 'waiting_job_requested_wallclock_limit', 'waiting_job_mean_wallclock_limit', 'elapsed_runtime_total', 'elapsed_runtime_mean', 'elapsed_waiting_time_total', 'elapsed_waiting_time_mean']]

y = df[['wait_time']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, shuffle = False)

waiting_time_threshold = 6 * 60 * 60
beta = 0.25

In [19]:
# Waiting Times - Regression - Random Forest

## Hyperparameter tuning with cross-validation

cv_clf = RandomForestRegressor()
param_search = {'n_estimators': [100], 'max_features': ['sqrt'], 'max_depth': [10], 'min_samples_leaf': [50, 100, 250], 'min_samples_split': [100, 200, 500], 'random_state': [0]}
gsearch = GridSearchCV(estimator=cv_clf, cv=tscv, param_grid=param_search, scoring='neg_mean_absolute_error')
gsearch.fit(X_train, y_train.values.ravel())
print(gsearch.best_estimator_)

## Test set evaluation

y_pred = gsearch.predict(X_test)

y_test_binary = y_test['wait_time'] > threshold_time
y_pred_binary = y_pred > threshold_time

print("MCC: " + str(matthews_corrcoef(y_test_binary, y_pred_binary)))
print("F1 score: " + str(f1_score(y_test_binary, y_pred_binary)))
print("Precision score: " + str(precision_score(y_test_binary, y_pred_binary)))
print("Recall score: " + str(recall_score(y_test_binary, y_pred_binary)))
print("FBeta score: " + str(fbeta_score(y_test_binary, y_pred_binary, beta=beta)))

RandomForestRegressor(max_depth=10, max_features='sqrt', min_samples_leaf=100,
                      min_samples_split=100, random_state=0)
MCC: 0.13075374854708938
F1 score: 0.26252983293556087
Precision score: 0.1510989010989011
Recall score: 1.0
FBeta score: 0.1590406531723082


In [20]:
# Waiting Times - Regression - Linear Regression

## Hyperparameter tuning with cross-validation

cv_clf = LinearRegression()
param_search = {}
gsearch = GridSearchCV(estimator=cv_clf, cv=tscv, param_grid=param_search, scoring='neg_mean_absolute_error')
gsearch.fit(X_train, y_train.values.ravel())
print(gsearch.best_estimator_)

## Test set evaluation

y_pred = gsearch.predict(X_test)

y_test_binary = y_test['wait_time'] > threshold_time
y_pred_binary = y_pred > threshold_time

print("MCC: " + str(matthews_corrcoef(y_test_binary, y_pred_binary)))
print("F1 score: " + str(f1_score(y_test_binary, y_pred_binary)))
print("Precision score: " + str(precision_score(y_test_binary, y_pred_binary)))
print("Recall score: " + str(recall_score(y_test_binary, y_pred_binary)))
print("FBeta score: " + str(fbeta_score(y_test_binary, y_pred_binary, beta=beta)))

LinearRegression()
MCC: 0.37931654709702844
F1 score: 0.4675635617375442
Precision score: 0.35857305237387427
Recall score: 0.6717444717444717
FBeta score: 0.36868379720460925


In [21]:
# Waiting Times - Regression - Gradient Boosting

## Hyperparameter tuning with cross-validation

cv_clf = GradientBoostingRegressor()
param_search = {'n_estimators': [100], 'max_features': ['sqrt'], 'min_samples_leaf': [50, 100, 250], 'min_samples_split': [100, 200, 500], 'random_state': [0]}
gsearch = GridSearchCV(estimator=cv_clf, cv=tscv, param_grid=param_search, scoring='neg_mean_absolute_error')
gsearch.fit(X_train, y_train.values.ravel())
print(gsearch.best_estimator_)

## Test set evaluation

y_pred = gsearch.predict(X_test)

y_test_binary = y_test['wait_time'] > threshold_time
y_pred_binary = y_pred > threshold_time

print("MCC: " + str(matthews_corrcoef(y_test_binary, y_pred_binary)))
print("F1 score: " + str(f1_score(y_test_binary, y_pred_binary)))
print("Precision score: " + str(precision_score(y_test_binary, y_pred_binary)))
print("Recall score: " + str(recall_score(y_test_binary, y_pred_binary)))
print("FBeta score: " + str(fbeta_score(y_test_binary, y_pred_binary, beta=beta)))

GradientBoostingRegressor(max_features='sqrt', min_samples_leaf=100,
                          min_samples_split=500, random_state=0)
MCC: 0.17649811910389684
F1 score: 0.3054508275289156
Precision score: 0.18973524391696106
Recall score: 0.782964782964783
FBeta score: 0.19858599731666965


In [22]:
# Waiting time classification
df['should_wait_waiting_time_actual'] = df['wait_time'] < waiting_time_threshold
y = df[['should_wait_waiting_time_actual']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, shuffle = False)

In [23]:
# Waiting Times - Classification - Random Forest

## Hyperparameter tuning with cross-validation

cv_clf = RandomForestClassifier()
param_search = {'n_estimators': [100], 'max_features': ['sqrt'], 'max_depth': [10], 'min_samples_leaf': [50, 100, 250], 'min_samples_split': [100, 200, 500], 'random_state': [0]}
gsearch = GridSearchCV(estimator=cv_clf, cv=tscv, param_grid=param_search, scoring=fbeta_scorer)
gsearch.fit(X_train, y_train.values.ravel())
print(gsearch.best_estimator_)

## Test set evaluation

y_pred = gsearch.predict(X_test)

print("MCC: " + str(matthews_corrcoef(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred)))
print("Precision score: " + str(precision_score(y_test, y_pred)))
print("Recall score: " + str(recall_score(y_test, y_pred)))
print("FBeta score: " + str(fbeta_score(y_test, y_pred, beta=beta)))

RandomForestClassifier(max_depth=10, min_samples_leaf=100,
                       min_samples_split=100, random_state=0)
MCC: 0.40950711724441824
F1 score: 0.9842452215837036
Precision score: 0.9698518485162336
Recall score: 0.9990722486373652
FBeta score: 0.9715232991791589


In [24]:
# Waiting Times - Classification - Gradient Boosting

## Hyperparameter tuning with cross-validation

cv_clf = GradientBoostingClassifier()
param_search = {'n_estimators': [100], 'max_features': ['sqrt'], 'min_samples_leaf': [50, 100, 250], 'min_samples_split': [100, 200, 500], 'random_state': [0]}
gsearch = GridSearchCV(estimator=cv_clf, cv=tscv, param_grid=param_search, scoring=fbeta_scorer)
gsearch.fit(X_train, y_train.values.ravel())
print(gsearch.best_estimator_)

## Test set evaluation

y_pred = gsearch.predict(X_test)

print("MCC: " + str(matthews_corrcoef(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred)))
print("Precision score: " + str(precision_score(y_test, y_pred)))
print("Recall score: " + str(recall_score(y_test, y_pred)))
print("FBeta score: " + str(fbeta_score(y_test, y_pred, beta=beta)))

GradientBoostingClassifier(max_features='sqrt', min_samples_leaf=100,
                           min_samples_split=500, random_state=0)
MCC: 0.41650887376593954
F1 score: 0.9842790665767239
Precision score: 0.9705961937404167
Recall score: 0.9983532413313232
FBeta score: 0.9721861660220839


In [25]:
# Waiting Times - Classification - Logistic Regression

## Hyperparameter tuning with cross-validation

cv_clf = LogisticRegression()
param_search = {'max_iter': [1000, 10000], 'C': [0.1, 0.01, 0.001, 0.0001], 'random_state': [0]}
gsearch = GridSearchCV(estimator=cv_clf, cv=tscv, param_grid=param_search, scoring=fbeta_scorer)
gsearch.fit(X_train, y_train.values.ravel())
print(gsearch.best_estimator_)

## Test set evaluation

y_pred = gsearch.predict(X_test)

print("MCC: " + str(matthews_corrcoef(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred)))
print("Precision score: " + str(precision_score(y_test, y_pred)))
print("Recall score: " + str(recall_score(y_test, y_pred)))
print("FBeta score: " + str(fbeta_score(y_test, y_pred, beta=beta)))

LogisticRegression(C=0.0001, max_iter=1000, random_state=0)
MCC: 0.0
F1 score: 0.9810569429432846
Precision score: 0.9628182224207236
Recall score: 1.0
FBeta score: 0.9649286790987303


In [26]:
# Waiting Times - Classification - Nearest Neighbors

## Hyperparameter tuning with cross-validation

cv_clf = KNeighborsClassifier()
param_search = {'n_neighbors': [5, 10, 15, 25, 50]}
gsearch = GridSearchCV(estimator=cv_clf, cv=tscv, param_grid=param_search, scoring=fbeta_scorer)
gsearch.fit(X_train, y_train.values.ravel())
print(gsearch.best_estimator_)

## Test set evaluation

y_pred = gsearch.predict(X_test)

print("MCC: " + str(matthews_corrcoef(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred)))
print("Precision score: " + str(precision_score(y_test, y_pred)))
print("Recall score: " + str(recall_score(y_test, y_pred)))
print("FBeta score: " + str(fbeta_score(y_test, y_pred, beta=beta)))

KNeighborsClassifier(n_neighbors=50)
MCC: -0.012196409060691105
F1 score: 0.9375118934348239
Precision score: 0.9621139997558892
Recall score: 0.914136611388148
FBeta score: 0.9591528226123927


In [27]:
# Waiting Times - Classification - Naive Bayes

## Hyperparameter tuning with cross-validation

cv_clf = GaussianNB()
param_search = {'var_smoothing': [1e-11, 1e-10, 1e-9]}
gsearch = GridSearchCV(estimator=cv_clf, cv=tscv, param_grid=param_search, scoring=fbeta_scorer)
gsearch.fit(X_train, y_train.values.ravel())
print(gsearch.best_estimator_)

## Test set evaluation

y_pred = gsearch.predict(X_test)

print("MCC: " + str(matthews_corrcoef(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred)))
print("Precision score: " + str(precision_score(y_test, y_pred)))
print("Recall score: " + str(recall_score(y_test, y_pred)))
print("FBeta score: " + str(fbeta_score(y_test, y_pred, beta=beta)))

GaussianNB(var_smoothing=1e-11)
MCC: -0.00624689959685952
F1 score: 0.9800236891257801
Precision score: 0.9627648862136096
Recall score: 0.9979125594340716
FBeta score: 0.9647637174713343
