In [1]:
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, TimeSeriesSplit, RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, matthews_corrcoef, accuracy_score, precision_score, recall_score, f1_score, fbeta_score, confusion_matrix, make_scorer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPRegressor
from sklearn.cluster import KMeans
from IPython.display import FileLink, FileLinks, clear_output
from IPython.display import FileLink, FileLinks
from xgboost import XGBRegressor
import timeit

In [2]:
df = pd.read_csv(r"C:\Users\rrava\OneDrive\Documents\01 - Fall 2022\Research Project\Clean Datasets\cleaned_ANL_with_waiting_times_full.csv")

In [3]:
#Runtime models

In [4]:
test_size = 0.1

tscv = TimeSeriesSplit(n_splits = 3)

beta = 0.25
fbeta_scorer = make_scorer(fbeta_score, beta=beta)

In [5]:
threshold_time = 15 * 60
X = df[['user_id', 'queue_number', 'submit_time', 'requested_time', 'requested_CPUs']]
y = df[['runtime']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, shuffle = False)

In [6]:
# Runtimes - Regression - Random Forest

## Hyperparameter tuning with cross-validation

cv_clf = RandomForestRegressor()
param_search = {'n_estimators': [100, 250], 'max_features': ['sqrt', 'log2'], 'max_depth': [5, 10, 15], 'min_samples_leaf': [50, 100, 250], 'min_samples_split': [100, 200, 500], 'random_state': [0]}
gsearch = GridSearchCV(estimator=cv_clf, cv=tscv, param_grid=param_search, scoring='neg_mean_absolute_error')
gsearch.fit(X_train, y_train.values.ravel())
print(gsearch.best_estimator_)

## Test set evaluation

y_pred = gsearch.predict(X_test)

y_test_binary = y_test['runtime'] > threshold_time
y_pred_binary = y_pred > threshold_time

print("MCC: " + str(matthews_corrcoef(y_test_binary, y_pred_binary)))
print("F1 score: " + str(f1_score(y_test_binary, y_pred_binary)))
print("Precision score: " + str(precision_score(y_test_binary, y_pred_binary)))
print("Recall score: " + str(recall_score(y_test_binary, y_pred_binary)))
print("FBeta score: " + str(fbeta_score(y_test_binary, y_pred_binary, beta=beta)))

RandomForestRegressor(max_depth=10, max_features='sqrt', min_samples_leaf=50,
                      min_samples_split=100, n_estimators=250, random_state=0)
MCC: 0.5733255289985368
F1 score: 0.8524624157594608
Precision score: 0.7584870848708487
Recall score: 0.9730177514792899
FBeta score: 0.7684534608829512


In [7]:
# Runtimes - Regression - Linear Regression

## Hyperparameter tuning with cross-validation

cv_clf = LinearRegression()
param_search = {}
gsearch = GridSearchCV(estimator=cv_clf, cv=tscv, param_grid=param_search, scoring='neg_mean_absolute_error')
gsearch.fit(X_train, y_train.values.ravel())
print(gsearch.best_estimator_)

## Test set evaluation

y_pred = gsearch.predict(X_test)

y_test_binary = y_test['runtime'] > threshold_time
y_pred_binary = y_pred > threshold_time

print("MCC: " + str(matthews_corrcoef(y_test_binary, y_pred_binary)))
print("F1 score: " + str(f1_score(y_test_binary, y_pred_binary)))
print("Precision score: " + str(precision_score(y_test_binary, y_pred_binary)))
print("Recall score: " + str(recall_score(y_test_binary, y_pred_binary)))
print("FBeta score: " + str(fbeta_score(y_test_binary, y_pred_binary, beta=beta)))

LinearRegression()
MCC: 0.1614363921064019
F1 score: 0.7530499803227076
Precision score: 0.644384576528035
Recall score: 0.9057988165680473
FBeta score: 0.6555129018932181


In [8]:
# Runtimes - Regression - Gradient Boosting

## Hyperparameter tuning with cross-validation

cv_clf = GradientBoostingRegressor()
param_search = {'n_estimators': [100, 250], 'max_features': ['sqrt', 'log2'], 'min_samples_leaf': [50, 100, 250], 'min_samples_split': [100, 200, 500], 'random_state': [0]}
gsearch = GridSearchCV(estimator=cv_clf, cv=tscv, param_grid=param_search, scoring='neg_mean_absolute_error')
gsearch.fit(X_train, y_train.values.ravel())
print(gsearch.best_estimator_)

## Test set evaluation

y_pred = gsearch.predict(X_test)

y_test_binary = y_test['runtime'] > threshold_time
y_pred_binary = y_pred > threshold_time

print("MCC: " + str(matthews_corrcoef(y_test_binary, y_pred_binary)))
print("F1 score: " + str(f1_score(y_test_binary, y_pred_binary)))
print("Precision score: " + str(precision_score(y_test_binary, y_pred_binary)))
print("Recall score: " + str(recall_score(y_test_binary, y_pred_binary)))
print("FBeta score: " + str(fbeta_score(y_test_binary, y_pred_binary, beta=beta)))

GradientBoostingRegressor(max_features='sqrt', min_samples_leaf=250,
                          min_samples_split=100, random_state=0)
MCC: 0.5937210486886912
F1 score: 0.8542108198549916
Precision score: 0.8078059071729958
Recall score: 0.9062721893491125
FBeta score: 0.8130019359270593


In [9]:
#Classification

In [6]:
df['should_wait_runtime_actual'] = df['runtime'] > threshold_time
y = df[['should_wait_runtime_actual']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, shuffle = False)

In [11]:
# Runtimes - Classification - Random Forest

## Hyperparameter tuning with cross-validation

cv_clf = RandomForestClassifier()
param_search = {'n_estimators': [100, 250], 'max_features': ['sqrt', 'log2'], 'max_depth': [5, 10, 15], 'min_samples_leaf': [50, 100, 250], 'min_samples_split': [100, 200, 500], 'random_state': [0]}
gsearch = GridSearchCV(estimator=cv_clf, cv=tscv, param_grid=param_search, scoring=fbeta_scorer)
gsearch.fit(X_train, y_train.values.ravel())
print(gsearch.best_estimator_)

## Test set evaluation

y_pred = gsearch.predict(X_test)

print("MCC: " + str(matthews_corrcoef(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred)))
print("Precision score: " + str(precision_score(y_test, y_pred)))
print("Recall score: " + str(recall_score(y_test, y_pred)))
print("FBeta score: " + str(fbeta_score(y_test, y_pred, beta=beta)))

RandomForestClassifier(max_depth=15, min_samples_leaf=50, min_samples_split=500,
                       random_state=0)
MCC: 0.6151297113679481
F1 score: 0.8647663951993142
Precision score: 0.7900920305463089
Recall score: 0.9550295857988166
FBeta score: 0.7982010077149538


In [23]:
# Runtimes - Classification - Gradient Boosting

## Hyperparameter tuning with cross-validation

cv_clf = GradientBoostingClassifier()
param_search = {'n_estimators': [100], 'max_features': ['sqrt'], 'min_samples_leaf': [50, 100, 250], 'min_samples_split': [100, 200, 500], 'random_state': [0]}
gsearch = GridSearchCV(estimator=cv_clf, cv=tscv, param_grid=param_search, scoring=fbeta_scorer)
gsearch.fit(X_train, y_train.values.ravel())
print(gsearch.best_estimator_)

## Test set evaluation

y_pred = gsearch.predict(X_test)

print("MCC: " + str(matthews_corrcoef(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred)))
print("Precision score: " + str(precision_score(y_test, y_pred)))
print("Recall score: " + str(recall_score(y_test, y_pred)))
print("FBeta score: " + str(fbeta_score(y_test, y_pred, beta=beta)))

GradientBoostingClassifier(max_features='sqrt', min_samples_leaf=50,
                           min_samples_split=500, random_state=0)
MCC: 0.5967221615488132
F1 score: 0.8563212034067028
Precision score: 0.8037790697674418
Recall score: 0.9162130177514793
FBeta score: 0.809623405223853


In [13]:
# Runtimes - Classification - Logistic Regression

## Hyperparameter tuning with cross-validation

cv_clf = LogisticRegression()
param_search = {'max_iter': [1000, 10000], 'C': [0.1, 0.01, 0.001, 0.0001], 'random_state': [0]}
gsearch = GridSearchCV(estimator=cv_clf, cv=tscv, param_grid=param_search, scoring=fbeta_scorer)
gsearch.fit(X_train, y_train.values.ravel())
print(gsearch.best_estimator_)

## Test set evaluation

y_pred = gsearch.predict(X_test)

print("MCC: " + str(matthews_corrcoef(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred)))
print("Precision score: " + str(precision_score(y_test, y_pred)))
print("Recall score: " + str(recall_score(y_test, y_pred)))
print("FBeta score: " + str(fbeta_score(y_test, y_pred, beta=beta)))

LogisticRegression(C=0.1, max_iter=1000, random_state=0)
MCC: 0.5522736161851266
F1 score: 0.8454169607744277
Precision score: 0.7364722417427969
Recall score: 0.9921893491124261
FBeta score: 0.7478094798367211


In [14]:
# Runtimes - Classification - Nearest Neighbors

## Hyperparameter tuning with cross-validation

cv_clf = KNeighborsClassifier()
param_search = {'n_neighbors': [5, 10, 15, 25, 50]}
gsearch = GridSearchCV(estimator=cv_clf, cv=tscv, param_grid=param_search, scoring=fbeta_scorer)
gsearch.fit(X_train, y_train.values.ravel())
print(gsearch.best_estimator_)

## Test set evaluation

y_pred = gsearch.predict(X_test)

print("MCC: " + str(matthews_corrcoef(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred)))
print("Precision score: " + str(precision_score(y_test, y_pred)))
print("Recall score: " + str(recall_score(y_test, y_pred)))
print("FBeta score: " + str(fbeta_score(y_test, y_pred, beta=beta)))

KNeighborsClassifier(n_neighbors=50)
MCC: 0.0
F1 score: 0.7599604280960518
Precision score: 0.6128517551494053
Recall score: 1.0
FBeta score: 0.6271337390529909


In [7]:
# Runtimes - Classification - Naive Bayes

## Hyperparameter tuning with cross-validation

cv_clf = GaussianNB()
param_search = {}
gsearch = GridSearchCV(estimator=cv_clf, cv=tscv, param_grid=param_search, scoring=fbeta_scorer)
gsearch.fit(X_train, y_train.values.ravel())
print(gsearch.best_estimator_)

## Test set evaluation

y_pred = gsearch.predict(X_test)

print("MCC: " + str(matthews_corrcoef(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred)))
print("Precision score: " + str(precision_score(y_test, y_pred)))
print("Recall score: " + str(recall_score(y_test, y_pred)))
print("FBeta score: " + str(fbeta_score(y_test, y_pred, beta=beta)))

GaussianNB()
MCC: 0.39883627179129255
F1 score: 0.6535061873895109
Precision score: 0.8653921186110027
Recall score: 0.5249704142011834
FBeta score: 0.8335949417460703


In [16]:
#Waiting Time Models

In [24]:
df['total_remaining_runtime'] = df['running_job_requested_wallclock_limit'] - df['elapsed_runtime_total']
df['average_remaining_runtime'] = df['running_job_mean_wallclock_limit'] - df['elapsed_runtime_mean']

X = df[['total_remaining_runtime', 'average_remaining_runtime', 'user_id', 'queue_number', 'submit_time', 'requested_time', 'requested_CPUs', 'num_running_jobs', 'num_waiting_jobs', 'running_job_mean_CPU_time', 'running_job_requested_wallclock_limit', 'running_job_mean_wallclock_limit', 'waiting_job_requested_CPU_time', 'waiting_job_mean_CPU_time', 'waiting_job_requested_wallclock_limit', 'waiting_job_mean_wallclock_limit', 'elapsed_runtime_total', 'elapsed_runtime_mean', 'elapsed_waiting_time_total', 'elapsed_waiting_time_mean']]

y = df[['wait_time']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, shuffle = False)

waiting_time_threshold = 6 * 60 * 60
beta = 0.25

In [25]:
# Waiting Times - Regression - Random Forest

## Hyperparameter tuning with cross-validation

cv_clf = RandomForestRegressor()
param_search = {'n_estimators': [100], 'max_features': ['sqrt'], 'max_depth': [5, 10, 15], 'min_samples_leaf': [50, 100, 250], 'min_samples_split': [100, 200, 500], 'random_state': [0]}
gsearch = GridSearchCV(estimator=cv_clf, cv=tscv, param_grid=param_search, scoring='neg_mean_absolute_error')
gsearch.fit(X_train, y_train.values.ravel())
print(gsearch.best_estimator_)

## Test set evaluation

y_pred = gsearch.predict(X_test)

y_test_binary = y_test['wait_time'] > threshold_time
y_pred_binary = y_pred > threshold_time

print("MCC: " + str(matthews_corrcoef(y_test_binary, y_pred_binary)))
print("F1 score: " + str(f1_score(y_test_binary, y_pred_binary)))
print("Precision score: " + str(precision_score(y_test_binary, y_pred_binary)))
print("Recall score: " + str(recall_score(y_test_binary, y_pred_binary)))
print("FBeta score: " + str(fbeta_score(y_test_binary, y_pred_binary, beta=beta)))

RandomForestRegressor(max_depth=5, max_features='sqrt', min_samples_leaf=250,
                      min_samples_split=100, random_state=0)
MCC: 0.0
F1 score: 0.48516809492419255
Precision score: 0.3202785030461271
Recall score: 1.0
FBeta score: 0.3336177474402731


In [26]:
# Waiting Times - Regression - Linear Regression

## Hyperparameter tuning with cross-validation

cv_clf = LinearRegression()
param_search = {}
gsearch = GridSearchCV(estimator=cv_clf, cv=tscv, param_grid=param_search, scoring='neg_mean_absolute_error')
gsearch.fit(X_train, y_train.values.ravel())
print(gsearch.best_estimator_)

## Test set evaluation

y_pred = gsearch.predict(X_test)

y_test_binary = y_test['wait_time'] > threshold_time
y_pred_binary = y_pred > threshold_time

print("MCC: " + str(matthews_corrcoef(y_test_binary, y_pred_binary)))
print("F1 score: " + str(f1_score(y_test_binary, y_pred_binary)))
print("Precision score: " + str(precision_score(y_test_binary, y_pred_binary)))
print("Recall score: " + str(recall_score(y_test_binary, y_pred_binary)))
print("FBeta score: " + str(fbeta_score(y_test_binary, y_pred_binary, beta=beta)))

LinearRegression()
MCC: 0.2766183943534182
F1 score: 0.5496711664935964
Precision score: 0.4448179271708683
Recall score: 0.7192028985507246
FBeta score: 0.455029665587918


In [27]:
# Waiting Times - Regression - Gradient Boosting

## Hyperparameter tuning with cross-validation

cv_clf = GradientBoostingRegressor()
param_search = {'n_estimators': [100], 'max_features': ['sqrt'], 'min_samples_leaf': [50, 100, 250], 'min_samples_split': [100, 200, 500], 'random_state': [0]}
gsearch = GridSearchCV(estimator=cv_clf, cv=tscv, param_grid=param_search, scoring='neg_mean_absolute_error')
gsearch.fit(X_train, y_train.values.ravel())
print(gsearch.best_estimator_)

## Test set evaluation

y_pred = gsearch.predict(X_test)

y_test_binary = y_test['wait_time'] > threshold_time
y_pred_binary = y_pred > threshold_time

print("MCC: " + str(matthews_corrcoef(y_test_binary, y_pred_binary)))
print("F1 score: " + str(f1_score(y_test_binary, y_pred_binary)))
print("Precision score: " + str(precision_score(y_test_binary, y_pred_binary)))
print("Recall score: " + str(recall_score(y_test_binary, y_pred_binary)))
print("FBeta score: " + str(fbeta_score(y_test_binary, y_pred_binary, beta=beta)))

GradientBoostingRegressor(max_features='sqrt', min_samples_leaf=250,
                          min_samples_split=100, random_state=0)
MCC: 0.3587105189308657
F1 score: 0.5882352941176471
Precision score: 0.5122606650990931
Recall score: 0.6906702898550725
FBeta score: 0.5201645264847512


In [28]:
# Waiting time classification
df['should_wait_waiting_time_actual'] = df['wait_time'] < waiting_time_threshold
y = df[['should_wait_waiting_time_actual']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, shuffle = False)

In [29]:
# Waiting Times - Classification - Random Forest

## Hyperparameter tuning with cross-validation

cv_clf = RandomForestClassifier()
param_search = {'n_estimators': [100, 250], 'max_features': ['sqrt', 'log2'], 'max_depth': [5, 10, 15], 'min_samples_leaf': [50, 100, 250], 'min_samples_split': [100, 200, 500], 'random_state': [0]}
gsearch = GridSearchCV(estimator=cv_clf, cv=tscv, param_grid=param_search, scoring=fbeta_scorer)
gsearch.fit(X_train, y_train.values.ravel())
print(gsearch.best_estimator_)

## Test set evaluation

y_pred = gsearch.predict(X_test)

print("MCC: " + str(matthews_corrcoef(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred)))
print("Precision score: " + str(precision_score(y_test, y_pred)))
print("Recall score: " + str(recall_score(y_test, y_pred)))
print("FBeta score: " + str(fbeta_score(y_test, y_pred, beta=beta)))

RandomForestClassifier(max_depth=15, min_samples_leaf=50, min_samples_split=100,
                       n_estimators=250, random_state=0)
MCC: 0.40038652204653113
F1 score: 0.9634071340713407
Precision score: 0.9398530073496325
Recall score: 0.9881722125847658
FBeta score: 0.9425641297903781


In [30]:
# Waiting Times - Classification - Gradient Boosting

## Hyperparameter tuning with cross-validation

cv_clf = GradientBoostingClassifier()
param_search = {'n_estimators': [100, 250], 'max_features': ['sqrt', 'log2'], 'min_samples_leaf': [50, 100, 250], 'min_samples_split': [100, 200, 500], 'random_state': [0]}
gsearch = GridSearchCV(estimator=cv_clf, cv=tscv, param_grid=param_search, scoring=fbeta_scorer)
gsearch.fit(X_train, y_train.values.ravel())
print(gsearch.best_estimator_)

## Test set evaluation

y_pred = gsearch.predict(X_test)

print("MCC: " + str(matthews_corrcoef(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred)))
print("Precision score: " + str(precision_score(y_test, y_pred)))
print("Recall score: " + str(recall_score(y_test, y_pred)))
print("FBeta score: " + str(fbeta_score(y_test, y_pred, beta=beta)))

GradientBoostingClassifier(max_features='sqrt', min_samples_leaf=250,
                           min_samples_split=100, n_estimators=250,
                           random_state=0)
MCC: 0.582607966651537
F1 score: 0.970556161395856
Precision score: 0.958904109589041
Recall score: 0.9824948746254534
FBeta score: 0.9602603973053593


In [31]:
# Waiting Times - Classification - Logistic Regression

## Hyperparameter tuning with cross-validation

cv_clf = LogisticRegression()
param_search = {'max_iter': [1000, 10000], 'C': [0.1, 0.01, 0.001, 0.0001], 'random_state': [0]}
gsearch = GridSearchCV(estimator=cv_clf, cv=tscv, param_grid=param_search, scoring=fbeta_scorer)
gsearch.fit(X_train, y_train.values.ravel())
print(gsearch.best_estimator_)

## Test set evaluation

y_pred = gsearch.predict(X_test)

print("MCC: " + str(matthews_corrcoef(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred)))
print("Precision score: " + str(precision_score(y_test, y_pred)))
print("Recall score: " + str(recall_score(y_test, y_pred)))
print("FBeta score: " + str(fbeta_score(y_test, y_pred, beta=beta)))

LogisticRegression(C=0.1, max_iter=1000, random_state=0)
MCC: 0.0
F1 score: 0.958216849263317
Precision score: 0.9197853205686104
Recall score: 1.0
FBeta score: 0.9241459128123795


In [32]:
# Waiting Times - Classification - Nearest Neighbors

## Hyperparameter tuning with cross-validation

cv_clf = KNeighborsClassifier()
param_search = {'n_neighbors': [5, 10, 15, 25, 50]}
gsearch = GridSearchCV(estimator=cv_clf, cv=tscv, param_grid=param_search, scoring=fbeta_scorer)
gsearch.fit(X_train, y_train.values.ravel())
print(gsearch.best_estimator_)

## Test set evaluation

y_pred = gsearch.predict(X_test)

print("MCC: " + str(matthews_corrcoef(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred)))
print("Precision score: " + str(precision_score(y_test, y_pred)))
print("Recall score: " + str(recall_score(y_test, y_pred)))
print("FBeta score: " + str(fbeta_score(y_test, y_pred, beta=beta)))

KNeighborsClassifier(n_neighbors=50)
MCC: 0.0027762111699319997
F1 score: 0.9575803402646502
Precision score: 0.9198140615920977
Recall score: 0.9985806655101719
FBeta score: 0.9241018156844228


In [33]:
# Waiting Times - Classification - Naive Bayes

## Hyperparameter tuning with cross-validation

cv_clf = GaussianNB()
param_search = {'var_smoothing': [1e-11, 1e-10, 1e-9]}
gsearch = GridSearchCV(estimator=cv_clf, cv=tscv, param_grid=param_search, scoring=fbeta_scorer)
gsearch.fit(X_train, y_train.values.ravel())
print(gsearch.best_estimator_)

## Test set evaluation

y_pred = gsearch.predict(X_test)

print("MCC: " + str(matthews_corrcoef(y_test, y_pred)))
print("F1 score: " + str(f1_score(y_test, y_pred)))
print("Precision score: " + str(precision_score(y_test, y_pred)))
print("Recall score: " + str(recall_score(y_test, y_pred)))
print("FBeta score: " + str(fbeta_score(y_test, y_pred, beta=beta)))

GaussianNB(var_smoothing=1e-11)
MCC: 0.04268542482587055
F1 score: 0.9569037339998485
Precision score: 0.9205770912270476
Recall score: 0.9962151080271251
FBeta score: 0.9247070169546985
