In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from pprint import pprint
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the training measurements (Colab-style absolute path).
df = pd.read_csv('/content/payload-core-mem-full-dataset-train-3-quarter-3.csv')

# Alternative training subsets, kept for reference. Each keeps only certain
# request-rate bands per model; `og_df` is presumably the unfiltered frame
# (it is not defined in this notebook) -- TODO confirm before re-enabling.
# df = og_df[((og_df['ml_model'] != 'mnist') & ((og_df['rate'] <= 25) | ((og_df['rate'] >= 45) & (og_df['rate'] <= 65)) | (og_df['rate'] >= 85))) | ((og_df['ml_model'] == 'mnist') & ((og_df['rate'] <= 135) | ((og_df['rate'] >= 225) & (og_df['rate'] <= 295)) | (og_df['rate'] >= 405)))]
# df = og_df[((og_df['ml_model'] != 'mnist') & ((og_df['rate'] <= 45) | (og_df['rate'] >= 85))) | ((og_df['ml_model'] == 'mnist') & ((og_df['rate'] <= 225) | (og_df['rate'] >= 445)))]
# df = og_df[((og_df['ml_model'] != 'mnist') & ((og_df['rate'] <= 25) | (og_df['rate'] >= 65))) | ((og_df['ml_model'] == 'mnist') & ((og_df['rate'] <= 135) | (og_df['rate'] >= 345)))]

# Input features used by every model in this notebook.
selected_columns = ['rate','memory','payload_instance','cores','ml_model']

# Take an explicit copy: the encoding below assigns into X, and assigning into
# a slice of df raises SettingWithCopyWarning and may or may not touch df.
X = df[selected_columns].copy()
# Target stays a one-column DataFrame because later cells concatenate it
# column-wise with the prediction frames.
y = df[['response_time']]

# Replace the model-name strings with integer codes.
string_encoder = preprocessing.LabelEncoder()
X['ml_model'] = string_encoder.fit_transform(X['ml_model'])

# Common prefix of the CSV files written by later cells.
file_prefix = 'predict_response_time_payload_3-3-3_combo_errorfree'

In [None]:
# Scatter of response time against request rate for the training data.
df.plot(style='o', x='rate', y=['response_time'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)

## Linear Regression with Polynomial Features

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Degree-4 polynomial expansion of the inputs, followed by ordinary least squares.
poly_features = PolynomialFeatures(degree=4)

# Fit the transformer on the training split only and reuse it on the test
# split; a preprocessing step must never be re-fitted on held-out data.
X_poly_train = poly_features.fit_transform(X_train)
X_poly_test = poly_features.transform(X_test)

lin_reg = LinearRegression()
lin_reg.fit(X_poly_train, y_train)

# Predictions as a frame with the same column name as the target.
y_pred = lin_reg.predict(X_poly_test)
predicted_df = pd.DataFrame(y_pred)
predicted_df.columns = ['response_time']

# Side-by-side predicted/actual values; y_test is renumbered from 0 so the
# rows line up with the freshly built prediction frame.
errors = pd.concat([predicted_df, y_test.reset_index(drop=True)], keys=['predicted', 'actual'], axis=1)
# Absolute relative error per row, in percent.
errors['error'] = (((errors['predicted'] - errors['actual']).abs())/errors['actual'])*100
print('Average Relative Error = ', errors['error'].mean())

## Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline random forest with default hyper-parameters and a fixed seed.
regressor = RandomForestRegressor(random_state = 0)
# y_train is a one-column DataFrame; pass it as a 1-D array, which is what the
# estimator expects (a column vector triggers a DataConversionWarning).
regressor.fit(X_train, y_train.values.ravel())

In [None]:
# Validate the default random forest on the hold-out split.
y_pred = regressor.predict(X_test)
predicted_df = pd.DataFrame(y_pred, columns=['response_time'])

# Predicted and actual values side by side, both numbered from 0.
actual_df = y_test.reset_index(drop=True)
errors = pd.concat([predicted_df, actual_df], keys=['predicted', 'actual'], axis=1)

# Absolute relative error per row, in percent.
abs_diff = (errors['predicted'] - errors['actual']).abs()
errors['error'] = (abs_diff / errors['actual']) * 100
print('Average Relative Error = ', errors['error'].mean())

In [None]:
# Impurity-based feature importances of the default forest, one bar per input column.
importances = regressor.feature_importances_
forest_importances = pd.Series(data=importances, index=selected_columns)

fig, ax = plt.subplots()
forest_importances.plot(kind='bar', ax=ax)
ax.set_title("Feature importances using MDI")
fig.tight_layout()

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each random-forest hyper-parameter.
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 500, num = 50)]
# 'auto' was deprecated and later removed from scikit-learn; for a forest
# regressor it meant "use all features", which is spelled 1.0.
max_features = [1.0, 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)  # also allow unlimited depth
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

# Seed the forest as well as the search so a re-run reproduces the same result.
new_reg = RandomForestRegressor(random_state = 0)
# 100 random configurations, 3-fold cross-validation, all CPU cores.
regressor_random = RandomizedSearchCV(estimator = new_reg, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# 1-D target avoids a DataConversionWarning on every fit.
regressor_random.fit(X_train, y_train.values.ravel())

In [None]:
# Validate the best randomized-search model on the hold-out split.
y_pred = regressor_random.predict(X_test)
predicted_df = pd.DataFrame(y_pred, columns=['response_time'])

actual_df = y_test.reset_index(drop=True)
errors = pd.concat([predicted_df, actual_df], keys=['predicted', 'actual'], axis=1)

# Absolute relative error per row, in percent.
errors['error'] = errors['predicted'].sub(errors['actual']).abs().div(errors['actual']).mul(100)
mean_error = errors['error'].mean()
print('Average Relative Error = ', mean_error)

# Persist the per-row results, then show the winning hyper-parameters.
errors.to_csv(file_prefix + 'rf_random.csv')
regressor_random.best_params_

In [None]:
# Exhaustive search over a narrowed grid based on the results of the random
# search. GridSearchCV is already imported in the notebook's first cell.
param_grid = {
    'bootstrap': [True],
    'max_depth': [None, 10, 20, 60, 80, 90],
    # 'auto' was deprecated and later removed from scikit-learn; for a forest
    # regressor it meant "use all features", which is spelled 1.0.
    'max_features': [1.0, 'sqrt'],
    'min_samples_leaf': [1, 2, 4, 5],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [90, 480, 180, 40, 350]
}
# Seeded so that repeated runs select the same configuration.
grid_reg = RandomForestRegressor(random_state = 0)
grid_search = GridSearchCV(estimator = grid_reg, param_grid = param_grid,
                          cv = 3, n_jobs = -1, verbose = 2)
# 1-D target avoids a DataConversionWarning on every fit.
grid_search.fit(X_train, y_train.values.ravel())
grid_search.best_params_

In [None]:
# Validate the best grid-search model on the hold-out split.
y_pred = grid_search.predict(X_test)
predicted_df = pd.DataFrame(y_pred, columns=['response_time'])

actual_df = y_test.reset_index(drop=True)
errors = pd.concat([predicted_df, actual_df], keys=['predicted', 'actual'], axis=1)

# Absolute relative error per row, in percent.
abs_diff = (errors['predicted'] - errors['actual']).abs()
errors['error'] = (abs_diff / errors['actual']) * 100
print('Average Relative Error = ', errors['error'].mean())

errors.to_csv(file_prefix + 'rf_gridsearch.csv')

In [None]:
# Distribution of the relative error on the hold-out split.
# predicted_df is numbered 0..n-1 while y_test still carries its original
# (shuffled) row labels, so subtracting the frames directly aligns on labels
# and pairs unrelated rows / produces NaN. Renumber y_test to compare by position.
# NOTE(review): distplot is deprecated in recent seaborn releases; switch to
# histplot/displot once the installed seaborn version is confirmed.
actual_df = y_test.reset_index(drop=True)
sns.distplot(abs(actual_df - predicted_df) / actual_df)

In [None]:
# Actual vs. predicted response time on the hold-out split, for whichever
# model most recently filled predicted_df. Axes are labelled so the figure
# can be read on its own.
plt.scatter(y_test,predicted_df)
plt.xlabel('actual response_time')
plt.ylabel('predicted response_time')
plt.title('Actual vs. predicted (hold-out split)');

In [None]:
# Refit a forest with hand-copied hyper-parameters and plot its feature importances.
# NOTE(review): the values are hard-coded, presumably from an earlier
# grid_search.best_params_ -- confirm they still match.
# max_features=1.0 replaces 'auto' (removed from scikit-learn; for a forest
# regressor it meant "all features"); the seed makes the importances repeatable.
grid_search_best_model = RandomForestRegressor(bootstrap=True,max_depth=90,max_features=1.0, \
                                               min_samples_leaf=2,min_samples_split=2,n_estimators=90, \
                                               random_state=0)
# 1-D target avoids a DataConversionWarning.
grid_search_best_model.fit(X_train, y_train.values.ravel())
importances = grid_search_best_model.feature_importances_
forest_importances = pd.Series(importances, index=selected_columns)

fig, ax = plt.subplots()
forest_importances.plot.bar(ax=ax)
ax.set_title("Feature importances using MDI")
fig.tight_layout()

In [None]:
############ new test file ############

# Separate measurement file used as an unseen test set by the cells below.
test_10c = pd.read_csv('/content/payload-core-mem-full-dataset-test-12-full-single.csv')
# test_10c = og_test_10c[(og_test_10c['ml_model'] != 'mnist') & (og_test_10c['ml_model'] != 'bert')]

# Response time against request rate for the unseen data.
test_10c.plot(style='o', x='rate', y=['response_time'])

In [None]:
########## tests with default regressor #########

# Drop incomplete rows from features and labels together so the two stay
# row-aligned (dropping them separately can remove different rows), and
# renumber from 0 so they line up with prediction frames built from arrays.
test_frame = test_10c[selected_columns + ['response_time']].dropna(axis=0).reset_index(drop=True)

# Explicit copy: the encoding below assigns into test_data.
test_data = test_frame[selected_columns].copy()
# Encode model names with the classes of the TRAINING data. Re-fitting an
# encoder on the test file can assign different integers to the same model
# name. A model name absent from training raises ValueError here.
string_encoder = preprocessing.LabelEncoder().fit(df['ml_model'])
test_data['ml_model'] = string_encoder.transform(test_data['ml_model'])
test_labels = test_frame[['response_time']]

# Predictions of the default random forest on the unseen data.
predicted_labels = regressor.predict(test_data)
predicted_label_df = pd.DataFrame(predicted_labels)
predicted_label_df.columns = ['response_time']

In [None]:
# Predicted vs. actual response time on the unseen data, both numbered from 0.
errors = pd.concat([predicted_label_df, test_labels.reset_index(drop=True)], keys=['predicted', 'actual'], axis=1)
# Column assignment aligns on row labels. errors is numbered 0..n-1, so the
# feature columns are renumbered the same way; otherwise rows removed by an
# earlier dropna would shift rate/cores onto the wrong predictions.
errors['rate'] = test_data['rate'].reset_index(drop=True)
errors['cores'] = test_data['cores'].reset_index(drop=True)
# Absolute relative error per row, in percent.
errors['error'] = (((errors['predicted'] - errors['actual']).abs())/errors['actual'])*100
print('Average Relative Error = ', errors['error'].mean())
errors.to_csv('rf_default_unseen.csv')
# Show the hyper-parameters of the model that produced these predictions.
regressor.get_params()

In [None]:
########## tests with random regressor #########

# Drop incomplete rows from features and labels together and renumber from 0,
# so features, labels and predictions all share the same row positions.
test_frame = test_10c[selected_columns + ['response_time']].dropna(axis=0).reset_index(drop=True)

# Explicit copy: the encoding below assigns into test_data.
test_data = test_frame[selected_columns].copy()
# Encode with the TRAINING classes; an encoder re-fitted on the test file can
# map the same model name to a different integer. Unknown names raise ValueError.
string_encoder = preprocessing.LabelEncoder().fit(df['ml_model'])
test_data['ml_model'] = string_encoder.transform(test_data['ml_model'])

# The search was fitted on response_time, so that is the column to compare
# against (the previous version compared the predictions with 'cores').
test_labels = test_frame[['response_time']]
predicted_labels = regressor_random.predict(test_data)
predicted_label_df = pd.DataFrame(predicted_labels)
predicted_label_df.columns = ['response_time']

errors = pd.concat([predicted_label_df, test_labels.reset_index(drop=True)], keys=['predicted', 'actual'], axis=1)
# Context columns for the per-row report (ml_model is the integer code).
errors['rate'] = test_data['rate']
errors['ml_model'] = test_data['ml_model']
errors['cores'] = test_data['cores']
# Absolute relative error per row, in percent.
errors['error'] = (((errors['predicted'] - errors['actual']).abs())/errors['actual'])*100
print('Average Relative Error = ', errors['error'].mean())
errors.to_csv(file_prefix+'rf_randomsearch.csv')
# Report the hyper-parameters of the model evaluated in this cell.
regressor_random.best_params_

## Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Single decision tree with default hyper-parameters and a fixed seed.
dt = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
dt

In [None]:
# Validate the default decision tree on the hold-out split.
y_pred = dt.predict(X_test)
predicted_df = pd.DataFrame(y_pred, columns=['response_time'])

actual_df = y_test.reset_index(drop=True)
errors = pd.concat([predicted_df, actual_df], keys=['predicted', 'actual'], axis=1)

# Absolute relative error per row, in percent.
errors['error'] = errors['predicted'].sub(errors['actual']).abs().div(errors['actual']).mul(100)
print('Average Relative Error = ', errors['error'].mean())

errors.to_csv(file_prefix + 'dt.csv')

In [None]:
# Evaluate the default decision tree on the unseen test file.

# Drop incomplete rows from features and labels together (previously only the
# labels were filtered, which can leave the two with different lengths) and
# renumber from 0 so everything shares the same row positions.
test_frame = test_10c[selected_columns + ['response_time']].dropna(axis=0).reset_index(drop=True)

# Explicit copy: the encoding below assigns into test_data.
test_data = test_frame[selected_columns].copy()
# Encode with the TRAINING classes; an encoder re-fitted on the test file can
# map the same model name to a different integer. Unknown names raise ValueError.
string_encoder = preprocessing.LabelEncoder().fit(df['ml_model'])
test_data['ml_model'] = string_encoder.transform(test_data['ml_model'])
test_labels = test_frame[['response_time']]

predicted_labels = dt.predict(test_data)
predicted_label_df = pd.DataFrame(predicted_labels)
predicted_label_df.columns = ['response_time']

errors = pd.concat([predicted_label_df, test_labels.reset_index(drop=True)], keys=['predicted', 'actual'], axis=1)
# Context columns for the per-row report (ml_model is the integer code).
errors['rate'] = test_data['rate']
errors['ml_model'] = test_data['ml_model']
errors['cores'] = test_data['cores']
# Absolute relative error per row, in percent.
errors['error'] = (((errors['predicted'] - errors['actual']).abs())/errors['actual'])*100
print('Average Relative Error = ', errors['error'].mean())
errors.to_csv('dt_default_unseen.csv')
# Hyper-parameters of the evaluated tree.
dt.get_params()

In [None]:
# Distribution of the residuals (actual - predicted) on the hold-out split.
# predicted_df is numbered 0..n-1 while y_test still carries its original
# (shuffled) row labels; subtracting the frames directly aligns on labels and
# pairs unrelated rows / produces NaN, so y_test is renumbered first.
# NOTE(review): distplot is deprecated in recent seaborn releases; switch to
# histplot/displot once the installed seaborn version is confirmed.
sns.distplot(y_test.reset_index(drop=True) - predicted_df)

In [None]:
# Actual vs. predicted response time on the hold-out split, for whichever
# model most recently filled predicted_df. Axes are labelled so the figure
# can be read on its own.
plt.scatter(y_test,predicted_df)
plt.xlabel('actual response_time')
plt.ylabel('predicted response_time')
plt.title('Actual vs. predicted (hold-out split)');

In [None]:
# Hyper-parameter ranges for tuning the decision tree.
# 'auto' is no longer listed under max_features: it was removed from
# scikit-learn and, for a tree regressor, meant "all features" -- the same
# setting as None, which is still in the grid.
parameters={"splitter":["best","random"],
            "max_depth" : [None, 1,5,10,15],
            "min_samples_leaf":[1,2,3,4,5,7,9,11],
            "min_samples_split":[2,5,10,15],
            "min_weight_fraction_leaf":[0.1,0.3,0.5],
            "max_features":["log2","sqrt",None] }
# Exhaustive 3-fold search scored by negative mean squared error.
tuning_model=GridSearchCV(dt,param_grid=parameters,scoring='neg_mean_squared_error',cv=3,verbose=3)
tuning_model.fit(X_train,y_train)
tuning_model.best_params_

In [None]:
# Decision tree with hand-copied hyper-parameters.
# NOTE(review): the values are hard-coded, presumably from an earlier
# tuning_model.best_params_ -- confirm they still match.
# max_features=None replaces 'auto', which was removed from scikit-learn and
# meant "all features" for a tree regressor.
tuned_hyper_model= DecisionTreeRegressor(max_depth=None,max_features=None,min_samples_leaf=1,min_samples_split=2,min_weight_fraction_leaf=0.1,splitter='best')
tuned_hyper_model.fit(X_train, y_train)

In [None]:
# Validate the tuned decision tree on the hold-out split.
y_pred = tuned_hyper_model.predict(X_test)
predicted_df = pd.DataFrame(y_pred, columns=['response_time'])

actual_df = y_test.reset_index(drop=True)
errors = pd.concat([predicted_df, actual_df], keys=['predicted', 'actual'], axis=1)

# Absolute relative error per row, in percent.
abs_diff = (errors['predicted'] - errors['actual']).abs()
errors['error'] = (abs_diff / errors['actual']) * 100
print('Average Relative Error = ', errors['error'].mean())

errors.to_csv(file_prefix + 'dt_gridsearch.csv')

In [None]:
# Evaluate the tuned decision tree on the unseen test file. The test_data and
# test_labels built here are reused by the model sections that follow.

# Drop incomplete rows from features and labels together so the two stay
# row-aligned (dropping them separately can remove different rows), and
# renumber from 0.
test_frame = test_10c[selected_columns + ['response_time']].dropna(axis=0).reset_index(drop=True)

# Explicit copy: the encoding below assigns into test_data.
test_data = test_frame[selected_columns].copy()
# Encode with the TRAINING classes; an encoder re-fitted on the test file can
# map the same model name to a different integer. Unknown names raise ValueError.
string_encoder = preprocessing.LabelEncoder().fit(df['ml_model'])
test_data['ml_model'] = string_encoder.transform(test_data['ml_model'])
test_labels = test_frame[['response_time']]

predicted_labels = tuned_hyper_model.predict(test_data)
predicted_label_df = pd.DataFrame(predicted_labels)
predicted_label_df.columns = ['response_time']

errors = pd.concat([predicted_label_df, test_labels.reset_index(drop=True)], keys=['predicted', 'actual'], axis=1)
# Absolute relative error per row, in percent.
errors['error'] = (((errors['predicted'] - errors['actual']).abs())/errors['actual'])*100
print('Average Relative Error = ', errors['error'].mean())
errors.to_csv(file_prefix+'dt_gridsearch_unseen.csv')

# Hyper-parameters of the evaluated tree.
tuned_hyper_model.get_params()

## Ridge Regression

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression with regularisation strength alpha=0.01.
rr = Ridge(alpha=0.01).fit(X_train, y_train)
rr

In [None]:
# Validate the ridge model on the hold-out split.
y_pred = rr.predict(X_test)
predicted_df = pd.DataFrame(y_pred, columns=['response_time'])

actual_df = y_test.reset_index(drop=True)
errors = pd.concat([predicted_df, actual_df], keys=['predicted', 'actual'], axis=1)

# Absolute relative error per row, in percent.
errors['error'] = errors['predicted'].sub(errors['actual']).abs().div(errors['actual']).mul(100)
print('Average Relative Error = ', errors['error'].mean())

In [None]:
# Evaluate the ridge model on the unseen test data prepared in an earlier cell.
predicted_labels = rr.predict(test_data)
predicted_label_df = pd.DataFrame(predicted_labels, columns=['response_time'])

actual_label_df = test_labels.reset_index(drop=True)
errors = pd.concat([predicted_label_df, actual_label_df], keys=['predicted', 'actual'], axis=1)

# Absolute relative error per row, in percent.
abs_diff = (errors['predicted'] - errors['actual']).abs()
errors['error'] = (abs_diff / errors['actual']) * 100
mean_error = errors['error'].mean()
print('Average Relative Error = ', mean_error)
errors.to_csv(file_prefix + 'rr_default.csv')

# Number of rows that contributed a (non-NaN) error value.
errors['error'].count()

## Elastic Net Regression

In [None]:
from sklearn.linear_model import ElasticNet

# Elastic net with regularisation strength alpha=0.01.
model_enet = ElasticNet(alpha = 0.01).fit(X_train, y_train)
model_enet

In [None]:
# Validate the elastic-net model on the hold-out split.
y_pred = model_enet.predict(X_test)
predicted_df = pd.DataFrame(y_pred, columns=['response_time'])

actual_df = y_test.reset_index(drop=True)
errors = pd.concat([predicted_df, actual_df], keys=['predicted', 'actual'], axis=1)

# Absolute relative error per row, in percent.
abs_diff = (errors['predicted'] - errors['actual']).abs()
errors['error'] = (abs_diff / errors['actual']) * 100
print('Average Relative Error = ', errors['error'].mean())


In [None]:
# Evaluate the elastic-net model on the unseen test data prepared in an earlier cell.
predicted_labels = model_enet.predict(test_data)
predicted_label_df = pd.DataFrame(predicted_labels, columns=['response_time'])

actual_label_df = test_labels.reset_index(drop=True)
errors = pd.concat([predicted_label_df, actual_label_df], keys=['predicted', 'actual'], axis=1)

# Absolute relative error per row, in percent.
errors['error'] = errors['predicted'].sub(errors['actual']).abs().div(errors['actual']).mul(100)
mean_error = errors['error'].mean()
print('Average Relative Error = ', mean_error)
errors.to_csv(file_prefix + 'en_default.csv')

# Number of rows that contributed a (non-NaN) error value.
errors['error'].count()

## Support Vector Regression


In [None]:
from sklearn.svm import SVR

# Support vector regression with an RBF kernel and default hyper-parameters.
# NOTE(review): the inputs are not scaled before fitting; RBF kernels are
# usually sensitive to feature scale -- consider a scaler if accuracy is poor.
svm_regressor = SVR(kernel='rbf')
# y_train is a one-column DataFrame; pass it as a 1-D array, which is what the
# estimator expects (a column vector triggers a DataConversionWarning).
svm_regressor.fit(X_train,y_train.values.ravel())

In [None]:
# Validate the SVR model on the hold-out split.
y_pred = svm_regressor.predict(X_test)
predicted_df = pd.DataFrame(y_pred)
predicted_df.columns = ['response_time']

# Predicted and actual values side by side, both numbered from 0.
errors = pd.concat([predicted_df, y_test.reset_index(drop=True)], keys=['predicted', 'actual'], axis=1)
# Absolute relative error per row, in percent.
errors['error'] = (((errors['predicted'] - errors['actual']).abs())/errors['actual'])*100
print('Average Relative Error = ', errors['error'].mean())
# Written to its own file: the previous name, 'rr_default.csv', is the ridge
# section's output file and was being overwritten with SVR results.
errors.to_csv(file_prefix+'svm.csv')

In [None]:
# Evaluate the SVR model on the unseen test data prepared in an earlier cell.
predicted_labels = svm_regressor.predict(test_data)
predicted_label_df = pd.DataFrame(predicted_labels, columns=['response_time'])

actual_label_df = test_labels.reset_index(drop=True)
errors = pd.concat([predicted_label_df, actual_label_df], keys=['predicted', 'actual'], axis=1)

# Absolute relative error per row, in percent.
abs_diff = (errors['predicted'] - errors['actual']).abs()
errors['error'] = (abs_diff / errors['actual']) * 100
mean_error = errors['error'].mean()
print('Average Relative Error = ', mean_error)
errors.to_csv(file_prefix + 'svm_default.csv')

# Number of rows that contributed a (non-NaN) error value.
errors['error'].count()

## AdaBoost Regression

In [None]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost with 50 estimators and a fixed seed.
ada_regr = AdaBoostRegressor(random_state=0, n_estimators=50)
# y_train is a one-column DataFrame; pass it as a 1-D array, which is what the
# estimator expects (a column vector triggers a DataConversionWarning).
ada_regr.fit(X_train,y_train.values.ravel())

# Validate on the hold-out split.
y_pred = ada_regr.predict(X_test)
predicted_df = pd.DataFrame(y_pred)
predicted_df.columns = ['response_time']
# Predicted and actual values side by side, both numbered from 0.
errors = pd.concat([predicted_df, y_test.reset_index(drop=True)], keys=['predicted', 'actual'], axis=1)
# Absolute relative error per row, in percent.
errors['error'] = (((errors['predicted'] - errors['actual']).abs())/errors['actual'])*100
print('Average Relative Error = ', errors['error'].mean())

In [None]:
# Evaluate the AdaBoost model on the unseen test data prepared in an earlier cell.
predicted_labels = ada_regr.predict(test_data)
predicted_label_df = pd.DataFrame(predicted_labels, columns=['response_time'])

actual_label_df = test_labels.reset_index(drop=True)
errors = pd.concat([predicted_label_df, actual_label_df], keys=['predicted', 'actual'], axis=1)

# Absolute relative error per row, in percent.
errors['error'] = errors['predicted'].sub(errors['actual']).abs().div(errors['actual']).mul(100)
mean_error = errors['error'].mean()
print('Average Relative Error = ', mean_error)
errors.to_csv(file_prefix + 'ad_default.csv')

# Number of rows that contributed a (non-NaN) error value.
errors['error'].count()

## Gradient Boosting Regression

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with default hyper-parameters and a fixed seed.
grad_reg = GradientBoostingRegressor(random_state=0)
# y_train is a one-column DataFrame; pass it as a 1-D array, which is what the
# estimator expects (a column vector triggers a DataConversionWarning).
grad_reg.fit(X_train,y_train.values.ravel())

# Validate on the hold-out split.
y_pred = grad_reg.predict(X_test)
predicted_df = pd.DataFrame(y_pred)
predicted_df.columns = ['response_time']
# Predicted and actual values side by side, both numbered from 0.
errors = pd.concat([predicted_df, y_test.reset_index(drop=True)], keys=['predicted', 'actual'], axis=1)
# Absolute relative error per row, in percent.
errors['error'] = (((errors['predicted'] - errors['actual']).abs())/errors['actual'])*100
print('Average Relative Error = ', errors['error'].mean())

In [None]:
# Evaluate the gradient-boosting model on the unseen test data prepared in an earlier cell.
predicted_labels = grad_reg.predict(test_data)
predicted_label_df = pd.DataFrame(predicted_labels, columns=['response_time'])

actual_label_df = test_labels.reset_index(drop=True)
errors = pd.concat([predicted_label_df, actual_label_df], keys=['predicted', 'actual'], axis=1)

# Absolute relative error per row, in percent.
abs_diff = (errors['predicted'] - errors['actual']).abs()
errors['error'] = (abs_diff / errors['actual']) * 100
mean_error = errors['error'].mean()
print('Average Relative Error = ', mean_error)
errors.to_csv(file_prefix + 'gb_default.csv')

# Number of rows that contributed a (non-NaN) error value.
errors['error'].count()

## Extreme Gradient Boosting (XGBoost)

In [None]:
from xgboost import XGBRegressor

# XGBoost regressor with 10 trees and a fixed seed.
# 'reg:squarederror' is the current name of the squared-error objective;
# the old alias 'reg:linear' is deprecated.
xgb_reg = XGBRegressor(objective ='reg:squarederror',
                  n_estimators = 10, seed = 123)
xgb_reg.fit(X_train,y_train)

# Validate on the hold-out split.
y_pred = xgb_reg.predict(X_test)
predicted_df = pd.DataFrame(y_pred)
predicted_df.columns = ['response_time']
# Predicted and actual values side by side, both numbered from 0.
errors = pd.concat([predicted_df, y_test.reset_index(drop=True)], keys=['predicted', 'actual'], axis=1)
# Absolute relative error per row, in percent.
errors['error'] = (((errors['predicted'] - errors['actual']).abs())/errors['actual'])*100
print('Average Relative Error = ', errors['error'].mean())

In [None]:
# Evaluate the XGBoost model on the unseen test data prepared in an earlier cell.
predicted_labels = xgb_reg.predict(test_data)
predicted_label_df = pd.DataFrame(predicted_labels, columns=['response_time'])

actual_label_df = test_labels.reset_index(drop=True)
errors = pd.concat([predicted_label_df, actual_label_df], keys=['predicted', 'actual'], axis=1)

# Absolute relative error per row, in percent.
errors['error'] = errors['predicted'].sub(errors['actual']).abs().div(errors['actual']).mul(100)
mean_error = errors['error'].mean()
print('Average Relative Error = ', mean_error)
errors.to_csv(file_prefix + 'xgb_default.csv')

# Number of rows that contributed a (non-NaN) error value.
errors['error'].count()