In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import validation_curve
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge

from sklearn.neural_network import MLPRegressor

from sklearn.metrics import mean_squared_error

import time

pd.set_option('display.max_rows', None)

In [21]:
def load_data():
    df = pd.read_csv('../data/train_workloads_final_110_clusters_50_batch.csv')
    
    feature_cols = ['cluster_0', 'cluster_1', 'cluster_2', 'cluster_3', 'cluster_4', 'cluster_5', 'cluster_6', 'cluster_7', 'cluster_8', 'cluster_9', 'cluster_10', 'cluster_11', 'cluster_12', 'cluster_13', 'cluster_14', 'cluster_15', 'cluster_16', 'cluster_17', 'cluster_18', 'cluster_19', 'cluster_20', 'cluster_21', 'cluster_22', 'cluster_23', 'cluster_24', 'cluster_25', 'cluster_26', 'cluster_27', 'cluster_28', 'cluster_29', 'cluster_30', 'cluster_31', 'cluster_32', 'cluster_33', 'cluster_34', 'cluster_35', 'cluster_36', 'cluster_37', 'cluster_38', 'cluster_39', 'cluster_40', 'cluster_41', 'cluster_42', 'cluster_43', 'cluster_44', 'cluster_45', 'cluster_46', 'cluster_47', 'cluster_48', 'cluster_49', 'cluster_50', 'cluster_51', 'cluster_52', 'cluster_53', 'cluster_54', 'cluster_55', 'cluster_56', 'cluster_57', 'cluster_58', 'cluster_59', 'cluster_60', 'cluster_61', 'cluster_62', 'cluster_63', 'cluster_64', 'cluster_65', 'cluster_66', 'cluster_67', 'cluster_68', 'cluster_69', 'cluster_70', 'cluster_71', 'cluster_72', 'cluster_73', 'cluster_74', 'cluster_75', 'cluster_76', 'cluster_77', 'cluster_78', 'cluster_79', 'cluster_80', 'cluster_81', 'cluster_82', 'cluster_83', 'cluster_84', 'cluster_85', 'cluster_86', 'cluster_87', 'cluster_88', 'cluster_89', 'cluster_90', 'cluster_91', 'cluster_92', 'cluster_93', 'cluster_94', 'cluster_95', 'cluster_96', 'cluster_97', 'cluster_98', 'cluster_99', 'cluster_100', 'cluster_101', 'cluster_102', 'cluster_103', 'cluster_104', 'cluster_105', 'cluster_106', 'cluster_107', 'cluster_108', 'cluster_109']
    target_col = ['actual']
    
    X = df[feature_cols]
    y = df[target_col]
    
    print('X.shape: ', X.shape)
    print('y.shape: ', y.shape)
    
    return X.values, y.values

def load_test_data():
    df = pd.read_csv('../data/test_workloads_final_110_clusters_50_batch.csv')
    
    feature_cols = ['cluster_0', 'cluster_1', 'cluster_2', 'cluster_3', 'cluster_4', 'cluster_5', 'cluster_6', 'cluster_7', 'cluster_8', 'cluster_9', 'cluster_10', 'cluster_11', 'cluster_12', 'cluster_13', 'cluster_14', 'cluster_15', 'cluster_16', 'cluster_17', 'cluster_18', 'cluster_19', 'cluster_20', 'cluster_21', 'cluster_22', 'cluster_23', 'cluster_24', 'cluster_25', 'cluster_26', 'cluster_27', 'cluster_28', 'cluster_29', 'cluster_30', 'cluster_31', 'cluster_32', 'cluster_33', 'cluster_34', 'cluster_35', 'cluster_36', 'cluster_37', 'cluster_38', 'cluster_39', 'cluster_40', 'cluster_41', 'cluster_42', 'cluster_43', 'cluster_44', 'cluster_45', 'cluster_46', 'cluster_47', 'cluster_48', 'cluster_49', 'cluster_50', 'cluster_51', 'cluster_52', 'cluster_53', 'cluster_54', 'cluster_55', 'cluster_56', 'cluster_57', 'cluster_58', 'cluster_59', 'cluster_60', 'cluster_61', 'cluster_62', 'cluster_63', 'cluster_64', 'cluster_65', 'cluster_66', 'cluster_67', 'cluster_68', 'cluster_69', 'cluster_70', 'cluster_71', 'cluster_72', 'cluster_73', 'cluster_74', 'cluster_75', 'cluster_76', 'cluster_77', 'cluster_78', 'cluster_79', 'cluster_80', 'cluster_81', 'cluster_82', 'cluster_83', 'cluster_84', 'cluster_85', 'cluster_86', 'cluster_87', 'cluster_88', 'cluster_89', 'cluster_90', 'cluster_91', 'cluster_92', 'cluster_93', 'cluster_94', 'cluster_95', 'cluster_96', 'cluster_97', 'cluster_98', 'cluster_99', 'cluster_100', 'cluster_101', 'cluster_102', 'cluster_103', 'cluster_104', 'cluster_105', 'cluster_106', 'cluster_107', 'cluster_108', 'cluster_109']
    target_cols = ['db2', 'actual']
    
    X = df[feature_cols]
    Y = df[target_cols]
    
    return X, Y

def my_validation_curve(estimator_name, estimator, param_name, param_range):
    train_scores, valid_scores = validation_curve(estimator, X, y, param_name=param_name,
        param_range=param_range, cv=10, scoring="neg_mean_squared_error",
    )

    train_scores = np.sqrt(np.abs(train_scores))
    valid_scores = np.sqrt(np.abs(valid_scores))
    
    print(len(train_scores))
    print(len(valid_scores))

    train_scores_mean = np.mean(train_scores, axis=1)
    valid_scores_mean = np.mean(valid_scores, axis=1)
    
    title_str = "Validation Curve with " + estimator_name
    plt.title(title_str)
    plt.xlabel(param_name)
    plt.ylabel("RMSE")
    plt.plot(param_range, train_scores_mean, label="train rmse")
    plt.plot(param_range, valid_scores_mean, label="validation rmse")

    plt.legend(loc='lower right')
    plt.show()
    
    train_rmse = [round(elem, 2) for elem in train_scores_mean]
    valid_rmse = [round(elem, 2) for elem in valid_scores_mean]
    
    df_scores = pd.DataFrame({param_name: param_range, 'training rmse': train_rmse, 'validation rmse': valid_rmse})
    print(df_scores)
    
def cross_validate(model):
    # Load data
    X, y = load_data()
    train_data = X.copy()
    train_targets = y.copy()

    k = 10
    num_val_samples = len(train_data) // k
    all_train_scores = []
    all_scores = []
    
    for i in range(k):
        print(f"Processing fold #{i}")
        val_data = train_data[i * num_val_samples: (i + 1) * num_val_samples]
        val_targets = train_targets[i * num_val_samples: (i + 1) * num_val_samples]
        
        partial_train_data = np.concatenate([train_data[:i * num_val_samples],
                                             train_data[(i + 1) * num_val_samples:]], axis=0)
        
        partial_train_targets = np.concatenate([train_targets[:i * num_val_samples],
                                                train_targets[(i + 1) * num_val_samples:]], axis=0)

        model.fit(partial_train_data, partial_train_targets)
    
        train_mse = mean_squared_error(partial_train_targets, model.predict(partial_train_data))
        val_mse = mean_squared_error(val_targets, model.predict(val_data))
    
        all_train_scores.append(train_mse)
        all_scores.append(val_mse)
    
    print('train rmse ', np.sqrt(np.mean(all_train_scores)))
    print('validation rmse ', np.sqrt(np.mean(all_scores)))

# XGB Regressor 

In [22]:
# final model
xgb_regressor = XGBRegressor(
    base_score=0.5,
    booster="gbtree",
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    gamma=0,
    gpu_id=-1,
    interaction_constraints="",
    n_estimators=200,
    learning_rate=1.0,
    max_delta_step=0,
    max_depth=1,
    min_child_weight=2,
    missing=float("nan"),
    monotone_constraints="()",
    n_jobs=-1,
    num_parallel_tree=1,
    random_state=33,
    reg_alpha=1,
    reg_lambda=1.0,
    scale_pos_weight=1,
    subsample=0.9996300055522972,
    tree_method="exact",
    validate_parameters=1,
    verbosity=0,
    silent=False,
    nthread=4,
    seed=33,
)

cross_validate(xgb_regressor)

X.shape:  (1370, 110)
y.shape:  (1370, 1)
Processing fold #0
Processing fold #1
Processing fold #2
Processing fold #3
Processing fold #4
Processing fold #5
Processing fold #6
Processing fold #7
Processing fold #8
Processing fold #9
train rmse  451.28076168214636
validation rmse  587.803025679425









# All final models

In [23]:
models = [xgb_regressor]

In [24]:
X_test, Y_test = load_test_data()

In [25]:
X_test.shape

(342, 110)

In [26]:
Y_test.shape[0]

342

In [27]:
num_test_workloads = Y_test.shape[0]

In [28]:
X_test.shape[0]

342

## Generating Predictions

In [29]:
Y_test['xgb'] = xgb_regressor.predict(X_test.values)

In [30]:
Y_test.head()

Unnamed: 0,db2,actual,xgb
0,15645.956,23842.064,23244.818359
1,13802.936,21645.34,20747.507812
2,16441.604,32478.996,31493.230469
3,19066.052,25228.696,24704.291016
4,11631.22,16488.256,16931.574219


In [31]:
Y_test.head()

Unnamed: 0,db2,actual,xgb
0,15645.956,23842.064,23244.818359
1,13802.936,21645.34,20747.507812
2,16441.604,32478.996,31493.230469
3,19066.052,25228.696,24704.291016
4,11631.22,16488.256,16931.574219


In [32]:
Y_test = Y_test[['actual', 'db2', 'xgb']]

In [33]:
from sklearn.metrics import mean_squared_error

def rmse(Y):
    cols = Y.columns.values[1:]
    rmse_dict = {}
    
    for col in cols:
        rmse = np.round(np.sqrt(mean_squared_error(Y['actual'].values, Y[col].values)))
        rmse_dict[col] = rmse
    
    return rmse_dict

In [34]:
print('TEST RMSE')
rmse(Y_test)

TEST RMSE


{'db2': 8206.0, 'xgb': 593.0}

### calculating Mean Absolute Percent Error (MAPE)

# loading the original test data again for using with the MAPE
the re-loading is done as the RMSE function in the previous step had modified the test dataframe

In [35]:
X_test, Y_test = load_test_data()

In [36]:
Y_test['xgb'] = xgb_regressor.predict(X_test.values)

Y_test = Y_test[['actual', 'db2', 'xgb']]

In [37]:
from sklearn.metrics import mean_absolute_percentage_error

def mape(Y):
    cols = Y.columns.values[1:]
    mape_dict = {}
    
    for col in cols:
        mape = mean_absolute_percentage_error(Y['actual'].values, Y[col].values) * 100
        mape_dict[col] = mape
    
    return mape_dict

In [38]:
print('TEST MAPE')
mape(Y_test)

TEST MAPE


{'db2': 33.1393622386074, 'xgb': 2.2104347531579474}