In [1]:
from pandas import read_csv, get_dummies, DataFrame
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
from copy import deepcopy
%matplotlib inline

In [2]:
# Read data/clean.csv and drop the listed columns in one chained expression,
# then show the (rows, columns) shape of what is left.
df = read_csv('data/clean.csv').drop(
    columns=['id', 'memUsage', 'freeMem', 'totalMem', 'timestamp'],
)
display(df.shape)

(1548823, 10)

In [3]:
# Keys that identify one configuration; every other column is averaged over them.
GROUP_KEYS = ['instanceType', 'serverType', 'databaseType', 'requestSize', 'frequency']
# Subset of the keys that is cast to 'category' and one-hot encoded.
CATEGORICAL_KEYS = ['instanceType', 'serverType', 'databaseType']
# Seed for the row shuffle so that `encoded` has the same row order on every run.
SHUFFLE_SEED = 42

# observed=True changes nothing while the keys are plain columns, but it keeps
# this cell safe to re-run: after the cast below the keys are categorical, and
# with observed=False pandas would also emit rows for category combinations
# that never occur in the data.
df = df.groupby(GROUP_KEYS, observed=True).mean().reset_index()
df = df.astype({key: 'category' for key in CATEGORICAL_KEYS})

# One-hot encode the categorical keys, then shuffle the rows and renumber them.
encoded = (
    get_dummies(df, columns=CATEGORICAL_KEYS)
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)
display(df.shape, encoded.shape)

(1058, 10)

(1058, 15)

In [6]:
def get_top_weights(weights, columns, top_n=2, by_magnitude=False):
    """Name the features with the largest coefficients for each target.

    Args:
        weights: Coefficients with one row per target and one column per
            feature. A 1-D sequence is treated as a single target.
        columns: Feature names, indexable by position, in the same order as
            the coefficient columns.
        top_n: Number of features to report per target (default 2).
        by_magnitude: When False (default) features are ranked by signed
            value, so large negative coefficients are never reported. When
            True they are ranked by absolute value.

    Returns:
        A list with one entry per target. Each entry lists, strongest first,
        the part of the feature name before its first underscore (for a
        one-hot column such as 'instanceType_b1' that is 'instanceType').

    Raises:
        ValueError: If top_n is smaller than 1.
    """
    if top_n < 1:
        # A slice of [-0:] would silently return every feature.
        raise ValueError('top_n must be at least 1')

    # atleast_2d lets a 1-D coefficient vector go through the same per-row loop.
    coefs = np.atleast_2d(weights)
    if by_magnitude:
        coefs = np.abs(coefs)

    names = []
    for row in coefs:
        # argsort is ascending: keep the last top_n positions, largest first.
        strongest = row.argsort()[-top_n:][::-1]
        names.append([columns[idx].split('_')[0] for idx in strongest])
    return names
    

def score(y_true, y_pred):
    """Return the R^2 of y_pred against y_true.

    The call passes multioutput='uniform_average', so a single float is
    returned even when there are several target columns.
    """
    r2 = r2_score(y_true, y_pred, multioutput='uniform_average')
    return r2
    
# Seed for the train/test split and the random forest, so that the scores are
# repeatable for a given row order of `encoded`.
RANDOM_STATE = 42

label_cols = ['timeWrite', 'timeRead', 'timeDelete', 'totalTime', 'clientTotalTime']

# Everything that is not a label column is a feature.
features = encoded.drop(label_cols, axis=1)
target = encoded[label_cols]
columns = features.columns
X_train, X_test, y_train, y_test = train_test_split(
    features.to_numpy(), target.to_numpy(), test_size=0.3, random_state=RANDOM_STATE
)

# The estimators keep their individual names so they can still be inspected
# after this cell has run; fit() below trains them in place.
lreg = LinearRegression()
ri = Ridge()
la = Lasso()
el = ElasticNet()
rf = RandomForestRegressor(n_estimators=1000, random_state=RANDOM_STATE)

estimators = [
    ('LinearRegression', lreg),
    ('Ridge', ri),
    ('Lasso', la),
    ('ElasticNet', el),
    ('RandomForest', rf),
]

# Fit every estimator on the same split and collect one summary dict per model.
models = []
for name, estimator in estimators:
    estimator.fit(X_train, y_train)
    model = {
        'name': name,
        'train_score': score(y_train, estimator.predict(X_train)),
        'test_score': score(y_test, estimator.predict(X_test)),
    }
    # Only the linear models expose coef_; the forest gets no 'weights' entry.
    if hasattr(estimator, 'coef_'):
        model['weights'] = get_top_weights(estimator.coef_, columns)
    models.append(model)

# Worst test score first, best last.
models = sorted(models, key=lambda x: x['test_score'])

for model in models:
    if 'weights' in model:
        print('{}: train={:.5f}, test={:.5f}, importance={}'.format(model['name'], model['train_score'], model['test_score'], model['weights']))
    else:
        print('{}: train={:.5f}, test={:.5f}'.format(model['name'], model['train_score'], model['test_score']))

ElasticNet: train=0.52329, test=0.58407, importance=[['databaseType', 'instanceType'], ['databaseType', 'instanceType'], ['databaseType', 'serverType'], ['databaseType', 'serverType'], ['instanceType', 'serverType']]
Lasso: train=0.64836, test=0.71293, importance=[['databaseType', 'instanceType'], ['databaseType', 'instanceType'], ['databaseType', 'instanceType'], ['databaseType', 'instanceType'], ['instanceType', 'instanceType']]
LinearRegression: train=0.65548, test=0.71367, importance=[['databaseType', 'instanceType'], ['databaseType', 'instanceType'], ['databaseType', 'instanceType'], ['instanceType', 'databaseType'], ['instanceType', 'serverType']]
Ridge: train=0.65547, test=0.71392, importance=[['databaseType', 'instanceType'], ['databaseType', 'instanceType'], ['databaseType', 'instanceType'], ['instanceType', 'databaseType'], ['instanceType', 'serverType']]
RandomForest: train=0.96750, test=0.83653


In [5]:
# Show the feature matrix followed by the target matrix.
display(features, target)

Unnamed: 0,requestSize,frequency,instanceType_b1,instanceType_b2,instanceType_b4,instanceType_b8,serverType_java,serverType_nodejs,databaseType_nosql,databaseType_sql
0,1.0,20.0,1,0,0,0,0,1,0,1
1,1.0,4.0,0,0,0,1,1,0,1,0
2,50.0,60.0,0,1,0,0,1,0,0,1
3,100.0,4.0,0,0,0,1,0,1,0,1
4,100.0,2.0,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
1053,100.0,40.0,0,0,0,1,0,1,0,1
1054,1.0,60.0,0,0,1,0,0,1,0,1
1055,100.0,15.0,1,0,0,0,0,1,0,1
1056,5.0,1.0,1,0,0,0,0,1,0,1


Unnamed: 0,timeWrite,timeRead,timeDelete,totalTime,clientTotalTime
0,7.828221,4.519427,7.071575,260.292434,10035.507157
1,59.227557,29.559499,58.480167,209.148225,336.701461
2,14.425179,5.381078,6.878628,87.070863,8984.898605
3,24.225470,4.713987,5.883090,142.405010,406.910230
4,21.188285,6.372385,5.589958,100.472803,360.832636
...,...,...,...,...,...
1053,90.847619,85.616667,84.080952,455.919048,7564.383333
1054,16.347594,9.844920,22.240642,236.326203,7600.844920
1055,41.390000,22.490000,16.520000,384.150000,8779.160000
1056,23.111940,5.820896,6.932836,320.981343,484.753731
