In [1]:
from pandas import read_csv, get_dummies, DataFrame
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from copy import deepcopy
%matplotlib inline

In [2]:
df = read_csv('data/clean.csv')
df.drop(['id', 'memUsage', 'freeMem', 'totalMem', 'timestamp'], axis=1, inplace=True)
display(df.shape)

(1548823, 10)

In [3]:
df = df.groupby(['instanceType', 'serverType', 'databaseType', 'requestSize', 'frequency']).mean().reset_index()
df['instanceType'] = df['instanceType'].astype('category')
df['serverType'] = df['serverType'].astype('category')
df['databaseType'] = df['databaseType'].astype('category')
encoded = get_dummies(df, columns=['instanceType', 'serverType', 'databaseType']).sample(frac=1).reset_index(drop=True)
display(df.shape, encoded.shape)

(1058, 10)

(1058, 15)

In [4]:
label_cols = ['timeWrite', 'timeRead', 'timeDelete', 'totalTime', 'clientTotalTime']

models_data = {
    'timeWrite': {
        'train_n': 0,
        'test_n': 0,
        'models': []
    },
    'timeRead': {
        'train_n': 0,
        'test_n': 0,
        'models': []
    },
    'timeDelete': {
        'train_n': 0,
        'test_n': 0,
        'models': []
    },
    'totalTime': {
        'train_n': 0,
        'test_n': 0,
        'models': []
    },
    'clientTotalTime': {
        'train_n': 0,
        'test_n': 0,
        'models': []
    },
}

for col in label_cols:
    data = models_data[col]
    
    cols_to_drop = [x for x in label_cols if x != col]
    data_set = encoded.drop(cols_to_drop, axis=1)

    columns = data_set.loc[:, data_set.columns != col].columns
    features = data_set.loc[:, data_set.columns != col].to_numpy()
    target = data_set.loc[:, col].to_numpy()
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)
    
    data['train_n'] = len(X_train)
    data['test_n'] = len(X_test)
    
    
    lreg = LinearRegression().fit(X_train, y_train)
    model = {
        'name': 'LinearRegression',
        'train_score': lreg.score(X_train, y_train),
        'test_score': lreg.score(X_test, y_test),
        'weights': set([columns[x].split('_')[0] for x in lreg.coef_.argsort()[-2:][::-1]]),
    }
    data['models'].append(model)

    
    ri = Ridge().fit(X_train, y_train)
    model = {
        'name': 'Ridge',
        'train_score': ri.score(X_train, y_train),
        'test_score': ri.score(X_test, y_test),
        'weights': set([columns[x].split('_')[0] for x in ri.coef_.argsort()[-2:][::-1]]),
    }
    data['models'].append(model)
    
    la = Lasso().fit(X_train, y_train)
    model = {
        'name': 'Lasso',
        'train_score': la.score(X_train, y_train),
        'test_score': la.score(X_test, y_test),
        'weights': set([columns[x].split('_')[0] for x in la.coef_.argsort()[-2:][::-1]]),
    }
    data['models'].append(model)
    
    rf = RandomForestRegressor(n_estimators=1000).fit(X_train, y_train)
    model = {
        'name': 'RandomForest',
        'train_score': rf.score(X_train, y_train),
        'test_score': rf.score(X_test, y_test),
    }
    data['models'].append(model)
    
data_set

Unnamed: 0,requestSize,frequency,clientTotalTime,instanceType_b1,instanceType_b2,instanceType_b4,instanceType_b8,serverType_java,serverType_nodejs,databaseType_nosql,databaseType_sql
0,500.0,1.0,915.285714,0,0,0,1,1,0,0,1
1,50.0,10.0,391.142977,0,0,1,0,1,0,1,0
2,1.0,4.0,242.904453,1,0,0,0,1,0,0,1
3,50.0,10.0,284.465330,0,0,1,0,1,0,0,1
4,500.0,4.0,1328.052192,0,0,1,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...
1053,1.0,60.0,3439.824531,0,1,0,0,1,0,0,1
1054,1.0,40.0,7663.678322,1,0,0,0,1,0,1,0
1055,5.0,20.0,9947.187032,1,0,0,0,0,1,1,0
1056,500.0,2.0,1777.292683,1,0,0,0,1,0,0,1


In [5]:
for key in models_data:
    val = models_data[key]
    models = sorted(val['models'], key=lambda x: x['test_score'])
    models_data[key]['models'] = models
    

    
for metric in models_data:
    data = models_data[metric]
    print('{}'.format(metric.upper()))
    
    for model in data['models']:
        if 'Random' not in model['name']:
            print('{}: train={:.5f}, test={:.5f}, importance={}'.format(model['name'], model['train_score'], model['test_score'], model['weights']))
        else:
            print('{}: train={:.5f}, test={:.5f}'.format(model['name'], model['train_score'], model['test_score']))
    print()

TIMEWRITE
Lasso: train=0.64816, test=0.70435, importance={'databaseType', 'instanceType'}
Ridge: train=0.65477, test=0.70993, importance={'databaseType', 'instanceType'}
LinearRegression: train=0.65477, test=0.71005, importance={'databaseType', 'instanceType'}
RandomForest: train=0.97496, test=0.88527

TIMEREAD
Lasso: train=0.61021, test=0.68474, importance={'databaseType', 'instanceType'}
LinearRegression: train=0.62578, test=0.69635, importance={'databaseType', 'instanceType'}
Ridge: train=0.62578, test=0.69646, importance={'databaseType', 'instanceType'}
RandomForest: train=0.98106, test=0.86152

TIMEDELETE
Lasso: train=0.80613, test=0.75265, importance={'databaseType', 'instanceType'}
Ridge: train=0.81983, test=0.76643, importance={'databaseType', 'instanceType'}
LinearRegression: train=0.81984, test=0.76644, importance={'databaseType', 'instanceType'}
RandomForest: train=0.98923, test=0.86281

TOTALTIME
Lasso: train=0.69845, test=0.70012, importance={'databaseType', 'instanceType'