In [None]:
from pandas import read_csv, get_dummies, DataFrame
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from copy import deepcopy
%matplotlib inline

In [None]:
# Load the CSV and discard the columns that the rest of the notebook does not use.
unused_columns = ['id', 'memUsage', 'freeMem', 'totalMem', 'timestamp']
df = read_csv('data/clean.csv').drop(columns=unused_columns)

# Show the frame's dimensions, then preview its first rows as the cell output.
display(df.shape)
df.head()

In [None]:
# Collapse the frame to one row per configuration by averaging every remaining
# column within each (instanceType, serverType, databaseType, requestSize,
# frequency) group, then one-hot encode the three type columns.
group_keys = ['instanceType', 'serverType', 'databaseType', 'requestSize', 'frequency']
categorical_cols = ['instanceType', 'serverType', 'databaseType']

# Fixed seed so the shuffled row order is the same on every run.
SHUFFLE_SEED = 42

# observed=True has no effect while the keys are plain columns, but once this cell
# has run the type columns are categorical; without it a re-run would emit a row
# for every category combination (filled with NaN) instead of the same result.
df = df.groupby(group_keys, observed=True).mean().reset_index()
df = df.astype({name: 'category' for name in categorical_cols})

# Encode, shuffle all rows (frac=1) and renumber the index from 0.
encoded = (
    get_dummies(df, columns=categorical_cols)
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)
display(df.shape, encoded.shape)

In [None]:
# Columns that are each used, one at a time, as the regression target; while one
# is the target the other four are removed from the feature set.
label_cols = ['timeWrite', 'timeRead', 'timeDelete', 'totalTime', 'clientTotalTime']

# Fixed seed shared by the train/test split and the random forest so that the
# reported scores are reproducible from a fresh kernel.
RANDOM_STATE = 42
# Number of strongest coefficients reported per linear model.
TOP_K_FEATURES = 3


def _top_features(columns, coefs, k=TOP_K_FEATURES):
    """Return the base names of the k features with the largest |coefficient|.

    Args:
        columns: Index of feature names, aligned position-by-position with coefs.
        coefs: 1-D array of fitted coefficients.
        k: How many of the strongest coefficients to keep.

    Returns:
        A set holding the part of each selected column name before its first
        underscore (so one-hot columns collapse to their source column); the
        set may therefore contain fewer than k entries.
    """
    # Rank by magnitude: a strongly negative coefficient matters as much as a
    # strongly positive one, and a signed sort would never report it.
    # NOTE(review): no feature scaling is visible in this notebook, so magnitudes
    # depend on each column's units - confirm this ranking is meaningful.
    strongest = np.argsort(np.abs(coefs))[::-1][:k]
    return {columns[i].split('_')[0] for i in strongest}


def _summarize(name, estimator, X_train, X_test, y_train, y_test, columns=None):
    """Build the result record for one fitted estimator.

    Args:
        name: Label stored under the 'name' key.
        estimator: Fitted estimator exposing score() (and coef_ when columns is given).
        X_train, X_test, y_train, y_test: The split the estimator was fitted/evaluated on.
        columns: Feature names; when provided, a 'weights' entry with the
            strongest features is added. Leave as None for estimators without coef_.

    Returns:
        Dict with 'name', 'train_score', 'test_score' and optionally 'weights'.
    """
    summary = {
        'name': name,
        'train_score': estimator.score(X_train, y_train),
        'test_score': estimator.score(X_test, y_test),
    }
    if columns is not None:
        summary['weights'] = _top_features(columns, estimator.coef_)
    return summary


# One result slot per target: split sizes plus the list of model records.
models_data = {label: {'train_n': 0, 'test_n': 0, 'models': []} for label in label_cols}

for col in label_cols:
    data = models_data[col]

    # Keep only the current target; the other label columns must not be features.
    cols_to_drop = [x for x in label_cols if x != col]
    data_set = encoded.drop(columns=cols_to_drop)

    feature_frame = data_set.drop(columns=col)
    columns = feature_frame.columns
    # Explicit float dtype: one-hot columns may be bool, and mixing them with
    # numeric columns would otherwise produce an object array.
    features = feature_frame.to_numpy(dtype=float)
    target = data_set[col].to_numpy()
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=RANDOM_STATE)

    data['train_n'] = len(X_train)
    data['test_n'] = len(X_test)

    split = (X_train, X_test, y_train, y_test)

    lreg = LinearRegression().fit(X_train, y_train)
    data['models'].append(_summarize('LinearRegression', lreg, *split, columns=columns))

    ri = Ridge().fit(X_train, y_train)
    data['models'].append(_summarize('Ridge', ri, *split, columns=columns))

    la = Lasso().fit(X_train, y_train)
    data['models'].append(_summarize('Lasso', la, *split, columns=columns))

    # The forest has no coef_, so its record carries scores only.
    rf = RandomForestRegressor(n_estimators=1000, random_state=RANDOM_STATE).fit(X_train, y_train)
    data['models'].append(_summarize('RandomForest', rf, *split))

In [None]:
# Reorder each target's model records from lowest to highest test score,
# storing the new list back into models_data.
for target_name, entry in models_data.items():
    entry['models'] = sorted(entry['models'], key=lambda record: record['test_score'])

# Print one section per target: the upper-cased target name, one line per model,
# then a blank separator line.
for target_name, entry in models_data.items():
    print(target_name.upper())

    for record in entry['models']:
        # Records whose name contains 'Random' are printed without the
        # importance field; all others include their 'weights' set.
        if 'Random' in record['name']:
            line = '{}: train={:.5f}, test={:.5f}'.format(
                record['name'], record['train_score'], record['test_score'])
        else:
            line = '{}: train={:.5f}, test={:.5f}, importance={}'.format(
                record['name'], record['train_score'], record['test_score'], record['weights'])
        print(line)
    print()