In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler

import matplotlib.pyplot as plt
import seaborn as sb
import pandas as pd
import numpy as np
import os
import warnings

warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)

from utils import *

In [3]:

DATA_FOLDER="./data/"
alldatafile = "allData_pkaul.csv"
train_file = "trainData_pkaul.csv"
test_file = "testData_pkaul.csv"

train_in = pd.read_csv(os.path.join(DATA_FOLDER, train_file))
test_in = pd.read_csv(os.path.join(DATA_FOLDER, test_file))
all_in = pd.read_csv(os.path.join(DATA_FOLDER, alldatafile))


# remove all features with no variation at all
df = all_in.std().reset_index()
df.rename(columns={'index':'feature', 0:'stddev'}, inplace=True)
exclude_list = list(df[df.stddev==0].feature.unique())
exclude_list_add = ['RESTART', 'ATTR02', 'ATTR07']

print("Excluded feature set(no variation):\n{}".format(exclude_list_add + exclude_list))

train_in.drop(columns=exclude_list_add + exclude_list, inplace=True)
test_in.drop(columns=exclude_list_add + exclude_list, inplace=True)
all_in.drop(columns=exclude_list_add + exclude_list, inplace=True)

train_in.set_index('ID', inplace = True)
test_in.set_index('ID', inplace = True)
all_in.set_index('ID', inplace = True)

train_in.columns = [c.lower() for c in train_in.columns]
test_in.columns = [c.lower() for c in test_in.columns]
all_in.columns = [c.lower() for c in all_in.columns]


Excluded feature set(no variation):
['RESTART', 'ATTR02', 'ATTR07', 'RESENTSHOTS', 'NTHREADS', 'ATTR09', 'ATTR11', 'ATTR12', 'ATTR13', 'ATTR14', 'ATTR15', 'ATTR16', 'ATTR19', 'ATTR22', 'ATTR23', 'ATTR24', 'ATTR26', 'ATTR27', 'ATTR28', 'ATTR29', 'ATTR20.1', 'ATTR31', 'ATTR32', 'ATTR38', 'ATTR39', 'ATTR30', 'ATTR31.1', 'ATTR32.1']


In [4]:
print(train_in.shape)
print(train_in.columns)

(57636, 23)
Index(['node_minutes', 'nodes', 'executionstart', 'shots', 'scname', 'ncpu',
       'attr01', 'attr03', 'attr04', 'attr05', 'attr06', 'attr08', 'attr10',
       'attr17', 'attr18', 'attr20', 'attr21', 'attr25', 'attr33', 'attr34',
       'attr35', 'attr36', 'attr37'],
      dtype='object')


#### Notes:

1. attr08 can possibly be made a catg-variable becasue most of the values are zeros and besides that there are very few unique vales- 'binning'
2. attr06 has strong linear correlation to target. We will just normalize it and keep it.(#sns.lineplot(train_in.attr06, y=train_in.node_minutes))
3. attr10 seems to have one outlier which can be ommitted   #(train_in[train_in.attr10<max(train_in.attr10)].attr10.hist(bins = 50) )
4. 'attr21', 'attr25', 'attr20', 'attr18', 'attr17' -> scale and use
5. 'attr01', 'attr03', 'attr04','attr05' - make these catg variables - one hot encoding
6. nodes, shots -> numeric
7. scname -> catg


Additional

8. Consider making attr36 catgs -> binning.

In [5]:
train_in.executionstart = pd.to_datetime(train_in.executionstart)
train_in['hour'] = train_in.executionstart.apply(lambda dt: dt.time().hour)
train_in['peher'] = pd.cut(train_in.hour, bins=[-0.2, 6, 12, 18, 24], labels=['peher1', 'peher2', 'peher3', 'peher4'])
train_in['peher'] = train_in.peher.astype(str)
train_in['day_of_week'] = train_in.executionstart.apply(lambda dt: dt.isocalendar()[1])

train_in['attr08_catg'] = train_in.attr08.apply(lambda i: attr08_binning(x=i))
train_in = train_in[train_in.attr10 < max(train_in.attr10)]

train_in.attr34 = train_in.attr34/1000000000

num_for_scaling = ['attr06', 'attr10','attr21', 'attr25', 'attr20', 'attr18',
                   'attr17', 'nodes','shots', 'attr35','attr34', 'attr36', 'attr37']

catg_for_ohe = ['attr01', 'attr03', 'attr04','attr05', 'scname','ncpu','attr08_catg', 'day_of_week','peher']

In [6]:
# prepare test_set

test_in.executionstart = pd.to_datetime(test_in.executionstart)
test_in['hour'] = test_in.executionstart.apply(lambda dt: dt.time().hour)
test_in['peher'] = pd.cut(test_in.hour, bins=[-0.2, 6, 12, 18, 24], labels=['peher1', 'peher2', 'peher3', 'peher4'])
test_in['peher'] = test_in.peher.astype(str)
test_in['day_of_week'] = test_in.executionstart.apply(lambda dt: dt.isocalendar()[1])

test_in['attr08_catg'] = test_in.attr08.apply(lambda i: attr08_binning(x=i))
test_in = test_in[test_in.attr10 < max(test_in.attr10)]

test_in.attr34 = test_in.attr34/1000000000

#### Model

In [14]:
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer as FT
import category_encoders as ce
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import category_encoders as ce

from keras import Sequential
from keras.wrappers.scikit_learn import KerasRegressor
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model

In [9]:
X_train, X_test, y_train, y_test = train_test_split( train_in, train_in.node_minutes, test_size=0.2,
                                                    random_state=42, shuffle=True)

In [10]:
preprocessor = ColumnTransformer(
    transformers=[
        ('one_hot', ce.OneHotEncoder(handle_unknown='ignore', use_cat_names=True) , catg_for_ohe),
        ('sqrt_transform', MinMaxScaler(), num_for_scaling)
    ]
)

base_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('lasso', Lasso())   
])


In [11]:
# cv = KFold(n_splits=5, random_state=42, shuffle=True)

# param_grid = {
#     'lasso__alpha' : [0.1,0.01,0.001,1]
# }

# clf = GridSearchCV( base_pipeline, param_grid , cv=cv, scoring='neg_mean_absolute_error')
# clf.fit(X_train, y_train)

In [12]:

def build_regressor():
    regressor = Sequential()
    regressor.add(Dense(units=45, input_dim=45, activation='relu'))
    regressor.add(Dropout(0.2))
    regressor.add(Dense(units=25, activation='relu'))
    regressor.add(Dropout(0.2))
    regressor.add(Dense(units=10, activation='relu'))
    regressor.add(Dropout(0.2))
    regressor.add(Dense(units=5, activation='relu'))
    regressor.add(Dense(units=1))
    regressor.compile(optimizer='adam', loss='mean_squared_error',  metrics=['mae'])
    return regressor



# nn_pipeline = Pipeline([
#     ('preprocessing', preprocessor),
#     ('NN', regressor)   
# ])

In [19]:
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)


es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
mc = ModelCheckpoint('best_model.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True)
regressor = KerasRegressor(build_fn=build_regressor, batch_size=32,epochs=25)
regressor.fit(X_train_preprocessed, np.log(y_train), validation_split=0.2, callbacks=[es, mc], verbose=1)


Train on 36886 samples, validate on 9222 samples
Epoch 1/25

Epoch 00001: val_loss improved from inf to 3.42166, saving model to best_model.h5
Epoch 2/25

Epoch 00002: val_loss improved from 3.42166 to 2.59417, saving model to best_model.h5
Epoch 3/25

Epoch 00003: val_loss improved from 2.59417 to 1.44879, saving model to best_model.h5
Epoch 4/25

Epoch 00004: val_loss improved from 1.44879 to 0.88509, saving model to best_model.h5
Epoch 5/25

Epoch 00005: val_loss improved from 0.88509 to 0.61556, saving model to best_model.h5
Epoch 6/25

Epoch 00006: val_loss improved from 0.61556 to 0.51616, saving model to best_model.h5
Epoch 7/25

Epoch 00007: val_loss improved from 0.51616 to 0.50895, saving model to best_model.h5
Epoch 8/25

Epoch 00008: val_loss improved from 0.50895 to 0.38994, saving model to best_model.h5
Epoch 9/25

Epoch 00009: val_loss improved from 0.38994 to 0.32542, saving model to best_model.h5
Epoch 10/25

Epoch 00010: val_loss did not improve from 0.32542
Epoch 11/

<keras.callbacks.History at 0x1a3b009978>

In [20]:
clf = load_model('best_model.h5')
pred = clf.predict(X_test_preprocessed)


R2 = r2_score(y_pred=pred, y_true=y_test)
n=len(y_test)
p = 45
Adj_R2 = 1- (1-R2)*(n-1)/(n-p-1)

print(f'MAE: {mean_absolute_error(y_pred=pred, y_true=y_test)}')
print(f'RMSE: {np.sqrt(mean_squared_error(y_pred=pred, y_true=y_test))}')
print(f'r-squared: {R2}')
print(f'adj_r-squared: {Adj_R2}')


MAE: 395.6440918267916
RMSE: 1382.5119204855348
r-squared: 0.46642949218226293
adj_r-squared: 0.4643381523293061
