In [18]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler

import matplotlib.pyplot as plt
import seaborn as sb
import pandas as pd
import numpy as np
import os
import warnings

warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)

from utils import *

In [19]:

DATA_FOLDER="./data/"
alldatafile = "allData_pkaul.csv"
train_file = "trainData_pkaul.csv"
test_file = "testData_pkaul.csv"

train_in = pd.read_csv(os.path.join(DATA_FOLDER, train_file))
test_in = pd.read_csv(os.path.join(DATA_FOLDER, test_file))
all_in = pd.read_csv(os.path.join(DATA_FOLDER, alldatafile))


# remove all features with no variation at all
df = all_in.std().reset_index()
df.rename(columns={'index':'feature', 0:'stddev'}, inplace=True)
exclude_list = list(df[df.stddev==0].feature.unique())
exclude_list_add = ['RESTART', 'ATTR02', 'ATTR07']

print("Excluded feature set(no variation):\n{}".format(exclude_list_add + exclude_list))

train_in.drop(columns=exclude_list_add + exclude_list, inplace=True)
test_in.drop(columns=exclude_list_add + exclude_list, inplace=True)
all_in.drop(columns=exclude_list_add + exclude_list, inplace=True)

train_in.set_index('ID', inplace = True)
test_in.set_index('ID', inplace = True)
all_in.set_index('ID', inplace = True)

train_in.columns = [c.lower() for c in train_in.columns]
test_in.columns = [c.lower() for c in test_in.columns]
all_in.columns = [c.lower() for c in all_in.columns]


Excluded feature set(no variation):
['RESTART', 'ATTR02', 'ATTR07', 'RESENTSHOTS', 'NTHREADS', 'ATTR09', 'ATTR11', 'ATTR12', 'ATTR13', 'ATTR14', 'ATTR15', 'ATTR16', 'ATTR19', 'ATTR22', 'ATTR23', 'ATTR24', 'ATTR26', 'ATTR27', 'ATTR28', 'ATTR29', 'ATTR20.1', 'ATTR31', 'ATTR32', 'ATTR38', 'ATTR39', 'ATTR30', 'ATTR31.1', 'ATTR32.1']


In [20]:
print(train_in.shape)
print(train_in.columns)

(57636, 23)
Index(['node_minutes', 'nodes', 'executionstart', 'shots', 'scname', 'ncpu',
       'attr01', 'attr03', 'attr04', 'attr05', 'attr06', 'attr08', 'attr10',
       'attr17', 'attr18', 'attr20', 'attr21', 'attr25', 'attr33', 'attr34',
       'attr35', 'attr36', 'attr37'],
      dtype='object')


#### Notes:

1. attr08 can possibly be made a catg-variable becasue most of the values are zeros and besides that there are very few unique vales- 'binning'
2. attr06 has strong linear correlation to target. We will just normalize it and keep it.(#sns.lineplot(train_in.attr06, y=train_in.node_minutes))
3. attr10 seems to have one outlier which can be ommitted   #(train_in[train_in.attr10<max(train_in.attr10)].attr10.hist(bins = 50) )
4. 'attr21', 'attr25', 'attr20', 'attr18', 'attr17' -> scale and use
5. 'attr01', 'attr03', 'attr04','attr05' - make these catg variables - one hot encoding
6. nodes, shots -> numeric
7. scname -> catg


Additional

8. Consider making attr36 catgs -> binning.

In [21]:
train_in.executionstart = pd.to_datetime(train_in.executionstart)
train_in['hour'] = train_in.executionstart.apply(lambda dt: dt.time().hour)
train_in['peher'] = pd.cut(train_in.hour, bins=[-0.2, 6, 12, 18, 24])
train_in['day_of_week'] = train_in.executionstart.apply(lambda dt: dt.isocalendar()[1])

train_in['attr08_catg'] = train_in.attr08.apply(lambda i: attr08_binning(x=i))
train_in = train_in[train_in.attr10 < max(train_in.attr10)]

train_in.attr34 = train_in.attr34/1000000000

num_for_scaling = ['attr06', 'attr10','attr21', 'attr25', 'attr20', 'attr18',
                   'attr17', 'nodes','shots', 'attr35','attr34', 'attr36', 'attr37']

catg_for_ohe = ['attr01', 'attr03', 'attr04','attr05', 'scname','ncpu','attr08_catg', 'day_of_week', 'peher']

In [22]:
left = list(set(train_in.columns) - set(num_for_scaling) -set(catg_for_ohe))
train_in[ left].head()

Unnamed: 0_level_0,hour,attr33,attr08,executionstart,node_minutes
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
402296701,9,1656955,0,2018-11-08 09:09:57,197
402289644,9,1226489,0,2018-11-08 09:10:36,179
402143371,9,9753278,0,2018-11-08 09:10:43,1174
402106192,9,2032833,0,2018-11-08 09:10:55,926
401575307,9,131726,0,2018-11-08 09:11:10,64


In [23]:
# prepare test_set

test_in.executionstart = pd.to_datetime(test_in.executionstart)
test_in['hour'] = test_in.executionstart.apply(lambda dt: dt.time().hour)
test_in['peher'] = pd.cut(test_in.hour, bins=[-0.2, 6, 12, 18, 24])
test_in['day_of_week'] = test_in.executionstart.apply(lambda dt: dt.isocalendar()[1])

test_in['attr08_catg'] = test_in.attr08.apply(lambda i: attr08_binning(x=i))
test_in = test_in[test_in.attr10 < max(test_in.attr10)]

test_in.attr34 = test_in.attr34/1000000000

#### Model

In [24]:
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, FunctionTransformer as FT
import category_encoders as ce
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression

In [25]:
preprocessor = ColumnTransformer(
    transformers=[
        ('one_hot', OneHotEncoder(), catg_for_ohe),
        ('sqrt_transform', MinMaxScaler(), num_for_scaling)
    ]
)

base_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('lr', LinearRegression())   
])


In [9]:
base_pipeline.fit(train_in, train_in.node_minutes)
# pred = base_pipeline.predict(test_in)

Pipeline(memory=None,
     steps=[('preprocessing', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('one_hot', OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='error',
       n_value...), ('lr', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False))])

In [26]:
test_in.index

Int64Index([405558811, 405157005, 405558776, 405558777, 405558778, 405558779,
            405558780, 405558810, 405460112, 405460097,
            ...
            406643637, 406643621, 406643638, 406643620, 406643639, 406643640,
            406694518, 406643644, 406643645, 406643646],
           dtype='int64', name='ID', length=24700)

In [29]:
test_in.shape

(24700, 26)