In [81]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

import matplotlib.pyplot as plt
import seaborn as sb
import pandas as pd
import numpy as np
import os
import warnings 

warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)


In [82]:

DATA_FOLDER="./data/"
alldatafile = "allData_pkaul.csv"
train_file = "trainData_pkaul.csv"
test_file = "testData_pkaul.csv"

train_in = pd.read_csv(os.path.join(DATA_FOLDER, train_file))
test_in = pd.read_csv(os.path.join(DATA_FOLDER, test_file))
all_in = pd.read_csv(os.path.join(DATA_FOLDER, alldatafile))

# remove all features with no variation at all
df = all_in.std().reset_index()
df.rename(columns={'index':'feature', 0:'stddev'}, inplace=True)
exclude_list = list(df[df.stddev==0].feature.unique())
exclude_list_add = ['RESTART', 'ATTR02', 'ATTR07']

print("Excluded feature set(no variation):\n{}".format(exclude_list_add + exclude_list))

train_in.drop(columns=exclude_list_add + exclude_list, inplace=True)
test_in.drop(columns=exclude_list_add + exclude_list, inplace=True)
all_in.drop(columns=exclude_list_add + exclude_list, inplace=True)

train_in.set_index('ID', inplace = True)
test_in.set_index('ID', inplace = True)

Excluded feature set(no variation):
['RESTART', 'ATTR02', 'ATTR07', 'RESENTSHOTS', 'NTHREADS', 'ATTR09', 'ATTR11', 'ATTR12', 'ATTR13', 'ATTR14', 'ATTR15', 'ATTR16', 'ATTR19', 'ATTR22', 'ATTR23', 'ATTR24', 'ATTR26', 'ATTR27', 'ATTR28', 'ATTR29', 'ATTR20.1', 'ATTR31', 'ATTR32', 'ATTR38', 'ATTR39', 'ATTR30', 'ATTR31.1', 'ATTR32.1']


In [83]:
print(train_in.shape)
print(test_in.shape)
print(train_in.columns)

(57636, 23)
(24701, 22)
Index(['NODE_MINUTES', 'NODES', 'EXECUTIONSTART', 'SHOTS', 'SCNAME', 'NCPU',
       'ATTR01', 'ATTR03', 'ATTR04', 'ATTR05', 'ATTR06', 'ATTR08', 'ATTR10',
       'ATTR17', 'ATTR18', 'ATTR20', 'ATTR21', 'ATTR25', 'ATTR33', 'ATTR34',
       'ATTR35', 'ATTR36', 'ATTR37'],
      dtype='object')


In [84]:
print(train_in.isnull().values.any())
train_in.head()

False


Unnamed: 0_level_0,NODE_MINUTES,NODES,EXECUTIONSTART,SHOTS,SCNAME,NCPU,ATTR01,ATTR03,ATTR04,ATTR05,...,ATTR17,ATTR18,ATTR20,ATTR21,ATTR25,ATTR33,ATTR34,ATTR35,ATTR36,ATTR37
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
402296701,197,12,2018-11-08 09:09:57,197,center_11,4,?,?,?,tamnone,...,192,160,300,19.221,84,1656955,9000103968,100,12,1196
402289644,179,8,2018-11-08 09:10:36,81,center_8,2,?,?,?,tamboth,...,160,160,300,14.91,84,1226489,9978282770,219,8,1750
402143371,1174,8,2018-11-08 09:10:43,254,center_8,2,?,?,?,tamboth,...,192,160,530,34.842,110,9753278,63123134349,141,10,1401
402106192,926,8,2018-11-08 09:10:55,129,center_1,4,?,?,?,tamboth,...,192,224,337,26.395,47,2032833,19165217816,390,6,2335
401575307,64,4,2018-11-08 09:11:10,6,center_8,2,?,tamnone,?,?,...,320,256,334,54.036,114,131726,943684845,188,8,1501


In [85]:
target_column = 'NODE_MINUTES'
exclude_columns = ['EXECUTIONSTART']

# Marking categorical columns
catg_columns = ['NCPU', 'SCNAME', 'ATTR01', 'ATTR03', 'ATTR04', 'ATTR05']
for feature in catg_columns:
    train_in[feature] = train_in[feature].astype('category')
    test_in[feature] = test_in[feature].astype('category')

# Numeric columns
numeric_columns = list(train_in.select_dtypes(include=[np.number]).columns)

features = numeric_columns + catg_columns
features.remove(target_column)
print(features)
print("# Features: {}".format(len(features)))

['NODES', 'SHOTS', 'ATTR06', 'ATTR08', 'ATTR10', 'ATTR17', 'ATTR18', 'ATTR20', 'ATTR21', 'ATTR25', 'ATTR33', 'ATTR34', 'ATTR35', 'ATTR36', 'ATTR37', 'NCPU', 'SCNAME', 'ATTR01', 'ATTR03', 'ATTR04', 'ATTR05']
# Features: 21


In [86]:
x_train = train_in[features]
y_train = train_in[target_column]

x_test = test_in[features]
x_test = x_test[x_test.SCNAME!='center_5'] # for now

#### transformers

In [87]:
# include transformers

class TypeSelector(BaseEstimator, TransformerMixin):
    def __init__(self, dtype):
        self.dtype = dtype
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        assert isinstance(X, pd.DataFrame)
        return X.select_dtypes(include=[self.dtype])
        
        
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key    
    def fit(self, X, y=None):
        return self   
    def transform(self, X):
        assert isinstance(X, pd.DataFrame)
        return X[[self.key]]

#### Feature processing pipeline

In [88]:

# pipeline -> array of tuples of the format (name, object)

numeric_feature_scaling = Pipeline([
    ('numeric_cols', TypeSelector(dtype=np.number)),
    ('std_scaler', StandardScaler())
])

catg_one_hot_encoding = Pipeline([
    ('catg_cols', TypeSelector(dtype='category')),
    ('one_hot_encoding', OneHotEncoder())
])

feats = FeatureUnion([('numeric', numeric_feature_scaling), 
                      ('category', catg_one_hot_encoding)
                     ])

feature_processing = Pipeline([('feats', feats)])

#### Model

In [104]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

# define base model
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(39, input_dim=39, kernel_initializer='normal', activation='relu'))
    model.add(Dense(10, kernel_initializer='normal', activation='relu'))
    model.add(Dense(6, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

# evaluate model with standardized dataset
dnn = KerasRegressor(build_fn=baseline_model, epochs=1000, batch_size=5000, verbose=0)


#### Final pipeline

In [105]:
pipeline = Pipeline([
    ('features',feats),
    ('regressor', dnn)
])

In [106]:
kfold = KFold(n_splits=10, random_state=seed)
results = cross_val_score(pipeline, x_train, y_train, cv=kfold, scoring='r2')
print(results)

[ 0.50264106  0.48564458 -3.08926247  0.54512305  0.89140986  0.92278815
  0.52594144  0.93032305  0.93371591  0.79031009]
