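__Since training the neural network produces several saved models, we need to find the best among them. The model file names include the MAE they were saved with, so the model with the lowest MAE usually comes out on top when the files are sorted by name. This notebook checks on the test data whether that model really is the best one obtained.__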

In [1]:
import warnings
warnings.filterwarnings('ignore')
from keras.models import Sequential
from keras.layers import Dense,Dropout
def mlp(dropout_rate=0.0,activation1='relu', activation2='relu', activation3='relu', activation4='relu',
                       activation5='relu',activation6='relu', input_dim=None):
    """Build and compile the fully connected regression network.

    The network stacks six hidden Dense layers (1024, 512, 256, 128, 64 and
    32 units), each initialised with ``kernel_initializer='normal'``. A
    Dropout layer follows each of the first five hidden layers; the 32-unit
    layer feeds straight into a 4-unit ReLU output layer.

    Parameters
    ----------
    dropout_rate : float
        Rate passed to every Dropout layer.
    activation1 ... activation6 : str
        Activation of the 1st ... 6th hidden layer, in order.
    input_dim : int, optional
        Number of input features. When omitted, it falls back to
        ``len(X.columns)`` using the notebook-level frame ``X``, which must
        already exist at call time.

    Returns
    -------
    A Sequential model compiled with MAE loss, the Adam optimizer and MAE
    as the reported metric.
    """
    if input_dim is None:
        # Backward-compatible default: read the width from the global frame X.
        input_dim = len(X.columns)

    hidden_layers = [
        (1024, activation1),
        (512, activation2),
        (256, activation3),
        (128, activation4),
        (64, activation5),
        (32, activation6),
    ]
    last_hidden = len(hidden_layers) - 1

    model = Sequential()
    for position, (units, activation) in enumerate(hidden_layers):
        if position == 0:
            # Only the first layer declares the input width.
            model.add(Dense(units, input_dim=input_dim, kernel_initializer='normal', activation=activation))
        else:
            model.add(Dense(units, kernel_initializer='normal', activation=activation))
        # No dropout between the last hidden layer and the output layer.
        if position < last_hidden:
            model.add(Dropout(dropout_rate))
    model.add(Dense(4, kernel_initializer='normal', activation='relu'))
    # Compile model
    model.compile(loss='mae', optimizer='adam', metrics=['mae'])
    return model

Using TensorFlow backend.


__Splitting the data once again into inputs (`X`) and outputs (`Y`).__

In [2]:
import pandas as pd

# Label-based column slices (both endpoints are included by .loc):
# inputs are every column up to 'capacity', outputs are the run of
# ticket columns from 'tickets_9_eur' through 'tickets_19_eur'.
input_columns = slice(None, 'capacity')
output_columns = slice('tickets_9_eur', 'tickets_19_eur')

df = pd.read_csv('cleaned_data.csv')
X, Y = df.loc[:, input_columns], df.loc[:, output_columns]

# Preview the first rows of the input frame.
X.head()

Unnamed: 0,month,day_of_year,hour,minute,day_of_week,holiday,route_A->B,route_B->A,capacity
0,1,1,8,15,3,1,0,1,82.0
1,1,1,9,15,3,1,1,0,82.0
2,1,1,10,15,3,1,0,1,82.0
3,1,1,11,45,3,1,1,0,82.0
4,1,1,12,45,3,1,0,1,82.0


__Here we only need the test dataset to evaluate the models. The previously pickled scaler is reused to transform the test data.__

In [3]:
from sklearn.model_selection import train_test_split
import warnings
from sklearn.preprocessing import StandardScaler
import numpy as np
import joblib

warnings.filterwarnings('ignore')

# With shuffle=False the split is positional: the last 20% of the rows become
# the test set. The training halves are not needed here; they get explicit
# throwaway names so IPython's `_` / `__` output-history variables stay intact.
_X_train, X_test, _Y_train, Y_test = train_test_split(X, Y, test_size=0.2, shuffle=False)

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load a scaler.pkl that comes from a trusted source.
scaler = joblib.load('scaler.pkl')

# The columns from 'month' through 'day_of_week' are the ones the scaler is
# applied to. They hold integers in the CSV, so cast them to float first:
# writing float results into integer columns through .loc relies on implicit
# upcasting that pandas has deprecated. astype() also returns a fresh frame,
# so the assignment below never touches X.
scaled_columns = X_test.loc[:, 'month':'day_of_week'].columns
X_test = X_test.astype({column: 'float64' for column in scaled_columns})
X_test.loc[:, scaled_columns] = scaler.transform(X_test.loc[:, scaled_columns])
X_test.head()

Unnamed: 0,month,day_of_year,hour,minute,day_of_week,holiday,route_A->B,route_B->A,capacity
15432,1.654512,1.540487,1.620661,-0.881874,1.389186,0,0,1,82.0
15433,1.654512,1.540487,1.818428,-0.220211,1.389186,0,1,0,82.0
15434,1.654512,1.5505,-1.741382,-0.881874,-1.612973,0,0,1,82.0
15435,1.654512,1.5505,-1.543615,-0.881874,-1.612973,0,0,1,82.0
15436,1.654512,1.5505,-1.543615,1.103114,-1.612973,0,1,0,82.0


__Testing a small set of the best candidate models and storing the results in a CSV file.__

In [4]:
import glob
# Gather the saved models whose file name starts with 'model-5.' and order
# them ascending by file name.
path = 'models'
pattern = path + '/model-5.*.hdf5'
files = glob.glob(pattern)
files.sort()

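__It looks like the best model is `model-5.8806.hdf5` (although, in the results below, `model-5.9158` and `model-5.9166` score a marginally lower test MAE of about 4.15). Though it was saved with an MAE of 5.88, its actual MAE on the test set is lower still, at 4.17. This can probably be attributed to the use of dropout in the layers, which keeps the MAE recorded when the model was saved from showing the model's full potential.__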

In [6]:
import os
from sklearn.metrics import mean_absolute_error
import csv
from keras.models import load_model
import warnings
warnings.filterwarnings('ignore')

# Upper bound on how many of the sorted model files get evaluated.
MAX_MODELS = 50

# Open the results file once instead of once per model. Append mode is kept so
# rows from earlier runs are preserved; newline='' is what the csv module
# expects so that it controls line endings itself.
with open('validation_results.csv', 'a', newline='') as f:
    writer = csv.writer(f)
    for file in files[:MAX_MODELS]:
        model = load_model(file)
        Y_test_pred = model.predict(X_test)
        # Documented argument order is (y_true, y_pred).
        mae = mean_absolute_error(Y_test, Y_test_pred)
        # File name without directory or '.hdf5' extension, e.g. 'model-5.8806';
        # os.path handles either path separator.
        key = os.path.splitext(os.path.basename(file))[0]
        print(key, mae)
        writer.writerow([key, mae])
        # Push each row to disk right away so results survive an interrupted run.
        f.flush()

model-5.8806 4.171369634281425
model-5.8979 4.189123009451259
model-5.9133 4.217888092518842
model-5.9158 4.149058233050242
model-5.9166 4.14946626844884
model-5.9323 4.213984692876723
model-5.9518 4.42608068106979
model-5.9546 4.229189552396185
model-5.9569 4.275531436327773
model-5.9726 4.358896814346938
model-5.9790 4.40574406583247
