In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.externals import joblib

import pickle

Using TensorFlow backend.


In [3]:
# Load the feature matrices for the training, train, validation and test
# splits. Each pickle is read with pandas and reduced to its underlying
# NumPy array via `.values`.
# The directory is spelled once so that relocating the data is a one-line
# change; the resulting file paths are identical to the previous literals.
DATA_DIR = 'D:/CUNY Files/capstone/code files/'
training_X = pd.read_pickle(DATA_DIR + 'training_X_v.pkl').values
train_X = pd.read_pickle(DATA_DIR + 'train_X_v.pkl').values
validation_X = pd.read_pickle(DATA_DIR + 'validation_X_v.pkl').values
test_X = pd.read_pickle(DATA_DIR + 'test_X_v.pkl').values

In [4]:
# Load the target values for the training, train, validation and test
# splits. Each pickle is read with pandas and reduced to its underlying
# NumPy array via `.values`.
# The directory is spelled once so that relocating the data is a one-line
# change; the resulting file paths are identical to the previous literals.
DATA_DIR = 'D:/CUNY Files/capstone/code files/'
training_y = pd.read_pickle(DATA_DIR + 'training_y_v.pkl').values
train_y = pd.read_pickle(DATA_DIR + 'train_y_v.pkl').values
validation_y = pd.read_pickle(DATA_DIR + 'validation_y_v.pkl').values
test_y = pd.read_pickle(DATA_DIR + 'test_y_v.pkl').values

In [5]:
# Row counts of the four feature splits, in load order.
tuple(map(len, (training_X, train_X, validation_X, test_X)))

(902, 721, 181, 226)

In [6]:
# Row counts of the four target splits, in load order.
tuple(len(split) for split in (training_y, train_y, validation_y, test_y))

(902, 721, 181, 226)

In [7]:
# Number of input features = number of columns in the train feature matrix.
# The model-building functions below read this notebook-level variable to
# size their input (and first hidden) layer.
n_features = train_X.shape[1]
n_features

20

**Define the baseline model**

Create a Keras model and evaluate it with scikit-learn.

Define a baseline model with a single fully connected hidden layer with the same number of neurons as input attributes (features). The network uses the rectifier activation function ('relu') for the hidden layer. No activation function is used for the output layer because it is a regression problem and we are interested in predicting numerical values directly.

The optimization algorithm used is: ADAM 
The loss function optimized is: mean squared error 
The MSE will also be used to evaluate the performance of the model. Taking the square root of this metric gives an error value that can be easily interpreted in the context of the problem: the number of days between the estimated arrival date and the actual arrival date.

In [8]:
# define base model
def baseline_model():
    """Build and compile the baseline network.

    The network has one ReLU hidden layer with ``n_features`` units (read
    from the notebook-level variable of that name) feeding a single output
    unit that has no activation. It is compiled with mean-squared-error loss
    and the Adam optimizer.

    Returns:
        The compiled Sequential model.
    """
    hidden = Dense(n_features, input_dim=n_features,
                   kernel_initializer='normal', activation='relu')
    # No activation on the output unit: the raw numeric value is the prediction.
    output = Dense(1, kernel_initializer='normal')
    net = Sequential([hidden, output])
    net.compile(loss='mean_squared_error', optimizer='adam')
    return net

In [9]:
# fix random seed for reproducibility
# NOTE(review): this seeds only NumPy's global RNG. Whether that also pins the
# Keras/TensorFlow weight initialisation and batch shuffling depends on the
# installed backend version -- confirm before relying on exact reproducibility.
seed = 15
np.random.seed(seed)

Standardize the features:  
Use a scikit-learn Pipeline to perform the standardization during the model evaluation process, within each fold of the cross validation. This ensures that there is no data leakage from each fold's held-out test set into the training data.

The cross validation provides an estimate of the model’s performance on unseen data. The result reports the mean squared error as the average and the standard deviation (the spread of the per-fold scores) across all 10 folds of the cross-validation evaluation.

In [10]:
# evaluate model with standardized dataset
# The scaler lives inside the pipeline, so cross_val_score re-fits it on the
# training part of every fold and it never sees that fold's held-out rows.
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasRegressor(build_fn=baseline_model,
                           epochs=5, batch_size=10, verbose=0)),
]
pipeline = Pipeline(estimators)
# KFold only consults random_state when shuffle=True. With the default
# shuffle=False the folds are contiguous and deterministic, and newer
# scikit-learn releases raise ValueError if random_state is passed anyway.
# Dropping the argument yields exactly the same folds as before.
kfold = KFold(n_splits=10)
results = cross_val_score(pipeline, training_X, training_y, cv=kfold)
# NOTE(review): no `scoring=` is given, so the values come from the wrapped
# estimator's own score(). The recorded output is positive, so this Keras
# version appears to report the loss itself -- confirm the sign convention
# when upgrading Keras.
print("Standardized: %.2f (%.2f) MSE" % (results.mean(), results.std()))

Standardized: 40.08 (19.87) MSE


This is not an improvement over the polynomial ridge regression model

**Evaluate a Deeper Network Topology**  
  
Test if adding another layer will improve the performance of the neural network. (This might allow the model to extract and recombine higher order features embedded in the data.)  
  
Add an additional layer with 15 neurons

In [11]:
# define a model with one additional layer over the baseline model
def deeper_model():
    """Build and compile the two-hidden-layer variant of the baseline.

    Layer stack: a ReLU layer with ``n_features`` units (read from the
    notebook-level variable of that name), a ReLU layer with 15 units, and a
    single output unit that has no activation. It is compiled with
    mean-squared-error loss and the Adam optimizer.

    Returns:
        The compiled Sequential model.
    """
    layer_stack = (
        Dense(n_features, input_dim=n_features,
              kernel_initializer='normal', activation='relu'),
        Dense(15, kernel_initializer='normal', activation='relu'),
        Dense(1, kernel_initializer='normal'),
    )
    net = Sequential()
    for layer in layer_stack:
        net.add(layer)
    net.compile(loss='mean_squared_error', optimizer='adam')
    return net

In [12]:
# Re-seed NumPy so this run starts from the same NumPy RNG state as the
# baseline evaluation.
np.random.seed(seed)
# The scaler lives inside the pipeline, so it is re-fit on the training part
# of every fold.
estimators_d = [
    ('standardize', StandardScaler()),
    ('mlp', KerasRegressor(build_fn=deeper_model,
                           epochs=5, batch_size=10, verbose=0)),
]
pipeline_d = Pipeline(estimators_d)
# KFold only consults random_state when shuffle=True. With the default
# shuffle=False the folds are contiguous and deterministic, and newer
# scikit-learn releases raise ValueError if random_state is passed anyway.
# Dropping the argument yields exactly the same folds as before.
kfold_d = KFold(n_splits=10)
results_d = cross_val_score(pipeline_d, training_X, training_y, cv=kfold_d)
print("Deeper: %.2f (%.2f) MSE" % (results_d.mean(), results_d.std()))

Deeper: 39.75 (19.28) MSE


This is essentially the same as the baseline model.

**Evaluate a Wider Network Topology**

Test if adding more neurons will improve the performance of the neural network.  Maintain a shallow network but substantially increase the number of neurons in the one hidden layer.

In [17]:
# neurons equal to 200% of the number of features (inputs), i.e. twice as many
# hidden units as inputs; used as the hidden-layer width of the wider model
n_features_200 = int(round(n_features * 2,0))
n_features_200

40

In [18]:
# define the wider model (single hidden layer, twice the baseline width)
def wider_model():
    """Build and compile the wide, shallow variant of the baseline.

    The network has one ReLU hidden layer with ``n_features_200`` units that
    takes ``n_features`` inputs (both read from notebook-level variables),
    feeding a single output unit that has no activation. It is compiled with
    mean-squared-error loss and the Adam optimizer.

    Returns:
        The compiled Sequential model.
    """
    net = Sequential([
        Dense(n_features_200, input_dim=n_features,
              kernel_initializer='normal', activation='relu'),
        Dense(1, kernel_initializer='normal'),
    ])
    net.compile(loss='mean_squared_error', optimizer='adam')
    return net

In [19]:
# this takes about 10 minutes to run
# (author's timing: ~11 seconds per epoch; 5 epochs x 10 folds)
# Re-seed NumPy so this run starts from the same NumPy RNG state as the
# baseline evaluation.
np.random.seed(seed)
# The scaler lives inside the pipeline, so it is re-fit on the training part
# of every fold.
estimators_w = [
    ('standardize', StandardScaler()),
    ('mlp', KerasRegressor(build_fn=wider_model,
                           epochs=5, batch_size=10, verbose=0)),
]
pipeline_w = Pipeline(estimators_w)
# KFold only consults random_state when shuffle=True. With the default
# shuffle=False the folds are contiguous and deterministic, and newer
# scikit-learn releases raise ValueError if random_state is passed anyway.
# Dropping the argument yields exactly the same folds as before.
kfold_w = KFold(n_splits=10)
# Stored under its own name (like results_d) so the baseline scores held in
# `results` are not overwritten by this run.
results_w = cross_val_score(pipeline_w, training_X, training_y, cv=kfold_w)
print("Wider: %.2f (%.2f) MSE" % (results_w.mean(), results_w.std()))

Wider: 39.89 (19.52) MSE


The results are no better than the baseline model

In [None]:
# Fit the baseline pipeline (StandardScaler followed by the baseline network)
# on the train split. As the cell's last expression, the return value of
# fit() is echoed in the output.
pipeline.fit(train_X,train_y)

In [None]:
# save the model
filename = 'neural_net.sav'
# Use a context manager so the file is flushed and closed even if pickling
# raises; a bare open(...) passed straight to pickle.dump is never closed
# explicitly.
# NOTE(review): whether a fitted Keras model inside the pipeline pickles
# cleanly depends on the Keras version -- confirm by loading the file back.
with open(filename, 'wb') as model_file:
    pickle.dump(pipeline, model_file)