In [16]:
import time

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
df = pd.read_csv('https://storage.googleapis.com/goodwatercap_fs/public/ml_take_home_data/ml_take_home_data_sample.csv')
print(df.shape)
df.head()

(100000, 9)


Unnamed: 0,date,year,month,day_of_week,rank,rank_last_month,downloads,downloads_last_month,downloads_next_month
0,2013-02-03,2013,2,1,10540,6835,1240,1728,1495
1,2013-02-03,2013,2,1,33884,34071,200,177,158
2,2013-02-03,2013,2,1,2194,2030,6225,5757,1849
3,2013-02-03,2013,2,1,40288,40543,107,95,32
4,2013-02-03,2013,2,1,28329,32121,291,203,374


# Neural Network Model

In [5]:
# Select features to be used
reg_features = ['downloads', 'downloads_last_month', 'rank', 'rank_last_month', 'year', 'month', 'day_of_week']
y_var = 'downloads_next_month'
X = np.log10(df[reg_features].copy())
y = np.log10(df[y_var].copy())

In [6]:
# Split between train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

In [7]:
# Normalize the features
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
# Define the hidden layer dimensions
layers_dimensions = (100, 50, 20, 5)

In [None]:
%%time
model = MLPRegressor(hidden_layer_sizes = layers_dimensions, random_state=1)
model.fit(X_train, y_train)

predict_train = NeuralNet.predict(X_train)
predict_test = NeuralNet.predict(X_test)
rmse_train = np.sqrt(mean_squared_error(y_train, predict_train))
rmse_test = np.sqrt(mean_squared_error(y_test, predict_test))
print('RMSE train was %.5f' % rmse_train)
print('RMSE test was %.5f' % rmse_test)
print('R2 train was %.5f' % r2

In [43]:
%%time
model = MLPRegressor(hidden_layer_sizes = layers_dimensions, random_state=1)
model.fit(X_train, y_train)

predict_train_baseline = NeuralNet.predict(X_train)
predict_test_baseline = NeuralNet.predict(X_test)

rmse_train_baseline = np.sqrt(mean_squared_error(y_train, predict_train))
rmse_test_baseline = np.sqrt(mean_squared_error(y_test, predict_test))

r2_train_baseline = r2_score(y_train, predict_train)
r2_test_baseline = r2_score(y_test, predict_test)

print('RMSE train was %.5f' % rmse_train_baseline)
print('RMSE test was %.5f' % rmse_test_baseline)
print('R2 train was %.5f' % r2_train_baseline)
print('R2 test was %.5f' % r2_test_baseline)

RMSE train was 0.21694
RMSE test was 0.21392
R2 train was 0.91422
R2 test was 0.91688
CPU times: user 1min 20s, sys: 4.01 s, total: 1min 24s
Wall time: 14.2 s


# Optimized Neural Network Model

Baseline CPU time is about 80 seconds. How can this be improved?

Adam optimizer is already being used, and it's typically the fastest optimizer. So keep that the same.

Add early_stopping - 40 seconds

In [44]:
%%time
model = MLPRegressor(hidden_layer_sizes = layers_dimensions, random_state=1, early_stopping=True)
model.fit(X_train, y_train)

predict_train = NeuralNet.predict(X_train)
predict_test = NeuralNet.predict(X_test)

rmse_train = np.sqrt(mean_squared_error(y_train, predict_train))
rmse_test = np.sqrt(mean_squared_error(y_test, predict_test))

r2_train = r2_score(y_train, predict_train)
r2_test = r2_score(y_test, predict_test)

print('RMSE train', rmse_train, 'difference from baseline', rmse_train - rmse_train_baseline)
print('RMSE test', rmse_test, 'difference from baseline', rmse_test - rmse_test_baseline)
print('R2 train', r2_train, 'difference from baseline', r2_train - r2_train_baseline)
print('R2 test', r2_test, 'difference from baseline', r2_test - r2_test_baseline)

RMSE train 0.21693550096467135 difference from baseline 0.0
RMSE test 0.2139183885844201 difference from baseline 0.0
R2 train 0.9142206250020465 difference from baseline 0.0
R2 test 0.9168784909799671 difference from baseline 0.0
CPU times: user 41.9 s, sys: 1.85 s, total: 43.8 s
Wall time: 7.32 s
