In [1]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor

import os
import sys
# Resolve the directory two levels above the current working directory and
# register it on the import path exactly once (presumably so the project-local
# `models` package imported below can be found - confirm against repo layout).
module_path = os.path.abspath('../..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel

  from pandas.core import datetools


In [2]:
# Load the preprocessed dataset from its pickle file.
# NOTE(review): unpickling executes arbitrary code - only load trusted files.
preprocessed_data = pd.read_pickle(
    '../data/preprocessed_twitter_Monika.pd'
)

# Wrap the loaded object in a DataFrame; the later cells work on `df`.
df = pd.DataFrame(data=preprocessed_data)

In [3]:
# Split `df` into a training frame and a test frame: 20% of the rows are held
# out for testing, and random_state=0 makes the shuffle reproducible.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=0,
)

In [4]:
# Report how many rows ended up in each side of the split.
for message, frame in (
    ('Number of observations in the training data:', train),
    ('Number of observations in the test data:', test),
):
    print(message, len(frame))

Number of observations in the training data: 18680
Number of observations in the test data: 4670


In [5]:
    # --- Run-time switches for this cell ---------------------------------
    # DO_LOWESS is forwarded as `do_lowess=` to nn.run() and
    # nn.plot_predicted_vs_actual().
    DO_LOWESS = False
    # True  -> search `nn.space` through nn.run()
    # False -> fit the pipeline once with the estimator's default parameters
    hyperopt = True

    # Build the HyperoptModel wrapper around independent copies of the
    # train/test frames so the originals are not modified by the wrapper.
    # (cv=3 and max_evals=500 are passed straight through to HyperoptModel.)
    nn = HyperoptModel(train.copy(), test.copy(), 'nn', cv=3, max_evals=500)

    # Every column of `df` except the target column 'score' is used as a feature.
    features = list(df)
    features.remove('score')
    nn.raw_features = features

    # Single-step pipeline; the step name 'estimate' is the prefix used by the
    # 'estimate__<param>' keys of the search space below.
    nn.pipeline = Pipeline([
        ('estimate', MLPRegressor())
    ])

    # Hyperopt search space over MLPRegressor parameters. The single-option
    # hp.choice entries pin a value while keeping the alternatives (listed in
    # the trailing comments) easy to re-enable.
    nn.space = {
        'estimate__alpha': hp.uniform('estimate__alpha', 0.001, 1),
        'estimate__activation': hp.choice('estimate__activation', ['logistic']),  # 'identity', 'logistic', 'tanh', 'relu'
        # 'estimate__learning_rate': hp.choice('estimate__learning_rate', ['constant', 'invscaling', 'adaptive']),
        # scope.int casts the uniform float sample to an integer layer width
        'estimate__hidden_layer_sizes': scope.int(hp.uniform('estimate__hidden_layer_sizes', 1, 100)),
        'estimate__solver': hp.choice('estimate__solver', ['adam']),  # 'lbfgs', 'sgd',
        # 'estimate__max_iter': scope.int(hp.uniform('estimate__max_iter', 500, 1000))
    }

    if hyperopt:
        nn.run(do_lowess=DO_LOWESS)
    else:
        # Train with default params
        nn.pipeline.fit(X=nn.X_train, y=nn.y_train)
        # Fixed: this used to read `cat.pipeline`, but no `cat` object is
        # defined in the cells shown here, so this branch would raise NameError.
        # The fitted pipeline of *this* model is what must be stored.
        nn.model = nn.pipeline
        nn.stats()
        nn.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        # NOTE(review): MLPRegressor exposes no feature_importances_ attribute;
        # confirm HyperoptModel.plot_feature_importance() handles this estimator.
        nn.plot_feature_importance()
        nn.qq_plot()

Performing parameters optimization...
[1/500]	cv_eval_time=6.02 sec	RMSE=0.077275	R^2=-0.047131
[2/500]	cv_eval_time=7.67 sec	RMSE=0.077320	R^2=-0.022341
[3/500]	cv_eval_time=8.67 sec	RMSE=0.078005	R^2=-0.021481
[4/500]	cv_eval_time=3.33 sec	RMSE=0.077253	R^2=-0.002174
[5/500]	cv_eval_time=7.27 sec	RMSE=0.078495	R^2=-0.005663
[6/500]	cv_eval_time=10.37 sec	RMSE=0.077317	R^2=-0.004342
[7/500]	cv_eval_time=8.50 sec	RMSE=0.082211	R^2=-0.108788
[8/500]	cv_eval_time=7.47 sec	RMSE=0.081398	R^2=-0.018716
[9/500]	cv_eval_time=3.13 sec	RMSE=0.087244	R^2=-0.001654
[10/500]	cv_eval_time=9.21 sec	RMSE=0.078634	R^2=-0.028404
[11/500]	cv_eval_time=8.58 sec	RMSE=0.077495	R^2=-0.006760
[12/500]	cv_eval_time=11.71 sec	RMSE=0.084167	R^2=-0.019027
[13/500]	cv_eval_time=6.54 sec	RMSE=0.077418	R^2=-0.005007
[14/500]	cv_eval_time=9.12 sec	RMSE=0.079148	R^2=-0.066668
[15/500]	cv_eval_time=4.42 sec	RMSE=0.077110	R^2=-0.004773
[16/500]	cv_eval_time=8.44 sec	RMSE=0.077132	R^2=-0.041634
[17/500]	cv_eval_time=10.

[138/500]	cv_eval_time=5.26 sec	RMSE=0.084795	R^2=-0.489406
[139/500]	cv_eval_time=3.65 sec	RMSE=0.129010	R^2=-0.005182
[140/500]	cv_eval_time=5.01 sec	RMSE=0.077039	R^2=-0.005642
[141/500]	cv_eval_time=5.89 sec	RMSE=0.077672	R^2=-0.138171
[142/500]	cv_eval_time=3.08 sec	RMSE=0.076935	R^2=-0.018219
[143/500]	cv_eval_time=5.22 sec	RMSE=0.076952	R^2=-0.045932
[144/500]	cv_eval_time=5.91 sec	RMSE=0.090461	R^2=-0.005271
[145/500]	cv_eval_time=7.32 sec	RMSE=0.080507	R^2=-0.125477
[146/500]	cv_eval_time=4.23 sec	RMSE=0.148399	R^2=-0.002902
[147/500]	cv_eval_time=4.14 sec	RMSE=0.095830	R^2=-0.008954
[148/500]	cv_eval_time=15.72 sec	RMSE=0.079619	R^2=-1.065376
[149/500]	cv_eval_time=5.03 sec	RMSE=0.077310	R^2=-0.010268
[150/500]	cv_eval_time=5.41 sec	RMSE=0.080658	R^2=-0.133857
[151/500]	cv_eval_time=8.42 sec	RMSE=0.077646	R^2=-0.035943
[152/500]	cv_eval_time=13.91 sec	RMSE=0.077816	R^2=-0.686085
[153/500]	cv_eval_time=9.52 sec	RMSE=0.077508	R^2=-0.027545
[154/500]	cv_eval_time=7.50 sec	RMSE=0

[275/500]	cv_eval_time=4.80 sec	RMSE=0.092231	R^2=-0.030033
[276/500]	cv_eval_time=3.84 sec	RMSE=0.077168	R^2=-0.048942
[277/500]	cv_eval_time=4.92 sec	RMSE=0.077219	R^2=-0.007050
[278/500]	cv_eval_time=4.33 sec	RMSE=0.076941	R^2=-0.008763
[279/500]	cv_eval_time=3.84 sec	RMSE=0.076954	R^2=-0.010388
[280/500]	cv_eval_time=4.14 sec	RMSE=0.078024	R^2=-0.009088
[281/500]	cv_eval_time=3.96 sec	RMSE=0.077177	R^2=-0.002035
[282/500]	cv_eval_time=3.93 sec	RMSE=0.082410	R^2=-0.012145
[283/500]	cv_eval_time=5.31 sec	RMSE=0.077170	R^2=-0.001049
[284/500]	cv_eval_time=4.47 sec	RMSE=0.076893	R^2=-0.008946
[285/500]	cv_eval_time=3.47 sec	RMSE=0.076844	R^2=-0.003165
[286/500]	cv_eval_time=3.75 sec	RMSE=0.076897	R^2=-0.004414
[287/500]	cv_eval_time=4.24 sec	RMSE=0.082607	R^2=-0.007108
[288/500]	cv_eval_time=4.17 sec	RMSE=0.079341	R^2=-0.001131
[289/500]	cv_eval_time=3.19 sec	RMSE=0.077027	R^2=-0.175876
[290/500]	cv_eval_time=4.94 sec	RMSE=0.077495	R^2=-0.052436
[291/500]	cv_eval_time=4.10 sec	RMSE=0.0

[412/500]	cv_eval_time=4.61 sec	RMSE=0.077722	R^2=-0.000111
[413/500]	cv_eval_time=3.18 sec	RMSE=0.076903	R^2=-0.008818
[414/500]	cv_eval_time=3.22 sec	RMSE=0.076856	R^2=-0.027130
[415/500]	cv_eval_time=3.34 sec	RMSE=0.090899	R^2=-3.316898
[416/500]	cv_eval_time=5.21 sec	RMSE=0.077908	R^2=-0.008940
[417/500]	cv_eval_time=3.32 sec	RMSE=0.076842	R^2=-0.088321
[418/500]	cv_eval_time=4.03 sec	RMSE=0.077168	R^2=-0.013279
[419/500]	cv_eval_time=3.20 sec	RMSE=0.077443	R^2=-0.005882
[420/500]	cv_eval_time=4.07 sec	RMSE=0.077006	R^2=-1.887726
[421/500]	cv_eval_time=3.47 sec	RMSE=0.077054	R^2=-0.011853
[422/500]	cv_eval_time=3.17 sec	RMSE=0.076879	R^2=-0.002323
[423/500]	cv_eval_time=3.98 sec	RMSE=0.077417	R^2=-0.056861
[424/500]	cv_eval_time=4.44 sec	RMSE=0.077056	R^2=-0.083923
[425/500]	cv_eval_time=3.85 sec	RMSE=0.077143	R^2=-0.000072
[426/500]	cv_eval_time=4.35 sec	RMSE=0.077225	R^2=-0.007143
[427/500]	cv_eval_time=3.51 sec	RMSE=0.077087	R^2=-0.020534
[428/500]	cv_eval_time=3.62 sec	RMSE=0.0