In [1]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor

import os
import sys
# Register the directory two levels above the working directory on sys.path
# (only if it is not already listed) so project-local packages such as
# `models` can be imported.
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if not any(entry == module_path for entry in sys.path):
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel

  from pandas.core import datetools


In [2]:
# Load the pickled, already-preprocessed dataset from ../data.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
preprocessed_data = pd.read_pickle('../data/preprocessed_twitter_Monika.pd')

# Expose the loaded object as a DataFrame under the short name `df`,
# which the following cells work with.
df = pd.DataFrame(data=preprocessed_data)

In [3]:
# Hold out 20% of the rows as a test set; the remaining 80% form the training set.
# The fixed random_state makes the split repeatable across runs.
train, test = train_test_split(df, random_state=0, test_size=0.2)

In [4]:
# Report how many rows ended up in each partition of the split.
print('Number of observations in the training data:', train.shape[0])
print('Number of observations in the test data:', test.shape[0])

Number of observations in the training data: 18680
Number of observations in the test data: 4670


In [5]:
    # Cell purpose: configure an MLPRegressor pipeline inside a HyperoptModel and
    # either tune it with hyperopt or fit it once with default parameters.

    # Passed as `do_lowess` to the run / plotting calls below.
    DO_LOWESS = False
    # True  -> run the hyperopt parameter search (nn.run).
    # False -> fit the pipeline once with MLPRegressor's default parameters.
    hyperopt = True

    # Run MLPRegressor with hyperopt optimization.
    # Copies are handed over so the model wrapper cannot modify the
    # notebook-level train/test frames.
    nn = HyperoptModel(train.copy(), test.copy(), 'nn', cv=4, max_evals=500)

    # Use every column except 'score' as an input feature
    # (presumably 'score' is the regression target -- confirm in HyperoptModel).
    features = list(df)
    features.remove('score')
    nn.raw_features = features

    nn.pipeline = Pipeline([
        ('estimate', MLPRegressor())
    ])

    # Search space. Keys use the '<pipeline step>__<parameter>' form so they
    # address parameters of the 'estimate' step. Activation and solver are
    # currently pinned to a single option each; the trailing comments list the
    # other values that could be enabled.
    nn.space = {
        'estimate__alpha': hp.uniform('estimate__alpha', 0.001, 1),
        'estimate__activation': hp.choice('estimate__activation', ['logistic']),  # 'identity', 'logistic', 'tanh', 'relu'
        # 'estimate__learning_rate': hp.choice('estimate__learning_rate', ['constant', 'invscaling', 'adaptive']),
        # scope.int casts the sampled float to an integer layer width.
        'estimate__hidden_layer_sizes': scope.int(hp.uniform('estimate__hidden_layer_sizes', 1, 100)),
        'estimate__solver': hp.choice('estimate__solver', ['adam']),  # 'lbfgs', 'sgd',
        # 'estimate__max_iter': scope.int(hp.uniform('estimate__max_iter', 500, 1000))
    }

    if hyperopt:
        nn.run(do_lowess=DO_LOWESS)
    else:
        # train with default params
        nn.pipeline.fit(X=nn.X_train, y=nn.y_train)
        # Bug fix: this used to read `cat.pipeline`, but no `cat` object exists in
        # this notebook, so the branch raised NameError. The fitted pipeline of
        # this model is the intended value.
        nn.model = nn.pipeline
        nn.stats()
        nn.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        # NOTE(review): MLPRegressor exposes neither `feature_importances_` nor
        # `coef_` -- confirm plot_feature_importance supports this estimator.
        nn.plot_feature_importance()
        nn.qq_plot()

Performing parameters optimization...
[1/500]	cv_eval_time=20.13 sec	RMSE=0.073820	R^2=0.087016
[2/500]	cv_eval_time=18.95 sec	RMSE=0.076057	R^2=0.082635
[3/500]	cv_eval_time=8.42 sec	RMSE=0.074798	R^2=0.042949
[4/500]	cv_eval_time=15.91 sec	RMSE=0.073623	R^2=0.077632
[5/500]	cv_eval_time=33.48 sec	RMSE=0.074293	R^2=0.098702
[6/500]	cv_eval_time=30.94 sec	RMSE=0.074850	R^2=0.071508
[7/500]	cv_eval_time=32.18 sec	RMSE=0.074204	R^2=0.086731
[8/500]	cv_eval_time=37.92 sec	RMSE=0.074037	R^2=0.058880
[9/500]	cv_eval_time=40.46 sec	RMSE=0.073556	R^2=0.042291
[10/500]	cv_eval_time=37.36 sec	RMSE=0.073745	R^2=0.037763
[11/500]	cv_eval_time=17.67 sec	RMSE=0.073716	R^2=0.051062
[12/500]	cv_eval_time=35.16 sec	RMSE=0.072645	R^2=0.029460
[13/500]	cv_eval_time=23.95 sec	RMSE=0.074326	R^2=0.037204
[14/500]	cv_eval_time=17.68 sec	RMSE=0.073924	R^2=0.017172
[15/500]	cv_eval_time=22.46 sec	RMSE=0.075807	R^2=0.023032
[16/500]	cv_eval_time=33.27 sec	RMSE=0.073566	R^2=0.066051
[17/500]	cv_eval_time=49.56 

[139/500]	cv_eval_time=33.41 sec	RMSE=0.074328	R^2=0.085630
[140/500]	cv_eval_time=34.08 sec	RMSE=0.073454	R^2=0.077178
[141/500]	cv_eval_time=21.72 sec	RMSE=0.074707	R^2=0.033340
[142/500]	cv_eval_time=32.16 sec	RMSE=0.074339	R^2=0.033456
[143/500]	cv_eval_time=33.28 sec	RMSE=0.074576	R^2=0.064636
[144/500]	cv_eval_time=23.07 sec	RMSE=0.074006	R^2=0.080232
[145/500]	cv_eval_time=37.90 sec	RMSE=0.074285	R^2=0.066979
[146/500]	cv_eval_time=36.50 sec	RMSE=0.073942	R^2=0.069716
[147/500]	cv_eval_time=35.45 sec	RMSE=0.073207	R^2=0.039894
[148/500]	cv_eval_time=35.38 sec	RMSE=0.073157	R^2=0.066396
[149/500]	cv_eval_time=23.35 sec	RMSE=0.074818	R^2=0.082186
[150/500]	cv_eval_time=38.71 sec	RMSE=0.074354	R^2=0.060207
[151/500]	cv_eval_time=36.27 sec	RMSE=0.073996	R^2=0.083361
[152/500]	cv_eval_time=35.16 sec	RMSE=0.073631	R^2=0.072786
[153/500]	cv_eval_time=36.41 sec	RMSE=0.073152	R^2=0.050337
[154/500]	cv_eval_time=33.10 sec	RMSE=0.074344	R^2=0.067895
[155/500]	cv_eval_time=22.90 sec	RMSE=0.

[276/500]	cv_eval_time=41.86 sec	RMSE=0.074528	R^2=0.077980
[277/500]	cv_eval_time=22.82 sec	RMSE=0.074651	R^2=0.078152
[278/500]	cv_eval_time=30.46 sec	RMSE=0.073622	R^2=0.079328
[279/500]	cv_eval_time=31.12 sec	RMSE=0.074348	R^2=0.085643
[280/500]	cv_eval_time=32.87 sec	RMSE=0.074135	R^2=0.060229
[281/500]	cv_eval_time=31.92 sec	RMSE=0.074034	R^2=0.105689
[282/500]	cv_eval_time=28.86 sec	RMSE=0.073631	R^2=0.072186
[283/500]	cv_eval_time=21.71 sec	RMSE=0.073108	R^2=0.075998
[284/500]	cv_eval_time=24.34 sec	RMSE=0.072638	R^2=0.080178
[285/500]	cv_eval_time=22.01 sec	RMSE=0.073702	R^2=0.083965
[286/500]	cv_eval_time=24.36 sec	RMSE=0.073428	R^2=0.070537
[287/500]	cv_eval_time=21.59 sec	RMSE=0.074121	R^2=0.067232
[288/500]	cv_eval_time=20.62 sec	RMSE=0.074172	R^2=0.061754
[289/500]	cv_eval_time=23.71 sec	RMSE=0.073236	R^2=0.088584
[290/500]	cv_eval_time=22.55 sec	RMSE=0.073386	R^2=0.061259
[291/500]	cv_eval_time=23.14 sec	RMSE=0.073719	R^2=0.081543
[292/500]	cv_eval_time=25.12 sec	RMSE=0.

[413/500]	cv_eval_time=31.05 sec	RMSE=0.073988	R^2=0.098700
[414/500]	cv_eval_time=23.51 sec	RMSE=0.074066	R^2=0.054490
[415/500]	cv_eval_time=20.44 sec	RMSE=0.074820	R^2=0.081702
[416/500]	cv_eval_time=24.87 sec	RMSE=0.074433	R^2=0.053363
[417/500]	cv_eval_time=25.71 sec	RMSE=0.073883	R^2=0.023119
[418/500]	cv_eval_time=34.86 sec	RMSE=0.073184	R^2=0.097126
[419/500]	cv_eval_time=27.74 sec	RMSE=0.074406	R^2=0.070170
[420/500]	cv_eval_time=27.55 sec	RMSE=0.074780	R^2=0.090385
[421/500]	cv_eval_time=27.11 sec	RMSE=0.073820	R^2=0.061066
[422/500]	cv_eval_time=29.37 sec	RMSE=0.073704	R^2=0.034046
[423/500]	cv_eval_time=34.45 sec	RMSE=0.073857	R^2=0.071773
[424/500]	cv_eval_time=20.75 sec	RMSE=0.073253	R^2=0.055011
[425/500]	cv_eval_time=37.07 sec	RMSE=0.074358	R^2=0.074735
[426/500]	cv_eval_time=25.25 sec	RMSE=0.074062	R^2=0.074992
[427/500]	cv_eval_time=33.09 sec	RMSE=0.073600	R^2=0.072142
[428/500]	cv_eval_time=29.62 sec	RMSE=0.073100	R^2=0.086864
[429/500]	cv_eval_time=28.06 sec	RMSE=0.