In [1]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor

import os
import sys
# Resolve the directory two levels above the current working directory and
# put it on the import search path (once), so project packages can be imported.
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel

  from pandas.core import datetools


In [2]:
# Load the preprocessed data from its pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
preprocessed_data = pd.read_pickle('../data/preprocessed_twitter_Klout.pd')

# Build the working dataframe from the unpickled object
df = pd.DataFrame(data=preprocessed_data)

In [3]:
# Split the rows into a training frame and a 20% hold-out test frame;
# the fixed random_state keeps the partition identical between runs.
train, test = train_test_split(df, random_state=0, test_size=0.2)

In [4]:
# Report how many rows ended up in each partition
print('Number of observations in the training data:', len(train.index))
print('Number of observations in the test data:', len(test.index))

Number of observations in the training data: 18680
Number of observations in the test data: 4670


In [5]:
    # --- Neural-network regression (MLPRegressor) tuned through HyperoptModel ---
    #
    # Flags for this cell:
    #   DO_LOWESS - forwarded as `do_lowess` to the run / plotting calls below.
    #   hyperopt  - True: run the hyperopt parameter search;
    #               False: fit the pipeline once with MLPRegressor defaults.
    DO_LOWESS = False
    hyperopt = True

    # Run MLPRegressor with hyperopt optimization.
    # Copies of the split frames are handed over so the originals stay untouched;
    # cv=4 and max_evals=500 are forwarded to the project's HyperoptModel.
    nn = HyperoptModel(train.copy(), test.copy(),'nn', cv=4, max_evals=500)

    # Every column except 'score' is used as a raw feature
    # (presumably 'score' is the regression target - confirm against HyperoptModel).
    features = list(df)
    features.remove('score')
    nn.raw_features = features

    # Single-step pipeline; the step name 'estimate' is the prefix of the
    # search-space keys below ('estimate__<param>').
    # NOTE(review): MLPRegressor is created without random_state, so repeated
    # runs are not expected to reproduce the exact scores.
    nn.pipeline = Pipeline([
        ('estimate', MLPRegressor())
    ])

    # Hyperopt search space. Activation and solver are single-option choices,
    # so only alpha and the hidden layer width are actually searched.
    nn.space = {
         'estimate__alpha' : hp.uniform('estimate__alpha', 0.001, 1),
         'estimate__activation' : hp.choice('estimate__activation', ['logistic']), #'identity', 'logistic', 'tanh', 'relu'
         #'estimate__learning_rate' : hp.choice('estimate__learning_rate', ['constant', 'invscaling', 'adaptive']),
         'estimate__hidden_layer_sizes' : scope.int(hp.uniform('estimate__hidden_layer_sizes', 1, 100)),
         'estimate__solver' : hp.choice('estimate__solver', ['adam']), #'lbfgs', 'sgd',
         #'estimate__max_iter' : scope.int(hp.uniform('estimate__max_iter', 500, 1000))
    }

    if hyperopt:
        nn.run(do_lowess=DO_LOWESS)
    else:
        # train with default params
        nn.pipeline.fit(X=nn.X_train, y=nn.y_train)
        # Fixed: this used to read `cat.pipeline`, but no `cat` object exists in
        # this notebook, so the branch raised NameError. The fitted pipeline of
        # this model is the one to expose.
        nn.model = nn.pipeline
        nn.stats()
        nn.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        # NOTE(review): MLPRegressor has no feature_importances_ attribute;
        # confirm HyperoptModel.plot_feature_importance handles this estimator.
        nn.plot_feature_importance()
        nn.qq_plot()

Performing parameters optimization...
[1/500]	cv_eval_time=14.49 sec	RMSE=0.155211	R^2=0.431761
[2/500]	cv_eval_time=26.01 sec	RMSE=0.153676	R^2=0.425751
[3/500]	cv_eval_time=12.13 sec	RMSE=0.158556	R^2=0.426492
[4/500]	cv_eval_time=14.68 sec	RMSE=0.159333	R^2=0.398978
[5/500]	cv_eval_time=15.98 sec	RMSE=0.158490	R^2=0.423922
[6/500]	cv_eval_time=29.35 sec	RMSE=0.153649	R^2=0.452050
[7/500]	cv_eval_time=31.79 sec	RMSE=0.154332	R^2=0.425018
[8/500]	cv_eval_time=43.37 sec	RMSE=0.156747	R^2=0.451449
[9/500]	cv_eval_time=14.48 sec	RMSE=0.192874	R^2=-0.004976
[10/500]	cv_eval_time=56.99 sec	RMSE=0.154089	R^2=0.442906
[11/500]	cv_eval_time=25.49 sec	RMSE=0.157708	R^2=0.391091
[12/500]	cv_eval_time=38.14 sec	RMSE=0.154487	R^2=0.448582
[13/500]	cv_eval_time=46.53 sec	RMSE=0.156030	R^2=0.434195
[14/500]	cv_eval_time=29.55 sec	RMSE=0.156849	R^2=0.416417
[15/500]	cv_eval_time=31.42 sec	RMSE=0.161274	R^2=0.410990
[16/500]	cv_eval_time=26.64 sec	RMSE=0.162090	R^2=0.383828
[17/500]	cv_eval_time=37.0

[139/500]	cv_eval_time=28.31 sec	RMSE=0.160887	R^2=0.413941
[140/500]	cv_eval_time=31.20 sec	RMSE=0.154324	R^2=0.409328
[141/500]	cv_eval_time=21.95 sec	RMSE=0.161209	R^2=0.392649
[142/500]	cv_eval_time=35.32 sec	RMSE=0.154053	R^2=0.417629
[143/500]	cv_eval_time=40.67 sec	RMSE=0.156657	R^2=0.439409
[144/500]	cv_eval_time=43.74 sec	RMSE=0.156333	R^2=0.443589
[145/500]	cv_eval_time=45.96 sec	RMSE=0.157188	R^2=0.436074
[146/500]	cv_eval_time=41.75 sec	RMSE=0.156640	R^2=0.431167
[147/500]	cv_eval_time=39.24 sec	RMSE=0.156782	R^2=0.439020
[148/500]	cv_eval_time=39.96 sec	RMSE=0.155596	R^2=0.438946
[149/500]	cv_eval_time=40.23 sec	RMSE=0.156555	R^2=0.437469
[150/500]	cv_eval_time=49.60 sec	RMSE=0.154713	R^2=0.429699
[151/500]	cv_eval_time=40.31 sec	RMSE=0.156262	R^2=0.431734
[152/500]	cv_eval_time=45.33 sec	RMSE=0.154060	R^2=0.441029
[153/500]	cv_eval_time=40.11 sec	RMSE=0.157567	R^2=0.451292
[154/500]	cv_eval_time=42.99 sec	RMSE=0.154017	R^2=0.413768
[155/500]	cv_eval_time=43.81 sec	RMSE=0.

[276/500]	cv_eval_time=38.32 sec	RMSE=0.156293	R^2=0.431038
[277/500]	cv_eval_time=37.55 sec	RMSE=0.158189	R^2=0.447147
[278/500]	cv_eval_time=32.51 sec	RMSE=0.157058	R^2=0.427807
[279/500]	cv_eval_time=32.11 sec	RMSE=0.155980	R^2=0.410438
[280/500]	cv_eval_time=27.50 sec	RMSE=0.160237	R^2=0.428196
[281/500]	cv_eval_time=31.04 sec	RMSE=0.153865	R^2=0.403409
[282/500]	cv_eval_time=40.08 sec	RMSE=0.154998	R^2=0.433706
[283/500]	cv_eval_time=32.19 sec	RMSE=0.156068	R^2=0.381294
[284/500]	cv_eval_time=37.61 sec	RMSE=0.155795	R^2=0.428168
[285/500]	cv_eval_time=41.73 sec	RMSE=0.157733	R^2=0.460800
[286/500]	cv_eval_time=28.02 sec	RMSE=0.158438	R^2=0.444611
[287/500]	cv_eval_time=26.34 sec	RMSE=0.156247	R^2=0.408924
[288/500]	cv_eval_time=38.56 sec	RMSE=0.155707	R^2=0.411205
[289/500]	cv_eval_time=37.06 sec	RMSE=0.157524	R^2=0.425955
[290/500]	cv_eval_time=41.03 sec	RMSE=0.157888	R^2=0.427242
[291/500]	cv_eval_time=29.57 sec	RMSE=0.155204	R^2=0.437767
[292/500]	cv_eval_time=44.64 sec	RMSE=0.

[413/500]	cv_eval_time=27.18 sec	RMSE=0.156957	R^2=0.444895
[414/500]	cv_eval_time=27.52 sec	RMSE=0.155036	R^2=0.440464
[415/500]	cv_eval_time=29.56 sec	RMSE=0.155000	R^2=0.416866
[416/500]	cv_eval_time=29.95 sec	RMSE=0.155157	R^2=0.443009
[417/500]	cv_eval_time=26.64 sec	RMSE=0.157727	R^2=0.404280
[418/500]	cv_eval_time=27.45 sec	RMSE=0.157304	R^2=0.425567
[419/500]	cv_eval_time=28.65 sec	RMSE=0.155952	R^2=0.416465
[420/500]	cv_eval_time=34.21 sec	RMSE=0.157062	R^2=0.425883
[421/500]	cv_eval_time=33.15 sec	RMSE=0.153806	R^2=0.424926
[422/500]	cv_eval_time=31.00 sec	RMSE=0.154522	R^2=0.447070
[423/500]	cv_eval_time=27.85 sec	RMSE=0.155445	R^2=0.421396
[424/500]	cv_eval_time=34.76 sec	RMSE=0.153833	R^2=0.438357
[425/500]	cv_eval_time=32.09 sec	RMSE=0.155924	R^2=0.442689
[426/500]	cv_eval_time=29.80 sec	RMSE=0.157690	R^2=0.410625
[427/500]	cv_eval_time=28.45 sec	RMSE=0.157756	R^2=0.428364
[428/500]	cv_eval_time=34.58 sec	RMSE=0.154402	R^2=0.437950
[429/500]	cv_eval_time=28.95 sec	RMSE=0.