In [1]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor

import os
import sys
# Put the directory two levels above the working directory on the import
# path so that the project-local `models` package can be imported below.
module_path = os.path.abspath('../..')
if module_path not in sys.path:
    # Guarded so that re-running this cell does not add duplicate entries
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel

  from pandas.core import datetools


In [2]:
# Load the preprocessed dataset from its pickle file (path is relative to
# the notebook's working directory).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
preprocessed_data = pd.read_pickle('../data/preprocessed_twitter_Klout.pd')

# Wrap the loaded object in a DataFrame; the later cells work on `df`
df = pd.DataFrame(data=preprocessed_data)

In [3]:
# Hold out 20% of the rows as the test set and keep the rest for training.
# The fixed random_state makes the split identical on every run.
train, test = train_test_split(df, random_state=0, test_size=0.2)

In [4]:
# Report how many rows ended up in each side of the split
n_train, n_test = len(train), len(test)
print('Number of observations in the training data:', n_train)
print('Number of observations in the test data:', n_test)

Number of observations in the training data: 18680
Number of observations in the test data: 4670


In [5]:
    # --- Cell configuration -------------------------------------------------
    # DO_LOWESS is forwarded as `do_lowess` to nn.run() and
    # nn.plot_predicted_vs_actual() below.
    DO_LOWESS = False
    # True  -> search `nn.space` through nn.run()
    # False -> fit the pipeline once with MLPRegressor's default parameters
    hyperopt = True

    # Run MLPRegressor with hyperopt optimization.
    # Copies of the splits are handed over so the original `train` / `test`
    # frames are not shared with the model wrapper.
    nn = HyperoptModel(train.copy(), test.copy(), 'nn', cv=3, max_evals=500)

    # Every column of `df` except the 'score' column is used as a raw feature
    # NOTE(review): 'score' is presumably the regression target -- confirm
    # against HyperoptModel.
    features = list(df)
    features.remove('score')
    nn.raw_features = features

    # Single-step pipeline; the step name 'estimate' is the prefix of every
    # key in the search space below.
    nn.pipeline = Pipeline([
        ('estimate', MLPRegressor())
    ])

    # Search space: only `alpha` and `hidden_layer_sizes` actually vary;
    # activation and solver are single-option choices. The commented-out
    # entries are alternatives that are currently not searched.
    nn.space = {
         'estimate__alpha' : hp.uniform('estimate__alpha', 0.001, 1),
         'estimate__activation' : hp.choice('estimate__activation', ['logistic']), #'identity', 'logistic', 'tanh', 'relu'
         #'estimate__learning_rate' : hp.choice('estimate__learning_rate', ['constant', 'invscaling', 'adaptive']),
         # hp.uniform draws a float; scope.int converts it to an integer layer width
         'estimate__hidden_layer_sizes' : scope.int(hp.uniform('estimate__hidden_layer_sizes', 1, 100)),
         'estimate__solver' : hp.choice('estimate__solver', ['adam']), #'lbfgs', 'sgd',
         #'estimate__max_iter' : scope.int(hp.uniform('estimate__max_iter', 500, 1000))
    }

    if hyperopt:
        nn.run(do_lowess=DO_LOWESS)
    else:
        # train with default params
        nn.pipeline.fit(X=nn.X_train, y=nn.y_train)
        # Fixed: this used to read `cat.pipeline`, but no `cat` object exists
        # in this notebook, so the branch raised NameError after fitting.
        nn.model = nn.pipeline
        nn.stats()
        nn.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        # NOTE(review): MLPRegressor exposes no feature_importances_ attribute;
        # verify that HyperoptModel.plot_feature_importance copes with that.
        nn.plot_feature_importance()
        nn.qq_plot()

Performing parameters optimization...
[1/500]	cv_eval_time=6.38 sec	RMSE=0.209746	R^2=-0.011027
[2/500]	cv_eval_time=2.24 sec	RMSE=0.207244	R^2=-0.002231
[3/500]	cv_eval_time=3.81 sec	RMSE=0.207663	R^2=-0.066179
[4/500]	cv_eval_time=5.95 sec	RMSE=0.207127	R^2=0.000676
[5/500]	cv_eval_time=6.27 sec	RMSE=0.207641	R^2=-0.006228
[6/500]	cv_eval_time=5.73 sec	RMSE=0.207746	R^2=-0.010702
[7/500]	cv_eval_time=4.01 sec	RMSE=0.224685	R^2=-0.007650
[8/500]	cv_eval_time=5.29 sec	RMSE=0.207383	R^2=-0.037969
[9/500]	cv_eval_time=2.35 sec	RMSE=0.209362	R^2=-0.006195
[10/500]	cv_eval_time=5.40 sec	RMSE=0.210637	R^2=-0.004554
[11/500]	cv_eval_time=4.35 sec	RMSE=0.209114	R^2=-0.003194
[12/500]	cv_eval_time=4.26 sec	RMSE=0.208037	R^2=-0.289923
[13/500]	cv_eval_time=7.72 sec	RMSE=0.208209	R^2=-0.025429
[14/500]	cv_eval_time=3.27 sec	RMSE=0.211711	R^2=-0.009932
[15/500]	cv_eval_time=2.65 sec	RMSE=0.207444	R^2=-0.000706
[16/500]	cv_eval_time=6.70 sec	RMSE=0.207190	R^2=-0.011726
[17/500]	cv_eval_time=3.95 s

[138/500]	cv_eval_time=7.70 sec	RMSE=0.207127	R^2=-0.008841
[139/500]	cv_eval_time=10.55 sec	RMSE=0.207119	R^2=-0.003767
[140/500]	cv_eval_time=10.51 sec	RMSE=0.209461	R^2=-0.002997
[141/500]	cv_eval_time=7.63 sec	RMSE=0.212143	R^2=-0.017795
[142/500]	cv_eval_time=13.14 sec	RMSE=0.208761	R^2=-0.045070
[143/500]	cv_eval_time=6.11 sec	RMSE=0.207701	R^2=-0.000304
[144/500]	cv_eval_time=7.40 sec	RMSE=0.207170	R^2=-0.008585
[145/500]	cv_eval_time=5.97 sec	RMSE=0.238417	R^2=-0.005123
[146/500]	cv_eval_time=9.39 sec	RMSE=0.208483	R^2=-0.005996
[147/500]	cv_eval_time=9.89 sec	RMSE=0.208421	R^2=-0.090026
[148/500]	cv_eval_time=9.25 sec	RMSE=0.207776	R^2=-0.006205
[149/500]	cv_eval_time=10.25 sec	RMSE=0.208214	R^2=-0.106901
[150/500]	cv_eval_time=7.37 sec	RMSE=0.208372	R^2=-0.033666
[151/500]	cv_eval_time=9.35 sec	RMSE=0.208164	R^2=-0.071620
[152/500]	cv_eval_time=9.72 sec	RMSE=0.210390	R^2=-0.013537
[153/500]	cv_eval_time=6.92 sec	RMSE=0.222430	R^2=-0.010174
[154/500]	cv_eval_time=6.87 sec	RMSE

[275/500]	cv_eval_time=9.55 sec	RMSE=0.207019	R^2=-0.008236
[276/500]	cv_eval_time=9.96 sec	RMSE=0.210880	R^2=-0.033663
[277/500]	cv_eval_time=9.89 sec	RMSE=0.221348	R^2=-0.005103
[278/500]	cv_eval_time=10.46 sec	RMSE=0.207909	R^2=0.000078
[279/500]	cv_eval_time=8.60 sec	RMSE=0.218542	R^2=0.015538
[280/500]	cv_eval_time=12.01 sec	RMSE=0.207544	R^2=-0.005866
[281/500]	cv_eval_time=10.16 sec	RMSE=0.210959	R^2=-0.000157
[282/500]	cv_eval_time=11.93 sec	RMSE=0.208293	R^2=-0.000224
[283/500]	cv_eval_time=8.79 sec	RMSE=0.207200	R^2=-0.010043
[284/500]	cv_eval_time=9.38 sec	RMSE=0.207387	R^2=-0.002650
[285/500]	cv_eval_time=8.82 sec	RMSE=0.260280	R^2=-0.076835
[286/500]	cv_eval_time=10.42 sec	RMSE=0.213707	R^2=0.000263
[287/500]	cv_eval_time=8.35 sec	RMSE=0.226238	R^2=-0.000861
[288/500]	cv_eval_time=9.07 sec	RMSE=0.207554	R^2=-0.000665
[289/500]	cv_eval_time=8.33 sec	RMSE=0.209282	R^2=-0.027432
[290/500]	cv_eval_time=9.90 sec	RMSE=0.211172	R^2=-0.013664
[291/500]	cv_eval_time=10.71 sec	RMSE=

[412/500]	cv_eval_time=7.96 sec	RMSE=0.215265	R^2=-0.062828
[413/500]	cv_eval_time=9.75 sec	RMSE=0.207281	R^2=-0.003221
[414/500]	cv_eval_time=8.49 sec	RMSE=0.207515	R^2=0.001156
[415/500]	cv_eval_time=9.04 sec	RMSE=0.209851	R^2=-0.025487
[416/500]	cv_eval_time=8.53 sec	RMSE=0.208106	R^2=-0.061534
[417/500]	cv_eval_time=7.67 sec	RMSE=0.208263	R^2=-0.001120
[418/500]	cv_eval_time=8.27 sec	RMSE=0.206483	R^2=-0.064308
[419/500]	cv_eval_time=8.24 sec	RMSE=0.214067	R^2=-0.256937
[420/500]	cv_eval_time=7.82 sec	RMSE=0.215636	R^2=0.001017
[421/500]	cv_eval_time=6.71 sec	RMSE=0.206864	R^2=-0.011489
[422/500]	cv_eval_time=6.86 sec	RMSE=0.208868	R^2=-0.040685
[423/500]	cv_eval_time=9.72 sec	RMSE=0.221842	R^2=-0.019593
[424/500]	cv_eval_time=8.81 sec	RMSE=0.209243	R^2=-0.014254
[425/500]	cv_eval_time=8.84 sec	RMSE=0.208256	R^2=-0.009375
[426/500]	cv_eval_time=6.28 sec	RMSE=0.207324	R^2=-0.240463
[427/500]	cv_eval_time=5.68 sec	RMSE=0.207248	R^2=-0.014542
[428/500]	cv_eval_time=7.52 sec	RMSE=0.207