In [1]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor

import os
import sys
# Resolve the directory two levels above the current working directory and
# put it on the import path (once), ahead of the project-local import below.
module_path = os.path.abspath('../..')
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel

  from pandas.core import datetools


In [2]:
# Load the pickled, preprocessed dataset from disk.
# NOTE: unpickling executes arbitrary code; only load files from a trusted source.
preprocessed_data = pd.read_pickle(
    '../data/preprocessed_twitter_qualitative.pd'
)

# Build the working DataFrame from the loaded object.
df = pd.DataFrame(data=preprocessed_data)

In [3]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split repeatable across runs.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=0,
)

In [4]:
# Report how many rows ended up in each side of the split.
for message, frame in (
    ('Number of observations in the training data:', train),
    ('Number of observations in the test data:', test),
):
    print(message, len(frame))

Number of observations in the training data: 18680
Number of observations in the test data: 4670


In [5]:
    # --- Run configuration ---------------------------------------------------
    # Forwarded as `do_lowess` to the run / plotting calls below.
    DO_LOWESS = False
    # True  -> run the hyperparameter search through nn.run().
    # False -> fit the pipeline once with MLPRegressor's default parameters.
    hyperopt = True

    # Wrap copies of the train/test frames in the project's HyperoptModel helper,
    # labelled 'nn'. cv=4 and max_evals=500 are passed straight through
    # (presumably the CV fold count and the search-iteration budget -- confirm
    # against models.hyperopt_model).
    nn = HyperoptModel(train.copy(), test.copy(),'nn', cv=4, max_evals=500)

    # Use every column of df except the 'score' column as a raw input feature.
    # list.remove raises ValueError if 'score' is missing, which is the desired
    # loud failure here.
    features = list(df)
    features.remove('score')
    nn.raw_features = features

    # Single-step pipeline; the step name 'estimate' is the prefix used by the
    # 'estimate__<param>' keys in the search space below.
    # NOTE(review): MLPRegressor is created without random_state, so repeated
    # runs are not expected to reproduce the exact same scores.
    nn.pipeline = Pipeline([
        ('estimate', MLPRegressor())
    ])

    # Search space. The single-option hp.choice entries pin that parameter while
    # keeping it in the space; the alternatives that were tried are kept in the
    # trailing comments.
    nn.space = {
         'estimate__alpha' : hp.uniform('estimate__alpha', 0.001, 1),
         'estimate__activation' : hp.choice('estimate__activation', ['logistic']), # 'identity', 'logistic', 'tanh', 'relu'
         #'estimate__learning_rate' : hp.choice('estimate__learning_rate', ['constant', 'invscaling', 'adaptive']),
         # Uniform draw in [1, 100] cast to int and passed as hidden_layer_sizes.
         'estimate__hidden_layer_sizes' : scope.int(hp.uniform('estimate__hidden_layer_sizes', 1, 100)),
         'estimate__solver' : hp.choice('estimate__solver', ['adam']), #'lbfgs', 'sgd',
         #'estimate__max_iter' : scope.int(hp.uniform('estimate__max_iter', 500, 1000))
    }

    if hyperopt:
        nn.run(do_lowess=DO_LOWESS)
    else:
        # train with default params
        nn.pipeline.fit(X=nn.X_train, y=nn.y_train)
        # Fix: this previously read `cat.pipeline`, but no `cat` object exists in
        # this notebook, so the branch raised NameError. The fitted pipeline of
        # this model is the one to report on.
        nn.model = nn.pipeline
        nn.stats()
        nn.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        nn.plot_feature_importance()
        nn.qq_plot()

Performing parameters optimization...
[1/500]	cv_eval_time=21.75 sec	RMSE=0.023073	R^2=-1.988413
[2/500]	cv_eval_time=27.29 sec	RMSE=0.027690	R^2=-3.493761
[3/500]	cv_eval_time=40.52 sec	RMSE=0.016380	R^2=-1.463709
[4/500]	cv_eval_time=30.57 sec	RMSE=0.017960	R^2=-4.727255
[5/500]	cv_eval_time=40.04 sec	RMSE=0.018651	R^2=-1.280264
[6/500]	cv_eval_time=25.66 sec	RMSE=0.017301	R^2=-0.410585
[7/500]	cv_eval_time=18.70 sec	RMSE=0.015282	R^2=-1.199858
[8/500]	cv_eval_time=24.13 sec	RMSE=0.019858	R^2=-1.235536
[9/500]	cv_eval_time=19.35 sec	RMSE=0.019424	R^2=-1.237853
[10/500]	cv_eval_time=23.78 sec	RMSE=0.017484	R^2=-2.050458
[11/500]	cv_eval_time=14.67 sec	RMSE=0.021338	R^2=-2.203309
[12/500]	cv_eval_time=21.71 sec	RMSE=0.013327	R^2=-0.213331
[13/500]	cv_eval_time=27.08 sec	RMSE=0.015742	R^2=-0.666239
[14/500]	cv_eval_time=22.27 sec	RMSE=0.031130	R^2=-4.803348
[15/500]	cv_eval_time=16.27 sec	RMSE=0.027622	R^2=-4.720866
[16/500]	cv_eval_time=18.50 sec	RMSE=0.018417	R^2=-2.055203
[17/500]	cv

[137/500]	cv_eval_time=31.85 sec	RMSE=0.012552	R^2=-0.549913
[138/500]	cv_eval_time=21.32 sec	RMSE=0.012691	R^2=-0.830338
[139/500]	cv_eval_time=16.05 sec	RMSE=0.014887	R^2=-0.801897
[140/500]	cv_eval_time=30.56 sec	RMSE=0.013856	R^2=-1.393767
[141/500]	cv_eval_time=24.50 sec	RMSE=0.014208	R^2=-0.370667
[142/500]	cv_eval_time=30.82 sec	RMSE=0.012221	R^2=-0.383190
[143/500]	cv_eval_time=21.62 sec	RMSE=0.015056	R^2=-1.787787
[144/500]	cv_eval_time=34.42 sec	RMSE=0.014682	R^2=-0.645942
[145/500]	cv_eval_time=29.39 sec	RMSE=0.022152	R^2=-5.850900
[146/500]	cv_eval_time=35.41 sec	RMSE=0.013646	R^2=-1.233029
[147/500]	cv_eval_time=33.50 sec	RMSE=0.012470	R^2=-0.509305
[148/500]	cv_eval_time=40.71 sec	RMSE=0.015539	R^2=-0.464949
[149/500]	cv_eval_time=29.67 sec	RMSE=0.013069	R^2=-0.669709
[150/500]	cv_eval_time=34.12 sec	RMSE=0.013951	R^2=-0.440624
[151/500]	cv_eval_time=33.79 sec	RMSE=0.013852	R^2=-0.964437
[152/500]	cv_eval_time=34.33 sec	RMSE=0.013097	R^2=-0.508330
[153/500]	cv_eval_time=2

[272/500]	cv_eval_time=35.75 sec	RMSE=0.013556	R^2=-0.267514
[273/500]	cv_eval_time=36.84 sec	RMSE=0.012750	R^2=-0.313134
[274/500]	cv_eval_time=35.55 sec	RMSE=0.013615	R^2=-0.234497
[275/500]	cv_eval_time=42.22 sec	RMSE=0.013436	R^2=-0.492962
[276/500]	cv_eval_time=35.40 sec	RMSE=0.012703	R^2=-0.434391
[277/500]	cv_eval_time=39.26 sec	RMSE=0.014931	R^2=-0.793106
[278/500]	cv_eval_time=32.28 sec	RMSE=0.014118	R^2=-0.200836
[279/500]	cv_eval_time=28.93 sec	RMSE=0.016688	R^2=-1.461370
[280/500]	cv_eval_time=38.24 sec	RMSE=0.014194	R^2=-0.699178
[281/500]	cv_eval_time=27.80 sec	RMSE=0.013804	R^2=-0.265637
[282/500]	cv_eval_time=35.53 sec	RMSE=0.014283	R^2=-0.803450
[283/500]	cv_eval_time=30.81 sec	RMSE=0.015858	R^2=-3.560121
[284/500]	cv_eval_time=32.71 sec	RMSE=0.014704	R^2=-0.281850
[285/500]	cv_eval_time=31.18 sec	RMSE=0.014550	R^2=-0.299728
[286/500]	cv_eval_time=37.25 sec	RMSE=0.012543	R^2=-0.533618
[287/500]	cv_eval_time=34.85 sec	RMSE=0.013768	R^2=-0.297807
[288/500]	cv_eval_time=3

[407/500]	cv_eval_time=33.66 sec	RMSE=0.013940	R^2=-0.641278
[408/500]	cv_eval_time=28.03 sec	RMSE=0.014172	R^2=-0.162675
[409/500]	cv_eval_time=35.03 sec	RMSE=0.014476	R^2=-0.625738
[410/500]	cv_eval_time=29.30 sec	RMSE=0.014751	R^2=-0.256859
[411/500]	cv_eval_time=33.18 sec	RMSE=0.012440	R^2=-0.222612
[412/500]	cv_eval_time=30.40 sec	RMSE=0.013148	R^2=-0.068825
[413/500]	cv_eval_time=38.61 sec	RMSE=0.013658	R^2=-0.556191
[414/500]	cv_eval_time=38.10 sec	RMSE=0.013610	R^2=-0.490392
[415/500]	cv_eval_time=36.41 sec	RMSE=0.012483	R^2=-0.992495
[416/500]	cv_eval_time=30.93 sec	RMSE=0.013108	R^2=-0.530515
[417/500]	cv_eval_time=25.70 sec	RMSE=0.012353	R^2=-0.111724
[418/500]	cv_eval_time=24.85 sec	RMSE=0.013843	R^2=-0.325080
[419/500]	cv_eval_time=29.82 sec	RMSE=0.013089	R^2=-0.302803
[420/500]	cv_eval_time=29.61 sec	RMSE=0.014298	R^2=-0.445829
[421/500]	cv_eval_time=36.13 sec	RMSE=0.013130	R^2=-0.459516
[422/500]	cv_eval_time=30.82 sec	RMSE=0.013692	R^2=-0.955310
[423/500]	cv_eval_time=2