In [1]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor

import os
import sys
# Resolve the directory two levels above the current working directory and
# register it on the import path exactly once (presumably this is where the
# project-local `models` package imported right after lives -- confirm).
module_path = os.path.abspath('../..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel

  from pandas.core import datetools


In [2]:
# Load the preprocessed dataset from its pickle file.
# NOTE: unpickling can execute arbitrary code -- only load files from a
# trusted source.
preprocessed_data = pd.read_pickle(
    '../data/preprocessed_science_qualitative.pd'
)

# Wrap the loaded object in a DataFrame; later cells work on `df`.
df = pd.DataFrame(data=preprocessed_data)

In [3]:
# Split the rows into a training frame and a test frame: 20% of the rows are
# held out for testing, and the fixed random_state makes the split repeatable.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=0,
)

In [4]:
# Report the row count of each split as a quick sanity check.
print('Number of observations in the training data:', train.shape[0])
print('Number of observations in the test data:', test.shape[0])

Number of observations in the training data: 79342
Number of observations in the test data: 19836


In [5]:
    # --- Run configuration -------------------------------------------------
    # DO_LOWESS is forwarded as `do_lowess` to nn.run() and
    # nn.plot_predicted_vs_actual() below.
    DO_LOWESS = False
    # True  -> tune hyper-parameters through nn.run()
    # False -> fit the pipeline once with MLPRegressor's default parameters
    hyperopt = True

    # --- Model wrapper -----------------------------------------------------
    # HyperoptModel is the project-local helper imported at the top of the
    # notebook. It receives copies of the train/test frames so the originals
    # stay untouched; cv=3 and max_evals=500 are passed through to it.
    nn = HyperoptModel(train.copy(), test.copy(), 'nn', cv=3, max_evals=500)

    # Every column of df except the target column 'score' is an input feature.
    features = list(df)
    features.remove('score')
    nn.raw_features = features

    # Single-step pipeline: the estimator is registered under the name
    # 'estimate', which is why every search-space key is prefixed 'estimate__'.
    # NOTE(review): MLPRegressor() is created without a random_state, so scores
    # can differ between otherwise identical runs -- consider seeding it.
    nn.pipeline = Pipeline([
        ('estimate', MLPRegressor()),
    ])

    # Hyperopt search space. The trailing comments list the alternatives that
    # are currently excluded from the search.
    nn.space = {
        'estimate__alpha': hp.uniform('estimate__alpha', 0.001, 1),
        'estimate__activation': hp.choice('estimate__activation', ['logistic']),  # 'identity', 'logistic', 'tanh', 'relu'
        # 'estimate__learning_rate': hp.choice('estimate__learning_rate', ['constant', 'invscaling', 'adaptive']),
        # Sampled as a float in [1, 100] and cast to int by scope.int.
        'estimate__hidden_layer_sizes': scope.int(hp.uniform('estimate__hidden_layer_sizes', 1, 100)),
        'estimate__solver': hp.choice('estimate__solver', ['adam']),  # 'lbfgs', 'sgd',
        # 'estimate__max_iter': scope.int(hp.uniform('estimate__max_iter', 500, 1000))
    }

    if hyperopt:
        nn.run(do_lowess=DO_LOWESS)
    else:
        # Train once with the estimator's default parameters.
        nn.pipeline.fit(X=nn.X_train, y=nn.y_train)
        # Fix: this used to read `cat.pipeline`, but no `cat` object exists in
        # this notebook, so this branch raised NameError. The fitted pipeline
        # of this wrapper is what must be stored as the model.
        nn.model = nn.pipeline
        nn.stats()
        nn.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        nn.plot_feature_importance()
        nn.qq_plot()

Performing parameters optimization...
[1/500]	cv_eval_time=61.39 sec	RMSE=0.117249	R^2=0.433545
[2/500]	cv_eval_time=30.24 sec	RMSE=0.116529	R^2=0.435736
[3/500]	cv_eval_time=60.64 sec	RMSE=0.115711	R^2=0.452616
[4/500]	cv_eval_time=69.21 sec	RMSE=0.115647	R^2=0.429255
[5/500]	cv_eval_time=62.85 sec	RMSE=0.116379	R^2=0.438244
[6/500]	cv_eval_time=76.23 sec	RMSE=0.117653	R^2=0.448890
[7/500]	cv_eval_time=87.09 sec	RMSE=0.115713	R^2=0.436052
[8/500]	cv_eval_time=67.84 sec	RMSE=0.115858	R^2=0.441531
[9/500]	cv_eval_time=55.69 sec	RMSE=0.117323	R^2=0.407516
[10/500]	cv_eval_time=47.75 sec	RMSE=0.115537	R^2=0.433756
[11/500]	cv_eval_time=68.98 sec	RMSE=0.114581	R^2=0.447729
[12/500]	cv_eval_time=34.04 sec	RMSE=0.116190	R^2=0.412869
[13/500]	cv_eval_time=39.40 sec	RMSE=0.116995	R^2=0.438635
[14/500]	cv_eval_time=57.12 sec	RMSE=0.116394	R^2=0.433497
[15/500]	cv_eval_time=49.20 sec	RMSE=0.120802	R^2=0.423664
[16/500]	cv_eval_time=77.61 sec	RMSE=0.115529	R^2=0.422731
[17/500]	cv_eval_time=59.80

[139/500]	cv_eval_time=47.25 sec	RMSE=0.119351	R^2=0.425824
[140/500]	cv_eval_time=63.79 sec	RMSE=0.116253	R^2=0.431730
[141/500]	cv_eval_time=63.60 sec	RMSE=0.115478	R^2=0.441246
[142/500]	cv_eval_time=61.07 sec	RMSE=0.116915	R^2=0.447225
[143/500]	cv_eval_time=66.68 sec	RMSE=0.114546	R^2=0.394344
[144/500]	cv_eval_time=70.98 sec	RMSE=0.115897	R^2=0.422421
[145/500]	cv_eval_time=64.83 sec	RMSE=0.117431	R^2=0.441736
[146/500]	cv_eval_time=57.89 sec	RMSE=0.115710	R^2=0.442329
[147/500]	cv_eval_time=61.90 sec	RMSE=0.115049	R^2=0.450174
[148/500]	cv_eval_time=57.83 sec	RMSE=0.117596	R^2=0.446479
[149/500]	cv_eval_time=53.87 sec	RMSE=0.117442	R^2=0.449997
[150/500]	cv_eval_time=53.83 sec	RMSE=0.117880	R^2=0.448721
[151/500]	cv_eval_time=53.30 sec	RMSE=0.114451	R^2=0.446507
[152/500]	cv_eval_time=58.88 sec	RMSE=0.115550	R^2=0.370499
[153/500]	cv_eval_time=69.11 sec	RMSE=0.115736	R^2=0.443828
[154/500]	cv_eval_time=61.05 sec	RMSE=0.115273	R^2=0.420527
[155/500]	cv_eval_time=62.82 sec	RMSE=0.

[276/500]	cv_eval_time=43.65 sec	RMSE=0.115504	R^2=0.441273
[277/500]	cv_eval_time=36.90 sec	RMSE=0.116035	R^2=0.446720
[278/500]	cv_eval_time=51.96 sec	RMSE=0.115529	R^2=0.443496
[279/500]	cv_eval_time=47.33 sec	RMSE=0.116183	R^2=0.451303
[280/500]	cv_eval_time=38.03 sec	RMSE=0.115268	R^2=0.436779
[281/500]	cv_eval_time=42.45 sec	RMSE=0.117108	R^2=0.442117
[282/500]	cv_eval_time=51.14 sec	RMSE=0.116178	R^2=0.440986
[283/500]	cv_eval_time=55.81 sec	RMSE=0.118084	R^2=0.457538
[284/500]	cv_eval_time=43.16 sec	RMSE=0.116022	R^2=0.439744
[285/500]	cv_eval_time=48.39 sec	RMSE=0.114297	R^2=0.453290
[286/500]	cv_eval_time=41.54 sec	RMSE=0.118184	R^2=0.440037
[287/500]	cv_eval_time=44.65 sec	RMSE=0.115657	R^2=0.434055
[288/500]	cv_eval_time=39.61 sec	RMSE=0.115066	R^2=0.434446
[289/500]	cv_eval_time=52.05 sec	RMSE=0.117472	R^2=0.449744
[290/500]	cv_eval_time=36.25 sec	RMSE=0.116076	R^2=0.449885
[291/500]	cv_eval_time=46.40 sec	RMSE=0.117099	R^2=0.427086
[292/500]	cv_eval_time=41.26 sec	RMSE=0.

[413/500]	cv_eval_time=31.34 sec	RMSE=0.115343	R^2=0.445161
[414/500]	cv_eval_time=28.00 sec	RMSE=0.117234	R^2=0.446167
[415/500]	cv_eval_time=36.71 sec	RMSE=0.117694	R^2=0.442834
[416/500]	cv_eval_time=30.95 sec	RMSE=0.116185	R^2=0.449002
[417/500]	cv_eval_time=30.47 sec	RMSE=0.115536	R^2=0.449619
[418/500]	cv_eval_time=33.03 sec	RMSE=0.115438	R^2=0.446813
[419/500]	cv_eval_time=32.06 sec	RMSE=0.114608	R^2=0.453770
[420/500]	cv_eval_time=25.79 sec	RMSE=0.116632	R^2=0.439939
[421/500]	cv_eval_time=31.22 sec	RMSE=0.115831	R^2=0.450726
[422/500]	cv_eval_time=34.27 sec	RMSE=0.116797	R^2=0.443314
[423/500]	cv_eval_time=36.48 sec	RMSE=0.115941	R^2=0.445535
[424/500]	cv_eval_time=31.00 sec	RMSE=0.115108	R^2=0.436132
[425/500]	cv_eval_time=31.30 sec	RMSE=0.114399	R^2=0.425239
[426/500]	cv_eval_time=29.53 sec	RMSE=0.115486	R^2=0.405619
[427/500]	cv_eval_time=31.14 sec	RMSE=0.115444	R^2=0.442052
[428/500]	cv_eval_time=33.06 sec	RMSE=0.118889	R^2=0.442964
[429/500]	cv_eval_time=30.27 sec	RMSE=0.