In [1]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor

import os
import sys
# Put the directory two levels up on the import path (only once) so that the
# project-local `models` package imported below can be resolved.
module_path = os.path.abspath('../..')
if module_path not in sys.path:
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel

  from pandas.core import datetools


In [2]:
# Load the pickled, preprocessed dataset from disk.
# NOTE(review): unpickling executes arbitrary code -- only load trusted files.
data_path = '../data/preprocessed_science_quantitative.pd'
preprocessed_data = pd.read_pickle(data_path)

# Wrap the loaded object in a DataFrame for the cells that follow
df = pd.DataFrame(data=preprocessed_data)

In [3]:
# Hold out 20% of the rows as the test partition; the remaining 80% is the
# training partition. The fixed random_state keeps the split repeatable.
train, test = train_test_split(df, random_state=0, test_size=0.2)

In [4]:
# Report how many rows ended up in each partition
n_train, n_test = len(train), len(test)
print('Number of observations in the training data:', n_train)
print('Number of observations in the test data:', n_test)

Number of observations in the training data: 79342
Number of observations in the test data: 19836


In [5]:
    # Run settings for this cell
    DO_LOWESS = False  # forwarded as do_lowess= to nn.run / nn.plot_predicted_vs_actual
    hyperopt = True    # True: hyperopt search via nn.run; False: one fit with default params

    # MLPRegressor wrapped in the project's HyperoptModel helper.
    # cv=3 / max_evals=500 are passed straight to HyperoptModel
    # (presumably CV folds and number of search iterations -- confirm in models/hyperopt_model.py).
    nn = HyperoptModel(train.copy(), test.copy(), 'nn', cv=3, max_evals=500)

    # Every column of df except the target 'score' is used as a feature
    features = list(df)
    features.remove('score')
    nn.raw_features = features

    # Single-step pipeline; the step name 'estimate' is the prefix of the
    # 'estimate__*' keys in the search space below.
    nn.pipeline = Pipeline([
        ('estimate', MLPRegressor())
    ])

    # Search space: alpha is drawn uniformly from [0.001, 1]; the hidden layer
    # size is drawn uniformly from [1, 100] and cast to int; activation and
    # solver are single-option choices, i.e. effectively fixed.
    nn.space = {
         'estimate__alpha' : hp.uniform('estimate__alpha', 0.001, 1),
         'estimate__activation' : hp.choice('estimate__activation', ['logistic']), # 'identity', 'logistic', 'tanh', 'relu'
         #'estimate__learning_rate' : hp.choice('estimate__learning_rate', ['constant', 'invscaling', 'adaptive']),
         'estimate__hidden_layer_sizes' : scope.int(hp.uniform('estimate__hidden_layer_sizes', 1, 100)),
         'estimate__solver' : hp.choice('estimate__solver', ['adam']), #'lbfgs', 'sgd',
         #'estimate__max_iter' : scope.int(hp.uniform('estimate__max_iter', 500, 1000))
    }

    if hyperopt:
        nn.run(do_lowess=DO_LOWESS)
    else:
        # train with default params
        nn.pipeline.fit(X=nn.X_train, y=nn.y_train)
        # Fixed: this used to read `cat.pipeline`, but no `cat` object is defined
        # in this notebook, so the branch raised NameError right after fitting.
        nn.model = nn.pipeline
        nn.stats()
        nn.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        # NOTE(review): MLPRegressor exposes no feature_importances_; confirm that
        # HyperoptModel.plot_feature_importance handles such estimators.
        nn.plot_feature_importance()
        nn.qq_plot()

Performing parameters optimization...
[1/500]	cv_eval_time=35.58 sec	RMSE=0.096735	R^2=0.768278
[2/500]	cv_eval_time=47.75 sec	RMSE=0.093961	R^2=0.779403
[3/500]	cv_eval_time=47.43 sec	RMSE=0.097255	R^2=0.775422
[4/500]	cv_eval_time=63.64 sec	RMSE=0.096672	R^2=0.783152
[5/500]	cv_eval_time=46.13 sec	RMSE=0.095932	R^2=0.764344
[6/500]	cv_eval_time=65.77 sec	RMSE=0.093037	R^2=0.770080
[7/500]	cv_eval_time=50.07 sec	RMSE=0.095587	R^2=0.772467
[8/500]	cv_eval_time=78.27 sec	RMSE=0.096096	R^2=0.770132
[9/500]	cv_eval_time=54.99 sec	RMSE=0.096701	R^2=0.776202
[10/500]	cv_eval_time=66.73 sec	RMSE=0.098436	R^2=0.772908
[11/500]	cv_eval_time=33.44 sec	RMSE=0.097375	R^2=0.772841
[12/500]	cv_eval_time=54.06 sec	RMSE=0.094021	R^2=0.772108
[13/500]	cv_eval_time=54.45 sec	RMSE=0.093551	R^2=0.779051
[14/500]	cv_eval_time=63.99 sec	RMSE=0.093389	R^2=0.782850
[15/500]	cv_eval_time=49.87 sec	RMSE=0.095072	R^2=0.777743
[16/500]	cv_eval_time=55.03 sec	RMSE=0.094592	R^2=0.778158
[17/500]	cv_eval_time=50.07

[139/500]	cv_eval_time=42.29 sec	RMSE=0.096093	R^2=0.768468
[140/500]	cv_eval_time=54.31 sec	RMSE=0.093436	R^2=0.773882
[141/500]	cv_eval_time=56.69 sec	RMSE=0.095940	R^2=0.778832
[142/500]	cv_eval_time=69.42 sec	RMSE=0.094342	R^2=0.777757
[143/500]	cv_eval_time=70.41 sec	RMSE=0.095961	R^2=0.771342
[144/500]	cv_eval_time=52.96 sec	RMSE=0.094063	R^2=0.781795
[145/500]	cv_eval_time=72.52 sec	RMSE=0.095142	R^2=0.778549
[146/500]	cv_eval_time=64.00 sec	RMSE=0.095851	R^2=0.774299
[147/500]	cv_eval_time=69.96 sec	RMSE=0.095628	R^2=0.774031
[148/500]	cv_eval_time=59.01 sec	RMSE=0.096904	R^2=0.784241
[149/500]	cv_eval_time=62.11 sec	RMSE=0.095721	R^2=0.781565
[150/500]	cv_eval_time=66.82 sec	RMSE=0.092868	R^2=0.783441
[151/500]	cv_eval_time=58.00 sec	RMSE=0.093718	R^2=0.780278
[152/500]	cv_eval_time=69.93 sec	RMSE=0.093064	R^2=0.776286
[153/500]	cv_eval_time=54.31 sec	RMSE=0.094164	R^2=0.777825
[154/500]	cv_eval_time=61.79 sec	RMSE=0.093623	R^2=0.776670
[155/500]	cv_eval_time=72.32 sec	RMSE=0.

[276/500]	cv_eval_time=51.92 sec	RMSE=0.094657	R^2=0.783740
[277/500]	cv_eval_time=46.33 sec	RMSE=0.094393	R^2=0.778182
[278/500]	cv_eval_time=35.22 sec	RMSE=0.093751	R^2=0.775385
[279/500]	cv_eval_time=39.77 sec	RMSE=0.094336	R^2=0.771269
[280/500]	cv_eval_time=52.36 sec	RMSE=0.093845	R^2=0.769283
[281/500]	cv_eval_time=44.33 sec	RMSE=0.094304	R^2=0.770642
[282/500]	cv_eval_time=40.76 sec	RMSE=0.094443	R^2=0.775224
[283/500]	cv_eval_time=42.25 sec	RMSE=0.093669	R^2=0.781211
[284/500]	cv_eval_time=46.48 sec	RMSE=0.096025	R^2=0.765496
[285/500]	cv_eval_time=47.71 sec	RMSE=0.093980	R^2=0.782225
[286/500]	cv_eval_time=31.85 sec	RMSE=0.094377	R^2=0.764696
[287/500]	cv_eval_time=45.83 sec	RMSE=0.094738	R^2=0.783024
[288/500]	cv_eval_time=44.46 sec	RMSE=0.093883	R^2=0.780985
[289/500]	cv_eval_time=29.59 sec	RMSE=0.093763	R^2=0.773563
[290/500]	cv_eval_time=27.70 sec	RMSE=0.097176	R^2=0.776072
[291/500]	cv_eval_time=37.90 sec	RMSE=0.094028	R^2=0.778055
[292/500]	cv_eval_time=53.91 sec	RMSE=0.

[413/500]	cv_eval_time=32.92 sec	RMSE=0.094299	R^2=0.775655
[414/500]	cv_eval_time=36.07 sec	RMSE=0.094230	R^2=0.777077
[415/500]	cv_eval_time=36.38 sec	RMSE=0.094314	R^2=0.776644
[416/500]	cv_eval_time=38.87 sec	RMSE=0.095253	R^2=0.781897
[417/500]	cv_eval_time=34.52 sec	RMSE=0.094885	R^2=0.784874
[418/500]	cv_eval_time=29.93 sec	RMSE=0.094019	R^2=0.780195
[419/500]	cv_eval_time=27.98 sec	RMSE=0.094919	R^2=0.775649
[420/500]	cv_eval_time=38.14 sec	RMSE=0.093569	R^2=0.757332
[421/500]	cv_eval_time=32.75 sec	RMSE=0.092616	R^2=0.782675
[422/500]	cv_eval_time=34.16 sec	RMSE=0.094417	R^2=0.762431
[423/500]	cv_eval_time=31.23 sec	RMSE=0.094660	R^2=0.770828
[424/500]	cv_eval_time=39.94 sec	RMSE=0.094556	R^2=0.775401
[425/500]	cv_eval_time=33.05 sec	RMSE=0.096035	R^2=0.781852
[426/500]	cv_eval_time=42.25 sec	RMSE=0.093724	R^2=0.772789
[427/500]	cv_eval_time=36.33 sec	RMSE=0.093662	R^2=0.781999
[428/500]	cv_eval_time=34.34 sec	RMSE=0.093887	R^2=0.781696
[429/500]	cv_eval_time=30.88 sec	RMSE=0.