In [1]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor

import os
import sys
# Put the directory two levels above the working directory on the import
# path (presumably so the project-local `models` package imported below
# resolves -- confirm against the repo layout). Guarded so that re-running
# the cell does not append the same entry twice.
module_path = os.path.abspath('../..')
if module_path not in sys.path:
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel

  from pandas.core import datetools


In [2]:
# Load the pickled, already-preprocessed dataset from disk
preprocessed_data = pd.read_pickle('../data/preprocessed_science_hindex.pd')

# Wrap the loaded object in a DataFrame; `df` is what the following cells use
df = pd.DataFrame(data=preprocessed_data)

In [3]:
# Split the rows into a training frame and a test frame (test_size=0.2);
# the fixed random_state keeps the split identical across runs.
train, test = train_test_split(df, random_state=0, test_size=0.2)

In [4]:
# Report how many rows ended up in each of the two splits
for message, frame in (
    ('Number of observations in the training data:', train),
    ('Number of observations in the test data:', test),
):
    print(message, len(frame))

Number of observations in the training data: 79342
Number of observations in the test data: 19836


In [5]:
    # --- Run configuration ---------------------------------------------
    # Forwarded as `do_lowess` to the HyperoptModel run/plot calls below.
    DO_LOWESS = False
    # True  -> search `nn.space` via nn.run();
    # False -> fit the pipeline once with MLPRegressor's default parameters.
    hyperopt = True

    # --- Model wrapper ---------------------------------------------------
    # Run MLPRegressor with hyperopt optimization. Copies of the splits are
    # passed so the wrapper works on its own frames, not on `train`/`test`.
    nn = HyperoptModel(train.copy(), test.copy(), 'nn', cv=3, max_evals=500)

    # Use every column except 'score' as a raw feature
    # ('score' is presumably the regression target -- confirm in HyperoptModel).
    features = list(df)
    features.remove('score')
    nn.raw_features = features

    # Single-step pipeline; the step name 'estimate' is the prefix used by
    # the search-space keys below.
    nn.pipeline = Pipeline([
        ('estimate', MLPRegressor())
    ])

    # --- Search space ----------------------------------------------------
    # Keys follow the `<step>__<param>` form so they address parameters of
    # the 'estimate' step. Commented-out entries are alternatives that are
    # currently not searched.
    nn.space = {
         'estimate__alpha' : hp.uniform('estimate__alpha', 0.001, 1),
         'estimate__activation' : hp.choice('estimate__activation', ['logistic']), # 'identity', 'logistic', 'tanh', 'relu'
         #'estimate__learning_rate' : hp.choice('estimate__learning_rate', ['constant', 'invscaling', 'adaptive']),
         # uniform draw in [1, 100] cast to int -> one hidden layer of that width
         'estimate__hidden_layer_sizes' : scope.int(hp.uniform('estimate__hidden_layer_sizes', 1, 100)),
         'estimate__solver' : hp.choice('estimate__solver', ['adam']), #'lbfgs', 'sgd',
         #'estimate__max_iter' : scope.int(hp.uniform('estimate__max_iter', 500, 1000))
    }

    # --- Execute ---------------------------------------------------------
    if hyperopt:
        nn.run(do_lowess=DO_LOWESS)
    else:
        # train with default params
        nn.pipeline.fit(X=nn.X_train, y=nn.y_train)
        # Fixed: this used to read `cat.pipeline`, but no `cat` object exists
        # in this notebook, so this branch raised NameError right after the
        # fit. The fitted pipeline of *this* wrapper is what must be stored.
        nn.model = nn.pipeline
        nn.stats()
        nn.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        # NOTE(review): confirm HyperoptModel.plot_feature_importance supports
        # an MLPRegressor estimator before relying on this branch.
        nn.plot_feature_importance()
        nn.qq_plot()

Performing parameters optimization...
[1/500]	cv_eval_time=117.47 sec	RMSE=0.097627	R^2=0.700477
[2/500]	cv_eval_time=99.31 sec	RMSE=0.094912	R^2=0.713811
[3/500]	cv_eval_time=94.01 sec	RMSE=0.093884	R^2=0.719150
[4/500]	cv_eval_time=85.42 sec	RMSE=0.103916	R^2=0.643844
[5/500]	cv_eval_time=64.65 sec	RMSE=0.106204	R^2=0.653907
[6/500]	cv_eval_time=63.98 sec	RMSE=0.097539	R^2=0.705956
[7/500]	cv_eval_time=35.05 sec	RMSE=0.087612	R^2=0.759917
[8/500]	cv_eval_time=48.05 sec	RMSE=0.087624	R^2=0.757679
[9/500]	cv_eval_time=33.79 sec	RMSE=0.102651	R^2=0.662063
[10/500]	cv_eval_time=32.79 sec	RMSE=0.089890	R^2=0.745995
[11/500]	cv_eval_time=34.83 sec	RMSE=0.090412	R^2=0.742484
[12/500]	cv_eval_time=26.38 sec	RMSE=0.102225	R^2=0.676112
[13/500]	cv_eval_time=32.31 sec	RMSE=0.090656	R^2=0.742559
[14/500]	cv_eval_time=26.44 sec	RMSE=0.093809	R^2=0.730510
[15/500]	cv_eval_time=32.85 sec	RMSE=0.095905	R^2=0.711106
[16/500]	cv_eval_time=35.36 sec	RMSE=0.088468	R^2=0.756973
[17/500]	cv_eval_time=18.8

[139/500]	cv_eval_time=21.98 sec	RMSE=0.088160	R^2=0.748113
[140/500]	cv_eval_time=16.90 sec	RMSE=0.086204	R^2=0.765810
[141/500]	cv_eval_time=19.60 sec	RMSE=0.095396	R^2=0.714666
[142/500]	cv_eval_time=19.46 sec	RMSE=0.086290	R^2=0.764699
[143/500]	cv_eval_time=25.43 sec	RMSE=0.090980	R^2=0.734366
[144/500]	cv_eval_time=16.23 sec	RMSE=0.086838	R^2=0.768176
[145/500]	cv_eval_time=22.11 sec	RMSE=0.089173	R^2=0.750430
[146/500]	cv_eval_time=23.88 sec	RMSE=0.086318	R^2=0.761065
[147/500]	cv_eval_time=19.76 sec	RMSE=0.087419	R^2=0.765861
[148/500]	cv_eval_time=23.16 sec	RMSE=0.086946	R^2=0.763398
[149/500]	cv_eval_time=21.02 sec	RMSE=0.087304	R^2=0.762406
[150/500]	cv_eval_time=14.82 sec	RMSE=0.086963	R^2=0.767184
[151/500]	cv_eval_time=21.44 sec	RMSE=0.086981	R^2=0.763413
[152/500]	cv_eval_time=21.33 sec	RMSE=0.087980	R^2=0.748379
[153/500]	cv_eval_time=26.39 sec	RMSE=0.086569	R^2=0.764764
[154/500]	cv_eval_time=27.00 sec	RMSE=0.087629	R^2=0.754698
[155/500]	cv_eval_time=23.27 sec	RMSE=0.

[276/500]	cv_eval_time=15.32 sec	RMSE=0.086246	R^2=0.766564
[277/500]	cv_eval_time=22.70 sec	RMSE=0.088901	R^2=0.744210
[278/500]	cv_eval_time=19.52 sec	RMSE=0.087463	R^2=0.761821
[279/500]	cv_eval_time=15.00 sec	RMSE=0.088497	R^2=0.756001
[280/500]	cv_eval_time=15.91 sec	RMSE=0.086883	R^2=0.761164
[281/500]	cv_eval_time=20.55 sec	RMSE=0.086809	R^2=0.766770
[282/500]	cv_eval_time=22.43 sec	RMSE=0.090112	R^2=0.746541
[283/500]	cv_eval_time=20.58 sec	RMSE=0.088225	R^2=0.759632
[284/500]	cv_eval_time=22.47 sec	RMSE=0.087123	R^2=0.762424
[285/500]	cv_eval_time=20.88 sec	RMSE=0.088283	R^2=0.757838
[286/500]	cv_eval_time=19.46 sec	RMSE=0.086695	R^2=0.767167
[287/500]	cv_eval_time=12.98 sec	RMSE=0.086658	R^2=0.759343
[288/500]	cv_eval_time=18.67 sec	RMSE=0.090600	R^2=0.749369
[289/500]	cv_eval_time=16.83 sec	RMSE=0.096563	R^2=0.707826
[290/500]	cv_eval_time=12.94 sec	RMSE=0.086346	R^2=0.766094
[291/500]	cv_eval_time=18.21 sec	RMSE=0.086913	R^2=0.762192
[292/500]	cv_eval_time=22.20 sec	RMSE=0.

[413/500]	cv_eval_time=16.57 sec	RMSE=0.087454	R^2=0.758822
[414/500]	cv_eval_time=12.33 sec	RMSE=0.086676	R^2=0.766126
[415/500]	cv_eval_time=20.25 sec	RMSE=0.087227	R^2=0.759029
[416/500]	cv_eval_time=16.98 sec	RMSE=0.087338	R^2=0.766015
[417/500]	cv_eval_time=21.97 sec	RMSE=0.088852	R^2=0.754018
[418/500]	cv_eval_time=14.29 sec	RMSE=0.086551	R^2=0.764542
[419/500]	cv_eval_time=16.85 sec	RMSE=0.086661	R^2=0.765053
[420/500]	cv_eval_time=16.33 sec	RMSE=0.087456	R^2=0.760102
[421/500]	cv_eval_time=13.71 sec	RMSE=0.086616	R^2=0.763483
[422/500]	cv_eval_time=14.56 sec	RMSE=0.086284	R^2=0.766420
[423/500]	cv_eval_time=18.24 sec	RMSE=0.088174	R^2=0.755683
[424/500]	cv_eval_time=18.48 sec	RMSE=0.086996	R^2=0.764460
[425/500]	cv_eval_time=17.43 sec	RMSE=0.087534	R^2=0.760082
[426/500]	cv_eval_time=17.75 sec	RMSE=0.086504	R^2=0.766595
[427/500]	cv_eval_time=15.68 sec	RMSE=0.088097	R^2=0.752076
[428/500]	cv_eval_time=16.06 sec	RMSE=0.087059	R^2=0.761127
[429/500]	cv_eval_time=20.78 sec	RMSE=0.