In [1]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor

import os
import sys
# Resolve the directory two levels above the current working directory and
# register it on the import path exactly once -- presumably so the
# project-local `models` package imported below resolves (TODO confirm layout).
module_path = os.path.abspath('../..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel

  from pandas.core import datetools


In [2]:
# Load the pickled dataset; the path is relative to the notebook's working
# directory.
preprocessed_data = pd.read_pickle(
    '../data/preprocessed_science_hindex.pd'
)

# Wrap the loaded object in a DataFrame for the cells that follow
df = pd.DataFrame(data=preprocessed_data)

In [3]:
# Hold out 20% of the rows as the test frame and keep the rest for training;
# the fixed random_state makes the split repeatable between runs.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=0,
)

In [4]:
# Report how many rows ended up in each split
print('Number of observations in the training data:', train.shape[0])
print('Number of observations in the test data:', test.shape[0])

Number of observations in the training data: 79342
Number of observations in the test data: 19836


In [5]:
    # Run-mode switches for this cell
    DO_LOWESS = False   # forwarded as `do_lowess` to the run / plot calls below
    hyperopt = True     # True: hyperparameter search; False: one fit with default params

    # Run MLPRegressor with hyperopt optimization.
    # HyperoptModel is a project-local helper; it receives copies of the
    # train/test frames. `cv=3` / `max_evals=500` presumably mean 3-fold CV and
    # 500 search evaluations -- confirm in models/hyperopt_model.py.
    nn = HyperoptModel(train.copy(), test.copy(),'nn', cv=3, max_evals=500)

    # Every column except 'score' (presumably the target) is a raw feature
    features = list(df)
    features.remove('score')
    nn.raw_features = features

    nn.pipeline = Pipeline([
        ('estimate', MLPRegressor())
    ])

    # Search space. Keys follow scikit-learn's `<step>__<param>` convention so
    # they address parameters of the 'estimate' pipeline step.
    nn.space = {
         'estimate__alpha' : hp.uniform('estimate__alpha', 0.001, 1),
         'estimate__activation' : hp.choice('estimate__activation', ['logistic']), # 'identity', 'logistic', 'tanh', 'relu'
         #'estimate__learning_rate' : hp.choice('estimate__learning_rate', ['constant', 'invscaling', 'adaptive']),
         # scope.int casts the sampled float to an int (single hidden layer width)
         'estimate__hidden_layer_sizes' : scope.int(hp.uniform('estimate__hidden_layer_sizes', 1, 100)),
         'estimate__solver' : hp.choice('estimate__solver', ['adam']), #'lbfgs', 'sgd',
         #'estimate__max_iter' : scope.int(hp.uniform('estimate__max_iter', 500, 1000))
    }

    if hyperopt:
        nn.run(do_lowess=DO_LOWESS)
    else:
        # train with default params
        nn.pipeline.fit(X=nn.X_train, y=nn.y_train)
        # Fixed: this previously read `cat.pipeline`, but no `cat` object exists
        # in this notebook, so the branch raised NameError right after fitting.
        nn.model = nn.pipeline
        nn.stats()
        nn.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        # NOTE(review): MLPRegressor exposes no feature_importances_; confirm
        # that HyperoptModel.plot_feature_importance handles this estimator.
        nn.plot_feature_importance()
        nn.qq_plot()

Performing parameters optimization...
[1/500]	cv_eval_time=105.14 sec	RMSE=0.087821	R^2=0.756712
[2/500]	cv_eval_time=78.28 sec	RMSE=0.087540	R^2=0.759851
[3/500]	cv_eval_time=49.68 sec	RMSE=0.089695	R^2=0.760978
[4/500]	cv_eval_time=39.34 sec	RMSE=0.088019	R^2=0.765298
[5/500]	cv_eval_time=55.19 sec	RMSE=0.086928	R^2=0.761747
[6/500]	cv_eval_time=48.84 sec	RMSE=0.091620	R^2=0.750121
[7/500]	cv_eval_time=69.24 sec	RMSE=0.087862	R^2=0.756222
[8/500]	cv_eval_time=80.08 sec	RMSE=0.090673	R^2=0.744438
[9/500]	cv_eval_time=71.83 sec	RMSE=0.086826	R^2=0.768423
[10/500]	cv_eval_time=54.45 sec	RMSE=0.087869	R^2=0.773270
[11/500]	cv_eval_time=45.06 sec	RMSE=0.088593	R^2=0.754007
[12/500]	cv_eval_time=61.78 sec	RMSE=0.089561	R^2=0.732688
[13/500]	cv_eval_time=58.21 sec	RMSE=0.086584	R^2=0.752707
[14/500]	cv_eval_time=73.03 sec	RMSE=0.092945	R^2=0.762931
[15/500]	cv_eval_time=79.43 sec	RMSE=0.089589	R^2=0.758212
[16/500]	cv_eval_time=47.73 sec	RMSE=0.087323	R^2=0.746046
[17/500]	cv_eval_time=61.2

[139/500]	cv_eval_time=52.13 sec	RMSE=0.089029	R^2=0.729234
[140/500]	cv_eval_time=53.28 sec	RMSE=0.086024	R^2=0.756474
[141/500]	cv_eval_time=66.31 sec	RMSE=0.088161	R^2=0.770521
[142/500]	cv_eval_time=44.40 sec	RMSE=0.086475	R^2=0.766576
[143/500]	cv_eval_time=59.65 sec	RMSE=0.088677	R^2=0.768167
[144/500]	cv_eval_time=59.92 sec	RMSE=0.087260	R^2=0.772012
[145/500]	cv_eval_time=59.57 sec	RMSE=0.085414	R^2=0.761871
[146/500]	cv_eval_time=56.92 sec	RMSE=0.085699	R^2=0.761545
[147/500]	cv_eval_time=54.46 sec	RMSE=0.086833	R^2=0.754041
[148/500]	cv_eval_time=50.17 sec	RMSE=0.085314	R^2=0.768456
[149/500]	cv_eval_time=40.86 sec	RMSE=0.086743	R^2=0.756731
[150/500]	cv_eval_time=57.87 sec	RMSE=0.088514	R^2=0.769321
[151/500]	cv_eval_time=65.13 sec	RMSE=0.089353	R^2=0.766398
[152/500]	cv_eval_time=60.22 sec	RMSE=0.087067	R^2=0.758728
[153/500]	cv_eval_time=54.76 sec	RMSE=0.085690	R^2=0.761101
[154/500]	cv_eval_time=66.56 sec	RMSE=0.086673	R^2=0.767673
[155/500]	cv_eval_time=58.58 sec	RMSE=0.

[276/500]	cv_eval_time=43.46 sec	RMSE=0.089128	R^2=0.764953
[277/500]	cv_eval_time=44.03 sec	RMSE=0.086032	R^2=0.763787
[278/500]	cv_eval_time=43.16 sec	RMSE=0.086501	R^2=0.762604
[279/500]	cv_eval_time=45.91 sec	RMSE=0.089519	R^2=0.765598
[280/500]	cv_eval_time=33.04 sec	RMSE=0.087619	R^2=0.758889
[281/500]	cv_eval_time=43.38 sec	RMSE=0.086723	R^2=0.756549
[282/500]	cv_eval_time=37.13 sec	RMSE=0.086057	R^2=0.754113
[283/500]	cv_eval_time=46.08 sec	RMSE=0.087101	R^2=0.772184
[284/500]	cv_eval_time=46.77 sec	RMSE=0.090770	R^2=0.754004
[285/500]	cv_eval_time=35.04 sec	RMSE=0.090084	R^2=0.722056
[286/500]	cv_eval_time=51.26 sec	RMSE=0.085329	R^2=0.764430
[287/500]	cv_eval_time=39.18 sec	RMSE=0.087145	R^2=0.763972
[288/500]	cv_eval_time=41.50 sec	RMSE=0.085263	R^2=0.766095
[289/500]	cv_eval_time=36.29 sec	RMSE=0.087968	R^2=0.764955
[290/500]	cv_eval_time=43.81 sec	RMSE=0.089660	R^2=0.762413
[291/500]	cv_eval_time=44.79 sec	RMSE=0.086490	R^2=0.768256
[292/500]	cv_eval_time=45.12 sec	RMSE=0.

[413/500]	cv_eval_time=28.74 sec	RMSE=0.089348	R^2=0.768696
[414/500]	cv_eval_time=39.46 sec	RMSE=0.085685	R^2=0.761162
[415/500]	cv_eval_time=29.66 sec	RMSE=0.085727	R^2=0.767387
[416/500]	cv_eval_time=32.90 sec	RMSE=0.089999	R^2=0.772181
[417/500]	cv_eval_time=35.51 sec	RMSE=0.086437	R^2=0.758305
[418/500]	cv_eval_time=30.08 sec	RMSE=0.086227	R^2=0.767484
[419/500]	cv_eval_time=36.74 sec	RMSE=0.087028	R^2=0.766871
[420/500]	cv_eval_time=31.16 sec	RMSE=0.087205	R^2=0.751780
[421/500]	cv_eval_time=37.17 sec	RMSE=0.087208	R^2=0.769594
[422/500]	cv_eval_time=29.41 sec	RMSE=0.086590	R^2=0.761367
[423/500]	cv_eval_time=27.93 sec	RMSE=0.086570	R^2=0.763228
[424/500]	cv_eval_time=27.93 sec	RMSE=0.085370	R^2=0.769078
[425/500]	cv_eval_time=27.66 sec	RMSE=0.087068	R^2=0.747677
[426/500]	cv_eval_time=34.73 sec	RMSE=0.085778	R^2=0.764051
[427/500]	cv_eval_time=27.77 sec	RMSE=0.085247	R^2=0.748414
[428/500]	cv_eval_time=37.60 sec	RMSE=0.086896	R^2=0.770314
[429/500]	cv_eval_time=31.95 sec	RMSE=0.