In [1]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import linear_model

import os
import sys
# Put the directory two levels above the working directory on the import path
# (presumably where the local `models` package lives -- confirm against the repo layout).
module_path = os.path.abspath('../..')
if not sys.path.count(module_path):
    # Only append once so re-running the cell does not grow sys.path
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel

  from pandas.core import datetools


In [2]:
# Load the preprocessed dataset from its pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load pickles from a trusted source.
preprocessed_data = pd.read_pickle('../data/preprocessed_science_hindex.pd')

# Wrap the loaded object in a DataFrame and report its (rows, columns) shape
df = pd.DataFrame(data=preprocessed_data)
print(df.shape)

(99178, 36)


In [3]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the split repeatable
train, test = train_test_split(df, random_state=0, test_size=0.2)

In [4]:
# Report how many rows ended up in each side of the split
print('Number of observations in the training data:', train.shape[0])
print('Number of observations in the test data:', test.shape[0])

Number of observations in the training data: 79342
Number of observations in the test data: 19836


In [5]:
    # Run switches: DO_LOWESS is forwarded to the run/plot calls below,
    # `hyperopt` selects between the parameter search and a single plain fit.
    DO_LOWESS = False
    hyperopt = True

    # Ridge regression wrapped in the project's HyperoptModel helper;
    # it receives copies so the original train/test frames are not handed over directly.
    ridge = HyperoptModel(train.copy(), test.copy(), 'ridge', cv=4, max_evals=500)

    # Every column of df except the 'score' column is registered as a raw feature
    features = list(df.columns)
    features.remove('score')
    ridge.raw_features = features

    # Single-step pipeline; this alpha is what gets fitted when the search is skipped
    ridge.pipeline = Pipeline(steps=[
        ('estimate', linear_model.Ridge(alpha=0.05231780585024858)),
    ])

    # Search space: alpha drawn uniformly from [0, 1000]
    ridge.space = {'estimate__alpha': hp.uniform('estimate__alpha', 0, 1000)}

    if not hyperopt:
        # No search: fit the pipeline as configured above, then report and plot
        ridge.pipeline.fit(X=ridge.X_train, y=ridge.y_train)
        ridge.model = ridge.pipeline
        ridge.stats()
        ridge.plot_feature_importance()
        ridge.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        ridge.qq_plot()
    else:
        # Hyperparameter optimisation over ridge.space via the helper's run()
        ridge.run(do_lowess=DO_LOWESS)

Performing parameters optimization...
[1/500]	cv_eval_time=1.51 sec	RMSE=0.093375	R^2=0.726796
[2/500]	cv_eval_time=1.50 sec	RMSE=0.094448	R^2=0.720619
[3/500]	cv_eval_time=1.66 sec	RMSE=0.095125	R^2=0.716551
[4/500]	cv_eval_time=1.40 sec	RMSE=0.095854	R^2=0.711945
[5/500]	cv_eval_time=1.59 sec	RMSE=0.095634	R^2=0.713318
[6/500]	cv_eval_time=1.13 sec	RMSE=0.094918	R^2=0.717781
[7/500]	cv_eval_time=1.26 sec	RMSE=0.094307	R^2=0.721192
[8/500]	cv_eval_time=1.47 sec	RMSE=0.095848	R^2=0.712459
[9/500]	cv_eval_time=1.32 sec	RMSE=0.095458	R^2=0.714529
[10/500]	cv_eval_time=1.24 sec	RMSE=0.094721	R^2=0.718888
[11/500]	cv_eval_time=1.34 sec	RMSE=0.095322	R^2=0.715478
[12/500]	cv_eval_time=1.33 sec	RMSE=0.094432	R^2=0.720722
[13/500]	cv_eval_time=1.27 sec	RMSE=0.095897	R^2=0.712052
[14/500]	cv_eval_time=1.43 sec	RMSE=0.095236	R^2=0.715799
[15/500]	cv_eval_time=1.22 sec	RMSE=0.095023	R^2=0.717004
[16/500]	cv_eval_time=1.25 sec	RMSE=0.095767	R^2=0.712722
[17/500]	cv_eval_time=1.71 sec	RMSE=0.09359

[142/500]	cv_eval_time=1.62 sec	RMSE=0.092595	R^2=0.731345
[143/500]	cv_eval_time=1.68 sec	RMSE=0.094753	R^2=0.718783
[144/500]	cv_eval_time=1.57 sec	RMSE=0.094968	R^2=0.717382
[145/500]	cv_eval_time=1.43 sec	RMSE=0.095560	R^2=0.714006
[146/500]	cv_eval_time=1.62 sec	RMSE=0.092816	R^2=0.730215
[147/500]	cv_eval_time=1.73 sec	RMSE=0.093045	R^2=0.728845
[148/500]	cv_eval_time=1.74 sec	RMSE=0.093517	R^2=0.725917
[149/500]	cv_eval_time=1.48 sec	RMSE=0.093727	R^2=0.724934
[150/500]	cv_eval_time=1.41 sec	RMSE=0.092603	R^2=0.731313
[151/500]	cv_eval_time=1.04 sec	RMSE=0.093230	R^2=0.727623
[152/500]	cv_eval_time=1.14 sec	RMSE=0.093364	R^2=0.726701
[153/500]	cv_eval_time=1.36 sec	RMSE=0.092915	R^2=0.729511
[154/500]	cv_eval_time=1.15 sec	RMSE=0.093983	R^2=0.723128
[155/500]	cv_eval_time=1.17 sec	RMSE=0.094118	R^2=0.722615
[156/500]	cv_eval_time=1.47 sec	RMSE=0.093529	R^2=0.725825
[157/500]	cv_eval_time=1.64 sec	RMSE=0.092586	R^2=0.731379
[158/500]	cv_eval_time=1.16 sec	RMSE=0.092551	R^2=0.7316

[281/500]	cv_eval_time=1.65 sec	RMSE=0.093185	R^2=0.727856
[282/500]	cv_eval_time=1.83 sec	RMSE=0.095373	R^2=0.715203
[283/500]	cv_eval_time=1.71 sec	RMSE=0.092790	R^2=0.730305
[284/500]	cv_eval_time=1.70 sec	RMSE=0.092992	R^2=0.728993
[285/500]	cv_eval_time=1.71 sec	RMSE=0.093795	R^2=0.724423
[286/500]	cv_eval_time=1.71 sec	RMSE=0.093968	R^2=0.723402
[287/500]	cv_eval_time=1.73 sec	RMSE=0.094139	R^2=0.722497
[288/500]	cv_eval_time=1.60 sec	RMSE=0.093496	R^2=0.726112
[289/500]	cv_eval_time=1.71 sec	RMSE=0.092591	R^2=0.731527
[290/500]	cv_eval_time=1.83 sec	RMSE=0.094265	R^2=0.721525
[291/500]	cv_eval_time=1.80 sec	RMSE=0.093318	R^2=0.727191
[292/500]	cv_eval_time=1.74 sec	RMSE=0.093663	R^2=0.725154
[293/500]	cv_eval_time=1.69 sec	RMSE=0.092547	R^2=0.731479
[294/500]	cv_eval_time=1.61 sec	RMSE=0.092951	R^2=0.729208
[295/500]	cv_eval_time=1.57 sec	RMSE=0.092739	R^2=0.730530
[296/500]	cv_eval_time=1.79 sec	RMSE=0.095744	R^2=0.712924
[297/500]	cv_eval_time=1.59 sec	RMSE=0.093148	R^2=0.7282

[420/500]	cv_eval_time=1.64 sec	RMSE=0.093529	R^2=0.725831
[421/500]	cv_eval_time=1.53 sec	RMSE=0.093198	R^2=0.727725
[422/500]	cv_eval_time=1.72 sec	RMSE=0.092919	R^2=0.729705
[423/500]	cv_eval_time=1.84 sec	RMSE=0.092705	R^2=0.730674
[424/500]	cv_eval_time=1.79 sec	RMSE=0.093784	R^2=0.724252
[425/500]	cv_eval_time=2.01 sec	RMSE=0.093379	R^2=0.726763
[426/500]	cv_eval_time=2.08 sec	RMSE=0.093623	R^2=0.725258
[427/500]	cv_eval_time=2.07 sec	RMSE=0.093068	R^2=0.728670
[428/500]	cv_eval_time=2.22 sec	RMSE=0.094007	R^2=0.723373
[429/500]	cv_eval_time=2.36 sec	RMSE=0.092730	R^2=0.730633
[430/500]	cv_eval_time=1.91 sec	RMSE=0.092564	R^2=0.731667
[431/500]	cv_eval_time=1.97 sec	RMSE=0.093493	R^2=0.726036
[432/500]	cv_eval_time=1.85 sec	RMSE=0.093149	R^2=0.728278
[433/500]	cv_eval_time=2.12 sec	RMSE=0.093335	R^2=0.727064
[434/500]	cv_eval_time=2.17 sec	RMSE=0.092923	R^2=0.729448
[435/500]	cv_eval_time=2.12 sec	RMSE=0.093848	R^2=0.724044
[436/500]	cv_eval_time=1.94 sec	RMSE=0.093687	R^2=0.7250