In [1]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import linear_model

import os
import sys
# Put the directory two levels above the current working directory on the
# import path (once only), so the `models` package imported below can be
# found -- presumably that is where it lives; verify against the repo layout.
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel

  from pandas.core import datetools


In [2]:
# Load the preprocessed dataset from its pickle file (path is relative to the
# directory the notebook is run from).
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
preprocessed_data = pd.read_pickle(
    '../data/preprocessed_science_quantitative.pd'
)

# Wrap the loaded object in a DataFrame; every later cell works on `df`.
df = pd.DataFrame(data=preprocessed_data)

In [3]:
# Split the rows of `df` into a training frame and a test frame, holding out
# 20% for testing. The fixed random_state makes the split repeatable.
train, test = train_test_split(
    df,
    random_state=0,
    test_size=0.2,
)

In [4]:
# Report how many rows ended up in each side of the split.
n_train_rows, n_test_rows = len(train), len(test)
print('Number of observations in the training data:', n_train_rows)
print('Number of observations in the test data:', n_test_rows)

Number of observations in the training data: 79342
Number of observations in the test data: 19836


In [5]:
    # --- Run switches -------------------------------------------------------
    # DO_LOWESS is forwarded as `do_lowess` to run() / plot_predicted_vs_actual().
    DO_LOWESS = False
    # True  -> search the hyperparameter space via ridge.run()
    # False -> fit the pipeline once with the alpha hard-coded below
    hyperopt = True

    # --- Feature list -------------------------------------------------------
    # Every column of df except 'score' (presumably the regression target --
    # TODO confirm against HyperoptModel). remove() raises ValueError if the
    # column is missing, which surfaces a schema problem early.
    features = list(df)
    features.remove('score')

    # --- Model setup --------------------------------------------------------
    # Ridge regression wrapped in the project's HyperoptModel helper. The model
    # receives copies of the train/test frames rather than the originals.
    # NOTE(review): cv=4 / max_evals=500 presumably mean 4-fold cross-validation
    # and 500 search iterations -- confirm in models/hyperopt_model.py.
    ridge = HyperoptModel(train.copy(), test.copy(), 'ridge', cv=4, max_evals=500)
    ridge.raw_features = features

    # Single-step pipeline; the step name 'estimate' is what the
    # 'estimate__alpha' key in the search space refers to.
    ridge.pipeline = Pipeline(steps=[
        ('estimate', linear_model.Ridge(alpha=0.07359702430424347)),
    ])

    # Search space: alpha sampled uniformly between 0 and 1000.
    ridge.space = {'estimate__alpha': hp.uniform('estimate__alpha', 0, 1000)}

    # --- Execute ------------------------------------------------------------
    if not hyperopt:
        # No search: fit the pipeline as configured above, then produce the
        # statistics and diagnostic plots by hand.
        ridge.pipeline.fit(X=ridge.X_train, y=ridge.y_train)
        ridge.model = ridge.pipeline
        ridge.stats()
        ridge.plot_feature_importance()
        ridge.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        ridge.qq_plot()
    else:
        ridge.run(do_lowess=DO_LOWESS)

Performing parameters optimization...
[1/500]	cv_eval_time=1.16 sec	RMSE=0.105461	R^2=0.722569
[2/500]	cv_eval_time=1.14 sec	RMSE=0.105748	R^2=0.721048
[3/500]	cv_eval_time=1.26 sec	RMSE=0.105481	R^2=0.722401
[4/500]	cv_eval_time=1.28 sec	RMSE=0.105185	R^2=0.724114
[5/500]	cv_eval_time=1.31 sec	RMSE=0.106520	R^2=0.716944
[6/500]	cv_eval_time=1.45 sec	RMSE=0.105587	R^2=0.721736
[7/500]	cv_eval_time=1.29 sec	RMSE=0.106061	R^2=0.719455
[8/500]	cv_eval_time=1.32 sec	RMSE=0.104979	R^2=0.725097
[9/500]	cv_eval_time=1.45 sec	RMSE=0.106584	R^2=0.716730
[10/500]	cv_eval_time=1.34 sec	RMSE=0.105609	R^2=0.721823
[11/500]	cv_eval_time=1.20 sec	RMSE=0.106528	R^2=0.716947
[12/500]	cv_eval_time=1.14 sec	RMSE=0.105220	R^2=0.723827
[13/500]	cv_eval_time=1.13 sec	RMSE=0.105679	R^2=0.721472
[14/500]	cv_eval_time=1.15 sec	RMSE=0.106258	R^2=0.718381
[15/500]	cv_eval_time=1.15 sec	RMSE=0.105522	R^2=0.722251
[16/500]	cv_eval_time=1.26 sec	RMSE=0.104367	R^2=0.728228
[17/500]	cv_eval_time=1.78 sec	RMSE=0.10542

[142/500]	cv_eval_time=2.44 sec	RMSE=0.104587	R^2=0.727092
[143/500]	cv_eval_time=2.23 sec	RMSE=0.105441	R^2=0.722569
[144/500]	cv_eval_time=2.53 sec	RMSE=0.105271	R^2=0.723620
[145/500]	cv_eval_time=2.37 sec	RMSE=0.106113	R^2=0.719079
[146/500]	cv_eval_time=2.71 sec	RMSE=0.104412	R^2=0.728162
[147/500]	cv_eval_time=2.47 sec	RMSE=0.104365	R^2=0.728351
[148/500]	cv_eval_time=1.88 sec	RMSE=0.104687	R^2=0.726638
[149/500]	cv_eval_time=1.75 sec	RMSE=0.104354	R^2=0.728324
[150/500]	cv_eval_time=1.63 sec	RMSE=0.104490	R^2=0.727624
[151/500]	cv_eval_time=1.79 sec	RMSE=0.105160	R^2=0.724182
[152/500]	cv_eval_time=1.74 sec	RMSE=0.104795	R^2=0.726050
[153/500]	cv_eval_time=1.79 sec	RMSE=0.104916	R^2=0.725357
[154/500]	cv_eval_time=1.70 sec	RMSE=0.105042	R^2=0.724826
[155/500]	cv_eval_time=1.58 sec	RMSE=0.104569	R^2=0.727333
[156/500]	cv_eval_time=1.54 sec	RMSE=0.104734	R^2=0.726312
[157/500]	cv_eval_time=1.59 sec	RMSE=0.104432	R^2=0.727820
[158/500]	cv_eval_time=1.58 sec	RMSE=0.104653	R^2=0.7268

[281/500]	cv_eval_time=1.48 sec	RMSE=0.104590	R^2=0.727250
[282/500]	cv_eval_time=1.71 sec	RMSE=0.104374	R^2=0.728263
[283/500]	cv_eval_time=1.52 sec	RMSE=0.106376	R^2=0.717825
[284/500]	cv_eval_time=1.55 sec	RMSE=0.104770	R^2=0.726234
[285/500]	cv_eval_time=1.94 sec	RMSE=0.104362	R^2=0.728273
[286/500]	cv_eval_time=1.89 sec	RMSE=0.104506	R^2=0.727645
[287/500]	cv_eval_time=1.60 sec	RMSE=0.106277	R^2=0.718266
[288/500]	cv_eval_time=1.40 sec	RMSE=0.106224	R^2=0.718661
[289/500]	cv_eval_time=1.42 sec	RMSE=0.104369	R^2=0.728289
[290/500]	cv_eval_time=1.46 sec	RMSE=0.104882	R^2=0.725658
[291/500]	cv_eval_time=1.40 sec	RMSE=0.105366	R^2=0.723164
[292/500]	cv_eval_time=1.32 sec	RMSE=0.105069	R^2=0.724537
[293/500]	cv_eval_time=1.41 sec	RMSE=0.105126	R^2=0.724317
[294/500]	cv_eval_time=1.45 sec	RMSE=0.104642	R^2=0.726847
[295/500]	cv_eval_time=1.42 sec	RMSE=0.105015	R^2=0.724901
[296/500]	cv_eval_time=1.47 sec	RMSE=0.104732	R^2=0.726396
[297/500]	cv_eval_time=1.35 sec	RMSE=0.106473	R^2=0.7172

[420/500]	cv_eval_time=1.51 sec	RMSE=0.104364	R^2=0.728240
[421/500]	cv_eval_time=1.40 sec	RMSE=0.104650	R^2=0.726894
[422/500]	cv_eval_time=1.32 sec	RMSE=0.104957	R^2=0.725181
[423/500]	cv_eval_time=1.29 sec	RMSE=0.105065	R^2=0.724629
[424/500]	cv_eval_time=1.27 sec	RMSE=0.104368	R^2=0.728322
[425/500]	cv_eval_time=1.30 sec	RMSE=0.104743	R^2=0.726088
[426/500]	cv_eval_time=1.26 sec	RMSE=0.104526	R^2=0.727497
[427/500]	cv_eval_time=1.22 sec	RMSE=0.104686	R^2=0.726574
[428/500]	cv_eval_time=1.21 sec	RMSE=0.104358	R^2=0.728331
[429/500]	cv_eval_time=1.21 sec	RMSE=0.104338	R^2=0.728529
[430/500]	cv_eval_time=1.19 sec	RMSE=0.104840	R^2=0.725844
[431/500]	cv_eval_time=1.20 sec	RMSE=0.104484	R^2=0.727844
[432/500]	cv_eval_time=1.20 sec	RMSE=0.104968	R^2=0.725206
[433/500]	cv_eval_time=1.38 sec	RMSE=0.106223	R^2=0.718776
[434/500]	cv_eval_time=1.68 sec	RMSE=0.104613	R^2=0.727095
[435/500]	cv_eval_time=1.35 sec	RMSE=0.104890	R^2=0.725583
[436/500]	cv_eval_time=1.39 sec	RMSE=0.104796	R^2=0.7260