In [1]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import linear_model

import os
import sys
# Put the directory two levels above the current working directory on the
# import path (once only) so the project-local `models` package can be imported.
module_path = os.path.abspath('../..')
if module_path not in sys.path:
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel

  from pandas.core import datetools


In [2]:
# Load the preprocessed dataset from its pickle file (path is relative to the
# notebook's working directory).
# NOTE(review): unpickling can execute arbitrary code - only load this file
# from a trusted source.
preprocessed_data = pd.read_pickle(
    '../data/preprocessed_science_quantitative.pd'
)

# Build the working DataFrame from the loaded object
df = pd.DataFrame(data=preprocessed_data)

In [3]:
# Partition the rows into a training frame and a test frame; 20% of the rows
# go to the test set and the seed is fixed so the split is repeatable.
train, test = train_test_split(df, random_state=0, test_size=0.2)

In [4]:
# Report how many rows ended up in each split
print('Number of observations in the training data:', train.shape[0])
print('Number of observations in the test data:', test.shape[0])

Number of observations in the training data: 79342
Number of observations in the test data: 19836


In [5]:
    # --- run configuration ---
    DO_LOWESS = False  # forwarded as `do_lowess` to the run / plotting calls below
    hyperopt = True    # True: run the hyperparameter search; False: fit the fixed pipeline

    # Ridge regression wrapped in the project's HyperoptModel helper.
    # Copies of the splits are passed so the helper never works on the originals.
    ridge = HyperoptModel(train.copy(), test.copy(), 'ridge', cv=4, max_evals=500)

    # Every column except 'score' is a raw feature
    # (presumably 'score' is the regression target - TODO confirm).
    features = list(df.columns)
    features.remove('score')
    ridge.raw_features = features

    # Single-step pipeline; this alpha is what gets fitted when the search is skipped
    ridge.pipeline = Pipeline([
        ('estimate', linear_model.Ridge(alpha=0.07359702430424347)),
    ])

    # Search space: alpha sampled uniformly between 0 and 1000
    ridge.space = {
        'estimate__alpha': hp.uniform('estimate__alpha', 0, 1000),
    }

    if not hyperopt:
        # No search: fit the pipeline with the parameters set above, then report
        ridge.pipeline.fit(X=ridge.X_train, y=ridge.y_train)
        ridge.model = ridge.pipeline
        ridge.stats()
        ridge.plot_feature_importance()
        ridge.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        ridge.qq_plot()
    else:
        # Hand control to the helper's optimisation routine
        ridge.run(do_lowess=DO_LOWESS)

Performing parameters optimization...
[1/500]	cv_eval_time=1.97 sec	RMSE=0.096817	R^2=0.766226
[2/500]	cv_eval_time=1.19 sec	RMSE=0.096271	R^2=0.768789
[3/500]	cv_eval_time=1.24 sec	RMSE=0.095738	R^2=0.771342
[4/500]	cv_eval_time=1.21 sec	RMSE=0.096298	R^2=0.768694
[5/500]	cv_eval_time=1.32 sec	RMSE=0.097176	R^2=0.764429
[6/500]	cv_eval_time=1.27 sec	RMSE=0.096687	R^2=0.766819
[7/500]	cv_eval_time=1.19 sec	RMSE=0.097233	R^2=0.764188
[8/500]	cv_eval_time=1.27 sec	RMSE=0.094764	R^2=0.776010
[9/500]	cv_eval_time=1.15 sec	RMSE=0.096753	R^2=0.766465
[10/500]	cv_eval_time=1.36 sec	RMSE=0.094139	R^2=0.778946
[11/500]	cv_eval_time=1.27 sec	RMSE=0.095572	R^2=0.772156
[12/500]	cv_eval_time=1.30 sec	RMSE=0.097803	R^2=0.761398
[13/500]	cv_eval_time=1.25 sec	RMSE=0.095854	R^2=0.770811
[14/500]	cv_eval_time=1.25 sec	RMSE=0.096689	R^2=0.766806
[15/500]	cv_eval_time=1.19 sec	RMSE=0.095620	R^2=0.771977
[16/500]	cv_eval_time=1.24 sec	RMSE=0.096388	R^2=0.768287
[17/500]	cv_eval_time=1.21 sec	RMSE=0.09668

[142/500]	cv_eval_time=1.07 sec	RMSE=0.096481	R^2=0.767824
[143/500]	cv_eval_time=1.09 sec	RMSE=0.095626	R^2=0.771838
[144/500]	cv_eval_time=1.06 sec	RMSE=0.095473	R^2=0.772630
[145/500]	cv_eval_time=1.13 sec	RMSE=0.093474	R^2=0.782048
[146/500]	cv_eval_time=1.09 sec	RMSE=0.094008	R^2=0.779515
[147/500]	cv_eval_time=1.12 sec	RMSE=0.093408	R^2=0.782355
[148/500]	cv_eval_time=1.12 sec	RMSE=0.094577	R^2=0.776922
[149/500]	cv_eval_time=1.13 sec	RMSE=0.093213	R^2=0.783302
[150/500]	cv_eval_time=1.11 sec	RMSE=0.094383	R^2=0.777764
[151/500]	cv_eval_time=1.05 sec	RMSE=0.094843	R^2=0.775641
[152/500]	cv_eval_time=1.06 sec	RMSE=0.094763	R^2=0.776032
[153/500]	cv_eval_time=1.06 sec	RMSE=0.093896	R^2=0.780082
[154/500]	cv_eval_time=1.09 sec	RMSE=0.094190	R^2=0.778662
[155/500]	cv_eval_time=1.08 sec	RMSE=0.094665	R^2=0.776430
[156/500]	cv_eval_time=1.08 sec	RMSE=0.095020	R^2=0.774795
[157/500]	cv_eval_time=1.10 sec	RMSE=0.094506	R^2=0.777208
[158/500]	cv_eval_time=1.06 sec	RMSE=0.094346	R^2=0.7779

[281/500]	cv_eval_time=1.04 sec	RMSE=0.094667	R^2=0.776466
[282/500]	cv_eval_time=1.03 sec	RMSE=0.093094	R^2=0.783877
[283/500]	cv_eval_time=1.04 sec	RMSE=0.093109	R^2=0.783739
[284/500]	cv_eval_time=1.04 sec	RMSE=0.093696	R^2=0.781059
[285/500]	cv_eval_time=1.08 sec	RMSE=0.094461	R^2=0.777457
[286/500]	cv_eval_time=1.07 sec	RMSE=0.094920	R^2=0.775238
[287/500]	cv_eval_time=1.06 sec	RMSE=0.094269	R^2=0.778298
[288/500]	cv_eval_time=1.07 sec	RMSE=0.094777	R^2=0.775957
[289/500]	cv_eval_time=1.08 sec	RMSE=0.095183	R^2=0.773976
[290/500]	cv_eval_time=1.13 sec	RMSE=0.094083	R^2=0.779170
[291/500]	cv_eval_time=1.07 sec	RMSE=0.094575	R^2=0.776890
[292/500]	cv_eval_time=1.05 sec	RMSE=0.093837	R^2=0.780316
[293/500]	cv_eval_time=1.10 sec	RMSE=0.094990	R^2=0.774934
[294/500]	cv_eval_time=1.11 sec	RMSE=0.094371	R^2=0.777871
[295/500]	cv_eval_time=1.09 sec	RMSE=0.094648	R^2=0.776588
[296/500]	cv_eval_time=1.09 sec	RMSE=0.094852	R^2=0.775620
[297/500]	cv_eval_time=1.10 sec	RMSE=0.095120	R^2=0.7743

[420/500]	cv_eval_time=1.06 sec	RMSE=0.093157	R^2=0.783510
[421/500]	cv_eval_time=1.07 sec	RMSE=0.094782	R^2=0.775933
[422/500]	cv_eval_time=1.02 sec	RMSE=0.093856	R^2=0.780302
[423/500]	cv_eval_time=1.06 sec	RMSE=0.094320	R^2=0.778146
[424/500]	cv_eval_time=1.04 sec	RMSE=0.094505	R^2=0.777216
[425/500]	cv_eval_time=1.17 sec	RMSE=0.093120	R^2=0.783572
[426/500]	cv_eval_time=1.10 sec	RMSE=0.093973	R^2=0.779722
[427/500]	cv_eval_time=1.04 sec	RMSE=0.094713	R^2=0.776234
[428/500]	cv_eval_time=1.09 sec	RMSE=0.094435	R^2=0.777482
[429/500]	cv_eval_time=1.08 sec	RMSE=0.094861	R^2=0.775522
[430/500]	cv_eval_time=1.10 sec	RMSE=0.093747	R^2=0.780803
[431/500]	cv_eval_time=1.09 sec	RMSE=0.094561	R^2=0.776984
[432/500]	cv_eval_time=1.09 sec	RMSE=0.097429	R^2=0.763227
[433/500]	cv_eval_time=1.11 sec	RMSE=0.094157	R^2=0.778894
[434/500]	cv_eval_time=1.09 sec	RMSE=0.094619	R^2=0.776656
[435/500]	cv_eval_time=1.07 sec	RMSE=0.093709	R^2=0.780986
[436/500]	cv_eval_time=1.03 sec	RMSE=0.094317	R^2=0.7781