In [1]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import linear_model

import os
import sys
# Put the directory two levels above the working directory on the import path
# so that project-local packages (e.g. `models`) can be imported.
module_path = os.path.abspath('../..')
if sys.path.count(module_path) == 0:
    # Only append once, so re-running the cell does not add duplicates
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel

  from pandas.core import datetools


In [2]:
# Load the preprocessed dataset from its pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
preprocessed_data = pd.read_pickle(
    '../data/preprocessed_science_qualitative.pd'
)

# Wrap the loaded object in a DataFrame for the steps that follow
df = pd.DataFrame(data=preprocessed_data)

In [3]:
# Split the rows into a training frame and a test frame (20% held out for test);
# the fixed random_state makes the split repeatable across runs.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=0,
)

In [4]:
# Report how many rows ended up in each split
for message, subset in (
    ('Number of observations in the training data:', train),
    ('Number of observations in the test data:', test),
):
    print(message, len(subset))

Number of observations in the training data: 79342
Number of observations in the test data: 19836


In [5]:
    # Switches for this cell: LOWESS flag passed to the run/plot calls, and
    # whether to run the hyperopt search or just fit the pipeline as configured.
    DO_LOWESS = False
    hyperopt = True

    # Ridge regression wrapped in the project's HyperoptModel helper;
    # it receives copies of the train/test frames.
    ridge = HyperoptModel(train.copy(), test.copy(), 'ridge', cv=4, max_evals=500)

    # Use every column of df except 'score' as a raw feature
    # ('score' is presumably the prediction target — confirm against HyperoptModel).
    features = list(df.columns)
    features.remove('score')
    ridge.raw_features = features

    # Single-step pipeline: a Ridge estimator with a fixed starting alpha
    ridge_params = {'alpha': 0.13701375084782796}
    ridge.pipeline = Pipeline([
        ('estimate', linear_model.Ridge(**ridge_params)),
    ])

    # Search space: alpha drawn uniformly from [0, 1000]
    ridge.space = {'estimate__alpha': hp.uniform('estimate__alpha', 0, 1000)}

    if not hyperopt:
        # No search: fit the pipeline exactly as configured above, then report
        ridge.pipeline.fit(X=ridge.X_train, y=ridge.y_train)
        ridge.model = ridge.pipeline
        ridge.stats()
        ridge.plot_feature_importance()
        ridge.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        ridge.qq_plot()
    else:
        # Hand control to the helper's optimisation run
        ridge.run(do_lowess=DO_LOWESS)

Performing parameters optimization...
[1/500]	cv_eval_time=0.94 sec	RMSE=0.123200	R^2=0.367730
[2/500]	cv_eval_time=0.75 sec	RMSE=0.123239	R^2=0.367298
[3/500]	cv_eval_time=0.94 sec	RMSE=0.122948	R^2=0.370252
[4/500]	cv_eval_time=1.16 sec	RMSE=0.123146	R^2=0.368192
[5/500]	cv_eval_time=1.26 sec	RMSE=0.122920	R^2=0.370592
[6/500]	cv_eval_time=0.84 sec	RMSE=0.122771	R^2=0.371900
[7/500]	cv_eval_time=0.76 sec	RMSE=0.122959	R^2=0.369994
[8/500]	cv_eval_time=0.89 sec	RMSE=0.122932	R^2=0.370467
[9/500]	cv_eval_time=1.10 sec	RMSE=0.122724	R^2=0.372431
[10/500]	cv_eval_time=1.01 sec	RMSE=0.123000	R^2=0.369702
[11/500]	cv_eval_time=1.39 sec	RMSE=0.122724	R^2=0.372541
[12/500]	cv_eval_time=0.97 sec	RMSE=0.122834	R^2=0.371501
[13/500]	cv_eval_time=0.85 sec	RMSE=0.123193	R^2=0.367474
[14/500]	cv_eval_time=0.89 sec	RMSE=0.123242	R^2=0.367197
[15/500]	cv_eval_time=0.84 sec	RMSE=0.123193	R^2=0.367633
[16/500]	cv_eval_time=1.22 sec	RMSE=0.122834	R^2=0.371618
[17/500]	cv_eval_time=0.90 sec	RMSE=0.12293

[142/500]	cv_eval_time=0.68 sec	RMSE=0.122768	R^2=0.372153
[143/500]	cv_eval_time=0.70 sec	RMSE=0.122983	R^2=0.369712
[144/500]	cv_eval_time=0.77 sec	RMSE=0.122670	R^2=0.372913
[145/500]	cv_eval_time=0.73 sec	RMSE=0.123025	R^2=0.369432
[146/500]	cv_eval_time=0.92 sec	RMSE=0.122658	R^2=0.373493
[147/500]	cv_eval_time=0.80 sec	RMSE=0.122723	R^2=0.372698
[148/500]	cv_eval_time=0.75 sec	RMSE=0.122734	R^2=0.372507
[149/500]	cv_eval_time=0.78 sec	RMSE=0.122632	R^2=0.373447
[150/500]	cv_eval_time=0.71 sec	RMSE=0.122659	R^2=0.373204
[151/500]	cv_eval_time=0.86 sec	RMSE=0.122694	R^2=0.372962
[152/500]	cv_eval_time=0.79 sec	RMSE=0.122699	R^2=0.372711
[153/500]	cv_eval_time=0.84 sec	RMSE=0.122749	R^2=0.372495
[154/500]	cv_eval_time=0.88 sec	RMSE=0.122641	R^2=0.373428
[155/500]	cv_eval_time=1.12 sec	RMSE=0.122721	R^2=0.372664
[156/500]	cv_eval_time=0.80 sec	RMSE=0.122751	R^2=0.372175
[157/500]	cv_eval_time=0.76 sec	RMSE=0.122780	R^2=0.371887
[158/500]	cv_eval_time=0.77 sec	RMSE=0.122622	R^2=0.3735

[281/500]	cv_eval_time=0.66 sec	RMSE=0.122752	R^2=0.372293
[282/500]	cv_eval_time=0.65 sec	RMSE=0.122770	R^2=0.372001
[283/500]	cv_eval_time=0.66 sec	RMSE=0.122712	R^2=0.372649
[284/500]	cv_eval_time=0.65 sec	RMSE=0.122670	R^2=0.373069
[285/500]	cv_eval_time=0.63 sec	RMSE=0.122767	R^2=0.372192
[286/500]	cv_eval_time=0.63 sec	RMSE=0.122716	R^2=0.372553
[287/500]	cv_eval_time=0.63 sec	RMSE=0.122647	R^2=0.373411
[288/500]	cv_eval_time=0.64 sec	RMSE=0.122747	R^2=0.372463
[289/500]	cv_eval_time=0.67 sec	RMSE=0.122706	R^2=0.372771
[290/500]	cv_eval_time=0.70 sec	RMSE=0.122785	R^2=0.371812
[291/500]	cv_eval_time=0.66 sec	RMSE=0.122780	R^2=0.371892
[292/500]	cv_eval_time=0.63 sec	RMSE=0.122683	R^2=0.372953
[293/500]	cv_eval_time=0.62 sec	RMSE=0.122757	R^2=0.372233
[294/500]	cv_eval_time=0.64 sec	RMSE=0.122729	R^2=0.372563
[295/500]	cv_eval_time=0.63 sec	RMSE=0.122664	R^2=0.373332
[296/500]	cv_eval_time=0.64 sec	RMSE=0.122767	R^2=0.371963
[297/500]	cv_eval_time=0.64 sec	RMSE=0.122646	R^2=0.3736

[420/500]	cv_eval_time=0.69 sec	RMSE=0.122751	R^2=0.372361
[421/500]	cv_eval_time=0.68 sec	RMSE=0.122704	R^2=0.372960
[422/500]	cv_eval_time=0.69 sec	RMSE=0.122709	R^2=0.372767
[423/500]	cv_eval_time=0.64 sec	RMSE=0.122652	R^2=0.373359
[424/500]	cv_eval_time=0.68 sec	RMSE=0.122742	R^2=0.372551
[425/500]	cv_eval_time=0.67 sec	RMSE=0.122684	R^2=0.373127
[426/500]	cv_eval_time=0.65 sec	RMSE=0.122692	R^2=0.372806
[427/500]	cv_eval_time=0.65 sec	RMSE=0.122716	R^2=0.372560
[428/500]	cv_eval_time=0.64 sec	RMSE=0.122750	R^2=0.372243
[429/500]	cv_eval_time=0.64 sec	RMSE=0.122703	R^2=0.372672
[430/500]	cv_eval_time=0.65 sec	RMSE=0.122754	R^2=0.372234
[431/500]	cv_eval_time=0.63 sec	RMSE=0.122723	R^2=0.372570
[432/500]	cv_eval_time=1.03 sec	RMSE=0.122660	R^2=0.373169
[433/500]	cv_eval_time=0.84 sec	RMSE=0.122741	R^2=0.372281
[434/500]	cv_eval_time=0.92 sec	RMSE=0.122680	R^2=0.373038
[435/500]	cv_eval_time=0.65 sec	RMSE=0.122632	R^2=0.373458
[436/500]	cv_eval_time=0.66 sec	RMSE=0.122639	R^2=0.3734