In [1]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import linear_model

import os
import sys
# Make project-local packages importable from this notebook.
# '../..' is resolved by abspath against the kernel's current working
# directory, so this depends on where the notebook is launched from.
# NOTE(review): presumably this is what lets the `models` package imported
# below be found - confirm against the repository layout.
module_path = os.path.abspath(os.path.join('../..'))
# Guard against appending the same entry again when the cell is re-run
if module_path not in sys.path:
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel

  from pandas.core import datetools


In [2]:
# Load the preprocessed dataset from its pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
preprocessed_data = pd.read_pickle('../data/preprocessed_science_hindex.pd')

# Wrap the loaded object in a DataFrame and report its (rows, columns) shape
df = pd.DataFrame(data=preprocessed_data)
print(df.shape)

(99178, 36)


In [3]:
# Split the rows into a training frame and a test frame.
# test_size=0.2 holds out 20% of the rows; random_state=0 keeps the split repeatable.
train, test = train_test_split(
    df,
    random_state=0,
    test_size=0.2,
)

The history saving thread hit an unexpected error (OperationalError('database is locked',)).History will not be written to the database.


In [4]:
# Report how many rows ended up in each of the two splits
train_rows, test_rows = len(train), len(test)
print('Number of observations in the training data:', train_rows)
print('Number of observations in the test data:', test_rows)

Number of observations in the training data: 79342
Number of observations in the test data: 19836


In [5]:
    # --- Ridge regression, optionally tuned through HyperoptModel ---
    # Switches for this cell:
    DO_LOWESS = False   # forwarded as do_lowess= to run() / plot_predicted_vs_actual()
    hyperopt = True     # True: run the parameter search; False: fit the pipeline as configured

    # Every column of df except 'score' is used as a raw feature
    # (presumably 'score' is the prediction target - confirm in HyperoptModel).
    features = df.columns.tolist()
    features.remove('score')

    # Model wrapper working on copies of the split frames: 4 CV folds, 500 evaluations
    ridge = HyperoptModel(train.copy(), test.copy(), 'ridge', cv=4, max_evals=500)
    ridge.raw_features = features

    # Single-step pipeline; this alpha is the value used when no search is run
    ridge_estimator = linear_model.Ridge(alpha=0.05231780585024858)
    ridge.pipeline = Pipeline([('estimate', ridge_estimator)])

    # Search space: one uniform prior over alpha on [0, 1000], keyed by the
    # pipeline parameter name '<step>__<param>'
    alpha_prior = hp.uniform('estimate__alpha', 0, 1000)
    ridge.space = {'estimate__alpha': alpha_prior}

    if not hyperopt:
        # No search: fit the pipeline with the alpha configured above, then report
        ridge.pipeline.fit(X=ridge.X_train, y=ridge.y_train)
        ridge.model = ridge.pipeline
        ridge.stats()
        ridge.plot_feature_importance()
        ridge.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        ridge.qq_plot()
    else:
        # Run the hyperopt-driven parameter optimization
        ridge.run(do_lowess=DO_LOWESS)

Performing parameters optimization...
[1/500]	cv_eval_time=1.10 sec	RMSE=0.088507	R^2=0.754586
[2/500]	cv_eval_time=1.17 sec	RMSE=0.086429	R^2=0.765933
[3/500]	cv_eval_time=1.19 sec	RMSE=0.087877	R^2=0.758038
[4/500]	cv_eval_time=1.12 sec	RMSE=0.087365	R^2=0.760910
[5/500]	cv_eval_time=1.17 sec	RMSE=0.085883	R^2=0.768877
[6/500]	cv_eval_time=1.09 sec	RMSE=0.089204	R^2=0.750738
[7/500]	cv_eval_time=1.25 sec	RMSE=0.089088	R^2=0.751386
[8/500]	cv_eval_time=1.10 sec	RMSE=0.086730	R^2=0.764361
[9/500]	cv_eval_time=1.25 sec	RMSE=0.088269	R^2=0.755919
[10/500]	cv_eval_time=1.09 sec	RMSE=0.086269	R^2=0.766801
[11/500]	cv_eval_time=1.25 sec	RMSE=0.087326	R^2=0.761138
[12/500]	cv_eval_time=1.12 sec	RMSE=0.088400	R^2=0.755145
[13/500]	cv_eval_time=1.18 sec	RMSE=0.087108	R^2=0.762267
[14/500]	cv_eval_time=1.11 sec	RMSE=0.088596	R^2=0.754070
[15/500]	cv_eval_time=1.13 sec	RMSE=0.087650	R^2=0.759316
[16/500]	cv_eval_time=1.17 sec	RMSE=0.088230	R^2=0.756119
[17/500]	cv_eval_time=1.10 sec	RMSE=0.08751

[142/500]	cv_eval_time=1.01 sec	RMSE=0.085883	R^2=0.768969
[143/500]	cv_eval_time=0.99 sec	RMSE=0.088198	R^2=0.756306
[144/500]	cv_eval_time=1.00 sec	RMSE=0.087552	R^2=0.759846
[145/500]	cv_eval_time=0.97 sec	RMSE=0.088771	R^2=0.753087
[146/500]	cv_eval_time=0.97 sec	RMSE=0.085780	R^2=0.769453
[147/500]	cv_eval_time=1.00 sec	RMSE=0.085623	R^2=0.770290
[148/500]	cv_eval_time=0.99 sec	RMSE=0.085294	R^2=0.771959
[149/500]	cv_eval_time=0.96 sec	RMSE=0.085322	R^2=0.771962
[150/500]	cv_eval_time=0.99 sec	RMSE=0.085919	R^2=0.768724
[151/500]	cv_eval_time=1.03 sec	RMSE=0.086201	R^2=0.767180
[152/500]	cv_eval_time=0.98 sec	RMSE=0.086110	R^2=0.767682
[153/500]	cv_eval_time=0.96 sec	RMSE=0.086034	R^2=0.768125
[154/500]	cv_eval_time=1.01 sec	RMSE=0.085851	R^2=0.769074
[155/500]	cv_eval_time=0.99 sec	RMSE=0.085962	R^2=0.768463
[156/500]	cv_eval_time=0.97 sec	RMSE=0.085677	R^2=0.770076
[157/500]	cv_eval_time=1.07 sec	RMSE=0.086372	R^2=0.766246
[158/500]	cv_eval_time=0.98 sec	RMSE=0.085808	R^2=0.7693

[281/500]	cv_eval_time=0.96 sec	RMSE=0.085754	R^2=0.769645
[282/500]	cv_eval_time=0.96 sec	RMSE=0.086088	R^2=0.767762
[283/500]	cv_eval_time=0.96 sec	RMSE=0.086022	R^2=0.768146
[284/500]	cv_eval_time=0.97 sec	RMSE=0.085959	R^2=0.768508
[285/500]	cv_eval_time=0.95 sec	RMSE=0.085831	R^2=0.769148
[286/500]	cv_eval_time=0.94 sec	RMSE=0.085644	R^2=0.770146
[287/500]	cv_eval_time=0.97 sec	RMSE=0.085926	R^2=0.768717
[288/500]	cv_eval_time=0.95 sec	RMSE=0.085331	R^2=0.771911
[289/500]	cv_eval_time=0.95 sec	RMSE=0.086384	R^2=0.766178
[290/500]	cv_eval_time=0.94 sec	RMSE=0.085749	R^2=0.769610
[291/500]	cv_eval_time=0.94 sec	RMSE=0.086250	R^2=0.766963
[292/500]	cv_eval_time=0.95 sec	RMSE=0.085851	R^2=0.769043
[293/500]	cv_eval_time=0.99 sec	RMSE=0.085645	R^2=0.770226
[294/500]	cv_eval_time=0.96 sec	RMSE=0.086100	R^2=0.767845
[295/500]	cv_eval_time=0.97 sec	RMSE=0.086003	R^2=0.768297
[296/500]	cv_eval_time=0.95 sec	RMSE=0.086163	R^2=0.767426
[297/500]	cv_eval_time=0.96 sec	RMSE=0.085943	R^2=0.7685

[420/500]	cv_eval_time=0.97 sec	RMSE=0.086048	R^2=0.767982
[421/500]	cv_eval_time=0.95 sec	RMSE=0.085884	R^2=0.768906
[422/500]	cv_eval_time=0.97 sec	RMSE=0.085343	R^2=0.771896
[423/500]	cv_eval_time=0.98 sec	RMSE=0.085745	R^2=0.769707
[424/500]	cv_eval_time=0.97 sec	RMSE=0.085852	R^2=0.769084
[425/500]	cv_eval_time=0.97 sec	RMSE=0.085635	R^2=0.770276
[426/500]	cv_eval_time=0.96 sec	RMSE=0.086094	R^2=0.767849
[427/500]	cv_eval_time=0.99 sec	RMSE=0.085955	R^2=0.768631
[428/500]	cv_eval_time=0.97 sec	RMSE=0.086017	R^2=0.768228
[429/500]	cv_eval_time=0.95 sec	RMSE=0.085310	R^2=0.771971
[430/500]	cv_eval_time=0.97 sec	RMSE=0.085785	R^2=0.769459
[431/500]	cv_eval_time=0.94 sec	RMSE=0.085888	R^2=0.768923
[432/500]	cv_eval_time=0.95 sec	RMSE=0.085648	R^2=0.770236
[433/500]	cv_eval_time=0.96 sec	RMSE=0.085945	R^2=0.768616
[434/500]	cv_eval_time=0.97 sec	RMSE=0.085750	R^2=0.769526
[435/500]	cv_eval_time=0.97 sec	RMSE=0.085839	R^2=0.769126
[436/500]	cv_eval_time=0.96 sec	RMSE=0.085339	R^2=0.7718