In [1]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import linear_model

import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel

  from pandas.core import datetools


In [2]:
# read the data
preprocessed_data = pd.read_pickle('../data/preprocessed_science_qualitative.pd')

# Create a dataframe
df = pd.DataFrame(preprocessed_data)

In [3]:
# Create two new dataframes, one with the training rows, one with the test rows
train, test=train_test_split(df, test_size=0.2, random_state=0)

In [4]:
# Show the number of observations for the test and training dataframes
print('Number of observations in the training data:', len(train))
print('Number of observations in the test data:',len(test))

Number of observations in the training data: 79342
Number of observations in the test data: 19836


In [5]:
    #some parameters
    DO_LOWESS = False
    hyperopt = True
    # Run Lasso with hyperopt optimization
    lasso = HyperoptModel(train.copy(), test.copy(),'lasso', cv=4, max_evals = 500)
    features = list(df)
    features.remove('score')

    lasso.pipeline = Pipeline([
        ('estimate', linear_model.Lasso(**{'alpha': 0.0007414388017055629}))
    ])
    
    lasso.raw_features = features
    lasso.space = {
        'estimate__alpha': hp.uniform('estimate__alpha', 0, 10),
    }

    if hyperopt:
        lasso.run(do_lowess=DO_LOWESS)
    else:
        # train with default params
        lasso.pipeline.fit(X=lasso.X_train, y=lasso.y_train)
        lasso.model = lasso.pipeline
        lasso.stats()
        lasso.plot_feature_importance()
        lasso.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        lasso.qq_plot()

Performing parameters optimization...
[1/500]	cv_eval_time=0.94 sec	RMSE=0.147420	R^2=0.094725
[2/500]	cv_eval_time=0.96 sec	RMSE=0.151428	R^2=0.044811
[3/500]	cv_eval_time=0.79 sec	RMSE=0.148134	R^2=0.085851
[4/500]	cv_eval_time=0.81 sec	RMSE=0.147907	R^2=0.088510
[5/500]	cv_eval_time=1.53 sec	RMSE=0.145564	R^2=0.117923
[6/500]	cv_eval_time=0.81 sec	RMSE=0.147713	R^2=0.091115
[7/500]	cv_eval_time=0.89 sec	RMSE=0.149865	R^2=0.064417
[8/500]	cv_eval_time=1.20 sec	RMSE=0.147276	R^2=0.096445
[9/500]	cv_eval_time=0.82 sec	RMSE=0.147988	R^2=0.087884
[10/500]	cv_eval_time=0.87 sec	RMSE=0.148287	R^2=0.084091
[11/500]	cv_eval_time=1.63 sec	RMSE=0.146099	R^2=0.110695
[12/500]	cv_eval_time=0.87 sec	RMSE=0.147679	R^2=0.091258
[13/500]	cv_eval_time=0.84 sec	RMSE=0.148927	R^2=0.075955
[14/500]	cv_eval_time=0.97 sec	RMSE=0.147374	R^2=0.095288
[15/500]	cv_eval_time=1.63 sec	RMSE=0.147166	R^2=0.098025
[16/500]	cv_eval_time=1.01 sec	RMSE=0.147425	R^2=0.094557
[17/500]	cv_eval_time=0.85 sec	RMSE=0.14814

[142/500]	cv_eval_time=1.11 sec	RMSE=0.148344	R^2=0.083280
[143/500]	cv_eval_time=0.93 sec	RMSE=0.147303	R^2=0.095742
[144/500]	cv_eval_time=0.79 sec	RMSE=0.147396	R^2=0.094886
[145/500]	cv_eval_time=0.83 sec	RMSE=0.141572	R^2=0.165128
[146/500]	cv_eval_time=1.26 sec	RMSE=0.145311	R^2=0.120564
[147/500]	cv_eval_time=1.37 sec	RMSE=0.146246	R^2=0.109383
[148/500]	cv_eval_time=1.63 sec	RMSE=0.133911	R^2=0.252930
[149/500]	cv_eval_time=0.90 sec	RMSE=0.137900	R^2=0.207819
[150/500]	cv_eval_time=0.85 sec	RMSE=0.147340	R^2=0.095674
[151/500]	cv_eval_time=1.63 sec	RMSE=0.133840	R^2=0.253739
[152/500]	cv_eval_time=1.14 sec	RMSE=0.144554	R^2=0.129705
[153/500]	cv_eval_time=0.78 sec	RMSE=0.147525	R^2=0.093364
[154/500]	cv_eval_time=1.35 sec	RMSE=0.145826	R^2=0.114251
[155/500]	cv_eval_time=3.10 sec	RMSE=0.122913	R^2=0.370489
[156/500]	cv_eval_time=1.43 sec	RMSE=0.146463	R^2=0.106093
[157/500]	cv_eval_time=1.35 sec	RMSE=0.147235	R^2=0.096853
[158/500]	cv_eval_time=2.67 sec	RMSE=0.125812	R^2=0.3406

[281/500]	cv_eval_time=1.33 sec	RMSE=0.144484	R^2=0.130397
[282/500]	cv_eval_time=1.71 sec	RMSE=0.146323	R^2=0.107949
[283/500]	cv_eval_time=1.12 sec	RMSE=0.147328	R^2=0.095617
[284/500]	cv_eval_time=4.61 sec	RMSE=0.125061	R^2=0.348343
[285/500]	cv_eval_time=0.95 sec	RMSE=0.139289	R^2=0.191638
[286/500]	cv_eval_time=1.55 sec	RMSE=0.145339	R^2=0.119723
[287/500]	cv_eval_time=1.36 sec	RMSE=0.147270	R^2=0.096192
[288/500]	cv_eval_time=0.96 sec	RMSE=0.147385	R^2=0.095078
[289/500]	cv_eval_time=2.74 sec	RMSE=0.145940	R^2=0.112810
[290/500]	cv_eval_time=1.89 sec	RMSE=0.146751	R^2=0.103017
[291/500]	cv_eval_time=1.02 sec	RMSE=0.137841	R^2=0.208407
[292/500]	cv_eval_time=0.87 sec	RMSE=0.149400	R^2=0.070173
[293/500]	cv_eval_time=1.14 sec	RMSE=0.143627	R^2=0.140947
[294/500]	cv_eval_time=0.90 sec	RMSE=0.147525	R^2=0.093445
[295/500]	cv_eval_time=2.61 sec	RMSE=0.129713	R^2=0.299337
[296/500]	cv_eval_time=1.58 sec	RMSE=0.145292	R^2=0.120636
[297/500]	cv_eval_time=1.09 sec	RMSE=0.147305	R^2=0.0958

[420/500]	cv_eval_time=1.07 sec	RMSE=0.149179	R^2=0.072979
[421/500]	cv_eval_time=1.17 sec	RMSE=0.147296	R^2=0.096020
[422/500]	cv_eval_time=2.42 sec	RMSE=0.146236	R^2=0.109315
[423/500]	cv_eval_time=1.11 sec	RMSE=0.143859	R^2=0.137825
[424/500]	cv_eval_time=1.56 sec	RMSE=0.147226	R^2=0.097021
[425/500]	cv_eval_time=1.58 sec	RMSE=0.145201	R^2=0.121822
[426/500]	cv_eval_time=3.00 sec	RMSE=0.129157	R^2=0.305160
[427/500]	cv_eval_time=1.03 sec	RMSE=0.147342	R^2=0.095704
[428/500]	cv_eval_time=1.07 sec	RMSE=0.137728	R^2=0.209832
[429/500]	cv_eval_time=1.76 sec	RMSE=0.146262	R^2=0.108626
[430/500]	cv_eval_time=2.21 sec	RMSE=0.131915	R^2=0.275125
[431/500]	cv_eval_time=1.10 sec	RMSE=0.141504	R^2=0.165802
[432/500]	cv_eval_time=1.04 sec	RMSE=0.147382	R^2=0.095168
[433/500]	cv_eval_time=2.03 sec	RMSE=0.145662	R^2=0.116314
[434/500]	cv_eval_time=1.84 sec	RMSE=0.147125	R^2=0.098169
[435/500]	cv_eval_time=1.13 sec	RMSE=0.136453	R^2=0.224634
[436/500]	cv_eval_time=1.44 sec	RMSE=0.144562	R^2=0.1293