In [1]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import linear_model

import os
import sys
# Put the directory two levels above the working directory on the import
# path (presumably so the project-local `models` package imported below
# can be resolved). The membership check keeps re-runs of this cell from
# appending the same entry repeatedly.
module_path = os.path.abspath('../..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel

  from pandas.core import datetools


In [2]:
# Load the preprocessed dataset from its pickle file.
# NOTE(review): unpickling executes arbitrary code, so this file must come
# from a trusted source.
preprocessed_data = pd.read_pickle('../data/preprocessed_twitter_quantitative.pd')

# Wrap the loaded object in a DataFrame for the steps that follow
df = pd.DataFrame(data=preprocessed_data)

In [3]:
# Split the rows into a training frame and a test frame: 20% of the rows are
# held out for testing, and the fixed random_state makes the split repeatable.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=0,
)

In [4]:
# Report how many rows ended up in each of the two splits
print('Number of observations in the training data:', train.shape[0])
print('Number of observations in the test data:', test.shape[0])

Number of observations in the training data: 18680
Number of observations in the test data: 4670


In [5]:
    # ---- Parameters -------------------------------------------------------
    # Forwarded as `do_lowess` to the run / plotting calls below.
    DO_LOWESS = False
    # True  -> search over `ridge.space` via ridge.run();
    # False -> fit the pipeline once with the alpha hard-coded below.
    # NOTE(review): this flag has the same name as the `hyperopt` package;
    # only `hp` and `scope` are imported from it here, so nothing is shadowed,
    # but a more distinctive name would be clearer.
    hyperopt = True

    # ---- Ridge regression wrapped in the project's HyperoptModel ----------
    # Copies of the splits are passed so the model cannot modify `train`/`test`.
    # NOTE(review): cv=10 is presumably the number of cross-validation folds
    # and max_evals the number of search iterations — confirm in HyperoptModel.
    ridge = HyperoptModel(train.copy(), test.copy(), 'ridge', cv=10, max_evals=500)

    # Every column of `df` except 'score' (presumably the regression target)
    # is used as a raw feature.
    features = list(df)
    features.remove('score')
    ridge.raw_features = features

    # Single-step pipeline; this alpha is the value used when the search is
    # skipped (the `else` branch below).
    ridge.pipeline = Pipeline([
        ('estimate', linear_model.Ridge(alpha=0.008782998998436944))
    ])

    # Search space: alpha of the 'estimate' step, drawn uniformly from [0, 1000].
    ridge.space = {
        'estimate__alpha': hp.uniform('estimate__alpha', 0, 1000),
    }

    if hyperopt:
        ridge.run(do_lowess=DO_LOWESS)
    else:
        # Train with the fixed parameters defined in the pipeline above.
        # The original code referenced an undefined `cat` object here (left
        # over from another notebook), which raised NameError; the ridge
        # model's own data and pipeline are what is intended.
        # NOTE(review): assumes HyperoptModel exposes X_train / y_train, as
        # the original `cat.X_train` / `cat.y_train` usage implies — confirm.
        ridge.pipeline.fit(X=ridge.X_train, y=ridge.y_train)
        ridge.model = ridge.pipeline
        ridge.stats()
        ridge.plot_feature_importance()
        ridge.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        ridge.qq_plot()

Performing parameters optimization...
[1/500]	cv_eval_time=1.12 sec	RMSE=0.113252	R^2=0.667825
[2/500]	cv_eval_time=1.02 sec	RMSE=0.114450	R^2=0.660855
[3/500]	cv_eval_time=0.98 sec	RMSE=0.113819	R^2=0.662607
[4/500]	cv_eval_time=1.01 sec	RMSE=0.111564	R^2=0.677740
[5/500]	cv_eval_time=1.15 sec	RMSE=0.114761	R^2=0.658483
[6/500]	cv_eval_time=1.05 sec	RMSE=0.112247	R^2=0.673166
[7/500]	cv_eval_time=1.34 sec	RMSE=0.113154	R^2=0.667896
[8/500]	cv_eval_time=1.46 sec	RMSE=0.112565	R^2=0.670762
[9/500]	cv_eval_time=1.17 sec	RMSE=0.113682	R^2=0.664928
[10/500]	cv_eval_time=1.02 sec	RMSE=0.117648	R^2=0.641498
[11/500]	cv_eval_time=0.94 sec	RMSE=0.113095	R^2=0.668356
[12/500]	cv_eval_time=1.00 sec	RMSE=0.110640	R^2=0.682619
[13/500]	cv_eval_time=0.99 sec	RMSE=0.112267	R^2=0.673399
[14/500]	cv_eval_time=1.03 sec	RMSE=0.110532	R^2=0.682791
[15/500]	cv_eval_time=1.00 sec	RMSE=0.117312	R^2=0.642346
[16/500]	cv_eval_time=0.97 sec	RMSE=0.115004	R^2=0.657041
[17/500]	cv_eval_time=0.95 sec	RMSE=0.11644

[142/500]	cv_eval_time=1.09 sec	RMSE=0.115112	R^2=0.655807
[143/500]	cv_eval_time=1.09 sec	RMSE=0.111519	R^2=0.677600
[144/500]	cv_eval_time=1.05 sec	RMSE=0.110811	R^2=0.682068
[145/500]	cv_eval_time=1.02 sec	RMSE=0.111226	R^2=0.678849
[146/500]	cv_eval_time=0.95 sec	RMSE=0.110044	R^2=0.686564
[147/500]	cv_eval_time=0.97 sec	RMSE=0.111849	R^2=0.676076
[148/500]	cv_eval_time=0.99 sec	RMSE=0.109322	R^2=0.690494
[149/500]	cv_eval_time=1.04 sec	RMSE=0.110601	R^2=0.682808
[150/500]	cv_eval_time=1.10 sec	RMSE=0.111024	R^2=0.680281
[151/500]	cv_eval_time=1.47 sec	RMSE=0.110317	R^2=0.683704
[152/500]	cv_eval_time=1.99 sec	RMSE=0.109112	R^2=0.691827
[153/500]	cv_eval_time=1.98 sec	RMSE=0.111422	R^2=0.677864
[154/500]	cv_eval_time=1.59 sec	RMSE=0.110719	R^2=0.681681
[155/500]	cv_eval_time=1.03 sec	RMSE=0.110122	R^2=0.685801
[156/500]	cv_eval_time=1.03 sec	RMSE=0.111790	R^2=0.676233
[157/500]	cv_eval_time=0.97 sec	RMSE=0.108780	R^2=0.692995
[158/500]	cv_eval_time=1.02 sec	RMSE=0.111194	R^2=0.6789

[281/500]	cv_eval_time=0.93 sec	RMSE=0.110880	R^2=0.681237
[282/500]	cv_eval_time=0.93 sec	RMSE=0.111650	R^2=0.676768
[283/500]	cv_eval_time=1.17 sec	RMSE=0.111170	R^2=0.679414
[284/500]	cv_eval_time=1.92 sec	RMSE=0.110005	R^2=0.686007
[285/500]	cv_eval_time=1.87 sec	RMSE=0.111977	R^2=0.674391
[286/500]	cv_eval_time=1.81 sec	RMSE=0.110646	R^2=0.682115
[287/500]	cv_eval_time=1.91 sec	RMSE=0.111406	R^2=0.678389
[288/500]	cv_eval_time=1.11 sec	RMSE=0.108560	R^2=0.694416
[289/500]	cv_eval_time=0.88 sec	RMSE=0.108626	R^2=0.693606
[290/500]	cv_eval_time=0.93 sec	RMSE=0.112319	R^2=0.672804
[291/500]	cv_eval_time=0.91 sec	RMSE=0.111782	R^2=0.676052
[292/500]	cv_eval_time=0.83 sec	RMSE=0.110388	R^2=0.684197
[293/500]	cv_eval_time=0.83 sec	RMSE=0.115551	R^2=0.653421
[294/500]	cv_eval_time=0.87 sec	RMSE=0.111011	R^2=0.679833
[295/500]	cv_eval_time=0.89 sec	RMSE=0.110858	R^2=0.681337
[296/500]	cv_eval_time=0.88 sec	RMSE=0.114395	R^2=0.660478
[297/500]	cv_eval_time=0.93 sec	RMSE=0.110220	R^2=0.6852

[420/500]	cv_eval_time=1.07 sec	RMSE=0.110500	R^2=0.683762
[421/500]	cv_eval_time=1.52 sec	RMSE=0.111256	R^2=0.678972
[422/500]	cv_eval_time=0.95 sec	RMSE=0.110093	R^2=0.685998
[423/500]	cv_eval_time=1.96 sec	RMSE=0.108591	R^2=0.692961
[424/500]	cv_eval_time=1.25 sec	RMSE=0.111785	R^2=0.675871
[425/500]	cv_eval_time=0.93 sec	RMSE=0.110638	R^2=0.682365
[426/500]	cv_eval_time=0.93 sec	RMSE=0.111067	R^2=0.680486
[427/500]	cv_eval_time=0.93 sec	RMSE=0.111931	R^2=0.675114
[428/500]	cv_eval_time=0.93 sec	RMSE=0.110837	R^2=0.682085
[429/500]	cv_eval_time=0.95 sec	RMSE=0.109001	R^2=0.691321
[430/500]	cv_eval_time=1.00 sec	RMSE=0.110394	R^2=0.684198
[431/500]	cv_eval_time=1.30 sec	RMSE=0.111406	R^2=0.678693
[432/500]	cv_eval_time=0.93 sec	RMSE=0.111112	R^2=0.679960
[433/500]	cv_eval_time=0.93 sec	RMSE=0.110085	R^2=0.685088
[434/500]	cv_eval_time=1.01 sec	RMSE=0.110480	R^2=0.683759
[435/500]	cv_eval_time=1.03 sec	RMSE=0.108729	R^2=0.693550
[436/500]	cv_eval_time=0.99 sec	RMSE=0.111591	R^2=0.6768