In [1]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import linear_model

import os
import sys
# Resolve the directory two levels above the current working directory and
# register it on the import search path (once), so that the project-local
# `models` package imported below can be found.
module_path = os.path.abspath('../..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel

  from pandas.core import datetools


In [2]:
# Load the pickled, preprocessed dataset from the sibling data directory.
# NOTE(review): unpickling executes arbitrary code - only load trusted files.
preprocessed_data = pd.read_pickle(
    '../data/preprocessed_twitter_quantitative.pd'
)

# Wrap the loaded object in a DataFrame; `df` is what the later cells use.
df = pd.DataFrame(data=preprocessed_data)

In [3]:
# Split `df` into a training frame and a test frame: 20% of the rows go to
# the test set, and the fixed random_state makes the split repeatable.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=0,
)

In [4]:
# Report how many rows ended up in each side of the split.
for message, frame in (
    ('Number of observations in the training data:', train),
    ('Number of observations in the test data:', test),
):
    print(message, len(frame))

Number of observations in the training data: 18680
Number of observations in the test data: 4670


In [5]:
    # --- Run configuration ---------------------------------------------
    # DO_LOWESS is forwarded as `do_lowess` to the run/plot calls below.
    DO_LOWESS = False
    # True  -> search for alpha with ridge.run()
    # False -> fit the fixed pipeline defined below and report on it directly
    hyperopt = True

    # Ridge regression wrapped in the project's HyperoptModel helper, given
    # independent copies of the train/test frames.
    # NOTE(review): cv=4 / max_evals=500 are presumably the number of CV folds
    # and of search evaluations - confirm against HyperoptModel.
    ridge = HyperoptModel(train.copy(), test.copy(), 'ridge', cv=4, max_evals=500)

    # Every column of df except 'score' is registered as a raw feature.
    features = list(df)
    features.remove('score')
    ridge.raw_features = features

    # Single-step pipeline; this alpha is the value used when no search is run.
    ridge.pipeline = Pipeline([
        ('estimate', linear_model.Ridge(alpha=0.054806875409410205))
    ])

    # Search space: alpha of the 'estimate' step, sampled uniformly in [0, 1000].
    ridge.space = {
        'estimate__alpha': hp.uniform('estimate__alpha', 0, 1000),
    }

    if hyperopt:
        ridge.run(do_lowess=DO_LOWESS)
    else:
        # Fit the pipeline exactly as configured above (fixed alpha, no search).
        # This branch previously referenced an undefined `cat` object and
        # raised NameError; it must operate on `ridge`.
        ridge.pipeline.fit(X=ridge.X_train, y=ridge.y_train)
        ridge.model = ridge.pipeline
        ridge.stats()
        ridge.plot_feature_importance()
        ridge.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        ridge.qq_plot()

Performing parameters optimization...
[1/500]	cv_eval_time=0.43 sec	RMSE=0.114079	R^2=0.662537
[2/500]	cv_eval_time=0.37 sec	RMSE=0.114622	R^2=0.658266
[3/500]	cv_eval_time=0.34 sec	RMSE=0.111371	R^2=0.678371
[4/500]	cv_eval_time=0.33 sec	RMSE=0.115779	R^2=0.653634
[5/500]	cv_eval_time=0.37 sec	RMSE=0.118223	R^2=0.638710
[6/500]	cv_eval_time=0.36 sec	RMSE=0.115786	R^2=0.655230
[7/500]	cv_eval_time=0.34 sec	RMSE=0.110844	R^2=0.682333
[8/500]	cv_eval_time=0.37 sec	RMSE=0.113322	R^2=0.668592
[9/500]	cv_eval_time=0.39 sec	RMSE=0.113386	R^2=0.666691
[10/500]	cv_eval_time=0.33 sec	RMSE=0.117214	R^2=0.642903
[11/500]	cv_eval_time=0.32 sec	RMSE=0.115409	R^2=0.655297
[12/500]	cv_eval_time=0.37 sec	RMSE=0.114202	R^2=0.661587
[13/500]	cv_eval_time=0.37 sec	RMSE=0.117156	R^2=0.644686
[14/500]	cv_eval_time=0.32 sec	RMSE=0.115024	R^2=0.656094
[15/500]	cv_eval_time=0.32 sec	RMSE=0.112070	R^2=0.673905
[16/500]	cv_eval_time=0.36 sec	RMSE=0.118027	R^2=0.638927
[17/500]	cv_eval_time=0.38 sec	RMSE=0.11150

[142/500]	cv_eval_time=0.43 sec	RMSE=0.115820	R^2=0.651470
[143/500]	cv_eval_time=0.38 sec	RMSE=0.115463	R^2=0.657605
[144/500]	cv_eval_time=0.38 sec	RMSE=0.113420	R^2=0.666207
[145/500]	cv_eval_time=0.53 sec	RMSE=0.112103	R^2=0.674679
[146/500]	cv_eval_time=0.47 sec	RMSE=0.111287	R^2=0.679506
[147/500]	cv_eval_time=0.37 sec	RMSE=0.109066	R^2=0.691606
[148/500]	cv_eval_time=0.39 sec	RMSE=0.108802	R^2=0.693562
[149/500]	cv_eval_time=0.40 sec	RMSE=0.110344	R^2=0.684385
[150/500]	cv_eval_time=0.51 sec	RMSE=0.111725	R^2=0.675835
[151/500]	cv_eval_time=0.41 sec	RMSE=0.110622	R^2=0.681301
[152/500]	cv_eval_time=0.34 sec	RMSE=0.109428	R^2=0.690720
[153/500]	cv_eval_time=0.34 sec	RMSE=0.111224	R^2=0.681072
[154/500]	cv_eval_time=0.39 sec	RMSE=0.112975	R^2=0.668442
[155/500]	cv_eval_time=0.41 sec	RMSE=0.111502	R^2=0.678792
[156/500]	cv_eval_time=0.53 sec	RMSE=0.111970	R^2=0.675966
[157/500]	cv_eval_time=0.42 sec	RMSE=0.111662	R^2=0.676746
[158/500]	cv_eval_time=0.53 sec	RMSE=0.108734	R^2=0.6932

[281/500]	cv_eval_time=0.33 sec	RMSE=0.110805	R^2=0.681863
[282/500]	cv_eval_time=0.31 sec	RMSE=0.112518	R^2=0.671599
[283/500]	cv_eval_time=0.31 sec	RMSE=0.112879	R^2=0.670910
[284/500]	cv_eval_time=0.32 sec	RMSE=0.111408	R^2=0.678004
[285/500]	cv_eval_time=0.34 sec	RMSE=0.110828	R^2=0.683474
[286/500]	cv_eval_time=0.48 sec	RMSE=0.108673	R^2=0.693734
[287/500]	cv_eval_time=0.38 sec	RMSE=0.111703	R^2=0.677002
[288/500]	cv_eval_time=0.67 sec	RMSE=0.110993	R^2=0.681004
[289/500]	cv_eval_time=0.80 sec	RMSE=0.112368	R^2=0.674677
[290/500]	cv_eval_time=0.68 sec	RMSE=0.108743	R^2=0.692612
[291/500]	cv_eval_time=0.93 sec	RMSE=0.110306	R^2=0.684596
[292/500]	cv_eval_time=0.81 sec	RMSE=0.111247	R^2=0.678982
[293/500]	cv_eval_time=0.86 sec	RMSE=0.112285	R^2=0.673023
[294/500]	cv_eval_time=0.38 sec	RMSE=0.111878	R^2=0.675420
[295/500]	cv_eval_time=0.48 sec	RMSE=0.108760	R^2=0.694262
[296/500]	cv_eval_time=0.35 sec	RMSE=0.110925	R^2=0.681307
[297/500]	cv_eval_time=0.33 sec	RMSE=0.114089	R^2=0.6631

[420/500]	cv_eval_time=0.63 sec	RMSE=0.111209	R^2=0.678204
[421/500]	cv_eval_time=0.48 sec	RMSE=0.109130	R^2=0.690900
[422/500]	cv_eval_time=0.64 sec	RMSE=0.110709	R^2=0.683076
[423/500]	cv_eval_time=0.38 sec	RMSE=0.111765	R^2=0.677138
[424/500]	cv_eval_time=0.33 sec	RMSE=0.109204	R^2=0.690057
[425/500]	cv_eval_time=0.31 sec	RMSE=0.108972	R^2=0.691913
[426/500]	cv_eval_time=0.33 sec	RMSE=0.111092	R^2=0.679493
[427/500]	cv_eval_time=0.31 sec	RMSE=0.112288	R^2=0.672658
[428/500]	cv_eval_time=0.70 sec	RMSE=0.111668	R^2=0.676146
[429/500]	cv_eval_time=0.46 sec	RMSE=0.112065	R^2=0.674619
[430/500]	cv_eval_time=0.51 sec	RMSE=0.110508	R^2=0.684419
[431/500]	cv_eval_time=0.37 sec	RMSE=0.108726	R^2=0.693548
[432/500]	cv_eval_time=0.54 sec	RMSE=0.110728	R^2=0.681352
[433/500]	cv_eval_time=0.29 sec	RMSE=0.111122	R^2=0.678699
[434/500]	cv_eval_time=0.36 sec	RMSE=0.110213	R^2=0.685631
[435/500]	cv_eval_time=0.32 sec	RMSE=0.111586	R^2=0.676705
[436/500]	cv_eval_time=0.37 sec	RMSE=0.110896	R^2=0.6811