In [1]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import linear_model

import os
import sys
# Put the directory two levels above the working directory on the import
# path (once), so the project-local `models` package imported below resolves.
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel

  from pandas.core import datetools


In [2]:
# Load the preprocessed dataset from its pickle file.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
data_path = '../data/preprocessed_twitter_qualitative.pd'
preprocessed_data = pd.read_pickle(data_path)

# Wrap the loaded object in a DataFrame for the steps that follow
df = pd.DataFrame(preprocessed_data)

In [3]:
# Split the rows 80/20 into a training frame and a test frame;
# the fixed random_state makes the split repeatable across runs.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=0,
)

In [4]:
# Report how many rows ended up in each side of the split
n_train, n_test = len(train), len(test)
print('Number of observations in the training data:', n_train)
print('Number of observations in the test data:', n_test)

Number of observations in the training data: 18680
Number of observations in the test data: 4670


In [5]:
    # Ridge regression, tuned with hyperopt.
    #
    # Switches for this cell:
    #   DO_LOWESS - forwarded as `do_lowess` to the model's run/plot calls.
    #   hyperopt  - True: run the hyper-parameter search over `ridge.space`;
    #               False: fit the pipeline once with the alpha configured below.
    DO_LOWESS = False
    hyperopt = True

    # Wrap copies of the train/test frames so the originals are not handed to the model.
    ridge = HyperoptModel(train.copy(), test.copy(), 'ridge', cv=4, max_evals=500)

    # Use every column of df except 'score' as a raw feature
    # (presumably 'score' is the prediction target -- confirm against HyperoptModel).
    features = list(df)
    features.remove('score')
    ridge.raw_features = features

    # Single-step pipeline; this alpha is the starting/fallback regularization strength.
    ridge.pipeline = Pipeline([
        ('estimate', linear_model.Ridge(alpha=0.45462273654931473))
    ])

    # Search space: alpha drawn uniformly from [0, 1000]. The key follows the
    # sklearn `<step>__<param>` convention so it addresses the 'estimate' step.
    ridge.space = {
        'estimate__alpha': hp.uniform('estimate__alpha', 0, 1000),
    }

    if hyperopt:
        ridge.run(do_lowess=DO_LOWESS)
    else:
        # Fit with the alpha configured above, then produce the diagnostics.
        # Fixed: this branch previously referenced an undefined name `cat`
        # (copy-paste leftover), which raised NameError when hyperopt was False.
        ridge.pipeline.fit(X=ridge.X_train, y=ridge.y_train)
        ridge.model = ridge.pipeline
        ridge.stats()
        ridge.plot_feature_importance()
        ridge.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        ridge.qq_plot()

Performing parameters optimization...
[1/500]	cv_eval_time=0.22 sec	RMSE=0.011784	R^2=0.015323
[2/500]	cv_eval_time=0.31 sec	RMSE=0.011784	R^2=0.018780
[3/500]	cv_eval_time=0.25 sec	RMSE=0.011782	R^2=0.016573
[4/500]	cv_eval_time=0.22 sec	RMSE=0.011785	R^2=0.014632
[5/500]	cv_eval_time=0.24 sec	RMSE=0.011784	R^2=0.014506
[6/500]	cv_eval_time=0.25 sec	RMSE=0.011784	R^2=0.018741
[7/500]	cv_eval_time=0.22 sec	RMSE=0.011786	R^2=0.013602
[8/500]	cv_eval_time=0.21 sec	RMSE=0.011782	R^2=0.018838
[9/500]	cv_eval_time=0.20 sec	RMSE=0.011784	R^2=0.018912
[10/500]	cv_eval_time=0.21 sec	RMSE=0.011783	R^2=0.014235
[11/500]	cv_eval_time=0.21 sec	RMSE=0.011785	R^2=0.018596
[12/500]	cv_eval_time=0.20 sec	RMSE=0.011783	R^2=0.018458
[13/500]	cv_eval_time=0.22 sec	RMSE=0.011784	R^2=0.018994
[14/500]	cv_eval_time=0.20 sec	RMSE=0.011785	R^2=0.015851
[15/500]	cv_eval_time=0.20 sec	RMSE=0.011786	R^2=0.016010
[16/500]	cv_eval_time=0.20 sec	RMSE=0.011787	R^2=0.014143
[17/500]	cv_eval_time=0.26 sec	RMSE=0.01178

[142/500]	cv_eval_time=0.26 sec	RMSE=0.011785	R^2=0.019304
[143/500]	cv_eval_time=0.28 sec	RMSE=0.011792	R^2=0.022762
[144/500]	cv_eval_time=0.26 sec	RMSE=0.011782	R^2=0.018248
[145/500]	cv_eval_time=0.29 sec	RMSE=0.011782	R^2=0.020332
[146/500]	cv_eval_time=0.27 sec	RMSE=0.011786	R^2=0.017984
[147/500]	cv_eval_time=0.28 sec	RMSE=0.011784	R^2=0.016838
[148/500]	cv_eval_time=0.27 sec	RMSE=0.011794	R^2=0.019467
[149/500]	cv_eval_time=0.26 sec	RMSE=0.011781	R^2=0.016635
[150/500]	cv_eval_time=0.27 sec	RMSE=0.011786	R^2=0.017330
[151/500]	cv_eval_time=0.27 sec	RMSE=0.011782	R^2=0.017179
[152/500]	cv_eval_time=0.27 sec	RMSE=0.011788	R^2=0.017956
[153/500]	cv_eval_time=0.26 sec	RMSE=0.011782	R^2=0.016136
[154/500]	cv_eval_time=0.27 sec	RMSE=0.011782	R^2=0.015405
[155/500]	cv_eval_time=0.26 sec	RMSE=0.011788	R^2=0.017045
[156/500]	cv_eval_time=0.26 sec	RMSE=0.011785	R^2=0.020447
[157/500]	cv_eval_time=0.26 sec	RMSE=0.011783	R^2=0.017653
[158/500]	cv_eval_time=0.32 sec	RMSE=0.011786	R^2=0.0213

[281/500]	cv_eval_time=0.26 sec	RMSE=0.011781	R^2=0.016052
[282/500]	cv_eval_time=0.26 sec	RMSE=0.011787	R^2=0.016010
[283/500]	cv_eval_time=0.28 sec	RMSE=0.011782	R^2=0.017281
[284/500]	cv_eval_time=0.42 sec	RMSE=0.011786	R^2=0.016078
[285/500]	cv_eval_time=0.43 sec	RMSE=0.011783	R^2=0.016837
[286/500]	cv_eval_time=0.27 sec	RMSE=0.011788	R^2=0.017010
[287/500]	cv_eval_time=0.24 sec	RMSE=0.011785	R^2=0.017445
[288/500]	cv_eval_time=0.31 sec	RMSE=0.011780	R^2=0.010682
[289/500]	cv_eval_time=0.28 sec	RMSE=0.011781	R^2=0.020547
[290/500]	cv_eval_time=0.31 sec	RMSE=0.011785	R^2=0.016602
[291/500]	cv_eval_time=0.30 sec	RMSE=0.011785	R^2=0.016847
[292/500]	cv_eval_time=0.26 sec	RMSE=0.011786	R^2=0.016660
[293/500]	cv_eval_time=0.23 sec	RMSE=0.011784	R^2=0.016943
[294/500]	cv_eval_time=0.33 sec	RMSE=0.011790	R^2=0.018517
[295/500]	cv_eval_time=0.31 sec	RMSE=0.011787	R^2=0.018249
[296/500]	cv_eval_time=0.32 sec	RMSE=0.011785	R^2=0.017045
[297/500]	cv_eval_time=0.30 sec	RMSE=0.011788	R^2=0.0178

[420/500]	cv_eval_time=0.30 sec	RMSE=0.011782	R^2=0.015167
[421/500]	cv_eval_time=0.28 sec	RMSE=0.011788	R^2=0.019329
[422/500]	cv_eval_time=0.28 sec	RMSE=0.011782	R^2=0.015068
[423/500]	cv_eval_time=0.50 sec	RMSE=0.011786	R^2=0.013840
[424/500]	cv_eval_time=0.30 sec	RMSE=0.011783	R^2=0.017183
[425/500]	cv_eval_time=0.28 sec	RMSE=0.011783	R^2=0.017031
[426/500]	cv_eval_time=0.29 sec	RMSE=0.011788	R^2=0.019121
[427/500]	cv_eval_time=0.33 sec	RMSE=0.011782	R^2=0.017446
[428/500]	cv_eval_time=0.36 sec	RMSE=0.011779	R^2=0.017022
[429/500]	cv_eval_time=0.32 sec	RMSE=0.011783	R^2=0.016051
[430/500]	cv_eval_time=0.37 sec	RMSE=0.011784	R^2=0.017130
[431/500]	cv_eval_time=0.37 sec	RMSE=0.011785	R^2=0.016775
[432/500]	cv_eval_time=0.35 sec	RMSE=0.011784	R^2=0.019420
[433/500]	cv_eval_time=0.36 sec	RMSE=0.011788	R^2=0.017648
[434/500]	cv_eval_time=0.49 sec	RMSE=0.011788	R^2=0.017569
[435/500]	cv_eval_time=0.33 sec	RMSE=0.011784	R^2=0.017457
[436/500]	cv_eval_time=0.35 sec	RMSE=0.011787	R^2=0.0190