In [1]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import linear_model

import os
import sys
# Put the directory two levels above the current working directory on the
# import path (only once), so the project-local import below can resolve.
module_path = os.path.abspath('../..')
if module_path not in sys.path:
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel

  from pandas.core import datetools


In [2]:
# Load the pickled dataset from disk.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# if it comes from a trusted source.
preprocessed_data = pd.read_pickle(
    '../data/preprocessed_twitter_Monika.pd'
)

# Wrap the loaded object in a DataFrame; `df` is what the later cells use.
df = pd.DataFrame(data=preprocessed_data)

In [3]:
# Hold out 20% of the rows as the test set and keep the rest for training.
# The fixed random_state makes the split reproducible across runs.
train, test = train_test_split(df, random_state=0, test_size=0.2)

In [4]:
# Sanity check: report how many rows ended up in each split.
print(
    'Number of observations in the training data:',
    len(train),
)
print(
    'Number of observations in the test data:',
    len(test),
)

Number of observations in the training data: 18680
Number of observations in the test data: 4670


In [5]:
    # --- Run configuration -------------------------------------------------
    # DO_LOWESS is forwarded to the run / plotting calls below.
    DO_LOWESS = False
    # True  -> search the hyper-parameter space defined in `ridge.space`.
    # False -> skip the search and fit the pipeline with the alpha set below.
    hyperopt = True

    # Ridge regression wrapped in the project's HyperoptModel helper.
    # Copies of the splits are passed so the helper cannot modify `train`/`test`.
    ridge = HyperoptModel(train.copy(), test.copy(), 'ridge', cv=4, max_evals=500)

    # Use every column of `df` except the target column 'score' as a feature.
    features = list(df)
    features.remove('score')
    ridge.raw_features = features

    # Single-step pipeline; this alpha is the value used when the search is skipped.
    ridge.pipeline = Pipeline([
        ('estimate', linear_model.Ridge(alpha=0.053441185014741655))
    ])

    # Search space: alpha sampled uniformly from [0, 1000].
    # NOTE(review): a log-uniform prior is the usual choice for a
    # regularisation strength - consider hp.loguniform if small alphas matter.
    ridge.space = {
        'estimate__alpha': hp.uniform('estimate__alpha', 0, 1000),
    }

    if hyperopt:
        ridge.run(do_lowess=DO_LOWESS)
    else:
        # Train with the pipeline's preset parameters (no search).
        # Fix: this branch previously referenced an undefined name `cat`
        # and raised NameError; it now uses `ridge`.
        # NOTE(review): assumes HyperoptModel exposes X_train / y_train - confirm.
        ridge.pipeline.fit(X=ridge.X_train, y=ridge.y_train)
        ridge.model = ridge.pipeline
        ridge.stats()
        ridge.plot_feature_importance()
        ridge.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        ridge.qq_plot()

Performing parameters optimization...
[1/500]	cv_eval_time=0.30 sec	RMSE=0.068445	R^2=0.206186
[2/500]	cv_eval_time=0.31 sec	RMSE=0.068719	R^2=0.200138
[3/500]	cv_eval_time=0.30 sec	RMSE=0.068209	R^2=0.211132
[4/500]	cv_eval_time=0.29 sec	RMSE=0.068562	R^2=0.202274
[5/500]	cv_eval_time=0.32 sec	RMSE=0.068555	R^2=0.202862
[6/500]	cv_eval_time=0.39 sec	RMSE=0.068137	R^2=0.214683
[7/500]	cv_eval_time=0.46 sec	RMSE=0.068335	R^2=0.209966
[8/500]	cv_eval_time=0.29 sec	RMSE=0.068455	R^2=0.204949
[9/500]	cv_eval_time=0.32 sec	RMSE=0.068348	R^2=0.207661
[10/500]	cv_eval_time=0.34 sec	RMSE=0.068673	R^2=0.198867
[11/500]	cv_eval_time=0.32 sec	RMSE=0.068595	R^2=0.203305
[12/500]	cv_eval_time=0.29 sec	RMSE=0.068598	R^2=0.204272
[13/500]	cv_eval_time=0.29 sec	RMSE=0.068592	R^2=0.203272
[14/500]	cv_eval_time=0.32 sec	RMSE=0.068518	R^2=0.207277
[15/500]	cv_eval_time=0.34 sec	RMSE=0.068247	R^2=0.211006
[16/500]	cv_eval_time=0.34 sec	RMSE=0.067994	R^2=0.218644
[17/500]	cv_eval_time=0.32 sec	RMSE=0.06863

[142/500]	cv_eval_time=0.52 sec	RMSE=0.068025	R^2=0.217143
[143/500]	cv_eval_time=0.53 sec	RMSE=0.068012	R^2=0.214963
[144/500]	cv_eval_time=0.42 sec	RMSE=0.067909	R^2=0.218846
[145/500]	cv_eval_time=0.45 sec	RMSE=0.068281	R^2=0.210617
[146/500]	cv_eval_time=0.56 sec	RMSE=0.067827	R^2=0.222259
[147/500]	cv_eval_time=0.39 sec	RMSE=0.067587	R^2=0.226078
[148/500]	cv_eval_time=0.34 sec	RMSE=0.067323	R^2=0.236627
[149/500]	cv_eval_time=0.55 sec	RMSE=0.067732	R^2=0.225024
[150/500]	cv_eval_time=0.24 sec	RMSE=0.067944	R^2=0.217272
[151/500]	cv_eval_time=0.28 sec	RMSE=0.067316	R^2=0.232515
[152/500]	cv_eval_time=0.32 sec	RMSE=0.067898	R^2=0.220345
[153/500]	cv_eval_time=0.28 sec	RMSE=0.067776	R^2=0.221638
[154/500]	cv_eval_time=0.30 sec	RMSE=0.067798	R^2=0.221474
[155/500]	cv_eval_time=0.28 sec	RMSE=0.067727	R^2=0.223643
[156/500]	cv_eval_time=0.29 sec	RMSE=0.067961	R^2=0.220159
[157/500]	cv_eval_time=0.26 sec	RMSE=0.067942	R^2=0.218397
[158/500]	cv_eval_time=0.35 sec	RMSE=0.068006	R^2=0.2151

[281/500]	cv_eval_time=0.34 sec	RMSE=0.067941	R^2=0.217898
[282/500]	cv_eval_time=0.34 sec	RMSE=0.067345	R^2=0.231627
[283/500]	cv_eval_time=0.30 sec	RMSE=0.067883	R^2=0.219945
[284/500]	cv_eval_time=0.29 sec	RMSE=0.067780	R^2=0.222410
[285/500]	cv_eval_time=0.33 sec	RMSE=0.067968	R^2=0.217858
[286/500]	cv_eval_time=0.67 sec	RMSE=0.067703	R^2=0.221829
[287/500]	cv_eval_time=0.44 sec	RMSE=0.067835	R^2=0.219616
[288/500]	cv_eval_time=0.38 sec	RMSE=0.067474	R^2=0.231820
[289/500]	cv_eval_time=0.39 sec	RMSE=0.068067	R^2=0.216459
[290/500]	cv_eval_time=0.37 sec	RMSE=0.068153	R^2=0.213913
[291/500]	cv_eval_time=0.58 sec	RMSE=0.067752	R^2=0.222477
[292/500]	cv_eval_time=0.72 sec	RMSE=0.067711	R^2=0.223623
[293/500]	cv_eval_time=0.43 sec	RMSE=0.067946	R^2=0.219712
[294/500]	cv_eval_time=0.59 sec	RMSE=0.067834	R^2=0.218907
[295/500]	cv_eval_time=0.30 sec	RMSE=0.068615	R^2=0.202725
[296/500]	cv_eval_time=0.35 sec	RMSE=0.068016	R^2=0.216177
[297/500]	cv_eval_time=0.56 sec	RMSE=0.067901	R^2=0.2198

[420/500]	cv_eval_time=0.33 sec	RMSE=0.067901	R^2=0.221936
[421/500]	cv_eval_time=0.31 sec	RMSE=0.067957	R^2=0.218904
[422/500]	cv_eval_time=0.34 sec	RMSE=0.067749	R^2=0.223122
[423/500]	cv_eval_time=0.31 sec	RMSE=0.067832	R^2=0.219786
[424/500]	cv_eval_time=0.32 sec	RMSE=0.067350	R^2=0.231055
[425/500]	cv_eval_time=0.35 sec	RMSE=0.067901	R^2=0.218359
[426/500]	cv_eval_time=0.30 sec	RMSE=0.067421	R^2=0.230589
[427/500]	cv_eval_time=0.30 sec	RMSE=0.068020	R^2=0.216766
[428/500]	cv_eval_time=0.33 sec	RMSE=0.067787	R^2=0.221520
[429/500]	cv_eval_time=0.30 sec	RMSE=0.067361	R^2=0.233004
[430/500]	cv_eval_time=0.27 sec	RMSE=0.067895	R^2=0.217793
[431/500]	cv_eval_time=0.25 sec	RMSE=0.067695	R^2=0.223954
[432/500]	cv_eval_time=0.18 sec	RMSE=0.067856	R^2=0.221314
[433/500]	cv_eval_time=0.18 sec	RMSE=0.067796	R^2=0.220857
[434/500]	cv_eval_time=0.17 sec	RMSE=0.067945	R^2=0.216407
[435/500]	cv_eval_time=0.19 sec	RMSE=0.067630	R^2=0.223228
[436/500]	cv_eval_time=0.18 sec	RMSE=0.067858	R^2=0.2205