In [1]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import linear_model

import os
import sys
# Make the directory two levels above the working directory importable, so the
# project-local `models.hyperopt_model` import can be resolved from there.
module_path = os.path.abspath('../..')
if sys.path.count(module_path) == 0:
    # Append only once so re-running the cell does not grow sys.path.
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel

  from pandas.core import datetools


In [2]:
# Load the preprocessed qualitative Twitter data from its pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
preprocessed_data = pd.read_pickle('../data/preprocessed_twitter_qualitative.pd')

# Build the working DataFrame from the loaded object.
df = pd.DataFrame(data=preprocessed_data)

In [3]:
# Split the rows into a training frame and a test frame: 20% of the rows are
# held out for testing, and the fixed random_state makes the split repeatable.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=0,
)

In [4]:
# Report how many rows ended up in each side of the split.
print('Number of observations in the training data:', train.shape[0])
print('Number of observations in the test data:', test.shape[0])

Number of observations in the training data: 18680
Number of observations in the test data: 4670


In [5]:
    # --- Run configuration ---
    # DO_LOWESS is forwarded as `do_lowess` to the HyperoptModel run/plot calls below.
    DO_LOWESS = False
    # True: run the hyper-parameter search; False: fit the fixed pipeline as configured.
    # NOTE(review): this flag has the same name as the `hyperopt` package; only
    # `hp` and `scope` are imported from it in this notebook, so nothing is shadowed here.
    hyperopt = True

    # Ridge regression wrapped in the project's HyperoptModel, working on copies of
    # the train/test frames so the originals are not handed over directly.
    # cv=4 and max_evals=500 are passed through to HyperoptModel.
    ridge = HyperoptModel(train.copy(), test.copy(), 'ridge', cv=4, max_evals=500)

    # Use every column of df except the target column 'score' as a raw feature.
    features = list(df)
    features.remove('score')
    ridge.raw_features = features

    # Single-step pipeline; alpha=0.0162 is the value used when no search is run.
    ridge.pipeline = Pipeline([
        ('estimate', linear_model.Ridge(alpha=0.0162))
    ])

    # Search space: the key follows sklearn's `<step>__<param>` convention so it
    # addresses the `alpha` parameter of the 'estimate' step.
    ridge.space = {
        'estimate__alpha': hp.uniform('estimate__alpha', 0, 1000),
    }

    if hyperopt:
        ridge.run(do_lowess=DO_LOWESS)
    else:
        # Train with the fixed pipeline parameters (no search).
        # Fixed: this branch previously referenced an undefined name `cat`
        # (cat.X_train / cat.y_train / cat.pipeline), which raised NameError.
        # NOTE(review): assumes HyperoptModel exposes X_train / y_train — confirm.
        ridge.pipeline.fit(X=ridge.X_train, y=ridge.y_train)
        ridge.model = ridge.pipeline
        ridge.stats()
        ridge.plot_feature_importance()
        ridge.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        ridge.qq_plot()

Performing parameters optimization...
[1/500]	cv_eval_time=0.17 sec	RMSE=0.019059	R^2=0.079727
[2/500]	cv_eval_time=0.15 sec	RMSE=0.019022	R^2=0.084567
[3/500]	cv_eval_time=0.14 sec	RMSE=0.019050	R^2=0.078653
[4/500]	cv_eval_time=0.15 sec	RMSE=0.019049	R^2=0.080047
[5/500]	cv_eval_time=0.14 sec	RMSE=0.019076	R^2=0.076827
[6/500]	cv_eval_time=0.15 sec	RMSE=0.019058	R^2=0.080958
[7/500]	cv_eval_time=0.15 sec	RMSE=0.019043	R^2=0.080491
[8/500]	cv_eval_time=0.15 sec	RMSE=0.019065	R^2=0.077938
[9/500]	cv_eval_time=0.15 sec	RMSE=0.019034	R^2=0.079134
[10/500]	cv_eval_time=0.14 sec	RMSE=0.019063	R^2=0.077091
[11/500]	cv_eval_time=0.14 sec	RMSE=0.019037	R^2=0.080357
[12/500]	cv_eval_time=0.14 sec	RMSE=0.019068	R^2=0.076402
[13/500]	cv_eval_time=0.15 sec	RMSE=0.019062	R^2=0.079968
[14/500]	cv_eval_time=0.15 sec	RMSE=0.019027	R^2=0.082321
[15/500]	cv_eval_time=0.15 sec	RMSE=0.019053	R^2=0.080256
[16/500]	cv_eval_time=0.15 sec	RMSE=0.019074	R^2=0.077541
[17/500]	cv_eval_time=0.15 sec	RMSE=0.01906

[142/500]	cv_eval_time=0.14 sec	RMSE=0.019059	R^2=0.080672
[143/500]	cv_eval_time=0.13 sec	RMSE=0.019026	R^2=0.081178
[144/500]	cv_eval_time=0.16 sec	RMSE=0.019033	R^2=0.080358
[145/500]	cv_eval_time=0.14 sec	RMSE=0.019024	R^2=0.081150
[146/500]	cv_eval_time=0.15 sec	RMSE=0.019009	R^2=0.082546
[147/500]	cv_eval_time=0.15 sec	RMSE=0.019020	R^2=0.084199
[148/500]	cv_eval_time=0.14 sec	RMSE=0.019021	R^2=0.083592
[149/500]	cv_eval_time=0.13 sec	RMSE=0.018983	R^2=0.083835
[150/500]	cv_eval_time=0.15 sec	RMSE=0.018993	R^2=0.085925
[151/500]	cv_eval_time=0.14 sec	RMSE=0.019018	R^2=0.082805
[152/500]	cv_eval_time=0.15 sec	RMSE=0.019014	R^2=0.083280
[153/500]	cv_eval_time=0.14 sec	RMSE=0.018997	R^2=0.084954
[154/500]	cv_eval_time=0.15 sec	RMSE=0.019006	R^2=0.082900
[155/500]	cv_eval_time=0.13 sec	RMSE=0.019023	R^2=0.082139
[156/500]	cv_eval_time=0.16 sec	RMSE=0.019011	R^2=0.080333
[157/500]	cv_eval_time=0.14 sec	RMSE=0.019033	R^2=0.079690
[158/500]	cv_eval_time=0.14 sec	RMSE=0.018987	R^2=0.0862

[281/500]	cv_eval_time=0.28 sec	RMSE=0.019024	R^2=0.081770
[282/500]	cv_eval_time=0.20 sec	RMSE=0.019020	R^2=0.082060
[283/500]	cv_eval_time=0.14 sec	RMSE=0.019026	R^2=0.081617
[284/500]	cv_eval_time=0.15 sec	RMSE=0.018991	R^2=0.083938
[285/500]	cv_eval_time=0.15 sec	RMSE=0.019031	R^2=0.079705
[286/500]	cv_eval_time=0.15 sec	RMSE=0.019014	R^2=0.083914
[287/500]	cv_eval_time=0.14 sec	RMSE=0.019023	R^2=0.084178
[288/500]	cv_eval_time=0.14 sec	RMSE=0.019011	R^2=0.081723
[289/500]	cv_eval_time=0.13 sec	RMSE=0.019029	R^2=0.077876
[290/500]	cv_eval_time=0.15 sec	RMSE=0.019058	R^2=0.077646
[291/500]	cv_eval_time=0.13 sec	RMSE=0.018993	R^2=0.085727
[292/500]	cv_eval_time=0.16 sec	RMSE=0.019036	R^2=0.080391
[293/500]	cv_eval_time=0.14 sec	RMSE=0.019048	R^2=0.080101
[294/500]	cv_eval_time=0.15 sec	RMSE=0.019001	R^2=0.083168
[295/500]	cv_eval_time=0.13 sec	RMSE=0.019026	R^2=0.083542
[296/500]	cv_eval_time=0.15 sec	RMSE=0.019015	R^2=0.082848
[297/500]	cv_eval_time=0.14 sec	RMSE=0.019007	R^2=0.0821

[420/500]	cv_eval_time=0.15 sec	RMSE=0.018989	R^2=0.086363
[421/500]	cv_eval_time=0.14 sec	RMSE=0.019024	R^2=0.082003
[422/500]	cv_eval_time=0.14 sec	RMSE=0.019016	R^2=0.081742
[423/500]	cv_eval_time=0.14 sec	RMSE=0.019010	R^2=0.084176
[424/500]	cv_eval_time=0.14 sec	RMSE=0.019028	R^2=0.080976
[425/500]	cv_eval_time=0.14 sec	RMSE=0.019010	R^2=0.085194
[426/500]	cv_eval_time=0.14 sec	RMSE=0.018987	R^2=0.086104
[427/500]	cv_eval_time=0.14 sec	RMSE=0.019017	R^2=0.082331
[428/500]	cv_eval_time=0.15 sec	RMSE=0.019006	R^2=0.082568
[429/500]	cv_eval_time=0.15 sec	RMSE=0.019012	R^2=0.078909
[430/500]	cv_eval_time=0.16 sec	RMSE=0.019019	R^2=0.080845
[431/500]	cv_eval_time=0.14 sec	RMSE=0.018988	R^2=0.086713
[432/500]	cv_eval_time=0.14 sec	RMSE=0.019029	R^2=0.081227
[433/500]	cv_eval_time=0.15 sec	RMSE=0.019013	R^2=0.083120
[434/500]	cv_eval_time=0.14 sec	RMSE=0.019007	R^2=0.082394
[435/500]	cv_eval_time=0.14 sec	RMSE=0.019016	R^2=0.081149
[436/500]	cv_eval_time=0.14 sec	RMSE=0.018989	R^2=0.0836