In [1]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import linear_model

import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel

  from pandas.core import datetools


In [2]:
# Load the pickled, preprocessed Twitter dataset from disk.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
preprocessed_data = pd.read_pickle('../data/preprocessed_twitter_Monika.pd')

# Wrap the loaded object in a DataFrame for the cells below
df = pd.DataFrame(data=preprocessed_data)

In [3]:
# Split the rows into a training frame and a held-out test frame (test_size=0.2);
# random_state is fixed so the same split is produced on every run.
train, test = train_test_split(df, random_state=0, test_size=0.2)

In [4]:
# Report how many rows ended up in each side of the split
print('Number of observations in the training data:', train.shape[0])
print('Number of observations in the test data:', test.shape[0])

Number of observations in the training data: 18680
Number of observations in the test data: 4670


In [5]:
    # Ridge regression on the train/test split, optionally tuned with hyperopt.
    # NOTE(review): HyperoptModel is a project class (models.hyperopt_model); what its
    # arguments, attributes and methods do is inferred from their names - confirm there.

    # Settings
    DO_LOWESS = False  # forwarded as do_lowess= to run() / plot_predicted_vs_actual()
    hyperopt = True    # True: run the parameter search; False: fit the fixed pipeline below

    ridge = HyperoptModel(train.copy(), test.copy(), 'ridge', cv=10, max_evals=500)

    # Use every column of df except 'score' as a raw feature
    # (presumably 'score' is the regression target - TODO confirm).
    features = list(df)
    features.remove('score')
    ridge.raw_features = features

    # Single-step pipeline; this alpha is the value used when the search is skipped.
    ridge.pipeline = Pipeline([
        ('estimate', linear_model.Ridge(alpha=0.004956167083152496))
    ])

    # Search space for the optimisation. The key follows scikit-learn's
    # '<step>__<param>' naming, i.e. the alpha of the 'estimate' step.
    ridge.space = {
        'estimate__alpha': hp.uniform('estimate__alpha', 0, 1000),
    }

    if hyperopt:
        ridge.run(do_lowess=DO_LOWESS)
    else:
        # Train with the fixed params set on the pipeline above.
        # This branch previously referenced an undefined name `cat` (left over from
        # another model's notebook) and raised NameError; it must use `ridge`.
        ridge.pipeline.fit(X=ridge.X_train, y=ridge.y_train)
        ridge.model = ridge.pipeline
        ridge.stats()
        ridge.plot_feature_importance()
        ridge.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        ridge.qq_plot()

Performing parameters optimization...
[1/500]	cv_eval_time=1.71 sec	RMSE=0.068462	R^2=0.207561
[2/500]	cv_eval_time=0.97 sec	RMSE=0.067959	R^2=0.219729
[3/500]	cv_eval_time=0.92 sec	RMSE=0.068552	R^2=0.205374
[4/500]	cv_eval_time=0.93 sec	RMSE=0.068587	R^2=0.202992
[5/500]	cv_eval_time=0.98 sec	RMSE=0.068598	R^2=0.201049
[6/500]	cv_eval_time=0.95 sec	RMSE=0.068537	R^2=0.204460
[7/500]	cv_eval_time=0.96 sec	RMSE=0.067882	R^2=0.219806
[8/500]	cv_eval_time=0.80 sec	RMSE=0.068329	R^2=0.212124
[9/500]	cv_eval_time=0.80 sec	RMSE=0.068383	R^2=0.207671
[10/500]	cv_eval_time=0.79 sec	RMSE=0.067930	R^2=0.218908
[11/500]	cv_eval_time=1.02 sec	RMSE=0.068378	R^2=0.210556
[12/500]	cv_eval_time=0.95 sec	RMSE=0.068635	R^2=0.202624
[13/500]	cv_eval_time=1.23 sec	RMSE=0.067995	R^2=0.217312
[14/500]	cv_eval_time=0.81 sec	RMSE=0.068374	R^2=0.208178
[15/500]	cv_eval_time=0.81 sec	RMSE=0.068472	R^2=0.207815
[16/500]	cv_eval_time=0.82 sec	RMSE=0.067831	R^2=0.223934
[17/500]	cv_eval_time=0.81 sec	RMSE=0.06806

[142/500]	cv_eval_time=0.85 sec	RMSE=0.067794	R^2=0.222816
[143/500]	cv_eval_time=0.85 sec	RMSE=0.067874	R^2=0.219386
[144/500]	cv_eval_time=0.89 sec	RMSE=0.067824	R^2=0.220638
[145/500]	cv_eval_time=1.20 sec	RMSE=0.068266	R^2=0.210534
[146/500]	cv_eval_time=0.92 sec	RMSE=0.067722	R^2=0.224859
[147/500]	cv_eval_time=1.07 sec	RMSE=0.067294	R^2=0.235028
[148/500]	cv_eval_time=0.76 sec	RMSE=0.067562	R^2=0.228677
[149/500]	cv_eval_time=0.82 sec	RMSE=0.067795	R^2=0.223475
[150/500]	cv_eval_time=0.78 sec	RMSE=0.067346	R^2=0.232872
[151/500]	cv_eval_time=0.79 sec	RMSE=0.067909	R^2=0.221407
[152/500]	cv_eval_time=0.80 sec	RMSE=0.067771	R^2=0.223013
[153/500]	cv_eval_time=0.79 sec	RMSE=0.067674	R^2=0.225933
[154/500]	cv_eval_time=0.79 sec	RMSE=0.067990	R^2=0.218673
[155/500]	cv_eval_time=0.81 sec	RMSE=0.067865	R^2=0.219544
[156/500]	cv_eval_time=0.80 sec	RMSE=0.067383	R^2=0.231464
[157/500]	cv_eval_time=0.82 sec	RMSE=0.067860	R^2=0.222644
[158/500]	cv_eval_time=0.85 sec	RMSE=0.067763	R^2=0.2232

[281/500]	cv_eval_time=0.75 sec	RMSE=0.067910	R^2=0.222304
[282/500]	cv_eval_time=0.75 sec	RMSE=0.067637	R^2=0.226011
[283/500]	cv_eval_time=0.85 sec	RMSE=0.067973	R^2=0.220903
[284/500]	cv_eval_time=0.79 sec	RMSE=0.067265	R^2=0.235695
[285/500]	cv_eval_time=0.88 sec	RMSE=0.067756	R^2=0.224973
[286/500]	cv_eval_time=0.80 sec	RMSE=0.067417	R^2=0.230466
[287/500]	cv_eval_time=0.82 sec	RMSE=0.067932	R^2=0.218533
[288/500]	cv_eval_time=0.81 sec	RMSE=0.067774	R^2=0.223145
[289/500]	cv_eval_time=0.80 sec	RMSE=0.067858	R^2=0.220815
[290/500]	cv_eval_time=0.81 sec	RMSE=0.067356	R^2=0.231898
[291/500]	cv_eval_time=0.82 sec	RMSE=0.067706	R^2=0.225297
[292/500]	cv_eval_time=0.78 sec	RMSE=0.067859	R^2=0.220998
[293/500]	cv_eval_time=0.76 sec	RMSE=0.067940	R^2=0.219815
[294/500]	cv_eval_time=0.76 sec	RMSE=0.067727	R^2=0.223213
[295/500]	cv_eval_time=0.79 sec	RMSE=0.068226	R^2=0.214450
[296/500]	cv_eval_time=0.77 sec	RMSE=0.067997	R^2=0.218579
[297/500]	cv_eval_time=0.75 sec	RMSE=0.067777	R^2=0.2251

[420/500]	cv_eval_time=0.72 sec	RMSE=0.067822	R^2=0.220858
[421/500]	cv_eval_time=0.71 sec	RMSE=0.067362	R^2=0.232819
[422/500]	cv_eval_time=0.71 sec	RMSE=0.067757	R^2=0.222453
[423/500]	cv_eval_time=0.71 sec	RMSE=0.067317	R^2=0.232922
[424/500]	cv_eval_time=0.72 sec	RMSE=0.067873	R^2=0.224888
[425/500]	cv_eval_time=0.72 sec	RMSE=0.067809	R^2=0.223782
[426/500]	cv_eval_time=0.72 sec	RMSE=0.067695	R^2=0.225769
[427/500]	cv_eval_time=0.71 sec	RMSE=0.067973	R^2=0.217450
[428/500]	cv_eval_time=0.72 sec	RMSE=0.067915	R^2=0.223433
[429/500]	cv_eval_time=0.72 sec	RMSE=0.067177	R^2=0.234784
[430/500]	cv_eval_time=0.71 sec	RMSE=0.067829	R^2=0.221283
[431/500]	cv_eval_time=0.72 sec	RMSE=0.067835	R^2=0.221481
[432/500]	cv_eval_time=0.76 sec	RMSE=0.067912	R^2=0.220690
[433/500]	cv_eval_time=0.76 sec	RMSE=0.067648	R^2=0.226277
[434/500]	cv_eval_time=0.75 sec	RMSE=0.067743	R^2=0.222317
[435/500]	cv_eval_time=0.85 sec	RMSE=0.067776	R^2=0.224890
[436/500]	cv_eval_time=0.71 sec	RMSE=0.067957	R^2=0.2178