In [1]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import linear_model

import os
import sys
# Put the directory two levels above the working directory on sys.path
# (once), so the project-local `models` import below can resolve —
# presumably that is where the package lives.
module_path = os.path.abspath('../..')
if module_path not in sys.path:
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel

  from pandas.core import datetools


In [2]:
# Load the preprocessed Twitter/Klout data set from its pickle file.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
preprocessed_data = pd.read_pickle('../data/preprocessed_twitter_Klout.pd')

# Wrap the loaded object in a DataFrame; the following cells work with `df`.
# NOTE(review): presumably the pickle already holds a DataFrame, which would
# make this wrapping redundant — confirm.
df = pd.DataFrame(preprocessed_data)

In [3]:
# Split the rows of df into two new dataframes: 80% for training and 20% for
# testing (test_size=0.2). random_state=0 pins the shuffle so the same split
# is produced on every run.
train, test=train_test_split(df, test_size=0.2, random_state=0)

In [4]:
# Report how many rows ended up in the training and test splits
print('Number of observations in the training data:', train.shape[0])
print('Number of observations in the test data:', test.shape[0])

Number of observations in the training data: 18680
Number of observations in the test data: 4670


In [5]:
    # Run-time switches for this cell.
    DO_LOWESS = False  # forwarded as do_lowess= to the run/plot calls below
    hyperopt = True    # True: search alpha via ridge.run(); False: fit the fixed pipeline once

    # Ridge regression wrapped in the project's HyperoptModel (cv=4, up to 500 evaluations).
    # Copies of the splits are passed so the wrapper cannot modify train/test in place.
    ridge = HyperoptModel(train.copy(), test.copy(), 'ridge', cv=4, max_evals=500)

    # Every column of df except the target 'score' is used as a raw feature.
    features = list(df)
    features.remove('score')
    ridge.raw_features = features

    # Single-step pipeline. The alpha below is an explicit preset, not the
    # scikit-learn default (presumably a value found by an earlier search — TODO confirm).
    ridge.pipeline = Pipeline([
        ('estimate', linear_model.Ridge(alpha=0.6283063893828044))
    ])

    # Search space: alpha of the 'estimate' step drawn uniformly from [0, 1000].
    ridge.space = {
        'estimate__alpha': hp.uniform('estimate__alpha', 0, 1000),
    }

    if hyperopt:
        ridge.run(do_lowess=DO_LOWESS)
    else:
        # Fit the pipeline once with the preset alpha above, then report.
        # The original referenced an undefined `cat` object here (copy-paste
        # leftover), which raised NameError; the model in this cell is `ridge`.
        ridge.pipeline.fit(X=ridge.X_train, y=ridge.y_train)
        ridge.model = ridge.pipeline
        ridge.stats()
        ridge.plot_feature_importance()
        ridge.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        ridge.qq_plot()

Performing parameters optimization...
[1/500]	cv_eval_time=0.84 sec	RMSE=0.149596	R^2=0.477526
[2/500]	cv_eval_time=0.21 sec	RMSE=0.150717	R^2=0.471200
[3/500]	cv_eval_time=0.24 sec	RMSE=0.152566	R^2=0.458137
[4/500]	cv_eval_time=0.20 sec	RMSE=0.150476	R^2=0.472760
[5/500]	cv_eval_time=0.22 sec	RMSE=0.152664	R^2=0.457023
[6/500]	cv_eval_time=0.21 sec	RMSE=0.152841	R^2=0.456008
[7/500]	cv_eval_time=0.22 sec	RMSE=0.152549	R^2=0.458198
[8/500]	cv_eval_time=0.24 sec	RMSE=0.148132	R^2=0.488259
[9/500]	cv_eval_time=0.20 sec	RMSE=0.151035	R^2=0.469144
[10/500]	cv_eval_time=0.22 sec	RMSE=0.146841	R^2=0.497474
[11/500]	cv_eval_time=0.20 sec	RMSE=0.146574	R^2=0.499239
[12/500]	cv_eval_time=0.41 sec	RMSE=0.152066	R^2=0.461875
[13/500]	cv_eval_time=0.19 sec	RMSE=0.151208	R^2=0.467987
[14/500]	cv_eval_time=0.23 sec	RMSE=0.150731	R^2=0.470797
[15/500]	cv_eval_time=0.21 sec	RMSE=0.150673	R^2=0.469806
[16/500]	cv_eval_time=0.25 sec	RMSE=0.149708	R^2=0.478214
[17/500]	cv_eval_time=0.20 sec	RMSE=0.15125

[141/500]	cv_eval_time=0.20 sec	RMSE=0.151949	R^2=0.462989
[142/500]	cv_eval_time=0.25 sec	RMSE=0.144250	R^2=0.515176
[143/500]	cv_eval_time=0.22 sec	RMSE=0.148781	R^2=0.483371
[144/500]	cv_eval_time=0.24 sec	RMSE=0.148463	R^2=0.487294
[145/500]	cv_eval_time=0.21 sec	RMSE=0.145038	R^2=0.509656
[146/500]	cv_eval_time=0.22 sec	RMSE=0.143315	R^2=0.521079
[147/500]	cv_eval_time=0.33 sec	RMSE=0.145780	R^2=0.504637
[148/500]	cv_eval_time=0.35 sec	RMSE=0.146952	R^2=0.496997
[149/500]	cv_eval_time=0.30 sec	RMSE=0.146689	R^2=0.498383
[150/500]	cv_eval_time=0.32 sec	RMSE=0.147293	R^2=0.495241
[151/500]	cv_eval_time=0.28 sec	RMSE=0.142598	R^2=0.526699
[152/500]	cv_eval_time=0.40 sec	RMSE=0.145442	R^2=0.506693
[153/500]	cv_eval_time=0.52 sec	RMSE=0.142437	R^2=0.527176
[154/500]	cv_eval_time=0.31 sec	RMSE=0.147836	R^2=0.489996
[155/500]	cv_eval_time=0.26 sec	RMSE=0.144873	R^2=0.511957
[156/500]	cv_eval_time=0.35 sec	RMSE=0.143243	R^2=0.521134
[157/500]	cv_eval_time=0.63 sec	RMSE=0.144431	R^2=0.5149

[280/500]	cv_eval_time=0.25 sec	RMSE=0.144218	R^2=0.515764
[281/500]	cv_eval_time=0.29 sec	RMSE=0.142414	R^2=0.527386
[282/500]	cv_eval_time=0.31 sec	RMSE=0.146611	R^2=0.498803
[283/500]	cv_eval_time=0.29 sec	RMSE=0.142349	R^2=0.527630
[284/500]	cv_eval_time=0.35 sec	RMSE=0.144941	R^2=0.509673
[285/500]	cv_eval_time=0.40 sec	RMSE=0.145914	R^2=0.503353
[286/500]	cv_eval_time=0.28 sec	RMSE=0.142479	R^2=0.526515
[287/500]	cv_eval_time=0.26 sec	RMSE=0.151945	R^2=0.461749
[288/500]	cv_eval_time=0.29 sec	RMSE=0.147763	R^2=0.492231
[289/500]	cv_eval_time=0.26 sec	RMSE=0.147135	R^2=0.495556
[290/500]	cv_eval_time=0.39 sec	RMSE=0.143258	R^2=0.521227
[291/500]	cv_eval_time=0.35 sec	RMSE=0.145608	R^2=0.506272
[292/500]	cv_eval_time=0.26 sec	RMSE=0.148116	R^2=0.488147
[293/500]	cv_eval_time=0.53 sec	RMSE=0.146783	R^2=0.497632
[294/500]	cv_eval_time=0.26 sec	RMSE=0.144342	R^2=0.514722
[295/500]	cv_eval_time=0.27 sec	RMSE=0.151228	R^2=0.466756
[296/500]	cv_eval_time=0.24 sec	RMSE=0.150118	R^2=0.4746

[419/500]	cv_eval_time=0.21 sec	RMSE=0.145444	R^2=0.507929
[420/500]	cv_eval_time=0.23 sec	RMSE=0.143023	R^2=0.522744
[421/500]	cv_eval_time=0.27 sec	RMSE=0.142395	R^2=0.527522
[422/500]	cv_eval_time=0.42 sec	RMSE=0.145947	R^2=0.505100
[423/500]	cv_eval_time=0.44 sec	RMSE=0.151409	R^2=0.465806
[424/500]	cv_eval_time=0.32 sec	RMSE=0.144729	R^2=0.512094
[425/500]	cv_eval_time=0.21 sec	RMSE=0.143568	R^2=0.520039
[426/500]	cv_eval_time=0.19 sec	RMSE=0.146575	R^2=0.499323
[427/500]	cv_eval_time=0.21 sec	RMSE=0.144495	R^2=0.512203
[428/500]	cv_eval_time=0.21 sec	RMSE=0.147276	R^2=0.493657
[429/500]	cv_eval_time=0.23 sec	RMSE=0.145419	R^2=0.506070
[430/500]	cv_eval_time=0.20 sec	RMSE=0.142420	R^2=0.526778
[431/500]	cv_eval_time=0.23 sec	RMSE=0.145977	R^2=0.503396
[432/500]	cv_eval_time=0.24 sec	RMSE=0.143573	R^2=0.518861
[433/500]	cv_eval_time=0.27 sec	RMSE=0.142381	R^2=0.527399
[434/500]	cv_eval_time=0.30 sec	RMSE=0.144038	R^2=0.516434
[435/500]	cv_eval_time=0.66 sec	RMSE=0.142424	R^2=0.5274