In [1]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import linear_model

import os
import sys
# Put the directory two levels above the working directory on the import
# search path (only once). Presumably this is where the `models` package
# imported just below lives -- confirm against the repository layout.
module_path = os.path.abspath('../..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel

  from pandas.core import datetools


In [2]:
# Load the preprocessed dataset from its pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# when it comes from a trusted source.
preprocessed_data = pd.read_pickle('../data/preprocessed_twitter_Klout.pd')

# Expose the loaded object as the working dataframe used by the cells below
df = pd.DataFrame(data=preprocessed_data)

In [3]:
# Split the rows into a training frame and a test frame: 20% of the rows are
# held out for testing, and the fixed random_state makes the split repeatable.
train, test = train_test_split(df, random_state=0, test_size=0.2)

In [4]:
# Report how many rows ended up in each side of the split
n_train, n_test = len(train), len(test)
print('Number of observations in the training data:', n_train)
print('Number of observations in the test data:', n_test)

Number of observations in the training data: 18680
Number of observations in the test data: 4670


In [5]:
    # --- Run configuration -------------------------------------------------
    # DO_LOWESS is forwarded as `do_lowess` to the HyperoptModel run/plot calls.
    DO_LOWESS = False
    # True  -> run the hyperopt parameter search via ridge.run()
    # False -> fit the fixed pipeline defined below and report on it directly
    hyperopt = True

    # Run Ridge Regression with hyperopt optimization.
    # HyperoptModel is a project class; copies of the train/test frames are
    # handed over so the originals stay untouched. max_evals=500 matches the
    # [i/500] progress counter printed by the run; cv=10 is presumably the
    # number of cross-validation folds -- confirm in models/hyperopt_model.py.
    ridge = HyperoptModel(train.copy(), test.copy(), 'ridge', cv=10, max_evals=500)

    # Use every column of df except 'score' as a raw feature
    # ('score' is presumably the prediction target -- confirm).
    features = list(df)
    features.remove('score')
    ridge.raw_features = features

    # Single-step pipeline; this alpha is what gets fitted as-is in the
    # non-hyperopt branch below.
    ridge.pipeline = Pipeline([
        ('estimate', linear_model.Ridge(alpha=0.11311210193052756))
    ])

    # Search space for the optimizer. The key uses the '<step>__<param>' form
    # so it addresses the `alpha` parameter of the 'estimate' pipeline step.
    ridge.space = {
        'estimate__alpha': hp.uniform('estimate__alpha', 0, 1000),
    }

    if hyperopt:
        ridge.run(do_lowess=DO_LOWESS)
    else:
        # Train with the fixed pipeline parameters (no search).
        # Fix: this branch previously read X_train / y_train / pipeline from an
        # undefined name `cat`, which raised NameError whenever `hyperopt` was
        # False; it now uses the `ridge` model built above.
        ridge.pipeline.fit(X=ridge.X_train, y=ridge.y_train)
        ridge.model = ridge.pipeline
        ridge.stats()
        ridge.plot_feature_importance()
        ridge.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        ridge.qq_plot()

Performing parameters optimization...
[1/500]	cv_eval_time=2.76 sec	RMSE=0.150251	R^2=0.474110
[2/500]	cv_eval_time=0.80 sec	RMSE=0.149902	R^2=0.475907
[3/500]	cv_eval_time=0.99 sec	RMSE=0.146093	R^2=0.502609
[4/500]	cv_eval_time=1.02 sec	RMSE=0.150858	R^2=0.468716
[5/500]	cv_eval_time=0.80 sec	RMSE=0.145190	R^2=0.509163
[6/500]	cv_eval_time=0.77 sec	RMSE=0.149661	R^2=0.477578
[7/500]	cv_eval_time=1.22 sec	RMSE=0.148026	R^2=0.490385
[8/500]	cv_eval_time=1.47 sec	RMSE=0.150703	R^2=0.471272
[9/500]	cv_eval_time=0.72 sec	RMSE=0.150271	R^2=0.473598
[10/500]	cv_eval_time=0.77 sec	RMSE=0.145189	R^2=0.508446
[11/500]	cv_eval_time=0.74 sec	RMSE=0.149651	R^2=0.478244
[12/500]	cv_eval_time=0.75 sec	RMSE=0.151220	R^2=0.466590
[13/500]	cv_eval_time=0.73 sec	RMSE=0.145313	R^2=0.508063
[14/500]	cv_eval_time=0.90 sec	RMSE=0.145625	R^2=0.505431
[15/500]	cv_eval_time=0.88 sec	RMSE=0.148531	R^2=0.485928
[16/500]	cv_eval_time=0.92 sec	RMSE=0.141920	R^2=0.530330
[17/500]	cv_eval_time=0.92 sec	RMSE=0.15153

[142/500]	cv_eval_time=0.91 sec	RMSE=0.144687	R^2=0.512077
[143/500]	cv_eval_time=0.97 sec	RMSE=0.145608	R^2=0.506152
[144/500]	cv_eval_time=0.95 sec	RMSE=0.146051	R^2=0.502559
[145/500]	cv_eval_time=0.96 sec	RMSE=0.147221	R^2=0.494721
[146/500]	cv_eval_time=0.81 sec	RMSE=0.142676	R^2=0.525493
[147/500]	cv_eval_time=0.79 sec	RMSE=0.142006	R^2=0.530077
[148/500]	cv_eval_time=0.80 sec	RMSE=0.143620	R^2=0.519186
[149/500]	cv_eval_time=0.79 sec	RMSE=0.144415	R^2=0.513754
[150/500]	cv_eval_time=0.79 sec	RMSE=0.143103	R^2=0.522561
[151/500]	cv_eval_time=0.80 sec	RMSE=0.141903	R^2=0.530447
[152/500]	cv_eval_time=1.08 sec	RMSE=0.145413	R^2=0.507241
[153/500]	cv_eval_time=1.05 sec	RMSE=0.146787	R^2=0.497379
[154/500]	cv_eval_time=1.06 sec	RMSE=0.143832	R^2=0.518082
[155/500]	cv_eval_time=1.01 sec	RMSE=0.142028	R^2=0.529569
[156/500]	cv_eval_time=1.02 sec	RMSE=0.144922	R^2=0.510580
[157/500]	cv_eval_time=0.96 sec	RMSE=0.146462	R^2=0.499652
[158/500]	cv_eval_time=0.98 sec	RMSE=0.141940	R^2=0.5304

[281/500]	cv_eval_time=1.15 sec	RMSE=0.146260	R^2=0.501629
[282/500]	cv_eval_time=1.44 sec	RMSE=0.144652	R^2=0.512988
[283/500]	cv_eval_time=1.17 sec	RMSE=0.147374	R^2=0.493738
[284/500]	cv_eval_time=1.52 sec	RMSE=0.141937	R^2=0.530446
[285/500]	cv_eval_time=0.96 sec	RMSE=0.145304	R^2=0.507649
[286/500]	cv_eval_time=0.92 sec	RMSE=0.143898	R^2=0.516461
[287/500]	cv_eval_time=0.94 sec	RMSE=0.142461	R^2=0.527208
[288/500]	cv_eval_time=0.84 sec	RMSE=0.143545	R^2=0.519612
[289/500]	cv_eval_time=0.86 sec	RMSE=0.145878	R^2=0.503540
[290/500]	cv_eval_time=0.82 sec	RMSE=0.144707	R^2=0.511806
[291/500]	cv_eval_time=0.89 sec	RMSE=0.146853	R^2=0.497107
[292/500]	cv_eval_time=0.92 sec	RMSE=0.146476	R^2=0.499482
[293/500]	cv_eval_time=1.06 sec	RMSE=0.142836	R^2=0.524065
[294/500]	cv_eval_time=1.01 sec	RMSE=0.145201	R^2=0.508894
[295/500]	cv_eval_time=0.99 sec	RMSE=0.147200	R^2=0.495412
[296/500]	cv_eval_time=0.96 sec	RMSE=0.151611	R^2=0.464845
[297/500]	cv_eval_time=1.00 sec	RMSE=0.144226	R^2=0.5147

[420/500]	cv_eval_time=1.18 sec	RMSE=0.145811	R^2=0.504638
[421/500]	cv_eval_time=1.23 sec	RMSE=0.144917	R^2=0.510281
[422/500]	cv_eval_time=1.25 sec	RMSE=0.144219	R^2=0.514985
[423/500]	cv_eval_time=1.11 sec	RMSE=0.142910	R^2=0.524390
[424/500]	cv_eval_time=1.00 sec	RMSE=0.141939	R^2=0.530064
[425/500]	cv_eval_time=1.05 sec	RMSE=0.143668	R^2=0.518473
[426/500]	cv_eval_time=1.32 sec	RMSE=0.146286	R^2=0.500680
[427/500]	cv_eval_time=1.50 sec	RMSE=0.145446	R^2=0.506741
[428/500]	cv_eval_time=1.03 sec	RMSE=0.141959	R^2=0.530716
[429/500]	cv_eval_time=1.05 sec	RMSE=0.145766	R^2=0.505368
[430/500]	cv_eval_time=1.03 sec	RMSE=0.145020	R^2=0.509535
[431/500]	cv_eval_time=1.04 sec	RMSE=0.144020	R^2=0.516517
[432/500]	cv_eval_time=1.07 sec	RMSE=0.146502	R^2=0.499526
[433/500]	cv_eval_time=1.27 sec	RMSE=0.142559	R^2=0.526137
[434/500]	cv_eval_time=1.52 sec	RMSE=0.141949	R^2=0.530428
[435/500]	cv_eval_time=1.35 sec	RMSE=0.144503	R^2=0.513112
[436/500]	cv_eval_time=1.12 sec	RMSE=0.146091	R^2=0.5030