In [1]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import linear_model

import os
import sys
# Resolve the directory two levels above the current working directory and
# register it on sys.path (once) so the project-local import below can be found.
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel

  from pandas.core import datetools


In [2]:
# Load the preprocessed dataset from its pickle file.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
preprocessed_data = pd.read_pickle('../data/preprocessed_twitter_Klout.pd')

# Build the working DataFrame and list the columns it exposes
df = pd.DataFrame(data=preprocessed_data)
print(df.columns)

Index(['FOLLOWER_INDEGREE', 'FOLLOWER_OUTDEGREE', 'FOLLOWER_CLOSENESS',
       'FOLLOWER_CLOSENESS_REV', 'FOLLOWER_BETWEENNESS',
       'FOLLOWER_EIGENVECTOR', 'FOLLOWER_PAGERANK',
       'RETWEET_INDEGREE_UNWEIGHTED', 'RETWEET_INDEGREE_WEIGHTED',
       'RETWEET_OUTDEGREE_UNWEIGHTED', 'RETWEET_OUTDEGREE_WEIGHTED',
       'RETWEET_CLOSENESS_UNWEIGHTED', 'RETWEET_CLOSENESS_WEIGHTED',
       'RETWEET_CLOSENESS_REV_UNWEIGHTED', 'RETWEET_CLOSENESS_REV_WEIGHTED',
       'RETWEET_BETWEENNESS_UNWEIGHTED', 'RETWEET_BETWEENNESS_WEIGHTED',
       'RETWEET_EIGENVECTOR_UNWEIGHTED', 'RETWEET_EIGENVECTOR_WEIGHTED',
       'RETWEET_PAGERANK_UNWEIGHTED', 'RETWEET_PAGERANK_WEIGHTED',
       'REPLY_INDEGREE_UNWEIGHTED', 'REPLY_INDEGREE_WEIGHTED',
       'REPLY_OUTDEGREE_UNWEIGHTED', 'REPLY_OUTDEGREE_WEIGHTED',
       'REPLY_CLOSENESS_UNWEIGHTED', 'REPLY_CLOSENESS_WEIGHTED',
       'REPLY_CLOSENESS_REV_UNWEIGHTED', 'REPLY_CLOSENESS_REV_WEIGHTED',
       'REPLY_BETWEENNESS_UNWEIGHTED', 'REPLY_BETWEENNESS_

In [3]:
# Split the rows into a training frame and a held-out test frame:
# 20% of the rows go to the test set, and the fixed random_state keeps
# the split identical across runs.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=0,
)

In [4]:
# Report how many rows ended up in each side of the split
print('Number of observations in the training data:', train.shape[0])
print('Number of observations in the test data:', test.shape[0])

Number of observations in the training data: 18680
Number of observations in the test data: 4670


In [5]:
    # Fit a Lasso regression through the project's HyperoptModel wrapper:
    # either search for alpha with hyperopt, or fit the fixed pipeline below
    # and produce the diagnostic stats and plots.

    # Cell-level switches
    DO_LOWESS = False  # forwarded as do_lowess= to run() / plot_predicted_vs_actual()
    hyperopt = True    # True: run the parameter search; False: fit the fixed pipeline

    # The wrapper receives copies of the split frames
    lasso = HyperoptModel(train.copy(), test.copy(), 'lasso', cv=4, max_evals=500)

    # Use every column except 'score' as a raw feature
    # (presumably 'score' is the regression target -- confirm against HyperoptModel)
    features = list(df)
    features.remove('score')

    # NOTE(review): this alpha looks like the best value of an earlier search -- confirm
    lasso.pipeline = Pipeline([
        ('estimate', linear_model.Lasso(alpha=0.00021497565278938424))
    ])

    lasso.raw_features = features

    # Search space for the pipeline parameter 'estimate__alpha'.
    # NOTE(review): uniform(0, 10) almost never samples values as small as the
    # hard-coded alpha above (~2e-4) and can get arbitrarily close to alpha=0;
    # a log-scaled prior such as hp.loguniform may suit this parameter better.
    lasso.space = {
        'estimate__alpha': hp.uniform('estimate__alpha', 0, 10),
    }

    if hyperopt:
        lasso.run(do_lowess=DO_LOWESS)
    else:
        # Fit the fixed pipeline with the parameters given above
        lasso.pipeline.fit(X=lasso.X_train, y=lasso.y_train)
        # Bug fix: this used to read `cat.pipeline`, but no `cat` object exists in
        # this notebook, so the branch raised NameError before any reporting ran.
        lasso.model = lasso.pipeline
        lasso.stats()
        lasso.plot_feature_importance()
        lasso.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        lasso.qq_plot()

Performing parameters optimization...
[1/500]	cv_eval_time=1.28 sec	RMSE=0.173041	R^2=0.301547
[2/500]	cv_eval_time=0.26 sec	RMSE=0.171359	R^2=0.315375
[3/500]	cv_eval_time=0.23 sec	RMSE=0.175544	R^2=0.283478
[4/500]	cv_eval_time=0.24 sec	RMSE=0.175338	R^2=0.283742
[5/500]	cv_eval_time=0.25 sec	RMSE=0.175496	R^2=0.282376
[6/500]	cv_eval_time=0.31 sec	RMSE=0.168327	R^2=0.342907
[7/500]	cv_eval_time=0.26 sec	RMSE=0.172264	R^2=0.308867
[8/500]	cv_eval_time=0.27 sec	RMSE=0.171759	R^2=0.312536
[9/500]	cv_eval_time=0.24 sec	RMSE=0.173233	R^2=0.301051
[10/500]	cv_eval_time=0.28 sec	RMSE=0.170914	R^2=0.318579
[11/500]	cv_eval_time=0.27 sec	RMSE=0.171532	R^2=0.315375
[12/500]	cv_eval_time=0.30 sec	RMSE=0.170843	R^2=0.319293
[13/500]	cv_eval_time=0.25 sec	RMSE=0.176557	R^2=0.274123
[14/500]	cv_eval_time=0.26 sec	RMSE=0.173727	R^2=0.297160
[15/500]	cv_eval_time=0.30 sec	RMSE=0.169281	R^2=0.330224
[16/500]	cv_eval_time=0.27 sec	RMSE=0.171504	R^2=0.314551
[17/500]	cv_eval_time=0.25 sec	RMSE=0.17167

[142/500]	cv_eval_time=0.33 sec	RMSE=0.165260	R^2=0.364346
[143/500]	cv_eval_time=0.25 sec	RMSE=0.172849	R^2=0.303542
[144/500]	cv_eval_time=0.31 sec	RMSE=0.170302	R^2=0.323680
[145/500]	cv_eval_time=0.36 sec	RMSE=0.163682	R^2=0.376463
[146/500]	cv_eval_time=0.34 sec	RMSE=0.170209	R^2=0.325805
[147/500]	cv_eval_time=0.43 sec	RMSE=0.159927	R^2=0.404891
[148/500]	cv_eval_time=0.32 sec	RMSE=0.168530	R^2=0.338490
[149/500]	cv_eval_time=0.32 sec	RMSE=0.165799	R^2=0.359843
[150/500]	cv_eval_time=0.50 sec	RMSE=0.159620	R^2=0.405547
[151/500]	cv_eval_time=0.92 sec	RMSE=0.144177	R^2=0.515238
[152/500]	cv_eval_time=0.34 sec	RMSE=0.169635	R^2=0.329337
[153/500]	cv_eval_time=0.32 sec	RMSE=0.170105	R^2=0.327006
[154/500]	cv_eval_time=0.35 sec	RMSE=0.164399	R^2=0.373164
[155/500]	cv_eval_time=0.33 sec	RMSE=0.170157	R^2=0.325173
[156/500]	cv_eval_time=0.31 sec	RMSE=0.169916	R^2=0.327238
[157/500]	cv_eval_time=0.32 sec	RMSE=0.170577	R^2=0.322156
[158/500]	cv_eval_time=0.33 sec	RMSE=0.166437	R^2=0.3529

[281/500]	cv_eval_time=1.57 sec	RMSE=0.145026	R^2=0.509581
[282/500]	cv_eval_time=2.22 sec	RMSE=0.152551	R^2=0.456676
[283/500]	cv_eval_time=0.35 sec	RMSE=0.167785	R^2=0.343484
[284/500]	cv_eval_time=0.37 sec	RMSE=0.170576	R^2=0.323627
[285/500]	cv_eval_time=0.37 sec	RMSE=0.164992	R^2=0.366708
[286/500]	cv_eval_time=0.39 sec	RMSE=0.170737	R^2=0.322851
[287/500]	cv_eval_time=0.38 sec	RMSE=0.169998	R^2=0.324418
[288/500]	cv_eval_time=0.40 sec	RMSE=0.170084	R^2=0.329027
[289/500]	cv_eval_time=0.36 sec	RMSE=0.170339	R^2=0.322969
[290/500]	cv_eval_time=0.46 sec	RMSE=0.163697	R^2=0.377421
[291/500]	cv_eval_time=0.37 sec	RMSE=0.166104	R^2=0.356897
[292/500]	cv_eval_time=0.34 sec	RMSE=0.169875	R^2=0.326792
[293/500]	cv_eval_time=0.36 sec	RMSE=0.170530	R^2=0.323012
[294/500]	cv_eval_time=0.33 sec	RMSE=0.164252	R^2=0.371932
[295/500]	cv_eval_time=0.61 sec	RMSE=0.159742	R^2=0.405879
[296/500]	cv_eval_time=0.36 sec	RMSE=0.170221	R^2=0.327950
[297/500]	cv_eval_time=0.38 sec	RMSE=0.170315	R^2=0.3249

[420/500]	cv_eval_time=0.41 sec	RMSE=0.163917	R^2=0.375957
[421/500]	cv_eval_time=0.36 sec	RMSE=0.170170	R^2=0.325370
[422/500]	cv_eval_time=0.35 sec	RMSE=0.169680	R^2=0.330781
[423/500]	cv_eval_time=0.56 sec	RMSE=0.159593	R^2=0.406853
[424/500]	cv_eval_time=0.35 sec	RMSE=0.164938	R^2=0.365098
[425/500]	cv_eval_time=0.35 sec	RMSE=0.169901	R^2=0.327168
[426/500]	cv_eval_time=0.40 sec	RMSE=0.170351	R^2=0.324018
[427/500]	cv_eval_time=0.35 sec	RMSE=0.167181	R^2=0.348083
[428/500]	cv_eval_time=0.66 sec	RMSE=0.159490	R^2=0.408168
[429/500]	cv_eval_time=0.36 sec	RMSE=0.169771	R^2=0.327938
[430/500]	cv_eval_time=0.39 sec	RMSE=0.170359	R^2=0.322416
[431/500]	cv_eval_time=0.38 sec	RMSE=0.163537	R^2=0.377630
[432/500]	cv_eval_time=0.46 sec	RMSE=0.170026	R^2=0.325262
[433/500]	cv_eval_time=0.36 sec	RMSE=0.165849	R^2=0.360356
[434/500]	cv_eval_time=0.39 sec	RMSE=0.169064	R^2=0.332898
[435/500]	cv_eval_time=0.37 sec	RMSE=0.164058	R^2=0.374985
[436/500]	cv_eval_time=0.28 sec	RMSE=0.172926	R^2=0.3028