In [1]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import linear_model

import os
import sys
# Make the directory two levels above the working directory importable, so the
# project-local `models` package imported below can be resolved.
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if module_path not in sys.path:
    # Only append once, so re-running this cell does not grow sys.path.
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel

  from pandas.core import datetools


In [2]:
# Load the preprocessed dataset from its pickle file
preprocessed_data = pd.read_pickle(
    '../data/preprocessed_twitter_quantitative.pd'
)

# Wrap the loaded object in a DataFrame and list its column labels
df = pd.DataFrame(data=preprocessed_data)
print(df.columns)

Index(['FOLLOWER_INDEGREE', 'FOLLOWER_OUTDEGREE', 'FOLLOWER_CLOSENESS',
       'FOLLOWER_CLOSENESS_REV', 'FOLLOWER_BETWEENNESS',
       'FOLLOWER_EIGENVECTOR', 'FOLLOWER_PAGERANK', 'score',
       'RETWEET_OUTDEGREE_UNWEIGHTED', 'RETWEET_OUTDEGREE_WEIGHTED',
       'RETWEET_CLOSENESS_UNWEIGHTED', 'RETWEET_CLOSENESS_WEIGHTED',
       'RETWEET_BETWEENNESS_UNWEIGHTED', 'RETWEET_BETWEENNESS_WEIGHTED',
       'REPLY_INDEGREE_UNWEIGHTED', 'REPLY_INDEGREE_WEIGHTED',
       'REPLY_OUTDEGREE_UNWEIGHTED', 'REPLY_OUTDEGREE_WEIGHTED',
       'REPLY_CLOSENESS_UNWEIGHTED', 'REPLY_CLOSENESS_WEIGHTED',
       'REPLY_CLOSENESS_REV_UNWEIGHTED', 'REPLY_CLOSENESS_REV_WEIGHTED',
       'REPLY_BETWEENNESS_UNWEIGHTED', 'REPLY_BETWEENNESS_WEIGHTED',
       'REPLY_EIGENVECTOR_UNWEIGHTED', 'REPLY_EIGENVECTOR_WEIGHTED',
       'REPLY_PAGERANK_UNWEIGHTED', 'REPLY_PAGERANK_WEIGHTED', 'NUM_TWEETS',
       'AVG_NUM_TWEETS_PER_DAY', 'NUM_DAYS_SINCE_SIGNUP', 'AVG_LENGTH_TWEETS',
       'NUM_MENTIONS', 'NUM_TWEETS_WITH

In [3]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split identical on every run.
train, test = train_test_split(df, random_state=0, test_size=0.2)

In [4]:
# Report how many rows ended up in each split
print('Number of observations in the training data:', train.shape[0])
print('Number of observations in the test data:', test.shape[0])

Number of observations in the training data: 18680
Number of observations in the test data: 4670


In [5]:
    # --- Experiment switches -------------------------------------------------
    # DO_LOWESS is forwarded as `do_lowess` to run() / plot_predicted_vs_actual().
    DO_LOWESS = False
    # True: run the hyperopt parameter search; False: fit the fixed pipeline below.
    # NOTE(review): this flag shares its name with the `hyperopt` package; only
    # `hp` and `scope` are imported from it here, so nothing is shadowed today.
    hyperopt = True

    # --- Model wrapper -------------------------------------------------------
    # Copies of the split frames are handed over, presumably so the wrapper
    # cannot mutate `train` / `test` used by other cells -- TODO confirm.
    lasso = HyperoptModel(train.copy(), test.copy(), 'lasso', cv=4, max_evals=500)

    # Every column except the target `score` is used as an input feature.
    # list.remove raises ValueError if `score` is missing, which is intentional:
    # a dataset without the target should fail loudly here.
    features = list(df)
    features.remove('score')

    # Single-step pipeline. The fixed alpha is only used when the hyperopt
    # search is skipped (presumably the best value from an earlier search --
    # TODO confirm).
    lasso.pipeline = Pipeline([
        ('estimate', linear_model.Lasso(alpha=0.0005945334828994508))
    ])

    lasso.raw_features = features

    # Search space: the key follows sklearn's `<step>__<param>` convention so it
    # addresses `alpha` of the 'estimate' step.
    lasso.space = {
        'estimate__alpha': hp.uniform('estimate__alpha', 0, 10),
    }

    if hyperopt:
        lasso.run(do_lowess=DO_LOWESS)
    else:
        # Train with the fixed parameters defined on the pipeline above.
        lasso.pipeline.fit(X=lasso.X_train, y=lasso.y_train)
        # The fitted pipeline is the model. This used to read `cat.pipeline`,
        # an undefined name that raised NameError whenever this branch ran.
        lasso.model = lasso.pipeline
        lasso.stats()
        lasso.plot_feature_importance()
        lasso.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        lasso.qq_plot()

Performing parameters optimization...
[1/500]	cv_eval_time=0.73 sec	RMSE=0.144039	R^2=0.461216
[2/500]	cv_eval_time=0.65 sec	RMSE=0.150755	R^2=0.414323
[3/500]	cv_eval_time=0.91 sec	RMSE=0.154491	R^2=0.383272
[4/500]	cv_eval_time=0.83 sec	RMSE=0.156055	R^2=0.369323
[5/500]	cv_eval_time=0.73 sec	RMSE=0.155554	R^2=0.372495
[6/500]	cv_eval_time=0.86 sec	RMSE=0.155637	R^2=0.371535
[7/500]	cv_eval_time=0.78 sec	RMSE=0.153259	R^2=0.393070
[8/500]	cv_eval_time=0.70 sec	RMSE=0.154788	R^2=0.377034
[9/500]	cv_eval_time=0.80 sec	RMSE=0.157234	R^2=0.359132
[10/500]	cv_eval_time=0.75 sec	RMSE=0.140100	R^2=0.493783
[11/500]	cv_eval_time=1.37 sec	RMSE=0.132076	R^2=0.545089
[12/500]	cv_eval_time=0.69 sec	RMSE=0.158738	R^2=0.346786
[13/500]	cv_eval_time=1.65 sec	RMSE=0.132437	R^2=0.544757
[14/500]	cv_eval_time=1.47 sec	RMSE=0.152337	R^2=0.401718
[15/500]	cv_eval_time=0.96 sec	RMSE=0.154874	R^2=0.372795
[16/500]	cv_eval_time=0.82 sec	RMSE=0.145075	R^2=0.455800
[17/500]	cv_eval_time=0.57 sec	RMSE=0.15637

[142/500]	cv_eval_time=0.44 sec	RMSE=0.145438	R^2=0.449750
[143/500]	cv_eval_time=0.52 sec	RMSE=0.137490	R^2=0.510371
[144/500]	cv_eval_time=0.71 sec	RMSE=0.138985	R^2=0.502151
[145/500]	cv_eval_time=0.69 sec	RMSE=0.156525	R^2=0.365612
[146/500]	cv_eval_time=3.13 sec	RMSE=0.131634	R^2=0.549104
[147/500]	cv_eval_time=1.79 sec	RMSE=0.133811	R^2=0.536197
[148/500]	cv_eval_time=0.76 sec	RMSE=0.137925	R^2=0.505169
[149/500]	cv_eval_time=0.50 sec	RMSE=0.144095	R^2=0.462085
[150/500]	cv_eval_time=1.31 sec	RMSE=0.131342	R^2=0.555854
[151/500]	cv_eval_time=0.84 sec	RMSE=0.136853	R^2=0.516320
[152/500]	cv_eval_time=1.05 sec	RMSE=0.140140	R^2=0.492073
[153/500]	cv_eval_time=1.57 sec	RMSE=0.112271	R^2=0.674051
[154/500]	cv_eval_time=0.48 sec	RMSE=0.142261	R^2=0.475360
[155/500]	cv_eval_time=0.50 sec	RMSE=0.147271	R^2=0.434279
[156/500]	cv_eval_time=0.40 sec	RMSE=0.146072	R^2=0.448286
[157/500]	cv_eval_time=0.59 sec	RMSE=0.135194	R^2=0.527935
[158/500]	cv_eval_time=0.56 sec	RMSE=0.139618	R^2=0.4982

[281/500]	cv_eval_time=0.46 sec	RMSE=0.132856	R^2=0.543671
[282/500]	cv_eval_time=0.29 sec	RMSE=0.143809	R^2=0.461917
[283/500]	cv_eval_time=0.29 sec	RMSE=0.140455	R^2=0.488503
[284/500]	cv_eval_time=0.37 sec	RMSE=0.134751	R^2=0.529924
[285/500]	cv_eval_time=0.26 sec	RMSE=0.146012	R^2=0.447657
[286/500]	cv_eval_time=0.28 sec	RMSE=0.137793	R^2=0.509687
[287/500]	cv_eval_time=0.59 sec	RMSE=0.129944	R^2=0.562420
[288/500]	cv_eval_time=0.40 sec	RMSE=0.139413	R^2=0.497883
[289/500]	cv_eval_time=0.43 sec	RMSE=0.142091	R^2=0.472624
[290/500]	cv_eval_time=0.34 sec	RMSE=0.133785	R^2=0.536732
[291/500]	cv_eval_time=0.28 sec	RMSE=0.150112	R^2=0.414170
[292/500]	cv_eval_time=0.27 sec	RMSE=0.144625	R^2=0.458435
[293/500]	cv_eval_time=0.56 sec	RMSE=0.112771	R^2=0.670843
[294/500]	cv_eval_time=0.23 sec	RMSE=0.138097	R^2=0.503076
[295/500]	cv_eval_time=0.24 sec	RMSE=0.140435	R^2=0.484976
[296/500]	cv_eval_time=0.21 sec	RMSE=0.147182	R^2=0.438196
[297/500]	cv_eval_time=0.21 sec	RMSE=0.156706	R^2=0.3641

[420/500]	cv_eval_time=0.25 sec	RMSE=0.138055	R^2=0.508538
[421/500]	cv_eval_time=0.43 sec	RMSE=0.133167	R^2=0.538785
[422/500]	cv_eval_time=0.33 sec	RMSE=0.156871	R^2=0.363621
[423/500]	cv_eval_time=0.26 sec	RMSE=0.139526	R^2=0.495696
[424/500]	cv_eval_time=0.29 sec	RMSE=0.136710	R^2=0.515436
[425/500]	cv_eval_time=0.30 sec	RMSE=0.132862	R^2=0.542416
[426/500]	cv_eval_time=0.23 sec	RMSE=0.138153	R^2=0.506265
[427/500]	cv_eval_time=0.47 sec	RMSE=0.134256	R^2=0.534508
[428/500]	cv_eval_time=0.38 sec	RMSE=0.142503	R^2=0.474661
[429/500]	cv_eval_time=1.18 sec	RMSE=0.111689	R^2=0.676127
[430/500]	cv_eval_time=0.68 sec	RMSE=0.140208	R^2=0.489618
[431/500]	cv_eval_time=0.86 sec	RMSE=0.138865	R^2=0.501598
[432/500]	cv_eval_time=0.57 sec	RMSE=0.135684	R^2=0.523172
[433/500]	cv_eval_time=1.22 sec	RMSE=0.137991	R^2=0.507058
[434/500]	cv_eval_time=0.96 sec	RMSE=0.143587	R^2=0.467958
[435/500]	cv_eval_time=1.05 sec	RMSE=0.134024	R^2=0.534408
[436/500]	cv_eval_time=0.61 sec	RMSE=0.139672	R^2=0.4942