In [1]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import linear_model

import os
import sys
# Make the directory two levels above the current working directory importable,
# so that the project-local `models` package imported below can be resolved.
# (os.path.join with a single argument returns it unchanged, so the relative
# path is handed to abspath directly.)
module_path = os.path.abspath('../..')
# Guard against duplicate entries when this cell is re-run in the same kernel
if module_path not in sys.path:
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel

  from pandas.core import datetools


In [2]:
# Load the preprocessed qualitative Twitter data from its pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
preprocessed_data = pd.read_pickle('../data/preprocessed_twitter_qualitative.pd')

# Wrap the loaded object in a DataFrame; the later cells work on `df`
df = pd.DataFrame(data=preprocessed_data)

In [3]:
# Split df into a training part and a held-out test part (test_size=0.2);
# random_state is fixed so the same split is produced on every run.
train, test = train_test_split(df, random_state=0, test_size=0.2)

In [4]:
# Show the number of observations for the test and training dataframes
print('Number of observations in the training data:', len(train))
print('Number of observations in the test data:',len(test))

Number of observations in the training data: 18680
Number of observations in the test data: 4670


In [5]:
    # Lasso regression experiment.
    #
    # Switches:
    #   DO_LOWESS - forwarded as `do_lowess` to lasso.run() / plot_predicted_vs_actual()
    #   hyperopt  - True: run the hyper-parameter search via lasso.run();
    #               False: fit the pipeline below once with its fixed alpha and
    #               produce the stats/plots directly.
    DO_LOWESS = False
    hyperopt = True

    # Copies of the split frames are passed in, so `train` / `test` themselves are
    # not the objects held by the model wrapper.
    # NOTE(review): cv=10 is presumably the number of CV folds; max_evals=500 matches
    # the "[k/500]" counter in the run log — confirm against HyperoptModel.
    lasso = HyperoptModel(train.copy(), test.copy(), 'lasso', cv=10, max_evals=500)

    # Every column of df except 'score' is handed to the model as a raw feature
    # (list.remove raises ValueError if 'score' is missing, which acts as a sanity check).
    features = list(df)
    features.remove('score')

    # Single-step pipeline; the step name 'estimate' is what the 'estimate__alpha'
    # key in the search space below refers to.
    # NOTE(review): this alpha looks like the result of an earlier search — confirm.
    lasso.pipeline = Pipeline([
        ('estimate', linear_model.Lasso(alpha=0.0003215187905283301))
    ])

    lasso.raw_features = features
    # Search space for the regularisation strength.
    # NOTE(review): the lower bound of 0 lets the search sample alpha values
    # arbitrarily close to 0, which scikit-learn advises against for Lasso —
    # consider a small positive lower bound or a log-uniform prior.
    lasso.space = {
        'estimate__alpha': hp.uniform('estimate__alpha', 0, 10),
    }

    if hyperopt:
        lasso.run(do_lowess=DO_LOWESS)
    else:
        # Train once with the fixed parameters defined in the pipeline above
        lasso.pipeline.fit(X=lasso.X_train, y=lasso.y_train)
        # Use the fitted pipeline as the model (previously referenced an
        # undefined name `cat`, which raised NameError on this branch)
        lasso.model = lasso.pipeline
        lasso.stats()
        lasso.plot_feature_importance()
        lasso.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        lasso.qq_plot()

Performing parameters optimization...
[1/500]	cv_eval_time=1.22 sec	RMSE=0.011790	R^2=0.017253
[2/500]	cv_eval_time=2.10 sec	RMSE=0.011790	R^2=0.009488
[3/500]	cv_eval_time=1.24 sec	RMSE=0.011790	R^2=0.016157
[4/500]	cv_eval_time=0.98 sec	RMSE=0.011790	R^2=0.012358
[5/500]	cv_eval_time=1.73 sec	RMSE=0.011791	R^2=0.014911
[6/500]	cv_eval_time=1.57 sec	RMSE=0.011792	R^2=0.001466
[7/500]	cv_eval_time=1.20 sec	RMSE=0.011790	R^2=0.017460
[8/500]	cv_eval_time=0.86 sec	RMSE=0.011791	R^2=0.010855
[9/500]	cv_eval_time=0.82 sec	RMSE=0.011792	R^2=0.010953
[10/500]	cv_eval_time=0.86 sec	RMSE=0.011789	R^2=0.012400
[11/500]	cv_eval_time=0.84 sec	RMSE=0.011792	R^2=0.019614
[12/500]	cv_eval_time=0.81 sec	RMSE=0.011789	R^2=0.000152
[13/500]	cv_eval_time=0.80 sec	RMSE=0.011791	R^2=0.015845
[14/500]	cv_eval_time=0.90 sec	RMSE=0.011792	R^2=0.014877
[15/500]	cv_eval_time=0.88 sec	RMSE=0.011790	R^2=0.012699
[16/500]	cv_eval_time=12.00 sec	RMSE=0.011780	R^2=0.016343
[17/500]	cv_eval_time=1.15 sec	RMSE=0.0117

[141/500]	cv_eval_time=1.37 sec	RMSE=0.011791	R^2=0.013770
[142/500]	cv_eval_time=1.37 sec	RMSE=0.011792	R^2=0.012823
[143/500]	cv_eval_time=1.00 sec	RMSE=0.011791	R^2=0.005139
[144/500]	cv_eval_time=1.06 sec	RMSE=0.011792	R^2=0.015407
[145/500]	cv_eval_time=0.99 sec	RMSE=0.011791	R^2=0.010296
[146/500]	cv_eval_time=1.09 sec	RMSE=0.011791	R^2=0.017186
[147/500]	cv_eval_time=1.84 sec	RMSE=0.011792	R^2=0.014871
[148/500]	cv_eval_time=2.03 sec	RMSE=0.011790	R^2=0.017410
[149/500]	cv_eval_time=0.85 sec	RMSE=0.011792	R^2=0.013889
[150/500]	cv_eval_time=0.86 sec	RMSE=0.011793	R^2=0.008662
[151/500]	cv_eval_time=0.86 sec	RMSE=0.011790	R^2=0.015569
[152/500]	cv_eval_time=18.07 sec	RMSE=0.011766	R^2=0.017724
[153/500]	cv_eval_time=0.95 sec	RMSE=0.011796	R^2=0.013459
[154/500]	cv_eval_time=0.96 sec	RMSE=0.011790	R^2=0.014118
[155/500]	cv_eval_time=0.95 sec	RMSE=0.011790	R^2=0.012035
[156/500]	cv_eval_time=1.03 sec	RMSE=0.011792	R^2=0.014721
[157/500]	cv_eval_time=10.80 sec	RMSE=0.011780	R^2=0.00

[280/500]	cv_eval_time=0.79 sec	RMSE=0.011792	R^2=0.011330
[281/500]	cv_eval_time=0.80 sec	RMSE=0.011790	R^2=0.011468
[282/500]	cv_eval_time=0.81 sec	RMSE=0.011792	R^2=0.016009
[283/500]	cv_eval_time=0.82 sec	RMSE=0.011790	R^2=0.012589
[284/500]	cv_eval_time=0.80 sec	RMSE=0.011790	R^2=0.018170
[285/500]	cv_eval_time=0.80 sec	RMSE=0.011792	R^2=0.016021
[286/500]	cv_eval_time=0.85 sec	RMSE=0.011794	R^2=0.017847
[287/500]	cv_eval_time=0.80 sec	RMSE=0.011790	R^2=0.016924
[288/500]	cv_eval_time=0.80 sec	RMSE=0.011789	R^2=0.013120
[289/500]	cv_eval_time=0.83 sec	RMSE=0.011791	R^2=0.018015
[290/500]	cv_eval_time=0.83 sec	RMSE=0.011792	R^2=0.016058
[291/500]	cv_eval_time=0.81 sec	RMSE=0.011794	R^2=0.014402
[292/500]	cv_eval_time=0.81 sec	RMSE=0.011793	R^2=0.016401
[293/500]	cv_eval_time=8.52 sec	RMSE=0.011775	R^2=0.015819
[294/500]	cv_eval_time=0.82 sec	RMSE=0.011791	R^2=0.004465
[295/500]	cv_eval_time=0.80 sec	RMSE=0.011792	R^2=0.012860
[296/500]	cv_eval_time=0.81 sec	RMSE=0.011790	R^2=0.0158

[419/500]	cv_eval_time=8.86 sec	RMSE=0.011784	R^2=0.018727
[420/500]	cv_eval_time=0.81 sec	RMSE=0.011792	R^2=0.005677
[421/500]	cv_eval_time=6.79 sec	RMSE=0.011783	R^2=0.015217
[422/500]	cv_eval_time=0.81 sec	RMSE=0.011792	R^2=0.017700
[423/500]	cv_eval_time=0.81 sec	RMSE=0.011789	R^2=0.014270
[424/500]	cv_eval_time=0.81 sec	RMSE=0.011792	R^2=0.016438
[425/500]	cv_eval_time=8.33 sec	RMSE=0.011779	R^2=0.010890
[426/500]	cv_eval_time=0.86 sec	RMSE=0.011790	R^2=0.013681
[427/500]	cv_eval_time=0.83 sec	RMSE=0.011795	R^2=0.013443
[428/500]	cv_eval_time=0.84 sec	RMSE=0.011790	R^2=0.013822
[429/500]	cv_eval_time=10.90 sec	RMSE=0.011770	R^2=0.017462
[430/500]	cv_eval_time=0.81 sec	RMSE=0.011790	R^2=0.013281
[431/500]	cv_eval_time=0.83 sec	RMSE=0.011790	R^2=0.015391
[432/500]	cv_eval_time=0.95 sec	RMSE=0.011792	R^2=0.015112
[433/500]	cv_eval_time=0.99 sec	RMSE=0.011792	R^2=0.016840
[434/500]	cv_eval_time=0.91 sec	RMSE=0.011791	R^2=0.014603
[435/500]	cv_eval_time=0.95 sec	RMSE=0.011791	R^2=0.009