In [1]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import linear_model

import os
import sys
# Put the directory two levels above the working directory on the import
# path (only once), so the project-local `models` package imported below
# can be resolved from this notebook.
module_path = os.path.abspath('../..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel

  from pandas.core import datetools


In [2]:
# Load the preprocessed data from its pickle file.
# NOTE: unpickling can execute arbitrary code -- only load files that were
# produced by this project.
preprocessed_data = pd.read_pickle(
    '../data/preprocessed_twitter_qualitative.pd'
)

# Wrap the loaded object in a DataFrame; the following cells work on `df`.
df = pd.DataFrame(data=preprocessed_data)

In [3]:
# Split the rows into a training frame and a test frame: 20% of the rows
# are held out for testing, and the fixed seed makes the split repeatable.
train, test = train_test_split(df, random_state=0, test_size=0.2)

In [4]:
# Report how many rows ended up in each side of the split.
n_train_rows = train.shape[0]
n_test_rows = test.shape[0]
print('Number of observations in the training data:', n_train_rows)
print('Number of observations in the test data:', n_test_rows)

Number of observations in the training data: 18680
Number of observations in the test data: 4670


In [5]:
    # --- Run configuration ---
    # DO_LOWESS is forwarded as `do_lowess` to the HyperoptModel calls below.
    DO_LOWESS = False
    # True  -> search `lasso.space` via lasso.run();
    # False -> fit the pipeline once with the hard-coded alpha and report.
    hyperopt = True

    # Run Lasso with hyperopt optimization. The train/test frames are passed
    # as copies.
    lasso = HyperoptModel(train.copy(), test.copy(),'lasso', cv=4, max_evals = 500)

    # Every column of `df` except 'score' is used as a raw feature
    # (presumably 'score' is the regression target -- confirm in HyperoptModel).
    features = list(df)
    features.remove('score')

    # Single-step pipeline: a Lasso estimator with a hard-coded starting alpha.
    lasso.pipeline = Pipeline([
        ('estimate', linear_model.Lasso(**{'alpha': 0.00046442069128786186}))
    ])

    lasso.raw_features = features
    # Search space: alpha of the 'estimate' step, drawn uniformly from [0, 10].
    # NOTE(review): the recorded run log shows R^2 close to 0 for most trials,
    # which suggests most alphas drawn from this range are far too large for
    # this target. A log-scaled prior (hp.loguniform) may explore the useful
    # small-alpha region better -- confirm before changing the experiment.
    lasso.space = {
        'estimate__alpha': hp.uniform('estimate__alpha', 0, 10),
    }

    if hyperopt:
        lasso.run(do_lowess=DO_LOWESS)
    else:
        # Single fit using the alpha hard-coded in the pipeline above.
        lasso.pipeline.fit(X=lasso.X_train, y=lasso.y_train)
        # Fixed: this previously read `cat.pipeline`, but no `cat` object is
        # defined in this notebook, so the branch raised NameError. The model
        # to report on is the pipeline that was just fitted.
        lasso.model = lasso.pipeline
        lasso.stats()
        lasso.plot_feature_importance()
        lasso.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        lasso.qq_plot()

Performing parameters optimization...
[1/500]	cv_eval_time=0.33 sec	RMSE=0.011870	R^2=-0.000168
[2/500]	cv_eval_time=0.33 sec	RMSE=0.011871	R^2=-0.000687
[3/500]	cv_eval_time=0.31 sec	RMSE=0.011871	R^2=-0.000404
[4/500]	cv_eval_time=0.35 sec	RMSE=0.011871	R^2=-0.000740
[5/500]	cv_eval_time=0.32 sec	RMSE=0.011870	R^2=-0.000408
[6/500]	cv_eval_time=0.36 sec	RMSE=0.011870	R^2=-0.000523
[7/500]	cv_eval_time=0.31 sec	RMSE=0.011871	R^2=-0.000118
[8/500]	cv_eval_time=0.34 sec	RMSE=0.011870	R^2=-0.000077
[9/500]	cv_eval_time=0.37 sec	RMSE=0.011870	R^2=-0.000472
[10/500]	cv_eval_time=0.44 sec	RMSE=0.011870	R^2=-0.000178
[11/500]	cv_eval_time=0.34 sec	RMSE=0.011870	R^2=-0.000479
[12/500]	cv_eval_time=0.32 sec	RMSE=0.011870	R^2=-0.000773
[13/500]	cv_eval_time=0.35 sec	RMSE=0.011871	R^2=-0.000354
[14/500]	cv_eval_time=0.34 sec	RMSE=0.011870	R^2=-0.000988
[15/500]	cv_eval_time=0.50 sec	RMSE=0.011870	R^2=-0.000056
[16/500]	cv_eval_time=0.35 sec	RMSE=0.011871	R^2=-0.000216
[17/500]	cv_eval_time=0.32 

[140/500]	cv_eval_time=0.40 sec	RMSE=0.011871	R^2=-0.000287
[141/500]	cv_eval_time=0.35 sec	RMSE=0.011871	R^2=-0.001304
[142/500]	cv_eval_time=0.32 sec	RMSE=0.011870	R^2=-0.000256
[143/500]	cv_eval_time=0.34 sec	RMSE=0.011870	R^2=-0.000239
[144/500]	cv_eval_time=0.46 sec	RMSE=0.011870	R^2=-0.000557
[145/500]	cv_eval_time=0.33 sec	RMSE=0.011821	R^2=0.011169
[146/500]	cv_eval_time=0.46 sec	RMSE=0.011815	R^2=0.012482
[147/500]	cv_eval_time=0.64 sec	RMSE=0.011871	R^2=-0.000731
[148/500]	cv_eval_time=0.61 sec	RMSE=0.011805	R^2=0.011242
[149/500]	cv_eval_time=0.40 sec	RMSE=0.011845	R^2=0.004849
[150/500]	cv_eval_time=0.54 sec	RMSE=0.011871	R^2=-0.000335
[151/500]	cv_eval_time=0.34 sec	RMSE=0.011870	R^2=-0.000831
[152/500]	cv_eval_time=0.50 sec	RMSE=0.011870	R^2=-0.000301
[153/500]	cv_eval_time=0.47 sec	RMSE=0.011834	R^2=0.010302
[154/500]	cv_eval_time=0.28 sec	RMSE=0.011871	R^2=-0.000153
[155/500]	cv_eval_time=0.30 sec	RMSE=0.011871	R^2=-0.000115
[156/500]	cv_eval_time=0.31 sec	RMSE=0.011870

[278/500]	cv_eval_time=0.44 sec	RMSE=0.011807	R^2=0.013306
[279/500]	cv_eval_time=0.41 sec	RMSE=0.011794	R^2=0.018685
[280/500]	cv_eval_time=0.38 sec	RMSE=0.011871	R^2=-0.000676
[281/500]	cv_eval_time=0.29 sec	RMSE=0.011871	R^2=-0.001341
[282/500]	cv_eval_time=0.47 sec	RMSE=0.011870	R^2=0.000430
[283/500]	cv_eval_time=0.35 sec	RMSE=0.011870	R^2=-0.000249
[284/500]	cv_eval_time=0.55 sec	RMSE=0.011827	R^2=0.010736
[285/500]	cv_eval_time=0.30 sec	RMSE=0.011871	R^2=-0.000813
[286/500]	cv_eval_time=0.53 sec	RMSE=0.011805	R^2=0.012594
[287/500]	cv_eval_time=0.39 sec	RMSE=0.011870	R^2=-0.000364
[288/500]	cv_eval_time=0.39 sec	RMSE=0.011870	R^2=-0.000171
[289/500]	cv_eval_time=0.38 sec	RMSE=0.011871	R^2=-0.000283
[290/500]	cv_eval_time=0.32 sec	RMSE=0.011870	R^2=-0.000173
[291/500]	cv_eval_time=0.32 sec	RMSE=0.011870	R^2=-0.000315
[292/500]	cv_eval_time=0.33 sec	RMSE=0.011870	R^2=-0.000695
[293/500]	cv_eval_time=0.35 sec	RMSE=0.011843	R^2=0.007798
[294/500]	cv_eval_time=0.59 sec	RMSE=0.011791	

[416/500]	cv_eval_time=0.48 sec	RMSE=0.011871	R^2=-0.000406
[417/500]	cv_eval_time=0.86 sec	RMSE=0.011870	R^2=-0.000222
[418/500]	cv_eval_time=0.61 sec	RMSE=0.011871	R^2=-0.001553
[419/500]	cv_eval_time=0.58 sec	RMSE=0.011819	R^2=0.012212
[420/500]	cv_eval_time=0.78 sec	RMSE=0.011811	R^2=0.014311
[421/500]	cv_eval_time=0.36 sec	RMSE=0.011857	R^2=0.003252
[422/500]	cv_eval_time=0.60 sec	RMSE=0.011790	R^2=0.016205
[423/500]	cv_eval_time=0.38 sec	RMSE=0.011871	R^2=-0.000250
[424/500]	cv_eval_time=0.36 sec	RMSE=0.011871	R^2=-0.000353
[425/500]	cv_eval_time=0.38 sec	RMSE=0.011834	R^2=0.007387
[426/500]	cv_eval_time=0.46 sec	RMSE=0.011870	R^2=-0.000354
[427/500]	cv_eval_time=0.69 sec	RMSE=0.011863	R^2=0.002493
[428/500]	cv_eval_time=0.69 sec	RMSE=0.011870	R^2=-0.000548
[429/500]	cv_eval_time=0.59 sec	RMSE=0.011870	R^2=-0.000012
[430/500]	cv_eval_time=0.55 sec	RMSE=0.011821	R^2=0.010832
[431/500]	cv_eval_time=0.44 sec	RMSE=0.011870	R^2=-0.002082
[432/500]	cv_eval_time=0.44 sec	RMSE=0.011870	R