In [1]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import linear_model

import os
import sys
# Resolve the directory two levels above the current working directory and
# register it on the import path (once only) so project-local packages
# located there can be imported by the statements that follow.
module_path = os.path.abspath('../..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel

  from pandas.core import datetools


In [2]:
# Load the preprocessed data set from its pickle file.
# NOTE(review): unpickling executes arbitrary code embedded in the file, so
# this path must only ever point at a trusted, locally produced artifact.
preprocessed_data = pd.read_pickle(
    '../data/preprocessed_twitter_quantitative.pd'
)

# Wrap the loaded object in a DataFrame; `df` is the frame the later cells use.
# (presumably the pickle already holds a DataFrame - TODO confirm)
df = pd.DataFrame(data=preprocessed_data)

In [3]:
# Split the rows into a training frame and a held-out test frame.
# test_size=0.2 reserves 20% of the rows for testing; the fixed random_state
# makes the shuffle, and therefore the split, repeatable across runs.
train, test = train_test_split(
    df,
    random_state=0,
    test_size=0.2,
)

In [4]:
# Report how many rows ended up in each side of the split.
n_train, n_test = len(train), len(test)
print('Number of observations in the training data:', n_train)
print('Number of observations in the test data:', n_test)

Number of observations in the training data: 18680
Number of observations in the test data: 4670


In [5]:
    # --- Lasso regression: configuration and run ------------------------------
    # DO_LOWESS is forwarded to lasso.run() / lasso.plot_predicted_vs_actual().
    DO_LOWESS = False
    # True  -> run the hyperparameter search through lasso.run()
    # False -> fit the fixed pipeline below once and report on it
    # NOTE: this is a plain boolean flag, not the `hyperopt` package.
    hyperopt = True

    # Build the model wrapper. Copies of the frames are passed so the shared
    # `train` / `test` objects are not the ones handed to the wrapper.
    # (cv / max_evals presumably mean 10 CV folds and 500 search evaluations -
    # TODO confirm against HyperoptModel)
    lasso = HyperoptModel(train.copy(), test.copy(), 'lasso', cv=10, max_evals=500)

    # Use every column of `df` except 'score' as a raw feature.
    # list.remove raises ValueError if 'score' is missing, which is intentional:
    # the notebook cannot proceed without that column.
    features = list(df)
    features.remove('score')

    # Single-step pipeline with a fixed regularisation strength.
    # (alpha presumably comes from an earlier search run - TODO confirm)
    lasso.pipeline = Pipeline([
        ('estimate', linear_model.Lasso(alpha=0.0006392743399505317))
    ])

    lasso.raw_features = features
    # Search space: alpha is sampled uniformly from [0, 10].
    # NOTE(review): the fixed alpha above is ~6e-4, a region a uniform prior on
    # [0, 10] samples very rarely; a log-scaled prior may search it better.
    lasso.space = {
        'estimate__alpha': hp.uniform('estimate__alpha', 0, 10),
    }

    if hyperopt:
        lasso.run(do_lowess=DO_LOWESS)
    else:
        # Train once with the fixed parameters defined in the pipeline above.
        lasso.pipeline.fit(X=lasso.X_train, y=lasso.y_train)
        # The fitted pipeline is the model. This must reference `lasso`; the
        # name `cat` is not defined in this notebook and raised NameError.
        lasso.model = lasso.pipeline
        lasso.stats()
        lasso.plot_feature_importance()
        lasso.plot_predicted_vs_actual(do_lowess=DO_LOWESS)
        lasso.qq_plot()

Performing parameters optimization...
[1/500]	cv_eval_time=3.33 sec	RMSE=0.144874	R^2=0.456309
[2/500]	cv_eval_time=3.29 sec	RMSE=0.150661	R^2=0.411526
[3/500]	cv_eval_time=3.40 sec	RMSE=0.141857	R^2=0.478214
[4/500]	cv_eval_time=2.83 sec	RMSE=0.156469	R^2=0.364659
[5/500]	cv_eval_time=4.15 sec	RMSE=0.155380	R^2=0.373738
[6/500]	cv_eval_time=3.44 sec	RMSE=0.153156	R^2=0.392450
[7/500]	cv_eval_time=3.05 sec	RMSE=0.152963	R^2=0.393969
[8/500]	cv_eval_time=2.44 sec	RMSE=0.158578	R^2=0.347098
[9/500]	cv_eval_time=3.36 sec	RMSE=0.150845	R^2=0.411332
[10/500]	cv_eval_time=2.19 sec	RMSE=0.145873	R^2=0.447141
[11/500]	cv_eval_time=2.69 sec	RMSE=0.153410	R^2=0.390165
[12/500]	cv_eval_time=1.49 sec	RMSE=0.157225	R^2=0.359747
[13/500]	cv_eval_time=2.02 sec	RMSE=0.156039	R^2=0.367627
[14/500]	cv_eval_time=2.38 sec	RMSE=0.144639	R^2=0.456948
[15/500]	cv_eval_time=2.44 sec	RMSE=0.157004	R^2=0.361103
[16/500]	cv_eval_time=3.63 sec	RMSE=0.154627	R^2=0.379739
[17/500]	cv_eval_time=2.74 sec	RMSE=0.15524

[141/500]	cv_eval_time=1.77 sec	RMSE=0.144819	R^2=0.455996
[142/500]	cv_eval_time=1.22 sec	RMSE=0.157467	R^2=0.357895
[143/500]	cv_eval_time=6.01 sec	RMSE=0.131741	R^2=0.549791
[144/500]	cv_eval_time=1.36 sec	RMSE=0.148349	R^2=0.430077
[145/500]	cv_eval_time=1.21 sec	RMSE=0.158818	R^2=0.346005
[146/500]	cv_eval_time=9.86 sec	RMSE=0.131112	R^2=0.554706
[147/500]	cv_eval_time=7.55 sec	RMSE=0.136058	R^2=0.521297
[148/500]	cv_eval_time=2.03 sec	RMSE=0.138112	R^2=0.505289
[149/500]	cv_eval_time=7.15 sec	RMSE=0.130376	R^2=0.560079
[150/500]	cv_eval_time=1.91 sec	RMSE=0.139403	R^2=0.496038
[151/500]	cv_eval_time=7.04 sec	RMSE=0.133411	R^2=0.537669
[152/500]	cv_eval_time=9.93 sec	RMSE=0.129944	R^2=0.562177
[153/500]	cv_eval_time=1.79 sec	RMSE=0.142437	R^2=0.473736
[154/500]	cv_eval_time=3.23 sec	RMSE=0.137578	R^2=0.509653
[155/500]	cv_eval_time=1.94 sec	RMSE=0.138533	R^2=0.501528
[156/500]	cv_eval_time=5.55 sec	RMSE=0.135250	R^2=0.525759
[157/500]	cv_eval_time=2.01 sec	RMSE=0.140294	R^2=0.4894

[280/500]	cv_eval_time=1.97 sec	RMSE=0.132866	R^2=0.542593
[281/500]	cv_eval_time=0.99 sec	RMSE=0.137597	R^2=0.508958
[282/500]	cv_eval_time=0.80 sec	RMSE=0.140228	R^2=0.489666
[283/500]	cv_eval_time=0.78 sec	RMSE=0.143605	R^2=0.463961
[284/500]	cv_eval_time=1.79 sec	RMSE=0.136806	R^2=0.513727
[285/500]	cv_eval_time=2.29 sec	RMSE=0.132874	R^2=0.542653
[286/500]	cv_eval_time=0.95 sec	RMSE=0.139197	R^2=0.497094
[287/500]	cv_eval_time=0.83 sec	RMSE=0.147353	R^2=0.437299
[288/500]	cv_eval_time=2.51 sec	RMSE=0.129916	R^2=0.563004
[289/500]	cv_eval_time=2.16 sec	RMSE=0.134832	R^2=0.528964
[290/500]	cv_eval_time=1.08 sec	RMSE=0.137722	R^2=0.508836
[291/500]	cv_eval_time=1.02 sec	RMSE=0.138362	R^2=0.503974
[292/500]	cv_eval_time=0.90 sec	RMSE=0.146029	R^2=0.447742
[293/500]	cv_eval_time=0.94 sec	RMSE=0.140982	R^2=0.484927
[294/500]	cv_eval_time=0.94 sec	RMSE=0.144625	R^2=0.458090
[295/500]	cv_eval_time=2.18 sec	RMSE=0.135013	R^2=0.527728
[296/500]	cv_eval_time=0.88 sec	RMSE=0.155376	R^2=0.3751

[419/500]	cv_eval_time=2.17 sec	RMSE=0.134262	R^2=0.532109
[420/500]	cv_eval_time=0.92 sec	RMSE=0.142181	R^2=0.475351
[421/500]	cv_eval_time=1.06 sec	RMSE=0.137849	R^2=0.507915
[422/500]	cv_eval_time=2.59 sec	RMSE=0.129805	R^2=0.562274
[423/500]	cv_eval_time=0.98 sec	RMSE=0.139038	R^2=0.499190
[424/500]	cv_eval_time=2.23 sec	RMSE=0.132992	R^2=0.542763
[425/500]	cv_eval_time=1.02 sec	RMSE=0.140324	R^2=0.488176
[426/500]	cv_eval_time=2.36 sec	RMSE=0.135298	R^2=0.526348
[427/500]	cv_eval_time=0.96 sec	RMSE=0.138369	R^2=0.502020
[428/500]	cv_eval_time=2.77 sec	RMSE=0.117108	R^2=0.643943
[429/500]	cv_eval_time=3.30 sec	RMSE=0.112273	R^2=0.673260
[430/500]	cv_eval_time=1.41 sec	RMSE=0.137578	R^2=0.510307
[431/500]	cv_eval_time=0.95 sec	RMSE=0.143077	R^2=0.469777
[432/500]	cv_eval_time=0.90 sec	RMSE=0.141461	R^2=0.480501
[433/500]	cv_eval_time=2.41 sec	RMSE=0.133013	R^2=0.540068
[434/500]	cv_eval_time=0.96 sec	RMSE=0.139360	R^2=0.495770
[435/500]	cv_eval_time=3.11 sec	RMSE=0.112205	R^2=0.6735