In [1]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import SVR

import os
import sys
# Put the parent directory on the import path so the local `models` package
# imported below can be resolved; skipped when the entry is already present.
module_path = os.path.abspath(os.pardir)
if not (module_path in sys.path):
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel
from models.utils import CustomDataFrameMapper
from models.utils import get_svr_features

  from pandas.core import datetools


In [2]:
# Load the preprocessed dataset from its pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
preprocessed_data = pd.read_pickle('./data/preprocesed_data.pd')

# Wrap the loaded object in a DataFrame used by the cells below
df = pd.DataFrame(data=preprocessed_data)

In [3]:
# Split the rows into a training frame (80%) and a held-out test frame (20%);
# the fixed random_state keeps the split identical between runs.
train, test = train_test_split(df, random_state=0, test_size=0.2)

In [4]:
# Report how many rows ended up in each split
print('Number of observations in the training data:', train.shape[0])
print('Number of observations in the test data:', test.shape[0])

Number of observations in the training data: 9303
Number of observations in the test data: 2326


In [5]:
    # Switches for this cell
    DO_LOWESS = False  # forwarded to svm.run() as do_lowess
    hyperopt = True    # True: search hyperparameters via svm.run(); False: fit the pipeline with defaults

    # Build an SVR model wrapped in HyperoptModel (cv=3), working on copies of the two splits
    comment_features, mention_features, post_features, user_features = get_svr_features()
    svm = HyperoptModel(train.copy(), test.copy(), 'svr', cv=3)

    # Feature preparation feeds the SVR estimator.
    # comment_features is currently not included in the union.
    svm.pipeline = Pipeline(steps=[
        ('prepare_features', FeatureUnion(transformer_list=[
            ('user_stats', user_features),
            ('mention_stats', mention_features),
            ('posts_stats', post_features),
        ])),
        ('estimate', SVR()),
    ])

    # Gather the raw column names referenced by every mapper in the union.
    # The first item of a feature spec is either a column name (str) or an
    # indexable of names; in the latter case only its first entry is recorded.
    svm.raw_features = [
        spec[0] if isinstance(spec[0], str) else spec[0][0]
        for step in svm.pipeline.named_steps['prepare_features'].transformer_list
        for spec in step[1].features
    ]

    # Hyperparameter search space for the 'estimate' (SVR) step.
    # NOTE(review): C is drawn uniformly starting at 0 although SVR requires a
    # strictly positive C, and many logged trials below diverge (RMSE in the
    # thousands). Consider log-scaled ranges (hp.loguniform) -- TODO confirm.
    svm.space = {
        'estimate__C': hp.uniform('estimate__C', 0, 10.),
        'estimate__kernel': hp.choice('estimate__kernel', ['linear', 'sigmoid', 'rbf']),
        'estimate__gamma': hp.uniform('estimate__gamma', 0, 10.),
    }

    if not hyperopt:
        # Baseline: fit the pipeline with default SVR parameters, then report and plot
        svm.pipeline.fit(X=svm.X_train, y=svm.y_train)
        svm.model = svm.pipeline
        svm.stats()
        svm.plot_predicted_vs_actual()
        svm.plot_feature_importance()
    else:
        # Run the hyperopt-driven parameter optimization
        svm.run(do_lowess=DO_LOWESS)

Performing parameters optimization...




[1/50]	cv_eval_time=24.36 sec	RMSE=4102.557064	R^2=-816520087.978090




[2/50]	cv_eval_time=21.65 sec	RMSE=5813.126105	R^2=-1653541600.152730




[3/50]	cv_eval_time=19.95 sec	RMSE=0.120550	R^2=0.293316




[4/50]	cv_eval_time=25.18 sec	RMSE=4289.676023	R^2=-894540436.123056




[5/50]	cv_eval_time=21.61 sec	RMSE=4957.865069	R^2=-1203438469.723669




[6/50]	cv_eval_time=24.18 sec	RMSE=2157.578620	R^2=-229317105.673729




[7/50]	cv_eval_time=21.75 sec	RMSE=471.353714	R^2=-10924350.034420




[8/50]	cv_eval_time=105.79 sec	RMSE=0.120570	R^2=0.293785




[9/50]	cv_eval_time=32.11 sec	RMSE=0.120651	R^2=0.293717




[10/50]	cv_eval_time=10.73 sec	RMSE=0.119516	R^2=0.302221




[11/50]	cv_eval_time=275.12 sec	RMSE=0.120437	R^2=0.293053




[12/50]	cv_eval_time=220.56 sec	RMSE=0.120432	R^2=0.293709




[13/50]	cv_eval_time=23.15 sec	RMSE=4338.446134	R^2=-922299103.055589




[14/50]	cv_eval_time=18.54 sec	RMSE=0.117389	R^2=0.332960




[15/50]	cv_eval_time=16.75 sec	RMSE=0.107980	R^2=0.445991




[16/50]	cv_eval_time=12.32 sec	RMSE=0.107367	R^2=0.431485




[17/50]	cv_eval_time=69.15 sec	RMSE=0.120579	R^2=0.294041




[18/50]	cv_eval_time=20.18 sec	RMSE=0.110418	R^2=0.406742




[19/50]	cv_eval_time=22.84 sec	RMSE=0.122922	R^2=0.261892




[20/50]	cv_eval_time=52.82 sec	RMSE=0.120412	R^2=0.292955




[21/50]	cv_eval_time=8.34 sec	RMSE=0.100305	R^2=0.510087




[22/50]	cv_eval_time=14.88 sec	RMSE=0.112709	R^2=0.382310




[23/50]	cv_eval_time=15.15 sec	RMSE=0.110443	R^2=0.401588




[24/50]	cv_eval_time=16.36 sec	RMSE=0.116935	R^2=0.326237




[25/50]	cv_eval_time=11.43 sec	RMSE=0.111186	R^2=0.399062




[26/50]	cv_eval_time=16.69 sec	RMSE=0.114820	R^2=0.358765




[27/50]	cv_eval_time=17.24 sec	RMSE=0.116962	R^2=0.322871




[28/50]	cv_eval_time=11.13 sec	RMSE=0.101521	R^2=0.492033




[29/50]	cv_eval_time=9.60 sec	RMSE=0.100302	R^2=0.504086




[30/50]	cv_eval_time=18.18 sec	RMSE=0.117840	R^2=0.335809




[31/50]	cv_eval_time=19.95 sec	RMSE=0.118669	R^2=0.306360




[32/50]	cv_eval_time=20.84 sec	RMSE=0.117116	R^2=0.341466




[33/50]	cv_eval_time=17.46 sec	RMSE=0.117405	R^2=0.321996




[34/50]	cv_eval_time=26.63 sec	RMSE=5751.562250	R^2=-1601885042.222001




[35/50]	cv_eval_time=17.71 sec	RMSE=0.113797	R^2=0.368726




[36/50]	cv_eval_time=26.79 sec	RMSE=4179.343959	R^2=-854733371.072833




[37/50]	cv_eval_time=10.45 sec	RMSE=0.100418	R^2=0.506499




[38/50]	cv_eval_time=24.14 sec	RMSE=2772.184890	R^2=-375523478.098804




[39/50]	cv_eval_time=15.49 sec	RMSE=0.116685	R^2=0.339873




[40/50]	cv_eval_time=81.91 sec	RMSE=0.120584	R^2=0.294123




[41/50]	cv_eval_time=24.10 sec	RMSE=2137.850436	R^2=-221424100.384732




[42/50]	cv_eval_time=10.07 sec	RMSE=0.108140	R^2=0.429717




[43/50]	cv_eval_time=220.31 sec	RMSE=0.120507	R^2=0.294127




[44/50]	cv_eval_time=17.11 sec	RMSE=0.118584	R^2=0.328689




[45/50]	cv_eval_time=22.80 sec	RMSE=4473.454312	R^2=-978750846.415799




[46/50]	cv_eval_time=14.98 sec	RMSE=0.120719	R^2=0.294178




[47/50]	cv_eval_time=9.89 sec	RMSE=0.111561	R^2=0.388514




[48/50]	cv_eval_time=13.38 sec	RMSE=0.107944	R^2=0.428412




[49/50]	cv_eval_time=25.79 sec	RMSE=4013.909172	R^2=-785021424.732444




[50/50]	cv_eval_time=159.17 sec	RMSE=0.120494	R^2=0.292888
 elapsed time: 32min 37s





Stats (train | test):
	R^2 score:		0.5723
					0.5492
	RMSE:			0.0938
					0.0964
	Mean error:		0.0729
					0.0742

Best parameters set:
{'estimate__C': 4.045401335891951,
 'estimate__gamma': 0.04505270350679513,
 'estimate__kernel': 'rbf'}

Plotting predicted vs. actual ...done

