In [1]:
#imports
import numpy as np
import pandas as pd
from hyperopt import hp
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestRegressor

import os
import sys
# Resolve the parent of the current working directory and register it on the
# import path (once), presumably so the project-local `models` package imported
# below can be found.
module_path = os.path.abspath(os.pardir)
if not sys.path.count(module_path):
    sys.path.append(module_path)

from models.hyperopt_model import HyperoptModel
from models.utils import CustomDataFrameMapper
from models.utils import get_features

  from pandas.core import datetools


In [2]:
# Load the pickled, preprocessed dataset (path is relative to the working directory).
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
preprocessed_data = pd.read_pickle('./data/preprocesed_data.pd')

# Build the working DataFrame from the loaded object.
df = pd.DataFrame(data=preprocessed_data)

In [3]:
# Split the rows into a training frame and a held-out test frame (20% of the
# rows go to the test set); the fixed random_state keeps the split reproducible.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=0,
)

In [4]:
# Report how many rows ended up in each partition of the split.
n_train, n_test = len(train), len(test)
print('Number of observations in the training data:', n_train)
print('Number of observations in the test data:', n_test)

Number of observations in the training data: 9303
Number of observations in the test data: 2326


In [5]:
    # Parameters
    DO_LOWESS = False  # forwarded unchanged to rf.run(do_lowess=...)
    hyperopt = True    # True: run the hyperopt search; False: fit with default params

    # Run RandomForestRegressor with hyperopt optimization.
    # HyperoptModel and get_features are project-local helpers; cv=3 is
    # presumably the number of cross-validation folds -- TODO confirm.
    rf = HyperoptModel(train.copy(), test.copy(), 'rf', cv=3)
    rf.raw_features = []
    comment_features, mention_features, post_features, user_features = get_features()
    rf.pipeline = Pipeline([
        ('prepare_features', FeatureUnion([
            ('user_stats', user_features),
            ('mention_stats', mention_features),
            ('posts_stats', post_features),
            ('comments_stats', comment_features)
        ])),
        ('estimate', RandomForestRegressor())
    ])

    # Collect the raw column names used by every feature mapper in the union.
    # The first element of a feature spec is either a single column name (str)
    # or a sequence of names, in which case only its first item is recorded.
    for transformer in rf.pipeline.named_steps['prepare_features'].transformer_list:
        rf.raw_features += [
            spec[0] if isinstance(spec[0], str) else spec[0][0]
            for spec in transformer[1].features
        ]

    # Search space. Keys follow the sklearn Pipeline `<step>__<param>` convention.
    rf.space = {
        # Fixed values, kept as single-option choices so they remain in the space.
        'estimate__random_state': hp.choice('estimate__random_state', [0]),
        'estimate__oob_score': hp.choice('estimate__oob_score', [True]),

        'estimate__max_features': hp.uniform('estimate__max_features', 0, 1.),
        # Integer hyperparameters are sampled with quniform (step 1) and cast to
        # int instead of hp.choice over a range: hp.choice treats every integer
        # as an unrelated category, so the optimizer cannot exploit the ordering
        # of 3000 (resp. 100) options within a small evaluation budget. The set
        # of reachable values is unchanged: 1..3000 and 1..100.
        'estimate__n_estimators': scope.int(
            hp.quniform('estimate__n_estimators', 1, 3000, 1)),
        'estimate__min_samples_leaf': scope.int(
            hp.quniform('estimate__min_samples_leaf', 1, 100, 1)),
        # NOTE(review): the recorded run emitted "Some inputs do not have OOB
        # scores" warnings, which sklearn raises when oob_score=True is combined
        # with very few trees. Consider raising the n_estimators lower bound.
    }

    if hyperopt:
        rf.run(do_lowess=DO_LOWESS)
    else:
        # train with default params
        rf.pipeline.fit(X=rf.X_train, y=rf.y_train)
        rf.model = rf.pipeline
        rf.stats()
        rf.plot_predicted_vs_actual()
        rf.plot_feature_importance()

Performing parameters optimization...
[1/50]	cv_eval_time=24.28 sec	RMSE=0.083662	R^2=0.659396
[2/50]	cv_eval_time=92.22 sec	RMSE=0.085712	R^2=0.642622
[3/50]	cv_eval_time=112.74 sec	RMSE=0.085345	R^2=0.645933
[4/50]	cv_eval_time=65.11 sec	RMSE=0.089595	R^2=0.609314
[5/50]	cv_eval_time=54.11 sec	RMSE=0.080308	R^2=0.683831
[6/50]	cv_eval_time=41.31 sec	RMSE=0.111041	R^2=0.398703
[7/50]	cv_eval_time=23.25 sec	RMSE=0.086015	R^2=0.639675
[8/50]	cv_eval_time=42.34 sec	RMSE=0.079808	R^2=0.689596
[9/50]	cv_eval_time=276.77 sec	RMSE=0.083854	R^2=0.657404
[10/50]	cv_eval_time=95.85 sec	RMSE=0.084738	R^2=0.652100
[11/50]	cv_eval_time=67.73 sec	RMSE=0.088098	R^2=0.622818
[12/50]	cv_eval_time=497.09 sec	RMSE=0.074084	R^2=0.732794
[13/50]	cv_eval_time=103.13 sec	RMSE=0.089511	R^2=0.611259
[14/50]	cv_eval_time=179.38 sec	RMSE=0.082014	R^2=0.673883
[15/50]	cv_eval_time=69.65 sec	RMSE=0.086260	R^2=0.638139
[16/50]	cv_eval_time=2.33 sec	RMSE=0.089558	R^2=0.606968
[17/50]	cv_eval_time=380.93 sec	RMSE=0.

  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "


[24/50]	cv_eval_time=1.79 sec	RMSE=0.087045	R^2=0.631361
[25/50]	cv_eval_time=95.19 sec	RMSE=0.092078	R^2=0.586126
[26/50]	cv_eval_time=437.02 sec	RMSE=0.075074	R^2=0.731276
[27/50]	cv_eval_time=182.76 sec	RMSE=0.074401	R^2=0.730452
[28/50]	cv_eval_time=332.82 sec	RMSE=0.075748	R^2=0.717617
[29/50]	cv_eval_time=27.20 sec	RMSE=0.077392	R^2=0.708600
[30/50]	cv_eval_time=199.63 sec	RMSE=0.079670	R^2=0.690837
[31/50]	cv_eval_time=61.23 sec	RMSE=0.101811	R^2=0.495304
[32/50]	cv_eval_time=387.12 sec	RMSE=0.073592	R^2=0.737347
[33/50]	cv_eval_time=3.21 sec	RMSE=0.088410	R^2=0.617213
[34/50]	cv_eval_time=70.18 sec	RMSE=0.073331	R^2=0.738516
[35/50]	cv_eval_time=494.26 sec	RMSE=0.073460	R^2=0.737012
[36/50]	cv_eval_time=17.92 sec	RMSE=0.080267	R^2=0.686592
[37/50]	cv_eval_time=468.14 sec	RMSE=0.073732	R^2=0.736815
[38/50]	cv_eval_time=14.25 sec	RMSE=0.086115	R^2=0.635087
[39/50]	cv_eval_time=235.14 sec	RMSE=0.080497	R^2=0.684810
[40/50]	cv_eval_time=103.78 sec	RMSE=0.088256	R^2=0.620840
[41/50]