In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
from matplotlib import rc
rc('figure', figsize=(15, 5))
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from timeit import default_timer as timer
import pickle
import hyperopt
from hyperopt import hp
from hyperopt.pyll.stochastic import sample
import seaborn as sns
from timeit import default_timer as timer
from numpy.random import RandomState
from hyperopt import STATUS_OK
import csv
import ast
from Utils_pfs import * 

In [2]:
# Read test.csv from the working directory into a DataFrame.
sales_test = pd.read_csv('test.csv')

In [7]:
# Restore the pickled `full_data` object from disk.
# NOTE(review): pickle.load can execute arbitrary code while unpickling —
# only load files produced by a trusted source.
with open(r"full_data_2.pkl", "rb") as input_file:
    full_data = pickle.load(input_file)

In [8]:
# Build the modelling splits. Later cells read the keys 'train', 'train_y',
# 'val', 'val_y' and 'test' from the returned mapping.
# NOTE(review): create_cv_sets2 is presumably provided by the Utils_pfs star import,
# and [9, 21, 33] presumably are date_block_num cut points — confirm in Utils_pfs.
dict_cat = create_cv_sets2([9,21,33], full_data, sales_test)

In [36]:
def objective(params):
    """Hyperopt objective: fit a random forest and score it on the validation split.

    Increments the module-level ``ITERATION`` counter, trains a 50-tree
    ``RandomForestRegressor`` on ``dict_cat['train']`` / ``dict_cat['train_y']``,
    evaluates it with ``error`` on ``dict_cat['val']`` / ``dict_cat['val_y']``,
    and appends one row to the CSV file named by the module-level ``out_file``.

    Parameters
    ----------
    params : dict
        Must contain the keys 'max_depth', 'max_features' and 'min_samples_leaf'.

    Returns
    -------
    dict
        Keys 'iteration', 'loss', 'params', 'train_time' and 'status'
        (always ``STATUS_OK``). ``train_time`` covers fitting and scoring.
    """
    global ITERATION
    ITERATION += 1
    print('ITERATION: %d' %(ITERATION))
    print('params: ', params)

    start = timer()
    rf = RandomForestRegressor(n_estimators = 50, max_depth = params['max_depth'], max_features = params['max_features'],
                               min_samples_leaf=params['min_samples_leaf'], n_jobs = -1)
    rf.fit(dict_cat['train'], dict_cat['train_y'])
    loss = error(rf.predict(dict_cat['val']), dict_cat['val_y'])
    train_time = timer()-start
    print('loss: %.5f' %(loss))

    # Log native Python numbers: the params column is later parsed back with
    # ast.literal_eval, which cannot read reprs such as "np.int64(10)".
    logged_params = {
        key: (value.item() if isinstance(value, np.generic) else value)
        for key, value in params.items()
    }

    # Use a context manager so the handle is flushed and closed on every call
    # (the file used to be left open once per evaluation).
    with open(out_file, 'a', newline='') as of_connection:
        csv.writer(of_connection).writerow([ITERATION, loss, logged_params, train_time])

    return {'iteration': ITERATION, 'loss': loss, 'params': params,
            'train_time': train_time, 'status': STATUS_OK}

In [45]:
# Hyperopt search space for the random forest.
# Every hp.* label matches its dictionary key, so the entries recorded in
# `trials` and returned by fmin are reported under the hyperparameter's real
# name (max_depth was previously labelled 'subsample').
# Note: for hp.choice entries, fmin's return value holds the index of the
# chosen option, not the option itself.
params_space = {
    'max_depth': hp.choice('max_depth', np.arange(1, 11, 1, dtype=int)),
    'max_features': hp.quniform('max_features', 0.05, 1, 0.05),
    'min_samples_leaf': hp.choice('min_samples_leaf', np.arange(10, 101, 5, dtype=int))
}

In [46]:
# Draw one random configuration from the search space as a quick sanity check.
sample(params_space)

{'max_depth': 9, 'max_features': 0.5, 'min_samples_leaf': 95}

In [47]:
# Create (or truncate) the trial log and write its header row.
# objective() appends one row per evaluation to this same file.
out_file = 'rf_trials_pfs4.csv'

# The context manager closes the file even if writing fails; newline='' is the
# csv-module recommendation so the writer controls line endings itself.
with open(out_file, 'w', newline='') as of_connection:
    writer = csv.writer(of_connection)
    # Write the headers to the file
    writer.writerow(['iteration', 'loss', 'params', 'train_time'])

In [None]:
# Run the TPE search: 100 evaluations of objective() over params_space.
trials = hyperopt.Trials()

# Counter incremented by objective() through its own `global` declaration;
# a `global` statement at module level has no effect, so none is needed here.
ITERATION = 0

# Seed for hyperopt's sampler, kept separate from the iteration counter.
# NOTE(review): recent hyperopt releases expect a numpy Generator
# (np.random.default_rng) for rstate — confirm against the installed version.
SEED = 0

best = hyperopt.fmin(
    objective,
    space=params_space,
    algo=hyperopt.tpe.suggest,
    max_evals=100,
    trials=trials,
    rstate=RandomState(SEED)
)

In [57]:
# Load the logged trials and rank them by loss, best (lowest) first.
# reset_index() keeps the pre-sort row labels in an 'index' column.
rf_results = (
    pd.read_csv('rf_trials_pfs4.csv')
    .sort_values('loss', ascending=True)
    .reset_index()
)

In [58]:
# Show the five best trials.
rf_results.head(5)

Unnamed: 0,index,iteration,loss,params,train_time
0,31,32,1.652485,"{'max_depth': 10, 'max_features': 0.75, 'min_s...",617.687029
1,68,69,1.653236,"{'max_depth': 10, 'max_features': 0.65, 'min_s...",532.923826
2,94,95,1.653618,"{'max_depth': 10, 'max_features': 0.65, 'min_s...",531.906359
3,58,59,1.653962,"{'max_depth': 10, 'max_features': 0.55, 'min_s...",458.894292
4,86,87,1.654012,"{'max_depth': 10, 'max_features': 0.75, 'min_s...",618.132501


In [59]:
# Row 0 is the lowest-loss trial after sorting; its params were logged as the
# string form of a dict, so parse them back into a real dict.
best_params = ast.literal_eval(rf_results.at[0, 'params'])

In [60]:
# Display the parsed hyperparameters of the best trial.
best_params

{'max_depth': 10, 'max_features': 0.75, 'min_samples_leaf': 25}

In [61]:
# Refit with max_depth=15 (deeper than the 1-10 range that was searched),
# reusing the tuned max_features / min_samples_leaf, and report the
# validation error.
rf = RandomForestRegressor(
    n_estimators=50,
    max_depth=15,
    max_features=best_params['max_features'],
    min_samples_leaf=best_params['min_samples_leaf'],
    n_jobs=-1,
)
rf.fit(dict_cat['train'], dict_cat['train_y'])
val_error = error(rf.predict(dict_cat['val']), dict_cat['val_y'])
print('rmse with max_depth = 15 is {}'.format(val_error))

rmse with max_depth = 15 is 1.5890628824680237


In [62]:
# Same experiment with max_depth=20: refit with the tuned max_features /
# min_samples_leaf and report the validation error.
rf = RandomForestRegressor(
    n_estimators=50,
    max_depth=20,
    max_features=best_params['max_features'],
    min_samples_leaf=best_params['min_samples_leaf'],
    n_jobs=-1,
)
rf.fit(dict_cat['train'], dict_cat['train_y'])
val_error = error(rf.predict(dict_cat['val']), dict_cat['val_y'])
print('rmse with max_depth = 20 is {}'.format(val_error))

rmse with max_depth = 20 is 1.5795238900710495


Prepare data for stacking

In [67]:
# date_block_num values (28 through 33) that get a held-out prediction pass.
stack_months = np.arange(start=28, stop=34)

In [68]:
# Build the level-2 training feature: for each month in stack_months, fit a
# forest on all earlier months and predict that month; predictions are
# appended to stack_x in month order.
stack_x = []

# Split features/target once instead of copying and dropping inside the loop.
stack_features = full_data.drop('cnt_shop_item', axis=1)
stack_target = full_data['cnt_shop_item']

for i in stack_months:
    # Each boolean mask is computed once and reused for features and target.
    train_mask = full_data['date_block_num'] < i
    test_mask = full_data['date_block_num'] == i
    data_train_x = stack_features.loc[train_mask, :]
    data_test_x = stack_features.loc[test_mask, :]
    data_train_y = stack_target.loc[train_mask]

    print('training for month', i)
    rf_stack = RandomForestRegressor(n_estimators = 50, max_depth = 20, max_features = best_params['max_features'],
                                     min_samples_leaf=best_params['min_samples_leaf'], n_jobs = -1)
    rf_stack.fit(data_train_x, data_train_y)

    # ravel always yields a 1-D array, so a single-row month still extends the
    # list with one value (squeeze would collapse it to a scalar and break extend).
    stack_x.extend(np.ravel(rf_stack.predict(data_test_x)).tolist())

training for month 28
training for month 29
training for month 30
training for month 31
training for month 32
training for month 33


In [71]:
# Write the predictions accumulated in stack_x to disk, one value per line.
np.savetxt(
    fname='Submission Time Series/Stacking/rf_train_level2.csv',
    X=stack_x,
    delimiter=',',
)

In [72]:
# Fit the final forest on train + validation rows combined.
# Dedicated names are used so the `stack_x` list of per-month predictions is
# not overwritten by a DataFrame; re-running the cell that saves stack_x would
# otherwise silently write the wrong data.
final_train_x = pd.concat([dict_cat['train'], dict_cat['val']], axis=0)
final_train_y = pd.concat([dict_cat['train_y'], dict_cat['val_y']], axis=0)
rf_stack = RandomForestRegressor(n_estimators = 50, max_depth = 20, max_features = best_params['max_features'],
                                 min_samples_leaf=best_params['min_samples_leaf'], n_jobs = -1)
rf_stack.fit(final_train_x, final_train_y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
           max_features=0.75, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=25, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [81]:
# Predict the test split with the forest fitted on train + validation.
stack_level2_pred = rf_stack.predict(dict_cat['test'])

In [82]:
# Shift the test predictions down by 1 and save them as the level-2 test feature.
# NOTE(review): the reason for the -1 offset is not visible in this notebook —
# presumably the target in dict_cat is stored shifted by +1; confirm in Utils_pfs.
# The predictions saved to rf_train_level2.csv are NOT shifted, so verify that
# the level-2 train and test features are on the same scale.
stack_level2_pred_y = stack_level2_pred-1
# Disabled post-processing step, kept for reference:
#stack_level2_pred_y = create_test_data(stack_level2_pred_y, dict_cat)
np.savetxt('Submission Time Series/Stacking/rf_test_level2.csv', stack_level2_pred_y, delimiter=',')

In [83]:
# Persist the final fitted forest. pickle is already imported in the first
# cell, so it is not re-imported here.
# HIGHEST_PROTOCOL is the named equivalent of protocol=-1.
with open('Submission Time Series/Stacking/rf_final.pkl', 'wb') as handle:
    pickle.dump(rf_stack, handle, protocol=pickle.HIGHEST_PROTOCOL)