In [1]:
import os

# Directory with the MinGW-w64 binaries on this Windows machine.
# NOTE(review): presumably needed so the xgboost native library can locate its
# runtime DLLs -- confirm, and consider moving this path into configuration.
mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-7.3.0-posix-seh-rt_v5-rev0\\mingw64\\bin'

# Prepend the directory only when it is not already on PATH, so re-running the
# cell does not keep growing the variable; also tolerate PATH being unset.
_current_path = os.environ.get('PATH', '')
if mingw_path not in _current_path.split(os.pathsep):
    os.environ['PATH'] = (mingw_path + os.pathsep + _current_path
                          if _current_path else mingw_path)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
from matplotlib import rc
rc('figure', figsize=(15, 5))
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from timeit import default_timer as timer
import pickle
import hyperopt
from hyperopt import hp
from hyperopt.pyll.stochastic import sample
import seaborn as sns
from numpy.random import RandomState
from hyperopt import STATUS_OK
import csv
import ast
from Utils_pfs import *

In [3]:
# Load the test set from the working directory. Later cells read its
# `item_id` and `shop_id` columns and pass it to the submission helper.
sales_test = pd.read_csv(r'test.csv')

In [7]:
# Load `full_data` from a local pickle. Later cells index it with `.loc` and
# use its `date_block_num` and `cnt_shop_item` columns.
# NOTE(security): pickle.load can execute arbitrary code -- only load files
# that you produced yourself or otherwise trust.
with open(r"full_data_2.pkl", "rb") as input_file:
    full_data = pickle.load(input_file)

In [8]:
# Build the modelling splits with the project helper `create_cv_sets2`
# (imported via `from Utils_pfs import *`). Later cells read the keys
# 'train', 'train_y', 'val', 'val_y' and 'test' from the result.
# NOTE(review): [9, 21, 33] are presumably `date_block_num` month indices --
# confirm against the helper's definition.
dict_cat = create_cv_sets2([9,21,33], full_data, sales_test)

In [22]:
# Baseline model: shallow trees with a high learning rate.
# XGBRegressor is only used to turn sklearn-style arguments into the
# parameter dict that xgb.train consumes.
model = XGBRegressor(
    learning_rate=0.3,
    n_estimators=1000,
    max_depth=3,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='rmse',
    seed=0,
    n_jobs=-1,
)
xgb_params = model.get_xgb_params()

xgtrain = xgb.DMatrix(dict_cat['train'], label=dict_cat['train_y'])
xgval = xgb.DMatrix(dict_cat['val'], label=dict_cat['val_y'])

# The last entry of the watchlist ('val') is the one used for early stopping.
watchlist = [(xgtrain, 'train'), (xgval, 'val')]
xgbresults = xgb.train(
    xgb_params,
    xgtrain,
    num_boost_round=1000,
    evals=watchlist,
    early_stopping_rounds=20,
    verbose_eval=100,
)

[0]	train-rmse:2.52364	val-rmse:2.7565
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 20 rounds.
[100]	train-rmse:1.60392	val-rmse:1.72258
[200]	train-rmse:1.55306	val-rmse:1.68639
Stopping. Best iteration:
[213]	train-rmse:1.54882	val-rmse:1.68014



In [23]:
# Predict the test features with the booster trained in the previous cell.
test_y = xgbresults.predict(xgb.DMatrix(dict_cat['test']))

In [24]:
# A test row keeps its prediction only when both its item_id and its shop_id
# occur somewhere in the training features (checked independently, not as a
# pair); every other row is forced to 0.
test_ids = (
    sales_test['item_id'].isin(dict_cat['train']['item_id'])
    & sales_test['shop_id'].isin(dict_cat['train']['shop_id'])
)
test_y = [test_y[pos] if keep else 0 for pos, keep in enumerate(test_ids)]

In [26]:
# Hand the masked baseline predictions to the project helper under the name
# 'xgb_1'. NOTE(review): output location/format is defined in Utils_pfs.
create_submission_file(test_y, 'xgb_1', sales_test)

In [60]:
def objective(params):
    """Run one hyperopt trial: train XGBoost and report its validation loss.

    Uses the notebook globals ``dict_cat`` (train/val splits), ``out_file``
    (path of the CSV trial log) and ``ITERATION`` (running trial counter,
    incremented on every call).

    Parameters
    ----------
    params : dict
        Sampled hyperparameters. Only ``depth``, ``min_child_weight``,
        ``gamma``, ``subsample`` and ``colsample_bytree`` are passed to the
        model; the complete dict is still written to the log.

    Returns
    -------
    dict
        Keys ``iteration``, ``loss``, ``params``, ``train_time``, ``n_trees``
        and ``status`` (always ``STATUS_OK``).
    """
    global ITERATION
    ITERATION += 1
    print('ITERATION: %d' %(ITERATION))
    print('params: ', params)

    # XGBRegressor only translates sklearn-style arguments into the parameter
    # dict used by xgb.train; learning rate and seed are fixed for all trials.
    model = XGBRegressor(learning_rate = 0.2, max_depth = params['depth'],
                         min_child_weight = params['min_child_weight'], gamma = params['gamma'],
                         subsample = params['subsample'], colsample_bytree=params['colsample_bytree'],
                         eval_metric='rmse', seed = 24, n_jobs = -1)
    start = timer()
    xgtrain = xgb.DMatrix(dict_cat['train'], dict_cat['train_y'])
    xgval = xgb.DMatrix(dict_cat['val'], dict_cat['val_y'])
    xgb_params = model.get_xgb_params()
    watchlist = [(xgtrain, 'train'), (xgval, 'val')]
    xgbresults = xgb.train(xgb_params, xgtrain, num_boost_round = 5000, evals = watchlist,
                           early_stopping_rounds = 50, verbose_eval = False)
    n_tree = xgbresults.best_ntree_limit
    print('n_tree: {}'.format(n_tree))

    # xgb.train returns the booster from the LAST round, not the best one.
    # Limit prediction to the best iteration so the logged loss matches the
    # logged n_trees, and reuse the validation DMatrix instead of rebuilding it.
    val_pred = xgbresults.predict(xgval, ntree_limit = n_tree)
    loss = np.mean(error(val_pred, dict_cat['val_y']))
    train_time = timer()-start
    print('loss: %.5f' %(loss))

    # Append this trial to the log. The context manager closes the handle on
    # every call; newline='' is what the csv module requires to avoid blank
    # rows on Windows.
    with open(out_file, 'a', newline='') as of_connection:
        csv.writer(of_connection).writerow([ITERATION, loss, params, train_time, n_tree])

    return {'iteration': ITERATION, 'loss': loss, 'params': params,
            'train_time': train_time, 'n_trees': n_tree, 'status': STATUS_OK}

In [68]:
# Search space for the first tuning run.
# quniform(label, low, high, q) samples on a grid of step q; `depth` is a
# categorical choice over the integers 1..15. The two string entries are
# constants that travel with each sampled configuration.
params_space = dict(
    subsample=hp.quniform('subsample', 0.5, 1, 0.05),
    colsample_bytree=hp.quniform('colsample_bytree', 0.7, 1, 0.05),
    gamma=hp.quniform('gamma', 0, 1, 0.05),
    depth=hp.choice('depth', np.arange(1, 16, dtype=int)),
    min_child_weight=hp.quniform('min_child_weight', 1, 15, 2),
    eval_metric='rmse',
    objective='reg:linear',
)


In [54]:
# Draw one random configuration from the search space as a sanity check;
# the drawn dict is displayed as the cell's output.
sample(params_space)

{'colsample_bytree': 0.9,
 'depth': 11,
 'eval_metric': 'rmse',
 'gamma': 0.2,
 'min_child_weight': 10.0,
 'objective': 'reg:linear',
 'subsample': 0.15000000000000002}

In [55]:
# CSV log that `objective` appends one row to per trial.
out_file = 'xgboost_trials_pfs4.csv'

# (Re)create the log with only the header row. The context manager closes the
# handle even if the write fails; newline='' is what the csv module requires
# to avoid blank rows on Windows.
with open(out_file, 'w', newline='') as of_connection:
    writer = csv.writer(of_connection)
    # Write the headers to the file
    writer.writerow(['iteration', 'loss', 'params', 'train_time', 'n_trees'])

In [None]:
# First tuning run: 50 TPE evaluations over `params_space`.
# ITERATION is the module-level counter that `objective` increments and logs;
# its starting value also seeds the search.
ITERATION = 20
trials = hyperopt.Trials()
best = hyperopt.fmin(
    fn=objective,
    space=params_space,
    algo=hyperopt.tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=RandomState(ITERATION),
)

In [70]:
# Read the trial log, order it by loss (best first) and renumber the rows;
# the previous row labels are kept in an 'index' column.
xgb_tune_results = (
    pd.read_csv('xgboost_trials_pfs4.csv')
    .sort_values('loss', ascending=True)
    .reset_index()
)
# The 'params' column holds the dict as text; parse it back into a dict.
best_params = ast.literal_eval(xgb_tune_results.loc[0, 'params'])
best_params

{'colsample_bytree': 0.75,
 'depth': 12,
 'eval_metric': 'rmse',
 'gamma': 0.65,
 'min_child_weight': 14.0,
 'objective': 'reg:linear',
 'subsample': 0.8}

In [71]:
# Manual experiment: tuned parameters, but min_child_weight forced to 100
# instead of the value stored in best_params.
model = XGBRegressor(
    learning_rate=0.2,
    max_depth=best_params['depth'],
    min_child_weight=100,
    gamma=best_params['gamma'],
    subsample=best_params['subsample'],
    colsample_bytree=best_params['colsample_bytree'],
    eval_metric='rmse',
    seed=24,
    n_jobs=-1,
)
xgb_params = model.get_xgb_params()

xgtrain = xgb.DMatrix(dict_cat['train'], label=dict_cat['train_y'])
xgval = xgb.DMatrix(dict_cat['val'], label=dict_cat['val_y'])
watchlist = [(xgtrain, 'train'), (xgval, 'val')]

xgbresults = xgb.train(
    xgb_params,
    xgtrain,
    num_boost_round=5000,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=50,
)
# Report the project error metric on the validation split.
print(
    error(
        xgbresults.predict(xgb.DMatrix(dict_cat['val'])),
        dict_cat['val_y'],
    )
)

[0]	train-rmse:2.5705	val-rmse:2.82747
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 50 rounds.
[50]	train-rmse:1.30546	val-rmse:1.56531
[100]	train-rmse:1.25864	val-rmse:1.56444
Stopping. Best iteration:
[62]	train-rmse:1.29003	val-rmse:1.56205

382.14868


In [72]:
# Manual experiment: tuned parameters, but min_child_weight forced to 200
# instead of the value stored in best_params.
model = XGBRegressor(
    learning_rate=0.2,
    max_depth=best_params['depth'],
    min_child_weight=200,
    gamma=best_params['gamma'],
    subsample=best_params['subsample'],
    colsample_bytree=best_params['colsample_bytree'],
    eval_metric='rmse',
    seed=24,
    n_jobs=-1,
)
xgb_params = model.get_xgb_params()

xgtrain = xgb.DMatrix(dict_cat['train'], label=dict_cat['train_y'])
xgval = xgb.DMatrix(dict_cat['val'], label=dict_cat['val_y'])
watchlist = [(xgtrain, 'train'), (xgval, 'val')]

xgbresults = xgb.train(
    xgb_params,
    xgtrain,
    num_boost_round=5000,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=50,
)
# Report the project error metric on the validation split.
print(
    error(
        xgbresults.predict(xgb.DMatrix(dict_cat['val'])),
        dict_cat['val_y'],
    )
)

[0]	train-rmse:2.57637	val-rmse:2.83549
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 50 rounds.
[50]	train-rmse:1.3586	val-rmse:1.5742
[100]	train-rmse:1.30745	val-rmse:1.55957
[150]	train-rmse:1.27773	val-rmse:1.55391
[200]	train-rmse:1.25364	val-rmse:1.55122
[250]	train-rmse:1.23659	val-rmse:1.55226
Stopping. Best iteration:
[228]	train-rmse:1.24362	val-rmse:1.5505

379.1354


In [73]:
# Search space for the second tuning run: identical to the first one except
# that min_child_weight now ranges over 100..1500 in steps of 50.
params_space = dict(
    subsample=hp.quniform('subsample', 0.5, 1, 0.05),
    colsample_bytree=hp.quniform('colsample_bytree', 0.7, 1, 0.05),
    gamma=hp.quniform('gamma', 0, 1, 0.05),
    depth=hp.choice('depth', np.arange(1, 16, dtype=int)),
    min_child_weight=hp.quniform('min_child_weight', 100, 1500, 50),
    eval_metric='rmse',
    objective='reg:linear',
)

In [None]:
# Second tuning run: 100 TPE evaluations over the widened search space.
# The counter starts at 50, so rows appended to the trial log by this run are
# numbered from 51 onwards; the same value seeds the search.
ITERATION = 50
trials = hyperopt.Trials()
best = hyperopt.fmin(
    fn=objective,
    space=params_space,
    algo=hyperopt.tpe.suggest,
    max_evals=100,
    trials=trials,
    rstate=RandomState(ITERATION),
)

In [9]:
# Re-read the trial log and pick the configuration with the lowest loss among
# all rows currently in the file.
xgb_tune_results = (
    pd.read_csv('xgboost_trials_pfs4.csv')
    .sort_values('loss', ascending=True)
    .reset_index()
)
# The 'params' column holds the dict as text; parse it back into a dict.
best_params = ast.literal_eval(xgb_tune_results.loc[0, 'params'])
best_params

{'colsample_bytree': 0.8500000000000001,
 'depth': 10,
 'eval_metric': 'rmse',
 'gamma': 0.65,
 'min_child_weight': 250.0,
 'objective': 'reg:linear',
 'subsample': 0.8}

In [14]:
# Train with the best configuration found by the search (all five tuned
# values taken from best_params) and monitor the validation split.
model = XGBRegressor(
    learning_rate=0.2,
    max_depth=best_params['depth'],
    min_child_weight=best_params['min_child_weight'],
    gamma=best_params['gamma'],
    subsample=best_params['subsample'],
    colsample_bytree=best_params['colsample_bytree'],
    eval_metric='rmse',
    seed=24,
    n_jobs=-1,
)
xgb_params = model.get_xgb_params()

xgtrain = xgb.DMatrix(dict_cat['train'], label=dict_cat['train_y'])
xgval = xgb.DMatrix(dict_cat['val'], label=dict_cat['val_y'])
watchlist = [(xgtrain, 'train'), (xgval, 'val')]

xgbresults = xgb.train(
    xgb_params,
    xgtrain,
    num_boost_round=5000,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=50,
)
# Report the project error metric on the validation split.
print(
    error(
        xgbresults.predict(xgb.DMatrix(dict_cat['val'])),
        dict_cat['val_y'],
    )
)

[0]	train-rmse:2.58377	val-rmse:2.8391
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 50 rounds.
[50]	train-rmse:1.40458	val-rmse:1.57871
[100]	train-rmse:1.35168	val-rmse:1.56063
[150]	train-rmse:1.32544	val-rmse:1.55575
[200]	train-rmse:1.3055	val-rmse:1.55166
[250]	train-rmse:1.28748	val-rmse:1.5491
[300]	train-rmse:1.27214	val-rmse:1.54785
[350]	train-rmse:1.25936	val-rmse:1.54783
Stopping. Best iteration:
[332]	train-rmse:1.26264	val-rmse:1.54721

378.75806


In [22]:
# Predict the test features and shift every prediction down by one.
# NOTE(review): the -1 presumably undoes an offset applied to the target when
# the features were built -- confirm.
pred = xgbresults.predict(xgb.DMatrix(dict_cat['test'])) - 1
# Post-process with the project helper before submission.
pred = create_test_data(pred, dict_cat)

In [24]:
# Hand the tuned model's post-processed predictions to the project helper
# under the name 'xgboost_final'.
create_submission_file(pred, 'xgboost_final', sales_test)

Creating data for stacking

In [25]:
# Month indices 28..33 (end-exclusive arange); these are compared against
# `date_block_num` in the stacking cells below.
stack_months = np.arange(28,34)

In [26]:
# Target values (`cnt_shop_item`) for all rows whose month is in
# `stack_months`, in the row order of `full_data`.
stack_y = full_data.loc[full_data['date_block_num'].isin(stack_months),'cnt_shop_item']

In [27]:
# Level-1 predictions for stacking: for every month in `stack_months`, fit on
# all strictly earlier months and predict that month. Predictions are appended
# in the order of `stack_months`.
# NOTE(review): `stack_y` is selected with `isin(stack_months)` in `full_data`
# row order, so the two only line up if `full_data` is sorted by
# `date_block_num` -- confirm.
stack_x = []

# The hyperparameters are identical for every month, so build them once.
model = XGBRegressor(learning_rate = 0.2, max_depth = best_params['depth'],
                     min_child_weight = best_params['min_child_weight'], gamma = best_params['gamma'],
                     subsample = best_params['subsample'], colsample_bytree=best_params['colsample_bytree'],
                     eval_metric='rmse', seed = 24, n_jobs = -1)
xgb_params = model.get_xgb_params()

for i in stack_months:
    is_train = full_data['date_block_num'] < i
    is_test = full_data['date_block_num'] == i

    # Boolean .loc selection and drop() both return new frames, so
    # `full_data` itself is never modified.
    data_train_x = full_data.loc[is_train, :].drop('cnt_shop_item', axis = 1)
    data_test_x = full_data.loc[is_test, :].drop('cnt_shop_item', axis = 1)
    data_train_y = full_data.loc[is_train, 'cnt_shop_item']

    print('training for month', i)
    xgtrain = xgb.DMatrix(data_train_x, data_train_y)
    # Fixed number of rounds; there is no validation set and no early stopping here.
    xgbresults = xgb.train(xgb_params, xgtrain, num_boost_round = 330, verbose_eval = 50)

    month_pred = xgbresults.predict(xgb.DMatrix(data_test_x))
    # ravel() always yields a 1-D array, so a month with a single row still
    # produces a list (squeeze() would collapse it to a scalar and break extend).
    stack_x.extend(np.ravel(month_pred).tolist())

training for month 28
training for month 29
training for month 30
training for month 31
training for month 32
training for month 33


In [30]:
# Persist the level-1 predictions, one value per line. np.savetxt does not
# create missing directories, so the target folder must already exist.
np.savetxt('Submission Time Series/Stacking/xgboost_train_level2.csv', stack_x, delimiter=',')

In [10]:
# Refit on train + validation combined, using the tuned parameters and a
# fixed number of boosting rounds (no evaluation set, no early stopping).
# Note that `stack_x` / `stack_y` are rebound here to the combined features
# and targets.
stack_x = pd.concat([dict_cat['train'], dict_cat['val']], axis=0)
stack_y = pd.concat([dict_cat['train_y'], dict_cat['val_y']], axis=0)

model = XGBRegressor(
    learning_rate=0.2,
    max_depth=best_params['depth'],
    min_child_weight=best_params['min_child_weight'],
    gamma=best_params['gamma'],
    subsample=best_params['subsample'],
    colsample_bytree=best_params['colsample_bytree'],
    eval_metric='rmse',
    seed=24,
    n_jobs=-1,
)
xgb_params = model.get_xgb_params()

xgtrain = xgb.DMatrix(stack_x, label=stack_y)
xgbresults = xgb.train(xgb_params, xgtrain, num_boost_round=330)

In [11]:
# Predict the test features with the booster refit on train + validation.
stack_test_y = xgbresults.predict(xgb.DMatrix(dict_cat['test']))

In [22]:
# Shift predictions down by one, then replace every value that is not
# strictly positive with 0.
# Alternative post-processing that is currently disabled:
#stack_test_level2_y = create_test_data(stack_test_level2_y, dict_cat)
stack_test_level2_y = [
    value if value > 0 else 0
    for value in stack_test_y - 1
]
np.savetxt(
    'Submission Time Series/Stacking/xgboost_test_level2_2.csv',
    stack_test_level2_y,
    delimiter=',',
)

In [23]:
# Hand the clipped predictions to the project helper.
# NOTE(review): 'xgboost_final' is the same name used for the earlier
# tuned-model submission; presumably this replaces that file -- confirm how
# create_submission_file derives its output path.
create_submission_file(stack_test_level2_y, 'xgboost_final', sales_test)