In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
from matplotlib import rc
rc('figure', figsize=(15, 5))
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
from timeit import default_timer as timer
import pickle
import hyperopt
from hyperopt import hp
from hyperopt.pyll.stochastic import sample
import seaborn as sns
from timeit import default_timer as timer
from numpy.random import RandomState
from hyperopt import STATUS_OK
import csv
import ast

In [2]:
# Rows of test.csv. Its shop_id/item_id values are used by create_test() to
# filter validation rows, and its row count sizes the ID column of the
# submission written by create_submission_file().
sales_test = pd.read_csv(r'test.csv')

In [3]:
def create_test_data(prediction, dict_cat):
  """Zero out predictions for test rows whose item or shop never appears in train/val.

  A test row is kept only when its ``item_id`` occurs somewhere in the combined
  train and validation frames AND its ``shop_id`` does too; the two membership
  checks are independent (they are not a check on the (shop, item) pair).

  Parameters
  ----------
  prediction : array-like supporting boolean-mask assignment
      One prediction per row of ``dict_cat['test']``. Modified in place.
  dict_cat : dict
      Must hold DataFrames under the keys 'train', 'val' and 'test', each with
      'item_id' and 'shop_id' columns.

  Returns
  -------
  The same ``prediction`` object that was passed in, after masking.
  """
  seen_rows = pd.concat([dict_cat['train'], dict_cat['val']], axis=0)
  test_rows = dict_cat['test']

  item_is_known = test_rows['item_id'].isin(seen_rows['item_id'])
  shop_is_known = test_rows['shop_id'].isin(seen_rows['shop_id'])
  unknown_rows = ~(item_is_known & shop_is_known)

  prediction[unknown_rows] = 0
  return prediction

In [4]:
def create_test(test, reference=None):
    """Filter rows to shops/items present in a reference frame and split off the target.

    A row is kept when its ``shop_id`` occurs in ``reference['shop_id']`` AND its
    ``item_id`` occurs in ``reference['item_id']``. The two checks are independent,
    so a (shop, item) pair that never occurs together in ``reference`` still passes
    as long as both ids occur somewhere in it.

    Parameters
    ----------
    test : pandas.DataFrame
        Frame with 'shop_id', 'item_id' and 'cnt_shop_item' columns. It is not
        modified.
    reference : pandas.DataFrame, optional
        Frame supplying the allowed 'shop_id' and 'item_id' values. When omitted,
        the module-level ``sales_test`` frame is used.

    Returns
    -------
    tuple
        ``(features, target)``: the filtered frame without 'cnt_shop_item', and
        the 'cnt_shop_item' column of the filtered rows.
    """
    if reference is None:
        # Fall back to the notebook-level test set.
        reference = sales_test

    shop_is_known = test['shop_id'].isin(reference['shop_id'])
    item_is_known = test['item_id'].isin(reference['item_id'])
    kept = test.loc[shop_is_known & item_is_known, :]

    target = kept['cnt_shop_item'].copy()
    # drop() returns a new frame, so neither `test` nor `kept` is mutated.
    features = kept.drop('cnt_shop_item', axis=1)
    return features, target

In [5]:
def create_cv_sets2(months, data, test_month=34):
    """Split ``data`` into train / validation / test sets by ``date_block_num``.

    Parameters
    ----------
    months : iterable of int
        ``date_block_num`` values held out for validation. Any iterable is
        accepted; it is converted to a list before use so that a NumPy array is
        not added element-wise to the test month.
    data : pandas.DataFrame
        Frame with a 'date_block_num' column and the target 'cnt_shop_item'.
    test_month : int, optional
        ``date_block_num`` of the test split. Defaults to 34.

    Returns
    -------
    dict
        'train'   : features of every row outside ``months`` and ``test_month``
        'train_y' : 'cnt_shop_item' for those rows
        'val'     : validation features, as returned by ``create_test``
        'val_y'   : validation target, as returned by ``create_test``
        'test'    : features of the ``test_month`` rows (no target is returned)
    """
    val_months = list(months)
    block = data['date_block_num']

    # One mask shared by X_train and y_train so the two stay row-aligned.
    is_train = ~block.isin(val_months + [test_month])

    X_train = data.loc[is_train, :].drop('cnt_shop_item', axis=1)
    y_train = data.loc[is_train, 'cnt_shop_item']

    # create_test filters the validation rows and separates their target.
    X_val, y_val = create_test(data.loc[block.isin(val_months), :])

    X_test = data.loc[block == test_month, :].drop('cnt_shop_item', axis=1)

    return {'train': X_train, 'val': X_val, 'test': X_test,
            'train_y': y_train, 'val_y': y_val}

In [30]:
def create_submission_file(pred, name):
    """Write predictions to 'Submission Time Series/<name>.csv'.

    The file has two columns: 'ID', running from 0 to the number of rows in the
    module-level ``sales_test`` minus one, and 'item_cnt_month', holding ``pred``.

    Parameters
    ----------
    pred : array-like
        Predicted values placed in the 'item_cnt_month' column.
    name : str
        File name without the '.csv' extension.
    """
    row_count = sales_test.shape[0]
    submission = pd.DataFrame({'ID': np.arange(0, row_count), 'item_cnt_month': pred})
    target_path = 'Submission Time Series/'+name+'.csv'
    submission.to_csv(target_path, index=False)

In [17]:
# Load the pickled modelling table.
# NOTE(review): pickle.load can execute arbitrary code while deserialising, so
# full_data.pkl must come from a trusted source.
with open(r"full_data.pkl", "rb") as input_file:
    full_data = pickle.load(input_file)

In [18]:
# Sanity check: (rows, columns) of the loaded table.
full_data.shape

(1823324, 63)

In [19]:
# Hold out date_block_num 9, 21 and 33 for validation; create_cv_sets2 puts
# block 34 in the 'test' split and everything else in 'train'.
dict_cat = create_cv_sets2([9,21,33], full_data)

In [10]:
def objective(params):
    """Hyperopt objective: fit CatBoost with ``params`` and report validation RMSE.

    Uses module-level state: ``dict_cat`` (train/validation splits), ``out_file``
    (path of the CSV trial log) and ``ITERATION`` (trial counter, incremented on
    every call).

    Parameters
    ----------
    params : dict
        Sampled point with the keys 'l2_leaf_reg', 'rsm' and 'depth'.

    Returns
    -------
    dict
        'iteration', 'loss' (validation RMSE, lower is better), 'params',
        'train_time' (seconds spent fitting and evaluating) and 'status'.
    """
    global ITERATION
    ITERATION += 1
    print('ITERATION: %d' %(ITERATION))
    print('params: ', params)

    # hp.quniform yields floats (e.g. 16.0); tree depth is an integer setting,
    # so cast it explicitly instead of relying on the library to coerce it.
    model = CatBoostRegressor(iterations=5000, l2_leaf_reg=params['l2_leaf_reg'], learning_rate=0.2,
                              rsm=params['rsm'], random_seed=24, depth=int(params['depth']))
    start = timer()
    model.fit(dict_cat['train'], dict_cat['train_y'], eval_set=(dict_cat['val'], dict_cat['val_y']),
              early_stopping_rounds=100, logging_level='Silent')

    # Compute RMSE directly rather than calling model.score(): what score()
    # returns for a regressor differs between CatBoost releases (RMSE vs. R^2),
    # and fmin *minimises* this value, so an R^2 would select the worst model.
    val_pred = np.asarray(model.predict(dict_cat['val']), dtype=float)
    val_true = np.asarray(dict_cat['val_y'], dtype=float)
    loss = float(np.sqrt(np.mean((val_pred - val_true) ** 2)))

    train_time = timer() - start
    del model  # release the fitted model before the next trial
    print('loss: %.5f' %(loss))

    # Append this trial to the log; the context manager closes the handle so
    # repeated trials do not accumulate open files.
    with open(out_file, 'a', newline='') as of_connection:
        csv.writer(of_connection).writerow([ITERATION, loss, params, train_time])

    return {'iteration': ITERATION, 'loss': loss, 'params': params,
            'train_time': train_time, 'status': STATUS_OK}

In [11]:
# Search space handed to hyperopt.
#  - l2_leaf_reg: multiples of 1 between 1 and 100
#  - rsm: 1 minus a log-uniform draw from [0.01, 0.35], i.e. values in
#         [0.65, 0.99]
#  - depth: multiples of 1 between 5 and 16
# quniform returns floats (a sample looks like {'depth': 10.0, ...}).
params_space = dict(
    l2_leaf_reg=hp.quniform('l2_leaf_reg', 1, 100, 1),
    rsm=1 - hp.loguniform('rsm', np.log(0.01), np.log(0.35)),
    depth=hp.quniform('depth', 5, 16, 1),
)

In [12]:
# Draw one random configuration to eyeball the search space.
sample(params_space)

{'depth': 10.0, 'l2_leaf_reg': 99.0, 'rsm': 0.9233874278606713}

In [13]:
# CSV log that objective() appends one row to per trial.
out_file = 'cat_trials_pfs4_2.csv'

# Create (or truncate) the log and write the header row. The context manager
# closes the handle even if the write raises; newline='' is what the csv module
# expects so that it controls line endings itself.
with open(out_file, 'w', newline='') as of_connection:
    writer = csv.writer(of_connection)
    writer.writerow(['iteration', 'loss', 'params', 'train_time'])

In [None]:
# Reset the counter that objective() increments, then run 50 TPE evaluations.
# The counter's initial value (0) also seeds the search.
# NOTE(review): recent hyperopt releases appear to expect a numpy Generator for
# `rstate` rather than a RandomState — confirm against the installed version.
ITERATION = 0
trials = hyperopt.Trials()
best = hyperopt.fmin(
    fn=objective,
    space=params_space,
    algo=hyperopt.tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=RandomState(ITERATION),
)

ITERATION: 1
params:  {'depth': 16.0, 'l2_leaf_reg': 69.0, 'rsm': 0.9437023664900684}


In [9]:
# Load the trial log, order it by ascending loss, and parse the params string
# of the best (first) row back into a dict.
# NOTE(review): this reads 'cat_trials_pfs4.csv', whereas the tuning cell writes
# 'cat_trials_pfs4_2.csv' — presumably an earlier completed run; confirm.
cat_tune_results = (
    pd.read_csv('cat_trials_pfs4.csv')
    .sort_values('loss', ascending=True)
    .reset_index()  # keeps the original row number in an 'index' column
)
best_params = ast.literal_eval(cat_tune_results.loc[0, 'params'])

In [10]:
# Show the hyper-parameters of the lowest-loss trial.
best_params

{'depth': 12.0, 'l2_leaf_reg': 42.0, 'rsm': 0.7879629501355983}

Tuning the learning rate and n_estimators

In [11]:
# Refit with the tuned hyper-parameters and a much larger iteration budget;
# early stopping on the validation set decides how many trees are kept.
# Metrics are reported every 200 iterations.
model = CatBoostRegressor(
    iterations=15000,
    learning_rate=0.2,
    depth=best_params['depth'],
    l2_leaf_reg=best_params['l2_leaf_reg'],
    rsm=best_params['rsm'],
    random_seed=24,
    metric_period=200,
)
model.fit(
    dict_cat['train'],
    dict_cat['train_y'],
    eval_set=(dict_cat['val'], dict_cat['val_y']),
    early_stopping_rounds=100,
)

0:	learn: 2.8338432	test: 3.0894248	best: 3.0894248 (0)	total: 1.56s	remaining: 6h 31m 4s
200:	learn: 1.3618551	test: 1.5810148	best: 1.5810148 (200)	total: 3m 51s	remaining: 4h 43m 35s
400:	learn: 1.2844462	test: 1.5742839	best: 1.5734050 (392)	total: 7m 27s	remaining: 4h 31m 17s
600:	learn: 1.2329980	test: 1.5710585	best: 1.5706888 (594)	total: 11m 6s	remaining: 4h 26m 5s
800:	learn: 1.1922826	test: 1.5699336	best: 1.5697052 (767)	total: 14m 45s	remaining: 4h 21m 33s
1000:	learn: 1.1560746	test: 1.5668903	best: 1.5668131 (999)	total: 18m 26s	remaining: 4h 17m 54s
1200:	learn: 1.1236681	test: 1.5658752	best: 1.5654592 (1192)	total: 22m 6s	remaining: 4h 14m 4s
1400:	learn: 1.0928341	test: 1.5651970	best: 1.5647866 (1341)	total: 25m 49s	remaining: 4h 10m 38s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 1.5647866
bestIteration = 1341

Shrink model to first 1342 iterations.


<catboost.core.CatBoostRegressor at 0x2af8c283320>

In [26]:
# Persist the fitted model using the highest pickle protocol available.
with open('cat_boost_final.pkl', 'wb') as handle:
    pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)

Creating data for stacking

In [15]:
# date_block_num values 28 through 33 (inclusive) used to build the level-2
# training predictions.
stack_months = np.arange(28, 33 + 1)

In [20]:
# Target values of every row that falls in one of the stacking months.
in_stack_months = full_data['date_block_num'].isin(stack_months)
stack_y = full_data.loc[in_stack_months, 'cnt_shop_item']

In [22]:
# Level-2 training features: for each stacking month, fit on all strictly
# earlier months and predict that month. The fixed 1342 iterations match the
# tree count the early-stopped run was shrunk to.
# NOTE(review): predictions are appended month by month, while stack_y keeps
# full_data's row order — the two line up only if full_data is sorted by
# date_block_num; confirm.
stack_x = []
for i in stack_months:
  is_before = full_data['date_block_num'] < i
  is_current = full_data['date_block_num'] == i

  # drop() already returns a fresh frame, so no separate copy is needed.
  data_train_x = full_data.loc[is_before, :].drop('cnt_shop_item', axis=1)
  data_test_x = full_data.loc[is_current, :].drop('cnt_shop_item', axis=1)
  data_train_y = full_data.loc[is_before, 'cnt_shop_item'].copy()
  data_test_y = full_data.loc[is_current, 'cnt_shop_item'].copy()

  model = CatBoostRegressor(
      iterations=1342,
      learning_rate=0.2,
      depth=best_params['depth'],
      l2_leaf_reg=best_params['l2_leaf_reg'],
      rsm=best_params['rsm'],
      random_seed=24,
  )
  print('training for month', i)
  model.fit(data_train_x, data_train_y, logging_level='Silent')

  month_pred = np.squeeze(model.predict(data_test_x))
  stack_x.extend(month_pred.tolist())

training for month 28
training for month 29
training for month 30
training for month 31
training for month 32
training for month 33


In [23]:
# Number of out-of-fold predictions collected across the stacking months.
len(stack_x)

192351

In [24]:
# Save the level-2 training predictions, one value per line.
np.savetxt('Submission Time Series/Stacking/catboost_train_level2.csv', stack_x, delimiter=',')

In [25]:
# Refit on train + validation combined, with the iteration count fixed at 1342.
# Note that stack_x / stack_y are rebound here: they no longer hold the
# out-of-fold predictions and targets from the stacking cells.
stack_x = pd.concat([dict_cat[part] for part in ('train', 'val')], axis=0)
stack_y = pd.concat([dict_cat[part] for part in ('train_y', 'val_y')], axis=0)
model = CatBoostRegressor(
    iterations=1342,
    learning_rate=0.2,
    depth=best_params['depth'],
    l2_leaf_reg=best_params['l2_leaf_reg'],
    rsm=best_params['rsm'],
    random_seed=24,
)
model.fit(stack_x, stack_y, verbose=200)

0:	learn: 2.8441404	total: 1.03s	remaining: 22m 58s
200:	learn: 1.3643436	total: 3m 43s	remaining: 21m 9s
400:	learn: 1.2904161	total: 7m 21s	remaining: 17m 15s
600:	learn: 1.2371631	total: 11m 2s	remaining: 13m 36s
800:	learn: 1.1966821	total: 14m 42s	remaining: 9m 56s
1000:	learn: 1.1592731	total: 18m 32s	remaining: 6m 19s
1200:	learn: 1.1278170	total: 22m 16s	remaining: 2m 36s
1341:	learn: 1.1067424	total: 24m 55s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x2af8c283898>

In [26]:
# Predict the test split (date_block_num 34) with the train+val model.
stack_test_y = model.predict(dict_cat['test'])

In [28]:
# Shift every prediction down by 1 and replace anything that is not strictly
# positive with 0, then save the result for the level-2 model.
# NOTE(review): the "- 1" presumably undoes an offset applied to the target
# upstream — confirm. The create_test_data() masking step is not applied here
# (its call was disabled).
stack_test_level2_y = [
    shifted if shifted > 0 else 0
    for shifted in stack_test_y - 1
]
np.savetxt('Submission Time Series/Stacking/catboost_test_level2.csv', stack_test_level2_y, delimiter=',')

In [33]:
# Write the final predictions to 'Submission Time Series/catboost_final.csv'.
create_submission_file(stack_test_level2_y, 'catboost_final')