In [None]:
# default_exp models.catboost

# CatBoost

> Train a CatBoost regressor on monthly sales features, with optional hyperopt tuning of the model parameters.

https://github.com/catboost/tutorials/blob/master/python_tutorial.ipynb

https://www.kaggle.com/c/avito-demand-prediction/discussion/59880

In [None]:
import pandas as pd
import os
import sys
import inspect
import numpy as np

import time
import sys
import gc


from catboost import CatBoostRegressor, Pool, cv

import hyperopt

## Data loading

In [None]:
# Read the sales table from the repository's data directory and preview it.
sales_csv = "../data/sales_train.csv"
data_df = pd.read_csv(sales_csv)
data_df.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
1892944,27.08.2014,19,56,9055,99.0,1.0
1123531,01.11.2013,10,50,1416,1499.0,1.0
654351,12.07.2013,6,30,2720,749.0,1.0
712450,26.07.2013,6,46,6433,449.0,1.0
2104279,18.10.2014,21,46,17717,799.0,1.0


In [None]:
# Time-based split on `date_block_num`: blocks below 33 form the training set,
# block 33 is held out for validation and block 34 becomes `X_test`.
# NOTE(review): the preview of `sales_train.csv` printed above lists
# `item_cnt_day`, not `item_cnt_month` — confirm `data_df` holds the
# feature-engineered monthly table before running this cell.
TARGET_COLUMN = 'item_cnt_month'


def _features_and_target(rows):
    """Return `(features, target)` for `rows`, splitting off TARGET_COLUMN."""
    return rows.drop(columns=[TARGET_COLUMN]), rows[TARGET_COLUMN]


X_train, Y_train = _features_and_target(data_df[data_df.date_block_num < 33])
X_valid, Y_valid = _features_and_target(data_df[data_df.date_block_num == 33])
X_test = data_df[data_df.date_block_num == 34].drop(columns=[TARGET_COLUMN])

In [None]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,date_block_num,shop_id,item_id,city_code,item_category_id,type_code,subtype_code,item_cnt_month_lag_1,item_cnt_month_lag_2,item_cnt_month_lag_3,item_cnt_month_lag_6,item_cnt_month_lag_12,date_avg_item_cnt_lag_1,date_item_avg_item_cnt_lag_1,date_item_avg_item_cnt_lag_2,date_item_avg_item_cnt_lag_3,date_item_avg_item_cnt_lag_6,date_item_avg_item_cnt_lag_12,date_shop_avg_item_cnt_lag_1,date_shop_avg_item_cnt_lag_2,date_shop_avg_item_cnt_lag_3,date_shop_avg_item_cnt_lag_6,date_shop_avg_item_cnt_lag_12,date_cat_avg_item_cnt_lag_1,date_shop_cat_avg_item_cnt_lag_1,date_city_avg_item_cnt_lag_1,date_item_city_avg_item_cnt_lag_1,delta_price_lag,month,days,item_shop_last_sale,item_last_sale,item_shop_first_sale,item_first_sale
0,12,2,27,0,19,5,10,0.0,0.0,0.0,0.0,1.0,0.4114,0.087,0.04443,0.1305,0.06525,0.1555,0.1481,0.10065,0.08905,0.096,0.1412,1.082,0.9556,0.1481,0.0,-0.2827,0,31,1,1,12,12
1,12,2,30,0,40,11,4,0.0,0.0,0.0,0.0,0.0,0.4114,1.021,1.022,0.522,0.891,0.0,0.1481,0.10065,0.08905,0.096,0.0,0.2915,0.04623,0.1481,0.0,-0.4834,0,31,1,1,11,11
2,12,2,31,0,37,11,1,0.0,0.0,0.0,0.0,0.0,0.4114,0.5435,0.6,0.5435,0.3044,0.0,0.1481,0.10065,0.08905,0.096,0.0,0.2328,0.05945,0.1481,0.0,-0.1375,0,31,1,1,11,11
3,12,2,32,0,40,11,4,0.0,0.0,0.0,0.0,0.0,0.4114,1.935,1.8,1.261,1.892,5.38,0.1481,0.10065,0.08905,0.096,0.1412,0.2915,0.04623,0.1481,0.0,-0.4072,0,31,-1,1,12,12
4,12,2,33,0,37,11,1,1.0,2.0,0.0,0.0,1.0,0.4114,0.913,0.3333,0.7173,1.0,1.355,0.1481,0.10065,0.08905,0.096,0.1412,0.2328,0.05945,0.1481,1.0,-0.2255,0,31,1,1,12,12


In [None]:
# Release the full source frame now that the train/valid/test splits exist.
# The frame was loaded as `data_df`; the name `data` is never defined in this
# notebook, so `del data` raised NameError on a fresh kernel.
del data_df
gc.collect();

## Model training

In [None]:
# Switch to True to select the "GPU" task type instead of "CPU".
is_use_GPU = False

In [None]:
# Translate the GPU flag into the `task_type` string handed to CatBoost.
task_type = "GPU" if is_use_GPU else "CPU"

### Hyperopt

In [None]:
def hyperopt_objective(params, cat_features=None):
    """Score one hyperopt sample by cross-validated RMSE on the training data.

    Args:
        params: mapping with the sampled `l2_leaf_reg`, `learning_rate` and
            `depth` values; `l2_leaf_reg` and `depth` are cast to int.
        cat_features: optional categorical feature indices forwarded to the
            `Pool`. Defaults to None (no categorical features declared), the
            same setup as the pool used for the final fit in this notebook.
            The previous version read a global `categorical_features_indices`
            that is never defined here, so every evaluation failed with
            NameError.

    Returns:
        The smallest value of the `test-RMSE-mean` column reported by `cv`.
        Lower is better, which is the direction hyperopt minimises.
    """
    model = CatBoostRegressor(
        task_type=task_type,
        l2_leaf_reg=int(params['l2_leaf_reg']),
        learning_rate=params['learning_rate'],
        depth=int(params['depth']),
        iterations=500,
        eval_metric='RMSE',
        random_seed=42,
        logging_level='Silent',
    )

    # The model object is only used as a convenient way to assemble the
    # parameter dict that `cv` expects; it is never fitted itself.
    cv_pool = Pool(X_train, Y_train, cat_features=cat_features)
    cv_data = cv(cv_pool, model.get_params())

    return np.min(cv_data['test-RMSE-mean'])

In [None]:
# Set to True to run the hyperopt search in the next cell; it is skipped by default.
do_tuning = False

In [None]:
if do_tuning:
    # Search space handed to hyperopt: a quantised log-uniform draw for
    # l2_leaf_reg, a uniform draw for learning_rate, and one of the integer
    # depths 5..14.
    params_space = {
        'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
        'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
        'depth': hyperopt.hp.choice('depth', list(range(5, 15))),
    }

    trials = hyperopt.Trials()

    best = hyperopt.fmin(
        hyperopt_objective,
        space=params_space,
        algo=hyperopt.tpe.suggest,
        max_evals=50,
        trials=trials,
        # `RandomState` was referenced without ever being imported, which
        # raised NameError; reach it through the already-imported numpy.
        # NOTE(review): recent hyperopt releases expect
        # `np.random.default_rng(123)` here — confirm against the installed version.
        rstate=np.random.RandomState(123),
    )

    # `fmin` reports an `hp.choice` parameter as the index of the selected
    # option, so map the result back onto the search space to print the
    # actual depth value rather than its position in the list.
    print(hyperopt.space_eval(params_space, best))

### Model parametrization

In [None]:
# Fit the final regressor on the training split, validating on `X_valid`,
# and report the elapsed wall-clock time in seconds as the cell output.
ts = time.time()

print("task type:", task_type)

catboost_params = dict(
    task_type=task_type,
    loss_function='RMSE',
    random_seed=42,
    logging_level='Info',
    iterations=1000,  # upper bound; early stopping below can end training sooner
    depth=7,
    l2_leaf_reg=1.0,
    learning_rate=0.36,
    thread_count=-1,
    early_stopping_rounds=50,
)
model = CatBoostRegressor(**catboost_params)

# No `cat_features` are declared for this pool.
train_pool = Pool(X_train, Y_train)

model_fit = model.fit(
    train_pool,
    eval_set=[(X_valid, Y_valid)],
    use_best_model=True,
    verbose=True,
)

time.time() - ts

task type: GPU
0:	learn: 1.0635187	test: 1.0494160	best: 1.0494160 (0)	total: 133ms	remaining: 2m 13s
1:	learn: 0.9808246	test: 0.9990526	best: 0.9990526 (1)	total: 204ms	remaining: 1m 41s
2:	learn: 0.9351607	test: 0.9680294	best: 0.9680294 (2)	total: 277ms	remaining: 1m 31s
3:	learn: 0.9115793	test: 0.9561274	best: 0.9561274 (3)	total: 343ms	remaining: 1m 25s
4:	learn: 0.8987176	test: 0.9474366	best: 0.9474366 (4)	total: 424ms	remaining: 1m 24s
5:	learn: 0.8904159	test: 0.9441049	best: 0.9441049 (5)	total: 498ms	remaining: 1m 22s
6:	learn: 0.8845093	test: 0.9405823	best: 0.9405823 (6)	total: 579ms	remaining: 1m 22s
7:	learn: 0.8799512	test: 0.9388863	best: 0.9388863 (7)	total: 659ms	remaining: 1m 21s
8:	learn: 0.8771009	test: 0.9373010	best: 0.9373010 (8)	total: 740ms	remaining: 1m 21s
9:	learn: 0.8742543	test: 0.9356766	best: 0.9356766 (9)	total: 818ms	remaining: 1m 20s
10:	learn: 0.8728465	test: 0.9331059	best: 0.9331059 (10)	total: 895ms	remaining: 1m 20s
11:	learn: 0.8668628	test:

93:	learn: 0.8030781	test: 0.9131136	best: 0.9111744 (64)	total: 6.98s	remaining: 1m 7s
94:	learn: 0.8028629	test: 0.9132639	best: 0.9111744 (64)	total: 7.05s	remaining: 1m 7s
95:	learn: 0.8026979	test: 0.9134100	best: 0.9111744 (64)	total: 7.11s	remaining: 1m 6s
96:	learn: 0.8024345	test: 0.9138630	best: 0.9111744 (64)	total: 7.19s	remaining: 1m 6s
97:	learn: 0.8022806	test: 0.9136134	best: 0.9111744 (64)	total: 7.26s	remaining: 1m 6s
98:	learn: 0.8013758	test: 0.9121109	best: 0.9111744 (64)	total: 7.34s	remaining: 1m 6s
99:	learn: 0.8011781	test: 0.9121633	best: 0.9111744 (64)	total: 7.4s	remaining: 1m 6s
100:	learn: 0.8009983	test: 0.9119475	best: 0.9111744 (64)	total: 7.47s	remaining: 1m 6s
101:	learn: 0.8007753	test: 0.9135814	best: 0.9111744 (64)	total: 7.54s	remaining: 1m 6s
102:	learn: 0.8002699	test: 0.9132776	best: 0.9111744 (64)	total: 7.61s	remaining: 1m 6s
103:	learn: 0.8000729	test: 0.9132601	best: 0.9111744 (64)	total: 7.68s	remaining: 1m 6s
104:	learn: 0.7999300	test: 0

51.490495920181274

In [None]:
# Rank the features by importance, highest first, and print `name: score` lines.
feature_importances = model_fit.get_feature_importance(train_pool)
feature_names = X_train.columns

ranked_features = list(zip(feature_importances, feature_names))
ranked_features.sort(reverse=True)

for score, name in ranked_features:
    print('{}: {}'.format(name, score))

item_cnt_month_lag_1: 24.096973014999396
item_first_sale: 12.006346081698476
item_category_id: 11.571586104046894
date_item_avg_item_cnt_lag_1: 9.347904226652117
date_shop_cat_avg_item_cnt_lag_1: 4.792384156230253
month: 4.374915441564244
subtype_code: 4.234835590645327
item_cnt_month_lag_3: 3.9195514752137726
item_cnt_month_lag_2: 3.157629308588907
date_shop_avg_item_cnt_lag_1: 2.4298752525641234
delta_price_lag: 2.344605276021997
date_cat_avg_item_cnt_lag_1: 1.8257605079068258
item_id: 1.789004696570604
date_avg_item_cnt_lag_1: 1.7318593722736364
date_item_city_avg_item_cnt_lag_1: 1.7284243808433235
item_cnt_month_lag_6: 1.5795793795733843
type_code: 1.3516868759138587
date_item_avg_item_cnt_lag_2: 1.1456964671201675
date_block_num: 1.1236301086839122
item_shop_first_sale: 1.0011627440861472
shop_id: 0.9794114035607472
days: 0.8941072326132936
city_code: 0.5249867892289879
item_cnt_month_lag_12: 0.48348609097249273
date_item_avg_item_cnt_lag_3: 0.4461188839180178
date_city_avg_item_c

## Predict on TEST

In [None]:
# Score every split with the fitted model. `X_test` was built during the
# split but never used, although this section is about predicting on it.
Y_pred_train = model_fit.predict(X_train)
Y_pred_valid = model_fit.predict(X_valid)
Y_pred_test = model_fit.predict(X_test)

In [None]:
# Print a completion marker once the cells above have run.
print("done")

done


In [None]:
#hide
from nbdev.export import notebook2script
notebook2script()

Converted 00_connectors.gcp.ipynb.
Converted 01_nlp.fasttext.ipynb.
Converted 02_forecasting.dataprep.ipynb.
Converted 03_models.catboost.ipynb.
Converted 04_nlp.nbsvm.ipynb.
Converted index.ipynb.
