In [None]:
# default_exp models.catboost

# CatBoost

> Train a CatBoost regressor on daily sales data, with an optional Hyperopt hyperparameter search.

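References:

- CatBoost Python tutorial: https://github.com/catboost/tutorials/blob/master/python_tutorial.ipynb
- Kaggle discussion from the Avito Demand Prediction competition: https://www.kaggle.com/c/avito-demand-prediction/discussion/59880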

In [None]:
import pandas as pd
import os
import sys
import inspect
import numpy as np

import time
import sys
import gc


from catboost import CatBoostRegressor, Pool, cv

import hyperopt

## Data loading

In [None]:
# Read the raw sales records from disk and preview the first rows.
sales_csv_path = "../data/sales_train.csv"
data_df = pd.read_csv(sales_csv_path)
data_df.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [None]:
# Remove the raw date string; date_block_num stays in the frame as the time feature.
# errors="ignore" keeps this cell idempotent: re-running it after "date" is already
# gone no longer raises a KeyError.
data_df = data_df.drop(columns="date", errors="ignore")

In [None]:
# Time-based split on date_block_num: blocks before 33 form the training set,
# block 33 the validation set, and block 34 is set aside as test features.
# NOTE(review): X_test is not used by any later cell shown here; confirm that the
# loaded data actually contains rows for block 34.
target_col = 'item_cnt_day'
train_rows = data_df.date_block_num < 33
valid_rows = data_df.date_block_num == 33
test_rows = data_df.date_block_num == 34

X_train = data_df.loc[train_rows].drop(columns=[target_col])
Y_train = data_df.loc[train_rows, target_col]
X_valid = data_df.loc[valid_rows].drop(columns=[target_col])
Y_valid = data_df.loc[valid_rows, target_col]
X_test = data_df.loc[test_rows].drop(columns=[target_col])

In [None]:
# Show the first rows of the training features as a quick check of the split.
X_train.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_price
0,0,59,22154,999.0
1,0,25,2552,899.0
2,0,25,2552,899.0
3,0,25,2554,1709.05
4,0,25,2555,1099.0


In [None]:
# Release the full sales frame now that the splits have been taken from it.
# After this cell, re-running the split cell requires reloading data_df first.
del data_df
# The trailing semicolon keeps gc.collect()'s return value out of the cell output.
gc.collect();

## Model training

In [None]:
# Device switch for CatBoost: True selects task_type "GPU", False selects "CPU".
is_use_GPU = False

In [None]:
# CatBoost device selector derived from the is_use_GPU flag.
task_type = "GPU" if is_use_GPU else "CPU"

### Hyperopt

In [None]:
def hyperopt_objective(params, cat_features=None):
    """Score one hyperparameter sample by cross-validated RMSE.

    Reads the module-level ``X_train``, ``Y_train`` and ``task_type``.

    Parameters
    ----------
    params : dict
        Sample drawn by Hyperopt. Must contain 'l2_leaf_reg', 'learning_rate'
        and 'depth'; 'l2_leaf_reg' and 'depth' are converted with ``int``.
    cat_features : list, optional
        Categorical feature indices (or names) forwarded to the CatBoost
        ``Pool``. Defaults to None, i.e. no categorical features, which is
        also how the pool for the final model fit is built. Hyperopt calls
        the objective with ``params`` only, so bind another value with
        ``functools.partial`` if categorical columns should be declared.

    Returns
    -------
    float
        Smallest 'test-RMSE-mean' value reported by the cross-validation;
        lower is better, which is the direction Hyperopt minimises.
    """
    model = CatBoostRegressor(
        task_type=task_type,
        l2_leaf_reg=int(params['l2_leaf_reg']),
        learning_rate=params['learning_rate'],
        depth=int(params['depth']),
        iterations=500,
        # set explicitly so the CV parameters carry the same loss as the final model
        loss_function='RMSE',
        eval_metric='RMSE',
        random_seed=42,
        logging_level='Silent',
    )

    # The estimator is only used as a convenient way to assemble the parameter dict.
    cv_pool = Pool(X_train, Y_train, cat_features=cat_features)
    cv_data = cv(cv_pool, model.get_params())

    best_rmse = np.min(cv_data['test-RMSE-mean'])

    return best_rmse

In [None]:
# Set to True to run the Hyperopt search in the next cell; it is skipped while False.
do_tuning = False

In [None]:
if do_tuning:
    # Search space for the three tuned CatBoost parameters.
    params_space = {
        'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
        'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
        # integer depths 5..14
        'depth': hyperopt.hp.choice('depth', list(range(5, 15))),
    }

    trials = hyperopt.Trials()

    best = hyperopt.fmin(
        hyperopt_objective,
        space=params_space,
        algo=hyperopt.tpe.suggest,
        max_evals=50,
        trials=trials,
        # Seeded so the search is reproducible.
        # NOTE(review): newer hyperopt releases expect a numpy Generator here
        # (np.random.default_rng(123)); confirm against the installed version.
        rstate=np.random.RandomState(123),
    )

    # For hp.choice parameters `best` holds the index of the chosen option,
    # so 'depth' here is a position in the list above, not the depth itself.
    print(best)
    # Decode the indices back into the actual parameter values.
    print(hyperopt.space_eval(params_space, best))

### Model parametrization

In [None]:
# Fit the final CatBoost regressor on the training pool while tracking RMSE on the
# validation split; the cell's value is the elapsed wall-clock time in seconds.
ts = time.time()

print("task type:", task_type)

model = CatBoostRegressor(
    # execution and reproducibility
    task_type=task_type,
    thread_count=-1,
    random_seed=42,
    logging_level='Info',
    # objective and boosting hyperparameters
    loss_function='RMSE',
    iterations=1000,  # upper bound; early stopping below may end training sooner
    depth=7,
    l2_leaf_reg=1.0,
    learning_rate=0.36,
    early_stopping_rounds=50,
)

# No cat_features are passed, so no column is declared as categorical.
train_pool = Pool(X_train, Y_train)

model_fit = model.fit(
    train_pool,
    eval_set=[(X_valid, Y_valid)],
    use_best_model=True,
    verbose=True,
)

time.time() - ts

task type: CPU
0:	learn: 2.2294130	test: 9.7623564	best: 9.7623564 (0)	total: 762ms	remaining: 12m 41s
1:	learn: 2.2051488	test: 9.7571760	best: 9.7571760 (1)	total: 1.14s	remaining: 9m 31s
2:	learn: 2.1936169	test: 9.7581899	best: 9.7571760 (1)	total: 1.55s	remaining: 8m 37s
3:	learn: 2.1501633	test: 9.6044575	best: 9.6044575 (3)	total: 1.84s	remaining: 7m 37s
4:	learn: 2.1415261	test: 9.6035323	best: 9.6035323 (4)	total: 2.13s	remaining: 7m 5s
5:	learn: 2.1368785	test: 9.6032698	best: 9.6032698 (5)	total: 2.54s	remaining: 7m
6:	learn: 2.1271711	test: 9.6032873	best: 9.6032698 (5)	total: 2.84s	remaining: 6m 43s
7:	learn: 2.1240736	test: 9.6031107	best: 9.6031107 (7)	total: 3.27s	remaining: 6m 46s
8:	learn: 2.1198226	test: 9.6070751	best: 9.6031107 (7)	total: 3.56s	remaining: 6m 32s
9:	learn: 2.1158140	test: 9.6057547	best: 9.6031107 (7)	total: 3.85s	remaining: 6m 21s
10:	learn: 2.1068363	test: 9.6056186	best: 9.6031107 (7)	total: 4.15s	remaining: 6m 12s
11:	learn: 2.1054902	test: 9.60

93:	learn: 1.9008316	test: 8.9677856	best: 8.9676521 (91)	total: 32.6s	remaining: 5m 14s
94:	learn: 1.8998135	test: 8.9678121	best: 8.9676521 (91)	total: 33.3s	remaining: 5m 16s
95:	learn: 1.8988977	test: 8.9677911	best: 8.9676521 (91)	total: 33.6s	remaining: 5m 16s
96:	learn: 1.8986541	test: 8.9675947	best: 8.9675947 (96)	total: 33.9s	remaining: 5m 15s
97:	learn: 1.8980265	test: 8.9676082	best: 8.9675947 (96)	total: 34.3s	remaining: 5m 15s
98:	learn: 1.8978497	test: 8.9675173	best: 8.9675173 (98)	total: 34.6s	remaining: 5m 15s
99:	learn: 1.8942698	test: 8.9593814	best: 8.9593814 (99)	total: 35.4s	remaining: 5m 18s
100:	learn: 1.8935714	test: 8.9591959	best: 8.9591959 (100)	total: 35.8s	remaining: 5m 18s
101:	learn: 1.8932886	test: 8.9593890	best: 8.9591959 (100)	total: 36.5s	remaining: 5m 21s
102:	learn: 1.8914082	test: 8.9595186	best: 8.9591959 (100)	total: 36.9s	remaining: 5m 21s
103:	learn: 1.8911552	test: 8.9589037	best: 8.9589037 (103)	total: 37.4s	remaining: 5m 22s
104:	learn: 1

63.07235407829285

In [None]:
# Rank the features by the importance the fitted model reports for the training
# pool, highest first, and print one "name: score" line per feature.
feature_importances = model_fit.get_feature_importance(train_pool)
feature_names = X_train.columns
ranked_features = sorted(zip(feature_importances, feature_names), reverse=True)
for importance, column in ranked_features:
    print('{}: {}'.format(column, importance))

item_price: 33.16861862581194
shop_id: 30.35006874341807
item_id: 24.272024440825234
date_block_num: 12.209288189944767


## Predict on TEST

In [None]:
# Predictions of the fitted model on the training and validation features.
# NOTE(review): the section heading says TEST, but X_test is not scored here;
# confirm whether a prediction on X_test is still missing.
Y_pred_train = model_fit.predict(X_train)
Y_pred_valid = model_fit.predict(X_valid)

In [None]:
# Completion marker printed once execution reaches the end of the modelling cells.
print("done")

done


In [None]:
#hide
# Run nbdev's notebook-to-script export; called without arguments.
from nbdev.export import notebook2script
notebook2script()

Converted 00_connectors.gcp.ipynb.
Converted 01_nlp.fasttext.ipynb.
Converted 02_forecasting.dataprep.ipynb.
Converted 03_models.catboost.ipynb.
Converted 04_nlp.nbsvm.ipynb.
Converted index.ipynb.
