# CatBoost

### Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

import datetime
import gc
import logging
import os
from collections import Counter, defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold

from utils.schemas import *
from utils.functions import *

### Logging

In [2]:
# Log file shared by all runs of this notebook (basicConfig opens it in append mode by default).
LOG_NAME = 'logs/CatBoost.log'
# Create the log directory up front: the logging file handler does not create
# missing parent directories, so a fresh checkout would otherwise fail here.
os.makedirs(os.path.dirname(LOG_NAME), exist_ok=True)
# The threshold is WARNING, and the notebook writes its own progress messages at that level.
logging.basicConfig(filename=LOG_NAME, level=logging.WARNING, format='%(asctime)s %(message)s')
# An empty record followed by a banner marks the start of a new run in the log file.
logging.warning("")
logging.warning("##### New CatBoost Model #####")

### Data

In [3]:
# Feature-importance table; the columns read from it below are `feature` and `importance`.
# The date in the file name suggests a 2019-08-31 export — presumably from an earlier model run; confirm.
df_imp = pd.read_csv('docs/ft_importances_20190831.csv')

In [4]:
# Keep only the features whose importance is strictly above the 0.08 cut-off,
# in the order they appear in the importance table.
is_important = df_imp['importance'] > 0.08
X_cols = df_imp.loc[is_important, 'feature'].tolist()

In [5]:
# Number of features selected for the model.
len(X_cols)

227

In [6]:
# Display the selected feature names.
X_cols

['R1',
 'card1',
 'R12',
 'TransactionAmt',
 'N6',
 'C1',
 'N1',
 'N5',
 'card2',
 'R4',
 'addr1',
 'R11',
 'R3',
 'R7',
 'N3',
 'R29',
 'C13',
 'card1_fe1',
 'R8',
 'R28',
 'R9',
 'C11',
 'R17',
 'D2',
 'C14',
 'R13',
 'D1',
 'R5',
 'C6',
 'Transaction_hour',
 'card2_fe1',
 'dist1',
 'C2',
 'R21',
 'R2',
 'addr1_fe1',
 'M5',
 'card5',
 'N2',
 'V307',
 'card1_fe2',
 'M5_fe2',
 'id_02',
 'R19',
 'P_emaildomain',
 'P_emaildomain_0_fe1',
 'N4',
 'DeviceInfo',
 'Transaction_day_of_week',
 'P_emaildomain_0',
 'addr1_fe2',
 'V258',
 'V257',
 'C12',
 'C9',
 'M4_fe2',
 'id_31',
 'V317',
 'card2_fe2',
 'V45',
 'id_20',
 'id_19',
 'card6',
 'P_emaildomain_fe1',
 'V308',
 'P_emaildomain_fe2',
 'R20',
 'V315',
 'V310',
 'V53',
 'C8',
 'M6_fe1',
 'V313',
 'V201',
 'M4',
 'R18',
 'C4',
 'card6_fe1',
 'card6_fe2',
 'R_emaildomain_0',
 'V314',
 'V283',
 'P_emaildomain_0_fe2',
 'V282',
 'V285',
 'V200',
 'C10',
 'V83',
 'R14',
 'R27',
 'card5_fe1',
 'V189',
 'card5_fe2',
 'V294',
 'R22',
 'C7',
 'V127'

In [7]:
# Directory that holds the engineered train/test files and the sample submission.
data_folder = 'input'

In [8]:
# Read only the needed columns from the engineered datasets. Dtypes come from
# schema_ft_eng_1, which is not defined in this notebook (presumably provided by
# the star import from utils.schemas — confirm).
train_path = data_folder + '/train_ft_eng_1.zip'
test_path = data_folder + '/test_ft_eng_1.zip'

# The training file additionally supplies the target and the transaction timestamp.
train = pd.read_csv(
    train_path,
    dtype=schema_ft_eng_1,
    usecols=X_cols + ['isFraud', 'TransactionDT'],
)
test = pd.read_csv(test_path, dtype=schema_ft_eng_1, usecols=X_cols)

In [9]:
# Feature matrix (columns in X_cols order) and the fraud label used as target.
X = train.loc[:, X_cols]
y = train['isFraud']

### Model

In [10]:
# Record the feature list in the run log.
logging.warning("Used columns: {}".format(X_cols))
# Number of cross-validation folds; also the divisor of the per-fold averages computed after the CV loop.
k = 5
logging.warning("Folds number: {}".format(k))

In [11]:
# Row labels of X; handed to skf.split() as the samples argument.
train_ids = X.index
# Shuffled, stratified k-fold splitter with a fixed seed so the folds are repeatable.
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

#### CatBoost

In [12]:
# CatBoost hyper-parameters shared by the cross-validation fits and the final fit.
# 'iterations' is replaced before the final fit with a value derived from the CV results.
params = dict(
    depth=11,
    iterations=20000,
    eval_metric='AUC',
    random_seed=42,
    logging_level='Verbose',
    allow_writing_files=False,
    early_stopping_rounds=20,
    learning_rate=0.01,
    thread_count=8,
    boosting_type='Plain',
    bootstrap_type='Bernoulli',
    rsm=0.6,
)

In [13]:
# Classifier instance that is re-fitted on every cross-validation fold below.
model_cb = CatBoostClassifier(**params)

In [14]:
# Log the parameters as reported by the model object itself.
logging.warning("Params: {}".format(str(model_cb.get_params())))

In [15]:
# Positional indices (within X's column order) of the categorical features,
# later passed to fit() as `cat_features`.
# `cat_ft` is not defined in this notebook — presumably provided by the star
# import from utils.schemas; confirm.
cat_ft_id = [pos for pos, col in enumerate(X.columns) if col in cat_ft]

In [16]:
# Stratified k-fold cross-validation: re-fit model_cb on each fold, log the best
# validation AUC and best iteration, and accumulate both for the averages below.
counter = 1     # 1-based fold number used in the messages
auc_score = 0   # running sum of the per-fold best AUC
iterat = 0      # running sum of the per-fold best iteration
for train_index, test_index in skf.split(train_ids, y):
    print('Fold {}\n'.format(counter))
    logging.warning("Training fold {}".format(counter))

    # Positional split of features and target for this fold.
    X_fit = X.iloc[train_index, :]
    X_val = X.iloc[test_index, :]
    y_fit = y.iloc[train_index]
    y_val = y.iloc[test_index]

    # The held-out part is passed as eval_set; progress is printed every 100 iterations.
    model_cb.fit(
        X_fit,
        y_fit,
        cat_features=cat_ft_id,
        eval_set=(X_val, y_val),
        verbose=100,
    )

    # Read each fold metric once, then log it and add it to the running totals.
    fold_auc = model_cb.best_score_['validation']['AUC']
    fold_best_iter = model_cb.best_iteration_
    logging.warning("Best AUC in this fold: {}".format(fold_auc))
    logging.warning("Best iteration in this fold: {}".format(fold_best_iter))
    auc_score += fold_auc
    iterat += fold_best_iter

    # Drop this fold's slices and index arrays before the next fit.
    del X_fit, X_val, y_fit, y_val, train_index, test_index
    gc.collect()

    counter += 1

# Averages over the k folds; later cells use them to size the final model and
# to name the submission file.
mean_auc_score = auc_score / k
mean_iterat = iterat / k

logging.warning("Mean AUC in {0} folds: {1}".format(k, mean_auc_score))
logging.warning("Mean iterations in {0} folds: {1}".format(k, mean_iterat))

Fold 1

0:	test: 0.5756614	best: 0.5756614 (0)	total: 796ms	remaining: 4h 25m 20s
100:	test: 0.8692477	best: 0.8692477 (100)	total: 7m 51s	remaining: 1d 1h 46m 37s
200:	test: 0.9228785	best: 0.9228785 (200)	total: 19m 58s	remaining: 1d 8h 47m 42s
300:	test: 0.9349212	best: 0.9349212 (300)	total: 32m 5s	remaining: 1d 11h 22s
400:	test: 0.9414264	best: 0.9414264 (400)	total: 42m 53s	remaining: 1d 10h 56m 16s
500:	test: 0.9455488	best: 0.9455488 (500)	total: 55m 46s	remaining: 1d 12h 10m 38s
600:	test: 0.9491994	best: 0.9491994 (600)	total: 1h 6m 34s	remaining: 1d 11h 49m 2s
700:	test: 0.9521104	best: 0.9521104 (700)	total: 1h 17m 10s	remaining: 1d 11h 24m 47s
800:	test: 0.9543357	best: 0.9543357 (800)	total: 1h 27m 40s	remaining: 1d 11h 1m 18s
900:	test: 0.9560913	best: 0.9560913 (900)	total: 1h 38m 10s	remaining: 1d 10h 41m 7s
1000:	test: 0.9575703	best: 0.9575703 (1000)	total: 1h 48m 45s	remaining: 1d 10h 24m 12s
1100:	test: 0.9588123	best: 0.9588123 (1100)	total: 1h 59m 28s	remaining:

KeyboardInterrupt: 

In [None]:
# Size the final model from the CV result: mean best iteration plus 10%, truncated to int.
# NOTE(review): mean_iterat is only assigned after the CV loop finishes, so this
# cell needs that cell to have run to completion.
params['iterations'] = int(mean_iterat*1.1)
params['iterations']

In [None]:
# Fresh classifier built from the current `params`, fitted on the full training set.
# No eval_set is passed here; verbose=200 sets the progress-reporting interval.
model_cb = CatBoostClassifier(**params)
model_cb.fit(X, y, cat_features=cat_ft_id, verbose=200)

In [None]:
# Class probabilities for the test set, with columns in the same order as the training features.
y_preds = model_cb.predict_proba(test[X.columns])

In [None]:
# Submission template; its 'isFraud' column is filled with the predictions below.
df_sub = pd.read_csv(data_folder+'/sample_submission.csv')

In [None]:
# Second column of the predict_proba output — presumably P(isFraud == 1); confirm the class order.
# NOTE(review): assumes the template rows are in the same order as the rows of `test`.
df_sub['isFraud'] = y_preds[:,1]

In [None]:
# Quick look at the first rows of the submission.
df_sub.head()

In [None]:
# Today's date as a compact YYYYMMDD stamp; it becomes the prefix of the submission name.
today = datetime.date.today()
D = format(today, '%Y%m%d')

In [None]:
# File stem of the submission: <date stamp>_CatBoost_<mean CV AUC>.
submission_name = '{0}_CatBoost_{1}'.format(D, mean_auc_score)

In [None]:
# Record the submission name in the run log.
logging.warning("Submission name: {}".format(submission_name))

In [None]:
# Write the submission as submissions/<submission_name>.csv.
# Make sure the output directory exists first, so a fresh checkout does not
# fail at the very last step after the long training run.
os.makedirs('submissions', exist_ok=True)
# index=False: do not write the row labels as a column.
df_sub.to_csv('submissions/{}.csv'.format(submission_name), sep=',', header=True, index=False)

In [None]:
# Mark the end of this run in the log file.
logging.warning("End")