# CatBoost

### Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import gc
import logging
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
import datetime

from utils.schemas import *
from utils.functions import *

### Logging

In [2]:
# Send this notebook's log records to a file. The level is WARNING, and the
# notebook reports its own progress through logging.warning().
LOG_NAME = 'logs/CatBoost.log'
LOG_FORMAT = '%(asctime)s %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.WARNING, filename=LOG_NAME)

# An empty record followed by a banner separates this run from earlier ones.
# NOTE(review): "Catboot" looks like a typo for "CatBoost"; kept verbatim.
for banner_line in ("", "##### New Catboot Model #####"):
    logging.warning(banner_line)

### Data

In [3]:
# Load the feature-importance table used to choose the model's input columns.
IMPORTANCES_CSV = 'docs/ft_importances_20190831.csv'
df_imp = pd.read_csv(IMPORTANCES_CSV)

In [4]:
# Keep only the features whose importance is strictly above the cut-off.
IMPORTANCE_CUTOFF = 0.08
is_selected = df_imp['importance'] > IMPORTANCE_CUTOFF
X_cols = df_imp.loc[is_selected, 'feature'].to_list()

In [9]:
# Number of features that passed the importance filter.
len(X_cols)

227

In [10]:
# Display the names of the selected features.
X_cols

['R1',
 'card1',
 'R12',
 'TransactionAmt',
 'N6',
 'C1',
 'N1',
 'N5',
 'card2',
 'R4',
 'addr1',
 'R11',
 'R3',
 'R7',
 'N3',
 'R29',
 'C13',
 'card1_fe1',
 'R8',
 'R28',
 'R9',
 'C11',
 'R17',
 'D2',
 'C14',
 'R13',
 'D1',
 'R5',
 'C6',
 'Transaction_hour',
 'card2_fe1',
 'dist1',
 'C2',
 'R21',
 'R2',
 'addr1_fe1',
 'M5',
 'card5',
 'N2',
 'V307',
 'card1_fe2',
 'M5_fe2',
 'id_02',
 'R19',
 'P_emaildomain',
 'P_emaildomain_0_fe1',
 'N4',
 'DeviceInfo',
 'Transaction_day_of_week',
 'P_emaildomain_0',
 'addr1_fe2',
 'V258',
 'V257',
 'C12',
 'C9',
 'M4_fe2',
 'id_31',
 'V317',
 'card2_fe2',
 'V45',
 'id_20',
 'id_19',
 'card6',
 'P_emaildomain_fe1',
 'V308',
 'P_emaildomain_fe2',
 'R20',
 'V315',
 'V310',
 'V53',
 'C8',
 'M6_fe1',
 'V313',
 'V201',
 'M4',
 'R18',
 'C4',
 'card6_fe1',
 'card6_fe2',
 'R_emaildomain_0',
 'V314',
 'V283',
 'P_emaildomain_0_fe2',
 'V282',
 'V285',
 'V200',
 'C10',
 'V83',
 'R14',
 'R27',
 'card5_fe1',
 'V189',
 'card5_fe2',
 'V294',
 'R22',
 'C7',
 'V127'

In [6]:
# Directory (relative to the notebook) that the input CSV files are read from.
data_folder = 'input'

In [7]:
# Read only the columns that are needed: the selected features for both sets,
# plus the target (isFraud) and TransactionDT for the training set.
train_columns = X_cols + ['isFraud', 'TransactionDT']
train = pd.read_csv(
    data_folder + '/train_ft_eng_1.csv',
    usecols=train_columns,
    dtype=schema_ft_eng_1,
)
test = pd.read_csv(
    data_folder + '/test_ft_eng_1.csv',
    usecols=X_cols,
    dtype=schema_ft_eng_1,
)

In [8]:
# Split the training frame into the target vector and the feature matrix.
y = train['isFraud']
X = train[X_cols]

### Model

In [11]:
# Number of cross-validation folds.
k = 5

# Record the run configuration in the log: feature list first, then fold count.
used_columns_msg = "Used columns: {}".format(X_cols)
folds_msg = "Folds number: {}".format(k)
logging.warning(used_columns_msg)
logging.warning(folds_msg)

In [12]:
# Row labels of the feature matrix, and a shuffled, seeded stratified splitter
# with k folds.
train_ids = X.index
skf = StratifiedKFold(k, shuffle=True, random_state=42)

#### CatBoost

In [13]:
# Hyper-parameters handed to CatBoostClassifier(**params).
params = dict(
    depth=11,
    iterations=20000,
    eval_metric='AUC',
    random_seed=42,
    logging_level='Verbose',
    allow_writing_files=False,
    early_stopping_rounds=20,
    learning_rate=0.01,
    thread_count=8,
    boosting_type='Plain',
    bootstrap_type='Bernoulli',
    rsm=0.6,
)

In [14]:
# Instantiate the classifier with the hyper-parameters held in params.
model_cb = CatBoostClassifier(**params)

In [15]:
# Log the parameters as reported back by the model object itself.
model_params = model_cb.get_params()
logging.warning("Params: {}".format(model_params))

In [16]:
# Positional indices, in X's column order, of the columns that appear in
# cat_ft.
cat_ft_id = [
    position
    for position, column in enumerate(X.columns)
    if column in cat_ft
]

In [17]:
# Stratified k-fold cross-validation. model_cb is refit on every fold; the best
# validation AUC and best iteration of each fold are logged and collected, and
# their means over the k folds are logged at the end.
fold_aucs = []
fold_best_iterations = []

for fold_no, (fit_idx, val_idx) in enumerate(skf.split(train_ids, y), start=1):
    print('Fold {}\n'.format(fold_no))
    logging.warning("Training fold {}".format(fold_no))

    X_fit, y_fit = X.iloc[fit_idx, :], y.iloc[fit_idx]
    X_val, y_val = X.iloc[val_idx, :], y.iloc[val_idx]

    # The held-out fold is the eval_set, so best_score_/best_iteration_ below
    # refer to it.
    model_cb.fit(
        X_fit,
        y_fit,
        cat_features=cat_ft_id,
        eval_set=(X_val, y_val),
        verbose=100,
    )

    fold_auc = model_cb.best_score_['validation']['AUC']
    fold_best_iteration = model_cb.best_iteration_
    logging.warning("Best AUC in this fold: {}".format(fold_auc))
    logging.warning("Best iteration in this fold: {}".format(fold_best_iteration))
    fold_aucs.append(fold_auc)
    fold_best_iterations.append(fold_best_iteration)

    # Release this fold's slices before the next fit.
    del X_fit, X_val, y_fit, y_val, fit_idx, val_idx
    gc.collect()

# Totals are kept under their original names; the means feed the final fit and
# the submission file name.
auc_score = sum(fold_aucs)
iterat = sum(fold_best_iterations)
mean_auc_score = auc_score / k
mean_iterat = iterat / k

logging.warning("Mean AUC in {0} folds: {1}".format(k, mean_auc_score))
logging.warning("Mean iterations in {0} folds: {1}".format(k, mean_iterat))

Fold 1

0:	test: 0.8080485	best: 0.8080485 (0)	total: 3.66s	remaining: 20h 19m 58s
100:	test: 0.8686341	best: 0.8686915 (99)	total: 5m 45s	remaining: 18h 54m 3s


KeyboardInterrupt: 

In [38]:
# Iteration budget for the final model: the mean best iteration from
# cross-validation scaled by 1.1, truncated to an int.
ITERATIONS_MARGIN = 1.1
params['iterations'] = int(ITERATIONS_MARGIN * mean_iterat)
params['iterations']

2052

In [39]:
# Build a fresh classifier from the current params and fit it on all of X, y.
# No eval_set is passed here, unlike the cross-validation fits.
model_cb = CatBoostClassifier(**params)
model_cb.fit(X, y, cat_features=cat_ft_id, verbose=200)

LGBMClassifier(bagging_fraction=0.8, bagging_freq=5, bagging_seed=42,
               boosting_type='gbdt', class_weight=None, colsample_bytree=0.5,
               data_random_seed=42, drop_seed=42, feature_fraction_seed=42,
               first_metric_only=True, importance_type='split', lambda_l1=0,
               lambda_l2=0, learning_rate=0.01, max_depth=-1, metric=['AUC'],
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=2052, n_jobs=-1, num_leaves=351,
               objective='xentropy', random_state=None, reg_alpha=0.0,
               reg_lambda=0.0, seed=42, silent=True, subsample=1.0,
               subsample_for_bin=200000, ...)

In [40]:
# Score the test set, selecting its columns in the same order as X.
y_preds = model_cb.predict_proba(test[X.columns])

In [41]:
# Load the sample submission; its isFraud column is overwritten before saving.
df_sub = pd.read_csv(data_folder+'/sample_submission.csv')

In [42]:
# Fill isFraud with column 1 of the predict_proba output (presumably the
# probability of the positive class; confirm against model_cb.classes_).
# NOTE(review): assignment is positional, so df_sub rows are assumed to be in
# the same order as test.
df_sub['isFraud'] = y_preds[:,1]

In [43]:
# Quick look at the first rows of the submission frame.
df_sub.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.000243
1,3663550,0.001092
2,3663551,0.000159
3,3663552,0.001502
4,3663553,0.000712


In [42]:
# Today's date as a compact YYYYMMDD stamp for the submission file name.
today = datetime.date.today()
D = format(today, '%Y%m%d')

In [43]:
# Submission name: date stamp, model tag, and the mean cross-validation AUC.
name_template = '{0}_CatBoost_{1}'
submission_name = name_template.format(D, mean_auc_score)

In [46]:
# Record the submission name in the run log.
submission_msg = "Submission name: {}".format(submission_name)
logging.warning(submission_msg)

In [47]:
# Write the submission CSV with a header row and without the DataFrame index.
# index=False is the documented boolean form; the previous index=None only
# worked because None is falsy, so the written file is the same.
submission_path = 'submissions/{}.csv'.format(submission_name)
df_sub.to_csv(submission_path, sep=',', header=True, index=False)

In [48]:
# Closing marker for this run in the log file.
logging.warning("End")