In [None]:
import numpy as np
import pandas as pd
import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Read the competition's test and train tables.
test = pd.read_csv('/kaggle/input/cat-in-the-dat-ii/test.csv')
train = pd.read_csv('/kaggle/input/cat-in-the-dat-ii/train.csv')

In [None]:
%%time

# Random permutation is needed for CatBoostEncoder to reduce leakage
def random_permutation(x, random_state=None):
    """Return the rows of ``x`` in random order with a fresh 0..n-1 index.

    Parameters
    ----------
    x : pandas.DataFrame or pandas.Series
        Object whose rows are shuffled. It is not modified.
    random_state : int or None, optional
        Seed for a private ``np.random.RandomState``. When ``None`` (the
        default) NumPy's global random state is used, so the result is only
        reproducible if the caller has seeded ``np.random`` beforehand.

    Returns
    -------
    Same type as ``x``
        A shuffled copy of ``x``; the original index is discarded.
    """
    # Fall back to the module-level generator so existing one-argument
    # calls behave exactly as before.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    shuffled_positions = rng.permutation(len(x))
    return x.iloc[shuffled_positions].reset_index(drop=True)

# Seed NumPy's global RNG so the two shuffles below come out the same on
# every Restart & Run All.
np.random.seed(42)

train = random_permutation(train)
test = random_permutation(test)

# Keep the identifiers aside; the test ids are needed for the submission file.
train_ids = train.id
test_ids = test.id

# `axis` must be given by keyword (or via `columns=`): the positional form
# `drop('id', 1)` raises TypeError on pandas >= 2.0.
train = train.drop(columns='id')
test = test.drop(columns='id')

# Separate the label from the features.
train_targets = train.target
train = train.drop(columns='target')

# Preprocessing

## Mapping values

For the binary variables (`bin_0` to `bin_4`) and the ordinal variables `ord_1` to `ord_4` I will simply map each level to an integer.

In [None]:
%%time

# bin variables: the 0/1, F/T and N/Y spellings all collapse onto 0/1
bin_recode = {0: 0, 1: 1, 'F': 0, 'T': 1, 'N': 0, 'Y': 1}

# ord_1: levels ranked from 0 in the order listed
levels = {name: rank for rank, name in enumerate(
    ['Novice', 'Contributor', 'Expert', 'Master', 'Grandmaster'])}

# ord_2: temperatures ranked from 0 in the order listed
temps = {name: rank for rank, name in enumerate(
    ['Freezing', 'Cold', 'Warm', 'Hot', 'Boiling Hot', 'Lava Hot'])}

# ord_3 / ord_4: letters mapped to their 1-based position in the alphabet
lowercase_letters = {chr(ord('a') + k): k + 1 for k in range(26)}
uppercase_letters = {chr(ord('A') + k): k + 1 for k in range(26)}

# One lookup table per column, applied identically to train and test.
# Values that are absent from a table (including missing values) map to NaN.
column_maps = {f'bin_{k}': bin_recode for k in range(5)}
column_maps.update(ord_1=levels,
                   ord_2=temps,
                   ord_3=lowercase_letters,
                   ord_4=uppercase_letters)

for column, mapping in column_maps.items():
    for frame in (train, test):
        frame[column] = frame[column].map(mapping)

## Dummy coding

For `nom_0` to `nom_4` I will use dummy coding.

In [None]:
%%time

noms_0_4 = ['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4']

# Take each column's level set from the training data and share it with the
# test data. Encoding the two frames independently would give different dummy
# columns (and a different `drop_first` baseline) whenever a level is missing
# from one of them. A test level unseen in train becomes missing and is
# encoded as all zeros.
for col in noms_0_4:
    shared_levels = pd.CategoricalDtype(sorted(train[col].dropna().unique()))
    train[col] = train[col].astype(shared_levels)
    test[col] = test[col].astype(shared_levels)

dummy_options = dict(columns=noms_0_4,
                     prefix=noms_0_4,
                     drop_first=True,
                     sparse=True,
                     dtype=np.int8)

train = pd.get_dummies(train, **dummy_options)
test = pd.get_dummies(test, **dummy_options)

# The model is fitted on `train` and applied to `test`, so the feature
# columns must line up exactly.
assert list(train.columns) == list(test.columns), "train/test columns differ"

## Target encoding

`nom_5` to `nom_9` and `ord_5` are high-cardinality features. One way to handle these features is to use target encoding. However, there are different ways of doing target encoding, and some are better than others at avoiding leakage / overfitting. Here I use the `CatBoostEncoder`, which is similar to leave-one-out target encoding but encodes each training row using only the rows that come before it; this is why the rows were randomly shuffled at the start of the notebook.

In [None]:
%%time

from category_encoders.cat_boost import CatBoostEncoder

# noms 5-9 followed by ord 5
target_encoded_columns = [f'nom_{k}' for k in (5, 6, 7, 8, 9)] + ['ord_5']

# A fresh encoder per column: fitted on the training column with the training
# targets, then reused to transform the matching test column.
for column in target_encoded_columns:
    cbe = CatBoostEncoder()
    train[column] = cbe.fit_transform(train[column], train_targets)
    test[column] = cbe.transform(test[column])

In [None]:
# Sanity check: print (rows, columns) for both frames after preprocessing.
# The column counts should match, since `id` was dropped from both and
# `target` from train -- verify in the output.
print(train.shape)
print(test.shape)

# CatBoost Classifier

I used `grid_search()` to identify the best parameters for the `CatBoostClassifier`.

In [None]:
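# Grid search kept for reference (commented out).
# NOTE(review): `cb` is only created in the model cell further down, and this
# call originally passed `X=train_cbe`, a name that is never defined in this
# notebook; `train` is the encoded feature frame -- confirm before re-running.
#grid = {'learning_rate': [0.1, 0.15, 0.2, 0.25, 0.3],
#        'depth': [3, 4, 5, 6],
#        'l2_leaf_reg': [1, 2, 3, 4, 5, 6, 7]}

#grid_search_result = cb.grid_search(grid, 
#                                    X=train, 
#                                    y=train_targets, 
#                                    plot=True)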

The model below uses the best parameters that I found.

In [None]:
%%time

from catboost import CatBoostClassifier

# Hyper-parameters for the final model, scored with AUC.
model_params = {
    'eval_metric': 'AUC',
    'learning_rate': 0.1,
    'depth': 3,
    'l2_leaf_reg': 5,
}
cb = CatBoostClassifier(**model_params)

# Fit on the full preprocessed training set with per-iteration logging off.
cb.fit(train, train_targets, verbose=False)

# Prepare submission

In [None]:
# Second column of predict_proba, used as the submitted `target` score.
preds = cb.predict_proba(test)[:, 1]

# Pair each prediction with its id, then order the rows by id.
preds_df = pd.DataFrame({'id': test_ids.to_numpy(), 'target': preds})
preds_df = preds_df.sort_values(by=['id'])

preds_df.to_csv("./submission.csv", index=False)

In [None]:
# Preview the first rows of the submission frame.
preds_df.head()