In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

# Notebook-wide display defaults: show up to 25 DataFrame columns and draw
# every seaborn/matplotlib figure with the 'darkgrid' style.
pd.set_option('display.max_columns', 25)
sns.set_style('darkgrid')

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

In [None]:
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras import callbacks
from tensorflow.keras import utils

# Dataset

In [None]:
# Read the training split, print its (rows, columns) shape and preview the
# first rows as the cell output.
train_csv_path = "../input/cat-in-the-dat/train.csv"
train = pd.read_csv(train_csv_path)
print(train.shape)
train.head()

In [None]:
# Read the test split and the sample submission file, in that order.
test, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/cat-in-the-dat/test.csv",
        "../input/cat-in-the-dat/sample_submission.csv",
    )
)

In [None]:
# Stack train and test into one frame so each feature is encoded with a single
# shared set of integer codes. Test rows get target == -1 as a marker so the
# two splits can be separated again later.
test["target"] = -1
data = pd.concat([train, test]).reset_index(drop=True)
features = [col for col in train.columns if col not in ("id", "target")]

# Replace every feature column of the combined frame with its label codes.
for col in features:
    data[col] = LabelEncoder().fit_transform(data[col].values)

# EDA

In [None]:
def viz_high_cardinality(col):
    """Draw a stacked bar chart of target counts for every value of ``col``.

    Reads the notebook-level ``train`` frame, cross-tabulates ``col`` against
    ``target`` and stacks the target == 1 counts on top of the target == 0
    counts, one bar per category value.

    Parameters
    ----------
    col : str
        Name of the column of ``train`` to visualise.
    """
    counts = pd.crosstab(train[col], train['target']).reset_index()
    positions = np.arange(len(counts))
    bar_width = 0.35

    fig, ax = plt.subplots(figsize=(20, 6))
    bars_zero = ax.bar(positions, counts[0].values, bar_width)
    bars_one = ax.bar(positions, counts[1].values, bar_width, bottom=counts[0].values)

    # One tick per category, labelled with the category value itself.
    ax.set_xticks(positions)
    ax.set_xticklabels(counts[col].values)
    ax.legend((bars_zero[0], bars_one[0]), ('0', '1'))
    ax.set_title(col, fontsize=18)
    plt.show()
    
def viz_huge_cardinality(col):
    """Plot the distribution of category frequencies of ``col``.

    Reads the notebook-level ``train`` frame, counts how often each distinct
    value of ``col`` occurs and plots the distribution of those counts.

    Parameters
    ----------
    col : str
        Name of the column of ``train`` to visualise.
    """
    # value_counts() already holds the per-category counts. Going through
    # reset_index()[col] only returned the counts on pandas < 2.0; from
    # pandas 2.0 the counts column is named 'count' and [col] returns the
    # category labels instead, so take the values directly.
    category_counts = train[col].value_counts().values

    fig, ax = plt.subplots(figsize=(20, 6))
    # Draw on the axes created above rather than relying on the current axes.
    sns.distplot(category_counts, ax=ax)
    ax.set_title(col, fontsize=18)
    plt.show()

## Target Class Distribution

In [None]:
fig = plt.figure(figsize=(20, 6))
target_counts = train['target'].value_counts()

# Left third of the figure: pie chart of the class shares.
ax1 = plt.subplot2grid((1, 3), (0, 0))
target_counts.plot.pie(legend=True, autopct='%1.0f%%', ax=ax1)

# Right two thirds: bar chart of the absolute class counts.
ax2 = plt.subplot2grid((1, 3), (0, 1), colspan=2)
target_counts.plot.bar(ax=ax2)
ax2.grid()

# Write y0 + y1 of each bar's bounding box just above the bar.
x_offset = -0.03
y_offset = 0.05
for bar in ax2.patches:
    box = bar.get_bbox()
    label = "{:.2f}".format(box.y1 + box.y0)
    ax2.annotate(label, ((box.x0 + box.x1)/2 + x_offset, box.y1 + y_offset))

## Binary Features

In [None]:
# 5 x 3 grid: each row holds a narrow pie axes followed by a count-plot axes
# spanning two columns, one row per binary feature bin_0 .. bin_4.
fig = plt.figure(figsize=(20, 20))
ax = []
for row in range(5):
    ax.extend([
        plt.subplot2grid((5, 3), (row, 0)),
        plt.subplot2grid((5, 3), (row, 1), colspan=2),
    ])

# Even slots of `ax` are the pie axes, odd slots the count-plot axes.
pie_axes, count_axes = ax[0::2], ax[1::2]

for k, pie_ax in enumerate(pie_axes):
    train[f'bin_{k}'].value_counts().plot.pie(ax=pie_ax, autopct="%1.1f%%")

x_offset = -0.03
y_offset = 0.05

for k, count_ax in enumerate(count_axes):
    sns.countplot(x=f'bin_{k}', data=train, hue='target', ax=count_ax)
    # Annotate every bar with its size as a percentage of all training rows.
    for bar in count_ax.patches:
        box = bar.get_bbox()
        label = "{:.2f}%".format((box.y1 + box.y0)/len(train)*100)
        count_ax.annotate(label, ((box.x0 + box.x1)/2 + x_offset, box.y1 + y_offset))

We can't see any clear pattern relating the binary features to positive values in the target.

* Interestingly, in bin_3 the share of positive targets is about the same size (~15%) for both values;
* In the other binary features the pattern is very similar.

## Nominal Features

In [None]:
# 5 x 3 grid: each row holds a narrow pie axes followed by a count-plot axes
# spanning two columns, one row per nominal feature nom_0 .. nom_4.
fig = plt.figure(figsize=(20, 20))
ax = []
for row in range(5):
    ax.extend([
        plt.subplot2grid((5, 3), (row, 0)),
        plt.subplot2grid((5, 3), (row, 1), colspan=2),
    ])

# Even slots of `ax` are the pie axes, odd slots the count-plot axes.
pie_axes, count_axes = ax[0::2], ax[1::2]

for k, pie_ax in enumerate(pie_axes):
    train[f'nom_{k}'].value_counts().plot.pie(ax=pie_ax, autopct="%1.1f%%")

x_offset = -0.03
y_offset = 0.05

for k, count_ax in enumerate(count_axes):
    sns.countplot(x=f'nom_{k}', data=train, hue='target', ax=count_ax)
    # Annotate every bar with its size as a percentage of all training rows.
    for bar in count_ax.patches:
        box = bar.get_bbox()
        label = "{:.2f}%".format((box.y1 + box.y0)/len(train)*100)
        count_ax.annotate(label, ((box.x0 + box.x1)/2 + x_offset, box.y1 + y_offset))

We can see clearly different patterns across the nominal category values.

Some summary of these features:

* NOM_0 - the Red value (~35%) has the highest % of positive values in the target;
* NOM_1 - the Triangle value (~36%) has the highest % of positive values in the target;
* NOM_2 - the Hamster value (~36%) has the highest % of positive values in the target;
* NOM_3 - the India value (~36%) has the highest % of positive values in the target;
* NOM_4 - the Theremin value (~36%) has the highest % of positive values in the target;
* In every case, the value with the highest % of positive targets is the category with the lowest frequency within its nominal feature.

In [None]:
# nom_5 .. nom_9 are plotted with the huge-cardinality helper, i.e. only the
# distribution of their category frequencies is shown.
for col in (f'nom_{i}' for i in range(5, 10)):
    viz_huge_cardinality(col)

## Ordinal Features

In [None]:
# 3 x 3 grid: each row holds a narrow pie axes followed by a count-plot axes
# spanning two columns, one row per ordinal feature ord_0 .. ord_2.
fig = plt.figure(figsize=(20, 12))
ax = []
for row in range(3):
    ax.extend([
        plt.subplot2grid((3, 3), (row, 0)),
        plt.subplot2grid((3, 3), (row, 1), colspan=2),
    ])

# Even slots of `ax` are the pie axes, odd slots the count-plot axes.
pie_axes, count_axes = ax[0::2], ax[1::2]

for k, pie_ax in enumerate(pie_axes):
    train[f'ord_{k}'].value_counts().plot.pie(ax=pie_ax, autopct="%1.1f%%")

x_offset = -0.03
y_offset = 0.05

for k, count_ax in enumerate(count_axes):
    sns.countplot(x=f'ord_{k}', data=train, hue='target', ax=count_ax)
    # Annotate every bar with its size as a percentage of all training rows.
    for bar in count_ax.patches:
        box = bar.get_bbox()
        label = "{:.2f}%".format((box.y1 + box.y0)/len(train)*100)
        count_ax.annotate(label, ((box.x0 + box.x1)/2 + x_offset, box.y1 + y_offset))

In [None]:
# ord_3 and ord_4 get stacked target bars per value; ord_5 is plotted with the
# huge-cardinality helper (frequency distribution only).
for col in ('ord_3', 'ord_4'):
    viz_high_cardinality(col)
viz_huge_cardinality('ord_5')

## Day and Month

In [None]:
# Stacked target bars for each value of the two cyclical date columns.
for col in ('day', 'month'):
    viz_high_cardinality(col)

Curiously, each of these columns has one value with noticeably few entries:

* In the day column, the value 6 (maybe Saturday?!) has fewer entries;
* In the month column, the value 6 (maybe the holidays?!) has fewer entries;

# Modelling

In [None]:
# Split the label-encoded frame back into its two parts using the -1 marker
# that was written into the test rows' target column.
# NOTE: this rebinds `train`/`test` to the encoded frames; the EDA cells above
# read the raw `train`, so re-run the notebook from the top before re-running them.
train = data[data.target != -1].reset_index(drop=True)
test = data[data.target == -1].reset_index(drop=True)

# One integer array per feature, in `features` order, because the network has
# one input per categorical column. Columns are selected by name rather than
# by position so the inputs stay aligned with `features` whatever the column
# order of the frame, and both splits are plain NumPy arrays.
X = [train[col].values for col in features]
test_data = [test[col].values for col in features]

# One-hot encode the target to match the multi-cell output layer of the network.
y = utils.to_categorical(train.target.values)

In [None]:
def auroc(y_true, y_pred):
    """Keras-compatible metric that scores a batch with scikit-learn's ROC AUC.

    The computation is wrapped in ``tf.py_function`` so that the Python-level
    ``roc_auc_score`` can be used as a Keras metric.

    Parameters
    ----------
    y_true : tensor
        Ground-truth labels for the batch.
    y_pred : tensor
        Model predictions for the batch.

    Returns
    -------
    tensor
        ``tf.double`` tensor holding the ROC AUC, or 0.5 when the score could
        not be computed for the batch.
    """
    def fallback_auc(y_true, y_pred):
        try:
            return roc_auc_score(y_true, y_pred)
        except ValueError:
            # roc_auc_score signals input it cannot score (typically a batch
            # in which a class is missing) with ValueError; report chance
            # level for such a batch. Only ValueError is caught so that
            # KeyboardInterrupt and genuine programming errors still surface
            # instead of being silently turned into 0.5.
            return 0.5
    return tf.py_function(fallback_auc, (y_true, y_pred), tf.double)

## Baseline Model
One Hot Encoding -> LogisticRegression

In [None]:
%%time

# Baseline: one-hot encode the feature columns and score a logistic regression
# with cross-validated ROC AUC (cross_val_score's default splitting).
X_ = OneHotEncoder().fit_transform(train[features])
y_ = train.target

baseline_fold_scores = cross_val_score(
    LogisticRegression(max_iter=100_000),
    X_,
    y_,
    scoring='roc_auc',
)
print('Baseline Score:', np.mean(baseline_fold_scores))

## Entity Embeddings

In [None]:
# Both callbacks watch 'val_auroc' (the validation value of the `auroc`
# metric) and treat larger values as better.

# Stop once the metric has failed to improve by at least 0.001 for 7 epochs,
# then restore the best weights seen.
es = callbacks.EarlyStopping(
    monitor='val_auroc',
    mode='max',
    min_delta=0.001,
    patience=7,
    baseline=None,
    restore_best_weights=True,
    verbose=1,
)

# Halve the learning rate after 3 epochs without improvement, never going
# below 1e-6.
rlp = callbacks.ReduceLROnPlateau(
    monitor='val_auroc',
    mode='max',
    factor=0.5,
    patience=3,
    min_lr=1e-6,
    verbose=1,
)

class config:
    """Hyper-parameters and training settings for the entity-embedding network."""

    # Embedding branch, applied to every categorical input.
    EMBEDDING_DIM = 64
    SPATIAL_DROPOUT = 0.3

    # Dense stack: one (units, activation, dropout rate) tuple per hidden layer.
    HIDDEN_LAYERS = [
        (300, 'relu', 0.3),
        (300, 'relu', 0.3),
    ]

    # Output head.
    OUTPUT_CELLS = 2
    OUTPUT_ACTIVATION = 'softmax'

    # Compilation settings.
    LOSS = 'binary_crossentropy'
    OPT = 'adam'
    METRICS = [auroc]

    # Training loop settings.
    BATCH_SIZE = 1024
    MAX_EPOCHS = 100
    CALL_BACKS = [es, rlp]

In [None]:
def create_model(data, catcols):
    """Build the entity-embedding classifier described by ``config``.

    Every column in ``catcols`` gets its own single-value input that is
    embedded, passed through spatial dropout and flattened. The flattened
    embeddings are concatenated, batch-normalised and fed through the dense
    stack from ``config.HIDDEN_LAYERS`` before the final output layer.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame used to size each embedding table from the number of distinct
        values in the column.
    catcols : iterable of str
        Names of the categorical columns; each name is also used as the name
        of that column's embedding layer.

    Returns
    -------
    Model
        Uncompiled Keras model with one input per entry of ``catcols``.
    """
    def embed_column(col):
        """Return the (input, flattened embedding) pair for one column."""
        vocab_size = int(data[col].nunique())
        col_input = layers.Input(shape=(1,))
        embedded = layers.Embedding(vocab_size + 1, config.EMBEDDING_DIM, name=col)(col_input)
        embedded = layers.SpatialDropout1D(config.SPATIAL_DROPOUT)(embedded)
        embedded = layers.Reshape(target_shape=(config.EMBEDDING_DIM, ))(embedded)
        return col_input, embedded

    branches = [embed_column(col) for col in catcols]
    inputs = [col_input for col_input, _ in branches]
    embeddings = [embedded for _, embedded in branches]

    hidden = layers.Concatenate()(embeddings)
    hidden = layers.BatchNormalization()(hidden)

    # Each hidden block is Dense -> Dropout -> BatchNormalization.
    for n_cells, act_fn, dropout in config.HIDDEN_LAYERS:
        hidden = layers.Dense(n_cells, activation=act_fn)(hidden)
        hidden = layers.Dropout(dropout)(hidden)
        hidden = layers.BatchNormalization()(hidden)

    output = layers.Dense(config.OUTPUT_CELLS, activation=config.OUTPUT_ACTIVATION)(hidden)
    return Model(inputs=inputs, outputs=output)

In [None]:
# Build the network over every feature column of the combined frame, compile
# it with the settings from `config`, then show its text summary and diagram.
model = create_model(data, features)
model.compile(
    loss=config.LOSS,
    optimizer=config.OPT,
    metrics=config.METRICS,
)
model.summary()
utils.plot_model(model, show_shapes=True)

In [None]:
# Train with 10% of the rows held out for validation; the callbacks from
# `config` monitor the validation metric.
# NOTE(review): Keras documents validation_split as taking the trailing
# fraction of the data before shuffling - confirm the row order of `train`
# makes that a representative hold-out.
fit_settings = dict(
    validation_split=0.1,
    batch_size=config.BATCH_SIZE,
    callbacks=config.CALL_BACKS,
    epochs=config.MAX_EPOCHS,
)
history = model.fit(X, y, **fit_settings)

In [None]:
# Two stacked panels: training vs validation AUROC on top, loss below.
history_df = pd.DataFrame(history.history)
fig, ax = plt.subplots(2, 1, figsize=(20, 8))

panels = (
    (ax[0], ['auroc', 'val_auroc'], 'Model AUROC'),
    (ax[1], ['loss', 'val_loss'], 'Model Loss'),
)
for panel_ax, columns, title in panels:
    history_df[columns].plot(ax=panel_ax)
    panel_ax.set_title(title, fontsize=12)

fig.suptitle('Model Metrics', fontsize=18);

In [None]:
# Write the second output column of the network (index 1) for every test row
# as the submission's target.
test_ids = test.id.values
test_scores = model.predict(test_data)[:, 1]
submission = pd.DataFrame({'id': test_ids, 'target': test_scores})
submission.to_csv("submission.csv", index=False)

**Score:** 0.80221

## Hybrid Model

Entity Embeddings -> Catboost

In [None]:
# Cut the trained network at a fixed layer and reuse everything before it as a
# feature extractor.
# NOTE(review): with the two hidden blocks built by create_model, layers[-8]
# is presumably the BatchNormalization right after the concatenated
# embeddings - re-check this index whenever config.HIDDEN_LAYERS changes.
ENCODER_OUTPUT_LAYER = -8
encoder = Model(inputs=model.input, outputs=model.layers[ENCODER_OUTPUT_LAYER].output)

embeddings_train = encoder.predict(X)
embeddings_test = encoder.predict(test_data)

In [None]:
%%time

# Fit CatBoost (GPU task type) on the encoder outputs, using column 1 of the
# one-hot target as the binary label.
estimator = CatBoostClassifier(task_type="GPU", silent=True).fit(
    embeddings_train,
    y[:, 1],
)

# Column 1 of predict_proba is written as the submission's target.
hybrid_scores = estimator.predict_proba(embeddings_test)[:, 1]
hybrid_submission = pd.DataFrame({'id': test.id.values, 'target': hybrid_scores})
hybrid_submission.to_csv("submission_hybrid.csv", index=False)

**Score:** 0.79553

# References
[Entity Embeddings of Categorical Variables](https://arxiv.org/pdf/1604.06737v1.pdf)