# Keras Model with DAE/TE Features (TPU)

In this notebook, I will show how to train a NN model with DAE and target encoded features in Keras (TPU).

The contents of the notebooks are organized as follows:
1. Installing and loading libraries: installs Kaggler and loads the data and libraries
2. Feature engineering: shows how to transform features with target encoding with Kaggler
3. Model definition and training: shows how to set up the TPU and define a NN model with skip connections in Keras
4. Submission

Enjoy~!

# Part 1. Loading Libraries and Data

In [None]:
!pip install -U kaggler

In [None]:
%matplotlib inline
import kaggler
from kaggler.model import AutoLGB
from kaggler.preprocessing import DAE, TargetEncoder, LabelEncoder
from matplotlib import pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
import tensorflow as tf
from tensorflow import keras
from warnings import simplefilter
print(kaggler.__version__, tf.__version__)

In [None]:
# Notebook-wide display settings.
plt.style.use('fivethirtyeight')
# Use the fully qualified option key: the bare 'max_columns' pattern is
# ambiguous in newer pandas releases (it also matches
# 'styler.render.max_columns') and raises OptionError there.
pd.set_option('display.max_columns', 100)
# Ignore all warnings so the cell outputs stay readable.
simplefilter('ignore')

In [None]:
# --- Experiment identity: used to derive every output file name ------------
algo_name = 'nn'
feature_name = 'dae_te'
version = 2
model_name = f'{algo_name}_{feature_name}_v{version}'

# --- Column names in the competition CSVs -----------------------------------
id_col = 'id'
target_col = 'target'

# --- Input files -------------------------------------------------------------
data_dir = Path('../input/tabular-playground-series-may-2021')
train_file = data_dir.joinpath('train.csv')
test_file = data_dir.joinpath('test.csv')
sample_file = data_dir.joinpath('sample_submission.csv')

# Pre-computed DAE features read by the feature-engineering cell.
dae_feature_file = '../input/tps5-dae-features/dae.h5'

# --- Output files (written to the working directory) ------------------------
feature_file = f'{feature_name}.h5'
predict_val_file = f'{model_name}.val.txt'
predict_tst_file = f'{model_name}.tst.txt'
submission_file = f'{model_name}.sub.csv'

In [None]:
# Cross-validation setup.
seed = 42            # random_state of the stratified K-fold splitter
n_fold = 5           # number of CV folds
n_class = 4          # width of the softmax output / prediction matrices

# Training-loop settings.
n_epoch = 100        # upper bound on epochs per fold
n_stop = 5           # early-stopping patience
batch_size = 1024    # batch size passed to model.fit

# Network shape.
n_emb = 16           # embedding output dimension
n_hidden_unit = 128  # width of every hidden Dense layer
dropout = 0.3        # dropout rate

In [None]:
# Read the train, test and sample-submission CSVs, each indexed by the id column.
trn, tst, sub = (
    pd.read_csv(csv_path, index_col=id_col)
    for csv_path in (train_file, test_file, sample_file)
)
print(trn.shape, tst.shape, sub.shape)

In [None]:
# Turn the string label into a zero-based integer class id: take the piece
# after the first '_' , cast it to int and subtract one.
y = (
    trn[target_col]
    .str.split('_')
    .str[1]
    .astype(int)
    .sub(1)
)

# Stack the training features (target removed) on top of the test features so
# both go through the same feature engineering; the first n_trn rows are train.
n_trn = len(trn)
df = pd.concat([trn.drop(columns=target_col), tst])
feature_cols = df.columns.tolist()
print(y.shape, df.shape)

# Part 2. Feature Engineering: DAE + Target Encoding + Label Encoding

In [None]:
# Pre-computed DAE features, read from the 'data' key of the HDF5 file.
# Presumably row-aligned with `df` (train rows followed by test rows), since
# the frames are concatenated column-wise below — TODO confirm.
df_dae = pd.read_hdf(dae_feature_file, key='data')

# Target-encode every raw feature. The encoder receives the stratified
# K-fold splitter.
# NOTE(review): the encoder is fit on `trn` and `transform` is then applied to
# `df`, which also contains the training rows. Confirm against Kaggler's
# TargetEncoder docs whether `transform` yields out-of-fold encodings for the
# training rows; if it does not, this leaks the target into the train
# features and `fit_transform` may be the intended call for them.
# NOTE(review): LabelEncoder is imported but no label encoding is applied here.
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
te = TargetEncoder(cv=cv)
te.fit(trn[feature_cols], y)
df_te = te.transform(df[feature_cols])
# Prefix the encoded columns so they do not clash with the raw feature names.
df_te.columns = [f'te_{x}' for x in df.columns]

# Final feature matrix: raw features, then target-encoded, then DAE features.
# The raw columns must stay first: later cells treat the leading
# len(df.columns) columns as the categorical ones.
all_df = pd.concat([df, df_te, df_dae], axis=1)
all_df.to_hdf(feature_file, key='data')
print(all_df.shape)
all_df.head()

In [None]:
# Raw features are treated as categorical; every other column of all_df
# (target-encoded and DAE features) is treated as numeric.
cat_cols = df.columns.tolist()
feature_cols = all_df.columns.tolist()
num_cols = [name for name in feature_cols if name not in cat_cols]

n_feature, n_cat_col, n_num_col = len(feature_cols), len(cat_cols), len(num_cols)
print(n_feature, n_cat_col, n_num_col)

# Part 3. Keras NN Model Training

In [None]:
# Connect to the TPU cluster and create the distribution strategy whose
# scope is entered when building the model.
# NOTE(review): `tf.distribute.experimental.TPUStrategy` appears to be a
# deprecated alias of `tf.distribute.TPUStrategy` in newer TF 2.x releases —
# confirm against the installed TensorFlow version before switching.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

In [None]:
def build_model(n_emb=16, n_hidden_unit=128, dropout=.3, n_block=5):
    """Build and compile the residual MLP used for training.

    The model takes one scalar input per column in ``cat_cols`` followed by a
    single vector input holding all ``num_cols``. Each categorical input is
    mapped through its own embedding; the flattened embeddings are
    concatenated with the numeric input and fed through ``n_block`` residual
    blocks, one more Dense/Dropout pair and a softmax over ``n_class``.

    Previously the embedding layers were created but never connected to the
    network (the raw categorical inputs were concatenated instead), so
    ``n_emb`` had no effect. They are now wired in.

    Reads the module-level ``cat_cols``, ``num_cols``, ``all_df`` and
    ``n_class``.

    Args:
        n_emb: Output dimension of every categorical embedding.
        n_hidden_unit: Width of every hidden Dense layer.
        dropout: Dropout rate applied after the first Dense of each block and
            after the final hidden Dense.
        n_block: Number of residual blocks; the default of 5 reproduces the
            original architecture depth.

    Returns:
        A ``keras.Model`` compiled with the Adam optimizer and sparse
        categorical cross-entropy loss. Its inputs are ordered as
        ``cat_cols`` followed by the numeric block.
    """
    cat_inputs = []
    embs = []
    for col in cat_cols:
        # Size the embedding from the observed value range and shift indices
        # by the column minimum, so every index fed to the embedding lies in
        # [0, input_dim) even when the codes are negative or non-contiguous.
        # Assumes the categorical columns hold integer codes with a modest
        # range — TODO confirm for new datasets.
        col_min = int(all_df[col].min())
        n_code = int(all_df[col].max()) - col_min + 1

        inp = keras.layers.Input((1,), name=f'{col}')
        idx = inp
        if col_min != 0:
            # Bind the offset as a default argument to avoid late binding.
            idx = keras.layers.Lambda(lambda t, offset=col_min: t - offset)(inp)
        emb = keras.layers.Embedding(input_dim=n_code, output_dim=n_emb)(idx)

        cat_inputs.append(inp)
        # (batch, 1, n_emb) -> (batch, n_emb) so it can join the numeric block.
        embs.append(keras.layers.Flatten()(emb))

    num_inputs = keras.layers.Input((len(num_cols),))
    inputs = cat_inputs + [num_inputs]

    features = embs + [num_inputs]
    # Concatenate needs more than one tensor; with no categorical columns the
    # numeric block is used as is.
    if len(features) > 1:
        x = keras.layers.Concatenate()(features)
    else:
        x = features[0]

    for _ in range(n_block):
        # Residual block: Dense -> Dropout, then Dense -> BatchNorm added back
        # onto the Dropout output through a skip connection.
        x = keras.layers.Dense(n_hidden_unit, activation='relu')(x)
        x = keras.layers.Dropout(dropout)(x)
        shortcut = x

        x = keras.layers.Dense(n_hidden_unit, activation='relu')(x)
        x = keras.layers.BatchNormalization()(x)
        x = keras.layers.Add()([shortcut, x])

    x = keras.layers.Dense(n_hidden_unit, activation='relu')(x)
    x = keras.layers.Dropout(dropout)(x)

    outputs = keras.layers.Dense(n_class, activation='softmax')(x)

    model = keras.Model(inputs, outputs)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
    return model

In [None]:
# Build one model inside the TPU strategy scope and print its layer summary.
with tpu_strategy.scope():
    model = build_model(
        n_emb=n_emb,
        n_hidden_unit=n_hidden_unit,
        dropout=dropout,
    )
    model.summary()

In [None]:
def scheduler(epoch, lr, warmup=5):
    """Learning-rate schedule: geometric warm-up, then exponential decay.

    For the first ``warmup`` epochs the rate is multiplied by 1.5 each epoch;
    afterwards it is multiplied by ``exp(-0.1)`` each epoch.

    The decay factor is computed with NumPy instead of TensorFlow so both
    branches return a plain number rather than one returning a ``tf.Tensor``.

    Args:
        epoch: Epoch index supplied by the caller.
        lr: Current learning rate.
        warmup: Number of initial epochs during which the rate grows.

    Returns:
        The learning rate to use next.
    """
    if epoch < warmup:
        return lr * 1.5
    return float(lr * np.exp(-.1))

# Callbacks shared by all folds: early stopping that restores the best
# weights, and the warm-up/decay learning-rate schedule.
es = keras.callbacks.EarlyStopping(patience=n_stop, restore_best_weights=True)
lr = keras.callbacks.LearningRateScheduler(scheduler)

cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

# The first n_trn rows of all_df are training rows, the rest are test rows.
X = all_df.iloc[:n_trn].values
X_tst = all_df.iloc[n_trn:].values

# Positional label array. `y` is a Series indexed by id, so `y[i_trn]` is a
# label-based lookup that is only right when the ids happen to be 0..n-1;
# the fold indices from cv.split() are positions.
y_arr = y.values


def _to_model_inputs(data):
    """Split a feature matrix into the input list build_model() expects.

    Returns one 1-D array per categorical column (the leading n_cat_col
    columns) followed by a single 2-D array with the remaining columns.
    """
    return [data[:, j] for j in range(n_cat_col)] + [data[:, n_cat_col:]]


# Out-of-fold predictions for train rows and fold-averaged test predictions.
P = np.zeros((n_trn, n_class), dtype=float)
P_tst = np.zeros((X_tst.shape[0], n_class), dtype=float)
for i, (i_trn, i_val) in enumerate(cv.split(X, y_arr), 1):
    # Build a fresh model per fold inside the TPU strategy scope so that it
    # is placed on the TPU, and pass the configured hyperparameters instead
    # of relying on build_model()'s defaults.
    with tpu_strategy.scope():
        model = build_model(n_emb, n_hidden_unit, dropout)

    X_trn, X_val = X[i_trn], X[i_val]
    y_trn, y_val = y_arr[i_trn], y_arr[i_val]
    history = model.fit(_to_model_inputs(X_trn), y_trn,
                        validation_data=(_to_model_inputs(X_val), y_val),
                        epochs=n_epoch, batch_size=batch_size,
                        callbacks=[es, lr], verbose=0)
    P[i_val] = model.predict(_to_model_inputs(X_val))
    P_tst += model.predict(_to_model_inputs(X_tst)) / n_fold

    print(f'CV #{i} Loss: {log_loss(y_val, P[i_val]):.6f}')

In [None]:
# `history` is reassigned on every fold, so this shows the learning-rate
# curve of the last fold only. The 'lr' entry is presumably logged by the
# LearningRateScheduler callback — verify the key for the installed Keras.
plt.plot(history.history['lr'])

In [None]:
# Overall log loss of the validation predictions in P against y.
print(f'CV Log Loss: {log_loss(y, P):.6f}')

# Save the validation and test prediction matrices as text, six decimals each.
for out_path, preds in ((predict_val_file, P), (predict_tst_file, P_tst)):
    np.savetxt(out_path, preds, fmt='%.6f')

# Part 4. Submission

In [None]:
# Overwrite every column of the sample submission with the fold-averaged
# test predictions, then write the submission file.
# Assumes the sample submission has exactly n_class columns, in the same
# class order and row order as P_tst — TODO confirm.
sub[sub.columns] = P_tst
sub.to_csv(submission_file)
sub.head()

If you find this notebook helpful, please upvote it and share your feedback in comments. I really appreciate it.

You can find my other notebooks in both the current and previous TPS competitions below:
* [Kaggler DAE + AutoLGB Baseline](https://www.kaggle.com/jeongyoonlee/kaggler-dae-autolgb-baseline): trains the LightGBM model with Kaggler's DAE features and AutoLGB
* [Adversarial Validation with LightGBM](https://www.kaggle.com/jeongyoonlee/adversarial-validation-with-lightgbm): shows how close/different the feature distributions are between the training and test data. It's a good exercise to perform at the beginning of the competition to understand the risk of overfitting to the training data.
* [DAE with 2 Lines of Code with Kaggler](https://www.kaggle.com/jeongyoonlee/dae-with-2-lines-of-code-with-kaggler): shows how to extract DAE features and train the AutoLGB model with TPS4 data.
* [AutoEncoder + Pseudo Label + AutoLGB](https://www.kaggle.com/jeongyoonlee/autoencoder-pseudo-label-autolgb): shows how to build a basic AutoEncoder using Keras, and perform automated feature selection and hyperparameter optimization using Kaggler's AutoLGB.
* [Supervised Emphasized Denoising AutoEncoder](https://www.kaggle.com/jeongyoonlee/supervised-emphasized-denoising-autoencoder): shows how to build a more sophisticated version of AutoEncoder, called supervised emphasized Denoising AutoEncoder (DAE), which trains DAE and a classifier simultaneously.
* [Stacking Ensemble](https://www.kaggle.com/jeongyoonlee/stacking-ensemble): shows how to perform stacking ensemble.