Inspired by Oscar Villarreal Escamilla's notebook https://www.kaggle.com/oxzplvifi/tabular-residual-network, I am trying embeddings with a residual network, here in Python instead of R.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
import tempfile
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras import layers
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

In [None]:
# Read the train and test CSV files of the June 2021 Tabular Playground Series.
train_data, test_data = map(
    pd.read_csv,
    (
        "../input/tabular-playground-series-jun-2021/train.csv",
        "../input/tabular-playground-series-jun-2021/test.csv",
    ),
)

In [None]:
# Feature columns: every column of the training table except "id" and "target".
cols = train_data.columns.tolist()
for excluded in ("id", "target"):
    cols.remove(excluded)

# Names of the nine output classes, "Class_1" through "Class_9".
classes = [f"Class_{number}" for number in range(1, 10)]
n_classes = len(classes)

In [None]:
def custom_loss(y_true, y_pred):
    """Return the categorical cross-entropy of clipped predictions.

    The predictions are clipped to ``[1e-15, 1 - 1e-15]`` before a fresh
    ``tf.keras.losses.CategoricalCrossentropy`` is applied to them.
    """
    eps = 1e-15
    clipped = tf.clip_by_value(y_pred, eps, 1-eps)
    cross_entropy = tf.keras.losses.CategoricalCrossentropy()
    return cross_entropy(y_true, clipped)

In [None]:
# Feature columns of the training and test tables stacked on top of each other.
all_data = pd.concat([train_data, test_data]).loc[:, cols]
# For every feature, the sorted list of distinct values found in either table.
categories = {name: sorted(values.unique()) for name, values in all_data.items()}

In [None]:
# Share of training rows per target class, ordered by class label.
class_counts = train_data["target"].value_counts().sort_index()
frequencies = class_counts / len(train_data)
# Log-frequencies; bias_init hands these to the output layer as initial biases.
biases_calc = np.log(frequencies).to_numpy()
print(frequencies)
def softmax(x):
    """Return the softmax of *x*, normalised over all of its elements.

    The maximum of *x* is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps ``np.exp`` from overflowing to
    ``inf`` (and the quotient from becoming ``nan``) for large inputs.

    An empty input yields an empty float array.
    """
    if np.size(x) == 0:
        # np.max is undefined on empty input; return the empty result directly.
        return np.asarray(x, dtype=float)
    exponentials = np.exp(np.subtract(x, np.max(x)))
    return exponentials / np.sum(exponentials)
# Sanity check: the softmax of the log-frequencies, for comparison with the
# frequencies printed above.
print(softmax(biases_calc))

def bias_init(bias_shape, dtype=None):
    """Keras bias initializer that returns the log class frequencies.

    Args:
        bias_shape: Shape requested by the layer. It is not used; the result
            always has the shape of ``biases_calc``.
        dtype: Dtype of the returned tensor.

    Returns:
        A constant tensor holding ``biases_calc``. A tensor (rather than a new
        ``tf.Variable``) is returned because the layer creates its own
        variable from the initializer's value.
    """
    return tf.constant(biases_calc, dtype=dtype)

In [None]:
# Model inputs: the feature columns of the training table.
X = train_data.loc[:, cols]
# Class labels and their one-hot encoding.
y = train_data.target
y_ohe = pd.get_dummies(y)

In [None]:
def assemble_model(hidden_layers=(32,), third_skip=False, conv_layer=False,
                   embed_opts=None, conv_opts=None):
    """Build and compile the embedding network with skip connections.

    Architecture:
      1. Integer inputs (one per column of ``X``) go through one shared
         ``Embedding`` layer.
      2. If ``conv_layer`` is set, a relu ``Conv1D`` and ``Dropout(0.3)``
         follow.
      3. The result is flattened into the feature vector.
      4. A stack of ``Dropout(0.2)`` + weight-normalised selu ``Dense``
         layers, one per entry of ``hidden_layers``.
      5. Two (or three, with ``third_skip``) further weight-normalised
         ``Dense`` layers of size ``hidden_layers[-1]``, each fed with the
         concatenation of the feature vector and the outputs of the previous
         blocks.
      6. A softmax ``Dense`` output over ``n_classes`` whose bias starts at
         ``bias_init``.

    Args:
        hidden_layers: Sizes of the stacked hidden layers; must not be empty.
        third_skip: Whether to add the third skip-connection block.
        conv_layer: Whether to apply ``Conv1D`` + dropout to the embeddings.
        embed_opts: Keyword arguments for ``layers.Embedding`` (besides the
            input dimension). Defaults to ``{"output_dim": 2}``.
        conv_opts: Keyword arguments for ``layers.Conv1D`` (besides the
            activation). Defaults to ``{"filters": 12, "kernel_size": 1}``.

    Returns:
        A ``tf.keras.Model`` compiled with the Adam optimizer, categorical
        cross-entropy loss, and the metrics ``CategoricalCrossentropy`` and
        ``custom_loss``.

    Raises:
        ValueError: If ``hidden_layers`` is empty.
    """
    if len(hidden_layers) == 0:
        raise ValueError("hidden_layers must contain at least one layer size")
    # None sentinels instead of dict defaults, so no dict object is shared
    # between calls.
    if embed_opts is None:
        embed_opts = {"output_dim": 2}
    if conv_opts is None:
        conv_opts = {"filters": 12, "kernel_size": 1}

    def weight_normed_dense(units, **dense_kwargs):
        """Return a Dense layer wrapped in weight normalisation."""
        return tfa.layers.WeightNormalization(layers.Dense(units, **dense_kwargs))

    inputs = layers.Input(len(X.columns), dtype="int32")
    # Size the embedding table from train AND test features: the model also
    # predicts on the test set, whose values may exceed the training maximum
    # and would then be out of range for the embedding lookup.
    vocab_size = int(all_data.max().max()) + 1
    feat_layer = layers.Embedding(vocab_size, **embed_opts)(inputs)
    if conv_layer:
        feat_layer = layers.Conv1D(activation="relu", **conv_opts)(feat_layer)
        feat_layer = layers.Dropout(0.3)(feat_layer)
    feat_layer = layers.Flatten()(feat_layer)

    cur_layer = feat_layer
    for layer_size in hidden_layers:
        cur_layer = layers.Dropout(.2)(cur_layer)
        cur_layer = weight_normed_dense(
            layer_size, activation="selu", kernel_initializer='lecun_normal',
        )(cur_layer)

    # Output of the stacked hidden layers (i.e. of the last one in the stack).
    first_hidden_layer = cur_layer

    # First skip block: features + stack output.
    cur_layer = layers.Concatenate()([feat_layer, cur_layer])
    cur_layer = layers.Dropout(.3)(cur_layer)
    cur_layer = weight_normed_dense(hidden_layers[-1], activation="relu")(cur_layer)
    second_skip_layer = cur_layer

    # Second skip block: features + previous block + stack output.
    cur_layer = layers.Concatenate()([feat_layer, cur_layer, first_hidden_layer])
    cur_layer = layers.Dropout(.4)(cur_layer)
    cur_layer = weight_normed_dense(
        hidden_layers[-1], activation="elu", kernel_initializer='lecun_normal',
    )(cur_layer)

    if third_skip:
        # Optional third skip block: features + all earlier block outputs.
        cur_layer = layers.Concatenate()(
            [feat_layer, cur_layer, second_skip_layer, first_hidden_layer]
        )
        cur_layer = layers.Dropout(.3)(cur_layer)
        cur_layer = weight_normed_dense(hidden_layers[-1], activation="elu")(cur_layer)

    out = layers.Dense(
        n_classes, activation="softmax", bias_initializer=bias_init,
    )(cur_layer)

    model = tf.keras.Model(inputs, out)
    model.compile(
        optimizer="adam",
        loss=tf.keras.losses.CategoricalCrossentropy(),
        metrics=[tf.keras.metrics.CategoricalCrossentropy(), custom_loss],
    )
    return model

In [None]:
def pack_test_data(proba):
    """Return the submission table for the predicted class probabilities.

    The columns of *proba* are labelled with ``classes`` and the ``id`` column
    of ``test_data`` is placed in front of them.
    """
    submission = pd.DataFrame(proba, columns=classes)
    submission.insert(0, "id", test_data.id)
    return submission

In [None]:
# Stop training once val_loss has not improved by at least 0.005 for 10
# epochs, and restore the best weights.
earlystop_callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    mode="auto",
    min_delta=0.005,
    patience=10,
    baseline=None,
    restore_best_weights=True,
    verbose=0,
)
# Multiply the learning rate by 0.7 when val_loss has not improved for 2 epochs.
reduce_lr_on_plateau = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.7,
    patience=2,
    verbose=0,
)

In [None]:
def train_predict_cv(get_model, nfolds=10, random_state=None):
    """Train one model per stratified fold and average the test predictions.

    For every fold a fresh model from ``get_model`` is fitted on the training
    part, for up to 100 epochs with batch size 256 and the module-level
    ``earlystop_callback`` and ``reduce_lr_on_plateau`` callbacks. The
    held-out part serves as validation data during fitting and is then used
    to evaluate the model. The evaluation metrics of each fold, and finally
    the mean and standard deviation of the ``custom_loss`` metric across
    folds, are printed.

    Uses the module-level ``X``, ``y``, ``y_ohe``, ``test_data`` and ``cols``.

    Args:
        get_model: Zero-argument callable returning a compiled Keras model.
        nfolds: Number of stratified folds.
        random_state: Seed for the fold shuffling; ``None`` (the default)
            gives a different split on every call.

    Returns:
        The predicted probabilities for ``test_data[cols]``, averaged over
        the ``nfolds`` models.
    """
    kf = StratifiedKFold(n_splits=nfolds, shuffle=True, random_state=random_state)
    # The test features are the same for every fold; select them once.
    X_submit = test_data[cols]

    probas = []
    losses = []
    for train_idx, valid_idx in kf.split(X, y):
        # split() yields integer positions, so select rows by position
        # (.iloc) rather than by index label (.loc).
        X_train, y_train = X.iloc[train_idx], y_ohe.iloc[train_idx]
        X_valid, y_valid = X.iloc[valid_idx], y_ohe.iloc[valid_idx]

        model = get_model()
        model.fit(
            X_train, y_train,
            epochs=100,
            batch_size=256,
            validation_data=(X_valid, y_valid),
            callbacks=[earlystop_callback, reduce_lr_on_plateau],
            verbose=0,
        )
        eval_dict = model.evaluate(X_valid, y_valid, return_dict=True, verbose=0)
        print(eval_dict)
        losses.append(eval_dict["custom_loss"])
        probas.append(model.predict(X_submit))

    print(f"Mean loss: {np.mean(losses)}, std: {np.std(losses)}")
    return sum(probas) / nfolds

In [None]:
%%time
# Settings of the convolutional model variant trained in this cell.
conv_model_kwargs = dict(
    hidden_layers=(32,),
    conv_layer=True,
    embed_opts={"output_dim": 14, "embeddings_regularizer": 'l2'},
    conv_opts={"filters": 12, "kernel_size": 1},
)
# 20-fold cross-validated training; proba is the fold-averaged test prediction.
proba = train_predict_cv(lambda: assemble_model(**conv_model_kwargs), nfolds=20)
submission = pack_test_data(proba)
submission.to_csv("tf_model_conv.csv", index=False)

I tried clipping the output probabilities to keep extreme values from affecting the score, but it did not seem to help much. Here I try a few more cutoffs for the clipping.

In [None]:
def clip_and_save(proba, clip_percent=5, renormalize=False, fname="tf_model"):
    """Clip the predicted probabilities and write them as a submission CSV.

    Args:
        proba: 2-D array of class probabilities, one row per test sample.
        clip_percent: Clipping bound in percent; values are limited to
            ``[clip_percent/100, 1 - clip_percent/100]``.
        renormalize: If true, rescale every clipped row so that it sums to 1
            again, and append ``"_normed"`` to the file name.
        fname: File name prefix. The file written is
            ``{fname}[_normed]_clipped_{clip_percent}.csv``.
    """
    bound = clip_percent / 100
    proba_clipped = np.clip(proba, bound, 1 - bound)
    if renormalize:
        # Clipping breaks the rows-sum-to-one property, so it is the CLIPPED
        # values that have to be rescaled (the unclipped ones already sum to 1).
        row_sums = proba_clipped.sum(axis=1, keepdims=True)
        proba_clipped = proba_clipped / row_sums
        fname += "_normed"

    predicted = pack_test_data(proba_clipped)
    predicted.to_csv(fname+f"_clipped_{clip_percent}.csv", index=False)

In [None]:
# Clip at 1.25 % with renormalize=True; writes tf_model_normed_clipped_1.25.csv.
clip_and_save(proba, clip_percent=1.25, renormalize=True)

In [None]:
# clip_percent=0.0 clips to [0, 1]; writes tf_model_clipped_0.0.csv.
clip_and_save(proba, clip_percent=0.0, renormalize=False)