In [None]:
import numpy as np
import pandas as pd
import sys
import os
from time import time
import tensorflow as tf
import gc
from sklearn.metrics import roc_auc_score
from tqdm.notebook import tqdm
from sklearn.model_selection import StratifiedKFold

# What's New?

I read [this paper](https://arxiv.org/pdf/2106.01342.pdf) and found an interesting way to add noise. In the paper, the authors create noise in a two-stage process. First, they apply CutMix (also known as swapnoise) to the raw data. Then, after an embedding layer, they use MixUp to blend samples together. The paper uses this noise to help pretrain embeddings, and then fine-tunes the embeddings on the data through vanilla supervised training.  My network, however, only uses the two noise layers during supervised training.  

## But aren't CutMix and MixUp vision techniques?

While both of those regularization methods are very popular in vision, they can be adapted to tabular data relatively easily.  Michael Jahrer used CutMix (which he called swapnoise) in his wonderful [first place finish](https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/discussion/44629) in the Porto-Seguro competition.  Instead of replacing boxes in an image, he just replaced column values with values from the same column but different rows.  I implement CutMix as a `tf.keras.layers.Layer` that randomly replaces values with values from the same column of other rows in the **minibatch**.

MixUp blends 2 images with some mixing parameter.  While this is pretty straightforward to implement, I have not seen it applied to tabular data aside from the paper by Somepalli et al. that I cited at the start of this block.  Granted, I am not the world's best-read person in the field of neural networks for tabular data, so I would not be surprised if other authors have done this before!  Like CutMix, for MixUp I blend samples with other samples in the same **minibatch**.  This is also implemented as a `tf.keras.layers.Layer`.

Given those 2 interesting regularization techniques, I have created this basic notebook to test their viability on this dataset.  

In [None]:
##################################################################
#Special layers
##################################################################
class CutMix(tf.keras.layers.Layer):
    '''
    Implementation of CutMix (a.k.a. swapnoise) on a minibatch

    During training each element of the input is kept with probability
    1 - noise and is otherwise replaced by the element at the same position
    of a row-shuffled copy of the batch.  Outside of training the inputs are
    returned unchanged.

    Args
    _____
    noise: (R in [0,1)) probability that a value is replaced by the value in
        the same column of the shuffled batch
    Application
    ____________
    CM = CutMix(.2)
    x = tf.reshape(tf.range(0,10, dtype=tf.float32), (5,2))
    print(x.numpy())
    y = CM(x,True)
    print(y.numpy())
    '''
    def __init__(self, noise, **kwargs):
        super(CutMix, self).__init__(**kwargs)
        self.noise = noise

    def get_config(self):
        # Expose `noise` so the layer can be rebuilt from its config.
        config = super(CutMix, self).get_config()
        config.update({"noise": self.noise})
        return config

    def call(self, inputs, training=None):
        if training:
            # tf.random.shuffle permutes along the first axis only, so each
            # row has a single "donor" row: all of its swapped columns come
            # from that same donor.  No gradient flows through the donor copy.
            shuffled = tf.stop_gradient(tf.random.shuffle(inputs))

            # 1 -> keep the original value, 0 -> take the donor's value.
            # The mask is drawn in the dtype of `inputs` (instead of a fixed
            # float32) so the arithmetic below does not fail on a dtype
            # mismatch when the layer runs in float16 / float64.
            msk = tf.keras.backend.random_bernoulli(tf.shape(inputs), p=1 - self.noise, dtype=inputs.dtype)
            return msk * inputs + (tf.ones_like(msk) - msk) * shuffled
        return inputs

class MixUp(tf.keras.layers.Layer):
    '''
    Implementation of MixUp on a minibatch

    During training every row is blended with a partner row taken from a
    row-shuffled copy of the same batch:
        output = alpha * inputs + (1 - alpha) * shuffled
    Only the tensor passed to the layer is blended; the layer never sees the
    targets.  Outside of training the inputs are returned unchanged.

    Args
    _____
    alpha: (R in [0,1)) weight given to the original sample; the randomly
        chosen partner sample receives weight 1 - alpha.
        NOTE: with alpha < 0.5 the partner contributes more to the output
        than the original sample does.
    Application
    ____________
    MU = MixUp(.1)
    x = tf.reshape(tf.range(0,10, dtype=tf.float32), (5,2))
    y = MU(x, True)  # training=True; without it the layer is the identity
    print(x.numpy())
    print(y.numpy())
    '''
    def __init__(self, alpha, **kwargs):
        super(MixUp, self).__init__(**kwargs)
        self.alpha = alpha
        # Both weights are stored as float32 tensors (TensorFlow's default
        # dtype for Python floats).
        self.alpha_constant = tf.constant(self.alpha)
        self.one_minus_alpha = tf.constant(1.) - self.alpha

    def get_config(self):
        # Expose `alpha` so the layer can be rebuilt from its config.
        config = super(MixUp, self).get_config()
        config.update({"alpha": self.alpha})
        return config

    def call(self, inputs, training=None):
        if training:
            # Partner rows come from a permutation of the batch along the
            # first axis; no gradient flows through the partner copy.
            shuffled = tf.stop_gradient(tf.random.shuffle(inputs))

            # Cast the float32 weights to the dtype of `inputs` so the blend
            # does not fail on a dtype mismatch (float16 / float64 inputs).
            # For float32 inputs the cast is a no-op.
            alpha = tf.cast(self.alpha_constant, inputs.dtype)
            one_minus_alpha = tf.cast(self.one_minus_alpha, inputs.dtype)
            return alpha * inputs + one_minus_alpha * shuffled
        return inputs
    
class ResnetBlockTabular(tf.keras.Model):
    '''
    Pre-activation residual block made of dense layers.

    Computes  x + dense3(relu(bn2(dense2(relu(bn1(x))))))  where x is the
    block input, first projected to `output_dim` by a dense layer when its
    last dimension differs from `output_dim`.
    '''
    def __init__(self, output_dim, **kwargs):
        '''
        output_dim: (int) dimension of output dense layer. 
        NOTE: if output_dim == input_dim, this is a ResNetIdentityBlock
        '''
        super(ResnetBlockTabular, self).__init__(**kwargs)
        self.output_dim = output_dim
    
    def build(self, input_shape):
        # A projection is only needed when the residual addition would
        # otherwise have mismatched widths.
        if self.output_dim == input_shape[-1]:
            self.Dense1 = None
        else:
            # Fixed: this previously referenced the bare name `output_dim`,
            # which is not defined in this scope (NameError at build time).
            self.Dense1 = tf.keras.layers.Dense(self.output_dim)

        self.bn1 = tf.keras.layers.BatchNormalization()
        self.relu1 = tf.keras.layers.ReLU()
        self.dense2 = tf.keras.layers.Dense(self.output_dim)
        self.bn2 = tf.keras.layers.BatchNormalization()
        self.relu2 = tf.keras.layers.ReLU()
        self.dense3 = tf.keras.layers.Dense(self.output_dim)
    
    def call(self, input_tensor, training=False):
        '''
        input_tensor: tensor whose last dimension is the feature dimension
        training: forwarded to the batch-normalization layers
        returns: tensor whose last dimension is output_dim
        '''
        if self.Dense1 is not None:
            # The projected tensor also serves as the skip connection.
            input_tensor = self.Dense1(input_tensor)
        
        # `training` is forwarded explicitly so the batch-norm mode does not
        # depend on implicit propagation from the enclosing model.
        x = self.bn1(input_tensor, training=training)
        x = self.relu1(x)
        x = self.dense2(x)
        x = self.bn2(x, training=training)
        x = self.relu2(x)
        x = self.dense3(x)
        
        return x + input_tensor

## The Network

Nothing too fancy really.  After the input comes a CutMix layer, batch normalization, a resnet block, and then a MixUp layer.  The rest of the network is just resnet blocks with dropout regularization.  Feel free to change this up to your own preference!  

In [None]:
def ff(num_input_columns, BLOCKS, drop_rate, cutmix_noise, mixup_alpha, optimizer, block_sizes =None):
    '''
    Build and compile the feed-forward resnet used in this notebook.

    Architecture: Input -> CutMix -> BatchNormalization -> resnet block 0
    -> MixUp -> (resnet block i -> Dropout) for i in 1..BLOCKS-1
    -> Dense(1, sigmoid).

    Args
    _____
    num_input_columns: (int) number of input features
    BLOCKS: (int) number of resnet blocks
    drop_rate: (float) dropout rate applied after every block except the first
    cutmix_noise: (float) `noise` argument of the CutMix layer
    mixup_alpha: (float) `alpha` argument of the MixUp layer
    optimizer: optimizer handed to `model.compile`
    block_sizes: (list of int or None) output width of each block; defaults
        to `num_input_columns` for every block

    Returns
    _______
    A compiled tf.keras.Model with binary cross-entropy loss and an AUC
    metric named 'auc' (so validation AUC is always logged as 'val_auc').

    Raises
    ______
    ValueError: if `block_sizes` has fewer than BLOCKS entries.
    '''
    if block_sizes is None:
        block_sizes = [num_input_columns for _ in range(BLOCKS)]
    elif len(block_sizes) != BLOCKS:
        print(f'block_sizes has {len(block_sizes)} blocks.  Needs {BLOCKS}.')
        if len(block_sizes) < BLOCKS:
            # Too few sizes used to surface later as an opaque IndexError;
            # fail here with an explicit message instead.  Surplus sizes are
            # still tolerated (and ignored), as before.
            raise ValueError(
                f'block_sizes has {len(block_sizes)} entries but BLOCKS={BLOCKS} are required.')
    
    #Input
    # An explicit shape tuple is accepted by every Keras version, unlike a
    # bare integer.
    inp = tf.keras.layers.Input(shape=(num_input_columns,))
    x = CutMix(noise = cutmix_noise)(inp)
    x = tf.keras.layers.BatchNormalization()(x)
    x = ResnetBlockTabular(output_dim = block_sizes[0], name=f'Resnet_0')(x)
    x = MixUp(alpha= mixup_alpha)(x)
    
    for i in range(1,BLOCKS):
        x = ResnetBlockTabular(output_dim = block_sizes[i], name=f'Resnet_{i}')(x)
        x = tf.keras.layers.Dropout(drop_rate)(x)
    x = tf.keras.layers.Dense(1, activation='sigmoid')(x)
    model = tf.keras.Model(inputs=inp, outputs=x)
    
    # The metric name is pinned: auto-generated names become 'auc_1',
    # 'auc_2', ... when several models are built in one session, which would
    # silently break callbacks monitoring 'val_auc'.
    model.compile(optimizer=optimizer,
                  loss=tf.keras.losses.BinaryCrossentropy(),
                  metrics=[tf.keras.metrics.AUC(name='auc')])
    return model

# The Data

I used Rank Gaussed data from [this notebook](https://www.kaggle.com/ottpocket/feather-creator) to train the network.  This data has a column counting the number of `np.nan`s in each row, and each `np.nan` has been imputed with -6.  Why -6, you ask?  Rank Gauss makes each column resemble a normal distribution.  By replacing the `nan`s with -6, we tell the network that those values are 6 standard deviations below the mean.  The next lowest value in the data is -5, for reference.  

In [None]:
# Name of the binary label column in `train` (and of the submission column).
TARGET = 'claim'

# Rank-Gauss transformed train / test frames and the sample submission file.
ss = pd.read_csv('/kaggle/input/tabular-playground-series-sep-2021/sample_solution.csv')
train = pd.read_feather('/kaggle/input/september-feather/train_rg_min')
test = pd.read_feather('/kaggle/input/september-feather/test_rg_min')

# Raw feature columns: every training column whose name contains the letter 'f'.
FEATURES = list(filter(lambda column_name: 'f' in column_name, train.columns))

# Feature Engineering

Take row-wise statistics.  Also add a normalized count encoding of the `nan_count` variable (a crude mean encoding of `nan_count` is computed as well, but is currently disabled).

In [None]:
#Row wise stats
STATS = ['min','max','sum','var','mean']
ENGINEERED = ['nan_count', 'count'] + STATS #+ ['encoding']

# Strip any engineered names from FEATURES before computing the statistics.
# On a first run this is a no-op; on a re-run of this cell it prevents the
# statistics from being computed over previously engineered columns and
# prevents duplicate names from piling up in FEATURES.
RAW_FEATURES = [feat for feat in FEATURES if feat not in ENGINEERED]

for df in [train, test]:
    raw_values = df[RAW_FEATURES]
    df['min'] = raw_values.min(axis=1)
    df['max'] = raw_values.max(axis=1)
    df['sum'] = raw_values.sum(axis=1)
    df['var'] = raw_values.var(axis=1)
    df['mean'] = raw_values.mean(axis=1)
    
#Adding Encoding based on nan_count
# Per nan_count value: number of training rows ('count') and mean target
# ('encoding').  TARGET is used instead of a hard-coded column name.
agg = train.groupby('nan_count').agg({'id':'count', TARGET:'mean'}).rename(columns={'id':'count', TARGET:'encoding'})
count_max = agg['count'].max()
for df in [train, test]:
    #df['encoding'] = df.nan_count.map(agg['encoding'])
    # Frequency of the row's nan_count in train, scaled to (0, 1].  A
    # nan_count value never seen in train has frequency 0; without the
    # fillna it would become NaN and propagate into the network inputs.
    df['count'] = (df.nan_count.map(agg['count']) / count_max).fillna(0.0)

FEATURES = RAW_FEATURES + ENGINEERED

# Model Hyperparameters

All parameters were obtained via optuna in [this notebook](https://www.kaggle.com/ottpocket/fork-of-fork-of-fork-of-nn-starter).  In that notebook I trained for 4.5 hours.  This was the first time using optuna for me, so maybe you can make some improvements on your own!  

In [None]:
#####################
#Model Params
##################### 
# --- architecture ---
BLOCKS = 7  # number of resnet blocks in the network
drop_rate = 0.19980551223829823  # dropout rate used in the body of the resnet

# --- noise layers ---
cutmix_noise = 0.11104093311728253  # probability that a single value is swapped
mixup_alpha = 0.2312874504067844  # value of the MixUp mixing parameter

# --- optimisation ---
batch_size = 1024
adam_learning_rate = 0.0014281456754098325
optimizer = tf.keras.optimizers.Adam(learning_rate=adam_learning_rate)

# Cross Validation

We run 5-fold cross-validation here.  For each fold, we train a model until early stopping, then evaluate it on the held-out fold.

In [None]:
#######################
#Cross Val Params
#######################
# Listed from the outermost loop of the cross validation to the innermost.
NUM_SPLITS = 1  # how many times the whole KFold validation is repeated
NUM_FOLDS = 5  # folds per KFold validation
NUM_STARTS = 1  # random starts (independently trained models) per fold

In [None]:
# oof:   one column per repeated K-fold split, holding out-of-fold predictions
#        for every training row.
# preds: one column per (split, fold) pair, holding test-set predictions.
oof = pd.DataFrame()
preds = pd.DataFrame()
ES = tf.keras.callbacks.EarlyStopping(monitor='val_auc', min_delta=0, patience=20, verbose=0, mode='max')

#Number of times to do KFold cross val
for random_state in tqdm(range(NUM_SPLITS)):
    skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=random_state)
    # Filled fold by fold in a plain array and stored in `oof` once the split
    # is complete; assigning through `oof[col].iloc[...]` is chained
    # assignment and is not guaranteed to write back into the frame.
    oof_split = np.zeros(train.shape[0])
    
    for f, (t_idx, v_idx) in enumerate(skf.split(X=train, y=train[TARGET])):
        tr_temp = train[FEATURES].iloc[t_idx]
        tr_temp_target = train[TARGET].iloc[t_idx]
        val_temp = train[FEATURES].iloc[v_idx]
        val_temp_target = train[TARGET].iloc[v_idx]
        Repeat_start = time()

        # Running averages over the random starts.  Previously every start
        # overwrote the stored prediction, so with NUM_STARTS > 1 only the
        # last start (divided by NUM_STARTS) survived.  float32 is assumed
        # to match the dtype returned by model.predict, which keeps the
        # NUM_STARTS == 1 output unchanged.
        val_pred = np.zeros(len(v_idx), dtype=np.float32)
        test_pred = np.zeros(test.shape[0], dtype=np.float32)
        
        #Number of random starts per fold
        for repeat in range(NUM_STARTS):
            tf.keras.backend.clear_session()
            start = time()
            # Every model gets its own optimizer, cloned from the configured
            # one.  Sharing one instance carries its internal state across
            # models, and newer Keras versions refuse to apply an optimizer
            # to variables it was not built with.
            fresh_optimizer = optimizer.__class__.from_config(optimizer.get_config())
            model = ff(num_input_columns = len(FEATURES), BLOCKS = BLOCKS, drop_rate = drop_rate, 
                       cutmix_noise = cutmix_noise, mixup_alpha = mixup_alpha, optimizer = fresh_optimizer)
            
            model.fit(tr_temp, tr_temp_target, batch_size=batch_size, callbacks=[ES], epochs=200,
                      validation_data=(val_temp, val_temp_target))
            val_pred += np.squeeze(model.predict(val_temp, batch_size=100000)) / NUM_STARTS
            test_pred += np.squeeze(model.predict(test[FEATURES], batch_size=100000)) / NUM_STARTS
            print(f'{time() - start :.2f}', end=', ')

            # Release the finished model before the next one is built.
            del model
            gc.collect()

        oof_split[v_idx] = val_pred
        preds[f'{random_state}_{f}'] = test_pred
        print(f'Repeat total: {time() - Repeat_start :.2f}')

    oof[random_state] = oof_split

In [None]:
# One AUC per repeated K-fold split: every column of `oof` holds a complete
# set of out-of-fold predictions for the training rows.
scores = [
    roc_auc_score(train[TARGET].to_numpy(), split_predictions.to_numpy())
    for _, split_predictions in oof.items()
]
score = np.mean(scores)
print(f'Scores on all Cross validation splits: {scores}')
print(f'Mean AUC from splits: {score}')

In [None]:
# The submission is the row-wise mean of the per-fold test predictions.
ss[TARGET] = preds.mean(axis=1)

# Write the raw per-fold test predictions, the out-of-fold predictions and
# the submission file.
preds.to_csv('predictions.csv', index=False)
oof.to_csv('oof.csv', index=False)
ss.to_csv('ss.csv', index=False)