Building upon the very good notebook by https://www.kaggle.com/mlanhenke (https://www.kaggle.com/mlanhenke/tps-11-nn-baseline-keras), which presents a very clean and easy-to-understand structure, I prepared a few variations to further help the participants of this competition.

The improvements so far are:

* deterministic random seeding
* enabling TPU usage just by switching accelerator type
* adding mish and gelu activations
* model summary and plot


In this notebook, I will create a variation inspired by the work of Chris Deotte (https://www.kaggle.com/cdeotte/lstm-feature-importance) in the recent Google Brain Ventilator Pressure competition. The idea is to use permutation feature importance (see: https://christophm.github.io/interpretable-ml-book/feature-importance.html#feature-importance) in order to figure out the role each feature plays in the prediction.

When the first fold has been trained and you have a baseline performance on its out-of-fold predictions, you can try permuting one feature at a time and see whether the result degrades (highlighting that the feature is important for the model to produce good predictions), stays the same (implying the feature is irrelevant) or improves (hinting that the feature is harmful).

The procedure doesn't require much computational power because it involves only prediction and not re-training. However, there are caveats. The procedure doesn't take into account multi-collinearity of the features. Consequently, if you remove one feature and then recompute the importance (as in recursive feature elimination, see: https://scikit-learn.org/stable/auto_examples/feature_selection/plot_rfe_with_cross_validation.html) you are surely removing one unimportant feature. Instead, if you remove a bunch of features at once, it may occur that you are removing highly correlated features that hide each other's importance with respect to the prediction (in fact, when you shuffle one, the other highly correlated ones do the work in its place). In this case you may see the performance unexpectedly decrease instead of improving or staying the same.

In [None]:
import os
import numpy as np
import pandas as pd
import random
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

from warnings import filterwarnings
filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential, Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Dense, Flatten, Input, Concatenate, Dropout
from tensorflow.keras.utils import plot_model

In [None]:
def seed_everything(seed):
    """
    Seed every random number generator this notebook relies on.

    Exports `PYTHONHASHSEED` to the process environment and seeds Python's
    `random` module, NumPy's global generator and TensorFlow's global
    generator with the same value.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)


# Base seed; the training loop offsets it by the fold index.
SEED = 3024

In [None]:
def gelu(x):
    """Tanh-based GELU: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))."""
    inner = x + 0.044715 * tf.pow(x, 3)
    gate = 1 + tf.tanh(tf.sqrt(2 / np.pi) * inner)
    return 0.5 * x * gate

class Mish(keras.layers.Activation):
    """
    Activation layer used to expose the Mish function to Keras.

    see: https://github.com/digantamisra98/Mish/blob/master/Mish/TFKeras/mish.py

    .. math::
        mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + e^{x}))

    The layer itself adds no computation: it forwards `activation` and any
    keyword arguments to `keras.layers.Activation` and then tags the
    instance with the name 'Mish'.

    Args:
        activation: the callable (or activation identifier) to wrap.
        **kwargs: passed through unchanged to `keras.layers.Activation`.
    """

    def __init__(self, activation, **kwargs):
        super().__init__(activation, **kwargs)
        # Give the instance a `__name__`, like a plain function would have.
        self.__name__ = 'Mish'

def mish(inputs):
    """Mish activation: `inputs * tanh(softplus(inputs))`."""
    gate = tf.math.tanh(tf.math.softplus(inputs))
    return inputs * gate

# Register both custom activations under the string keys 'gelu' and 'mish'
# in Keras' custom-object registry.
keras.utils.get_custom_objects().update({
    'gelu': keras.layers.Activation(gelu),
    'mish': Mish(mish),
})

In [None]:
def plot_history(history, start=0, fold=0):
    """
    Plot the training and validation loss curves of one fit.

    Args:
        history: object whose `.history` dict holds the per-epoch 'loss'
            and 'val_loss' lists (as returned by `model.fit`).
        start: index of the first epoch to draw; earlier epochs are skipped.
        fold: zero-based fold index, shown one-based in the title.
    """
    curves = history.history
    epochs = np.arange(len(curves['loss']))[start:]

    plt.figure(figsize=(15, 5))
    series = (
        ('loss', 'train', 'blue'),
        ('val_loss', 'validation', 'red'),
    )
    for key, label, color in series:
        plt.plot(epochs, curves[key][start:], label=label, color=color)

    plt.ylabel('Loss', size=14)
    plt.title(f"Fold: {fold+1}")
    plt.legend()
    plt.show()

In [None]:
# Read the competition files
df_train = pd.read_csv('../input/tabular-playground-series-nov-2021/train.csv')
df_test = pd.read_csv('../input/tabular-playground-series-nov-2021/test.csv')
sample_submission = pd.read_csv('../input/tabular-playground-series-nov-2021/sample_submission.csv')

# Training label and features (the 'id' column is not a feature)
y = df_train['target'].copy()
X = df_train.drop(columns=['id', 'target']).copy()

# Test features
X_test = df_test.drop(columns='id').copy()

# Min-max scaling to [0, 1]: the scaler is fitted on the training features
# only and then applied unchanged to the test features.
scaler = MinMaxScaler(feature_range=(0, 1))

train_scaled = scaler.fit_transform(X)
X = pd.DataFrame(train_scaled, columns=X.columns)

test_scaled = scaler.transform(X_test)
X_test = pd.DataFrame(test_scaled, columns=X_test.columns)

In [None]:
### define callbacks
# Both callbacks watch the same quantity: the validation loss, lower is better.
_watch_val_loss = dict(monitor='val_loss', mode='min')

# Stop after 20 epochs without improvement and roll back to the best weights.
early_stopping = EarlyStopping(
    min_delta=0,
    patience=20,
    verbose=0,
    baseline=None,
    restore_best_weights=True,
    **_watch_val_loss
)

# Multiply the learning rate by 0.2 after 5 epochs without improvement.
reduce_lr = ReduceLROnPlateau(
    factor=0.2,
    patience=5,
    **_watch_val_loss
)

In [None]:
### create baseline-model
def get_model(name:str):
    """
    Build the (uncompiled) dense baseline network.

    The input has one value per column of the global feature frame `X`.
    It passes through a stack of Dense(swish) + Dropout(0.2) blocks whose
    widths are given by `layers`. The output of every block except the last
    is kept as a skip connection; the last block's output is concatenated
    with all skips and fed to a single sigmoid unit.

    Args:
        name: name given to the Keras model.

    Returns:
        The Keras `Model`; the caller is responsible for compiling it.
    """
    # `shape` must be a tuple: `(n)` without the trailing comma is a bare int.
    inputs_sequence = Input(shape=(X.shape[1],))
    x = Flatten()(inputs_sequence)

    skips = []
    layers = [128, 64, 32]
    last_layer = len(layers) - 1
    for layer, nodes in enumerate(layers):
        # Use the width declared for this block (it was hard-coded to 128,
        # which silently ignored the `layers` list).
        x = Dense(nodes, activation='swish')(x)
        x = Dropout(0.2)(x)
        if layer != last_layer:
            skips.append(x)

    x = Concatenate(axis=1)([x] + skips)
    output_class = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=inputs_sequence, outputs=output_class, name=name)

    return model

In [None]:
# Build one instance of the network for inspection (text summary here, and
# the architecture plot); the training loop builds a fresh model per fold.
model = get_model(name='Baseline')
model.summary()

In [None]:
# Draw the layer graph of `model`, with tensor shapes and layer names,
# into baseline.png.
plot_model(model, to_file='baseline.png', show_shapes=True, show_layer_names=True)

In [None]:
# Pick the distribution strategy: a TPU strategy when a TPU can be reached,
# otherwise whatever strategy TensorFlow currently has in effect.
try:
    # detect and init the TPU
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
    # instantiate a distribution strategy
    tf_strategy = tf.distribute.experimental.TPUStrategy(tpu)
    print("Running on TPU:", tpu.master())
except Exception as tpu_error:
    # Deliberately broad: any failure to reach a TPU means "fall back".
    # Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit
    # propagate, and the reason for the fallback is reported.
    print(f"TPU not available ({type(tpu_error).__name__}: {tpu_error})")
    tf_strategy = tf.distribute.get_strategy()
    print(f"Running on {tf_strategy.num_replicas_in_sync} replicas")
    print("Number of GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

In [None]:
N_FOLDS = 10
FEATURE_IMPORTANCE = True

### cross-validation
cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=1)

# Per-fold training history (metric name -> list of per-epoch values)
scores = {fold: None for fold in range(cv.n_splits)}
# One vector of test-set predictions per fold
predictions = []
# Out-of-fold predictions, aligned with the rows of X
oof = np.zeros(len(X))

with tf_strategy.scope():
    for fold, (idx_train, idx_valid) in enumerate(cv.split(X, y)):
        X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
        X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

        # Seed BEFORE the model is built so that weight initialisation is
        # covered by the per-fold seed as well, not only batch shuffling
        # and dropout.
        seed_everything(SEED+fold)

        model = get_model(name='Baseline')

        model.compile(
            keras.optimizers.Adam(learning_rate=0.001),
            loss='binary_crossentropy',
            # Name the metric explicitly so the history keys read below are
            # always 'auc' / 'val_auc', whatever name Keras would otherwise
            # generate for the metric of each new model.
            metrics=[keras.metrics.AUC(name='auc')]
        )

        print('**'*60)
        print(f"Fold {fold+1} | Now training ...", end=" ")

        history = model.fit(
            X_train, y_train,
            validation_data=(X_valid, y_valid),
            batch_size=1024,
            epochs=1000,
            verbose=0,
            shuffle=True,
            callbacks=[
                early_stopping,
                reduce_lr
            ]
        )

        plot_history(history, start=0, fold=fold)

        scores[fold] = history.history

        # AUC is "higher is better": the best epoch is the maximum.
        print(f"Best training AUC: {np.max(scores[fold]['auc']):0.5f}")
        print(f"Best validation AUC: {np.max(scores[fold]['val_auc']):0.5f}")

        oof[idx_valid] = np.ravel(model.predict(X_valid))
        predictions.append(np.ravel(model.predict(X_test)))

        if FEATURE_IMPORTANCE and fold == 0:
            # Permutation feature importance on the first fold only
            results = []
            print('Computing DNN feature importance...')

            # Compute baseline (no shuffle)
            oof_preds = oof[idx_valid]
            baseline = roc_auc_score(y_true=y_valid, y_score=oof_preds)
            results.append({'feature':'BASELINE','roc_auc':baseline})

            cols = list(X.columns)
            valid_values = X_valid.to_numpy()
            for k, col in enumerate(cols):

                # Permute feature k on a private copy: the validation frame
                # is never modified, so nothing has to be restored and the
                # code does not depend on pandas handing out writable views.
                permuted = valid_values.copy()
                permuted[:, k] = np.random.permutation(permuted[:, k])

                # Computing OOF ROC-AUC with shuffled feature k
                imp_oof_preds = np.ravel(
                    model.predict(permuted, batch_size=1024, verbose=0)
                )
                imp_roc_auc = roc_auc_score(y_true=y_valid, y_score=imp_oof_preds)
                results.append({'feature':col,'roc_auc':imp_roc_auc})

            # Display DNN feature importance: one bar per feature (plus the
            # baseline), sorted by the ROC-AUC obtained with it permuted.
            print()
            df = pd.DataFrame(results)
            df = df.sort_values('roc_auc', ascending=False)
            plt.figure(figsize=(10, 20))
            plt.barh(np.arange(len(cols)+1),df.roc_auc)
            plt.yticks(np.arange(len(cols)+1),df.feature.values)
            plt.title('DNN Feature Importance',size=16)
            plt.ylim((-1, len(cols)+1))
            plt.plot([baseline, baseline],[-1, len(cols)+1], '--', color='orange',
                     label=f'Baseline OOF\nROC-AUC={baseline:.3f}')
            plt.xlabel(f'Fold {fold+1} OOF ROC-AUC with feature permuted',size=14)
            plt.ylabel('Feature',size=14)
            plt.legend()
            plt.show()

In [None]:
### average predictions over each fold and create submission file
# One column per fold, one row per test sample; the row mean is the blend.
fold_matrix = np.column_stack(predictions)
sample_submission['target'] = fold_matrix.mean(axis=1)
sample_submission.to_csv('./nn_baseline.csv', index=False)