In [None]:
import os

# TF_CPP_MIN_LOG_LEVEL only silences TensorFlow's native (C++) logging when it
# is already present in the environment at the time `tensorflow` is imported,
# so it has to be set before the import below rather than after it.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import gc
from warnings import filterwarnings

import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import backend as K

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix

# suppress Python-level warnings to keep the notebook output readable
filterwarnings('ignore')

<img src="https://i.ibb.co/PWvpT9F/header.png" alt="header" border="0" width=800 height=300>

# Introduction

<div style="font-size:110%;line-height:155%">
<p>Hi,</p>
<p>while I'm still enjoying my deep-learning adventure, reading and learning a ton of stuff - I decided it's time to implement different kinds of networks with this month's competition. I've been trying to implement a denoising autoencoder since TPS-November, but with last month's data I wasn't so sure whether it would be a success. This time I finally got around to trying my luck with a <b>Denoising Autoencoder</b>. My implementation is heavily based on the <a href="https://www.kaggle.com/springmanndaniel/1st-place-turn-your-data-into-daeta">TPS-January winning solution by Danzel</a>.</p> <p>The idea here is to use an autoencoder to learn more meaningful features by discovering latent variables. The architecture here will create three encoding layers, concatenated and then fed into a fully-connected neural network. To avoid learning the identity-function while blowing up the dimensionality, noise will be injected. In this case I implemented the Swap-Row-Noise function using a noise probability of 10%.</p>
<p><em>Disclaimer: I am still testing different setups, since training and finetuning the autoencoder does not seem to be so trivial.</em></p>
    
<blockquote><img src="https://i.ibb.co/j8n07rn/Deepstack-DAE.png" width="50%" alt="Deepstack-DAE" border="0"></blockquote>
    
<p>Feel free to take a look at my other notebooks, covering some different ideas and architectures:</p>
<ul>
    <li><a href="https://www.kaggle.com/mlanhenke/tps-12-simple-nn-baseline-keras">Simple NN Baseline</a></li>
    <li><a href="https://www.kaggle.com/mlanhenke/tps-12-deep-wide-nn-keras">Deep &amp; Wide NN</a></li>
    <li><a href="https://www.kaggle.com/mlanhenke/tps-12-bn-autoencoder-nn-keras">Bottleneck Autoencoder</a></li>
    <li><a href="https://www.kaggle.com/mlanhenke/tps-12-deep-cross-nn-keras">Deep &amp; Cross NN</a></li>
</ul>
    
<em>Thank you very much for taking some time to read my notebook. Please leave an upvote if you find any of this information useful.</em>
</div>

# Import & Prepare Data

In [None]:
# read the competition files: training set, test set and submission template
df_train = pd.read_csv('../input/tabular-playground-series-dec-2021/train.csv')
df_test = pd.read_csv('../input/tabular-playground-series-dec-2021/test.csv')
sample_submission = pd.read_csv('../input/tabular-playground-series-dec-2021/sample_submission.csv')

# remove all rows of the underrepresented class (Cover_Type == 5)
df_train = df_train.loc[df_train['Cover_Type'] != 5]

# soil-type columns that are excluded from both feature matrices
unused_soil_cols = ['Soil_Type7', 'Soil_Type15', 'Soil_Type1']

# features / labels for modeling; the id column is never used as a feature
X = df_train.drop(columns=['Id', 'Cover_Type', *unused_soil_cols]).copy()
y = df_train['Cover_Type'].copy()
X_test = df_test.drop(columns=['Id', *unused_soil_cols]).copy()

# label-encode the classes, then one-hot encode them for the softmax
# output used in multiclass classification
le = LabelEncoder()
y_encoded = le.fit_transform(y)
target = keras.utils.to_categorical(y_encoded)

# the raw frames are no longer needed; free their memory
del df_train, df_test
gc.collect()

print(X.shape, y.shape, target.shape, X_test.shape)

# Model Setup

In [None]:
# define helper functions
def set_seed(seed):
    """Seed the global random number generators and report the seed.

    Seeds Python's built-in ``random`` module, NumPy's global generator and
    TensorFlow's global generator with the same value.

    Parameters
    ----------
    seed : int
        Value handed to all three generators.
    """
    import random  # local import: only this helper needs the module

    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    print(f"Seed set to: {seed}")

def plot_eval_results(scores, n_splits):
    """Plot the training and validation loss curves of every fold.

    One panel is drawn per fold on a grid that is five panels wide. The
    figure height grows with the number of rows and panels without a fold
    are hidden.

    Parameters
    ----------
    scores : mapping or sequence
        ``scores[fold]`` must provide the per-epoch values under the keys
        ``'loss'`` and ``'val_loss'`` for ``fold`` in ``0 .. len(scores) - 1``.
    n_splits : int
        Number of cross-validation folds.
    """
    cols = 5
    # make sure the grid can hold every entry of `scores`, even if n_splits is smaller
    n_panels = max(n_splits, len(scores))
    rows = max(1, int(np.ceil(n_panels / cols)))

    # 2.5 inches per row keeps the single-row layout unchanged;
    # squeeze=False guarantees a 2-D array of axes for any grid shape
    fig, ax = plt.subplots(
        rows, cols, tight_layout=True, figsize=(20, 2.5 * rows), squeeze=False
    )
    ax = ax.flatten()

    curves = (('loss', 'train_loss'), ('val_loss', 'valid_loss'))

    for fold in range(len(scores)):
        history = scores[fold]

        for key, label in curves:
            values = history[key]
            sns.lineplot(
                x=np.arange(len(values)),
                y=values,
                label=label,
                ax=ax[fold]
            )

        ax[fold].set_title(f"Fold {fold + 1}")
        ax[fold].set_ylabel('')

    # hide grid cells that have no fold to show
    for unused in ax[len(scores):]:
        unused.set_visible(False)

    sns.despine(fig=fig)

def plot_cm(cm):
    """Draw accuracy, recall and precision heatmaps for a confusion matrix.

    Only the diagonal of each heatmap is shown; off-diagonal cells are masked.

    * accuracy  - every cell divided by the total number of samples, so the
      diagonal entries add up to the overall accuracy.
    * recall    - every row divided by its row sum.
    * precision - every column divided by its column sum.

    Parameters
    ----------
    cm : pandas.DataFrame or 2-D array-like
        Square confusion matrix. Rows are expected to hold the true classes
        and columns the predicted classes (the layout produced by
        ``sklearn.metrics.confusion_matrix``).
    """
    cm = pd.DataFrame(cm)

    # Use explicit axes for the normalisation: `DataFrame / Series` always
    # aligns the Series with the *columns*, and `DataFrame.sum()` returns
    # column sums rather than the grand total.
    metrics = {
        'accuracy': cm / cm.to_numpy().sum(),
        'recall': cm.div(cm.sum(axis=1), axis=0),
        'precision': cm.div(cm.sum(axis=0), axis=1),
    }

    fig, ax = plt.subplots(1, 3, tight_layout=True, figsize=(15, 5))
    ax = ax.flatten()

    # True everywhere except on the diagonal -> only per-class values remain visible
    off_diagonal = ~np.eye(cm.shape[0], dtype=bool)
    palette = sns.dark_palette("#69d", reverse=True, as_cmap=True)

    for axis, (name, matrix) in zip(ax, metrics.items()):
        axis.set_title(name)

        sns.heatmap(
            data=matrix,
            cmap=palette,
            cbar=False,
            mask=off_diagonal,
            linewidths=0.25,
            annot=True,
            fmt='.2f',
            ax=axis
        )

    sns.despine(fig=fig)

In [None]:
# callbacks, both driven by the validation loss

# multiply the learning rate by 0.5 after 5 epochs without improvement
lr = keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=5, verbose=True)

# stop after 10 epochs without a new minimum and restore the best weights
es = keras.callbacks.EarlyStopping(
    monitor="val_loss", mode="min", patience=10, restore_best_weights=True, verbose=True
)

In [None]:
class SwapRowNoise:
    """Swap noise for tabular data.

    Every cell is, with probability ``proba``, replaced by the value found in
    the *same column* of a randomly drawn row. Donor rows are drawn
    independently per cell, so the corrupted values of a row come from many
    different rows rather than from one single donor row.

    Random numbers are taken from NumPy's global generator, so seeding it
    (``np.random.seed``) makes the corruption reproducible.
    """

    def __init__(self, proba):
        # probability that an individual cell gets swapped
        self.proba = proba

    def apply(self, X):
        """Return a corrupted copy of ``X``; ``X`` itself is left untouched.

        Parameters
        ----------
        X : pandas.DataFrame or 2-D array-like of shape (n_rows, n_cols)
            Data to corrupt.

        Returns
        -------
        numpy.ndarray
            Array of the same shape as ``X``.
        """
        values = np.asarray(X)
        corrupted = values.copy()

        n_rows = values.shape[0]
        if n_rows == 0:
            # nothing to swap, and randint(0, 0) would raise
            return corrupted

        # work column by column to keep the temporary arrays small
        for col in range(values.shape[1]):
            swap = np.random.random(n_rows) < self.proba
            donors = np.random.randint(low=0, high=n_rows, size=int(swap.sum()))
            corrupted[swap, col] = values[donors, col]

        return corrupted
    
# create autoencoder
class EncodingLayer(layers.Layer):
    """Three stacked dense layers whose outputs are concatenated.

    The input runs through ``enc1``, ``enc2`` and ``enc3`` in sequence; the
    layer returns the concatenation of all three intermediate outputs, i.e.
    ``3 * encoding_dim`` features.
    """

    def __init__(self, encoding_dim, activation='relu'):
        super().__init__()
        self.enc1 = layers.Dense(units=encoding_dim, activation=activation)
        self.enc2 = layers.Dense(units=encoding_dim, activation=activation)
        self.enc3 = layers.Dense(units=encoding_dim, activation=activation)
        self.concat = layers.Concatenate()

    def call(self, inputs):
        """Encode ``inputs`` and return the concatenated layer outputs."""
        first = self.enc1(inputs)
        second = self.enc2(first)
        third = self.enc3(second)
        return self.concat([first, second, third])

class DecodingLayer(layers.Layer):
    """Single dense layer that maps an encoding to ``num_outputs`` values."""

    def __init__(self, num_outputs, activation='linear'):
        super().__init__()
        self.dec = layers.Dense(units=num_outputs, activation=activation)

    def call(self, inputs):
        """Project ``inputs`` onto the output dimension."""
        reconstruction = self.dec(inputs)
        return reconstruction
    
class AutoEncoder(keras.Model):
    """Autoencoder made of an ``EncodingLayer`` followed by a ``DecodingLayer``.

    ``activation`` is only used by the encoder; the decoder is created with
    its default activation.
    """

    def __init__(self, encoding_dim, num_outputs, activation='relu'):
        super().__init__()
        self.encoder = EncodingLayer(encoding_dim, activation)
        self.decoder = DecodingLayer(num_outputs)

    def call(self, inputs):
        """Encode ``inputs`` and reconstruct them from the encoding."""
        return self.decoder(self.encoder(inputs))

    def get_encoder(self):
        """Return the encoder layer (the same object, not a copy)."""
        return self.encoder

In [None]:
# create custom layer
class DenseBlock(layers.Layer):
    """Dense layer followed by batch normalization and dropout.

    Parameters
    ----------
    units : int
        Width of the dense layer.
    activation : str
        Activation of the dense layer.
    dropout_rate : float
        Rate handed to the dropout layer.
    l2 : float
        Factor of the L2 kernel regularizer of the dense layer.
    """

    def __init__(self, units, activation='relu', dropout_rate=0, l2=0):
        super().__init__()
        kernel_penalty = keras.regularizers.l2(l2)
        self.dense = layers.Dense(units, activation, kernel_regularizer=kernel_penalty)
        self.batchn = layers.BatchNormalization()
        self.dropout = layers.Dropout(dropout_rate)

    def call(self, inputs):
        """Apply dense -> batch normalization -> dropout."""
        return self.dropout(self.batchn(self.dense(inputs)))

# create fully-connected NN
class MLP(keras.Model):
    """Classifier that stacks dense blocks on top of an autoencoder's encoder.

    Parameters
    ----------
    hidden_layers : iterable of int
        Width of each ``DenseBlock``; one block is created per entry.
    autoencoder : AutoEncoder
        Model whose encoder (``autoencoder.get_encoder()``) is used as the
        first stage. The encoder object is shared with the autoencoder, not copied.
    activation : str
        Activation of the dense blocks.
    dropout_rate : float
        Dropout rate of every dense block.
    l2 : float
        L2 kernel regularization factor of every dense block.
    num_classes : int, optional
        Size of the softmax output. Defaults to the width of the notebook's
        global one-hot matrix ``target``.
    """

    def __init__(self, hidden_layers, autoencoder, activation='relu', dropout_rate=0, l2=0,
                 num_classes=None):
        super().__init__()
        self.encoder = autoencoder.get_encoder()
        # Pass the regularisation settings by keyword: DenseBlock's third
        # positional parameter is `dropout_rate`, not `l2`.
        self.hidden_layers = [
            DenseBlock(units, activation, dropout_rate=dropout_rate, l2=l2)
            for units in hidden_layers
        ]
        if num_classes is None:
            num_classes = target.shape[-1]
        self.softmax = layers.Dense(units=num_classes, activation='softmax')
        # not used by call(); kept so the attribute remains available
        self.concat = layers.Concatenate()

    def call(self, inputs):
        """Return class probabilities for ``inputs``."""
        x = self.encoder(inputs)
        for layer in self.hidden_layers:
            x = layer(x)
        return self.softmax(x)

# Training

In [None]:
# Use a TPU when one can be reached, otherwise fall back to TensorFlow's
# default distribution strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
    tf_strategy = tf.distribute.experimental.TPUStrategy(tpu)
    print("Running on TPU:", tpu.master())
except Exception as tpu_error:
    # `except Exception` (instead of a bare `except:`) lets KeyboardInterrupt
    # and SystemExit propagate; the reason for the fallback is reported.
    print(f"TPU not available ({type(tpu_error).__name__}), using the default strategy")
    tf_strategy = tf.distribute.get_strategy()
    print(f"Running on {tf_strategy.num_replicas_in_sync} replicas")
    print("Number of GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

In [None]:
# seed the random generators before any noise is drawn
seed = 2021
set_seed(seed)

# corrupt the (still unscaled) feature matrix once; every fold slices from it
noise_maker = SwapRowNoise(0.10)
X_noise = noise_maker.apply(X)

# 5 stratified, shuffled folds
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# containers that are filled by the cross-validation loop
predictions = []
oof_preds = {'y_valid': [], 'y_hat': []}
scores_ae = dict.fromkeys(range(cv.n_splits))
scores_nn = dict.fromkeys(range(cv.n_splits))

# Cross-validation: for every stratified fold, (1) train a denoising
# autoencoder, (2) train a classifier on top of its encoder, (3) store the
# out-of-fold predictions and (4) predict the test set.
for fold, (idx_train, idx_valid) in enumerate(cv.split(X,y)):
    # clean features (DataFrame rows) and one-hot targets of this fold
    X_train, y_train = X.iloc[idx_train], target[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], target[idx_valid]
    # X_noise is a NumPy array, hence plain positional indexing
    X_noise_train, X_noise_valid = X_noise[idx_train], X_noise[idx_valid]

    # scale data: the scaler is fitted on the clean training rows only and
    # then applied unchanged to the noisy and the validation data
    scl = StandardScaler()
    X_train = scl.fit_transform(X_train)
    X_noise_train = scl.transform(X_noise_train)
    X_valid = scl.transform(X_valid)
    X_noise_valid = scl.transform(X_noise_valid)

    # train autoencoder (a fresh model per fold, built inside the strategy scope)
    with tf_strategy.scope():
        ae = AutoEncoder(
            encoding_dim=128,
            num_outputs=X.shape[-1],
            activation='relu'
        )

        ae.compile(
            optimizer=keras.optimizers.Adam(learning_rate=1e-3),
            loss=keras.losses.MeanSquaredError()
        )

    print('_'*65)
    print(f"Fold {fold+1} || Autoencoder Training")
    print('_'*65)

    # denoising objective: noisy rows are the input, clean rows the target
    history_ae = ae.fit(
        X_noise_train, X_train,
        validation_data=(X_noise_valid, X_valid),
        epochs=500,
        batch_size=4096,
        shuffle=True,
        verbose=False,
        callbacks=[lr,es]
    )

    # keep the per-epoch metrics of this fold
    scores_ae[fold] = history_ae.history

    print('_'*65)
    print(f"Fold {fold+1} || AE Min Val Loss: {np.min(scores_ae[fold]['val_loss'])}")
    print('_'*65)

    # train fully-connected nn; MLP takes the encoder of the autoencoder
    # trained above as its first stage (nothing in this notebook freezes it)
    with tf_strategy.scope():
        model = MLP(
            hidden_layers=[32,32,32],
            autoencoder=ae,
            activation='relu'
        )

        model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=1e-3),
            loss=keras.losses.CategoricalCrossentropy(),
            metrics=['acc']
        )

    print('_'*65)
    print(f"Fold {fold+1} || NN Training")
    print('_'*65)

    # the classifier is trained on the clean (scaled) features; the same
    # `lr` / `es` callback objects as for the autoencoder are passed in
    history_nn = model.fit(
        X_train, y_train,
        validation_data=(X_valid, y_valid),
        epochs=500,
        batch_size=4096,
        shuffle=True,
        verbose=False,
        callbacks=[lr,es]
    )

    scores_nn[fold] = history_nn.history

    # out-of-fold results: original labels and predicted class probabilities
    oof_preds['y_valid'].extend(y.iloc[idx_valid])
    oof_preds['y_hat'].extend(model.predict(X_valid, batch_size=4096))

    # test-set probabilities of this fold, scaled with this fold's scaler
    prediction = model.predict(scl.transform(X_test), batch_size=4096)
    predictions.append(prediction)

    # release the models of this fold before the next one is built
    del ae, model, prediction
    gc.collect()
    K.clear_session()

    print('_'*65)
    print(f"Fold {fold+1} || NN Min Val Loss: {np.min(scores_nn[fold]['val_loss'])}")
    print('_'*65)

# best (minimum) validation loss of every fold, for both models
overall_score_ae = [np.min(scores_ae[fold]['val_loss']) for fold in range(cv.n_splits)]
overall_score_nn = [np.min(scores_nn[fold]['val_loss']) for fold in range(cv.n_splits)]

print('_'*65)
print(f"Overall AE Mean Validation Loss: {np.mean(overall_score_ae)} || Overall NN Mean Validation Loss: {np.mean(overall_score_nn)}")

# Evaluation & Submission

In [None]:
# training vs. validation loss of the classifier, one panel per fold
plot_eval_results(scores_nn, cv.n_splits)

In [None]:
# out-of-fold ground truth (original labels) and predicted labels
oof_y_true = np.array(oof_preds['y_valid'])
oof_class_idx = np.argmax(oof_preds['y_hat'], axis=1)
oof_y_hat = le.inverse_transform(oof_class_idx)

# confusion matrix labelled with the original classes; plot_cm derives the
# accuracy, recall and precision heatmaps from it
cm = pd.DataFrame(
    data=confusion_matrix(oof_y_true, oof_y_hat, labels=le.classes_),
    index=le.classes_,
    columns=le.classes_
)
plot_cm(cm)

In [None]:
# create final prediction: sum the class probabilities of all folds (the
# argmax of the sum equals the argmax of the mean), then map the class
# indices back to the original labels
final_predictions = le.inverse_transform(np.argmax(sum(predictions), axis=1))

sample_submission['Cover_Type'] = final_predictions
sample_submission.to_csv('./baseline_nn.csv', index=False)

# distribution of the predicted classes; the values are passed explicitly as
# `x` because the first positional parameter of countplot is not `x` in every
# seaborn version
sns.countplot(x=final_predictions)
sns.despine()

In [None]:
# preview the first rows of the submission that was written above
sample_submission.head()