# Tabular Playground Series - Feb 2022

**(using Denoising AutoEncoder Network)**


# Data Loading

In [None]:
import numpy as np
import pandas as pd 

# Kaggle input directory of the competition (the trailing slash lets file
# names be appended directly).
maindir = "../input/tabular-playground-series-feb-2022/"

# Full paths of the training set, the test set and the sample submission.
traincsv = f"{maindir}train.csv"
testcsv = f"{maindir}test.csv"
# NOTE: `submission` is a path string here; the loading cell rebinds the same
# name to the DataFrame read from this path.
submission = f"{maindir}sample_submission.csv"

In [None]:
# Read the three competition files. `submission` is rebound from the path
# string it held so far to the loaded DataFrame.
train, test = pd.read_csv(traincsv), pd.read_csv(testcsv)
submission = pd.read_csv(submission)

# Sanity check: shapes of the three frames.
print(train.shape, test.shape, submission.shape)

In [None]:
train.head()

In [None]:
test.head()

In [None]:
submission.head()

In [None]:
# X is the training frame itself (an alias, not a copy, so it still carries
# the `target` column); y is that label column.
X, y = train, train['target']

In [None]:
print(X.shape, y.shape)

# Denoising AutoEncoder Network
Our next model is a Denoising AutoEncoder (DAE), based on the implementation in the notebook "[TPS-12] Denoising AutoEncoder NN (Keras)".

In [None]:
import os
import gc
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tensorflow import keras
import matplotlib.pyplot as plt 
from sklearn.impute import SimpleImputer
from tensorflow.keras.metrics import AUC
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
from sklearn.metrics import accuracy_score
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.callbacks import ModelCheckpoint
from imblearn.over_sampling import SMOTE,SMOTENC,SVMSMOTE
from imblearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score,make_scorer
from sklearn.metrics import precision_score, recall_score, confusion_matrix,classification_report
from sklearn.metrics import f1_score, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from datetime import date
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.preprocessing import QuantileTransformer, KBinsDiscretizer
from tensorflow.keras.layers import Input, Dense, Dropout, Embedding, Flatten, Add

In [None]:
SEED = 2022


def seed_everything(seed=SEED):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module and NumPy's global generator, exports
    ``PYTHONHASHSEED``, and, when TensorFlow can be imported, seeds
    TensorFlow's global generator as well.

    Args:
        seed: Integer seed applied to all generators. Defaults to ``SEED``.
    """
    random.seed(seed)
    # Hash randomisation is decided at interpreter start-up, so this export
    # only takes effect in processes launched from here on.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    try:
        # Imported locally so the helper still works where TensorFlow is
        # not installed.
        import tensorflow as tf
    except ImportError:
        return
    tf.random.set_seed(seed)


seed_everything(SEED)

# Modelling Steps:

In [None]:
# --- Training configuration ---
RUNS = 1
FOLDS = 10          # 10 for a full run; lower it only while debugging
SEED = 2022
EPOCHS = 120        # 120 for a full run; set to 1 while debugging
VERBOSE = 1
LR = 1.2e-4
BATCH_SIZE = 32     # 1024

# --- Paths ---
CHECKPOINT_FILEPATH = './'
INPUT_PATH = './'


def metric(y_true, y_pred):
    """Return the accuracy of `y_pred` measured against `y_true`."""
    score = accuracy_score(y_true, y_pred)
    return score


# --- Data aliases ---
X_test = test

ID_COL = X['row_id']
TARGET_COL = y

# Feature columns: every test column except the identifier and the label.
features = [name for name in X_test.columns if name not in ('row_id', 'target')]
target = y.copy()

In [None]:
train = X 
train.head()

In [None]:
X_test.head()

In [None]:
train.shape, X_test.shape

In [None]:
import os; os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import gc
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.compose import make_column_transformer
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.models import Model, Sequential
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

In [None]:
train.head()

In [None]:
train_df = train
test_df = X_test

In [None]:
from sklearn.preprocessing import LabelEncoder, RobustScaler

# Map the class labels to integer codes; `le` is kept so predictions can be
# converted back with `inverse_transform`.
le = LabelEncoder()
target = le.fit_transform(train_df['target'])

# Every column except the identifier and the label gets scaled.
cols = [name for name in train_df.columns if name not in ('row_id', 'target')]

# Fit the robust scaler on the training frame only, then apply the same
# fitted scaler to both frames (written back into the frames themselves).
scaler = RobustScaler().fit(train_df[cols])
train_df[cols] = scaler.transform(train_df[cols])
test_df[cols] = scaler.transform(test_df[cols])

def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of `df` in place to smaller dtypes.

    Integer columns are cast to the narrowest of int8/int16/int32/int64 whose
    range contains the column's minimum and maximum (limits inclusive).
    Float columns are cast to float32 when both extremes fit into float32,
    otherwise to float64. Columns of any other dtype are left untouched.

    Args:
        df: DataFrame to shrink; its columns are replaced in place.
        verbose: When true, print the final memory footprint and the
            percentage saved.

    Returns:
        The same `df` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float32_limits = np.finfo(np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Candidates are ordered narrowest first; the limits are
            # inclusive so a column touching e.g. -128 or 127 still fits
            # into int8.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif float32_limits.min <= c_min and c_max <= float32_limits.max:
            df[col] = df[col].astype(np.float32)
        else:
            # Out-of-range, infinite or all-NaN columns end up here.
            df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# Downcast both frames before adding the engineered columns.
train_df = reduce_mem_usage(train_df)
test_df = reduce_mem_usage(test_df)

# Per-row count of missing values.
train_df['sum_na'] = train_df.isna().sum(axis=1)
test_df['sum_na'] = test_df.isna().sum(axis=1)

# Row-wise summary statistics are taken over the scaled feature columns only.
# The identifier (`row_id`), the label and the `sum_na` count are excluded:
# they are not measurements, and letting `row_id` into the mean/min/max would
# turn those statistics into a disguised copy of an arbitrary identifier.
cols = [col for col in test_df.columns if col not in ('row_id', 'target', 'sum_na')]
for data in (train_df, test_df):
    row_values = data[cols]
    data["mean"] = row_values.mean(axis=1)
    data["min"] = row_values.min(axis=1)
    data["max"] = row_values.max(axis=1)

# Model inputs: every test column (engineered ones included) except id/label.
features = [feat for feat in test_df.columns if feat not in ['row_id', 'target']]
X = train_df[features]

# Defining A Data Pipeline Preprocessor

In [None]:
# Model inputs: every column of the test frame except the identifier/label.
features = [name for name in test_df.columns if name not in ('row_id', 'target')]

# One-step pipeline that standardises its input columns.
data_pipe_transformer = make_pipeline(StandardScaler())

# Apply that pipeline to all feature columns.
preprocessor = make_column_transformer((data_pipe_transformer, features))

# TPU Configuration

In [None]:
# Set to True to build and compile the models inside `strategy.scope()` (TPU run).
TPU = False     # False: the training cell builds the models without the strategy scope (GPU run)

In [None]:
# Try to attach to a TPU; if that fails for any reason, fall back to
# TensorFlow's default distribution strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except Exception as tpu_error:
    # `Exception` (rather than a bare `except:`) keeps the best-effort
    # fallback while letting KeyboardInterrupt / SystemExit propagate.
    print(f'TPU not available ({type(tpu_error).__name__}); using the default strategy.')
    strategy = tf.distribute.get_strategy()

print('Replicas:', strategy.num_replicas_in_sync)

The cells below train each model type on the dataset. For each model we train, we save its test predictions as well as its out-of-fold predictions (for later stacking).

In [None]:
# Accumulators filled by the training cell: out-of-fold prediction arrays
# and per-fold test predictions.
total_oof_list, test_pred_list = [], []

# Implementation

In [None]:
class SwapRowNoise:
    """Swap-noise corruption for denoising-autoencoder inputs.

    Each cell is, independently with probability `proba`, replaced by the
    value found in the same column of a randomly drawn row. Donor rows are
    drawn per corrupted cell, so the corrupted values of a column follow
    that column's own distribution instead of all being copied from one
    single row.
    """

    def __init__(self, proba):
        # Probability that any individual cell is replaced.
        self.proba = proba

    def apply(self, X):
        """Return a corrupted copy of `X` as a NumPy array.

        Uses NumPy's global random generator, so `np.random.seed` makes the
        corruption reproducible.

        Args:
            X: 2-D table of shape (n_rows, n_cols); a pandas DataFrame or
                anything else `np.asarray` accepts.

        Returns:
            An `np.ndarray` with the shape of `X`. `X` is not modified.
        """
        values = np.asarray(X)
        corrupted = values.copy()
        n_rows = values.shape[0]
        if n_rows == 0:
            # Nothing to draw donors from.
            return corrupted

        swap_mask = np.random.random(values.shape) < self.proba
        # Only the selected cells need a donor, which keeps the temporary
        # index arrays proportional to the number of corrupted cells.
        row_idx, col_idx = np.nonzero(swap_mask)
        donor_rows = np.random.randint(low=0, high=n_rows, size=row_idx.size)
        corrupted[row_idx, col_idx] = values[donor_rows, col_idx]
        return corrupted
    
# create autoencoder
class EncodingLayer(layers.Layer):
    """Encoder made of three stacked dense layers of equal width.

    The output is the concatenation of all three layers' activations.
    """

    def __init__(self, encoding_dim, activation='relu'):
        super().__init__()
        self.enc1 = layers.Dense(units=encoding_dim, activation=activation)
        self.enc2 = layers.Dense(units=encoding_dim, activation=activation)
        self.enc3 = layers.Dense(units=encoding_dim, activation=activation)
        self.concat = layers.Concatenate()

    def call(self, inputs):
        """Run the three dense layers in sequence and concatenate their outputs."""
        stage_outputs = []
        hidden = inputs
        for dense in (self.enc1, self.enc2, self.enc3):
            hidden = dense(hidden)
            stage_outputs.append(hidden)
        return self.concat(stage_outputs)

class DecodingLayer(layers.Layer):
    """Decoder head: a single dense projection to `num_outputs` units."""

    def __init__(self, num_outputs, activation='linear'):
        super().__init__()
        self.dec = layers.Dense(units=num_outputs, activation=activation)

    def call(self, inputs):
        """Project `inputs` through the dense layer."""
        reconstruction = self.dec(inputs)
        return reconstruction

# create custom layer
class DenseBlock(layers.Layer):
    """Dense layer followed by batch normalisation and dropout.

    Args:
        units: Width of the dense layer.
        activation: Activation of the dense layer.
        dropout_rate: Rate handed to the dropout layer.
        l2: Factor of the L2 penalty on the dense kernel.
    """

    def __init__(self, units, activation='relu', dropout_rate=0, l2=0):
        super().__init__()
        kernel_penalty = keras.regularizers.l2(l2)
        self.dense = layers.Dense(
            units=units,
            activation=activation,
            kernel_regularizer=kernel_penalty,
        )
        self.batchn = layers.BatchNormalization()
        self.dropout = layers.Dropout(rate=dropout_rate)

    def call(self, inputs):
        """Apply dense -> batch norm -> dropout."""
        return self.dropout(self.batchn(self.dense(inputs)))

# Model Configuration

In [None]:
# Create Auto-Encoder NN
class AutoEncoder(keras.Model):
    """Autoencoder: an `EncodingLayer` followed by a `DecodingLayer`.

    Args:
        encoding_dim: Width of each dense layer inside the encoder.
        num_outputs: Number of units the decoder produces.
        activation: Activation used by the encoder's dense layers.
    """

    def __init__(self, encoding_dim, num_outputs, activation='relu'):
        super().__init__()
        self.encoder = EncodingLayer(encoding_dim=encoding_dim, activation=activation)
        self.decoder = DecodingLayer(num_outputs=num_outputs)

    def call(self, inputs):
        """Encode `inputs`, then decode the encoding."""
        return self.decoder(self.encoder(inputs))

    def get_encoder(self):
        """Return the encoder layer so another model can reuse it."""
        return self.encoder
        
# Create Fully-Connected NN
class MLP(keras.Model):
    """Classifier stacked on top of an autoencoder's encoder.

    Inputs pass through the encoder taken from `autoencoder`, then through
    one `DenseBlock` per entry of `hidden_layers`, then through a softmax
    layer with one unit per class known to the module-level encoder `le`.

    Args:
        hidden_layers: Iterable with the width of each hidden `DenseBlock`.
        autoencoder: Model exposing `get_encoder()`; its encoder is reused.
        activation: Activation of the hidden blocks.
        dropout_rate: Dropout rate of every hidden block.
        l2: L2 kernel-regularisation factor of every hidden block.
    """

    def __init__(self, hidden_layers, autoencoder, activation='relu', dropout_rate=0, l2=0):
        super().__init__()
        self.encoder = autoencoder.get_encoder()
        # Keyword arguments are required here: DenseBlock's third positional
        # parameter is `dropout_rate`, so passing `l2` positionally would
        # feed the regularisation factor into the dropout layer and drop
        # this model's own `dropout_rate`.
        self.hidden_layers = [
            DenseBlock(units, activation, dropout_rate=dropout_rate, l2=l2)
            for units in hidden_layers
        ]
        self.softmax = layers.Dense(units=len(le.classes_), activation='softmax')
        # Not used by `call`; kept so the model's attributes stay unchanged.
        self.concat = layers.Concatenate()

    def call(self, inputs):
        """Return class probabilities for `inputs`."""
        x = self.encoder(inputs)
        for layer in self.hidden_layers:
            x = layer(x)
        x = self.softmax(x)
        return x

# Training

In [None]:
# Cross-validated training. For every run and fold:
#   1. standardise the fold (scaler fitted on the training part only),
#   2. build swap-noised copies of the training and validation features,
#   3. train an autoencoder to reconstruct the clean features from the noisy ones,
#   4. train an MLP classifier that reuses that autoencoder's encoder,
#   5. store validation probabilities in `oof_list` and test probabilities
#      in `test_pred_list`.
np.random.seed(2022)
tf.random.set_seed(2022)

# NOTE(review): `score_list` is created but never appended to in this cell.
score_list, history_list = [], []
# One (n_train_rows, n_classes) array per run, pre-filled with -1.0; each fold
# overwrites the rows of its validation indices.
oof_list = [np.full((len(train_df), len(le.classes_)), -1.0, dtype='float32') for run in range(RUNS)]
for run in range(RUNS):
    # `random_state` is fixed, so every run uses the same fold split.
    kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=1)
    for fold, (train_idx, val_idx) in enumerate(kf.split(train_df, y=train_df.target)):
        print(f"Fold {run}.{fold}")
        # Reset Keras' global state before building this fold's models.
        K.clear_session()
        
        # Split rows by fold index; `target` holds the label-encoded classes.
        X_tr = train_df.iloc[train_idx]
        X_va = train_df.iloc[val_idx]
        y_tr = target[train_idx]
        y_va = target[val_idx]
        # Keep only the model input columns.
        X_tr = X_tr[features]
        X_va = X_va[features]

        # Fit the preprocessor on the training part, reuse it for validation.
        X_tr[features] = preprocessor.fit_transform(X_tr)
        X_va[features] = preprocessor.transform(X_va)

        # Noisy inputs for the autoencoder (noise probability 0.10).
        noise_maker = SwapRowNoise(0.10)
        X_noise_train = noise_maker.apply(X_tr)
        X_noise_valid = noise_maker.apply(X_va)

        # TPU path: models are created and compiled inside the strategy scope.
        if TPU:
            with strategy.scope():
                ae = AutoEncoder(encoding_dim=128,num_outputs=X_tr.shape[-1],activation='relu')
                ae.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-3),loss=keras.losses.MeanSquaredError())
            lr = ReduceLROnPlateau(monitor = "val_loss", factor = 0.5, patience = 5, verbose = VERBOSE)
            es = EarlyStopping(monitor = "val_loss", patience = 10, verbose = VERBOSE, mode = "min", restore_best_weights = True)
            # Denoising objective: noisy features in, clean features as the target.
            history_ae = ae.fit(X_noise_train, X_tr,validation_data=(X_noise_valid, X_va),epochs=EPOCHS,batch_size=BATCH_SIZE,validation_batch_size=BATCH_SIZE,shuffle=True,verbose=False,callbacks=[lr,es])

            with strategy.scope():
                # This branch uses three hidden blocks; the other branch uses two.
                model = MLP(hidden_layers=[32,32,32],autoencoder=ae,activation='relu')
                model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-3),loss="sparse_categorical_crossentropy",metrics=['acc'])

        else:
            # GPU path: same steps, without the strategy scope.
            ae = AutoEncoder(encoding_dim=128,num_outputs=X_tr.shape[-1],activation='relu')
            ae.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-3),loss=keras.losses.MeanSquaredError())
            lr = ReduceLROnPlateau(monitor = "val_loss", factor = 0.5, patience = 5, verbose = VERBOSE)
            es = EarlyStopping(monitor = "val_loss", patience = 10, verbose = VERBOSE, mode = "min", restore_best_weights = True)
            # Denoising objective: noisy features in, clean features as the target.
            history_ae = ae.fit(X_noise_train, X_tr,validation_data=(X_noise_valid, X_va),epochs=EPOCHS,batch_size=BATCH_SIZE,validation_batch_size=BATCH_SIZE,shuffle=True,verbose=False,callbacks=[lr,es])

            model = MLP(hidden_layers=[32,32],autoencoder=ae,activation='relu')
            model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-3),loss="sparse_categorical_crossentropy",metrics=['acc'])

        # Fresh callbacks for the classifier: the learning rate follows the
        # validation loss, early stopping follows the validation accuracy.
        lr = ReduceLROnPlateau(monitor="val_loss", factor=0.5,
                               patience=5, verbose=VERBOSE)

        es = EarlyStopping(monitor="val_acc", patience=10,
                           verbose=VERBOSE, mode="max",
                           restore_best_weights=True)

        # The classifier is trained on the clean (un-noised) features.
        history = model.fit(X_tr, y_tr,
                            validation_data=(X_va, y_va),
                            epochs=EPOCHS,
                            verbose=VERBOSE,
                            batch_size=BATCH_SIZE,
                            validation_batch_size=BATCH_SIZE,
                            shuffle=True,
                            callbacks=[lr, es])
        history_list.append(history.history)

        # Validation probabilities, predicted in one batch, stored as
        # out-of-fold predictions.
        y_va_pred = model.predict(X_va, batch_size=len(X_va))
        oof_list[run][val_idx] = y_va_pred
        # Convert probabilities to original labels for the accuracy score.
        y_va_pred = le.inverse_transform(np.argmax(y_va_pred, axis=1))

        accuracy = accuracy_score(train_df.iloc[val_idx].target, y_va_pred)

        print(f"Fold {run}.{fold} | Epochs: {len(history_list[-1]['loss'])} | Accuracy: {accuracy:.5f}")

        # Test probabilities for this fold; `preprocessor` still carries the
        # statistics fitted on this fold's training part.
        test_pred_list.append(model.predict(preprocessor.transform(test_df[features]), batch_size=BATCH_SIZE))

        del model, y_va_pred
        gc.collect()
# Append this model's out-of-fold arrays to the cross-model accumulator.
total_oof_list += oof_list

In [None]:
# Ensemble the folds by summing their probability arrays, pick the most likely
# class per row and map the class index back to the original label.
summed_probs = sum(test_pred_list)
best_class = np.argmax(summed_probs, axis=1)
submission['target'] = le.inverse_transform(best_class)

# Write the submission file and show its first rows.
submission.to_csv('submission.csv', index=False)
print(submission.head())