# TPS-June2021 - Simple Feed-Forward Network
Original notebook was cloned from @pranjalchatterjee

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import pylab
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Setting up the Data
Now, we'll load and take a look at the data.

In [None]:
## categorical data encoding
# The encoder is left at its default sparse output and densified with
# .toarray() below. The keyword that requests dense output was renamed from
# `sparse` to `sparse_output` in scikit-learn 1.2, so not passing it keeps
# this cell working on either side of that change.
OH_encoder = OneHotEncoder(handle_unknown='ignore')

## read train data
train_data = pd.read_csv('../input/tabular-playground-series-jun-2021/train.csv', index_col='id')
# The first 75 columns are used as features (the label is read from `target`)
X = train_data.iloc[:, :75]
# One-Hot encode the labels
y = pd.DataFrame(OH_encoder.fit_transform(train_data[['target']]).toarray())
y = y.rename(columns={i: f'Class_{i+1}' for i in range(9)})
# train_data.head()

## scale train data
# I observed StandardScaler gives better performance than MinMaxScaler
sc = StandardScaler()
X = sc.fit_transform(X)

## read test data
test_data = pd.read_csv('../input/tabular-playground-series-jun-2021/test.csv', index_col='id')

## scale test data
# Reuse the mean/variance learned from the training features. Re-fitting the
# scaler here would standardise the test set with different statistics, so
# the model would see test inputs on a different scale than it was trained on.
X_test = sc.transform(test_data)
# test_data.head()

In [None]:
# Rows per target class -- the class distribution is imbalanced
train_data['target'].value_counts()

In [None]:
# Fit a 2-component PCA on the training features, then project both the
# training and the test features with that same fitted model.
pca = PCA(n_components=2).fit(X)
X_pca, X_test_pca = (pca.transform(features) for features in (X, X_test))

In [None]:
## no distinguishable boundary as such
# Colour each point by its class number, i.e. the integer that follows the
# last underscore in the target label.
colors = [int(label.rsplit('_', 1)[-1]) for label in train_data.target]
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=colors, cmap='viridis')

In [None]:
## How many principal components are needed to retain 50% .. 95% of the variance?
## Observation from the plot: the number of components grows roughly linearly
## with the retained-variance ratio, which suggests the variance is spread
## across many components rather than concentrated in the first few.

ratios = [round(i*0.05, 2) for i in range(10, 20)]
dimensions = []
for r in ratios:
    # A float n_components in (0, 1) makes PCA keep just enough components to
    # explain that share of the variance; the resulting count is exposed as
    # `n_components_`, so the data does not need to be projected to read it.
    # A separate name is used so the 2-component `pca` fitted earlier is not
    # overwritten.
    ratio_pca = PCA(r).fit(X)
    dimensions.append(ratio_pca.n_components_)

plt.plot(ratios, dimensions)
plt.xlabel('variance ratio retained')
plt.ylabel('number of principal components')
plt.show()

## The Model
How to order Activation, Dropout and BatchNormalization is still a matter of discussion. There is no pre-defined rule here; it is most probably a matter of opinion, as this is empirical work. I found this [thread](https://stackoverflow.com/questions/39691902/ordering-of-batch-normalization-and-dropout) useful.   

- Activation and Dropout: for some functions such as ReLU the order doesn't matter - [check here](https://sebastianraschka.com/faq/docs/dropout-activation.html). For other activation functions, dropout is typically applied after the activation layer.  
- BN is used after Dropout because I found this [argument](https://stackoverflow.com/a/50698801/5094187) intuitive.
- Dropout rate kept low pertaining to [discussions in this direction](https://stackoverflow.com/a/59001644/5094187).  
- No dropout in last layer as per [discussions](https://stats.stackexchange.com/questions/361700/lack-of-batch-normalization-before-last-fully-connected-layer)  
So the order I've used **Activation->Dropout->BN**

**But** [this paper](https://arxiv.org/pdf/2107.02279.pdf) on design smells in DL programs recommends **Activation->BN->Dropout** with support from the literature. Hence both configurations need to be checked.

We'll then compile the model with the Adam optimizer, the categorical_crossentropy as loss and metric.

In [None]:
def my_model(ip_size):
    """Build and compile the feed-forward classifier.

    The network is three ReLU Dense blocks (96, 128 and 256 units), each
    followed by Dropout, with BatchNormalization after the first two, and a
    9-unit softmax output layer.

    Args:
        ip_size: number of input features per sample.

    Returns:
        A ``keras.Sequential`` model compiled with Adam (learning rate 0.0005),
        using categorical cross-entropy as both the loss and the metric.
    """
    # (units, dropout rate, add BatchNormalization after the dropout?)
    hidden_blocks = [
        (96, 0.2, True),
        (128, 0.2, True),
        # No BatchNormalization after the last hidden block: its dropout
        # output feeds the softmax layer directly.
        (256, 0.25, False),
    ]

    stack = [layers.InputLayer([ip_size])]
    for units, drop_rate, with_batch_norm in hidden_blocks:
        stack.append(layers.Dense(units, activation='relu'))
        stack.append(layers.Dropout(rate=drop_rate))
        if with_batch_norm:
            stack.append(layers.BatchNormalization())
    stack.append(layers.Dense(9, activation='softmax'))

    network = keras.Sequential(stack)
    network.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.0005),
        loss='categorical_crossentropy',
        metrics=['categorical_crossentropy'],
    )
    return network

## Training and Validation
Now, we'll set up the training and validation data.

In [None]:
# Hold out 10% of the rows for validation, stratified on the one-hot labels
split_options = dict(train_size=0.9, test_size=0.1, stratify=y, random_state=0)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)

# Keep as many principal components as needed to explain 85% of the variance.
# The PCA is fitted on the training split only and then applied unchanged to
# the validation split.
pca_final = PCA(0.85)
X_train, X_valid = pca_final.fit_transform(X_train), pca_final.transform(X_valid)

Then, we'll train the model on the training data and check it against the validation data. We'll train for many epochs and use an early-stopping callback to halt once the validation loss stops improving.

In [None]:
# Stop once the validation loss has not improved by at least `min_delta` for
# `patience` consecutive epochs.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=50,
    min_delta=0.0005,
    # Roll back to the weights of the best validation epoch. Otherwise the
    # model used for prediction keeps the weights from the last epoch run,
    # which can be up to `patience` epochs past the best one.
    restore_best_weights=True,
    verbose=1
)

model = my_model(X_train.shape[1])
history = model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    batch_size=90,
    epochs=200,
    callbacks=[early_stopping]
)

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: the loss that is being optimised
ax[0].plot(history.history['loss'], label="Training loss")
ax[0].plot(history.history['val_loss'], label="Validation loss")
ax[0].legend(loc='best', shadow=True)

# Right panel: the tracked metric. These history keys hold categorical
# cross-entropy, not accuracy, so the curves are labelled accordingly.
ax[1].plot(history.history['categorical_crossentropy'],
           label="Training categorical cross-entropy")
ax[1].plot(history.history['val_categorical_crossentropy'],
           label="Validation categorical cross-entropy")
ax[1].legend(loc='best', shadow=True)

plt.setp(ax[:], xlabel='epoch')
plt.setp(ax[0], ylabel='loss')
plt.setp(ax[1], ylabel='categorical cross-entropy')

plt.show()

## Prediction

In [None]:
# Project the test features with the PCA fitted on the training split, then
# predict one score per class for every test row.
predictions = model.predict(pca_final.transform(X_test))

# `test_data` was loaded with index_col='id', so its index already holds the
# ids in the same row order as X_test. Using it as the frame index attaches
# each id to its prediction without reading test.csv a second time.
output = pd.DataFrame(
    predictions,
    index=test_data.index,
    columns=[f'Class_{i+1}' for i in range(9)],
)

# The index is named 'id', so it is written as the first CSV column.
output.to_csv('submission.csv')
print("Your submission was successfully saved!")