## Simple Keras Pipeline

- EDA : https://www.kaggle.com/subinium/tps-may-categorical-eda

In [None]:
import tensorflow.keras as keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras import backend as K
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix
from sklearn.utils import class_weight
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

#!pip install imbalanced-learn
import imblearn
from imblearn.over_sampling import SMOTE

from tensorflow.keras.losses import CategoricalCrossentropy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the three competition CSVs (train, test, sample submission) in one pass.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-may-2021/train.csv',
        '../input/tabular-playground-series-may-2021/test.csv',
        '../input/tabular-playground-series-may-2021/sample_submission.csv',
    )
)
# Preview the first rows of the training frame.
train.head()

In [None]:
# Remove the 'id' column from both frames.
train, test = (frame.drop(columns='id') for frame in (train, test))

# Number of rows per 'target' value, listed in ascending order of count
# (only the counts are shown, not the class labels they belong to).
sorted(train['target'].value_counts())

## Normalization

In [None]:
# Max-scale the 50 feature columns: each column is divided by its own maximum.
# Done as one vectorized DataFrame operation instead of a per-element lambda,
# and without rebinding the builtin `max`, which the original loop shadowed
# for every later cell of the notebook.
feature_cols = [f'feature_{i}' for i in range(50)]
feature_max = train[feature_cols].max()
# A column whose maximum is 0 would otherwise divide by zero and fill the
# column with NaN/inf; dividing by 1 leaves such a column unchanged.
feature_max = feature_max.replace(0, 1)
# DataFrame / Series aligns the Series index with the frame's column labels,
# so every column is divided by its own maximum.
train[feature_cols] = train[feature_cols] / feature_max

train.head()

In [None]:
# Encode the class labels as integer codes: labels are sorted and numbered
# from 0 in that order. (The one-hot expansion is not done in this cell.)
label_dict = {
    class_name: class_idx
    for class_idx, class_name in enumerate(sorted(train['target'].unique()))
}

# Take the label column out of the feature frame (in place) and keep its
# integer-encoded version as `target`.
target = train.pop('target').map(label_dict)

In [None]:
# Switch from pandas objects to plain NumPy arrays for the rest of the pipeline.
train = train.to_numpy()
# One-hot encode the integer class codes.
target = to_categorical(target.to_numpy())

## Split Data

In [None]:
# Balance the classes with SMOTE. The sampler is seeded with the same value
# as the train/validation split so that repeated runs produce the same
# synthetic samples; the original left it unseeded.
# NOTE(review): the whole data set is resampled here, before the
# train/validation split that follows, so synthetic points interpolated from
# rows that end up in the validation set can also land in the training set
# and inflate the validation score. Consider splitting first and resampling
# only the training part.
oversample = SMOTE(random_state=2021)
trainS, targetS = oversample.fit_resample(train, target)
# Per-class sample counts after resampling (column sums of the one-hot targets).
np.sum(targetS, axis = 0)

In [None]:
# Hold out 20% of the resampled data for validation, stratified on the
# resampled targets, with a fixed seed.
split_options = {'test_size': 0.20, 'random_state': 2021, 'stratify': targetS}
X_train, X_val, y_train, y_val = train_test_split(trainS, targetS, **split_options)

## Model (Keras)

In [None]:
# Width of the network's input layer and of its softmax output layer.
num_features, num_classes = 50, 4

The structure of the model can be changed freely; this one is an MLP built only from Dense, BatchNormalization, and Dropout layers.

In [None]:
# MLP classifier assembled layer by layer: three hidden Dense blocks
# (528 -> 256 -> 128 units), each followed by Dropout and BatchNormalization,
# and a softmax output with one unit per class.
model = Sequential()

# Hidden block 1 (also declares the input width).
model.add(Dense(528, input_dim = num_features, kernel_initializer='normal', activation='relu'))
model.add(Dropout(0.3))
model.add(BatchNormalization())

# Hidden block 2.
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.3))
model.add(BatchNormalization())

# Hidden block 3.
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(BatchNormalization())

# Output layer.
model.add(Dense(num_classes, activation = 'softmax'))

model.summary()

### Compile

In [None]:
# Compile with lightly label-smoothed categorical cross-entropy and Adam.
# `metrics` is passed as a list: that is the documented form of the argument,
# and newer Keras releases reject a bare string here.
model.compile(loss = CategoricalCrossentropy(label_smoothing = 0.001),
              optimizer = keras.optimizers.Adam(),
              metrics = ['categorical_accuracy'])

In [None]:
# Train for 60 epochs with mini-batches of 512, logging one line per epoch
# and scoring the validation split after each epoch.
fit_options = dict(
    batch_size = 512,
    epochs = 60,
    verbose = 2,
    validation_data = (X_val, y_val),
)
history = model.fit(X_train, y_train, **fit_options)

### Evaluate

In [None]:
# Evaluate on the validation split and, separately, on the original
# (pre-SMOTE) data. The original cell stored both results in `score`, so the
# validation result was discarded and the original-data numbers were printed
# under the "Val" labels; the loss was also printed with a '%' sign.
# Each evaluate() result is [loss, categorical_accuracy].
val_score = model.evaluate(X_val, y_val, verbose = 0)
# `score` still ends up holding the original-data evaluation, as before.
score = model.evaluate(train, target, verbose = 0)

print('Val loss: {}'.format(val_score[0]))
print('Val score: {}%'.format(val_score[1] * 100))
print("MLP Error: %.2f%%" % (100*(1 - val_score[1])))
print('Original-data loss: {}'.format(score[0]))
print('Original-data score: {}%'.format(score[1] * 100))

## Result visualization

In [None]:
# Plot training loss and validation loss per epoch on one shared axis.
fig, ax = plt.subplots(figsize=(20, 8))
sns.lineplot(x = history.epoch, y = history.history['loss'], ax = ax)
sns.lineplot(x = history.epoch, y = history.history['val_loss'], ax = ax)
ax.set(title='Learning Curve (Loss)', ylabel='Loss', xlabel='Epoch')
# First entry labels the training curve, second the validation curve.
ax.legend(['train', 'test'], loc='best')
plt.show()

### Confusion Matrix

In [None]:
# Confusion matrix of the model on the original (non-resampled) data:
# rows are the true classes, columns the predicted classes.
true_labels = target.argmax(axis=1)
pred = model.predict(train).argmax(axis=1)
cm = confusion_matrix(true_labels, pred)

fig, ax = plt.subplots(figsize=(9, 9))
sns.heatmap(
    cm,
    ax = ax,
    cmap ='Blues',
    annot = True,
    fmt ='d',
    cbar = False,
    square = True,
    linewidth = 0.4,
)

ax.set_xlabel('Pred', fontweight='bold')
ax.set_ylabel('True', fontweight='bold')

plt.show()

## Output

In [None]:
# Scale the test features with the TRAINING maxima. The original cell divided
# each test column by the test set's own maximum, so identical raw values were
# mapped to different inputs at training and prediction time. It also rebound
# the builtin `max`.
feature_cols = [f'feature_{i}' for i in range(50)]
# The in-memory training frame has already been scaled and converted to an
# array by this point, so the raw per-feature maxima are re-read from the
# training CSV.
train_feature_max = pd.read_csv(
    '../input/tabular-playground-series-may-2021/train.csv',
    usecols=feature_cols,
).max()
# Guard against dividing by zero for a feature whose training maximum is 0.
train_feature_max = train_feature_max.replace(0, 1)
# DataFrame / Series aligns on column labels: each column is divided by the
# training maximum of the same feature.
test[feature_cols] = test[feature_cols] / train_feature_max

test.head()

In [None]:
# Fill the submission's class columns with the model's predicted
# probabilities for the test set, then write the file.
class_columns = ['Class_1', 'Class_2', 'Class_3', 'Class_4']
sample_submission[class_columns] = model.predict(test)

sample_submission.to_csv('my_submission.csv', index = False)
# Preview the first 20 rows of the submission.
sample_submission.head(20)