# Goal

This notebook provides a baseline for solving the present problem using a DNN. There is a lot of room to grow, so play with it and have fun ^_^

If you like this, please **upvote** to give me some motivation to continue.


### Versions

version 1: consider 5 folds, run just the first fold using optimizer RMSprop, 200 epochs

version 2: add some comments, reduce the learning_rate to have a smoother learning curve

version 3: increase to 1000 epochs, hoping that we can reach a stable CV of 0.99x

version 4: consider updated dataset with removed duplicates to rebalance the folds

version 7: CV 5 folds

version 9: fix the duplicated-data issue

In [None]:
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import to_categorical

from sklearn.preprocessing import LabelEncoder

import warnings
warnings.simplefilter('ignore')

In [None]:
# Reproducibility: seed NumPy, Python's `random` module and TensorFlow
# with the same value, in that order.
SEED = 2022
for seed_fn in (np.random.seed, random.seed, tf.random.set_seed):
    seed_fn(SEED)

# Data

In [None]:
# Load the light (pickled) version of the dataset to save memory and time
# see https://www.kaggle.com/sytuannguyen/avoid-oom-issues-with-pickle
# NOTE(review): unpickling can execute arbitrary code - only load pickles from a trusted source.
train_path = '../input/tpsfeb2022-ds-to-pickle-with-folds/train.pkl'
train = pd.read_pickle(train_path)
print(train.shape)
train.head(3)

In [None]:
# Feature columns: every column whose name contains an upper-case 'A'.
# NOTE(review): presumably this matches only the measurement columns and never
# 'target' / '5_folds' - confirm against train.columns.
features = [col for col in train.columns if 'A' in col]

# Encode the labels.
# The untouched labels are stored in 'target_raw' the first time this cell runs and
# the encoder is always fitted on that copy. Re-running the cell therefore cannot
# re-fit the encoder on already encoded integers, which would make
# lb.inverse_transform() return integer codes instead of the original labels.
if 'target_raw' not in train.columns:
    train['target_raw'] = train['target']

lb = LabelEncoder()
train['target'] = lb.fit_transform(train['target_raw'])

In [None]:
# Load the pickled test set and preview its first rows.
test_path = '../input/tpsfeb2022-ds-to-pickle-with-folds/test.pkl'
X_test = pd.read_pickle(test_path)
X_test.head(3)

# Model

In [None]:
# Baseline ANN model: two fully connected ReLU layers followed by a softmax output.
# The input width and the number of output units are derived from the data
# (selected feature columns / classes seen by the label encoder) instead of being
# hard-coded, so the network stays consistent if either of them changes.
n_features = len(features)
n_classes = len(lb.classes_)

model = keras.models.Sequential([
    keras.layers.Dense(512, activation='relu', input_shape=(n_features,)),
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dense(n_classes, activation='softmax'),  # softmax is appropriate for multi-class classification
])

In [None]:
#keras.utils.plot_model(model, show_shapes=True)

In [None]:
# Optimizer, loss and metric used to train the network.
LR = 0.0005
rmsprop = keras.optimizers.RMSprop(learning_rate=LR)  # try also other optimizers such as Adam, SGD, etc.
model.compile(
    optimizer=rmsprop,
    loss='categorical_crossentropy',  # this loss function is appropriate for multi-class classification
    metrics=['accuracy'],
)

In [None]:
# Training configuration
EPOCHS = 1000      # upper bound on the number of epochs; early stopping may end training sooner
BATCH_SIZE = 1024

# Stop once the validation loss has not improved for 5 consecutive epochs and roll the
# model back to the weights of its best epoch. Monitoring the training loss instead
# would keep training while the network overfits, because that loss usually keeps
# decreasing, and the weights left in memory would be those of the last epoch.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [None]:
# 5-fold cross-validation: for every fold, train a freshly initialised network on the
# other folds, validate on the held-out fold and store its test-set predictions.
preds = []
n_outputs = model.output_shape[-1]  # width of the softmax layer = number of classes

for FOLD in [0.0, 1.0, 2.0, 3.0, 4.0]:
    print(f'Fold: {FOLD}')
    val_mask = train['5_folds'] == FOLD  # try also 10 folds, 20 folds
    X_train = train.loc[~val_mask, features]
    X_val = train.loc[val_mask, features]

    # Transform labels to categories. num_classes pins the one-hot width so that a fold
    # which happens to miss the highest class still matches the output layer.
    y_train = to_categorical(train.loc[~val_mask, 'target'], num_classes=n_outputs)
    y_val = to_categorical(train.loc[val_mask, 'target'], num_classes=n_outputs)

    # Re-initialised copy of the architecture for this fold. Re-using one model object
    # for all folds would keep training the same weights, so later folds would be
    # validated on rows the network has already been trained on (inflated CV scores).
    fold_model = keras.models.clone_model(model)
    fold_optimizer = type(model.optimizer).from_config(model.optimizer.get_config())
    fold_model.compile(optimizer=fold_optimizer,
                       loss='categorical_crossentropy',  # matches the one-hot labels built above
                       metrics=['accuracy'])             # provides the history keys plotted below

    checkpoint = keras.callbacks.ModelCheckpoint(f"ann_{FOLD}", save_best_only=True)  # save best model
    history = fold_model.fit(X_train, y_train,
                             validation_data=(X_val, y_val),
                             epochs=EPOCHS,
                             batch_size=BATCH_SIZE,
                             callbacks=[checkpoint, early_stop],
                             verbose=0)

    print('Max accuracy:', np.max(history.history['accuracy']))
    print('Max val accuracy:', np.max(history.history['val_accuracy']))

    # Predict with the same columns, in the same order, that the network was trained on.
    # NOTE(review): assumes X_test contains every column listed in `features` - confirm.
    # NOTE: predictions use the weights fit() leaves in memory; the checkpoint written
    # to disk is not reloaded here.
    preds.append(fold_model.predict(X_test[features]))

    # show the evolution of loss and metrics during the training process
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(history.history['accuracy'], 'k', label='train')
    ax.plot(history.history['val_accuracy'], 'b', label='val')
    ax.set_title(f'Fold {FOLD}', fontsize=16)
    ax.set_ylabel('Accuracy', fontsize=16)
    ax.set_xlabel('Epochs', fontsize=16)
    ax.legend(fontsize=16)
    plt.show()
    plt.close(fig)  # release the figure so open figures do not pile up across folds


# Prediction

In [None]:
# Average the per-fold class probabilities (fold axis first) into one matrix.
y_test = np.asarray(preds).mean(axis=0)

In [None]:
# Turn the fold-averaged class probabilities into the original class labels.
# The probabilities are recomputed from `preds` instead of being read back from
# `y_test`: this cell overwrites `y_test` with 1-D labels, so reading it again on a
# re-run would make argmax(axis=1) fail.
y_test = lb.inverse_transform(np.argmax(np.mean(preds, axis=0), axis=1))

# Submission

In [None]:
# Load the sample submission and fill in the predicted class labels.
sub = pd.read_pickle('../input/tpsfeb2022-ds-to-pickle-with-folds/sub.pkl')
# Item assignment always writes the DataFrame column; attribute-style assignment
# would only set a plain Python attribute if no 'target' column existed.
sub['target'] = y_test

In [None]:
# Show the distribution of the predicted classes.
# Hint: you may want to apply some post-processing technique to rebalance the classes
# so that they are consistent with the train set.
class_counts = sub.target.value_counts().sort_index()
target_df = class_counts.rename_axis('target').reset_index(name='count')

fig = px.bar(
    data_frame=target_df,
    x='target',
    y='count',
    color="count",
    color_continuous_scale="Emrld",
)
fig.show()

In [None]:
# Write the submission file (without the index column) and preview its first rows.
submission_path = 'submission.csv'
sub.to_csv(submission_path, index=False)
sub.head()