## Simple Keras Pipeline

- EDA : https://www.kaggle.com/subinium/tps-may-categorical-eda

In [None]:
import tensorflow.keras as keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras import backend as K
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, learning_curve, cross_val_score
from sklearn.metrics import confusion_matrix, log_loss, make_scorer, accuracy_score

from xgboost import XGBClassifier
import xgboost as xgb

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the competition tables. The `id` column is removed from both the
# training and the test frame as soon as they are loaded.
train = pd.read_csv('../input/tabular-playground-series-may-2021/train.csv').drop(columns='id')
test = pd.read_csv('../input/tabular-playground-series-may-2021/test.csv').drop(columns='id')

# Template for the submission file; its class columns are filled in at the end.
sample_submission = pd.read_csv('../input/tabular-playground-series-may-2021/sample_submission.csv')

In [None]:
# Print a summary of the training frame (columns, dtypes, non-null counts).
train.info()

## Normalization

The features need to be normalized before they are fed into the DNN.

In [None]:
# No scaling is applied in this cell: the three experiments below are kept
# disabled, and standardisation is performed later in the notebook, after the
# feature-engineering steps.

# Disabled experiment: manual per-column z-score.
# # for i in range(50):
# #     mean, std = train[f'feature_{i}'].mean(), train[f'feature_{i}'].std()
# #     train[f'feature_{i}'] = train[f'feature_{i}'].apply(lambda x : (x-mean)/std)
# #     test[f'feature_{i}'] = test[f'feature_{i}'].apply(lambda x : (x-mean)/std)

# Column names of `test`, excluding "id" and "target" should they be present.
features = [col for col in test.columns if col not in ("id", "target")]

# Disabled experiment: StandardScaler fitted on the raw features.
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler().fit(train[features])
# train[features] = scaler.transform(train[features])
# test[features] = scaler.transform(test[features])

# Disabled experiment: min-max scaling.
# NOTE(review): if this is ever re-enabled, `test` should go through
# `transform`, not `fit_transform`, so it is scaled with the training range.
# # from sklearn import preprocessing
# # min_max_scaler = preprocessing.MinMaxScaler()
# # train[features] = min_max_scaler.fit_transform(train[features])
# # test[features] = min_max_scaler.fit_transform(test[features])

In [None]:
# Map every distinct class label to a consecutive integer code, following the
# sorted order of the label values.
label_dict = {label: code for code, label in enumerate(sorted(train['target'].unique()))}

# `pop` removes the column from `train` and hands it back, so the encoded
# target is separated from the feature frame in a single step.
target = train.pop('target').map(label_dict)

# Conversion to NumPy arrays and one-hot vectors is done per fold in the
# training loop, not here.

## Feature Selection

https://www.kaggle.com/nishantdhingra/cb-lgbm-xgb-feature-importance-and-interactions

In [None]:
# Columns left out of the model input (presumably chosen from the
# feature-importance notebook linked above -- confirm).
drop_features = ['feature_3']

# `drop` returns new frames, so `train` and `test` themselves are not modified.
train_new = train.drop(columns=drop_features)
test_new = test.drop(columns=drop_features)

In [None]:
def gen_features(df, features):
    """Add a pairwise-sum column for every unordered pair in ``features``.

    For each pair ``(a, b)`` where ``a`` comes before ``b`` in ``features``,
    a column named ``"a+b"`` containing ``df[a] + df[b]`` is written to ``df``.

    Args:
        df: DataFrame to extend. It is modified in place.
        features: Sequence of column names; each entry is converted with
            ``str`` before it is used as a column key.

    Returns:
        The same ``df`` object, now carrying the additional columns.
    """
    names = [str(name) for name in features]
    for position, left in enumerate(names):
        # Only pair `left` with the names that follow it, so each unordered
        # pair is produced exactly once.
        for right in names[position + 1:]:
            df[left + '+' + right] = df[left] + df[right]

    return df

In [None]:
# Rebind `features` to the columns whose pairwise sum is added as a new
# column ('feature_2+feature_13').
features = ['feature_2','feature_13']
# gen_features writes the new column into the frame it receives and returns
# that same frame; train and test get the identical treatment.
train_new = gen_features(train_new, features)
test_new = gen_features(test_new, features)

In [None]:
from sklearn.preprocessing import StandardScaler

# Every column of the engineered training frame is used as a model input.
new_features = train_new.columns

# The scaler's statistics are learned from the training frame only and then
# reused unchanged for the test frame.
scaler = StandardScaler()
train_new[new_features] = scaler.fit_transform(train_new[new_features])
test_new[new_features] = scaler.transform(test_new[new_features])

## Model (Keras)

### Initialization

In [None]:
# Width of the network input: one unit per column of the engineered frame.
num_features = len(new_features)
# Width of the softmax output layer (the submission has four class columns).
num_classes = 4

The model is a plain MLP built only from Dense, BatchNormalization, and Dropout layers; its structure can be changed freely.

In [None]:
def CreateModel():
    """Build and compile the MLP classifier that is trained on each CV fold.

    Architecture: Input -> BatchNormalization -> Dropout, followed by three
    blocks of Dense(150) -> BatchNormalization -> ReLU -> Dropout(0.2), and a
    final Dense(num_classes) layer with a softmax activation.

    Reads the module-level ``num_features`` (input width) and ``num_classes``
    (output width).

    Returns:
        A compiled ``tf.keras`` model. It is trained with categorical
        cross-entropy, which expects one-hot encoded targets and is the same
        quantity (multi-class log loss) that the cross-validation loop
        reports.
    """
    hidden_units = [150, 150, 150]
    # The first rate is applied to the normalised input; the remaining rates
    # are applied after each hidden block, in order.
    dropout_rates = [0.2, 0.2, 0.2, 0.2]

    inp = tf.keras.layers.Input(shape=(num_features,))
    x = tf.keras.layers.BatchNormalization()(inp)
    x = tf.keras.layers.Dropout(dropout_rates[0])(x)
    for units, rate in zip(hidden_units, dropout_rates[1:]):
        x = tf.keras.layers.Dense(units)(x)
        x = tf.keras.layers.BatchNormalization()(x)
        # Swish (tf.keras.activations.swish) is a drop-in alternative here.
        x = tf.keras.layers.Activation("relu")(x)
        x = tf.keras.layers.Dropout(rate)(x)

    x = tf.keras.layers.Dense(num_classes)(x)
    out = tf.keras.layers.Activation("softmax")(x)

    model = tf.keras.models.Model(inputs=inp, outputs=out)

    # Mean squared logarithmic error is a regression loss and does not match
    # a softmax output scored with log loss; categorical cross-entropy
    # optimises the reported metric directly.
    model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])
    return model

### Fit

To keep execution fast, I train for only a small number of epochs; the batch size and the number of epochs can be adjusted.

With the GPU, you can run the model much faster.

In [None]:
# Number of cross-validation folds; also the divisor used when the per-fold
# test predictions are averaged.
N_FOLDS = 5
# Stratified splitter with shuffling; the fixed random_state keeps the fold
# assignment the same from run to run.
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

In [None]:
# oof:  out-of-fold class probabilities, one row per training sample.
# pred: test-set class probabilities, averaged over the folds.
oof = np.zeros((train_new.shape[0], num_classes))
pred = np.zeros((test_new.shape[0], num_classes))

for fold, (tr_idx, ts_idx) in enumerate(skf.split(train_new, target)):
    print(f"===== FOLD {fold} =====")
    X_train = train_new.iloc[tr_idx].values
    X_val = train_new.iloc[ts_idx].values

    # One-hot encode with an explicit width so every fold produces exactly
    # num_classes columns, whatever labels happen to fall into it.
    y_train = to_categorical(target.iloc[tr_idx].values, num_classes=num_classes)
    y_val = to_categorical(target.iloc[ts_idx].values, num_classes=num_classes)

    # A fresh model per fold, so no weights leak between folds.
    model = CreateModel()
    model.fit(X_train, y_train,
              batch_size=100, epochs=20, verbose=2,
              validation_data=(X_val, y_val))

    oof[ts_idx] = model.predict(X_val)
    # Predict on the engineered and scaled test frame. The raw `test` frame
    # has the same number of columns but different, unscaled features, so
    # passing it would run without error and give wrong predictions.
    pred += model.predict(test_new.values) / N_FOLDS

    score = log_loss(y_val, oof[ts_idx])
    print(f"FOLD {fold} Score {score}\n")

# Overall log loss of the out-of-fold predictions across all training rows.
score = log_loss(target, oof)
print(f"Score total {score}\n")

## Output

In [None]:
# Write the fold-averaged test probabilities into the four class columns.
sample_submission[['Class_1','Class_2', 'Class_3', 'Class_4']] = pred

In [None]:
# Save the submission without the index column.
sample_submission.to_csv('submission.csv', index=False)