# Importing Libraries and Loading datasets

In [None]:
import numpy as np
import pandas as pd

# Plot
import seaborn as sns
import matplotlib.pyplot as plt

# Feature Engineering
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.feature_selection import mutual_info_classif

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

# Neural Network
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks

# Cross-Validation
from sklearn.model_selection import StratifiedKFold

In [None]:
# Read the competition's train and test tables from the Kaggle input directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-nov-2021/train.csv',
        '../input/tabular-playground-series-nov-2021/test.csv',
    )
)

# Explore Data

In [None]:
# Preview the first rows of the training table.
train_data.head()

In [None]:
# Summary statistics for the columns of the training table.
train_data.describe()

# Feature Engineering


In [None]:
# It takes time to handle all of the data, so a smaller leading portion can be
# used while debugging/testing. Set DEBUG_ROWS to e.g. 10000 for a quick run;
# None keeps every row.
# NOTE: with a subset, test_X has fewer rows than test_data, so predictions
# built from test_X no longer line up with test_data.id for a submission.
DEBUG_ROWS = None

# Train features: drop the first column (ids) and the last column (target).
X = train_data.iloc[:DEBUG_ROWS, 1:-1].copy()
# Target, restricted to the same rows as X.
y = train_data.target.iloc[:DEBUG_ROWS].copy()
# Test features: drop the first column (ids).
test_X = test_data.iloc[:DEBUG_ROWS, 1:].copy()

## Mutual Information

For now, I am using mutual information to select some features.  
The reason is simple: I am working through the tutorial at https://www.kaggle.com/ryanholbrook/mutual-information :) and looking for a way to apply what I have learned.

In [None]:
def make_mi_scores(mi_scores, X, y):
    """Wrap raw mutual-information values in a Series sorted from high to low.

    The Series is named "MI Scores". No index is supplied when it is built, so
    for array-like input each label is the position of the score in
    `mi_scores`. `X` and `y` are accepted but not used.
    """
    return pd.Series(mi_scores, name="MI Scores").sort_values(ascending=False)

def plot_mi_scores(scores):
    """Draw `scores` as a horizontal bar chart using pyplot.

    Bars are ordered from the smallest score to the largest, and each bar is
    labelled on the y axis with its index label from `scores`.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")

In [None]:
# mutual_info_classif adds a small amount of random noise to continuous
# features, so its result varies between runs unless random_state is fixed.
# Fixing it keeps the feature selection below reproducible.
mi_scores = mutual_info_classif(X, y, random_state=0)
mi_scores_classif = make_mi_scores(mi_scores, X, y)

In [None]:
# Plot only the features whose mutual-information score exceeds 1e-4.
informative_scores = mi_scores_classif.loc[mi_scores_classif > 1e-4]
plt.figure(figsize=(20, 16), dpi=100)
plot_mi_scores(informative_scores)

# Selected Features

In [None]:
# mi_scores_classif is labelled by feature position (it is built from the raw
# score array without an index), so map the positions back to the real column
# names instead of assuming every column is literally called f<position>.
selected_positions = mi_scores_classif[mi_scores_classif > 1e-4].index.tolist()
selected_features = X.columns[selected_positions].tolist()
print(f"Selected Features: {selected_features}")

The selected columns are split into two groups according to the skewness of each column's distribution.  
This idea is taken from https://www.kaggle.com/javiervallejos/simple-nn-with-good-results-tps-nov-21 and https://www.kaggle.com/adityasharma01/simple-nn-tps-nov-21  
The difference is that those notebooks use all columns to create the new summary columns, while this notebook creates them from the selected features only.

In [None]:
# Split the selected features into two groups by column skewness.
# The number 2 is just a threshold to split.
data = X[selected_features].copy()
# Compute the per-column skewness once and reuse it for both groups.
column_skew = data.skew()
h_skew = column_skew[column_skew >= 2].index  # skewness at or above the threshold
l_skew = column_skew[column_skew < 2].index   # skewness below the threshold

In [None]:
# Row-wise summary features, applied identically to the train and test frames.
# Each entry is (new column name, source columns, DataFrame reduction method).
# The order of the entries is the order in which the columns are appended.
row_aggregates = [
    # Summaries over the high-skew group
    ('median_h', h_skew, 'median'),
    ('var_h', h_skew, 'var'),
    # Summaries over the low-skew group
    ('mean_l', l_skew, 'mean'),
    ('std_l', l_skew, 'std'),
    ('median_l', l_skew, 'median'),
    ('skew_l', l_skew, 'skew'),
    ('max_l', l_skew, 'max'),
    ('var_l', l_skew, 'var'),
]

for frame in (X, test_X):
    for new_column, source_columns, stat in row_aggregates:
        # axis=1 reduces across the source columns, giving one value per row.
        frame[new_column] = getattr(frame[source_columns], stat)(axis=1)

# Pre-processing

In [None]:
# Scaling and normalization: standardize, then squash into the [0, 1] range.
transformer_high_skew = make_pipeline(
    StandardScaler(),
    MinMaxScaler(feature_range=(0, 1))
)

transformer_low_skew = make_pipeline(
    StandardScaler(),
    MinMaxScaler(feature_range=(0, 1))
)

# Re-split on ALL columns of X (the earlier split only covered the selected
# features). The skewness is computed once and reused for both groups.
feature_skew = X.skew()
is_high_skew = feature_skew >= 2
h_skew = feature_skew[is_high_skew].index
# Use the complement rather than `< 2`: a column whose skewness is NaN then
# lands in the low-skew group instead of matching neither group and being
# dropped by the column transformer, which would change the feature count
# the network's input layer was sized for.
l_skew = feature_skew[~is_high_skew].index

# Pair each pipeline with the column group it is named after. The low-skew
# group stays first so the order of the output columns is unchanged.
preprocessor = make_column_transformer(
    (transformer_low_skew, l_skew),
    (transformer_high_skew, h_skew)
)

# Modelling


I will use my setup from https://www.kaggle.com/sfktrkl/tps-nov-2021-nn?scriptVersionId=80095054

In [None]:
# Seed the TensorFlow and NumPy global random generators with the same value
my_seed = 1
tf.random.set_seed(my_seed)
np.random.seed(my_seed)

## Callbacks

In [None]:
# https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/EarlyStopping
early_stopping = callbacks.EarlyStopping(
    monitor='val_loss',         # Keras default, spelled out for clarity
    patience=5,                 # How many epochs to wait before stopping
    min_delta=0.001,            # Minimum amount of change to count as an improvement
    restore_best_weights=True,  # Restore the weights of the best epoch
)

In [None]:
# https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/ReduceLROnPlateau
# The model is compiled with optimizer='adam', whose default learning rate is
# 0.001. A lower bound of 0.001 would therefore never let this callback reduce
# anything, so the bound is set well below the starting rate. The patience is
# shorter than early stopping's (5) so the rate can drop before training stops.
# NOTE: this callback only takes effect if it is included in the callbacks
# list passed to model.fit.
reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,                # Factor by which the learning rate will be reduced
    patience=2,                # Number of epochs with no improvement
    min_lr=1e-5)               # Lower bound on the learning rate

## Model

In [None]:
# Training configuration
N_SPLITS = 15                  # Number of cross-validation folds
BATCH_SIZE = 512               # Samples per batch
EPOCHS = 100                   # Upper bound on epochs per fit
CALLBACKS = [early_stopping]   # Callbacks handed to model.fit

In [None]:
# Fully connected network: three swish hidden layers (100, 64 and 32 units),
# each followed by dropout with rate 0.3, and a single sigmoid output unit.
# The input width is taken from the number of columns X has when this cell runs.
model = keras.Sequential([
    layers.Dense(100, activation='swish', input_shape=[X.shape[1]]),
    layers.Dropout(0.3),
    layers.Dense(64, activation='swish'),
    layers.Dropout(0.3),
    layers.Dense(32, activation='swish'),
    layers.Dropout(0.3),
    # Sigmoid output: a single value in (0, 1) for binary classification
    layers.Dense(1, activation='sigmoid')])

In [None]:
# Binary cross-entropy loss, Adam optimizer, AUC tracked as the metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['AUC'],
)

## Training

In [None]:
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer

In [None]:
# Stratified K-fold cross-validation.
# Each fold trains its own freshly initialised network; the test prediction is
# the average of the per-fold predictions.
test_predictions = np.zeros(test_X.shape[0])
skf = StratifiedKFold(n_splits=N_SPLITS, random_state=48, shuffle=True)
scores = {}        # fold number -> Keras training history (per-epoch metrics)
overall_auc = []   # validation AUC of the model actually kept for each fold
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    # split() yields positional indices, so select rows with iloc (loc only
    # works by accident when the index is a default 0..n-1 range).
    train_X, val_X = X.iloc[train_idx], X.iloc[val_idx]
    train_y, val_y = y.iloc[train_idx], y.iloc[val_idx]

    # Preprocessing: fit the scalers on the training part of the fold only,
    # then apply them to the validation part and to the test set.
    train_X = preprocessor.fit_transform(train_X)
    val_X = preprocessor.transform(val_X)
    test = preprocessor.transform(test_X)

    # Model: clone the architecture so every fold starts from new weights.
    # Re-fitting one shared model would carry weights across folds, and those
    # weights would already have been trained on rows that a later fold uses
    # for validation (leakage, inflated validation scores).
    fold_model = keras.models.clone_model(model)
    fold_model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        # Explicit name keeps the history keys at 'auc' / 'val_auc' for every
        # fold instead of auto-numbered variants.
        metrics=[keras.metrics.AUC(name='auc')])

    history = fold_model.fit(
        train_X, train_y,
        validation_data=(val_X, val_y),
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        callbacks=CALLBACKS,        # Put your callbacks in a list
        verbose=0)                  # Turn off training log

    scores[fold] = history.history

    # Score the weights that are actually used for the test prediction,
    # rather than the best AUC seen in any epoch.
    val_predictions = fold_model.predict(val_X, batch_size=BATCH_SIZE, verbose=0).ravel()
    fold_auc = roc_auc_score(val_y, val_predictions)
    overall_auc.append(fold_auc)
    print(f"Fold {fold + 1} \t\t AUC: {fold_auc}")

    # Add this fold's share of the averaged test prediction
    test_predictions += fold_model.predict(test, batch_size=BATCH_SIZE, verbose=0).ravel() / skf.n_splits

print('Overall Mean AUC: ', np.mean(overall_auc))

# Evaluation

In [None]:
# Credits to https://www.kaggle.com/mlanhenke/tps-11-nn-baseline-keras?scriptVersionId=79830528
# One panel per fold showing the training and validation loss per epoch.
# The grid is derived from the number of folds (up to 5 panels per row)
# instead of being fixed at 3x5, so it still fits if the fold count changes.
n_folds = skf.n_splits
n_cols = min(5, n_folds)
n_rows = int(np.ceil(n_folds / n_cols))
fig, ax = plt.subplots(n_rows, n_cols, figsize=(4 * n_cols, 5 * n_rows), squeeze=False)
ax = ax.flatten()

for fold in range(n_folds):
    df_eval = pd.DataFrame({'train_loss': scores[fold]['loss'], 'valid_loss': scores[fold]['val_loss']})

    # Best (lowest) loss reached on each curve, and the gap between them
    min_train = np.round(np.min(df_eval['train_loss']),5)
    min_valid = np.round(np.min(df_eval['valid_loss']),5)
    delta = np.round(min_valid - min_train,5)

    for column in ('train_loss', 'valid_loss'):
        sns.lineplot(
            x=df_eval.index,
            y=df_eval[column],
            label=column,
            ax=ax[fold]
        )

    ax[fold].set_ylabel('')
    ax[fold].set_xlabel(f"Fold {fold+1}\nmin_train: {min_train}\nmin_valid: {min_valid}\ndelta: {delta}", fontstyle='italic')

# Hide panels left over when the fold count does not fill the grid
for unused_ax in ax[n_folds:]:
    unused_ax.set_visible(False)

sns.despine()

# Submission

In [None]:
# Save the averaged predictions in the format used for competition scoring:
# one row per test id with its predicted target value.
submission_columns = {'id': test_data['id'], 'target': test_predictions}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)
output