In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Load the training data set

In [None]:
# Read the competition training set, using the `id` column as the row index.
TRAIN_CSV_PATH = '/kaggle/input/tabular-playground-series-nov-2021/train.csv'
train_df = pd.read_csv(TRAIN_CSV_PATH, index_col='id')

# Summary statistics for every column (rendered as the cell output).
train_df.describe()

In [None]:
# Separate the features from the label column.
X = train_df.drop(columns='target')
y = train_df['target']

### Normalize and split the training data

All of the columns are numeric, and none of the columns have missing values, so we can dive right into normalization. The features in the data set span many different ranges of values. Standardizing each feature (subtracting its mean and dividing by its standard deviation) puts them all on a similar scale, with a mean of 0 and a standard deviation of 1.

In [None]:
# Z-score every feature column: subtract the column mean, divide by the column
# standard deviation.
# NOTE: the statistics are computed over all rows of X, i.e. including the rows
# that are held out for validation in the next step.
normal_X = X.sub(X.mean()).div(X.std())

# Sanity check: every column should now show mean ~0 and std ~1.
normal_X.describe()

In [None]:
# Hold out 25% of the rows (scikit-learn's default, spelled out here) for
# validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    normal_X,
    y,
    test_size=0.25,
    random_state=42,
)

# Number of feature columns, in the list form passed to the first Dense layer.
input_shape = [normal_X.shape[1]]

### Create and compile the model

Here I created a simple neural network with two hidden layers. In early tests, I noticed that the model was overfitting after a very small number of epochs, so I added a small amount of dropout to each layer. Since I didn't know how many epochs the model would need to train, I added an early stopping callback as well.

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.losses import BinaryCrossentropy

# Width of each hidden layer.
layer_size = 64

# Two hidden swish layers, each followed by dropout to limit overfitting, and a
# sigmoid output unit for the binary target.
model = keras.Sequential([
    layers.Dense(layer_size, activation='swish', input_shape=input_shape),
    layers.Dropout(0.25),
    layers.Dense(layer_size, activation='swish'),
    layers.Dropout(0.25),
    layers.Dense(1, activation='sigmoid'),
])

# Binary cross-entropy with label smoothing.
bce = BinaryCrossentropy(label_smoothing=0.1)

model.compile(
    optimizer='rmsprop',
    loss=bce,
    # Name the AUC metric explicitly. With the bare string 'AUC', Keras
    # auto-numbers the metric when this cell is run again in the same kernel
    # ('auc_1', 'auc_2', ...), and the later history lookups of 'auc' /
    # 'val_auc' then fail with a KeyError.
    metrics=[keras.metrics.AUC(name='auc'), 'accuracy'],
)

# Stop once validation loss has not improved by at least `min_delta` for
# `patience` epochs, and roll back to the best weights seen.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',  # the Keras default, made explicit
    patience=10,
    min_delta=0.001,
    restore_best_weights=True,
)

### Fit the model, plot the loss, accuracy, and AUC

In [None]:
# Train with early stopping, validating on the held-out split after each epoch.
fit_options = dict(
    validation_data=(X_test, y_test),
    batch_size=512,
    epochs=500,
    callbacks=[early_stopping],
)
history = model.fit(X_train, y_train, **fit_options)

# One row per epoch, one column per tracked metric.
history_df = pd.DataFrame(history.history)

# Best value of each validation metric over all epochs (each taken independently).
best_val_loss = history_df['val_loss'].min()
best_val_accuracy = history_df['val_accuracy'].max()
best_val_auc = history_df['val_auc'].max()

print("Best Validation Loss: {:0.4f}".format(best_val_loss))
print("Best Validation Accuracy: {:0.4f}".format(best_val_accuracy))
print("Best Validation ROC AUC: {:0.4f}".format(best_val_auc))

In [None]:
# Training vs. validation curves for each tracked metric, one panel per metric.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(14, 4))

# (panel title, history column); the validation column is the same name
# prefixed with 'val_'.
panels = [
    ("Cross-entropy", 'loss'),
    ("Accuracy", 'accuracy'),
    ("AUC", 'auc'),
]

for ax, (title, metric) in zip(axes, panels):
    ax.plot(history_df.index, history_df[metric], label='train')
    ax.plot(history_df.index, history_df['val_' + metric], label='validation')
    ax.set_title(title)
    ax.set_xlabel("Epoch")
    # Without a legend the two curves cannot be told apart.
    ax.legend()

fig.tight_layout()

### Plot predictions vs. actual target values

In [None]:
# Score the held-out split and drop the size-1 axes from the model output.
y_pred = np.squeeze(model.predict(X_test))

In [None]:
# Pair each held-out prediction with its true label.
predictions_df = pd.DataFrame({'prediction': y_pred, 'target': y_test})

# Split the rows by true label so the two score distributions can be compared.
is_negative = predictions_df['target'] == 0
is_positive = predictions_df['target'] == 1
target_0 = predictions_df.loc[is_negative]
target_1 = predictions_df.loc[is_positive]

In [None]:
# Density of predicted scores, drawn separately for each true label.
plt.figure(figsize=(16,6))
# `fill=True` replaces the deprecated `shade=True` keyword of seaborn's kdeplot.
sns.kdeplot(data=target_0['prediction'], label="Target=0 predictions", fill=True)
sns.kdeplot(data=target_1['prediction'], label="Target=1 predictions", fill=True)
plt.title("Distribution of Predictions by target label")
plt.xlabel("Prediction")
plt.legend()
plt.show()

This shows that most of our predictions on the held-out validation split (`X_test`, not the competition test set) are in the right range, but there's still a sizable spike of incorrect predictions.

### Load and normalize the test set

In [None]:
# Read the competition test set, using the `id` column as the row index.
TEST_CSV_PATH = '/kaggle/input/tabular-playground-series-nov-2021/test.csv'
test_df = pd.read_csv(TEST_CSV_PATH, index_col='id')

# Scale with the mean and standard deviation of the training features X, not
# of the test set itself, to avoid leakage.
normal_test_df = test_df.sub(X.mean()).div(X.std())

### Retrain the model on all of the data for (hopefully) better generalization

The model started to overfit the validation data after about 25 epochs during training, so we'll stop at 25 when fitting the model to the complete set of training data.

In [None]:
# Retrain from scratch on the full training set.
#
# Calling `fit` on the existing `model` would continue from the weights already
# learned during the validation run, i.e. add 25 more epochs on top of that
# training rather than train a fresh model for 25 epochs. Cloning the model
# gives the same architecture with newly initialised weights; the clone must be
# compiled again before it can be fitted.
FINAL_EPOCHS = 25  # epoch budget chosen from the validation run (see the text above)

# Rebinding `model` keeps the prediction cell below working unchanged; the
# validated model is no longer needed from this point on.
model = keras.models.clone_model(model)
model.compile(
    optimizer='rmsprop',
    loss=bce,
    metrics=[keras.metrics.AUC(name='auc'), 'accuracy'],
)

# No validation data and no early stopping here: every labelled row is used
# for training.
model.fit(
    normal_X, y,
    batch_size=512,
    epochs=FINAL_EPOCHS,
)

In [None]:
# Score the normalized competition test set and drop the size-1 axes from the
# model output.
y_pred = np.squeeze(model.predict(normal_test_df))

### Create the submission file

In [None]:
# Two-column submission table: the test-set ids and the predicted score for each.
submission_columns = {
    'id': test_df.index,
    'target': y_pred,
}
submission_df = pd.DataFrame(submission_columns)

# Write to the working directory without the positional row index.
submission_df.to_csv('submission.csv', index=False)

### Plot the predictions to see their distribution

In [None]:
# Histogram (with a KDE overlay) of the scores predicted for the test set.
fig, ax = plt.subplots(figsize=(15,8))
sns.histplot(x=y_pred, kde=True, ax=ax)
ax.set_title("Predictions Distribution")
ax.set_xlabel("Prediction")
plt.show()

This is a good distribution for a first attempt at a binary classifier, so I'll use this as my first submission. With a few tweaks to the model, this could be a high-ranking model in the competition.