In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file below the Kaggle input directory
# and print each one as it is discovered.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading Data and Preprocessing

In [None]:
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
import gc

In [None]:
# Read the competition's training and test tables into DataFrames.
train_csv_path = '../input/tabular-playground-series-nov-2021/train.csv'
test_csv_path = '../input/tabular-playground-series-nov-2021/test.csv'
train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

In [None]:
# Pull the label column out of the training table and index both feature
# tables by the row identifier so that `id` is not treated as a feature.
y = train_data['target']
X = train_data.set_index('id').drop(columns='target')
X_test = test_data.set_index('id')

In [None]:
# X, y and X_test now hold everything needed; drop the raw frames and ask
# the garbage collector to reclaim their memory right away.
del train_data
del test_data
gc.collect()

In [None]:
# Standardise every feature column. The scaler's statistics are computed over
# the training and test rows together.
# NOTE(review): fitting on test features is a deliberate choice here; fit on X
# alone if the held-out data must stay completely unseen.
ss = StandardScaler()
ss.fit(pd.concat([X, X_test], axis=0))

# transform() returns a bare array, so rebuild each frame with its original
# id index and column names.
X, X_test = (
    pd.DataFrame(ss.transform(frame), index=frame.index, columns=frame.columns)
    for frame in (X, X_test)
)

In [None]:
# Preview the first five rows of the standardised training features.
X.head(n=5)

In [None]:
# Preview the first five rows of the standardised test features.
X_test.head(n=5)

For simplicity, instead of doing a standard cross-validation, we simply split the training data in half, using one half for training and the other half for validation/testing. Our models will be small, so there should be enough data for training without overfitting.

In [None]:
from sklearn.model_selection import train_test_split
# Keep half of the labelled rows aside for evaluation; stratifying on the
# target keeps the class balance the same in both halves.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.5,
    stratify=y,
    random_state=42,
)

In [None]:
import os
# Reduce TensorFlow's console noise. The environment variables are set here,
# ahead of the cell that imports tensorflow, presumably so that they are
# already in place when the library is loaded.
os.environ.update({
    "TF_CPP_MIN_LOG_LEVEL": "2",
    "KMP_SETTINGS": "0",
})

import logging

# Only errors pass the "tensorflow" logger, and the handler attached to it
# discards whatever it receives.
tf_logger = logging.getLogger("tensorflow")
tf_logger.setLevel(logging.ERROR)
tf_logger.addHandler(logging.NullHandler(logging.ERROR))

In [None]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, InputLayer
from tensorflow.keras.callbacks import EarlyStopping

import tensorflow_probability as tfp
# Short aliases for TensorFlow Probability's distributions and layers submodules.
tfd, tfpl = tfp.distributions, tfp.layers

# Logistic Regression Model

As a warm-up, we first implement logistic regression as a neural network with a single output layer of 1 unit with a sigmoidal activation and no regularization. 

In [None]:
def logistic_regression(input_shape, learning_rate=0.0001):
    """Build and compile logistic regression expressed as a one-layer Keras network.

    The network is a single sigmoid unit applied directly to the inputs. It is
    compiled with binary cross-entropy, the Adam optimizer, and two metrics:
    accuracy and AUC (registered under the name ``auc``).

    Parameters
    ----------
    input_shape : tuple
        Shape of a single input sample, e.g. ``(n_features,)``.
    learning_rate : float, optional
        Step size handed to Adam. Defaults to ``0.0001``.

    Returns
    -------
    The compiled model.

    Side effects
    ------------
    Clears the global Keras session before building and prints the model
    summary.
    """
    # Start from a clean Keras state before creating the new model.
    tf.keras.backend.clear_session()

    model = Sequential([
        InputLayer(input_shape=input_shape),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(
        loss='binary_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        metrics=['accuracy', tf.keras.metrics.AUC(name='auc')],
    )
    model.summary()
    return model

In [None]:
def train_model(model, X_train, y_train, batch_size=1024, epochs=5000,
                patience=20, validation_size=0.2, random_state=42):
    """Fit a compiled Keras model with early stopping and plot its learning curves.

    A stratified share of the supplied data is held out and used as the
    validation set that drives early stopping on ``val_loss``. When training
    stops, the weights of the best epoch are restored. Training and validation
    curves for loss, AUC and accuracy are then drawn side by side.

    Parameters
    ----------
    model
        Compiled Keras model. Its training history must contain the keys
        ``loss``, ``auc`` and ``accuracy`` together with their ``val_``
        counterparts, otherwise plotting raises ``KeyError``.
    X_train, y_train
        Features and labels; they are split internally into fitting and
        validation parts.
    batch_size : int, optional
        Mini-batch size passed to ``model.fit``.
    epochs : int, optional
        Upper bound on the number of epochs; early stopping normally ends
        training sooner.
    patience : int, optional
        Number of epochs without ``val_loss`` improvement tolerated before
        stopping.
    validation_size : float, optional
        Fraction of the supplied data held out for validation.
    random_state : int, optional
        Seed for the internal stratified split.

    Returns
    -------
    The same ``model`` object that was passed in, after fitting.
    """
    callbacks = [EarlyStopping(monitor='val_loss', mode='min', patience=patience,
                               restore_best_weights=True)]

    X_t, X_v, y_t, y_v = train_test_split(X_train, y_train,
                                          test_size=validation_size,
                                          random_state=random_state,
                                          stratify=y_train)

    history = model.fit(x=X_t, y=y_t,
                        batch_size=batch_size,
                        epochs=epochs,
                        validation_data=(X_v, y_v),
                        callbacks=callbacks,
                        verbose=1)

    # One panel per metric, with the training and validation curves overlaid.
    # Loss decreases, so its legend goes top-right; the others go bottom-right.
    panels = [('loss', 'upper right'),
              ('auc', 'lower right'),
              ('accuracy', 'lower right')]
    _, axes = plt.subplots(1, len(panels), figsize=(18, 6))
    for ax, (metric, legend_loc) in zip(axes, panels):
        ax.plot(history.history[metric])
        ax.plot(history.history[F'val_{metric}'])
        ax.set_title(F'{metric} vs epochs')
        ax.set_ylabel(metric)
        ax.set_xlabel('Epoch')
        ax.legend(['Training', 'Validation'], loc=legend_loc)
    plt.show()

    return model

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score

In [None]:
# Train the logistic baseline, then score it on the held-out half of the
# training data.
logistic_model = train_model(logistic_regression((X.shape[1],)), X_train, y_train)
y_pred = logistic_model.predict(X_valid)

# Predicted probabilities above 0.5 count as class 1 for the accuracy figure.
test_accuracy = accuracy_score(y_valid, y_pred > 0.5)
print(F'Test accuracy: {test_accuracy}')
test_auc = roc_auc_score(y_valid, y_pred)
print(F'Test AUC: {test_auc}')

# Histogram of the predicted probabilities; assigning to `_` hides the return value.
_ = plt.hist(y_pred, bins=100)
      

As can be seen, the logistic model outputs a unimodal distribution of predictions (probabilities). We would expect 2 modes around 0.25 and 0.75, but the logistic model, being a linear model, is not able to capture that. 

A common way to improve the fit is to add additional layers into the network, essentially increasing the number of model parameters. We know a deeper neural network is able to capture the bimodal distribution, but there is another direction to generalize the logistic model.

# Mixture of Bernoulli Distributions

The logistic model (and other multi-layer neural networks) outputs the probability of class "1". We can also interpret this output as the parameter (mean) of a Bernoulli distribution. Viewed this way, one way to generalize the model is to replace the Bernoulli distribution with another probability distribution over a binary random variable. One such distribution is a mixture of Bernoulli distributions.

Modeling the probability distribution as a mixture of 2 Bernoullis has the following nice interpretation. There is a common belief that approximately 25% of the original target values of this dataset have been "flipped". So the categorical distribution of the mixture (which is another Bernoulli distribution) models the probability $p(1 \mbox{ before flip})$. The 2 Bernoulli distributions model respectively $p(1|1\mbox{ before flip})$ and $p(1|0\mbox{ before flip})$.

To this end, we want to build a neural network that outputs a probability distribution, not a tensor. The TensorFlow Probability library makes this easy. We simply replace the last dense layer (with a single unit) by a dense layer with the appropriate number of units *and no activation*, whose output is fed into a final, non-trainable `MixtureSameFamily` layer that encapsulates the mixture-of-Bernoullis probability model.

In [None]:
def mixture_of_bernoullis(input_shape, comps=2, learning_rate=0.0001):
    """Build and compile a network whose output is a mixture of Bernoulli distributions.

    A linear dense layer (no activation) produces the parameters consumed by a
    final ``MixtureSameFamily`` layer over ``IndependentBernoulli`` components
    with a single-element event. The model is trained by minimising the
    negative log-likelihood of the labels under the predicted distribution.
    Accuracy and AUC (registered as ``auc``) are tracked as metrics.

    The output layer is configured with ``convert_to_tensor_fn`` set to the
    distribution mean, so wherever the output has to be a tensor the mixture
    mean is used.

    Parameters
    ----------
    input_shape : tuple
        Shape of a single input sample, e.g. ``(n_features,)``.
    comps : int, optional
        Number of Bernoulli components in the mixture. Defaults to ``2``.
    learning_rate : float, optional
        Step size handed to Adam. Defaults to ``0.0001``.

    Returns
    -------
    The compiled model.

    Side effects
    ------------
    Clears the global Keras session before building and prints the model
    summary.
    """
    tf.keras.backend.clear_session()

    # Size of the parameter vector the dense layer must emit for the mixture layer.
    event_shape = [1]
    component_params_size = tfpl.IndependentBernoulli.params_size(event_shape)
    params_size = tfpl.MixtureSameFamily.params_size(
        comps, component_params_size=component_params_size)

    def negative_log_likelihood(y_true, y_pred):
        # y_pred is the distribution emitted by the final layer, so the loss is
        # minus the log-probability it assigns to the observed labels. This
        # takes the place of binary cross-entropy.
        return -y_pred.log_prob(y_true)

    model = Sequential()
    model.add(InputLayer(input_shape=input_shape))
    # A probabilistic layer replaces the usual sigmoid output; the dense layer
    # in front of it carries no activation.
    model.add(Dense(params_size, activation=None))
    model.add(tfpl.MixtureSameFamily(comps, tfpl.IndependentBernoulli(event_shape),
                                     convert_to_tensor_fn=tfp.distributions.Distribution.mean))

    model.compile(loss=negative_log_likelihood,
                  optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])
    model.summary()
    return model

In [None]:
# Train the mixture-of-Bernoullis network and score it on the held-out half
# of the training data. `model` is kept because the submission step reuses it.
n_features = X.shape[1]
model = train_model(mixture_of_bernoullis((n_features,)), X_train, y_train)
y_pred = model.predict(X_valid)

# Predicted probabilities above 0.5 count as class 1 for the accuracy figure.
test_accuracy = accuracy_score(y_valid, y_pred > 0.5)
print(F'Test accuracy: {test_accuracy}')
test_auc = roc_auc_score(y_valid, y_pred)
print(F'Test AUC: {test_auc}')

# Histogram of the predicted probabilities; assigning to `_` hides the return value.
_ = plt.hist(y_pred, bins=100)

More than a thousand epochs later, we get the fitted model, and the distribution of predicted probabilities is bimodal as expected.

Let's try our luck and submit the predictions for the test data using this model.

In [None]:
# Score the competition test rows and flatten the prediction array
# (presumably shaped (n_rows, 1) — confirm) into a 1-D vector.
test_probabilities = model.predict(X_test)
predictions_test = test_probabilities.flatten()

In [None]:
# Pair each test id with its predicted probability and write the submission
# file without the DataFrame's own index.
submission = pd.DataFrame({'id': X_test.index, 'target': predictions_test})
submission.to_csv('submission.csv', index=False)
print("Submission saved!")