In [None]:
"""
Imports
"""

import numpy as np
import pandas as pd
from random import random, randint, sample
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from copy import deepcopy
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

import matplotlib.pyplot as plt

# Use the first CUDA GPU when torch reports one, otherwise run on the CPU.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)
# NOTE(review): this only binds a Python variable named CUDA_LAUNCH_BLOCKING;
# it does not touch the process environment - confirm whether an environment
# variable was intended.
CUDA_LAUNCH_BLOCKING = "1"

# Benford's law and Neural Networks
This notebook is an attempt to make use of the results of [this paper](https://arxiv.org/pdf/2102.03313.pdf). 
One of the conclusions of the said paper is that the \\( MLH \\) is a good indicator of whether the network is overfitting or not.

# Benford's law
> The leftmost non-zero digit’s occurrence in the observations
of a population is not uniformly distributed for many datasets.
Instead, it is log-uniform, with 1 occurring the maximum
number of times, followed by 2, 3, ... 9. According to Benford’s Law, the probability for a sample having a significant digit d is given as follows:

$$ P(d) = \log_{10}\left(1 + \frac{1}{d}\right), \quad d = 1, 2, 3, \ldots, 9$$

In [None]:
# Digits 1..9 and the matching Benford probabilities P(d) = log10(1 + 1/d).
d = torch.arange(1, 10, device=device)
benford_b10 = torch.log10(1 + 1 / d)
# Draw the reference distribution (digits on x, probabilities on y).
plot = plt.plot(d.cpu().numpy(), benford_b10.cpu().numpy())
plt.title("Benford's law for base 10")

# MLH metric
The \\( MLH \\) metric is defined in the paper:
> We devise a new metric, \\( MLH \\), that provides the measure
of correlation between Benford’s Law and the distribution of
significant digits of model weights.
This score indicates a strong correlation between the
weights and BL and proves to be an essential metric for a
wide range of neural networks, and is described in the following sections.
The proposed score, \\( MLH \\) based on the Pearson’s Correlation Coefficient is defined as follows:

$$MLH(\theta) = PearsonR(BinCount(\theta), \beta)$$

> \\( BinCount(\theta) \\) is the distribution of Significant Digits of network parameter set \\( \theta \\). \\( \beta \\) is the distribution defined by BL :


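$$BinCount(\theta) = \frac{[f_1, f_2, ..., f_9]}{D_{\theta}}$$

where \\( f_d \\) is the number of parameters whose first significant digit is \\( d \\) and \\( D_{\theta} \\) is the total number of parameters.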

## Pearson's correlation coefficient

Pearson's correlation coefficient is basically the covariance normalized between -1 and 1.
Its formula is:

$$
PearsonR(X, Y) = \frac{cov(X, Y)}{\sigma_X\sigma_Y} \\
$$

# KLDB Metric

As the MLH measures a kind of correlation, I would like to test another metric that measures a distance between two distributions. I'll refer to the metric defined below as \\( KLDB \\):

$$
    KLDB(\theta) = KLDivergence(BinCount(\theta), \beta)
$$

## KLDivergence

The KLDivergence is defined on Wikipedia as follows:

> In mathematical statistics, the Kullback–Leibler divergence, \\( D_{\text{KL}} \\) (also called relative entropy), is a measure of how one probability distribution is different from a second, reference probability distribution.

Its formula is:

$$
     D_{\text{KL}}(P\parallel Q)=\sum _{x\in {\mathcal {X}}}P(x)\log \left({\frac {P(x)}{Q(x)}}\right)
$$

So the closer \\( KLDB \\) is to \\( 0 \\), the closer the distribution of the first significant digit of the neural network parameters is to Benford's law.

In [None]:
"""
    MLH Metric
"""

def pearsonR(X, Y):
    """
        Compute the Pearson correlation coefficient between tensors X and Y.

        Both inputs are centered on their own mean; the result is the sum of
        the element-wise products of the centered values divided by the product
        of their Euclidean norms.
    """
    dx = X - X.mean()
    dy = Y - Y.mean()

    cross = (dx * dy).sum()
    norm_x = dx.square().sum().sqrt()
    norm_y = dy.square().sum().sqrt()

    return cross / (norm_x * norm_y)

def BinCount(theta):
    """
        Return the frequency of each digit 1..9 as first significant digit in
        the parameter set defined by theta.

        theta: tensor of any shape; the sign of the values is ignored.
        Returns a float tensor of 9 entries (index 0 is digit 1, index 8 is
        digit 9) located on theta's device. Zero and non-finite entries have no
        first significant digit: they are skipped and the frequencies are
        normalised by the number of remaining entries. If nothing remains, the
        result is all zeros.
    """
    # Double precision keeps the log10 / division below as exact as possible.
    magnitudes = theta.detach().reshape(-1).abs().to(torch.float64)
    magnitudes = magnitudes[torch.isfinite(magnitudes) & (magnitudes > 0)]
    if magnitudes.numel() == 0:
        return torch.zeros(9, device=theta.device)

    # Scale every value into [1, 10) and keep the integer part.
    exponents = torch.floor(torch.log10(magnitudes))
    digits = torch.floor(magnitudes / torch.pow(10.0, exponents)).long()
    # log10 can be off by one ulp around powers of ten: a computed digit of 10
    # means the value is really 1.0...e(k+1), a computed 0 means 9.9...e(k-1).
    digits = torch.where(digits > 9, torch.ones_like(digits), digits)
    digits = torch.where(digits < 1, torch.full_like(digits, 9), digits)

    # Bin 0 is never used, bins 1..9 hold the digit counts.
    counts = torch.bincount(digits, minlength=10)[1:10]
    return counts.to(torch.float) / magnitudes.numel()
    
@torch.no_grad()
def MLH(theta):
    """
        Return the MLH of a set of parameters: the Pearson correlation between
        the first-significant-digit frequencies of theta and Benford's law.
    """
    # Entries whose magnitude is at most 1e-9 are dropped before counting digits
    significant = theta[torch.abs(theta) > 1e-9]
    digit_freq = BinCount(significant.flatten())
    return pearsonR(digit_freq, benford_b10)

def KlDivergence(P, Q):
    """
        Return the KL divergence of P from Q, computed with base-2 logarithms.

        Ratios P/Q that are exactly zero are replaced by 1e-7 before taking the
        logarithm, so the matching terms contribute 0 instead of 0 * -inf.
    """
    quotient = P / Q
    safe_quotient = quotient.masked_fill(quotient == 0.0, 1e-7)
    return (P * safe_quotient.log2()).sum()

@torch.no_grad()
def KLDB(theta):
    """
        Return the KLDB of a set of parameters: the KL divergence between the
        first-significant-digit frequencies of theta and Benford's law.
    """
    # Entries whose magnitude is at most 1e-9 are dropped before counting digits
    significant = theta[torch.abs(theta) > 1e-9]
    digit_freq = BinCount(significant.flatten())
    return KlDivergence(digit_freq, benford_b10)
    

def model_MLH(model):
    """
        Return the MLH computed over all the parameters of a model
    """
    # One flat vector holding every (detached) parameter of the model
    all_weights = torch.cat([w.detach().flatten() for w in model.parameters()])
    return MLH(all_weights)

def model_KLDB(model):
    """
        Return the KLDB computed over all the parameters of a model
    """
    # One flat vector holding every (detached) parameter of the model
    all_weights = torch.cat([w.detach().flatten() for w in model.parameters()])
    return KLDB(all_weights)

def model_benford_metrics(model):
    """
        Return the pair (MLH, KLDB) of a model as Python floats.

        The digit frequencies are computed once and shared by both metrics.
    """
    all_weights = torch.cat([w.detach().flatten() for w in model.parameters()])
    # Entries whose magnitude is at most 1e-9 are dropped before counting digits
    significant = all_weights[all_weights.abs() > 1e-9]
    digit_freq = BinCount(significant.flatten())
    mlh_value = pearsonR(digit_freq, benford_b10).item()
    kldb_value = KlDivergence(digit_freq, benford_b10).item()
    return mlh_value, kldb_value

# Empirical test on datasets

I'm going to use different datasets and fit an autoencoder on each of them with cross-validation. I chose this task because it can be done on any dataset as long as it contains numbers. I'll select the best model once based on the validation loss, once based on the \\( MLH \\) score, and once based on the \\( KLDB \\) score. I'll also measure the performance of the model at the end of training without any selection. The hypothetical advantage of using MLH and KLDB is that no validation split is needed, so the training process can use all of the data.

In [None]:
"""
    Data management functions
"""
def process_data(csv_file):
    """
        Read csv_file with pandas and return its standardised numerical values
        as a numpy array.

        A column is kept only if its dtype is not "object" and it contains at
        least one duplicated value. Each kept column is centered on its mean
        and divided by its standard deviation (by 1 when the standard deviation
        is not strictly positive).
    """
    frame = pd.read_csv(csv_file)
    normalized = pd.DataFrame()
    for name in frame.columns:
        column = frame[name]
        # Skip the categorical features and the columns without any repeated value
        if column.dtype == "object" or not column.duplicated().any():
            continue
        center = column.mean()
        spread = column.std()
        scale = spread if spread > 0.0 else 1
        normalized[name] = (column - center) / scale
    return normalized.values

In [None]:
"""
    Neural network auto-encoder functions
"""

def get_model(data_shape, n_hidden_layers, n_components):
    """
        Build an autoencoder and return two independent copies of it on `device`.

        The layer widths go linearly from data_shape down to n_components over
        n_hidden_layers + 1 Linear layers (encoder), then back up to data_shape
        the same way (decoder). Every Linear layer is followed by a ReLU except
        the last one. Both returned models are deep copies of the same freshly
        initialised network, so they start from identical weights.
    """
    # Layer widths, chosen linearly between the input size and the bottleneck
    widths = np.linspace(float(data_shape), float(n_components), num=2 + n_hidden_layers)

    # Encoder widths followed by the mirrored decoder widths
    chain = [data_shape] + list(widths[1:]) + list(widths[::-1][1:])

    layers = [nn.Flatten()]
    for fan_in, fan_out in zip(chain[:-1], chain[1:]):
        layers.append(nn.Linear(int(fan_in), int(fan_out)))
        layers.append(nn.ReLU())
    # No activation after the output layer
    layers.pop()

    model = nn.Sequential(*layers)
    # Name it to be able to find it later
    encoder_tag = '-'.join(map(str, widths.astype(np.uint)))
    decoder_tag = '-'.join(map(str, reversed(widths.astype(np.uint))))
    model.name = 'AE_' + encoder_tag + '|' + decoder_tag
    return deepcopy(model).to(device), deepcopy(model).to(device)

def build_models(data, n_max_layers=4, pca_prop=0.4, n_folds=4):
    """
        Iterator that yields n_folds pairs of models for each architecture, from
        1 to n_max_layers hidden layers (n_max_layers * n_folds pairs in total).

        The bottleneck size is the number of PCA components needed for the
        cumulative explained variance ratio to exceed pca_prop (just a way to
        account for the complexity of the data). If the ratio never exceeds
        pca_prop, every component is used.

        data: 2D array-like accepted by sklearn's PCA, one example per row.
        Yields the pair of models returned by get_model.
    """
    pca = PCA()
    pca.fit(data)
    cumulative = np.cumsum(pca.explained_variance_ratio_)
    reached = cumulative > pca_prop
    if reached.any():
        # argmax gives the 0-based index of the first component crossing the
        # threshold; the number of components is that index + 1 (never 0).
        n_components = int(np.argmax(reached)) + 1
    else:
        n_components = len(cumulative)

    # Number of features per example
    data_shape = int(data.size / data.shape[0])
    # Yield models
    for i in range(n_max_layers):
        for _ in range(n_folds):
            yield get_model(data_shape, i + 1, n_components)

In [None]:
"""
    Training functions
"""

def train_val(model, data, n_steps=15000, batch_size=32, val_every=50):
    """
        Train "model" as an autoencoder on "data" for "n_steps" with "batch_size".
        A validation step is made every "val_every" steps and the weights with
        the lowest validation loss are kept.

        data: pair (X_train, X_test) of numpy arrays; 15% of X_train is held
              out as validation set.
        Returns (predictions of the best model on X_test, KLDB of that model).
        The predictions are computed without autograd tracking. The best model
        is also written to "best_acc.pt". If no validation step ran
        (n_steps == 0), the model is used as-is and the KLDB is NaN.
    """
    print("Training {}".format(model.name))
    # Separate validation and training (inputs and targets are the same data)
    X_train, X_test = data
    X_train, X_val = train_test_split(X_train, test_size=0.15)

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters())

    # The targets must stay floating point: they are the normalised inputs.
    X_val_batch = torch.tensor(X_val, device=device, dtype=torch.float)

    idxs = []
    val_loss = float("nan")
    best_val_loss = float("inf")
    best_val_kldb = float("nan")
    best_state = None
    for i in range(n_steps):
        # At end of epoch just loop
        if not len(idxs):
            idxs = np.random.permutation(len(X_train))
        # Sample a batch
        bidxs, idxs = idxs[:batch_size], idxs[batch_size:]
        X_batch = torch.tensor(X_train[bidxs], device=device, dtype=torch.float)

        # Optimisation
        optimizer.zero_grad()
        train_loss = criterion(model(X_batch), X_batch)
        train_loss.backward()
        optimizer.step()

        if i % val_every != 0:
            continue

        # Evaluation
        with torch.no_grad():
            val_loss = criterion(model(X_val_batch), X_val_batch).item()
        # Calculate the Benford metrics
        mlh, kldb = model_benford_metrics(model)
        print("Step {}: loss={:.4f} | val_loss={:.4f} | MLH={:.4f} | KLDB={:.4f}".format(
            i, train_loss.item(), val_loss, mlh, kldb
        ), end='\r')
        # We keep the weights if they are better
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_val_kldb = kldb
            # In-memory snapshot: no dependence on a checkpoint file that could
            # be stale or unreadable by torch.load.
            best_state = deepcopy(model.state_dict())

    print("Last val loss={:.5f} | Best val loss={:.5f}                           ".format(
        val_loss, best_val_loss
    ))
    best_model = deepcopy(model)
    if best_state is not None:
        best_model.load_state_dict(best_state)
    torch.save(best_model, "best_acc.pt")
    with torch.no_grad():
        test_preds = best_model(torch.tensor(X_test, device=device, dtype=torch.float))
    return test_preds, best_val_kldb
            
def train_benford(model, data, n_steps=15000, batch_size=32):
    """
        Train "model" as an autoencoder on "data" for "n_steps" with "batch_size".
        After every step the MLH and KLDB of the weights are computed; the
        weights with the highest MLH and the weights with the lowest KLDB are
        kept.

        data: pair (X_train, X_test) of numpy arrays; all of X_train is used
              for training.
        Returns, in this order, the predictions on X_test of
        (best-MLH model, best-KLDB model, model at the last step), computed
        without autograd tracking. The two selected models are also written to
        "best_mlh.pt" and "best_kldb.pt". A selection that never triggered
        falls back to the last model.
    """
    print("Training {}".format(model.name))
    X_train, X_test = data

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters())

    idxs = []
    mlh = kldb = float("nan")
    best_mlh = float("-inf")
    best_kldb = float("inf")
    best_mlh_state = None
    best_kldb_state = None
    for i in range(n_steps):
        # At end of epoch just loop
        if not len(idxs):
            idxs = np.random.permutation(len(X_train))
        # Sample a batch
        bidxs, idxs = idxs[:batch_size], idxs[batch_size:]
        X_batch = torch.tensor(X_train[bidxs], device=device, dtype=torch.float)

        # Optimisation
        optimizer.zero_grad()
        train_loss = criterion(model(X_batch), X_batch)
        train_loss.backward()
        optimizer.step()

        # Calculate the Benford metrics
        mlh, kldb = model_benford_metrics(model)
        print("Step {}: loss={:.5f} | MLH={:.5f} | KLDB={:.5f}".format(
            i, train_loss.item(), mlh, kldb
        ), end='\r')
        # Keep an in-memory snapshot of the weights when a metric improves
        if mlh > best_mlh:
            best_mlh = mlh
            best_mlh_state = deepcopy(model.state_dict())

        if kldb < best_kldb:
            best_kldb = kldb
            best_kldb_state = deepcopy(model.state_dict())

    print("Last MLH={:.5f} | Best MLH={:.5f}                     ".format(
        mlh, best_mlh)
    )
    print("Last KLDB={:.5f} | Best KLDB={:.5f}".format(kldb, best_kldb))

    def restored(state):
        # Copy of the model carrying the snapshot weights (or the last weights)
        clone = deepcopy(model)
        if state is not None:
            clone.load_state_dict(state)
        return clone

    mlh_model = restored(best_mlh_state)
    kldb_model = restored(best_kldb_state)
    torch.save(mlh_model, "best_mlh.pt")
    torch.save(kldb_model, "best_kldb.pt")

    X_t_test = torch.tensor(X_test, device=device, dtype=torch.float)
    with torch.no_grad():
        last_model_preds = model(X_t_test)
        mlh_preds = mlh_model(X_t_test)
        # Each selection criterion predicts with its own model
        kldb_preds = kldb_model(X_t_test)
    return mlh_preds, kldb_preds, last_model_preds

In [None]:
"""
    Experiment function
"""

def experiment_on_dataset(csv_file, n_folds=3):
    """
        Run the previously described experiment on the data in "csv_file" with "n_folds".

        For every architecture yielded by build_models, each example is
        randomly assigned to one of n_folds folds. For each fold, one model is
        trained with validation-loss selection (train_val) and an identical one
        with Benford-metric selection (train_benford); their predictions on the
        held-out fold are stored. Once all folds are done, the MSE of the
        stored predictions over the whole dataset is recorded.

        Returns five lists with one entry per architecture:
        (test loss with validation selection, test loss with MLH selection,
         test loss with KLDB selection, test loss of the last model,
         mean over the folds of the KLDB of the validation-selected model).
    """
    print("Running on dataset:",csv_file)
    # Load data
    all_data = process_data(csv_file)
    criterion = nn.MSELoss()
    # Histories
    val_hist = []
    best_val_kldb_hist = []
    mlh_hist = []
    kldb_hist = []
    last_model_hist = []
    best_val_kldb_buffer = []
    # Out-of-fold predictions, filled fold by fold
    val_preds = torch.zeros(all_data.shape, device=device, dtype=torch.float)
    mlh_preds = torch.zeros(all_data.shape, device=device, dtype=torch.float)
    kldb_preds = torch.zeros(all_data.shape, device=device, dtype=torch.float)
    last_model_preds = torch.zeros(all_data.shape, device=device, dtype=torch.float)
    ys = torch.tensor(all_data, device=device, dtype=torch.float)
    # Iterate on the models
    for i, models in enumerate(build_models(all_data, n_folds=n_folds)):
        # Calculate current fold
        fold = i % n_folds
        if fold == 0: # If it's the first one we should assign each example to a fold
            folds = np.random.randint(0, n_folds, len(all_data))
        # We split train and test data
        in_test = folds == fold
        data = (all_data[~in_test], all_data[in_test])
        # Perform the training using the validation loss method
        val_pred, best_val_kldb = train_val(models[0], data)
        best_val_kldb_buffer.append(best_val_kldb)
        # Perform the training using the Benford metrics; train_benford returns
        # its predictions as (MLH-selected, KLDB-selected, last model).
        mlh_pred, kldb_pred, last_model_pred = train_benford(models[1], data)
        # We save the predictions on the test set associated to the current fold
        # (detached so the buffers never become part of an autograd graph)
        val_preds[in_test] = val_pred.detach()
        mlh_preds[in_test] = mlh_pred.detach()
        kldb_preds[in_test] = kldb_pred.detach()
        last_model_preds[in_test] = last_model_pred.detach()
        if fold == n_folds - 1: # When we did every fold
            # Save things
            best_val_kldb = sum(best_val_kldb_buffer) / len(best_val_kldb_buffer)
            best_val_kldb_buffer = []
            val_loss = criterion(val_preds, ys).item()
            val_preds[:] = 0.0
            mlh_loss = criterion(mlh_preds, ys).item()
            mlh_preds[:] = 0.0
            kldb_loss = criterion(kldb_preds, ys).item()
            kldb_preds[:] = 0.0
            last_model_loss = criterion(last_model_preds, ys).item()
            last_model_preds[:] = 0.0

            val_hist.append(val_loss)
            best_val_kldb_hist.append(best_val_kldb)
            mlh_hist.append(mlh_loss)
            kldb_hist.append(kldb_loss)
            last_model_hist.append(last_model_loss)
            print("Results: Val loss={:.4f} | MLH loss={:.4f} | KLDB loss={:.4f} | Best val loss KLDB={:.4f}".format(
                val_loss, mlh_loss, kldb_loss, best_val_kldb
            ))
    return val_hist, mlh_hist, kldb_hist, last_model_hist, best_val_kldb_hist

In [None]:
mnist_results = experiment_on_dataset("../input/digit-recognizer/train.csv")
plt.title("Score obtained on MNIST for different model selection metrics")
# Each result list has one entry per architecture; the k-th entry (0-based)
# comes from the model with k + 1 hidden layers, so the x axis starts at 1.
for scores, label in zip(mnist_results, ("Validation loss", "MLH", "KLDB", "Last model")):
    plt.plot(range(1, len(scores) + 1), scores, label=label)
plt.xlabel("Number of hidden layers")
plt.ylabel("Test loss")
plt.legend()

In [None]:
doverfit_results = experiment_on_dataset("../input/older-dataset-for-dont-overfit-ii-challenge/train.csv")
plt.title("Score obtained on \"Don't overfit\" for different model selection metrics")
# Each result list has one entry per architecture; the k-th entry (0-based)
# comes from the model with k + 1 hidden layers, so the x axis starts at 1.
for scores, label in zip(doverfit_results, ("Validation loss", "MLH", "KLDB", "Last model")):
    plt.plot(range(1, len(scores) + 1), scores, label=label)
plt.xlabel("Number of hidden layers")
plt.ylabel("Test loss")
plt.legend()

In [None]:
tabular_playground = experiment_on_dataset("../input/tabular-playground-series-mar-2021/train.csv")
plt.title("Score obtained on \"Tabular playground series march 2021\" for different model selection metrics")
# Each result list has one entry per architecture; the k-th entry (0-based)
# comes from the model with k + 1 hidden layers, so the x axis starts at 1.
for scores, label in zip(tabular_playground, ("Validation loss", "MLH", "KLDB", "Last model")):
    plt.plot(range(1, len(scores) + 1), scores, label=label)
plt.xlabel("Number of hidden layers")
plt.ylabel("Test loss")
plt.legend()

As we can see, it's not clear that selecting a model based on \\( MLH \\) or \\( KLDB \\) prevents overfitting. It's even unclear whether using those metrics is better than taking a random model. These metrics surely evolve in a certain way during training, but that evolution is probably related to something other than overfitting.

In [None]:
plt.title("KLDB of the model selected by the validation loss method")
# Entry 4 of each result tuple holds one KLDB value per architecture; the k-th
# value (0-based) comes from the model with k + 1 hidden layers, so the x axis
# starts at 1.
for results, label in (
    (mnist_results, "MNIST"),
    (doverfit_results, "Don't overfit"),
    (tabular_playground, "Tabular playground series march 2021"),
):
    kldb_curve = results[4]
    plt.plot(range(1, len(kldb_curve) + 1), kldb_curve, label=label)
plt.xlabel("Number of hidden layers")
plt.ylabel("KLDB")
plt.legend()

Also, it's clear that there is no absolute value of \\( MLH \\) or \\( KLDB \\) that defines a fitted model. 

**TODO**
* Write an algorithm that scrapes the output files of public notebooks to see if their rank in the leaderboard is related to \\( MLH \\) or \\( KLDB \\)
* Try to implement the results of [Opening the black box of Deep Neural Networks
via Information](https://arxiv.org/pdf/1703.00810.pdf) to see if \\( MLH \\) or \\( KLDB \\) have anything to do with the two phases that are discussed in the paper
* Use it on other tasks than autoencoding (classification, generative process, reinforcement learning)
* Use more Datasets for evidence

It's very likely that this notebook contains some imprecisions, spelling mistakes or things that are not correct. If you spot one of them, feel free to let me know in the comment section, I'll be happy to correct it!

In [None]:
# Deliberately end the notebook with an error; the message below gives the
# author's stated reason (the LaTeX rendering breaks otherwise).
raise Exception("for some reason if the notebook don't raise an error, then the latex is broken.")