In [None]:
import os
import numpy as np
import pandas as pd

from datetime import datetime

import matplotlib.pyplot as plt

# Directory holding the competition files.
dir_data = '/kaggle/input/tabular-playground-series-jun-2022'
# Read the table and drop the `row_id` column in a single chained expression.
df = pd.read_csv(os.path.join(dir_data, 'data.csv')).drop(columns='row_id')

(Before starting, I apologize in advance for my poor English.)

## 1. The number of missing values per row follows a binomial distribution
Some of you might already know this. I think it can be useful when you intentionally add missing values to your data.

I assumed that, for each row, a missing value appears in each column independently with the same constant probability.

Let me give an example. As you already know, there are 15 columns whose prefix is `F_1`.<br>
The code below shows, for each value of `num_nan`, the number of rows (`count`) that have exactly `num_nan` missing values among the `F_1` columns.


In [None]:
prefix = 'F_1'
# Pick the columns whose name starts with the group prefix (a true prefix
# test rather than a substring test).
targets = [col for col in df.columns if col.startswith(prefix)]

df_sub = df[targets].copy()

# For every row, count how many of the selected columns are missing.
df_nan = df_sub.isnull().sum(axis=1).reset_index()
df_nan.columns = ['id', 'count']

# Tabulate how many rows have 0, 1, 2, ... missing values.
# The index is named explicitly before `reset_index` because pandas >= 2.0
# names both the index and the values of `value_counts()` 'count' here,
# which makes a bare `reset_index()` raise "cannot insert count, already exists".
df_count = (
    df_nan['count']
    .value_counts()
    .sort_index()
    .rename_axis('num_nan')
    .reset_index(name='count')
)
df_count['proportion'] = df_count['count'] / df_count['count'].sum()
df_count

I found that the number of missing values per row in `F_1` follows the binomial distribution with `n=15` and `p=0.0185`.<br>
The following code computes the Probability Mass Function (PMF) of [a binomial distribution](https://en.wikipedia.org/wiki/Binomial_distribution). (If you find any error, please let me know!)

In [None]:
def binomial(n, k, p):
    """Probability of exactly `k` successes in `n` Bernoulli trials.

    Evaluates the binomial PMF  C(n, k) * p**k * (1 - p)**(n - k).

    Parameters
    ----------
    n : int
        Number of trials.
    k : int
        Number of successes.
    p : float
        Success probability of a single trial.

    Returns
    -------
    float
        The PMF value; 0.0 when `k` lies outside the support 0..n.
    """
    # Outside the support the probability is zero by definition. Without this
    # guard a negative k would skip the loop and yield a non-zero value.
    if k < 0 or k > n:
        return 0.0
    q = 1 - p
    comb = 1
    for i in range(1, k + 1):
        # After step i, comb == C(n, i). The product comb * (n - i + 1) is
        # always divisible by i, so integer division keeps the coefficient exact.
        comb = comb * (n - (i - 1)) // i
    return float(comb) * (p ** k) * (q ** (n - k))

def binomial_distribution(n, p):
    """Return the binomial PMF values for k = 0, 1, ..., n as a list.

    Element `k` of the result is `binomial(n, k, p)`.
    """
    pmf = []
    for successes in range(n + 1):
        pmf.append(binomial(n, successes, p))
    return pmf
    

Let's check the PMF of the binomial distribution with `n=15` and `p=0.0185`. The column `p` is quite similar to the column `proportion` in the above data frame.

In [None]:
# Build the (k, p) table directly: pair every PMF value with its position k.
data = pd.DataFrame(
    list(enumerate(binomial_distribution(15, 0.0185))),
    columns=['k', 'p'],
)
data

Let's visualize it with a bar chart.

In [None]:
width = 0.25
plt.figure()
plt.bar(x=df_count['num_nan'], height=df_count['proportion'], width=width, label='observation')
# Shift the theoretical bars by one bar width so the two series sit side by side.
plt.bar(x=data['k'] + width, height=data['p'], width=width, label='Binomial dist')
# Label the figure so it can be read on its own when the notebook is skimmed.
plt.xlabel('number of missing values per row')
plt.ylabel('proportion of rows')
plt.title('Missing values per row: observation vs. binomial PMF')
plt.legend()
plt.show()

We can find the same characteristics for `F_3_x` and `F_4_x`.
- The number of missing values per row in `F_3_x` follows $B(25, 0.0185)$, and
- the number of missing values per row in `F_4_x` follows $B(15, 0.0185)$.

In [None]:
prefix = 'F_3'
# Pick the columns whose name starts with the group prefix (a true prefix
# test rather than a substring test).
targets = [col for col in df.columns if col.startswith(prefix)]

df_sub = df[targets].copy()

# For every row, count how many of the selected columns are missing.
df_nan = df_sub.isnull().sum(axis=1).reset_index()
df_nan.columns = ['id', 'count']

# Tabulate how many rows have 0, 1, 2, ... missing values.
# The index is named explicitly before `reset_index` because pandas >= 2.0
# names both the index and the values of `value_counts()` 'count' here,
# which makes a bare `reset_index()` raise "cannot insert count, already exists".
df_count = (
    df_nan['count']
    .value_counts()
    .sort_index()
    .rename_axis('num_nan')
    .reset_index(name='count')
)
df_count['proportion'] = df_count['count'] / df_count['count'].sum()

# Reference distribution B(25, 0.0185) as a (k, p) table.
data = binomial_distribution(25, 0.0185)
data = pd.DataFrame(data).reset_index()
data.columns = ['k', 'p']

width = 0.25
plt.figure()
plt.bar(x=df_count['num_nan'], height=df_count['proportion'], width=width, label='observation')
# Shift the theoretical bars by one bar width so the two series sit side by side.
plt.bar(x=data['k'] + width, height=data['p'], width=width, label='Binomial dist')
# Label the figure so it can be told apart from the other feature groups.
plt.xlabel('number of missing values per row')
plt.ylabel('proportion of rows')
plt.title(f'Missing values per row in {prefix}: observation vs. B(25, 0.0185)')
plt.legend()
plt.show()

In [None]:
prefix = 'F_4'
# Pick the columns whose name starts with the group prefix (a true prefix
# test rather than a substring test).
targets = [col for col in df.columns if col.startswith(prefix)]

df_sub = df[targets].copy()

# For every row, count how many of the selected columns are missing.
df_nan = df_sub.isnull().sum(axis=1).reset_index()
df_nan.columns = ['id', 'count']

# Tabulate how many rows have 0, 1, 2, ... missing values.
# The index is named explicitly before `reset_index` because pandas >= 2.0
# names both the index and the values of `value_counts()` 'count' here,
# which makes a bare `reset_index()` raise "cannot insert count, already exists".
df_count = (
    df_nan['count']
    .value_counts()
    .sort_index()
    .rename_axis('num_nan')
    .reset_index(name='count')
)
df_count['proportion'] = df_count['count'] / df_count['count'].sum()

# Reference distribution B(15, 0.0185) as a (k, p) table.
data = binomial_distribution(15, 0.0185)
data = pd.DataFrame(data).reset_index()
data.columns = ['k', 'p']

width = 0.25
plt.figure()
plt.bar(x=df_count['num_nan'], height=df_count['proportion'], width=width, label='observation')
# Shift the theoretical bars by one bar width so the two series sit side by side.
plt.bar(x=data['k'] + width, height=data['p'], width=width, label='Binomial dist')
# Label the figure so it can be told apart from the other feature groups.
plt.xlabel('number of missing values per row')
plt.ylabel('proportion of rows')
plt.title(f'Missing values per row in {prefix}: observation vs. B(15, 0.0185)')
plt.legend()
plt.show()

## 2. Visualizing the whole data frame

In [None]:
# Tick positions are the running totals 15, 40, 65, 80 moved back by half a
# row; written here as a cumulative sum of the four consecutive band sizes.
yticks = np.cumsum([15, 25, 25, 15]) - 0.5
# One label per tick position, in the same order.
yticklabels = ['F_' + str(group) for group in range(1, 5)]

In [None]:
def matshow(data):
    """Display a 2-D array as a grayscale image with red group separators.

    Parameters
    ----------
    data : 2-D array-like
        Matrix to render. The callers in this notebook pass the transposed
        data frame values.

    Notes
    -----
    Reads the module-level `yticks` and `yticklabels` for the y-axis ticks.
    A red separator is drawn at every tick position except the last one.
    """
    # `plt.matshow` opens its own figure when no `fignum` is given, so no
    # `plt.figure()` call precedes it; such a call only leaves an empty
    # figure behind.
    plt.matshow(data, aspect='auto', cmap='gray')
    # `axhline` spans the full width of the axes, so the separators do not
    # depend on a hard-coded number of samples.
    for boundary in yticks[:-1]:
        plt.axhline(y=boundary, color='red', linewidth=1)
    plt.yticks(yticks, yticklabels)
    plt.colorbar()
    plt.show()

### 2.1 Raw data frame

In [None]:
# Transpose the frame's values so that columns of `df` become image rows.
data = df.to_numpy().T

matshow(data)

### 2.2 Missing data visualization

In [None]:
# Boolean mask of missing entries, transposed so columns of `df` become image rows.
matshow(df.isna().to_numpy().T)

### 2.3 Min-max normalized data frame visualization

In [None]:
# Missing values are replaced by 0 before scaling.
data = df.fillna(0).values.T
# Min-max scale along axis 0 of the transposed matrix; `np.ptp` is max - min.
data = (data - data.min(axis=0)) / np.ptp(data, axis=0)

matshow(data)

I like the figure below. In this figure, each data point (each column in the figure) shows a similar pattern, especially for the `F_4_x` columns of the data frame.<br>

In [None]:
# Missing values are replaced by 0 before scaling.
data = df.fillna(0).values.T
# Min-max scale along axis 1 of the transposed matrix; `keepdims=True` keeps
# the reduced axis so the results broadcast back over `data`.
data = (data - data.min(axis=1, keepdims=True)) / np.ptp(data, axis=1, keepdims=True)

matshow(data)

In [None]:
# Missing values are replaced by 0 before scaling.
data = df.fillna(0).values.T
# Min-max scale with a single global minimum and range over the whole matrix.
data = (data - data.min()) / np.ptp(data)

matshow(data)