In [None]:
import os
from scipy import stats
import pandas as pd
import numpy as np
import seaborn as sns
from tqdm.notebook import tqdm

from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans

# Load data

In [None]:
# Directory that holds the input CSV files; load_data reads `<path>/<source>.csv` from it.
path = '../input/tabular-playground-series-sep-2021'
# Walk the directory tree under `path` and print the full path of every file found,
# as a quick check of which input files are available.
for dirname, _, filenames in os.walk(path):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
def load_data(source, dtypes, path=path):
    """Read one of the input tables into a DataFrame indexed by ``id``.

    Parameters
    ----------
    source : str
        Which table to read; must be ``'train'`` or ``'test'``.
    dtypes : dict, type or None
        Forwarded to ``pd.read_csv`` as ``dtype`` (``None`` lets pandas infer).
    path : str
        Directory containing ``<source>.csv``. The default is whatever the
        module-level ``path`` held when this function was defined.

    Returns
    -------
    pandas.DataFrame
        The table, with the ``id`` column used as index.
    """
    assert source in ('train', 'test')
    csv_file = f'{path}/{source}.csv'
    return pd.read_csv(csv_file, index_col="id", dtype=dtypes)

In [None]:
%%time
train = load_data('train', None)
print(f"Data shape: {train.shape}")
train.sample(2)

In [None]:
# Label column; every other column of `train` is treated as a feature.
target_name = "claim"
features = [column for column in train.columns if column != target_name]

# Statistics

* **Skewness** is a measure of symmetry, or more precisely, the lack of symmetry. A distribution, or data set, is symmetric if it looks the same to the left and right of the center point.
    * If the skewness is between -0.5 and 0.5, the data are fairly symmetrical.
    * If the skewness is between -1 and -0.5 (negatively skewed) or between 0.5 and 1 (positively skewed), the data are moderately skewed.
    * If the skewness is less than -1 (negatively skewed) or greater than 1 (positively skewed), the data are highly skewed.
        
        

* **Kurtosis** is a measure of whether the data are heavy-tailed or light-tailed relative to a normal distribution.
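    * Mesokurtic: The distribution has a kurtosis similar to that of the normal distribution, i.e. its extreme values behave like those of a normal distribution. Under the (Pearson) definition used in this section, the normal distribution has a kurtosis of three. Note that `scipy.stats.kurtosis`, which the code below calls with its default settings, returns the Fisher (excess) kurtosis, i.e. the Pearson value minus three, so a normal distribution scores 0 there.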
    * Leptokurtic (Kurtosis > 3): The distribution is longer and its tails are fatter. The peak is higher and sharper than in the mesokurtic case, which means the data are heavy-tailed or contain a profusion of outliers. Outliers stretch the horizontal axis of the histogram, which makes the bulk of the data appear in a narrow (“skinny”) vertical range, thereby giving the “skinniness” of a leptokurtic distribution.

    * Platykurtic (Kurtosis < 3): The distribution is shorter and its tails are thinner than those of the normal distribution. The peak is lower and broader than in the mesokurtic case, which means the data are light-tailed or lack outliers, because extreme values are less frequent than under a normal distribution.

# Creating fill nan Dictionary
1. **KMeans** to test the different cluster configurations.
2. **Elbow Method** to check whether there is more than one cluster:
    * NO: Filling NaN values with the **MEAN** is the better solution.
    * YES: Check whether the data is equally distributed between clusters.
        * YES: Filling NaN values with the **MODE** is the better solution.
        * NO: Use kurtosis and skewness to decide whether it is better to fill the NaN values with the **MODE** or the **MEDIAN**.

### Scaling data set

In [None]:
# Min-max scale every column of `train` (the target column is still part of
# `train` here, so it is scaled as well). `mns` stays available as the fitted scaler.
mns = MinMaxScaler()
# Build the scaled frame in one step so `train_scaled` is never a bare ndarray,
# and keep the original row labels so it stays aligned row-for-row with `train`.
train_scaled = pd.DataFrame(
    mns.fit_transform(train),
    columns=train.columns,
    index=train.index,
)

### Elbow Method to determine number of clusters

In [None]:
# Thresholds handed to optimum_filler for every feature column.

# Elbow test: minimum drop in KMeans inertia between two consecutive cluster
# counts for the column to be treated as having more than one cluster.
SSD_DIFF_TH = 6_000
# Cluster balance: a difference in cluster sizes above this value means the
# clusters are considered unevenly populated.
MULTIMODAL_TH = 100_000
# For unevenly populated clusters, "Mode" is chosen only when
# |skew| < MODE_SKEW_TH and kurtosis < MODE_KURTOSIS_TH; otherwise "Median".
MODE_KURTOSIS_TH = -1.4
MODE_SKEW_TH = 0.8
# Passed to optimum_filler as mean_skew_threshold, which its body does not read.
MEAN_SKEW_TH = 0.4

In [None]:
def optimum_filler(df, col_name, ssd_dif_threshold, multimodal_threshold, mode_skew_threshold, mode_kurtosis_threshold, mean_skew_threshold, random_state=0):
    """Choose the NaN-imputation strategy for one column.

    The non-NaN values of ``df[col_name]`` are clustered with KMeans for
    k = 1, 2, 3 (capped at the number of available values). As soon as the
    inertia drops by more than ``ssd_dif_threshold`` from one k to the next,
    the column is treated as having more than one cluster and that fit is
    inspected:

    * no such drop                       -> ``"Mean"``
    * clusters of similar size           -> ``"Mode"``
    * clusters of clearly different size -> ``"Mode"`` if the distribution is
      roughly symmetric and strongly platykurtic, otherwise ``"Median"``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to analyse.
    col_name : str
        Name of the column in ``df``.
    ssd_dif_threshold : float
        Minimum absolute inertia change between consecutive k that signals
        more than one cluster.
    multimodal_threshold : float
        Maximum spread (largest minus smallest cluster size) for the clusters
        to still count as equally populated.
    mode_skew_threshold : float
        ``"Mode"`` requires ``-mode_skew_threshold < skew < mode_skew_threshold``.
    mode_kurtosis_threshold : float
        ``"Mode"`` additionally requires kurtosis below this value. Kurtosis
        is scipy's default Fisher (excess) kurtosis, where a normal
        distribution scores 0.
    mean_skew_threshold : float
        Currently not used; kept so existing callers keep working.
    random_state : int, RandomState instance or None, default 0
        Seed forwarded to ``KMeans`` so repeated runs give the same answer.
        Pass ``None`` for unseeded behaviour.

    Returns
    -------
    str
        One of ``"Mean"``, ``"Median"`` or ``"Mode"``.
    """
    values = df[col_name].dropna().to_numpy()
    samples = values.reshape(-1, 1)

    # KMeans rejects n_clusters > n_samples, so never ask for more clusters
    # than there are values. An all-NaN column skips the loop and gets "Mean".
    max_clusters = min(3, len(values))

    multi_cluster_fit = None
    previous_ssd = None
    for k in range(1, max_clusters + 1):
        candidate = KMeans(n_clusters=k, random_state=random_state).fit(samples)
        ssd = candidate.inertia_
        # Elbow test on plain scalars: compare with the inertia of the previous k.
        if previous_ssd is not None and abs(previous_ssd - ssd) > ssd_dif_threshold:
            multi_cluster_fit = candidate
            break
        previous_ssd = ssd

    if multi_cluster_fit is None:
        return "Mean"

    _, counts = np.unique(multi_cluster_fit.labels_, return_counts=True)
    # Use the spread over *all* clusters: cluster labels are arbitrary, so
    # comparing only the last two label counts would depend on label order.
    if counts.max() - counts.min() <= multimodal_threshold:
        return "Mode"

    # Shape statistics are only needed for unevenly populated clusters.
    skew = float(stats.skew(values))
    kurtosis = float(stats.kurtosis(values))
    if -mode_skew_threshold < skew < mode_skew_threshold and kurtosis < mode_kurtosis_threshold:
        return "Mode"
    return "Median"

In [None]:
%time
fill_nan_dic = {}
for col_name in tqdm(features):
    name_nanfill =  optimum_filler(train_scaled, col_name, SSD_DIFF_TH, MULTIMODAL_TH, MODE_SKEW_TH, MODE_KURTOSIS_TH, MEAN_SKEW_TH)
    fill_nan_dic[col_name] = name_nanfill

In [None]:
# Display the resulting mapping: feature name -> chosen fill strategy.
fill_nan_dic