# EDA + feature exploration

Credit to

1. For the EDA functions: https://www.kaggle.com/dwin183287/tps-september-2021-eda

In [None]:
# import packages
import os
import joblib
import numpy as np
import pandas as pd
import warnings

import matplotlib
import matplotlib.pyplot as plt
from skimage.filters import threshold_otsu
from matplotlib import ticker
import seaborn as sns

# import datasets
# Each CSV is read with its 'id' column used as the DataFrame index.
train_df = pd.read_csv('../input/tabular-playground-series-nov-2021/train.csv', index_col='id')
test_df = pd.read_csv('../input/tabular-playground-series-nov-2021/test.csv', index_col='id')
submission = pd.read_csv('../input/tabular-playground-series-nov-2021/sample_submission.csv', index_col='id')

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of a dataframe in place to reduce memory usage.

    Integer columns (signed or unsigned) are cast to the narrowest integer
    type of the same signedness that can hold their observed minimum and
    maximum. Floating point columns are cast to float16 or float32 when
    their observed range fits, and to float64 otherwise; only the range is
    checked, so the narrower float types may lose precision. Object columns
    are converted to ``category``. Columns of any other dtype (bool,
    datetime, category, pandas extension dtypes, ...) are left untouched.

    Memory usage before and after is printed.

    Args:
        df: The pandas DataFrame to shrink. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate types, narrowest first.
    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ints = (np.uint8, np.uint16, np.uint32, np.uint64)
    narrow_floats = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns are downcast. Treating
        # every non-object dtype as numeric would turn bool columns into
        # float16 and fail on min()/max() of unordered categoricals.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            for candidate in narrow_floats:
                limits = np.finfo(candidate)
                # A NaN min/max (empty or all-NaN column) fails both
                # comparisons and therefore falls through to float64.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                df[col] = df[col].astype(np.float64)
        else:
            candidates = signed_ints if col_type.kind == 'i' else unsigned_ints
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Bounds are inclusive: a value equal to the type's own
                # minimum or maximum is still representable.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Downcast both frames; the train frame is processed (and reported) first.
train_df, test_df = [reduce_mem_usage(frame) for frame in (train_df, test_df)]

In [None]:
# Summarise the training frame: its dimensions and the total count of NaN cells.
n_rows, n_cols = train_df.shape
n_missing = sum(train_df.isna().sum())
print(f'Number of rows: {n_rows};  Number of columns: {n_cols}; No of missing values: {n_missing}')

In [None]:
def plot_kde(features):
    """Overlay train and test KDE curves for up to 25 features on a 5x5 grid.

    Each feature gets its own subplot, filled row by row. The curve from
    the module-level ``train_df`` is drawn first (colour ``#ffd514``) and
    the curve from ``test_df`` on top of it (colour ``#ff355d``). Grid
    cells without a feature stay empty. The figure is shown with
    ``plt.show()``.

    Note that this sets ``plt.rcParams['figure.dpi']`` to 600, which also
    affects figures created afterwards.

    Args:
        features: Column names that exist in both ``train_df`` and
            ``test_df``. At most 25 names are supported.

    Raises:
        ValueError: If more than 25 features are given.
    """
    features = list(features)
    n_rows = n_cols = 5
    if len(features) > n_rows * n_cols:
        raise ValueError(
            f'plot_kde supports at most {n_rows * n_cols} features, got {len(features)}'
        )

    background_color = "#f6f5f5"

    plt.rcParams['figure.dpi'] = 600
    fig = plt.figure(figsize=(10, 10), facecolor='#f6f5f5')
    gs = fig.add_gridspec(n_rows, n_cols)
    gs.update(wspace=0.3, hspace=0.3)

    # Keep the axes in an ordinary list. Writing them into locals() is not a
    # supported way to create variables inside a function, and the lookups
    # fail on Python 3.13+, where locals() returns a fresh snapshot per call.
    axes = []
    for row in range(n_rows):
        for col in range(n_cols):
            ax = fig.add_subplot(gs[row, col])
            ax.set_facecolor(background_color)
            for s in ["top", "right"]:
                ax.spines[s].set_visible(False)
            axes.append(ax)

    for ax, col in zip(axes, features):
        # Train first, test second, so the test curve is drawn on top.
        for frame, color in ((train_df, '#ffd514'), (test_df, '#ff355d')):
            sns.kdeplot(ax=ax, x=frame[col], zorder=2, alpha=1, linewidth=1, color=color)
        # Styling is applied after both curves so that it overrides any
        # axis labels seaborn sets while plotting.
        ax.grid(which='major', axis='x', zorder=0, color='#EEEEEE', linewidth=0.4)
        ax.grid(which='major', axis='y', zorder=0, color='#EEEEEE', linewidth=0.4)
        ax.set_ylabel('')
        ax.set_xlabel(col, fontsize=4, fontweight='bold')
        ax.tick_params(labelsize=4, width=0.5)
        ax.xaxis.offsetText.set_fontsize(4)
        ax.yaxis.offsetText.set_fontsize(4)

    plt.show()

In [None]:
# Plot the first 100 columns of train_df in four consecutive batches of 25,
# one 5x5 figure per batch.
for start in range(0, 100, 25):
    plot_kde(list(train_df.columns[start:start + 25]))

In [None]:
# Absolute frequency of each target value.
target_df = pd.DataFrame(train_df['target'].value_counts()).reset_index()
target_df.columns = ['target', 'count']

# Relative frequency of each target value. The 'count' column holds
# fractions of the number of rows (count / n_rows), not values scaled to 100.
target_percent_df = pd.DataFrame(train_df['target'].value_counts()/train_df.shape[0]).reset_index()
target_percent_df.columns = ['target', 'count']

background_color = "#f6f5f5"


def _draw_target_bars(ax, data, xlabel, title, subtitle, label_fmt, label_offset):
    """Draw one horizontal bar panel of ``data['count']`` per ``data['target']``.

    Args:
        ax: Matplotlib axes to draw on.
        data: DataFrame with 'target' and 'count' columns.
        xlabel: Label for the x axis.
        title: Bold heading written above the bars.
        subtitle: Smaller caption written under the heading.
        label_fmt: ``str.format`` template applied to each bar's width to
            build its data label.
        label_offset: Gap, in x-axis data units, between the end of a bar
            and its data label.

    Returns:
        The axes object returned by ``sns.barplot``.
    """
    for s in ["right", "top"]:
        ax.spines[s].set_visible(False)
    ax.set_facecolor(background_color)
    bars = sns.barplot(ax=ax, y=data['target'], x=data['count'],
                       zorder=2, linewidth=0, orient='h', saturation=1, alpha=1)
    bars.set_xlabel(xlabel, fontsize=3, weight='bold')
    bars.set_ylabel("", fontsize=3, weight='bold')
    bars.tick_params(labelsize=3, width=0.5, length=1.5)
    bars.grid(which='major', axis='x', zorder=0, color='#EEEEEE', linewidth=0.4)
    bars.grid(which='major', axis='y', zorder=0, color='#EEEEEE', linewidth=0.4)
    ax.text(0, -0.8, title, fontsize=4, ha='left', va='top', weight='bold')
    ax.text(0, -0.65, subtitle, fontsize=2.5, ha='left', va='top')
    # data label: one boxed value just past the end of each bar
    for patch in ax.patches:
        ax.text(patch.get_x() + patch.get_width() + label_offset,
                patch.get_y() + patch.get_height() / 2,
                label_fmt.format(patch.get_width()),
                ha='left', va='center', fontsize=2,
                bbox=dict(facecolor='none', edgecolor='black', boxstyle='round', linewidth=0.2))
    return bars


plt.rcParams['figure.dpi'] = 600
fig = plt.figure(figsize=(5, 1), facecolor='#f6f5f5')
gs = fig.add_gridspec(1, 2)
gs.update(wspace=0.3, hspace=0.05)

# Left panel: raw counts, with thousands separators on the x axis ticks.
ax0 = fig.add_subplot(gs[0, 0])
ax0_sns = _draw_target_bars(ax0, target_df, "count", 'Claim',
                            'Both 0 and 1 have almost the same counts',
                            '{:,.0f}', 10000)
ax0.get_xaxis().set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))

# Right panel: share of rows per target value.
ax1 = fig.add_subplot(gs[0, 1])
ax1_sns = _draw_target_bars(ax1, target_percent_df, "percentage", 'Claim in %',
                            'Both 0 and 1 are distributed almost equally, at about 50% each',
                            '{:.2f}', 0.01)

## Feature Exploration (WIP)

The experiments are implemented in the following notebook:
https://www.kaggle.com/realtimshady/baseline-lgbm

In [None]:
def plot_target_series(df, target=None):
    """Plot overlaid histograms of one feature, split by target value.

    The values where the target equals 0 and where it equals 1 are placed
    side by side as columns '0' and '1' and drawn as two semi-transparent
    histograms with 100 bins on a 20x20 inch figure.

    Args:
        df: The feature values as a pandas Series (despite the parameter
            name, a Series is required because it is renamed with a
            scalar label).
        target: Optional labels used to split ``df``; expected to be
            comparable with 0 and 1 and aligned with ``df``. When omitted,
            the module-level ``y`` is used, which must be defined before
            the call.
    """
    if target is None:
        # Fall back to the notebook-level label vector.
        target = y
    plt.figure(figsize=(20, 20))
    by_class = pd.concat(
        [df[target == 0].rename('0'), df[target == 1].rename('1')],
        axis=1,
    )
    sns.histplot(by_class, bins=100, alpha=0.5)
    plt.show()

In [None]:
# Disabled cell: the pandarallel installation and initialisation below are
# wrapped in a string literal, so none of it is executed.
'''
!pip install pandarallel -q
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)
'''

In [None]:
# Separate the label column from the feature columns of the training frame.
y = train_df["target"]
X = train_df.drop(columns=["target"])

In [None]:
# Start the engineered-feature table from the target column, then add the
# per-row standard deviation taken across all columns of X.
features = pd.DataFrame(y)
features['std'] = X.std(axis=1)

I grouped the features by whether their distribution looks like a single sharp peak (unimodal, "pointy") or is bimodal.

In [None]:
# Indices of the features whose distribution has a single sharp peak.
pointy = [0,2,4,9,12,16,19,20,21,23,24,27,28,30,31,32,33,35,39,42,44,46,48,49,51,52,53,56,58,59,60,61,62,63,64,68,69,72,73,75,76,78,79,81,83,84,87,88,89,90,92,93,94,95,98,99]

# Every other index in 0..99 is treated as bimodal. Both lists end up holding
# column names of the form 'f<index>'; bimodal is built first because it
# still needs the numeric indices.
bimodal = [f'f{idx}' for idx in range(100) if idx not in pointy]
pointy = [f'f{idx}' for idx in pointy]

For example, here is the distribution of feature 'f1', split by target value:

In [None]:
# Histogram of column 'f1', split by target value.
plot_target_series(X.loc[:, 'f1'])

In [None]:
# Otsu threshold of column 'f81'. threshold_otsu is documented to take an
# ndarray, so the Series is converted explicitly instead of relying on
# pandas mimicking the ndarray methods scikit-image calls internally.
threshold_otsu(X['f81'].to_numpy())