## TPS October 2021 - Trying EDA

### Work in progress...

Todo:
- Summary
- Insights
- Conclusion

## Basic setup

### Import libraries

In [None]:
%%time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Apply seaborn's default theme to the matplotlib figures created below.
sns.set()

# Pastel color palette reused by the pie charts and count plots in this notebook.
PALETTE = sns.color_palette("pastel")

### Load datasets

In [None]:
%%time

# Directory that holds the competition CSV files.
data_dir = "../input/tabular-playground-series-oct-2021/"

# Load both splits through the same call so they are read identically.
df_train, df_test = (
    pd.read_csv(data_dir + file_name) for file_name in ("train.csv", "test.csv")
)

## Inspect the datasets

In [None]:
%%time

def inspect(df):
    """Print the shape, the total missing-value count and the column names of ``df``.

    The column names are written on one line, each followed by a single
    space, and no trailing newline is emitted after them.
    """
    n_rows, n_cols = df.shape
    n_missing = df.isna().sum().sum()

    print(f"Rows: {n_rows}, Columns: {n_cols}")
    print(f"Missing: {n_missing}")
    print("Columns:")
    # One write for the whole name list; keeps the "name + space" layout.
    print("".join(f"{name} " for name in df.columns), end="")

# Print the same summary for both splits; each report is followed by a blank
# separator, and the second heading carries its own leading newline.
for heading, frame in (("Training data:", df_train), ("\nTest data:", df_test)):
    print(heading)
    inspect(frame)
    print("\n")

In [None]:
%%time

# Preview the top of the training frame; the notebook renders the last expression.
print("First five rows of training data:")
df_train.head(n=5)

In [None]:
%%time

# Preview the top of the test frame; the notebook renders the last expression.
print("First five rows of test data:")
df_test.head(n=5)

In [None]:
%%time

# Summary statistics for the columns labelled "f0" through "f284" (label
# slice, both ends included), transposed so each feature is one row.
print("Training data stats:")
df_train.loc[:, "f0":"f284"].describe().transpose()

In [None]:
%%time

# Summary statistics for the columns labelled "f0" through "f284" (label
# slice, both ends included), transposed so each feature is one row.
print("Test data stats:")
df_test.loc[:, "f0":"f284"].describe().transpose()

## Categorical vs. continuous

In [None]:
%%time

# from https://www.kaggle.com/vishwas21/tps-oct-21-eda-modeling

TARGET = "target"

# Every column except the row identifier and the label is a model feature.
features = [col for col in df_train.columns if col not in ["id", TARGET]]

# Integer-typed columns are treated as categorical, everything else as
# continuous. The dtype is checked with pandas' own predicate rather than by
# searching the dtype name for "int": the substring test misses the nullable
# "Int64" family (capital I) and wrongly matches "interval[...]" dtypes.
cat_features = [
    feat for feat in features
    if pd.api.types.is_integer_dtype(df_train[feat].dtype)
]
cont_features = [
    feat for feat in features
    if not pd.api.types.is_integer_dtype(df_train[feat].dtype)
]

# Report both groups, names on a single space-separated line each.
print(f"Categorical features ({len(cat_features)}):")
for feat in cat_features:
    print(feat, end=" ")

print(f"\n\nContinuous features ({len(cont_features)}):")
for feat in cont_features:
    print(feat, end=" ")

In [None]:
%%time

# Pie chart of how many features fall into each type, drawn through the axes
# object returned by plt.subplots.
fig, ax = plt.subplots(figsize=[10, 10])
ax.pie(
    [len(cat_features), len(cont_features)],
    labels=["Categorical", "Continuous"],
    autopct="%.2f%%",
    explode=[0.02] * 2,
    pctdistance=0.5,
    colors=PALETTE,
)
ax.set_title("Categorical vs. Continuous", fontsize=14)
plt.show()

## Target distribution

In [None]:
%%time

fig, ax = plt.subplots(figsize=[10, 10])

# value_counts() orders the classes by descending frequency, so the wedge
# labels are read from its index. A fixed ["1", "0"] list would mislabel the
# wedges whenever class 0 is the more frequent one.
target_counts = df_train[TARGET].value_counts()
labels = [str(value) for value in target_counts.index]

plt.pie(target_counts, autopct="%.2f%%", labels=labels,
        explode=[0.02] * len(labels), pctdistance=0.5, colors=PALETTE)
plt.title("Target distribution in training data", fontsize=14)
plt.show()

## Categorical feature distribution

In [None]:
%%time

# from https://www.kaggle.com/desalegngeb/octps-2021-eda-xgboost-lgbm

def count_plot(data, features, hue=None, title="Count plot", ncol=9):
    """Draw a grid of seaborn count plots, one panel per feature.

    Parameters
    ----------
    data : DataFrame
        Frame handed to ``sns.countplot``.
    features : iterable of str
        Column names to plot, one panel each, filled row by row.
    hue : str, optional
        Column used to split the bars. When it is ``None`` no legend is
        produced and none is added to the figure.
    title : str
        Figure-level title.
    ncol : int
        Number of panels per row (defaults to the original fixed value of 9).

    Nothing is drawn when ``features`` is empty. Uses the module-level
    ``PALETTE`` for the bar colors.
    """
    features = list(features)
    if not features:
        # plt.subplots cannot build a grid with zero rows.
        return

    nrow = int(np.ceil(len(features) / ncol))
    # squeeze=False keeps the axes array two-dimensional even for a 1x1 grid,
    # so flatten() is always available.
    fig, axes = plt.subplots(nrow, ncol, figsize=(22, 12), sharey=True,
                             squeeze=False)
    axes = axes.flatten()

    for panel, feature in zip(axes, features):
        sns.countplot(x=feature, data=data, hue=hue, linewidth=0,
                      palette=PALETTE, ax=panel)
        panel.set_xlabel(feature)
        panel.set_ylabel("")
        panel.xaxis.set_label_position("top")
        # A per-panel legend only exists when seaborn created one (hue given);
        # guard against None instead of calling .remove() unconditionally.
        panel_legend = panel.get_legend()
        if panel_legend is not None:
            panel_legend.remove()

    # Hide grid cells that received no plot.
    for spare in axes[len(features):]:
        spare.set_visible(False)

    # Read the shared legend entries from the last panel that was actually
    # drawn; the last grid cell may be an empty one.
    handles, labels = axes[len(features) - 1].get_legend_handles_labels()
    if handles:
        fig.legend(handles, labels, loc="upper right", borderaxespad=4.0)

    fig.subplots_adjust(top=0.92, hspace=.3)
    fig.suptitle(title, fontsize=20)
    plt.show()

In [None]:
%%time

# One count plot per categorical feature of the training data, bars split by TARGET.
count_plot(df_train, cat_features, hue=TARGET, title="Categorical feature: target distribution (count plot)")

## Continuous feature distribution

In [None]:
%%time

# from https://www.kaggle.com/desalegngeb/octps-2021-eda-xgboost-lgbm

def new_density_plotter(train, test):
    """Build a density-plot function bound to a train/test pair.

    Parameters
    ----------
    train, test : DataFrame
        Frames whose columns are compared; each plotted feature is looked up
        by name in both.

    Returns
    -------
    callable
        ``density_plotter(features, title="Density plot", ncol=10)``, which
        overlays the KDE of every listed feature for ``train`` and ``test``
        in a grid of panels and shows the figure.
    """
    # Shared styling for both curves; built once instead of once per panel.
    kde_style = {
        "fill": True,
        "alpha": .5,
        "linewidth": 0,
    }

    def density_plotter(features, title="Density plot", ncol=10):
        """Plot train/test densities for ``features``, ``ncol`` panels per row.

        Nothing is drawn when ``features`` is empty.
        """
        features = list(features)
        if not features:
            # plt.subplots cannot build a grid with zero rows.
            return

        nrow = int(np.ceil(len(features) / ncol))
        # squeeze=False keeps the axes array two-dimensional even for a 1x1
        # grid, so flatten() is always available.
        fig, axes = plt.subplots(nrow, ncol, figsize=(24, 12), squeeze=False)
        axes = axes.flatten()

        for panel, feature in zip(axes, features):
            sns.kdeplot(data=train[feature], label="train", ax=panel, **kde_style)
            sns.kdeplot(data=test[feature], label="test", ax=panel, **kde_style)
            panel.xaxis.set_label_position("top")
            panel.set_ylabel("")
            panel.set_yticks([])
            panel.set_xticks([])

        # Hide grid cells that received no plot.
        for spare in axes[len(features):]:
            spare.set_visible(False)

        # Read the legend entries from the last panel that was actually drawn;
        # the last grid cell is empty whenever the grid is not completely
        # filled, which would leave the figure legend blank.
        handles, labels = axes[len(features) - 1].get_legend_handles_labels()
        fig.legend(handles, labels, loc="upper center", borderaxespad=4.0)

        fig.subplots_adjust(top=0.90, hspace=.2)
        fig.suptitle(title, fontsize=20)
        plt.show()

    return density_plotter
    
# Bind the plotter to the loaded frames so later cells only pass feature names and a title.
density_plotter = new_density_plotter(df_train, df_test)

In [None]:
%%time

# Continuous features at positions 0-59 of cont_features.
density_plotter(cont_features[0:60], title="Density plot of numerical features: train & test data (first set)")

In [None]:
%%time

# Continuous features at positions 60-119 of cont_features.
density_plotter(cont_features[60:120], title="Density plot of numerical features: train & test data (second set)")

In [None]:
%%time

# Continuous features at positions 120-179 of cont_features.
density_plotter(cont_features[120:180], title="Density plot of numerical features: train & test data (third set)")

In [None]:
%%time

# All remaining continuous features, from position 180 of cont_features onward.
density_plotter(cont_features[180:], title="Density plot of numerical features: train & test data (last set)")

### Work in progress...