# EDA : TPS - Oct 2021

# Important Notes:
- Binary Classification : Labels (Target) are either 0 or 1, and well-balanced
- 285 features, 1 label
- The features dtypes are 240 'float64' columns and 45 'int64' columns
- 240 features are numerical, and 45 features are categorical:
    - 'f22', 'f43', and 'f242' ~ 'f284' are binary
- All features are normalized, so each of them has min = 0 and max = 1
- No missing values in either the train or the test dataset

# Import Libraries & Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import gc
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the plotting libraries.
warnings.filterwarnings('ignore')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Seed NumPy's global RNG so the random row samples drawn below are repeatable.
np.random.seed(2021)

In [None]:
# Competition input directory; `cp` is reused later when the test set is re-read.
cp = "/kaggle/input/tabular-playground-series-oct-2021/"
# Read the three competition CSVs in one pass: train, test and the sample submission.
df_train, df_test, submission = (
    pd.read_csv(cp + csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
# Labels extraction & redundant columns removal
# Detach the label column from the training frame and keep it as its own Series.
targets = df_train.pop('target')
# The row identifier is not a feature, so remove it from both frames in place.
df_train.drop(columns = 'id', inplace = True)
df_test.drop(columns = 'id', inplace = True)

# General Overview

In [None]:
# Preview the first three rows of the training features (id and target already removed).
df_train.head(3)

In [None]:
# Print pandas' structural summary of the training frame.
df_train.info()

### Shapes

In [None]:
# Report (rows, columns) for both feature frames, one per line.
print(
    f"df_train shape : {df_train.shape}",
    f"df_test shape : {df_test.shape}",
    sep = "\n",
)

### Missing Value

In [None]:
# Count NaN cells in each frame separately, then report the combined total.
missing_in_train = df_train.isnull().sum().sum()
missing_in_test = df_test.isnull().sum().sum()
print(f"# of any missing value(s) in df_train / df_test: {missing_in_train + missing_in_test}")

In [None]:
# Drop the test frame and run a garbage-collection pass to free memory for the
# train-only analysis below; the test set is re-read from disk later in the notebook.
del df_test # Just to have some memory available temporarily
gc.collect()

# Labels Overview

The labels (target) are well-balanced.

In [None]:
# Count plot of the target classes, each bar annotated with its share of all rows.
# The seaborn theme is applied BEFORE any axes exist: the style is baked into an
# axes when it is created, so setting it afterwards has no effect on that axes.
sns.set(style = "darkgrid")
fig, ax = plt.subplots(figsize = (6, 6))
sns.countplot(x = targets, linewidth = 5, palette = "Set2", ax = ax)
ax.set_title('Target Countplot', fontsize = 20)
ax.set_xlabel('Target', fontsize = 12); ax.set_ylabel('Frequency', fontsize = 12)
# Set tick label size on the plotted axes explicitly rather than via the implicit current axes.
ax.tick_params(axis = 'both', labelsize = 12)
total = targets.shape[0]
for bar in ax.patches:
    bar_width, bar_height = bar.get_width(), bar.get_height()
    center_x = bar.get_x() + bar_width / 2
    # Percentage label placed at the centre of the bar.
    ax.annotate(f"{100 * bar_height / total:.2f}%", (center_x, bar.get_y() + bar_height / 2),
                fontsize = 12, ha = 'center')
    # Halve the bar width while keeping it centred on the same tick.
    bar.set_width(bar_width / 2)
    bar.set_x(center_x - bar_width / 4)
plt.show()

# Features Overview

In [None]:
# A feature is treated as binary when it takes exactly two distinct values
# (a NaN, if present, counts as a value of its own).
distinct_counts = df_train.nunique(dropna = False)
binaryColumns = distinct_counts.index[distinct_counts == 2].tolist()
print(f"Number of Binary features : {len(binaryColumns)}")
print(f"Binary features : {binaryColumns}")

## Feature Statistics

In [None]:
# Summary statistics with one row per feature, shaded with a blue gradient;
# in-cell bars are then drawn over the mean, std and max columns.
stats_styler = df_train.describe().T.style.background_gradient(cmap = 'Blues')
for stat_name, bar_color in (("mean", 'lightgreen'), ("std", '#ee1f5f'), ("max", '#FFA07A')):
    stats_styler = stats_styler.bar(subset = [stat_name], color = bar_color)
# Last expression of the cell, so the notebook renders the styled table.
stats_styler

## Feature Correlation

In [None]:
# Pairwise correlation between all training features.
corr = df_train.corr()
# Boolean mask over the diagonal and upper triangle, so each pair is drawn only once.
mask = np.triu(np.ones(corr.shape, dtype = bool))
f, ax = plt.subplots(figsize = (18, 18))
# The colour scale can optionally be clamped with vmin = -0.05, vmax = 0.05.
sns.heatmap(
    corr,
    mask = mask,
    annot = False,
    center = 0,
    linewidths = .5,
    cmap = "coolwarm",
    ax = ax,
)
ax.set_title('Feature Correlation (Train Data)', fontsize = 24, y = 1.05)
plt.show()

In [None]:
# Run a full garbage-collection pass before the next memory-heavy cell.
gc.collect()

## Feature Skewness & Kurtosis

In [None]:
# Every column that was not flagged as binary, in original column order.
nonBinaryColumns = df_train.columns[~df_train.columns.isin(binaryColumns)].tolist()
# Per-feature skewness and kurtosis (column-wise, NaNs skipped: the pandas defaults).
skewness_train = df_train[nonBinaryColumns].skew()
kurtosis_train = df_train[nonBinaryColumns].kurt()

In [None]:
# Side-by-side box plots of the per-feature skewness and kurtosis values.
sns.set(style = "darkgrid")
fig, ax = plt.subplots(1, 2, figsize = (18, 5))
panels = (
    (ax[0], skewness_train, 'Skewness'),
    (ax[1], kurtosis_train, 'Kurtosis'),
)
for panel, values, title in panels:
    sns.boxplot(x = values, ax = panel)
    panel.set_title(title, fontsize = 20)
    # Set the tick label size on each panel explicitly: plt.xticks / plt.yticks
    # only touch the current axes, which left the first panel at the default size.
    panel.tick_params(axis = 'both', labelsize = 12)
plt.show()

## Binary Features Proportion & Correlation

In [None]:
# Correlation of each binary feature with the target.
binaryToTargetCorrelations = df_train[binaryColumns].apply(lambda feature: feature.corr(targets))
# Column means of the binary features.
binaryColumnsProportions = df_train[binaryColumns].mean()
# One row per binary feature, ordered by descending proportion.
df_binaryCorProp = (
    pd.DataFrame({
        'correlation': binaryToTargetCorrelations,
        'proportion': binaryColumnsProportions,
    })
    .rename_axis('feature')
    .reset_index()
    .sort_values(by = ['proportion'], ascending = False)
)

In [None]:
# plt.figure(figsize = (18, 6))
# plt.xticks(size = 12); plt.yticks(size = 12)
# sns.set(style = "darkgrid")
# ax = sns.scatterplot(data = df_binaryCorProp, x = 'feature', y = 'proportion')
# ax2 = ax.twinx()
# sns.scatterplot(data = df_binaryCorProp, x = 'feature', y = 'correlation', ax = ax2, color = 'r')
# features = list(df_binaryCorProp["feature"])
# ax.set_xticks(features); ax.set_xticklabels(features, rotation = 45)
# ax.set_xlabel('Feature', fontsize = 12); ax.set_ylabel('Proportion of 1', fontsize = 12); ax2.set_ylabel('Correlation', fontsize = 12)
# ax.set_title('Proportion of Binary Features & Correlations to the Target', fontsize = 20)
# plt.show()

In [None]:
# KDE of every binary feature, split by target class, one panel per feature.
# Each panel draws its own 10,000-row random sample of the training data.
fig, axes = plt.subplots(9, 5, figsize = (20, 20))
axes = axes.flatten()
gc.collect()
# zip() pairs each panel with its feature and stops at the shorter sequence,
# so a feature count different from the 45 panels cannot raise an IndexError.
for ax, col in zip(axes, binaryColumns):
    # Sample only the plotted column instead of copying all columns for every panel.
    values = df_train[col].sample(n = 10000)
    labels = targets.loc[values.index]
    # `fill` is the current seaborn spelling of the deprecated `shade`, and matches
    # the other KDE cells in this notebook.
    sns.kdeplot(values[labels == 1], fill = True, color = "blue", label = "1", ax = ax)
    sns.kdeplot(values[labels == 0], fill = True, color = "green", label = "0", ax = ax)
    ax.get_yaxis().set_visible(False)
    # Title each panel with the feature actually plotted.
    ax.set_title(col, loc = 'right', fontsize = 12)
    ax.legend()
# Hide any panels that were left without a feature.
for ax in axes[len(binaryColumns):]:
    ax.set_visible(False)
fig.suptitle("Binary Feature vs Target")
fig.tight_layout()
plt.show()

## Numerical Features Distribution

In [None]:
# Free what can be freed, then bring the test set back (it was deleted earlier to
# save memory) without its id column, for the train-vs-test comparison.
gc.collect()
df_test = pd.read_csv(cp + "test.csv").drop(columns = 'id')

In [None]:
# Train-vs-test KDE for features f0 .. f120, one panel per feature in an 11 x 11 grid.
# In every panel the first curve drawn is the train sample, the second the test sample.
fig, axes = plt.subplots(11, 11, figsize = (20, 15))
axes = axes.flatten()
for idx, ax in enumerate(axes):
    col = f'f{idx}'
    # Sample 30,000 values of the plotted column only; sampling the whole frame
    # copied every column for each of the 121 panels.
    sns.kdeplot(x = df_train[col].sample(n = 30000), fill = True, ax = ax)
    sns.kdeplot(x = df_test[col].sample(n = 30000), fill = True, ax = ax)
    # Panels are too small for ticks and axis labels; the title carries the feature name.
    ax.set_xticks([]); ax.set_yticks([]); ax.set_xlabel(''); ax.set_ylabel('')
    ax.set_title(col, loc = 'right', fontsize = 12)
fig.tight_layout()
plt.show()

In [None]:
# Run a full garbage-collection pass between the two large plot grids.
gc.collect()

In [None]:
# Train-vs-test KDE for features f121 .. f241, one panel per feature in an 11 x 11 grid.
# In every panel the first curve drawn is the train sample, the second the test sample.
fig, axes = plt.subplots(11, 11, figsize = (20, 15))
axes = axes.flatten()
for idx, ax in enumerate(axes):
    col = f'f{idx + 121}'
    # Sample 30,000 values of the plotted column only; sampling the whole frame
    # copied every column for each of the 121 panels.
    sns.kdeplot(x = df_train[col].sample(n = 30000), fill = True, ax = ax)
    sns.kdeplot(x = df_test[col].sample(n = 30000), fill = True, ax = ax)
    # Panels are too small for ticks and axis labels; the title carries the feature name.
    ax.set_xticks([]); ax.set_yticks([]); ax.set_xlabel(''); ax.set_ylabel('')
    ax.set_title(col, loc = 'right', fontsize = 12)
fig.tight_layout()
plt.show()