# Exploratory Data Analysis

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline in the notebook output (IPython magic).
%matplotlib inline
# Default size for every figure, given as (width, height).
plt.rcParams["figure.figsize"] = (14, 8)
# Apply seaborn's "notebook" context and "whitegrid" style to all plots.
sns.set_theme(context="notebook", style="whitegrid")

## Config

In [None]:
# --- Locations on disk -------------------------------------------------
# Directory holding the input files, relative to the working directory.
DATA_DIR = Path("..") / "input" / "tabular-playground-series-may-2021"

# Training set CSV inside the data directory.
TRAIN_DATA = DATA_DIR.joinpath("train.csv")

# --- Column names ------------------------------------------------------
# Column used as the row index when the CSV is loaded.
INDEX_COL = "id"
# Column holding the label to predict.
TARGET_COL = "target"

# --- Reproducibility ---------------------------------------------------
# Seed passed to any routine that accepts a random state.
RANDOM_STATE = 42

## Loading the data

In [None]:
# Read the training CSV, using the id column as the row index.
df = pd.read_csv(
    filepath_or_buffer=TRAIN_DATA,
    index_col=INDEX_COL,
)
# Print a summary of the frame (columns, dtypes, non-null counts).
df.info()

## Separate features from target

In [None]:
# Feature matrix: everything except the target column.
X = df.drop(columns=TARGET_COL)
# Label series: the target column on its own.
y = df[TARGET_COL]

## Distribution of variables

### Target variable

In [None]:
# Bar chart of the number of rows per distinct target value.
target_title = f"Distribution of {TARGET_COL}"
fig, ax = plt.subplots()

sns.countplot(ax=ax, x=y)
ax.set_title(target_title)
plt.show()

### Numeric features

In [None]:
# Every column of X is plotted as a numeric feature.
numeric_cols = X.columns.tolist()
# Two histograms per grid row; round up so an odd feature count still fits.
rows = np.ceil(len(numeric_cols) / 2).astype(int)
# squeeze=False keeps `axes` two-dimensional even when there is only one row,
# so the [row, column] indexing below works for any number of features.
fig, axes = plt.subplots(rows, 2, figsize=(14, 8 // 2 * rows), squeeze=False)

for i, col in enumerate(numeric_cols):
    ax = axes[i // 2, i % 2]
    sns.histplot(data=df, x=col, ax=ax)
    # y=0.88 places the title just below the top edge, inside the axes.
    ax.set_title(f"Histogram of {col}", y=0.88)

# Hide the trailing panel that stays empty when the feature count is odd.
for unused_ax in axes.flat[len(numeric_cols):]:
    unused_ax.set_visible(False)

# Adjust spacing only after the plots exist, so that the axis labels and
# tick labels added by seaborn are taken into account.
fig.tight_layout()
plt.show()

### Numeric features by target

In [None]:
# Same two-column grid as the plain histograms; `rows` and `numeric_cols`
# come from the previous cell.
# squeeze=False keeps `axes` two-dimensional even when there is only one row,
# so the [row, column] indexing below works for any number of features.
fig, axes = plt.subplots(rows, 2, figsize=(14, 8 // 2 * rows), squeeze=False)

for i, col in enumerate(numeric_cols):
    ax = axes[i // 2, i % 2]
    # One step-outline histogram per target value, overlaid on the same axes.
    sns.histplot(data=df, x=col, hue=TARGET_COL, element="step", ax=ax)
    # y=0.88 places the title just below the top edge, inside the axes.
    ax.set_title(f"Histogram of {col} by {TARGET_COL}", y=0.88)

# Hide the trailing panel that stays empty when the feature count is odd.
for unused_ax in axes.flat[len(numeric_cols):]:
    unused_ax.set_visible(False)

# Adjust spacing only after the plots exist, so that the axis labels,
# tick labels and legends added by seaborn are taken into account.
fig.tight_layout()
plt.show()

## Mutual information

In [None]:
from sklearn.feature_selection import mutual_info_classif

# Estimate the mutual information between each feature and the target, label
# the scores with the feature names, and rank them from highest to lowest.
# NOTE(review): mutual_info_classif defaults to discrete_features="auto",
# which treats dense input as continuous. A later cell singles out
# low-cardinality columns -- confirm whether they should be flagged as
# discrete here.
mi_scores = (
    pd.Series(
        mutual_info_classif(X, y, random_state=RANDOM_STATE),
        index=X.columns,
        name="MI Scores",
    )
    .sort_values(ascending=False)
)

In [None]:
# Scale the figure height with the number of features (one unit per three
# bars). Clamp to at least 1 so that fewer than three features do not
# collapse the figure to zero height.
fig_height = max(1, len(X.columns) // 3)
fig, ax = plt.subplots(figsize=(14, fig_height))

# Horizontal bars: score on the x axis, feature name on the y axis, in the
# order the scores are stored.
sns.barplot(x=mi_scores.values, y=mi_scores.index, ax=ax, color="tab:blue")
ax.set_title("Mutual information scores")
plt.show()

## Obtain discrete features

In [None]:
# A column counts as discrete when it has fewer than `threshold` distinct values.
threshold = 15
# Number of distinct values per feature column.
cardinality = X.nunique()
is_low_cardinality = cardinality.lt(threshold)
discrete = cardinality.loc[is_low_cardinality]
# Bare expression so the notebook displays the result.
discrete

## Obtain informative features per category

In [None]:
# Features whose estimated mutual information with the target is strictly positive.
informative_features = set(mi_scores[mi_scores > 0].index)
# Features flagged as discrete by the cardinality threshold.
discrete_features = set(discrete.index)
# sorted() accepts any iterable, so the sets are passed directly without an
# intermediate list; sorting makes the listing order deterministic.
informative_continuous = sorted(informative_features - discrete_features)
informative_discrete = sorted(informative_features & discrete_features)

print(f"Continuous informative features:\n{informative_continuous}")
print(f"Discrete informative features:\n{informative_discrete}")
print(f"Total features: {len(informative_continuous) + len(informative_discrete)}")