# Exploratory Data Analysis

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default (width, height) for figures created later in the notebook.
plt.rcParams["figure.figsize"] = (14, 8)
# Apply seaborn's "notebook" context and "whitegrid" style globally.
sns.set_theme(context="notebook", style="whitegrid")

## Config

In [None]:
# --- locations ---------------------------------------------------------------
# Input directory, expressed relative to the current working directory.
DATA_DIR = Path("..") / "input" / "tabular-playground-series-mar-2021"
# Training CSV inside the input directory.
TRAIN_DATA = DATA_DIR.joinpath("train.csv")

# --- column names ------------------------------------------------------------
# Row identifier column and the column to be predicted.
INDEX_COL, TARGET_COL = "id", "target"

# --- reproducibility ---------------------------------------------------------
# Seed passed to any routine that accepts a random state.
RANDOM_STATE = 42

## Loading the data

In [None]:
# Read the training CSV, using the INDEX_COL column as the DataFrame index.
df = pd.read_csv(TRAIN_DATA, index_col=INDEX_COL)
# Print a summary of the loaded frame (columns, dtypes, non-null counts).
df.info()

## Pandas profiling report

In [None]:
# Build an automated profiling report over the whole training frame.
#
# The profiling package was renamed upstream from ``pandas_profiling`` to
# ``ydata_profiling``. Prefer the current name and fall back to the legacy
# one so the cell keeps working in older environments.
try:
    from ydata_profiling import ProfileReport
except ImportError:
    from pandas_profiling import ProfileReport

profile = ProfileReport(df)
# Keep the report as the cell's last expression so the notebook renders it.
profile

## Separate features from target

In [None]:
# Feature matrix: every column except the target. ``drop`` returns a new
# frame, so ``df`` itself is left unchanged.
X = df.drop(columns=[TARGET_COL])
# Target vector.
y = df[TARGET_COL]

# Columns whose dtype is neither int64 nor float64 are treated as categorical.
numeric_dtypes = ["int64", "float64"]
categorical_df = X.select_dtypes(exclude=numeric_dtypes)

## Encoding categorical features

In [None]:
# Replace each categorical column in X with its integer codes; the list of
# unique values returned alongside the codes is not needed and is discarded.
for col in categorical_df.columns:
    X[col] = X[col].factorize()[0]

# Show the dtypes of the encoded columns.
X[categorical_df.columns].info()

## Mutual information

In [None]:
from sklearn.feature_selection import mutual_info_classif

# Boolean mask over X's columns: True where the column was categorical
# before encoding, so those columns are flagged as discrete.
discrete_features = X.columns.isin(categorical_df.columns)

# One score per column of X; the seed is fixed for reproducibility.
raw_scores = mutual_info_classif(
    X,
    y,
    discrete_features=discrete_features,
    random_state=RANDOM_STATE,
)

# Label the scores with the feature names and order them from highest to lowest.
mi_scores = pd.Series(
    raw_scores, index=X.columns, name="MI Scores"
).sort_values(ascending=False)

In [None]:
# Bar chart of the scores: values on the x-axis, index labels on the y-axis,
# drawn in the order stored in ``mi_scores``.
fig, ax = plt.subplots()
sns.barplot(
    x=mi_scores.values,
    y=mi_scores.index,
    color="tab:blue",
    ax=ax,
)
ax.set(title="Mutual information scores")
plt.show()