In [None]:
from copy import deepcopy
import warnings
 
warnings.filterwarnings("ignore")
 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
 
sns.set_style("whitegrid")
 
%matplotlib inline

# Reading data
 
The dataset is uploaded to GitHub so that it can be loaded the same way from **Colab**, **Kaggle**, and a **local** environment.

In [None]:
# Base URL of the raw competition files mirrored on GitHub.
data_folder = "https://raw.githubusercontent.com/XelorR/kaggle_tabular_playground_may21/master/input/tabular-playground-series-may-2021"

# Read the three CSV files in one pass: train, test, then the sample submission.
train, test, sample_submission = (
    pd.read_csv(f"{data_folder}/{name}.csv")
    for name in ("train", "test", "sample_submission")
)

# Display the (rows, columns) shape of each frame as the cell output.
train.shape, test.shape, sample_submission.shape

# Functions

In [None]:
def describe_nums(df, sort_by="unique"):
    return (
        df.nunique()
        .to_frame()
        .rename(columns={0: "unique"})
        .join(df.describe().T, how="left")
        .sort_values([sort_by], ascending=False)
        .style.bar(["mean"])
        .background_gradient(subset=["50%"], cmap="viridis")
        .background_gradient(subset=["std"], cmap="Reds")
        .bar("unique", color="lightblue")
    )
 
 
def show_IQR(series, bins_n=None):
    """Print quartile statistics of ``series`` and plot its distribution.

    Prints the number of unique values, Q1, Q3, the IQR and the
    ``1.5 * IQR`` outlier borders, then draws a two-panel figure: a
    distribution plot on the left and a horizontal boxplot on the right.

    Parameters
    ----------
    series : pandas.Series
        Values to describe; ``series.name`` is used for labels and titles.
    bins_n : int or None, default None
        Forwarded as ``bins`` to ``sns.distplot``.

    Returns
    -------
    None
        Returns early, after printing a short note, when ``series`` holds
        no non-missing values.
    """
    # A column that is all zeros becomes empty once zeros are filtered out;
    # there is nothing to summarise or plot in that case.
    if series.dropna().empty:
        print(f"{series.name}: no non-missing values to describe")
        return

    # An unnamed or non-string-named series must not break the labels.
    label = "" if series.name is None else str(series.name)

    perc_25 = series.quantile(0.25, interpolation="midpoint")
    perc_75 = series.quantile(0.75, interpolation="midpoint")
    IQR = perc_75 - perc_25

    print(
        f"Unique values count: {series.nunique()}",
        f"\nQ1: {perc_25}\nQ3: {perc_75}\nIQR: {IQR}",
        f"\nOutliers borders: [{perc_25 - 1.5*IQR}, {perc_75 + 1.5*IQR}]",
    )

    fig, axes = plt.subplots(ncols=2, figsize=(16, 6))
    # NOTE(review): distplot is deprecated in recent seaborn releases;
    # kept here so the notebook still runs on the seaborn version it targets.
    sns.distplot(series.values, bins=bins_n, color="#50248f", ax=axes[0]).set(
        xlabel=label,
        ylabel="Quantity (frequency)",
        title=f"{label} distribution\n",
    )

    # Pass the data as ``x=`` explicitly so the box is drawn horizontally,
    # matching the x-axis label, regardless of how seaborn interprets the
    # first positional argument.
    sns.boxplot(x=series.values, color="#38d1ff", ax=axes[1]).set(
        xlabel=label, title=f"{label} distribution\n"
    )

    plt.show()
 
 
def show_by_cat(df, cat, numeric):
    """Draw a boxplot of ``numeric`` grouped by the categories in ``cat``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    cat : str
        Column placed on the x-axis.
    numeric : str
        Column placed on the y-axis.
    """
    heading = "Boxplot for: " + " vs ".join((cat, numeric))

    _, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(data=df, x=cat, y=numeric, ax=ax)
    ax.set_title(heading)
    plt.show()

# EDA

## Describe

In [None]:
# Show every column label of the training frame.
train.columns

In [None]:
# Summarise only the columns whose name starts with "feature".
describe_nums(train.filter(regex="^feature"))

## Distribution plots

- all features look like nominal or ordinal variables
- all features are skewed toward low values
- all features have a lot of zeros

In [None]:
# Zeros are treated as outliers here: drop them from each feature so the
# remaining distribution is easier to read.

for c in train.columns[train.columns.str.startswith("feature")]:
    show_IQR(train.loc[train[c] != 0, c], None)

## Correlation
 
 
There is no notable correlation between the features in this dataset.

In [None]:
plt.figure(figsize=(50, 30))
# Mask zeros as NaN so that corr() leaves them out of each pairwise
# correlation. replace() needs the mapping {0: np.nan}; a set literal
# {0, np.nan} is read as a collection of values to replace, not as "0 -> NaN".
sns.heatmap(
    train[[c for c in train.columns if c.startswith("feature")]]
    .replace({0: np.nan})
    .corr(),
    annot=True,
    cmap="viridis",
)

## Target distribution
 
The dataset is imbalanced: only 8490 rows belong to class 1, while 57497 rows (more than half) belong to class 2.
 
Target classes are distributed quite similarly for most variables, except for some features:

- class 3 is wider than others: features 21, 26, 40, 42, 43, 45
- class 3 is too thin: 36, 34
- classes 2 and 3 are wider than 1 and 4: 33, 49
- feature 12 may be ordinal (does it represent months?)
- class 1 is wider than others: 6, 27, 37, 38
- features 2, 13, 22, 36 have too few unique values

In [None]:
# Rows per target class, ordered by class label. Sorting the index directly
# avoids depending on the column name that reset_index() generates, which
# differs between pandas versions.
(
    train.target.value_counts()
    .sort_index()
    .rename_axis("index")
    .to_frame()
    .style.bar(color="lightblue")
)

In [None]:
# Distinct values of every column within each target class, shown with
# columns as rows and one output column per class.
train.drop(columns="id").groupby("target").nunique().transpose()

In [None]:
# One boxplot per feature: non-zero values of the feature, split by target.
for col in [name for name in train.columns if name.startswith("feature")]:
    show_by_cat(train.loc[train[col] != 0].sort_values(by="target"), "target", col)

## By-feature deep dive
 
Work in progress