A new month brings a new TPS (Tabular Playground Series) competition. Let's look at the data.

# 0. Introduction

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# ANSI escape sequences that switch bold terminal text on and off.
bold, end = "\033[1m", "\033[0m"

# Show up to 150 rows when pandas displays a frame or series.
pd.set_option("display.max_rows", 150)

In [None]:
# Read the train and test splits of the competition data.
train = pd.read_csv("../input/tabular-playground-series-sep-2021/train.csv")
test = pd.read_csv("../input/tabular-playground-series-sep-2021/test.csv")

# For each split: a heading, the (rows, columns) shape, then the head of the frame.
for heading, frame in (("Train set: \n", train), ("\nTest set: \n", test)):
    print(heading)
    print(frame.shape)
    display(frame.head())

We have nearly a million observations with 118 predictors for training.

In [None]:
# Print pandas' per-column summary of the training frame.
train.info()

In [None]:
# Summary statistics rounded to 3 decimals, transposed so that each column of
# `train` is shown as one row.
train.describe().round(3).transpose()

It seems that all features are continuous. We'll verify this below.

# 1. Target Distribution

In [None]:
# Label column; every other column except "id" is treated as a predictor.
target = "claim"
predictors = [name for name in train.columns if name != "id" and name != target]

In [None]:
# Pie chart showing the share of each target class.
fig, ax = plt.subplots(figsize=(6, 6))

class_counts = train[target].value_counts()
labels = class_counts.index.tolist()
palette = ["#0EB8F1", "#F1480F", "#971194", "#FEE715", "#101820"]

# One colour per class, taken from the front of the palette.
ax.pie(
    class_counts,
    labels=labels,
    colors=palette[:len(labels)],
    autopct="%1.2f%%",
    startangle=180,
)

ax.set_title(target)
plt.show()

We have a balanced dataset. Nice.

# 2. Unique Values

In [None]:
# Number of distinct values of every predictor, listed in predictor order.
# NaN is not counted (DataFrame.nunique drops it by default).
counts = train[predictors].nunique().tolist()

# Rank the features from the most to the fewest distinct values.
counts_df = pd.DataFrame(
    {"Feature": predictors, "UniqueValues": counts}
).sort_values("UniqueValues", ascending=False)

In [None]:
# Two horizontal bar charts of the unique-value counts: the half of the
# features with the most distinct values on the left, the rest on the right.
fig, axes = plt.subplots(1, 2, figsize=(15, 8), sharex=True, dpi=300, facecolor="#C0C0C0")

# Split the ranking at its midpoint rather than at a fixed 59 rows, so the two
# panels neither overlap nor drop features when the number of predictors
# changes (118 predictors still give 59 / 59).
split = (len(counts_df) + 1) // 2
most_unique = counts_df.iloc[:split]
least_unique = counts_df.iloc[split:].sort_values("UniqueValues", ascending=True)

sns.barplot(x="UniqueValues", y="Feature", data=most_unique,
            ax=axes[0], palette="Reds_r", edgecolor="black", linewidth=0.5)
sns.barplot(x="UniqueValues", y="Feature", data=least_unique,
            ax=axes[1], palette="Blues_r", edgecolor="black", linewidth=0.5)

# Annotate only the first ten bars of each panel with their value.
# Left panel: text is shifted 30 points back from the bar end, in a light colour.
for bar in axes[0].patches[:10]:
    axes[0].annotate(int(bar.get_width()),
                     xy=(bar.get_width(), bar.get_y() + bar.get_height() / 2),
                     xytext=(-30, 0), textcoords="offset points", ha="left", va="center",
                     fontsize=5, color="#C7D3D4")

# Right panel: text starts 3 points past the bar end.
for bar in axes[1].patches[:10]:
    axes[1].annotate(int(bar.get_width()),
                     xy=(bar.get_width(), bar.get_y() + bar.get_height() / 2),
                     xytext=(3, 0), textcoords="offset points", ha="left", va="center",
                     fontsize=5)

# Same background and small tick labels on both panels.
for ax in axes:
    ax.set_facecolor("#C0C0C0")
    ax.tick_params(axis="y", which="major", labelsize=6)
    ax.tick_params(axis="x", which="major", labelsize=6)

axes[1].set_ylabel("")

sns.despine(top=True, right=True, left=True, bottom=True)
plt.show()

The left panel shows the continuous features with the most unique values.

The right panel, especially its top, shows the features with the fewest unique values.

# 3. Missing Values

In [None]:
# Per-feature missing-value counts, computed once per split and reused for
# both the bars and the totals in the titles.
train_missing = train[predictors].isnull().sum()
test_missing = test[predictors].isnull().sum()

fig, axes = plt.subplots(1, 2, figsize=(2, 13), sharex=True, dpi=200, facecolor="#C7D3D4")

sns.barplot(x=train_missing.values, y=train_missing.index, ax=axes[0], color="#101820")
sns.barplot(x=test_missing.values, y=test_missing.index, ax=axes[1], color="#757575")

# Write each bar's count next to its end (the x axis ticks are removed below,
# so these labels are the only place the numbers appear).
for bar in axes[0].patches:
    axes[0].annotate(int(bar.get_width()),
                     xy=(bar.get_width(), bar.get_y() + bar.get_height() / 2),
                     xytext=(-25, 0), textcoords="offset points", ha="left", va="center",
                     fontsize=4, color="#757575")

for bar in axes[1].patches:
    axes[1].annotate(int(bar.get_width()),
                     xy=(bar.get_width(), bar.get_y() + bar.get_height() / 2),
                     xytext=(-14, 0), textcoords="offset points", ha="left", va="center",
                     fontsize=4, color="#101820")

for ax in axes:
    ax.set_facecolor("#C7D3D4")
    ax.tick_params(axis="y", which="major", labelsize=4.5)
    ax.tick_params(axis="x", which="major", labelsize=4.5)
    ax.set_xticks([])

axes[0].set_title("Missing Values for Training Set \nTotal: " + str(train_missing.sum()),
                  fontsize=4, color="#101820")
axes[1].set_title("Missing Values for Test Set \nTotal: " + str(test_missing.sum()),
                  fontsize=4, color="#101820")

sns.despine(top=True, right=True, left=True, bottom=True)
plt.tight_layout()
plt.show()

The training set has nearly 2 million missing values, about 15,000 per feature.

The test set has almost 1 million missing values, about 8,000 per feature.

# 4. Feature Distribution

In [None]:
# KDE plot of every predictor on one large figure, titled with the feature
# name (bold) and its skewness and kurtosis.
position = range(1, len(predictors) + 1)  # 1-based subplot indices

plt.rcParams["font.family"] = "Times New Roman"
fig = plt.figure(1, figsize=(30, 25), facecolor="#C7D3D4")

# Ten plots per row. Keep the 12-row layout, but add rows when there are more
# than 120 predictors so add_subplot never receives an out-of-range index.
n_cols = 10
n_rows = max(12, -(-len(predictors) // n_cols))  # -(-a // b) is ceil(a / b)

for col, pos in zip(predictors, position):

    skewness = np.round(train[col].skew(), 3)
    kurtosis = np.round(train[col].kurtosis(), 3)

    ax = fig.add_subplot(n_rows, n_cols, pos)
    sns.kdeplot(data=train, x=col, ax=ax, color="#101820")

    ax.set_title(r"$\bf{" + col + "}$" + "\nSkewness: " + str(skewness) + "\nKurtosis: " + str(kurtosis))
    ax.set_facecolor("#C7D3D4")
    ax.set_xlabel("")

plt.tight_layout()
plt.show()

Distributions of the features. Some are normally distributed, some are skewed, some are bimodal, and others have various different shapes.

**Note: We will generally use tree-based boosting algorithms. These kinds of models don't need normally distributed predictors, and we don't have to scale the data either.**

# 5. Predictors - Target

In [None]:
# Box plot of every predictor split by target class, all on one large figure
# with a single shared legend.
position = range(1, len(predictors) + 1)  # 1-based subplot indices

fig = plt.figure(1, figsize=(30, 25), facecolor="#C7D3D4")

order = sorted(train[target].unique())
palette = ["#0EB8F1", "#F1480F", "#971194", "#FEE715", "#101820"]

# Ten plots per row. Keep the 12-row layout, but add rows when there are more
# than 120 predictors so add_subplot never receives an out-of-range index.
n_cols = 10
n_rows = max(12, -(-len(predictors) // n_cols))  # -(-a // b) is ceil(a / b)

# Loop invariants: a constant x value for every row (so the boxes are only
# split by hue) and one colour per class. Built once instead of per feature.
single_group = [""] * len(train)
class_colors = palette[:len(order)]

handles, labels = [], []
for col, pos in zip(predictors, position):

    ax = fig.add_subplot(n_rows, n_cols, pos)
    sns.boxplot(data=train, y=col, hue=target, ax=ax, x=single_group,
                palette=class_colors, linewidth=0.5,
                flierprops=dict(marker="x", markersize=3.5))

    ax.set_title(r"$\bf{" + col + "}$")
    ax.set_facecolor("#C7D3D4")
    ax.set_ylabel("")

    # Drop the per-subplot legend but remember its entries for the shared one.
    subplot_legend = ax.get_legend()
    if subplot_legend is not None:
        subplot_legend.remove()
    handles, labels = ax.get_legend_handles_labels()

# Add the figure-level legend once; calling fig.legend inside the loop would
# stack one identical legend per subplot on top of each other.
if handles:
    fig.legend(handles, labels, loc='upper center')

plt.tight_layout()
plt.show()

Predictors against classes. It looks like there is no significant pattern.

# 6. Missing Values - Target

I got this idea from [@dwin183287](https://www.kaggle.com/dwin183287)'s notebook. It could be an important feature.

You can take a look at that notebook. https://www.kaggle.com/dwin183287/tps-september-2021-eda It contains great visualizations.

In [None]:
# Number of missing predictor values in each row. Counting over the predictor
# columns only keeps "id" and the target out of the count, so the same
# expression can be applied to the test set.
train["Missing"] = train[predictors].isnull().sum(axis=1)

In [None]:
# Box plot of the per-row missing-value count, split by target class.
fig, ax = plt.subplots(figsize=(12, 8))
ax.set_facecolor("#C7D3D4")

class_colors = palette[:len(order)]
outlier_style = dict(marker="x", markersize=3.5)
sns.boxplot(data=train, x=[""] * len(train), y="Missing", hue=target, ax=ax,
            palette=class_colors, linewidth=0.5, flierprops=outlier_style)

In [None]:
# Mean of the target for each value of "Missing", rounded to 3 decimals.
missing = train.groupby("Missing")[target].mean().round(3)

fig, ax = plt.subplots(figsize=(12, 8), facecolor="#C7D3D4")
ax.set_facecolor("#C7D3D4")

sns.barplot(x=missing.index, y=missing.values, ax=ax, color="#101820")

# Write each bar's height as a percentage just above it. Bars taller than 0.7
# get a stronger orange box; all others get a faint blue one.
for bar in ax.patches:
    height = bar.get_height()
    if height > 0.7:
        box_color, box_alpha = "#F1480F", 0.3
    else:
        box_color, box_alpha = "#0EB8F1", 0.1

    label = ax.text(bar.get_x() + 0.02, height + 0.025, f"{height * 100:.1f}%",
                    fontsize=10, color="#101820", fontweight=400)
    label.set_bbox(dict(boxstyle="round4", facecolor=box_color,
                        alpha=box_alpha, edgecolor=box_color))

ax.set_title("Missing Values & Target", fontweight="bold")
ax.set_xlabel("Total Missing Values per Observations")
ax.set_ylabel("Claim Probability")
ax.set_yticks([])

sns.despine(top=True, right=True, left=True, bottom=True)
plt.show()