In [None]:
import numpy as np
import pandas as pd
import datatable as dt
import matplotlib.pyplot as plt
import seaborn as sns
from skimage import io
from warnings import filterwarnings
filterwarnings('ignore')

## <span style="background:#818181;padding:0.3em;width:100%;display:block;border-radius:0.1em;color:white;font-family:Monospace">Introduction</span>

<div style="font-size: 1.2em; font-family: times-new-roman; border-left: 0.5em solid #efefef; padding-left: 1em">
    <b>Hi there,</b><br><br>
    this is my simple and basic EDA for this month's competition.<br>
    I tried to focus on a simple and easy structure as well as simple code, so everyone can follow along easily.<br><br>
    <em>If you like this notebook or copy any parts of it please make sure to leave an upvote...</em><br><br>
    The dataset used for this competition is synthetic, but based on a real dataset and generated using a CTGAN. <br>
    The original dataset deals with identifying spam emails via various features extracted from the emails.<br>
    Although the features are anonymized, they have properties relating to real-world features.<br><br>
    Our goal is to predict the correct target class (spam or ham); predictions are evaluated by the area under the ROC curve (AUC).<br><br>
    <em><b>Thanks for stopping by and have fun with this month's competition!</b></em>
</div>



## <span style="background:#818181;padding:0.3em;width:100%;display:block;border-radius:0.1em;color:white;font-family:Monospace">Import Data & Basic Overview</span>

In [None]:
%%time

df_train = dt.fread('../input/tabular-playground-series-nov-2021/train.csv').to_pandas()
df_test = dt.fread('../input/tabular-playground-series-nov-2021/test.csv').to_pandas()

In [None]:
# Report (rows, columns) for both frames, one per line.
print(
    f"Shape Train Dataset: {df_train.shape}",
    f"Shape Test Dataset: {df_test.shape}",
    sep="\n",
)

# Last expression of the cell: rendered as the first rows of the training data.
df_train.head()

In [None]:
# Print the DataFrame.info() summary of the training frame.
df_train.info()

In [None]:
# Total number of missing cells per dataset: `.isna().sum()` gives the NaN
# count per column, the second `.sum()` adds those up over all columns.
# The test line must inspect df_test (it previously re-counted df_train).
print(f"Train Data Total Missing Values: {df_train.isna().sum().sum()}")
print(f"Test Data Total Missing Values: {df_test.isna().sum().sum()}")

<div style="font-size: 1.2em; font-family: times-new-roman; border-left: 0.5em solid #efefef; padding-left: 1em">
    <strong>Insights:</strong><br>
    After taking a first look at this month's data we can conclude the following:
    <ul>
        <li>Dataset contains 102 columns in total with 100 feature columns of dtype: float64</li>
        <li>We have no missing values at all</li>
        <li>Train and Test Dataset contain nearly the same number of observations / rows (600k vs. 540k)</li>
    </ul>
</div>

## <span style="background:#818181;padding:0.3em;width:100%;display:block;border-radius:0.1em;color:white;font-family:Monospace">Uni / Multivariate Analysis</span>

In [None]:
### countplot of the target column to check how evenly the classes are split

# Shared text styling for the figure title.
title_style = dict(
    color='#5c5c5c',
    fontfamily='monospace',
    fontsize=15,
    fontweight='bold',
)

fig, ax = plt.subplots(figsize=(12, 6), tight_layout=True)

sns.countplot(x='target', data=df_train, palette='Blues', ax=ax)

# Title is placed at x=0, y=1.05, i.e. slightly above the top-left corner.
fig.text(0, 1.05, '::Target Distribution', **title_style)

sns.despine()

In [None]:
### create kdeplots for each feature, compare train vs. test distribution

# Feature columns are the ones whose name starts with 'f'.
feat_cols = [col for col in df_train.columns if col.startswith('f')]

# 10 x 10 grid: one panel per feature (the grid holds at most 100 features).
fig, ax = plt.subplots(10, 10, tight_layout=True, figsize=(20,20))

# Flatten the 2D axes array so panels can be addressed by a single index.
ax = ax.flatten()

for idx,feat in enumerate(feat_cols):
    
    # Train distribution. `fill=True` replaces the deprecated `shade=True`.
    sns.kdeplot(
        data=df_train,
        x=feat,
        ax=ax[idx],
        fill=True,
        alpha=0.8,
        color='#193f6e'
    )
    
    # Test distribution, drawn on the same axes for direct comparison.
    sns.kdeplot(
        data=df_test,
        x=feat,
        ax=ax[idx],
        fill=True,
        alpha=0.8,
        color='#c5bfa7'
    )
    
    # Hide the density axis; only the feature name is kept as x label.
    ax[idx].set_yticks([])
    ax[idx].set_ylabel('')
    ax[idx].set_xlabel(xlabel=feat, fontweight='bold')

fig.text(
    s='::Feature Distribution || Train vs. Test',
    x=0, y=1.05,
    color='#5c5c5c',
    fontfamily='monospace', fontsize=15, fontweight='bold'
)

sns.despine()

In [None]:
### create correlation map among all features

# Pairwise correlations of every column except the row identifier.
corr = df_train.drop(columns='id').corr()

# Explicit boolean mask for the upper triangle (diagonal included).
# Using np.triu(corr) directly would rely on the correlation values being
# truthy, so a cell that is exactly 0.0 would slip through unmasked.
mask = np.triu(np.ones_like(corr, dtype=bool))

fig, ax = plt.subplots(1, 1, tight_layout=True, figsize=(15,15))

# Cells where `mask` is True are not drawn, leaving only the lower triangle.
sns.heatmap(
    data=corr,
    cmap='coolwarm',
    annot=False,
    linewidth=0.05,
    cbar=True,
    mask=mask,
    ax=ax
)

fig.text(
    s='::Feature Correlation Map',
    x=0, y=1.05,
    color='#5c5c5c',
    fontfamily='monospace', fontsize=15, fontweight='bold'
)

sns.despine()

<div style="font-size: 1.2em; font-family: times-new-roman; border-left: 0.5em solid #efefef; padding-left: 1em">
    <strong>Insights:</strong><br>
    After plotting some basic distributions and correlation we might conclude:
    <ul>
        <li>Target class is nearly evenly distributed (However I'd still use StratifiedKFold as CV)</li>
        <li>Train and Test Data are equally distributed</li>
        <li>We have some interestingly similar distributions among the features (e.g. f5-f8)</li>
        <li>There are some stronger correlations between some features and the target variable (e.g. f34)</li>
        <li>There is no multicollinearity evident</li>
    </ul>
</div>

## <span style="background:#818181;padding:0.3em;width:100%;display:block;border-radius:0.1em;color:white;font-family:Monospace">Feature Analysis (Detail)</span>

In [None]:
### create list with top correlating features and plot

# Sort correlations with the target in descending order and take positions
# 1..4. Position 0 is skipped because it is presumably 'target' itself
# (correlation of 1 with itself) -- verify if `corr` is built differently.
pos_corr = corr['target'].sort_values(ascending=False).iloc[1:5]

fig, ax = plt.subplots(2, 2, tight_layout=True, figsize=(15,6))
ax = ax.flatten()

for idx, feat in enumerate(pos_corr.index):
    
    # One density curve per target class. `fill=True` replaces the
    # deprecated `shade=True`.
    sns.kdeplot(
        data=df_train,
        x=feat,
        hue='target',
        ax=ax[idx],
        palette='Blues',
        fill=True,
        alpha=0.8,
        edgecolor='black',
        linewidth=1
    )
    
    # Hide the density axis; only the feature name is kept as x label.
    ax[idx].set_yticks([])
    ax[idx].set_ylabel('')
    ax[idx].set_xlabel(xlabel=feat, fontweight='bold')
    
fig.text(
    s='::TOP Positive Correlated Features',
    x=0, y=1.05,
    color='#5c5c5c',
    fontfamily='monospace', fontsize=15, fontweight='bold'
)

sns.despine(left=True)

In [None]:
### plot the most negatively correlating features w.r.t target class

# Ascending sort puts the most negative correlations first; keep the first 4.
neg_corr = corr['target'].sort_values(ascending=True)[:4]

fig, ax = plt.subplots(2, 2, tight_layout=True, figsize=(15,6))
ax = ax.flatten()

for idx, feat in enumerate(neg_corr.index):
    
    # One density curve per target class. `fill=True` replaces the
    # deprecated `shade=True`.
    sns.kdeplot(
        data=df_train,
        x=feat,
        hue='target',
        ax=ax[idx],
        palette='Blues',
        fill=True,
        alpha=0.8,
        edgecolor='black',
        linewidth=1
    )
    
    # Hide the density axis; only the feature name is kept as x label.
    ax[idx].set_yticks([])
    ax[idx].set_ylabel('')
    ax[idx].set_xlabel(xlabel=feat, fontweight='bold')
    
fig.text(
    s='::TOP Negative Correlated Features',
    x=0, y=1.05,
    color='#5c5c5c',
    fontfamily='monospace', fontsize=15, fontweight='bold'
)

sns.despine(left=True)

In [None]:
### add per-row aggregates over the feature columns for further analysis

# Each statistic is computed across the feature columns of a row (axis=1)
# and stored as 'row_<statistic>' on the training frame, in this order.
for stat in ('sum', 'mean', 'std'):
    df_train[f'row_{stat}'] = getattr(df_train[feat_cols], stat)(axis=1)

In [None]:
### get all row based features and plot distribution wrt to target

# Row-level aggregate columns are the ones whose name starts with 'row'.
row_feat = [col for col in df_train.columns if col.startswith('row')]

# One panel per row feature (the figure provides three panels).
fig, ax = plt.subplots(1, 3, tight_layout=True, figsize=(15,4))

for idx, feat in enumerate(row_feat):
    # One density curve per target class. `fill=True` replaces the
    # deprecated `shade=True`.
    sns.kdeplot(
        data=df_train,
        x=feat,
        hue='target',
        ax=ax[idx],
        palette='Blues',
        fill=True,
        alpha=0.8,
        edgecolor='black',
        linewidth=1
    )
    
    # Hide the density axis; only the feature name is kept as x label.
    ax[idx].set_yticks([])
    ax[idx].set_ylabel('')
    ax[idx].set_xlabel(xlabel=feat, fontweight='bold')
    
fig.text(
    s='::Row Features w.r.t target',
    x=0, y=1.05,
    color='#5c5c5c',
    fontfamily='monospace', fontsize=15, fontweight='bold'
)

sns.despine(left=True)

<div style="font-size: 1.2em; font-family: times-new-roman; border-left: 0.5em solid #5c5c5c; padding-left: 1em">
    <b>Thank you for checking out my Notebook!</b><br>
    Leave a comment down below or just a simple upvote if you find this notebook helpful.
</div>