# One-stop shop to understand the behavior of all the features.

## This notebook captures:

### For the target:
- Distribution of target variable in train
- Most correlated features

### For each feature:
- Statistics across train (with target=0), train (with target=1), test
- Most popular values. 
    - For each value: the number of occurrences in train and test, plus the sum and the mean of the target over the matching train rows
- Values with the largest target sum (i.e. the most target=1 rows) in train
- Most correlated features

#### This work is based on the following work by Kaggle GM alijs (@alijs1):

https://www.kaggle.com/alijs1/ieee-transaction-columns-reference

#### You can view the TPS September version of this notebook here:
https://www.kaggle.com/arnabbiswas1/reference-doc-for-individual-columns/

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.core.display import HTML

In [None]:
# Read the competition's train and test CSV files into DataFrames.
train_df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/tabular-playground-series-oct-2021/train.csv"
)
test_df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/tabular-playground-series-oct-2021/test.csv"
)

In [None]:
# Preview the first five training rows (5 is the default of ``DataFrame.head``).
train_df.head(n=5)

In [None]:
def display_html(content):
    """Show an HTML string as rich output.

    Parameters
    ----------
    content : str
        Markup that is wrapped in an ``HTML`` object and handed to ``display``.
    """
    # NOTE: ``display`` is not imported in this file; it is expected to be
    # supplied by the notebook environment.
    html_object = HTML(content)
    display(html_object)

def _describe(data, col, label):
    df = data.describe().reset_index()
    df.columns = [col, label]
    df = df.append({col:'unique_values', label: data.nunique()}, ignore_index=True)
    df = df.append({col:'nans', label:data.isnull().sum()}, ignore_index=True)
    df = df.append({col:'nans_share', label: np.round(data.isnull().sum() * 100 / len(data), 6)}, ignore_index=True)
    return df

def describe_feature(name, target, train_df, test_df):
    """Display three summary tables for a single feature.

    1. ``_describe`` statistics of the feature over all of train, over train
       rows where the target equals 1, over train rows where the target
       equals 0, and over test, merged side by side.
    2. The 10 most frequent feature values in train, with their train count,
       mean and sum of the target, and their count in test.
    3. The 10 feature values with the largest target sum in train, with the
       same columns.

    Missing values are replaced by -999 before grouping in tables 2 and 3.

    Parameters
    ----------
    name : str
        Feature column to describe; must exist in both frames.
    target : str
        Target column in ``train_df``.
    train_df : pandas.DataFrame
        Training data containing ``name`` and ``target``.
    test_df : pandas.DataFrame
        Test data containing ``name``; an ``id`` column must be available
        after ``reset_index()``.
    """
    top_n = 10
    renamed_columns = {
        'size': 'count_in_train',
        'mean': 'mean_value_of_target',
        'sum': 'sum_value_of_target',
        'id': 'count_in_test',
    }

    # Table 1: descriptive statistics per data subset, joined on the
    # statistic-name column (which ``_describe`` names after the feature).
    summaries = [
        _describe(train_df[name], name, 'Train'),
        _describe(train_df.loc[train_df[target] == 1, name], name, 'Train Target 1'),
        _describe(train_df.loc[train_df[target] == 0, name], name, 'Train Target 0'),
        _describe(test_df[name], name, 'Test'),
    ]
    overview = summaries[0]
    for summary in summaries[1:]:
        overview = overview.merge(summary, on=name)
    display(overview)

    # Per-value aggregates shared by tables 2 and 3.
    per_value_train = (
        train_df[[target, name]]
        .fillna(-999)
        .groupby(name)[target]
        .agg(['size', 'mean', 'sum'])
        .reset_index()
        .sort_values('size', ascending=False)
        .reset_index(drop=True)
    )
    per_value_test = (
        test_df.reset_index()[['id', name]]
        .fillna(-999)
        .groupby(name)['id']
        .count()
        .reset_index()
    )

    display_html('<b>Most popular values (NaN = -999):</b>')
    # Left merge: values that never occur in test get NaN in count_in_test.
    most_popular = per_value_train.merge(per_value_test, how='left', on=name).head(top_n)
    display(most_popular.rename(renamed_columns, axis=1))

    display_html('<b>Biggest target sum values in train (NaN = -999):</b>')
    biggest_sum = (
        per_value_train.sort_values('sum', ascending=False)
        .reset_index(drop=True)
        .head(top_n)
        .merge(per_value_test, how='left', on=name)
    )
    display(biggest_sum.rename(renamed_columns, axis=1))

def correlation(col, train_df, sample_rows=None):
    """Display the features most and least correlated with ``col``.

    Correlations are computed with ``DataFrame.corrwith`` between every
    non-object column of ``train_df`` and ``train_df[col]``, then sorted in
    descending order. The displayed table holds the first 6 rows of that
    ranking followed by the last 5 rows that have a non-missing correlation.

    Parameters
    ----------
    col : str
        Column to correlate every other column against.
    train_df : pandas.DataFrame
        Data to compute the correlations on.
    sample_rows : int, optional
        When given, only the first ``sample_rows`` rows are used. The default
        ``None`` uses every row.
    """
    numeric_columns = [
        column for column, dtype in train_df.dtypes.items() if dtype != 'object'
    ]
    # Nothing below mutates the frame, so no defensive copy is needed.
    frame = train_df if sample_rows is None else train_df.head(sample_rows)
    corrs = (
        frame[numeric_columns]
        .corrwith(frame[col])
        .reset_index()
        .sort_values(0, ascending=False)
        .reset_index(drop=True)
        .rename({'index': 'feature_name', 0: f'correlation with {col}'}, axis=1)
    )
    display_html('<b>Most correlated values with ' + col + ':</b>')
    # ``dropna`` keeps features with an undefined correlation out of the
    # bottom of the ranking.
    corr_df = pd.concat([corrs.head(6), corrs.dropna().tail(5)])
    display_html(corr_df.to_html(escape=False))
    
def dump_feature_details(feature_name, target, train_df, test_df):
    """Display the full report for one feature.

    Shows an ``<h2>`` heading with the feature name, then the tables from
    ``describe_feature`` and the correlation table from ``correlation``.

    Parameters
    ----------
    feature_name : str
        Feature column to report on.
    target : str
        Target column in ``train_df``.
    train_df, test_df : pandas.DataFrame
        Training and test data.
    """
    heading = f"<h2>Feature Name: {feature_name}</h2>"
    display_html(heading)
    describe_feature(name=feature_name, target=target, train_df=train_df, test_df=test_df)
    correlation(col=feature_name, train_df=train_df)
    

def plot_point_train_test_side_by_side_w_color_based_on_target(
    train_df, test_df, feature_name, figsize=(20, 4), target="target"
):
    """Scatter a feature's values by row for train (left) and test (right).

    Only the first 5000 rows of each frame are drawn. Train points are blue
    where the target equals 0 and orange where it equals 1; test points are
    green. Both panels share the y axis.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data containing ``feature_name`` and ``target``.
    test_df : pandas.DataFrame
        Test data containing ``feature_name``.
    feature_name : str
        Column to plot.
    figsize : tuple, optional
        Figure size passed to ``plt.subplots``.
    target : str, optional
        Name of the target column in ``train_df``. This used to be read from
        a module-level ``target`` variable, which raised ``NameError`` when
        that variable had not been defined yet.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize, sharey=True)
    N = 5000
    train_head = train_df[:N]
    test_head = test_df[:N]

    # Draw target=0 first so the target=1 points are drawn on top.
    for target_value, alpha, color in ((0, 0.3, "blue"), (1, 0.2, "orange")):
        train_head[train_head[target] == target_value][feature_name].plot(
            style=".",
            alpha=alpha,
            ax=ax1,
            color=color,
        )
    test_head[feature_name].plot(
        style=".",
        alpha=0.2,
        ax=ax2,
        color="green",
    )
    ax1.set_title(f"{feature_name} train [First {N} rows] (blue=target_0, orange=target_1)")
    ax2.set_title(f"{feature_name} test [First {N} rows] (green)")
    plt.ylabel(f"Value of {feature_name}")
    plt.show()
    

def plot_hist_train_test_overlapping(
    df_train, df_test, feature_name, kind="hist", figsize=(10, 10), bins=100
):
    """
    Plot the distribution of one feature for train and test on shared axes.

    Train is drawn in blue and test in orange, both semi-transparent, with a
    legend distinguishing the two.

    df_train, df_test : DataFrames that contain ``feature_name``
    feature_name : Column to plot
    kind : Type of the plot, applied to both train and test
    figsize : Figure size passed to pandas' ``plot``
    bins : Number of histogram bins; only forwarded when ``kind`` is "hist"

    """
    shared_options = {"kind": kind, "figsize": figsize, "alpha": 0.4}
    if kind == "hist":
        # ``bins`` is only meaningful for histograms, so it is not forwarded
        # to other plot kinds.
        shared_options["bins"] = bins

    df_train[feature_name].plot(
        label="train",
        color="blue",
        title=f"Train vs Test {feature_name} distribution",
        **shared_options,
    )
    # Use the same ``kind`` as train so both series are drawn the same way.
    df_test[feature_name].plot(
        label="test",
        color="orange",
        **shared_options,
    )
    plt.legend()
    plt.show()

def plot_boxh_train_test_overlapping(
    train_df, test_df, feature_name, kind="box", log=False, figsize=(10, 4)
):
    """
    Draw horizontal box plots of one feature: train on top, test below.

    The two panels are stacked vertically and share the x axis.

    NOTE: ``kind`` and ``log`` are accepted but not used by the body; a box
    plot is always drawn.
    """
    _, (train_ax, test_ax) = plt.subplots(
        nrows=2, ncols=1, sharex=True, figsize=figsize
    )

    train_values = train_df[feature_name]
    train_values.plot(
        kind="box",
        vert=False,
        ax=train_ax,
        subplots=False,
        label="train",
        title=f"Distribution of {feature_name}",
    )

    test_values = test_df[feature_name]
    test_values.plot(kind="box", vert=False, label="test", ax=test_ax)

    plt.show()
    
def plot_point_train_test_side_by_side(train_df, test_df, feature_name, target, figsize=(20, 4)):
    """Scatter a feature's values by row for train (left) and test (right).

    Only the first 5000 rows of each frame are drawn. Train points are blue
    where ``target`` equals 0 and orange where it equals 1; test points are
    green. Both panels share the y axis.
    """
    N = 5000
    train_head = train_df[:N]
    test_head = test_df[:N]
    _, (train_ax, test_ax) = plt.subplots(1, 2, figsize=figsize, sharey=True)

    # (target value, alpha, colour), drawn in this order on the train panel.
    train_layers = ((0, 0.3, "blue"), (1, 0.2, "orange"))
    for target_value, opacity, colour in train_layers:
        rows = train_head[train_head[target] == target_value]
        rows[feature_name].plot(style=".", alpha=opacity, ax=train_ax, color=colour)

    test_head[feature_name].plot(style=".", alpha=0.2, ax=test_ax, color="green")

    train_ax.set_title(f"{feature_name} train [First {N} rows] (blue=target_0, orange=target_1)")
    test_ax.set_title(f"{feature_name} test [First {N} rows] (green)")
    plt.ylabel(f"Value of {feature_name}")
    plt.show()

In [None]:
# Summarise the target column: count and percentage share of each value,
# followed by the features most correlated with it.
target = 'target'
display_html(f"<h2>Target Variable:  {target}</h2>")
display_html('This is target column.')
# Name the value and count columns explicitly: the column names produced by
# ``value_counts().reset_index()`` changed in pandas 2.0, so renaming
# 'index'/'target' afterwards is version dependent.
df = train_df[target].value_counts().rename_axis('Value').reset_index(name='Count')
df['Share'] = np.round(df['Count'] * 100 / df['Count'].sum(), 6)
display(df)
correlation(target, train_df)

In [None]:
# Produce the report and the three plots for every feature column.
target = "target"
# Iterate over column labels directly. The previous
# ``train_df.drop(...)[0: 2]`` sliced rows, not columns, so it never limited
# the features and only built a throwaway DataFrame. To restrict the run to
# the first few features, slice ``feature_names`` instead, e.g.
# ``feature_names[:2]``.
feature_names = train_df.columns.drop(["id", target])
for feature_name in feature_names:
    dump_feature_details(feature_name, target, train_df, test_df)
    plot_point_train_test_side_by_side(train_df, test_df, feature_name=feature_name, target=target, figsize=(20, 3))
    plot_hist_train_test_overlapping(train_df, test_df, feature_name, figsize=(20, 3), bins=500)
    plot_boxh_train_test_overlapping(train_df=train_df, test_df=test_df, feature_name=feature_name, figsize=(20, 3))