# EDA for Tabular Playground Series September 2021

Looking forward to your feedback. Please upvote if you like it. 

# Summary
* There are 118 anonymized features. 
* "claim" is the target variable.
* The training data has 957919 rows. The test data has 493474 rows. That's about half of the training set size.
* The features have missing values. The number of missing values is in the same range for all features (train set: from 15168 to 15678 missing values per feature). This is around 1.6% of the values of each feature.
* Slightly more than 1/3 of the rows have no missing values. The remaining rows have between 1 and 15 missing values.
* The features have very different value ranges. Some features have a really huge range. 
* The features have very different distributions. Some of them lend themselves to binning. 
* There is no correlation between the features.


The train and test sets are similar! (As they should be.)

In [None]:
# import packages
# Standard library first, then third-party packages (PEP 8 grouping).

import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

# Print the plotting library versions that produced the figures below.
print(f"Matplotlib: {matplotlib.__version__}")
print(f"Seaborn {sns.__version__}")

# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Load the three competition CSV files into data frames.
train_csv = "../input/tabular-playground-series-sep-2021/train.csv"
test_csv = "../input/tabular-playground-series-sep-2021/test.csv"
sample_csv = "../input/tabular-playground-series-sep-2021/sample_solution.csv"

df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)
sample_submission = pd.read_csv(sample_csv)

# Feature columns are the training columns whose name starts with "f".
feature_cols = df_train.columns[df_train.columns.str.startswith("f")].tolist()

# Overview

In [None]:
# Preview the first five rows of the training data.
df_train.head(n=5)

In [None]:
# Preview the first five rows of the test data.
df_test.head(n=5)

In [None]:
# Only a handful of features is shown here because the full table is quite messy.
# The purpose is to demonstrate that the value range varies greatly between features.
feature_subset = ["f1", "f9", "f29", "f35", "f50", "f73", "f96", "f100", "f116", "f118"]

# Summary statistics with one row per feature, without the count and quartile columns.
train_summary = df_train[feature_subset].describe().T.drop(
    columns=["count", "25%", "50%", "75%"]
)

# In-cell bars for mean/std and a colour gradient on the max column.
train_summary.style.bar(subset=['mean', 'std']).background_gradient(subset=['max'])

In [None]:
# The same feature subset, this time summarised for the test set.
test_summary = df_test[feature_subset].describe().T.drop(
    columns=["count", "25%", "50%", "75%"]
)

# In-cell bars for mean/std and a colour gradient on the max column.
test_summary.style.bar(subset=['mean', 'std']).background_gradient(subset=['max'])

# Feature Analysis, Train & Test sets
## 1. Missing values

In [None]:
# Horizontal bar charts of the missing-value count per feature:
# train set on the left (wider panel), test set on the right.
fig, ax = plt.subplots(1, 2, figsize=(16,20), gridspec_kw={'width_ratios': [2, 1]})
train_ax, test_ax = ax

missing_train = df_train[feature_cols].isna().sum()
missing_test = df_test[feature_cols].isna().sum()

missing_train.plot(kind="barh", ax=train_ax)
missing_test.plot(kind="barh", color="darkgoldenrod", ax=test_ax)
train_ax.set_title("Number of missing values in the feature columns, train set")
test_ax.set_title("Number of missing values in the feature columns, test set")

# format axes: hide the bottom, top and right frame lines of both panels
for axis in ax:
    for side in ('bottom', 'top', 'right'):
        axis.spines[side].set_visible(False)

plt.show()
# The smallest and largest per-feature counts for the train set can be read
# with missing_train.min() and missing_train.max().

We can see that all features have roughly the same number of missing values. The training set has about twice as many missing values per feature as the test set. This is expected, because the training set is about twice the size of the test set.

In [None]:
def _annotate_bar_heights(axis):
    """Write the height of every bar on *axis* slightly above that bar."""
    for bar in axis.patches:
        axis.annotate(bar.get_height(),
                      (bar.get_x() + bar.get_width() / 2,
                       bar.get_height()), ha='center', va='center',
                      size=10, xytext=(0, 8),
                      textcoords='offset points')


# Number of missing feature values in each row.
rows_with_missing_values = df_train[feature_cols].isna().sum(axis=1)
rows_with_missing_values_test = df_test[feature_cols].isna().sum(axis=1)

fig, ax = plt.subplots(1, 2, figsize=(16,10))

# value_counts() orders its result by frequency. sort_index() puts the bars in
# ascending order of "missing values per row", so the x axis is ordinal and the
# train and test panels use the same category order.
plots = rows_with_missing_values.value_counts().sort_index().plot(kind="bar", ax=ax[0])
plots_t = rows_with_missing_values_test.value_counts().sort_index().plot(
    kind="bar", color="darkgoldenrod", ax=ax[1])
ax[0].set_title("Number of rows with missing values, train set")
ax[1].set_title("Number of rows with missing values, test set")

for axis in ax:
    # remove the lines around the graph
    for side in ('left', 'top', 'right'):
        axis.spines[side].set_visible(False)
    axis.get_yaxis().set_ticks([])  # no y ticks; the counts are written on the bars
    axis.tick_params(axis='x', rotation=0)
    axis.set_xlabel("Number of missing values per row")

# Label each bar with its row count.
_annotate_bar_heights(plots)
_annotate_bar_heights(plots_t)

plt.show()
# train set: 8 rows have 14 missing values...

The distribution of the number of missing values per row is comparable in the train and test sets. 

The train set has 359464 rows where no values are missing. There are 8 rows where 14 values are missing.


The test set has 185693 rows with no missing values and 1 row with 15 missing values. Remember that the test set is about half the size of the train set.

## 2. Duplicates

In [None]:
# True if any training row repeats an earlier row across all feature columns.
df_train.duplicated(subset=feature_cols).any()

In [None]:
# True if any test row repeats an earlier row across all feature columns.
df_test.duplicated(subset=feature_cols).any()

Neither train nor test set has duplicated rows.

## 3. Distribution

In [None]:
def _strip_y_axis(axis):
    """Hide the y label, y tick labels, y ticks and the left/top/right frame lines."""
    axis.set(ylabel=None)  # no header on y axis
    axis.set(yticklabels=[])  # no numbers on y axis
    axis.tick_params(left=False)  # remove ticks
    for side in ('left', 'top', 'right'):  # remove the lines around the graph
        axis.spines[side].set_visible(False)


# Grid layout: two features per row. Columns 0-1 hold the train histograms and
# columns 2-3 hold the test histograms of the same two features.
cols = list(feature_cols)
n_rows = (len(cols) + 1) // 2  # round up so an odd number of features still fits
# 3.4 inches per row is approximately the original 200 inches for 59 rows.
fig, ax = plt.subplots(n_rows, 4, figsize=(16, 3.4 * n_rows), squeeze=False)

for cnt, feat in enumerate(cols):
    row, col = divmod(cnt, 2)
    _strip_y_axis(sns.histplot(df_train[feat], ax=ax[row, col]))
    _strip_y_axis(sns.histplot(df_test[feat], ax=ax[row, 2 + col], color='darkgoldenrod'))

plt.show()

The distributions are comparable in the train and test sets. 

The distributions of the features vary greatly. It looks like binning or transforming of some features should be done before modeling.

In [None]:
# make boxplots of all features, just collapse the cell if your eyes start hurting :)
cols = df_train[feature_cols].columns.values
fix, ax =  plt.subplots(len(cols),2, figsize=(18,len(cols)*3.5))
cnt=0
for i,feat in enumerate(cols):
    p1 = sns.boxplot(data=df_train[feat], orient="h", ax=ax[i,0]).set(xlabel=feat)
    p2 = sns.boxplot(data=df_test[feat], orient="h", ax=ax[i,1], color="darkgoldenrod").set(xlabel=feat)

There are nearly too many features for a visual analysis. Yet, there is still something to discover in the boxplots:

* If it wasn't for the boxplot I might have missed the tiny "hill" on the right in f74.
* f26 looks a bit different in train and test. I assume that this might come from random sampling of train and test set.

In [None]:
# Boxplot (top) and histogram (bottom) of one training feature in the same figure.
# Features worth trying:
#   f75         - an example of how a boxplot can mislead you
#   f92         - an example where a boxplot reveals additional information
#   f74 and f91 - an interesting comparison
feature = "f40"
feature_values = df_train[feature]

fig, ax = plt.subplots(2, 1, figsize=(15,10))
box_ax, hist_ax = ax
fig.suptitle("Comparison between boxplot and histogram for the same feature")
sns.boxplot(data=feature_values, orient="h", ax=box_ax)
sns.histplot(data=feature_values, ax=hist_ax)
plt.show()

## 4. Correlation

In [None]:
# Heatmaps of the pairwise feature correlations, train set left and test set right.
# Question: is there a correlation between the features? -> no
corr_train = df_train[feature_cols].corr()
corr_test = df_test[feature_cols].corr()

fig, ax = plt.subplots(1, 2, figsize=(20,8))
panels = (
    (corr_train, 'mako', "Correlation between the features, train set"),
    (corr_test, 'rocket', "Correlation between the features, test set"),
)
for axis, (corr, cmap, title) in zip(ax, panels):
    sns.heatmap(corr, cmap=cmap, ax=axis)
    axis.set_title(title)
plt.show()

There is no correlation between the features.

# Analysis of target variable

In [None]:
# Bar chart of how often each value of the target column "claim" occurs.
plt.figure(figsize=(8,8))
claim_counts = df_train["claim"].value_counts()
plots = claim_counts.plot(kind="bar")
plt.title("Values of target variable")

# remove the lines around the graph and the y ticks
for side in ('left', 'top', 'right'):
    plots.spines[side].set_visible(False)
plots.get_yaxis().set_ticks([])
plt.xticks(rotation=0)

# Write each bar's height just above the bar.
for bar in plots.patches:
    height = bar.get_height()
    x_center = bar.get_x() + bar.get_width() / 2
    plots.annotate(height,
                   (x_center, height),
                   ha='center', va='center',
                   size=10, xytext=(0, 8),
                   textcoords='offset points')
plt.show()

The target variable has a roughly equal number of 0s and 1s.