# TPS - Sep 2021

# Important Notes:
- Binary Classification : Labels (Claim) are either 0 or 1
- 118 features, 1 label
- All 118 features : float64
- All features contain missing values, so a sound imputation strategy is needed
- Some features include numbers with enormous scale, so scaling is necessary/preferred

# Import Libraries & Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings

# Silence every warning for the rest of the notebook (this also hides
# deprecation notices raised by the libraries).
warnings.filterwarnings('ignore')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Seed NumPy's global random generator with a fixed value.
np.random.seed(2021)

In [None]:
# Competition input directory (the trailing slash is relied upon when the
# file names are appended below).
cp = "/kaggle/input/tabular-playground-series-sep-2021/"

# Read the train set, the test set and the sample submission, in that order.
df_train, df_test, submission = (
    pd.read_csv(f"{cp}{csv_name}")
    for csv_name in ("train.csv", "test.csv", "sample_solution.csv")
)

In [None]:
# Split the target off the training frame (pop removes the column in place
# and returns it), then discard the 'id' column from both frames so that
# only feature columns remain.
labels = df_train.pop('claim')
df_train.drop(columns = 'id', inplace = True)
df_test.drop(columns = 'id', inplace = True)

# Overview

In [None]:
# Show the first three rows of the training features.
df_train.head(3)

dtypes of all 118 features are 'float64'.

In [None]:
# Print the pandas summary of the training frame (columns, dtypes, memory).
df_train.info()

## Labels Overview

Labels are well-balanced.

In [None]:
# Count plot of the target with each bar annotated by its share of all rows.
plt.figure(figsize = (6, 6))
plt.xticks(size = 12)
plt.yticks(size = 12)
ax = sns.countplot(x = labels, linewidth = 5, palette = "Set2")
ax.set_title('Claim Countplot', fontsize = 20)
ax.set_xlabel('Claim', fontsize = 12)
ax.set_ylabel('Count', fontsize = 12)

total = labels.shape[0]
for bar in ax.patches:
    left, width, height = bar.get_x(), bar.get_width(), bar.get_height()
    # Place the percentage text at the centre of the bar as originally drawn.
    ax.annotate(f"{100 * height / total:.2f}%",
                (left + width / 2, bar.get_y() + height / 2),
                fontsize = 12, ha = 'center')
    # Halve the bar width and shift it by a quarter of the original width so
    # the slimmer bar stays centred on the same tick.
    bar.set_width(width * 0.5)
    bar.set_x(left + width * 0.25)
plt.show()

## Features Overview

The features span a wide variety of ranges, so scaling is necessary (or at least preferred).

In [None]:
# Summary statistics with one row per feature: a blue background gradient
# over the table, plus in-cell bars on the mean, std and max columns.
(
    df_train.describe().T
    .style.background_gradient(cmap = 'Blues')
    .bar(subset = ["mean"], color = 'lightgreen')
    .bar(subset = ["std"], color = '#ee1f5f')
    .bar(subset = ["max"], color = '#FFA07A')
)

# Missing Values
All columns contain null values.

## Proportion of Missing Values

In [None]:
# Columns that contain at least one missing value, per data set.
emptyCols_train = df_train.columns[df_train.isnull().any()]
emptyCols_test = df_test.columns[df_test.isnull().any()]

# Total number of missing cells, per data set.
missing_cells_train = df_train.isnull().sum().sum()
missing_cells_test = df_test.isnull().sum().sum()

# DataFrame.size is rows * columns. It replaces np.product(df.shape), which
# was deprecated in NumPy 1.25 and removed in NumPy 2.0.
total_cells_train, total_cells_test = df_train.size, df_test.size

# Share of missing cells in percent, rounded to four decimals.
missing_percent_train = np.round(100 * missing_cells_train / total_cells_train, 4)
missing_percent_test = np.round(100 * missing_cells_test / total_cells_test, 4)

print("############ Null Values in df_train & df_test ############\n")
print(f"df_train : {len(emptyCols_train)} / {len(df_train.columns)} columns contain Nan")
print(f"           {missing_cells_train:,} Nan / {total_cells_train:,} Cells : {missing_percent_train}%\n")
print(f"df_test : {len(emptyCols_test)} / {len(df_test.columns)} columns contain Nan")
print(f"           {missing_cells_test:,} Nan / {total_cells_test:,} Cells : {missing_percent_test}%")

In [None]:
# Per-feature count of missing values for each data set, as one-column frames.
df_null_train = df_train.isnull().sum().to_frame('train_nan')
df_null_test = df_test.isnull().sum().to_frame('test_nan')

# Align the two on feature name; the inner join keeps only features present
# in both frames.
df_null = pd.concat([df_null_train, df_null_test], axis = 1, join = 'inner')

# Counts -> fractions of rows, then the train/test mean, then percentages.
df_null['train_nan'] = df_null['train_nan'] / df_train.shape[0]
df_null['test_nan'] = df_null['test_nan'] / df_test.shape[0]
df_null['average'] = (df_null['train_nan'] + df_null['test_nan']) / 2
df_null = df_null * 100

# Order features from the highest to the lowest average missing percentage.
df_null = df_null.sort_values(by = 'average', ascending = False)
features = df_null.index.tolist()

fig, ax = plt.subplots(figsize = (15, 6))
plt.xticks(size = 12)
plt.yticks(size = 12)
sns.scatterplot(data = df_null, ax = ax)
ax.set_title('Proportion of Nan Values in Train & Test Data', fontsize = 20)
# Label only every fourth feature to keep the x axis readable.
ax.set_xticks(features[::4])
ax.set_xticklabels(features[::4], rotation = 45)
ax.set_xlabel('feature', fontsize = 12)
ax.set_ylabel('%', fontsize = 12)
plt.show()

In [None]:
# Report the smallest and largest value found anywhere in df_null (train,
# test and average columns alike), rounded to three decimals.
print(f"All features in both df_train & df_test include Nan values with the proportion : {np.round(df_null.min().min(), 3)}% ~ {np.round(df_null.max().max(),3)}%")

## Labels Distribution on Missing Values

Let's explore how claim distributions are formed for each column when the column has a missing value:

In [None]:
# For every feature, look at the rows where that feature is missing and count
# how many of them carry claim == 1 versus claim == 0. Along the way, track
# the highest and lowest claim-1 percentage seen across all features.
lst1, lst0, totals = [], [], []
maxOneProp, minOneProp = float('-inf'), float('inf')
for col in df_train.columns:
    nullIndex = df_train[col].isnull()
    labels_when_missing = labels[nullIndex]
    ones = labels_when_missing.sum()        # labels are 0/1, so the sum counts the ones
    total = labels_when_missing.count()
    one_prop = 100 * ones / total           # computed once, reused for both bounds
    maxOneProp = max(maxOneProp, one_prop)
    minOneProp = min(minOneProp, one_prop)
    zeros = total - ones
    lst1.append(ones); lst0.append(zeros); totals.append(total)

# Build the summary with its column names directly. The previous
# set_axis(..., inplace=True) call no longer works: the `inplace` argument
# was removed in pandas 2.0.
feature_missing_df = pd.DataFrame({
    'feature': list(df_train.columns),
    'claim_0': lst0,
    'claim_1': lst1,
    'total': totals,
})
feature_missing_df.sort_values('total', ascending = False, inplace = True)

In [None]:
# Overlaid horizontal bars per feature: the full bar is the number of rows
# where the feature is missing, the bar drawn on top is the claim == 1 part.
sns.set_context('paper')
f, ax = plt.subplots(figsize = (13, 20))
sns.set_color_codes('pastel')
sns.barplot(
    data = feature_missing_df, x = 'total', y = 'feature',
    label = 'Total', color = 'r', edgecolor = 'w', ax = ax,
)
sns.barplot(
    data = feature_missing_df, x = 'claim_1', y = 'feature',
    label = 'Claim_1', color = 'b', edgecolor = 'w', ax = ax,
)
sns.set_color_codes('muted')

ax.legend(ncol = 2, loc = 'lower right')
ax.set_title('Claim Distribution for Missing Cols', fontsize = 20)
ax.set_xlabel('Claim 1 vs 0', fontsize = 12)
ax.set_ylabel('Feature', fontsize = 12)
plt.xticks(size = 12)
plt.yticks(size = 12)
# Keep only every fourth y tick so the feature labels do not overlap.
ax.set_yticks(ax.get_yticks()[::4])
plt.show()

# The summary frame is not needed past this plot.
del feature_missing_df

Looking at each column separately, roughly 75% of the rows with a NaN in that column have Claim 1.

In [None]:
# minOneProp / maxOneProp are the lowest and highest per-column percentage of
# claim == 1 among rows where that column is missing; the claim-0 range is
# their complement to 100.
print("In terms of columns, for Nan value rows : ")
print(f"Claim 0 Proportion : {np.round(100 - maxOneProp, 2)}% ~ {np.round(100 - minOneProp, 2)}%")
print(f"Claim 1 Proportion : {np.round(minOneProp, 2)}% ~ {np.round(maxOneProp, 2)}%")

Let's look at the distribution of the number of missing columns for each row.

In [None]:
# Number of missing feature values in every training row. Summing along
# axis=1 yields the per-row total directly; the earlier `.T.sum()` form gave
# the same Series but first built a transposed copy of the whole boolean mask.
missing_cols_row = df_train.isnull().sum(axis = 1)
maxColMissing = missing_cols_row.max()
fig, ax = plt.subplots(1,2,figsize = (16, 8))
fig.suptitle("Missing Cols Count per Row in Train Data", fontsize=18)
plt.xticks(size=12); plt.yticks(size = 12)
ax = ax.flatten()

# Left panel: how many rows have 0, 1, 2, ... missing columns.
sns.countplot(x=missing_cols_row, ax = ax[0])
ax[0].set_title('Row Count', fontsize = 15)
ax[0].set_xlabel('Missing Cols', fontsize = 12); ax[0].set_ylabel('Row Count', fontsize=12)
# Only the bars from the ninth one onward get a numeric label.
# NOTE(review): the label height (10x the bar height + 10,000) looks tuned by
# eye for these bars - confirm against the rendered figure.
for p in ax[0].patches[8:]:
    count = f"{p.get_height():,}"
    x, y = p.get_x() + p.get_width() / 2, p.get_y() + p.get_height() * 10 + 10000
    ax[0].annotate(count, (x, y), fontsize = 10, ha = 'center')

# Right panel: within each missing-column count, the percentage of rows per
# claim value (value_counts is normalised inside every group).
missing_cols_row.rename("count", inplace = True)
concat_df = pd.concat([missing_cols_row, labels], axis = 1)
new_counts = concat_df.groupby(['count'])['claim'].value_counts(normalize=True).rename('percentage').mul(100).reset_index().sort_values('percentage')
sns.barplot(x = 'count', y = 'percentage', hue = 'claim',data = new_counts, ax = ax[1])
# Halve each bar's width, then shift it right by half of the new width.
for p in ax[1].patches:
    p.set_width(p.get_width() * 0.5)
    p.set_x(p.get_x() + p.get_width() * 0.5)
ax[1].set_title('Claim Proportion (Normalized)', fontsize = 15)
ax[1].set_xlabel('Missing Cols', fontsize = 12); ax[1].set_ylabel('Percentage (%)', fontsize=12)
plt.show()

In [None]:
# Largest number of missing columns found in a single row, per data set.
# The test-set count sums along axis=1 rather than transposing the whole
# boolean mask first; the result is the same.
train_max_missing_col = missing_cols_row.max()
test_max_missing_col = df_test.isnull().sum(axis = 1).max()
print(f"Training data max # of missing cols in one row : {train_max_missing_col}")
print(f"Testing data max # of missing cols in one row : {test_max_missing_col}")

## Dropping Rows with Any Missing Col

What happens if we drop all rows with any missing col?

In [None]:
# dropna already returns a new frame and leaves df_train untouched, so no
# explicit full copy of the training data is needed beforehand.
df_drop = df_train.dropna(axis = 0)
print(f"# of removed rows : {df_train.shape[0] - df_drop.shape[0]} out of {df_train.shape[0]} rows")
print(f"{np.round(100 * (df_train.shape[0] - df_drop.shape[0]) / df_train.shape[0], 4)}% rows gone")
print(df_drop.shape)
# Release the temporary frame once the numbers are printed.
del df_drop

Dropping all rows with any missing column won't work, since it wipes out about two-thirds of the rows.

# Feature Distribution

In [None]:
# One small density panel per feature, train and test overlaid.
# A 12 x 10 grid has 120 axes; the last two are left out of the loop so that
# 118 panels are drawn. Columns are addressed as f1, f2, ... by position.
fig, axes = plt.subplots(12, 10, figsize = (20, 15))
axes = axes.flatten()[:-2]
for feature_no, ax in enumerate(axes, start = 1):
    feature = f'f{feature_no}'
    # A fresh random sample of 10,000 rows is drawn from each frame per panel.
    sns.kdeplot(data = df_train.sample(n = 10000), x = feature, fill = True, ax = ax)
    sns.kdeplot(data = df_test.sample(n = 10000), x = feature, fill = True, ax = ax)
    # Strip ticks and axis labels; the panel title carries the feature name.
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_title(feature, loc = 'right', fontsize = 12)
fig.tight_layout()
plt.show()

# Feature Skewness & Kurtosis

In [None]:
# Per-feature skewness and kurtosis of the training data, ignoring NaNs.
# DataFrame.kurt is the same method as DataFrame.kurtosis.
skewness_train = df_train.skew(axis = 0, skipna = True)
kurtosis_train = df_train.kurt(axis = 0, skipna = True)

In [None]:
# Side-by-side box plots of the per-feature skewness and kurtosis values.
sns.set_theme(style = "whitegrid")
fig, ax = plt.subplots(1, 2, figsize = (18, 4))
plt.xticks(size = 12)
plt.yticks(size = 12)
ax = ax.flatten()
for panel, values, heading in zip(ax, (skewness_train, kurtosis_train), ('Skewness', 'Kurtosis')):
    sns.boxplot(x = values, ax = panel)
    panel.set_title(heading, fontsize = 20)
plt.show()

# Feature Correlation

In [None]:
# Pairwise correlation between the training features.
corr = df_train.corr()

f, ax = plt.subplots(figsize = (16, 16))
ax.set_title('Correlation on Train Data Features', fontsize = 24, y = 1.05)

# Hide the upper triangle (diagonal included); it mirrors the lower one.
mask = np.triu(np.ones(corr.shape, dtype = bool))

# The colour range is left automatic (no vmin / vmax), centred on zero.
sns.heatmap(
    corr,
    mask = mask,
    annot = False,
    center = 0,
    linewidths = .5,
    cmap = "coolwarm",
    ax = ax,
)
plt.show()