In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from warnings import filterwarnings

# Ignore every warning raised from here on so rendered cell output stays clean.
filterwarnings('ignore')

# Let pandas show up to 100 columns and 100 rows before truncating a display.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, 100)

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt

import plotly.figure_factory as ff
from plotnine import *
import plotnine as pn

import seaborn as sns

In [None]:
# Load the competition's train and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-sep-2021/train.csv',
        '/kaggle/input/tabular-playground-series-sep-2021/test.csv',
    )
)

## 1. Data Overview

This section provides a summary of the data.

**Summary**
* Total number of observations in the train data is 957,919. Observations to be predicted on (test data) is 493,474.
* About 1.6% of the data is missing in each column. This will need a further deep dive for modeling.
* Overall, there are 118 features (named f1 to f118) that will be used to predict the target variable (claim). The data also includes an 'id' column of type integer which, from a cursory look, is just a row identifier and will be dropped.
* All 118 predictor variables not including the id column are float variables.
* The target variable, claim, is binary. The submission requires the predicted probability of a claim and will be evaluated on the AUC score.

In [None]:
# Compare the number of observations in the train and test sets
# with a horizontal bar chart.
shape_df = pd.DataFrame({'Data':['Train','Test'],
                       'Shape':[train.shape[0], test.shape[0]]})
colors = ['#FFBF00','#40E0D0']  # amber / turquoise; the dtype pie chart cell reads this list too

# The columns are passed in reversed row order (Test first, then Train).
data = go.Bar(x =shape_df.Shape[::-1],y=shape_df.Data[::-1], orientation='h', text=shape_df.Shape[::-1], textposition='auto', marker_color=colors)
layout = go.Layout(font=dict(family='Arial',size=14),
                  paper_bgcolor='white',
                  plot_bgcolor = '#FFFAFA',
                 showlegend=False,width=800, height=400,title='Train & Test Data Size')
fig = go.Figure(data=data, layout=layout)
fig.update_xaxes(showline=True, linewidth=2, linecolor='black')
fig.update_yaxes(showline=True, linewidth=2, linecolor='black')
fig.show()

# Export through plotly: plt.savefig() would save matplotlib's current figure,
# which is not this plotly chart. Static export relies on the optional kaleido
# package, so a failed export is reported instead of aborting the notebook.
try:
    fig.write_image('data_shape.png')
except (ValueError, ImportError, RuntimeError) as export_error:
    print(f'Could not write data_shape.png: {export_error}')

In [None]:
# Count the predictor columns (row id and target excluded) by dtype family.
# The counts are derived from the data rather than hard-coded, so the chart
# cannot drift from what `train` actually contains.
feature_dtypes = train.drop(['id','claim'], axis=1).dtypes
labels = ['float','int']
values = [
    int(feature_dtypes.map(pd.api.types.is_float_dtype).sum()),
    int(feature_dtypes.map(pd.api.types.is_integer_dtype).sum()),
]

# `colors` is the two-colour palette defined in the train/test size cell.
# pull=[0.2, 0] offsets the first ('float') slice from the centre.
data = go.Pie(labels=labels, values=values,pull=[0.2,0],textinfo='label+value', marker=dict(colors=colors))
layout = go.Layout(font=dict(family='Arial',size=14),
                  paper_bgcolor='white',
                  plot_bgcolor = '#FFFAFA',
                 showlegend=False,title='Feature Count by Data Type', height=500, width=500)
fig = go.Figure(data=data,layout=layout)
fig.show()

In [None]:
# Percentage of missing values per predictor column (row id and target excluded),
# sorted from most to least missing.
missing_value_count = train.drop(['id','claim'], axis=1).isna().sum()
missing_value_count_df = pd.DataFrame({'Feature':missing_value_count.index,'Missing Value Count':missing_value_count.values})
missing_value_count_df['Missing %'] = np.round(missing_value_count_df['Missing Value Count']/train.shape[0] * 100, 1)
missing_value_count_df =  missing_value_count_df.sort_values(by='Missing %', ascending=False)

# The sorted columns are passed to the bar trace in reversed order.
data = go.Bar(x =missing_value_count_df.Feature[::-1],y=missing_value_count_df['Missing %'][::-1],
              text=missing_value_count_df['Missing %'][::-1], textposition='auto', marker_color='#FFBF00')
layout = go.Layout(font=dict(family='Arial',size=14),
                  paper_bgcolor='white',
                  plot_bgcolor = '#FFFAFA',
                 showlegend=False,title='Missing Values - Train Data')
fig = go.Figure(data=data, layout=layout)
fig.update_xaxes(showline=True, linewidth=2, linecolor='black', title='Feature')
fig.update_yaxes(showline=True, linewidth=2, linecolor='black',title='Missing %')
fig.show()

# Export through plotly: plt.savefig() would save matplotlib's current figure,
# which is not this plotly chart. Static export relies on the optional kaleido
# package, so a failed export is reported instead of aborting the notebook.
try:
    fig.write_image('missing_values.png')
except (ValueError, ImportError, RuntimeError) as export_error:
    print(f'Could not write missing_values.png: {export_error}')

## 2. Target Variable - Claim
We will explore the target variable in this section. As per the description, though the 'claim' variable is binary, the probability that a person will make a claim is what is needed for the submission. The predictions will be evaluated on the AUC score.

First, we look at the counts for 'claim' and 'no claim' in the training dataset.

In [None]:
# Number of training rows per target value, with readable labels for the chart.
target_count = train.claim.value_counts()
target_count_df = pd.DataFrame({'Target':target_count.index,'Count':target_count.values})

target_map = {1:'Claim',0:'No Claim'}
target_count_df['Target'] = target_count_df['Target'].map(target_map)

colors = ['#FFBF00','#40E0D0']
# The columns are passed to the bar trace in reversed row order.
data = go.Bar(x =target_count_df.Count[::-1],y=target_count_df.Target[::-1], orientation='h', text=target_count_df.Count[::-1], 
              textposition='auto', marker_color=colors)
layout = go.Layout(font=dict(family='Arial',size=14),
                  paper_bgcolor='white',
                  plot_bgcolor = '#FFFAFA',
                 showlegend=False,width=800, height=400,title='Train Data - Claim & No Claim Counts')
fig = go.Figure(data=data, layout=layout)
fig.update_xaxes(showline=True, linewidth=2, linecolor='black')
fig.update_yaxes(showline=True, linewidth=2, linecolor='black')
fig.show()

# Export through plotly: plt.savefig() would save matplotlib's current figure,
# which is not this plotly chart. Static export relies on the optional kaleido
# package, so a failed export is reported instead of aborting the notebook.
try:
    fig.write_image('target_counts.png')
except (ValueError, ImportError, RuntimeError) as export_error:
    print(f'Could not write target_counts.png: {export_error}')

The data is very balanced. Phew! One challenge less.

### 2.1 Feature Distribution for Target

We will compare the feature distribution for the 'Claim' and 'No Claim'.

In [None]:
# Split the predictors by target value so the two class-conditional
# distributions can be overlaid feature by feature.
feat_list = train.drop(['id','claim'], axis=1).columns
claim_df = train[train.claim == 1][feat_list]
no_claim_df = train[train.claim == 0][feat_list]

# First batch of features (positions 0-19), drawn on a 5 x 4 grid.
feat_list_set1 = feat_list[0:20]

fig, axes = plt.subplots(5, 4,figsize=(21, 20))
sns.despine(fig=fig)

# axes.flat walks the grid row by row. zip stops at the shorter sequence, so a
# batch with fewer features than grid cells leaves the remaining axes empty
# instead of raising IndexError.
for ax, feat in zip(axes.flat, feat_list_set1):
    # `fill` is the current name of kdeplot's deprecated `shade` keyword.
    sns.kdeplot(claim_df[feat], fill=True, color="#FFBF00", alpha=0.1, ax=ax)     # claim == 1
    sns.kdeplot(no_claim_df[feat], fill=True, color="#40E0D0", alpha=0.1, ax=ax)  # claim == 0
    ax.set(xlabel='', ylabel='')
    ax.set_title('Feature: '+str(feat),fontdict= { 'fontsize': 12, 'fontweight':'bold'})
plt.tight_layout()
plt.savefig('feature_distribution.png')

In [None]:
# Second batch of features (positions 20-39), drawn on a 5 x 4 grid.
feat_list_set2 = feat_list[20:40]

fig, axes = plt.subplots(5, 4,figsize=(21, 20))
sns.despine(fig=fig)

# axes.flat walks the grid row by row. zip stops at the shorter sequence, so a
# batch with fewer features than grid cells leaves the remaining axes empty
# instead of raising IndexError.
for ax, feat in zip(axes.flat, feat_list_set2):
    # `fill` is the current name of kdeplot's deprecated `shade` keyword.
    sns.kdeplot(claim_df[feat], fill=True, color="#FFBF00", alpha=0.1, ax=ax)     # claim == 1
    sns.kdeplot(no_claim_df[feat], fill=True, color="#40E0D0", alpha=0.1, ax=ax)  # claim == 0
    ax.set(xlabel='', ylabel='')
    ax.set_title('Feature: '+str(feat),fontdict= { 'fontsize': 12, 'fontweight':'bold'})
plt.tight_layout()

In [None]:
# Third batch of features (positions 40-59), drawn on a 5 x 4 grid.
feat_list_set3 = feat_list[40:60]

fig, axes = plt.subplots(5, 4,figsize=(21, 20))
sns.despine(fig=fig)

# axes.flat walks the grid row by row. zip stops at the shorter sequence, so a
# batch with fewer features than grid cells leaves the remaining axes empty
# instead of raising IndexError.
for ax, feat in zip(axes.flat, feat_list_set3):
    # `fill` is the current name of kdeplot's deprecated `shade` keyword.
    sns.kdeplot(claim_df[feat], fill=True, color="#FFBF00", alpha=0.1, ax=ax)     # claim == 1
    sns.kdeplot(no_claim_df[feat], fill=True, color="#40E0D0", alpha=0.1, ax=ax)  # claim == 0
    ax.set(xlabel='', ylabel='')
    ax.set_title('Feature: '+str(feat),fontdict= { 'fontsize': 12, 'fontweight':'bold'})
plt.tight_layout()

In [None]:
# Fourth batch of features (positions 60-79), drawn on a 5 x 4 grid.
feat_list_set4 = feat_list[60:80]

fig, axes = plt.subplots(5, 4,figsize=(21, 20))
sns.despine(fig=fig)

# axes.flat walks the grid row by row. zip stops at the shorter sequence, so a
# batch with fewer features than grid cells leaves the remaining axes empty
# instead of raising IndexError.
for ax, feat in zip(axes.flat, feat_list_set4):
    # `fill` is the current name of kdeplot's deprecated `shade` keyword.
    sns.kdeplot(claim_df[feat], fill=True, color="#FFBF00", alpha=0.1, ax=ax)     # claim == 1
    sns.kdeplot(no_claim_df[feat], fill=True, color="#40E0D0", alpha=0.1, ax=ax)  # claim == 0
    ax.set(xlabel='', ylabel='')
    ax.set_title('Feature: '+str(feat),fontdict= { 'fontsize': 12, 'fontweight':'bold'})
plt.tight_layout()

In [None]:
# Fifth batch of features (positions 80-99), drawn on a 5 x 4 grid.
feat_list_set5 = feat_list[80:100]

fig, axes = plt.subplots(5, 4,figsize=(21, 20))
sns.despine(fig=fig)

# axes.flat walks the grid row by row. zip stops at the shorter sequence, so a
# batch with fewer features than grid cells leaves the remaining axes empty
# instead of raising IndexError.
for ax, feat in zip(axes.flat, feat_list_set5):
    # `fill` is the current name of kdeplot's deprecated `shade` keyword.
    sns.kdeplot(claim_df[feat], fill=True, color="#FFBF00", alpha=0.1, ax=ax)     # claim == 1
    sns.kdeplot(no_claim_df[feat], fill=True, color="#40E0D0", alpha=0.1, ax=ax)  # claim == 0
    ax.set(xlabel='', ylabel='')
    ax.set_title('Feature: '+str(feat),fontdict= { 'fontsize': 12, 'fontweight':'bold'})
plt.tight_layout()

In [None]:
# Final batch: every feature from position 100 up to (at most) position 118,
# drawn on a 6 x 3 grid.
feat_list_set6 = feat_list[100:119]

fig, axes = plt.subplots(6, 3,figsize=(21, 20))
sns.despine(fig=fig)

# axes.flat walks the grid row by row. zip stops at the shorter sequence, so a
# batch with fewer features than grid cells leaves the remaining axes empty
# instead of raising IndexError.
for ax, feat in zip(axes.flat, feat_list_set6):
    # `fill` is the current name of kdeplot's deprecated `shade` keyword.
    sns.kdeplot(claim_df[feat], fill=True, color="#FFBF00", alpha=0.1, ax=ax)     # claim == 1
    sns.kdeplot(no_claim_df[feat], fill=True, color="#40E0D0", alpha=0.1, ax=ax)  # claim == 0
    ax.set(xlabel='', ylabel='')
    ax.set_title('Feature: '+str(feat),fontdict= { 'fontsize': 12, 'fontweight':'bold'})
plt.tight_layout()

The distributions for the features are similar for the 'claim' and 'no claim' targets. And some distributions are very weird! These need to be tackled separately with some transformations.

## 3. Features
We will compare the feature distribution for the train and test data. Yellow shows train data and green indicates test data. We will also plot and study the correlation between the features.

In [None]:
# Train vs. test density for the first feature batch (feat_list_set1), 5 x 4 grid.
fig, axes = plt.subplots(5, 4,figsize=(21, 20))
sns.despine(fig=fig)

# axes.flat walks the grid row by row. zip stops at the shorter sequence, so a
# batch with fewer features than grid cells leaves the remaining axes empty
# instead of raising IndexError.
for ax, feat in zip(axes.flat, feat_list_set1):
    # `fill` is the current name of kdeplot's deprecated `shade` keyword.
    sns.kdeplot(train[feat], fill=True, color="#FFBF00", alpha=0.1, ax=ax)  # train
    sns.kdeplot(test[feat], fill=True, color="#40E0D0", alpha=0.1, ax=ax)   # test
    ax.set(xlabel='', ylabel='')
    ax.set_title('Feature: '+str(feat),fontdict= { 'fontsize': 12, 'fontweight':'bold'})
plt.tight_layout()

In [None]:
# Train vs. test density for the second feature batch (feat_list_set2), 5 x 4 grid.
fig, axes = plt.subplots(5, 4,figsize=(21, 20))
sns.despine(fig=fig)

# axes.flat walks the grid row by row. zip stops at the shorter sequence, so a
# batch with fewer features than grid cells leaves the remaining axes empty
# instead of raising IndexError.
for ax, feat in zip(axes.flat, feat_list_set2):
    # `fill` is the current name of kdeplot's deprecated `shade` keyword.
    sns.kdeplot(train[feat], fill=True, color="#FFBF00", alpha=0.1, ax=ax)  # train
    sns.kdeplot(test[feat], fill=True, color="#40E0D0", alpha=0.1, ax=ax)   # test
    ax.set(xlabel='', ylabel='')
    ax.set_title('Feature: '+str(feat),fontdict= { 'fontsize': 12, 'fontweight':'bold'})
plt.tight_layout()

In [None]:
# Train vs. test density for the third feature batch (feat_list_set3), 5 x 4 grid.
fig, axes = plt.subplots(5, 4,figsize=(21, 20))
sns.despine(fig=fig)

# axes.flat walks the grid row by row. zip stops at the shorter sequence, so a
# batch with fewer features than grid cells leaves the remaining axes empty
# instead of raising IndexError.
for ax, feat in zip(axes.flat, feat_list_set3):
    # `fill` is the current name of kdeplot's deprecated `shade` keyword.
    sns.kdeplot(train[feat], fill=True, color="#FFBF00", alpha=0.1, ax=ax)  # train
    sns.kdeplot(test[feat], fill=True, color="#40E0D0", alpha=0.1, ax=ax)   # test
    ax.set(xlabel='', ylabel='')
    ax.set_title('Feature: '+str(feat),fontdict= { 'fontsize': 12, 'fontweight':'bold'})
plt.tight_layout()

In [None]:
# Train vs. test density for the fourth feature batch (feat_list_set4), 5 x 4 grid.
fig, axes = plt.subplots(5, 4,figsize=(21, 20))
sns.despine(fig=fig)

# axes.flat walks the grid row by row. zip stops at the shorter sequence, so a
# batch with fewer features than grid cells leaves the remaining axes empty
# instead of raising IndexError.
for ax, feat in zip(axes.flat, feat_list_set4):
    # `fill` is the current name of kdeplot's deprecated `shade` keyword.
    sns.kdeplot(train[feat], fill=True, color="#FFBF00", alpha=0.1, ax=ax)  # train
    sns.kdeplot(test[feat], fill=True, color="#40E0D0", alpha=0.1, ax=ax)   # test
    ax.set(xlabel='', ylabel='')
    ax.set_title('Feature: '+str(feat),fontdict= { 'fontsize': 12, 'fontweight':'bold'})
plt.tight_layout()

In [None]:
# Train vs. test density for the fifth feature batch (feat_list_set5), 5 x 4 grid.
fig, axes = plt.subplots(5, 4,figsize=(21, 20))
sns.despine(fig=fig)

# axes.flat walks the grid row by row. zip stops at the shorter sequence, so a
# batch with fewer features than grid cells leaves the remaining axes empty
# instead of raising IndexError.
for ax, feat in zip(axes.flat, feat_list_set5):
    # `fill` is the current name of kdeplot's deprecated `shade` keyword.
    sns.kdeplot(train[feat], fill=True, color="#FFBF00", alpha=0.1, ax=ax)  # train
    sns.kdeplot(test[feat], fill=True, color="#40E0D0", alpha=0.1, ax=ax)   # test
    ax.set(xlabel='', ylabel='')
    ax.set_title('Feature: '+str(feat),fontdict= { 'fontsize': 12, 'fontweight':'bold'})
plt.tight_layout()

In [None]:
# Train vs. test density for the final feature batch (feat_list_set6), 6 x 3 grid.
fig, axes = plt.subplots(6, 3,figsize=(21, 20))
sns.despine(fig=fig)

# axes.flat walks the grid row by row. zip stops at the shorter sequence, so a
# batch with fewer features than grid cells leaves the remaining axes empty
# instead of raising IndexError.
for ax, feat in zip(axes.flat, feat_list_set6):
    # `fill` is the current name of kdeplot's deprecated `shade` keyword.
    sns.kdeplot(train[feat], fill=True, color="#FFBF00", alpha=0.1, ax=ax)  # train
    sns.kdeplot(test[feat], fill=True, color="#40E0D0", alpha=0.1, ax=ax)   # test
    ax.set(xlabel='', ylabel='')
    ax.set_title('Feature: '+str(feat),fontdict= { 'fontsize': 12, 'fontweight':'bold'})
plt.tight_layout()

Again, the distributions of the features for the train and test data are nearly identical.

In [None]:
# Pairwise correlation between the predictors (row id and target excluded).
corr = train.drop(columns=['id', 'claim']).corr()

# Hide the upper triangle, diagonal included, so each feature pair is drawn once.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Diverging colour map; the heatmap centres it on zero correlation.
cmap = sns.diverging_palette(230, 20, as_cmap=True)

f, ax = plt.subplots(figsize=(16, 16))

sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    ax=ax,
)

There seems to be no correlation between the features. Since we have no specific information about the features, we can jump directly to feature engineering and modeling.

_**As always...work in progress!**_

**Keep learning and have fun**