In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the train and test tables from the relative ../input directory.
train_path = '../input/tabular-playground-series-sep-2021/train.csv'
test_path = '../input/tabular-playground-series-sep-2021/test.csv'
data = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Report (rows, columns) of each table.
print(f'Train shape: {data.shape}')
print(f'Test shape: {test.shape}')

In [None]:
# Preview the first five rows of the train table.
data.head(n=5)

The train set has 957,919 samples.

The test set has 493,474 samples.

There are 118 features (`f1`–`f118`).

# Missing values

In [None]:
# Heatmap of the boolean missing-value mask of the train table (id column excluded).
train_na_mask = data.drop(columns='id').isna()
fig, ax = plt.subplots(figsize=(16, 20))
ax.set_title('Train data missing values')
sns.heatmap(train_na_mask, ax=ax)
plt.show()

In [None]:
# Heatmap of the boolean missing-value mask of the test table (id column excluded).
test_na_mask = test.drop(columns='id').isna()
fig, ax = plt.subplots(figsize=(16, 20))
ax.set_title('Test data missing values')
sns.heatmap(test_na_mask, ax=ax)
plt.show()

In [None]:
# Per-column missing-value counts for train, sorted ascending, then expressed
# as a percentage of the number of rows.
train_na_counts = data.drop(columns='id').isna().sum().sort_values()
train_na_counts / len(data) * 100

In [None]:
# Per-column missing-value counts for test, sorted ascending, then expressed
# as a percentage of the number of rows.
test_na_counts = test.drop(columns='id').isna().sum().sort_values()
test_na_counts / len(test) * 100

All features have around 1.6% missing values.

In [None]:
# Names of the feature columns: 'f1' through 'f118'.
features = [f'f{i}' for i in range(1, 119)]

# Data Distribution

## Target distribution

In [None]:
# Pie chart of how often each claim value occurs in the train table.
claim_counts = data['claim'].value_counts()
claim_counts.plot.pie(figsize=(6, 6), autopct='%1.1f%%', title='claim percentages')
plt.show()

The two classes are balanced.

## Features distribution

In [None]:
# For every feature, draw the train histogram (left) next to the test
# histogram (right) so the two distributions can be compared by eye.
for feature in features:
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    # Missing values are dropped before plotting.
    train_col = data[feature].dropna()
    test_col = test[feature].dropna()
    axes[0].set_title(f'{feature} distribution in train data')
    sns.histplot(train_col, ax=axes[0])
    axes[1].set_title(f'{feature} distribution in test data')
    sns.histplot(test_col, ax=axes[1], color='orange')
    plt.show()
    # Close the figure explicitly: this loop creates one figure per feature,
    # and not every backend releases a figure on plt.show().
    plt.close(fig)

# Feature/Target relationships

## Boxplots

In [None]:
# One boxplot per feature, with the feature's values grouped by claim value.
for feature in features:
    fig, ax = plt.subplots(figsize=(8, 6))
    # Title each figure so it stands alone, like the other per-feature plots.
    ax.set_title(f'{feature} by claim')
    sns.boxplot(x='claim', y=feature, data=data, ax=ax)
    plt.show()
    # Close the figure explicitly: this loop creates one figure per feature,
    # and not every backend releases a figure on plt.show().
    plt.close(fig)

In [None]:
# For every feature, draw its histogram among claim == 0 rows (left) next to
# its histogram among claim == 1 rows (right).

# The row masks do not depend on the feature, so build them once instead of
# re-filtering the whole frame twice per iteration.
is_neg = data['claim'] == 0
is_pos = data['claim'] == 1

for feature in features:
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    # .loc with (mask, column) selects only the needed column for the masked
    # rows; missing values are dropped before plotting.
    neg_col = data.loc[is_neg, feature].dropna()
    pos_col = data.loc[is_pos, feature].dropna()
    axes[0].set_title(f'{feature} distribution for claim == 0')
    sns.histplot(neg_col, ax=axes[0])
    axes[1].set_title(f'{feature} distribution for claim == 1')
    sns.histplot(pos_col, ax=axes[1], color='orange')
    plt.show()
    # Close the figure explicitly: this loop creates one figure per feature,
    # and not every backend releases a figure on plt.show().
    plt.close(fig)

# Correlation

In [None]:
# Heatmap of the pairwise correlation matrix of the train columns
# (id and claim excluded).
train_corr = data.drop(columns=['id', 'claim']).corr()
fig, ax = plt.subplots(figsize=(16, 20))
ax.set_title('Train data correlation')
sns.heatmap(train_corr, ax=ax)
plt.show()

In [None]:
# Heatmap of the pairwise correlation matrix of the test columns (id excluded).
test_corr = test.drop(columns=['id']).corr()
fig, ax = plt.subplots(figsize=(16, 20))
ax.set_title('Test data correlation')
sns.heatmap(test_corr, ax=ax)
plt.show()

The heatmaps show no notable linear (Pearson) correlation between the features, in either the train or the test data.