## First Look

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the train and test splits from the competition's input directory
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-sep-2021/train.csv",
        "../input/tabular-playground-series-sep-2021/test.csv",
    )
)

In [None]:
# (rows, columns) of the train split followed by the test split
(train.shape, test.shape)

In [None]:
# Preview the first five training rows
train.head(5)

In [None]:
# Compare the two schemas in both directions, keeping the original column order.

# Columns present in train but absent from test
cols_missing = train.columns[~train.columns.isin(test.columns)].tolist()
print("Train columns that do not exist in Test columns:", cols_missing)

# Columns present in test but absent from train
cols_missing = test.columns[~test.columns.isin(train.columns)].tolist()
print("Test columns that do not exist in Train columns:", cols_missing)

## Missing data

In [None]:
# How many values are missing in the training data overall?
missing_values_count = train.isnull().sum()  # per-column count of missing cells
# DataFrame.size is rows * columns; it replaces np.product, a deprecated alias
# that was removed in NumPy 2.0
total_cells = train.size
total_missing = missing_values_count.sum()

# Percentage of all cells that are missing
percent_missing = (total_missing / total_cells) * 100
print(percent_missing)

There are a few missing values

In [None]:
# Per-column missing-value counts for both splits, alongside each column's dtype.
# Raise the row display limit to 120 so the table below is shown in full.
pd.set_option("display.max_rows", 120)
pd.DataFrame(
    {
        'Train': train.isna().sum(),
        'Test': test.isna().sum(),
        'Type': train.dtypes,
    },
    index=train.columns,
)

Looks like every feature column has some missing values in both the train and test datasets

In [None]:
# Count of rows per value of the target column `claim`
plt.figure(figsize=(8, 5))
sns.countplot(x=train["claim"])

In [None]:
# Training rows whose target (`claim`) is missing
train[train["claim"].isna()]

No rows have a missing `claim` value

In [None]:
# Categorical features: columns stored with the object dtype
cat_features = [name for name, dtype in train.dtypes.items() if dtype == 'object']
print(len(cat_features), cat_features)

# Numerical features: columns stored as float64
num_features = [name for name, dtype in train.dtypes.items() if dtype == 'float64']
print(len(num_features), num_features)

No categorical data  
118 numerical columns

## Visualization

In [None]:
# Overlaid train/test histograms for every numerical feature, laid out on a grid.

# Combined dataframe containing numerical features only. It is used solely to
# derive one value range per feature, so the train and test histograms of a
# feature are drawn over identical bin edges.
df = pd.concat([train[num_features], test[num_features]], axis=0)
columns = df.columns.values

# Grid layout. Ceiling division yields exactly the number of rows needed; the
# former `len(columns) // cols + 1` allocated a completely empty extra row
# whenever the feature count was an exact multiple of `cols`.
cols = 3
rows = max(1, -(-len(columns) // cols))

# 2.5 inches of height per row (40 rows -> the original 100-inch figure).
# squeeze=False keeps `axs` two-dimensional even when the grid has one row.
fig, axs = plt.subplots(ncols=cols, nrows=rows, figsize=(20, 2.5 * rows),
                        sharex=False, squeeze=False)

# Adding some distance between plots
plt.subplots_adjust(hspace=0.3)

# (dataset, bar colour, legend label) for the two overlaid histograms
datasets = (
    (train, "deepskyblue", "Train Dataset"),
    (test, "palevioletred", "Test Dataset"),
)

flat_axes = axs.ravel()
for ax, name in zip(flat_axes, columns):
    # Common range over train + test, computed once per feature
    value_range = (df[name].min(), df[name].max())
    for frame, color, label in datasets:
        ax.hist(frame[name].values,
                range=value_range,
                bins=40,
                color=color,
                edgecolor="black",
                alpha=0.7,
                label=label)
    ax.set_title(name, fontsize=14, pad=5)
    ax.tick_params(axis="y", labelsize=13)
    ax.tick_params(axis="x", labelsize=13)
    ax.grid(axis="y")
    ax.legend(fontsize=13)

# Hide the unused axes at the end of the grid so the background stays clean
for ax in flat_axes[len(columns):]:
    ax.set_visible(False)

plt.show()

Train and test datasets are very similar. This is one of the important features of synthetic data.

In [None]:
# Feature correlation heatmap.
# Pairwise correlations of every column except the row identifier, rounded to
# two decimals for the cell annotations.
df = train.drop(columns="id").corr().round(2)

# Mask the diagonal and the upper-right triangle, which duplicate the
# lower-left triangle of the symmetric correlation matrix
mask = np.triu(np.ones_like(df))

# Draw the annotated heatmap
plt.figure(figsize=(50, 50))
ax = sns.heatmap(
    df,
    mask=mask,
    annot=True,
    cmap="RdBu",
    annot_kws={"weight": "normal", "fontsize": 9},
)
ax.set_title("Feature correlation heatmap", fontsize=17)

# Tick label styling shared by both axes; only the rotation differs
tick_style = {"ha": "right", "rotation_mode": "anchor", "weight": "normal"}
plt.setp(ax.get_xticklabels(), rotation=90, **tick_style)
plt.setp(ax.get_yticklabels(), rotation=0, **tick_style)
plt.show()

There is little correlation between the target and the 118 numerical features. Again, this is common in synthetic data.

## Feature vs Target

In [None]:
# For each numerical feature draw three panels side by side:
# its distribution, a box plot, and its values split by the target `claim`.
for feature in num_features:
    fig, ax = plt.subplots(ncols=3, nrows=1, figsize=(20, 5))
    # histplot with a density-scaled histogram plus KDE replaces the
    # deprecated sns.distplot
    sns.histplot(x=train[feature], kde=True, stat="density", ax=ax[0])
    # Pass the data by keyword: a positional first argument is interpreted
    # differently across seaborn versions
    sns.boxplot(x=train[feature], ax=ax[1])
    sns.stripplot(x=train.claim, y=train[feature], ax=ax[2])
    # plt.show() renders under inline/non-GUI backends, where fig.show() only
    # emits a warning; closing the figure afterwards releases its memory so
    # the loop does not accumulate one open figure per feature
    plt.show()
    plt.close(fig)