# Tabular Playground (March)

This is a preliminary exploratory data analysis that will evolve over the coming days and weeks. Comments, suggestions, and improvements are very welcome!

## EDA and Data Visualization

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_files = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns 
from pylab import rcParams
from warnings import filterwarnings
# Global plot look: FiveThirtyEight style sheet, then a 12x8 default figure size.
# The size is set after the style sheet so it is not overridden by it.
style.use('fivethirtyeight')
rcParams.update({'figure.figsize': (12, 8)})

In [None]:
def cp(n, b=220):
    """Build the notebook's diverging colour palette.

    Parameters
    ----------
    n : int
        Number of colours in the palette.
    b : int, default 220
        Anchor hue for the positive end; the negative end is fixed at hue 20.

    Returns
    -------
    The palette produced by ``sns.diverging_palette``.
    """
    palette = sns.diverging_palette(h_neg=20, h_pos=b, n=n)
    return palette

In [None]:
# Directory holding the competition files (the trailing slash is relied on
# by the plain string concatenation below).
DATA_PATH = '/kaggle/input/tabular-playground-series-mar-2021/'

# Read the training set, using the `id` column as the row index.
train_csv = DATA_PATH + 'train.csv'
train = pd.read_csv(filepath_or_buffer=train_csv, index_col='id')
train.head()

In [None]:
# Summary statistics of the training frame, shown as the cell output.
train.describe()

In [None]:
# Data type of every column in the training frame, shown as the cell output.
train.dtypes

In [None]:
# Read the test set the same way as the training set: `id` becomes the index.
test_csv = DATA_PATH + 'test.csv'
test = pd.read_csv(filepath_or_buffer=test_csv, index_col='id')
test.head()

In [None]:
# Split the feature columns by their name prefix: `cat*` vs `cont*`.
CAT = list(filter(lambda name: name.startswith('cat'), train.columns))
CONT = list(filter(lambda name: name.startswith('cont'), train.columns))

print(f'Number of categorical features:\t {len(CAT)}.')
print(f'Number of continuous features:\t {len(CONT)}.')

# EDA

We start by checking both the test and train data for any missing values.

In [None]:
# Stop here if either frame contains a missing value anywhere.
assert not train.isna().values.any(), 'No go!'
assert not test.isna().values.any(), 'No go!'

Looking at the training dataset, it is clear that the target is a binary categorical variable. Plotting its distribution, we get:

In [None]:
# Bar chart of how many rows fall in each target class.
target_grid = sns.catplot(
    data=train,
    x='target',
    kind='count',
    palette=cp(2),
)
# No faceting variables are used, so the grid holds a single axes to title.
_ = target_grid.ax.set_title('Target Variable Count')

In [None]:
# Share of each target label, as a percentage of the rows labelled 0 or 1.
counts = train.target.value_counts()
total = counts[0] + counts[1]
for label in (0, 1):
    print(f"Percent of data with target label {label}: {100 * counts[label]/total:.1f}%")

The labels are quite imbalanced: roughly three quarters of the data have label 0.

In [None]:
# Pairwise correlations of the continuous training features, computed once
# and reused for both the mask and the heatmap.
train_corr = train[CONT].corr()

# Flag the upper triangle (diagonal included) so the heatmap leaves it blank.
mask = np.zeros_like(train_corr)
upper_rows, upper_cols = np.triu_indices_from(mask)
mask[upper_rows, upper_cols] = True

plt.title("Correlation heatmap of continuous features (Train)")

_ = sns.heatmap(train_corr, cmap=cp(200), annot=True, mask=mask, center=0)

I found this heatmap layout [here](https://www.kaggle.com/arashnic/cats-on-a-hot-tin-roof-cats-encoding-method).

In [None]:
# Pairwise correlations of the continuous test features, computed once and
# reused for both the mask and the heatmap.
test_corr = test[CONT].corr()

# Hide the upper triangle (diagonal included): a correlation matrix is
# symmetric, so those cells only repeat the lower triangle.
mask = np.zeros_like(test_corr)
mask[np.triu_indices_from(mask)] = True

# This figure shows the test set, so the title must say "Test".
plt.title("Correlation heatmap of continuous features (Test)")

_ = sns.heatmap(
    test_corr,
    cmap=cp(200),
    annot=True,
    mask=mask,
    center=0,
)

In [None]:
# One KDE panel per continuous feature of the training set, on a 3x4 grid.
fig, axs = plt.subplots(3, 4, figsize=(18, 14))

palette = cp(2)

# Walk the grid row by row; panels beyond the last feature are hidden rather
# than relying on a hard-coded panel index.
for idx, ax in enumerate(axs.flat):
    if idx >= len(CONT):
        ax.axis('off')
        continue
    # `fill=True` replaces the deprecated `shade=True`; colours alternate
    # between the two palette entries.
    sns.kdeplot(data=train, x=CONT[idx], fill=True, ax=ax, color=palette[idx % 2])
    ax.set(xlabel='', ylabel='', title=CONT[idx], xticks=[])
_ = fig.suptitle("KDE Plots (Train)")

In [None]:
# One KDE panel per continuous feature of the test set, on a 3x4 grid.
fig, axs = plt.subplots(3, 4, figsize=(18, 14))

palette = cp(2)

# Walk the grid row by row; panels beyond the last feature are hidden rather
# than relying on a hard-coded panel index.
for idx, ax in enumerate(axs.flat):
    if idx >= len(CONT):
        ax.axis('off')
        continue
    # `fill=True` replaces the deprecated `shade=True`; colours alternate
    # between the two palette entries.
    sns.kdeplot(data=test, x=CONT[idx], fill=True, ax=ax, color=palette[idx % 2])
    ax.set(xlabel='', ylabel='', title=CONT[idx], xticks=[])
_ = fig.suptitle("KDE Plots (Test)")

## Categorical variables

Check if the categories in the train dataset match the categories in the test dataset.

In [None]:
# Collect the categorical columns whose set of levels differs between train
# and test (non-empty symmetric difference), then report each of them.
mismatched = [
    name for name in CAT
    if set(train[name].unique()) ^ set(test[name].unique())
]
for col in mismatched:
    print(f"Different categories found in column: {col}")

There are different classes in the cat10 column. Let's investigate this further.

In [None]:
# Distinct `cat10` levels seen in each dataset.
train_cat10, test_cat10 = (set(frame['cat10'].unique()) for frame in (train, test))

# Levels exclusive to one side, in each direction.
print(f'Categories in the training set but not in the test set: {train_cat10 - test_cat10}.')
print(f'Categories in the test set but not in the training set: {test_cat10 - train_cat10}.')

In [None]:
# Number of distinct levels per categorical feature, in CAT order, for each
# dataset; kept as plain lists.
nuniques = train[CAT].nunique().tolist()
nuniques_test = test[CAT].nunique().tolist()

# Side-by-side table, one row per categorical feature.
pd.DataFrame(
    {
        "Unique class count (train)": nuniques,
        "Unique class count (test)": nuniques_test,
    },
    index=CAT,
)

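The largest number of distinct categories in a single feature is 299! We split the categorical features into two groups by their number of classes: the four features with the most classes are treated as "large" and the remaining ones as "small".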

In [None]:
# Feature positions ordered by ascending number of distinct levels.
cardinality_order = np.argsort(nuniques)

# The four features with the most levels are "large"; the rest are "small".
small_cat = [CAT[pos] for pos in cardinality_order[:-4]]
large_cat = [CAT[pos] for pos in cardinality_order[-4:]]

In [None]:
# Level-frequency bar chart for each small categorical feature (training set),
# laid out on a 3x5 grid filled row by row.
fig, axs = plt.subplots(3,5,figsize=(18,14))

for idx, ax in enumerate(axs.flat):
    feature = small_cat[idx]
    counts = train[feature].value_counts()
    # One palette colour per level of this feature.
    sns.barplot(x=counts.index, y=counts.values, ax=ax, palette=cp(len(counts)))
    ax.set(yticks=[], title=feature)
plt.suptitle('Small categories (Train)');

In [None]:
# Level-frequency bar chart for each small categorical feature of the TEST
# set, laid out on a 3x5 grid filled row by row.
fig, axs = plt.subplots(3,5,figsize=(18,14))

for idx, ax in enumerate(axs.flat):
    if idx >= len(small_cat):
        # No feature left for this panel: hide the empty axes.
        ax.axis('off')
        continue
    feature = small_cat[idx]
    # Counts must come from `test`; this figure is titled "(Test)".
    counts = test[feature].value_counts()
    # One palette colour per level of this feature.
    sns.barplot(x=counts.index, y=counts.values, ax=ax, palette=cp(len(counts.index)))
    ax.set(yticks=[], title=feature)
plt.suptitle('Small categories (Test)');