In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found beneath it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import warnings
warnings.filterwarnings("ignore")

# Tabular Playground Series - Oct 2021

The Tabular Playground Series on Kaggle is meant to help newcomers to the data science field, like me, get acquainted with Kaggle competitions.

The dataset created for this competition is synthetic, but based on a real dataset and generated using a CTGAN. The original dataset deals with predicting the biological response of molecules given various chemical properties. 

The first step in almost every data science project is to perform some exploratory data analysis. That is what we present in this notebook.

# Train dataset

Let's first explore the train dataset and then the test dataset.

In [None]:
# Read the training split from the Kaggle input mount into a DataFrame.
train_df = pd.read_csv('/kaggle/input/tabular-playground-series-oct-2021/train.csv', sep=',')

In [None]:
# Preview the first rows of the training frame (head() defaults to five rows).
train_df.head()

In [None]:
# Report the (rows, columns) tuple of the training frame.
print('Dataset shape: ', train_df.shape )

In [None]:
# Print pandas' structural summary of the training frame (index, column dtypes, memory usage).
train_df.info()

In [None]:
# Descriptive statistics per column, transposed so that each feature becomes one row.
train_df.describe().transpose()

In [None]:
# Tally NaN cells per column once. DataFrame.isnull is an alias of DataFrame.isna,
# so listing both would print the same table twice.
missing_per_column = train_df.isna().sum()

# Sum the per-column tallies to get the number of missing cells;
# `.isna().any().sum()` would instead count the columns that contain a NaN.
print("There are", missing_per_column.sum(), "missing values")
print()
print(missing_per_column)

# Summary 1

The dataset contains 1 million rows and 287 variables. Of those 287, 240 are of float64 type and 47 are of int type (including `id` and `target`). The int-typed variables are likely categorical, so let's check how many categories each of them has.

In [None]:
# Number of distinct values in each integer-typed feature. The row id and the label are
# excluded because they are not candidate categorical features. Computed once and reused,
# since nunique() has to scan every row of every selected column.
int_feature_levels = (
    train_df
    .select_dtypes(include='int64')
    .drop(['id', 'target'], axis=1)
    .nunique()
)
min_cat_n = int_feature_levels.min()
max_cat_n = int_feature_levels.max()

print('The minimum number of classes in the categorical variables is', min_cat_n)
print('The maximum number of classes in the categorical variables is', max_cat_n)

In [None]:
# One count plot per integer-typed feature (row id and label excluded), three panels per row.
columns = train_df.select_dtypes(include='int64').drop(['id', 'target'], axis=1).columns

# Size the grid from the number of features instead of hard-coding it, so every feature
# gets a panel. 45 features still give the 15x3 grid on a 15x25 inch figure.
n_cols = 3
n_rows = max(1, -(-len(columns) // n_cols))  # ceiling division
fig, ax = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 5 / 3), squeeze=False)
ax = ax.flatten()

for i, column in enumerate(columns):
    sns.countplot(x=train_df[column], ax=ax[i])

# Hide panels left over when the feature count is not a multiple of n_cols.
for unused_ax in ax[len(columns):]:
    unused_ax.set_visible(False)

plt.tight_layout()
# plt.show() rather than fig.show(): Figure.show() is meant for GUI backends and only
# emits a warning under a non-interactive backend.
plt.show()

In [None]:
# Target Distribution
# The series is passed as `x=` (as in the per-feature count plots): seaborn 0.11 deprecates
# positional data vectors, and from 0.12 the only positional argument is `data`.
sns.countplot(x=train_df['target'])
plt.title('Distribution of classes in target variable (target) \n')
plt.xlabel('Target')
plt.ylabel('Count')
# Render the figure explicitly so the cell does not echo the Text object returned by ylabel().
plt.show()

# Summary 2

As can be seen, the target variable is not imbalanced; however, many of the categorical variables are. When training a model, we can try removing those variables to see how they affect the overall performance.

Now let's check the distribution of the non-categorical variables.

In [None]:
# Kernel density estimate of every float feature, five panels per row.
sns.set_palette(sns.color_palette(["#2a9d8f", "#e9c46a"]))
columns = train_df.select_dtypes(include='float64').columns

# Size the grid from the number of features so the loop can neither run past the end of
# `columns` nor drop features. 240 features still give the 48x5 grid on a 15x75 inch figure.
n_cols = 5
n_rows = max(1, -(-len(columns) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 75 / 48), squeeze=False)
axes = axes.flatten()

for ax, column in zip(axes, columns):
    # Pass only the column being drawn instead of re-selecting the whole float sub-frame
    # on every iteration. `color` is used because `palette` only applies with a `hue` mapping.
    sns.kdeplot(x=train_df[column], ax=ax, color="#2a9d8f")
    # Strip ticks and labels: with this many small panels only the curve shapes are readable.
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.spines['left'].set_visible(False)
    ax.set_title(column, loc='right', weight='bold', fontsize=10)

# Hide panels left over when the feature count is not a multiple of n_cols.
for unused_ax in axes[len(columns):]:
    unused_ax.set_visible(False)

fig.supxlabel('Average (float features)', ha='center', fontweight='bold')

fig.tight_layout()
plt.show()

In [None]:
# Histograms of all float features in a single grid; sharey=True puts every panel on a
# common count axis so the panels are directly comparable.
(
    train_df
    .select_dtypes(include='float64')
    .hist(figsize=(32, 32), sharey=True)
);
plt.tight_layout()

# Test dataset

Here we repeat the same analysis on the test dataset.

In [None]:
# Drop the training frame before the test set is loaded. Any earlier cell that reads
# train_df will raise NameError if it is re-run after this point.
del train_df

In [None]:
# Read the test split from the Kaggle input mount into a DataFrame.
test_df = pd.read_csv('/kaggle/input/tabular-playground-series-oct-2021/test.csv', sep=',')

In [None]:
# Preview the first rows of the test frame (head() defaults to five rows).
test_df.head()

In [None]:
# Report the (rows, columns) tuple of the test frame.
print('Dataset shape: ', test_df.shape )

In [None]:
# Print pandas' structural summary of the test frame (index, column dtypes, memory usage).
test_df.info()

In [None]:
# Descriptive statistics per column, transposed so that each feature becomes one row.
test_df.describe().transpose()

In [None]:
# Tally NaN cells per column once. DataFrame.isnull is an alias of DataFrame.isna,
# so listing both would print the same table twice.
missing_per_column = test_df.isna().sum()

# Sum the per-column tallies to get the number of missing cells;
# `.isna().any().sum()` would instead count the columns that contain a NaN.
print("There are", missing_per_column.sum(), "missing values")
print()
print(missing_per_column)

In [None]:
# Number of distinct values in each integer-typed feature. The row id is excluded because
# it is not a candidate categorical feature. Computed once and reused, since nunique() has
# to scan every row of every selected column.
int_feature_levels = (
    test_df
    .select_dtypes(include='int64')
    .drop(['id'], axis=1)
    .nunique()
)
min_cat_n = int_feature_levels.min()
max_cat_n = int_feature_levels.max()

print('The minimum number of classes in the categorical variables is', min_cat_n)
print('The maximum number of classes in the categorical variables is', max_cat_n)

In [None]:
# One count plot per integer-typed feature (row id excluded), three panels per row.
columns = test_df.select_dtypes(include='int64').drop(['id'], axis=1).columns

# Size the grid from the number of features instead of hard-coding it, so every feature
# gets a panel. 45 features still give the 15x3 grid on a 15x25 inch figure.
n_cols = 3
n_rows = max(1, -(-len(columns) // n_cols))  # ceiling division
fig, ax = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 5 / 3), squeeze=False)
ax = ax.flatten()

for i, column in enumerate(columns):
    sns.countplot(x=test_df[column], ax=ax[i])

# Hide panels left over when the feature count is not a multiple of n_cols.
for unused_ax in ax[len(columns):]:
    unused_ax.set_visible(False)

plt.tight_layout()
# plt.show() rather than fig.show(): Figure.show() is meant for GUI backends and only
# emits a warning under a non-interactive backend.
plt.show()

In [None]:
# Kernel density estimate of every float feature, five panels per row.
sns.set_palette(sns.color_palette(["#2a9d8f", "#e9c46a"]))
columns = test_df.select_dtypes(include='float64').columns

# Size the grid from the number of features so the loop can neither run past the end of
# `columns` nor drop features. 240 features still give the 48x5 grid on a 15x75 inch figure.
n_cols = 5
n_rows = max(1, -(-len(columns) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 75 / 48), squeeze=False)
axes = axes.flatten()

for ax, column in zip(axes, columns):
    # Pass only the column being drawn instead of re-selecting the whole float sub-frame
    # on every iteration. `color` is used because `palette` only applies with a `hue` mapping.
    sns.kdeplot(x=test_df[column], ax=ax, color="#2a9d8f")
    # Strip ticks and labels: with this many small panels only the curve shapes are readable.
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.spines['left'].set_visible(False)
    ax.set_title(column, loc='right', weight='bold', fontsize=10)

# Hide panels left over when the feature count is not a multiple of n_cols.
for unused_ax in axes[len(columns):]:
    unused_ax.set_visible(False)

fig.supxlabel('Average (float features)', ha='center', fontweight='bold')

fig.tight_layout()
plt.show()

In [None]:
# Histograms of all float features in a single grid; sharey=True puts every panel on a
# common count axis so the panels are directly comparable.
(
    test_df
    .select_dtypes(include='float64')
    .hist(figsize=(32, 32), sharey=True)
);
plt.tight_layout()