In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from tqdm.notebook import tqdm

In [None]:
# Apply seaborn's 'whitegrid' style to the figures created later on.
sns.set_style(style='whitegrid')

Loading the data and checking the head to make sure it is loaded correctly

In [None]:
# Read the training split and preview its first rows as a load sanity check.
train_path = '/kaggle/input/tabular-playground-series-mar-2021/train.csv'
train_df = pd.read_csv(train_path)
train_df.head()

In [None]:
# Read the test split and preview its first rows as a load sanity check.
test_path = '/kaggle/input/tabular-playground-series-mar-2021/test.csv'
test_df = pd.read_csv(test_path)
test_df.head()

Let's look at the target to understand its type and distribution.

In [None]:
# Distinct values taken by the target column.
train_df.loc[:, 'target'].unique()

In [None]:
# Pair of (absolute counts, relative frequencies) for each target value.
tuple(train_df['target'].value_counts(normalize=as_fraction) for as_fraction in (False, True))

### Observations:
* So there are about 300000 observations in the training data.
* About 26.5% positive samples
* About 73.5% negative samples

# Type of data in the dataset

In [None]:
# Print the per-column summary of the training frame to inspect column types.
train_df.info()

We can see that there are `19` categorical variables and `11` continuous variables

In [None]:
# Split the feature columns by name: anything containing 'cont' is treated as
# continuous, anything containing 'cat' as categorical.
cont_names = list(filter(lambda name: 'cont' in name, train_df.columns))
cat_names = list(filter(lambda name: 'cat' in name, train_df.columns))

# Numerical Data

Let's look at how the numerical features are distributed.

In [None]:
def create_histograms(df1, df2, feature, ax):
    """Overlay the histograms (with KDE) of one feature from two frames on ``ax``.

    Parameters
    ----------
    df1 : DataFrame
        Drawn in cyan and labelled ``train_<feature>``.
    df2 : DataFrame
        Drawn in medium violet red and labelled ``test_<feature>``.
    feature : str
        Column name looked up in both frames.
    ax : matplotlib Axes
        Axes that receives both histograms, the title and the legend.
    """
    sns.histplot(df1[feature], ax=ax, kde=True, color='c', label=f"train_{feature}")
    sns.histplot(df2[feature], ax=ax, kde=True, color='mediumvioletred', label=f"test_{feature}")
    ax.set_title(f'{feature} Histogram')
    # The labels passed above are only shown once a legend is requested;
    # without it the two overlaid distributions cannot be told apart.
    ax.legend()


In [None]:
# Lay the continuous features out on a 3-column grid. The row count is rounded
# UP (ceil) so a partially filled last row still gets its own axes; rounding to
# nearest would drop features whenever the count is one more than a multiple of 3.
n_cols = 3
n_rows = int(np.ceil(len(cont_names) / n_cols))
fig, axes = plt.subplots(n_rows, n_cols, constrained_layout=True, figsize=(17,17))
flat_axes = axes.flatten()
for ax, col in tqdm(zip(flat_axes, cont_names), total=len(cont_names)):
    create_histograms(train_df, test_df, col, ax)
# Hide any axes left over when the feature count is not a multiple of n_cols.
for ax in flat_axes[len(cont_names):]:
    ax.set_visible(False)
plt.show()

### Observations

* Train and test distributions are almost the same. 

# Categorical Data

In [None]:
def create_barplots(df1, df2, feature, ax):
    """Overlay the category counts of one feature from two frames on ``ax``.

    Both count series are re-indexed onto one shared category order before
    plotting. Bar plots are drawn by position, so plotting each
    ``value_counts()`` result in its own frequency order can stack bars that
    belong to different categories on top of each other.

    Parameters
    ----------
    df1 : DataFrame
        Drawn in cyan and labelled ``train``.
    df2 : DataFrame
        Drawn in medium violet red and labelled ``test``.
    feature : str
        Column name looked up in both frames.
    ax : matplotlib Axes
        Axes that receives both bar series, the title and the legend.
    """
    train_counts = df1[feature].value_counts()
    test_counts = df2[feature].value_counts()
    # Shared x-axis order: df1's categories by descending frequency, followed by
    # any category that only occurs in df2.
    categories = train_counts.index.append(
        test_counts.index.difference(train_counts.index)
    )
    # Categories missing from one frame get an explicit zero-height bar.
    train_counts = train_counts.reindex(categories, fill_value=0)
    test_counts = test_counts.reindex(categories, fill_value=0)
    train_counts.plot(kind='bar', ax=ax, alpha=0.3, color='c', label='train')
    test_counts.plot(kind='bar', ax=ax, alpha=0.3, color='mediumvioletred', label='test')
    ax.set_title(f"{feature} Bar Plot")
    ax.legend()

In [None]:
# Lay the categorical features out on a 3-column grid. The row count is rounded
# UP (ceil): rounding to nearest allocates too few axes whenever the feature
# count is one more than a multiple of 3, and zip() then silently skips the
# last feature.
n_cols = 3
n_rows = int(np.ceil(len(cat_names) / n_cols))
fig, axes = plt.subplots(n_rows, n_cols, constrained_layout=True, figsize=(20,17))
flat_axes = axes.flatten()
for ax, col in tqdm(zip(flat_axes, cat_names), total=len(cat_names)):
    create_barplots(train_df, test_df, col, ax)
# Hide any axes left over when the feature count is not a multiple of n_cols.
for ax in flat_axes[len(cat_names):]:
    ax.set_visible(False)
plt.show()

### Observations
* We can see that the variables are distributed similarly in test and train datasets.