In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

### Reading data

In [None]:
%%time

# Load the competition's train and test tables in one pass; the generator
# yields one DataFrame per path, in the order the paths are listed.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-feb-2021/train.csv',
        '/kaggle/input/tabular-playground-series-feb-2021/test.csv',
    )
)

In [None]:
# Print pandas' concise summary (columns, dtypes, non-null counts) of the training frame.
train.info()

In [None]:
# Same concise summary for the test frame, for comparison with the training frame.
test.info()

### Target distribution

In [None]:
# 50-bin histogram of the regression target; the trailing ';' hides the Axes repr.
train['target'].plot(kind='hist', bins=50, figsize=(12, 6));

In [None]:
# Summary statistics of the target column, shown as the cell's output.
train['target'].describe()

### Counting relative frequencies

In [None]:
# Relative frequency of every level of cat0..cat9: one bar chart per feature
# on a 5x2 grid with a shared y axis, with the percentage printed above each bar.
fig, axes = plt.subplots(5, 2, figsize=(12, 28), sharey=True)
axes = np.ravel(axes)

for i in range(0, 10):
    ax = axes[i]
    # Compute the normalised, index-sorted counts once and reuse them for both
    # the bars and the labels (previously this was evaluated twice per feature).
    freqs = train[f'cat{i}'].value_counts(normalize=True).sort_index()
    ax.bar(freqs.index, freqs.values)
    ax.set_title(f'cat{i}')

    # The enumeration index is used as the label's x coordinate. This assumes
    # the levels are strings, which matplotlib places at 0..n-1 — TODO confirm.
    for index, value in enumerate(freqs.values):
        ax.text(index, value + .03, '{:.1%}'.format(value), ha='center', va='center')

### Target distribution for each categorical feature

In [None]:
# Box plot of `target` for each level of cat0..cat9, one panel per feature.
fig, axes = plt.subplots(5, 2, figsize=(12, 26), sharey=True)
axes = np.ravel(axes)

# Pair each panel with its feature name and hand the Axes to seaborn
# explicitly instead of switching pyplot's "current axes".
for ax, feature in zip(axes, [f'cat{k}' for k in range(10)]):
    sns.boxplot(x=feature, y='target', data=train, ax=ax)
    ax.set_xlabel('')
    ax.set_title(feature)

plt.show();

In [None]:
# Density histogram of `target`, coloured by level, for each of cat0..cat9.
fig, axes = plt.subplots(5, 2, figsize=(12, 30), sharey=True)
axes = np.ravel(axes)

# Styling shared by every panel.
hist_style = dict(bins=50, alpha=0.2, element='step', stat='density')

for idx in range(10):
    panel = axes[idx]
    sns.histplot(data=train, x='target', hue=f'cat{idx}', ax=panel, **hist_style)
    panel.set_xlabel('target')
    panel.set_title(f'cat{idx}')

plt.show();

Some levels are hard to see due to their low frequency.

In [None]:
# Absolute level counts for a hand-picked subset of the categorical features.
for cat in ('cat0', 'cat2', 'cat4', 'cat6'):
    level_counts = dict(train[cat].value_counts())
    # Feature name, its counts, then an empty line as a separator.
    print(cat, level_counts, '', sep='\n')

### Relation between numerical features and `target`

In [None]:
# Summary statistics for the numeric columns of the training frame.
train.describe()

All the numerical features seem to fall within similar ranges.

In [None]:
# Side-by-side box plots of every column whose name matches the regex 'cont'.
cont_features = train.filter(regex=r'cont')
plt.figure(figsize=(12, 6))
sns.boxplot(data=cont_features);

In [None]:
# Scatter plot of `target` against each continuous feature cont0..cont13,
# drawn on a 5x3 grid with shared x and y axes.
fig, axes = plt.subplots(5, 3, figsize=(12, 21), sharex=True, sharey=True)
axes = np.ravel(axes)

# Draw the random subsample once. With a fixed random_state every call returns
# the same rows, so sampling inside the loop only repeated identical work.
# The size is clamped so a frame with fewer than 10000 rows does not raise.
train_sample = train.sample(min(10000, len(train)), random_state=0)

for i in range(0, 14):
    ax = axes[i]
    sns.scatterplot(x=f'cont{i}', y='target', alpha=0.5, data=train_sample, ax=ax)
    ax.set_xlabel(f'cont{i}')
    # sharex=True hides the x tick labels on all but the bottom row; re-enable them.
    ax.xaxis.set_tick_params(labelbottom=True)

# The grid has 15 panels but only 14 are used; blank the spare one.
axes[-1].axis('off')

plt.show();