In [None]:
!pip install seaborn==0.11

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib.offsetbox import AnchoredText
from mpl_toolkits.axes_grid1 import make_axes_locatable

In [None]:
# Read both competition splits from the Kaggle input mount.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-mar-2021/train.csv',
        '/kaggle/input/tabular-playground-series-mar-2021/test.csv',
    )
)

# Summary statistics of the training frame (rendered as the cell output).
train.describe()

In [None]:
# Same summary, restricted to the object-dtype columns of the training frame.
train.describe(include=['O'])

## Target Distribution

In [None]:
# Class counts ordered by label, so position 0 is always the smaller label and
# position 1 the larger one. value_counts() alone orders by frequency, which
# would silently swap the colours/explode below if the class balance flipped.
target_count = train['target'].value_counts().sort_index()

fig, axes = plt.subplots(1, 2, figsize=(8, 5), dpi=150)

# One colour per class, in label order (assumes a binary target).
color = ['lightgray', '#244747']
# String labels make matplotlib treat the x axis as categorical.
new_xlabel = list(map(str, target_count.index))

# Axes[0] : Bar Plot Custom
axes[0].bar(new_xlabel,       # categorical x labels
            target_count,
            color=color,
            width=0.65,       # bar width
            )

axes[0].margins(0.2, 0.2)     # margin control (left/right, top/bottom)
axes[0].set_xlabel('Target')
axes[0].set_ylabel('Count')
axes[0].spines['top'].set_visible(False)
axes[0].spines['right'].set_visible(False)

# Axes[1] : Pie Chart Custom
explode = [0, 0.1]            # pull the second wedge slightly out of the pie

axes[1].pie(target_count,
            labels=new_xlabel,
            colors=color,
            explode=explode,
            textprops={'fontsize': 12, 'fontweight': 'bold'},  # text setting
            autopct='%1.1f%%',  # percentage annotation format
            shadow=True,
            )

fig.suptitle('Target Distribution', fontsize=13, fontweight='bold')

plt.show()

## Categorical Feature Distribution

In [None]:
fig, ax = plt.subplots(5, 4, figsize=(18, 20))
fig.set_facecolor('#d0d0d0')

# cat0..cat18 fill the 5x4 grid column by column (row = i % 5, column = i // 5).
for i in range(19):
    panel = ax[i % 5][i // 5]
    feature = f'cat{i}'

    # Frequency of every category level, ordered by level name.
    counts = train[feature].value_counts().sort_index()
    panel.bar(counts.index, counts, color='#244747', label='Count')
    panel.set_yticks([])
    panel.margins(0.05, 0.2)

    # Thin black strip attached to the top of the panel, used as a title banner.
    banner = make_axes_locatable(panel).append_axes("top", size="8%", pad=0)
    for axis in (banner.get_xaxis(), banner.get_yaxis()):
        axis.set_visible(False)
    banner.set_facecolor('black')

    caption_style = dict(backgroundcolor='black', size=10, color='white', weight='bold')
    banner.add_artist(AnchoredText(feature, loc=10, prop=caption_style))

# 19 features on a 20-slot grid: hide the unused bottom-right panel.
ax[-1][-1].set_visible(False)
fig.text(0.018, 1.03, 'Categorical Feature Distribution [Train]',
         fontsize=20, fontweight='bold', fontfamily='serif')

plt.tight_layout()
plt.show()

In [None]:
# For four selected categorical features, print the number of distinct
# values and the sorted list of levels.
for feat in ('cat5', 'cat7', 'cat8', 'cat10'):
    column = train[feat]
    levels = column.value_counts().sort_index().index
    print(
        '=================',
        feat,
        f'Total {len(column.unique())}',
        levels,
        '\n',
        sep='\n',
    )

In [None]:
# fig, ax = plt.subplots(5, 4, figsize=(18, 20))
# fig.set_facecolor('#d0d0d0') 

# for i in range(19):
#     # count 
#     cnt_tmp = test[f'cat{i}'].value_counts().sort_index()
#     ax[i%5][i//5].bar(cnt_tmp.index, cnt_tmp, color='#244747', label='Count')
#     ax[i%5][i//5].set_yticks([])
#     ax[i%5][i//5].margins(0.05, 0.2)
    
#     divider = make_axes_locatable(ax[i%5][i//5])
#     cax = divider.append_axes("top", size="8%", pad=0)
#     cax.get_xaxis().set_visible(False)
#     cax.get_yaxis().set_visible(False)
#     cax.set_facecolor('black')

#     at = AnchoredText(f'cat{i}', loc=10, 
#                       prop=dict(backgroundcolor='black',
#                                 size=10, color='white', weight='bold'))
#     cax.add_artist(at)
    
# ax[-1][-1].set_visible(False)
# fig.text(0.018, 1.03, 'Categorical Feature Distribution [Test]', fontsize=20, fontweight='bold', fontfamily='serif')


# plt.tight_layout()
# plt.show()

## Continuous Feature Distribution 

- Gray: negative class (`target == 0`)
- Green: positive class (`target == 1`)

In [None]:
fig, ax = plt.subplots(3, 4, figsize=(17, 12), sharex=True)
fig.set_facecolor('#d0d0d0')

# Split the frame by class once instead of re-filtering it twice per feature.
train_pos = train[train['target'] == 1]
train_neg = train[train['target'] == 0]

# cont0..cont10 fill the 3x4 grid column by column (row = i % 3, column = i // 3).
for i in range(11):
    panel = ax[i % 3][i // 3]

    # Positive class: opaque green, drawn first.
    sns.kdeplot(data=train_pos, x=f'cont{i}',
                fill=True,
                linewidth=0,
                color='#244747', alpha=1,
                ax=panel)

    # Negative class: translucent gray, drawn on top.
    sns.kdeplot(data=train_neg, x=f'cont{i}',
                fill=True,
                linewidth=0,
                color='#d0d0d0', alpha=0.8,
                ax=panel)

    panel.set_yticks([])
    panel.set_xlabel('', visible=False)
    panel.margins(0.05, 0.2)

    # divider: thin black strip on top of the panel, used as a title banner
    divider = make_axes_locatable(panel)
    cax = divider.append_axes("top", size="8%", pad=0)
    cax.get_xaxis().set_visible(False)
    cax.get_yaxis().set_visible(False)
    cax.set_facecolor('black')

    at = AnchoredText(f'cont{i}', loc=10,
                      prop=dict(backgroundcolor='black',
                                size=10, color='white', weight='bold'))
    cax.add_artist(at)

# 11 features on a 12-slot grid: hide the unused bottom-right panel.
ax[-1][-1].set_visible(False)
# sharex=True only keeps x tick labels on the bottom row; since the bottom-right
# panel is hidden, re-enable them on the panel above it so the last column
# still shows an x scale.
ax[-2][-1].tick_params(labelbottom=True)

fig.text(0.018, 1.03, 'Continuous Feature Distribution [Train]', fontsize=20, fontweight='bold', fontfamily='serif')

plt.tight_layout()
plt.show()

In [None]:
# fig, ax = plt.subplots(3, 4, figsize=(17, 12), sharex=True)
# fig.set_facecolor('#d0d0d0') 

# for i in range(11): 
#     sns.kdeplot(data=test, x=f'cont{i}', 
#                 fill=True,
#                 linewidth=0,
#                 color='#244747', alpha=1,
#                 ax=ax[i%3][i//3])
#     ax[i%3][i//3].set_yticks([])
#     ax[i%3][i//3].set_xlabel('',visible=False)
#     ax[i%3][i//3].margins(0.05, 0.2)
    
#     # dviider
#     divider = make_axes_locatable(ax[i%3][i//3])
#     cax = divider.append_axes("top", size="8%", pad=0)
#     cax.get_xaxis().set_visible(False)
#     cax.get_yaxis().set_visible(False)
#     cax.set_facecolor('black')

#     at = AnchoredText(f'cont{i}', loc=10, 
#                       prop=dict(backgroundcolor='black',
#                                 size=10, color='white', weight='bold'))
#     cax.add_artist(at)
    
# ax[-1][-1].set_visible(False)
# fig.text(0.018, 1.03, 'Continuous Feature Distribution [Test]', fontsize=20, fontweight='bold', fontfamily='serif')

# plt.tight_layout()
# plt.show()

## Correlation between Features

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
fig.set_facecolor('#d0d0d0')

# Correlate numeric columns only. Selecting them explicitly keeps this working on
# pandas >= 2.0, where DataFrame.corr() no longer silently drops non-numeric
# columns and raises instead, and it behaves the same on older versions.
corr = train.drop('id', axis=1).select_dtypes(include='number').corr()

# Mask the upper triangle (diagonal included) so each pair is shown once.
mask = np.triu(np.ones_like(corr, dtype=bool))

sns.heatmap(corr,
            square=True,
            linewidth=0.2,
            cbar=False,
            mask=mask,
            annot=True,
            center=0,
            cmap=sns.diverging_palette(240, 10),
            ax=ax)

fig.text(0.075, 1, 'Correlation: Continuous Feature & Target', fontweight='bold', fontfamily='serif', fontsize=15)
ax.set_xticklabels(ax.get_xticklabels(), fontfamily='serif', rotation=90, fontsize=11)
ax.set_yticklabels(ax.get_yticklabels(), fontfamily='serif', rotation=0, fontsize=11)
plt.tight_layout()
plt.show()

## Pair Grids

In [None]:
# Pairwise 2-D histograms on a 10,000-row subsample. The sample is seeded so the
# figure is reproducible across runs; the trailing ';' hides the PairGrid repr.
# NOTE(review): the positional slice [11:] presumably targets the cont*/target
# columns - confirm against the column layout.
sns.pairplot(train.iloc[:, 11:].sample(10000, random_state=0), kind="hist", corner=True);

## UMAP

Only the continuous features are used for dimensionality reduction.

In [None]:
from umap import UMAP

# Dimension Reduction
# Seeded subsample so the embedding is reproducible across runs.
train_sub = train.sample(30000, random_state=0)
target = train_sub['target']

# Keep only the continuous features. Dropping just the cat* columns, as before,
# left `id` and `target` in the input matrix, which fed the label itself (and a
# meaningless row id) into the embedding.
data_sub = train_sub[[col for col in train_sub.columns if col.startswith('cont')]]

# `target` is passed as y, i.e. the embedding is fitted in supervised mode;
# class separation in the resulting plot is therefore partly by construction.
umap = UMAP(random_state=0)
train_umap = umap.fit_transform(data_sub, target)

In [None]:
fig, ax = plt.subplots(2, 2, figsize=(12, 12))

# Compute the per-class point sets once; np.asarray turns the boolean mask into
# a plain positional array before it is used to index the embedding.
pos_xy = train_umap[np.asarray(target == 1)]
neg_xy = train_umap[np.asarray(target == 0)]

# Top-left: both classes overlaid (target 1 first, target 0 on top).
ax[0][0].scatter(pos_xy[:, 0], pos_xy[:, 1], c='#8abbd0', alpha=0.25, label='Target:1')
ax[0][0].scatter(neg_xy[:, 0], neg_xy[:, 1], c='#4a4a4a', alpha=0.25, label='Target:0')

# Top-right stays empty: the figure-level text below is placed over that area.
ax[0][1].set_visible(False)

# Bottom row: one class per panel.
ax[1][0].scatter(pos_xy[:, 0], pos_xy[:, 1], c='#8abbd0', alpha=0.25, label='Target:1')
ax[1][1].scatter(neg_xy[:, 0], neg_xy[:, 1], c='#4a4a4a', alpha=0.25, label='Target:0')

for i in range(2):
    for j in range(2):
        ax[i][j].set_xticks([])
        ax[i][j].set_yticks([])
        for s in ["top", "right", "left", 'bottom']:
            ax[i][j].spines[s].set_visible(False)

        # Skip the hidden, empty panel: calling legend() on axes without any
        # labelled artist only produces a warning.
        if ax[i][j].get_visible():
            ax[i][j].legend()

# Text Part
fig.text(0.97, 1, '[UMAP] Embedding Space', fontweight='bold', fontfamily='serif', fontsize=20, ha='right')

fig.text(0.97, 0.94, '''
In dimension reduction, 
you can see that the data with target=0 are gathered in the center.
Performance can be expected when used as a feature.

''', 
         fontweight='light', fontfamily='serif', fontsize=12, va='top', ha='right')

plt.tight_layout()
plt.show()


