<center><h1>TPS April 2022</h1></center>
<center><h1>Inspecting the Labels</h1></center>

# Simple Setup

In [None]:
# Start every run from a cleared namespace so stale variables cannot leak in.
# (-f skips the confirmation prompt; -s is IPython's "soft" reset — presumably
# it keeps the input/output history, verify against the %reset documentation.)
%reset -sf

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last.
# Bare strings placed in the middle of later cells rely on this to be shown.
InteractiveShell.ast_node_interactivity = 'all'

import warnings
# Silence all warnings for the rest of the session (deprecation notices included).
warnings.filterwarnings(action='ignore')

# Data

In [None]:
from pandas import read_csv, DataFrame, Series, IndexSlice
#from pathlib import Path
#list(Path('/kaggle/input').rglob('*.*'))

# Read the three competition tables in one pass: features for training,
# the training labels, and the features to predict on (in that order).
train, y_train, test = map(read_csv, (
    '/kaggle/input/tabular-playground-series-apr-2022/train.csv',
    '/kaggle/input/tabular-playground-series-apr-2022/train_labels.csv',
    '/kaggle/input/tabular-playground-series-apr-2022/test.csv',
))

# Reduce memory

In [None]:
from numpy import float64, float32, int64, int32, dtype

def reduce_mem(df):
    """Return a copy of ``df`` with 64-bit numeric columns narrowed to 32 bits.

    ``float64`` columns become ``float32``. ``int64`` columns become ``int32``
    only when every value fits the int32 range; otherwise they are left as
    ``int64``, because a plain cast would silently wrap out-of-range values.
    Columns of any other dtype are returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is never modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, index and column order.
    """
    int32_min, int32_max = -2**31, 2**31 - 1

    narrowed = {}
    for col in df:
        current = df[col].dtype
        if current == dtype(int64):
            values = df[col]
            # An empty column trivially fits; min()/max() would be NaN there.
            if values.empty or (values.min() >= int32_min and values.max() <= int32_max):
                narrowed[col] = int32
        elif current == dtype(float64):
            narrowed[col] = float32

    # astype builds a new frame; fall back to copy() when nothing needs casting
    # so the caller always receives an independent object.
    return df.astype(narrowed) if narrowed else df.copy()

# Shrink both feature tables (train first, then test); labels stay as loaded.
train, test = (reduce_mem(frame) for frame in (train, test))

# Labels

## <em>First, let's reduce each sequence to a few scalar summary statistics</em>

In [None]:
# Collapse every (subject, sequence) group to a single row holding the
# per-column median. No `values=` is given, so every remaining column is
# aggregated — that includes `step`, not only the sensor readings.
# NOTE(review): pivot_table sorts rows by the index keys, so rows come out
# ordered by subject first, then sequence. That is presumably not the row order
# of y_train — verify before applying label masks positionally.
train_ = train.pivot_table(
    index=['subject', 'sequence'],
    aggfunc='median'
)

# Bare string: rendered as cell output to act as inline commentary.
f'I decide to summarize sequences by using a simple median'

## <em>Second, let's project that data into 2 dimensions, for easy visualization</em>

In [None]:
from sklearn.decomposition import PCA

# Fit a 2-component PCA on the per-sequence medians and keep the projected
# coordinates (one row per row of train_, in the same order).
# NOTE(review): the columns are not standardized first, so columns with larger
# variance will dominate the components — confirm this is intended.
pca = PCA(n_components=2)
train_pca = pca.fit_transform(train_)

# Share of the total variance retained by the two components, as a percentage.
f'These 2 components explain {pca.explained_variance_ratio_.sum()*100:.2f}% of original data variance'

## <em>What does it look like?</em>

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
sns.set_context('talk')

fig, ax = plt.subplots(1, 1, figsize=(12,8), sharex=True, sharey=True, constrained_layout=True)

# train_ (and therefore train_pca) is ordered by (subject, sequence), while
# y_train has one row per sequence in file order. Masking train_pca positionally
# with y_train would pair points with the wrong labels, so look each row's state
# up by its sequence id instead.
# Assumes y_train has a `sequence` column next to `state` — TODO confirm.
state_by_row = (
    y_train.set_index('sequence')['state']
    .reindex(train_.index.get_level_values('sequence'))
    .to_numpy()
)

for state_value, edge, legend_label in ((0, 'blue', 'State 0'), (1, 'orange', 'State 1')):
    rows = state_by_row == state_value
    _ = ax.scatter(train_pca[rows, 0], train_pca[rows, 1], facecolor='none', edgecolor=edge, alpha=0.5, label=legend_label)
_ = ax.legend()

f'At this stage, it doesn\'t seem like there is a clear distinction between states'
f'So, let\'s try non-linear decomposition'

## <em>Non-Linear Decomposition</em>

In [None]:
import umap

# Non-linear 2-D embedding of the per-sequence medians (default UMAP settings).
mani = umap.UMAP()
train_umap = mani.fit_transform(train_)

fig, ax = plt.subplots(1, 1, figsize=(12,8), sharex=True, sharey=True, constrained_layout=True)

# train_ (and therefore train_umap) is ordered by (subject, sequence), while
# y_train has one row per sequence in file order. Masking the embedding
# positionally with y_train would pair points with the wrong labels, so look
# each row's state up by its sequence id instead.
# Assumes y_train has a `sequence` column next to `state` — TODO confirm.
state_by_row = (
    y_train.set_index('sequence')['state']
    .reindex(train_.index.get_level_values('sequence'))
    .to_numpy()
)

for state_value, edge, legend_label in ((0, 'blue', 'State 0'), (1, 'orange', 'State 1')):
    rows = state_by_row == state_value
    _ = ax.scatter(train_umap[rows, 0], train_umap[rows, 1], facecolor='none', edgecolor=edge, alpha=0.5, label=legend_label)
_ = ax.legend()

f'Even with non-linear decomposition, it doesn\'t seem like there is a clear distinction between states'

## <em>Let's return to the sequences</em>

In [None]:
# Wide layout: one row per sequence, one column per (variable, step) pair.
# Cells hold the mean, which is pivot_table's default aggfunc.
# NOTE(review): no `values=` is given, so `subject` is pivoted as a feature too
# (one column per step) — confirm that is intended before reading the
# projections built on this table.
train__ = train.pivot_table(
    index='sequence',
    columns='step'
)

## <em>And apply PCA again</em>

In [None]:
# Two-component PCA of the wide per-step table; rows follow train__'s index.
pca = PCA(n_components=2)
train_pca = pca.fit_transform(train__)

f'These 2 components explain {pca.explained_variance_ratio_.sum()*100:.2f}% of original data variance'

fig, ax = plt.subplots(1, 1, figsize=(12,8), sharex=True, sharey=True, constrained_layout=True)

# One scatter layer per state. The masks are positional: row i of y_train is
# taken to describe row i of train_pca.
for state_value, edge, legend_label in ((0, 'blue', 'State 0'), (1, 'orange', 'State 1')):
    rows = (y_train['state']==state_value)
    _ = ax.scatter(train_pca[rows,0], train_pca[rows,1], facecolor='none', edgecolor=edge, alpha=0.5, label=legend_label)
_ = ax.legend()

f'We may start to see some distinction between states'

## <em>And UMAP again</em>

In [None]:
# Embed the wide per-step table (train__), mirroring the PCA just above.
# The previous version fitted on train_ (the per-sequence medians) again, so it
# only repeated the earlier embedding and plotted it with labels in a different
# row order.
mani = umap.UMAP()
train_umap = mani.fit_transform(train__)

fig, ax = plt.subplots(1, 1, figsize=(12,8), sharex=True, sharey=True, constrained_layout=True)

# One scatter layer per state. The masks are positional: train__ is indexed by
# sequence, and row i of y_train is taken to describe row i of the embedding.
for state_value, edge, legend_label in ((0, 'blue', 'State 0'), (1, 'orange', 'State 1')):
    rows = (y_train['state']==state_value)
    _ = ax.scatter(train_umap[rows,0], train_umap[rows,1], facecolor='none', edgecolor=edge, alpha=0.5, label=legend_label)
_ = ax.legend()

## <em>For the sake of comparison, let's apply the same techniques to MNIST</em>

In [None]:
from torch import load

# NOTE(review): torch.load may unpickle arbitrary objects depending on the torch
# version and its weights_only setting — only load files from a trusted source.
mnist = load('/kaggle/input/pytorch-mnist/training.pt')

# Flatten each image to a 784-long vector and keep just the first 10000
# images (and their digit labels) so the projections stay quick.
m_train = mnist[0].reshape(-1, 784).numpy()[:10000]
m_y_train = mnist[1].numpy()[:10000]

pca = PCA(n_components=2)
m_train_pca = pca.fit_transform(m_train)

f'These 2 components explain {pca.explained_variance_ratio_.sum()*100:.2f}% of original data variance'

fig, ax = plt.subplots(1, 1, figsize=(12,8), sharex=True, sharey=True, constrained_layout=True)
colors = ['blue', 'orange', 'red', 'purple', 'yellow', 'grey', 'green', 'brown', 'aquamarine', 'navy']
# One colour per digit: position in `colors` is the digit being drawn.
for i, c in enumerate(colors):
    digit_rows = m_y_train==i
    _ = ax.scatter(m_train_pca[digit_rows,0], m_train_pca[digit_rows,1], facecolor='none', edgecolor=c, alpha=0.25, label=f'Number {i}')
_ = ax.legend()

## <em>And with UMAP</em>

In [None]:
# UMAP embedding of the MNIST subset (default settings).
# NOTE(review): this overwrites `train_umap`, which earlier cells used for the
# competition data — re-run those cells before reusing their plots.
mani = umap.UMAP()
train_umap = mani.fit_transform(m_train)

fig, ax = plt.subplots(1, 1, figsize=(12,8), sharex=True, sharey=True, constrained_layout=True)
colors = ['blue', 'orange', 'red', 'purple', 'yellow', 'grey', 'green', 'brown', 'aquamarine', 'navy']
# One colour per digit: position in `colors` is the digit being drawn.
for i, c in enumerate(colors):
    digit_rows = m_y_train==i
    _ = ax.scatter(
        train_umap[digit_rows,0],
        train_umap[digit_rows,1],
        facecolor='none',
        edgecolor=c,
        alpha=0.5,
        label=f'Number {i}',
    )
_ = ax.legend()

## Conclusion

### - There doesn't seem to be a clear distinction between States given the Features.
### - What else could we do in order to assess this?
### - Feel free to suggest. 
### - Bye.

## Feel free to upvote if you like. Thanks