In [None]:
import pandas as pd
import numpy as np
import datetime,os,gc
import matplotlib.pyplot as plt

import random
# random.seed() only seeds Python's own generator; NumPy's global generator
# (np.random.*) keeps its own state and needs a separate seed for repeatable runs.
random.seed(1234)
np.random.seed(1234)

# This examines the feature 'time_step' and derivatives via unsupervised learning using TSNE and UMAP

It was motivated by a [discussion](https://www.kaggle.com/c/ventilator-pressure-prediction/discussion/285435) exchanged with [Chris Deotte](https://www.kaggle.com/cdeotte).

I initially produced a t-SNE embedding [here](https://www.kaggle.com/c/ventilator-pressure-prediction/discussion/285773). The conclusion was that 'time_step', surprisingly, contains much information about the 'R' and 'C' features. But how much? One issue with that earlier plot was that the labels of the learned clusters were not disjoint, which made the last plotted layer dominate the plots. On closer inspection, some clusters contained all three values of R or C, others had two values, and some had only one.

In this notebook, I color the clusters when only one of the values in R or C dominates the cluster. Clusters with two or more values will appear gray. It should give a better indicator for the correlation between time_step and R or C.

In [None]:
features = ['R', 'C', 'breath_id', 'time_step']

# Read only the columns listed in `features` from the competition's
# train and test files; every other column is skipped at parse time.
train = pd.read_csv(
    '../input/ventilator-pressure-prediction/train.csv',
    usecols=features,
)
test = pd.read_csv(
    '../input/ventilator-pressure-prediction/test.csv',
    usecols=features,
)

Data is read and then train and test are concatenated.

In [None]:
# Distinct breath identifiers of each split, kept for the sanity check below.
train_BID = train['breath_id'].unique()
test_BID = test['breath_id'].unique()

# Stack train on top of test into one frame with a fresh 0..n-1 index.
df = pd.concat([train, test], axis=0, ignore_index=True)

# The row count must be a multiple of 80, and the number of 80-row chunks
# must equal the number of distinct breath ids found in the two splits.
assert len(df) % 80 == 0
assert len(train_BID) + len(test_BID) == len(df) // 80

# Order rows by breath and then by time, and renumber them 0..n-1 as int64.
df = df.sort_values(['breath_id', 'time_step'])
df.index = np.arange(len(df), dtype=np.int64)

time_step is renamed to time_step0 for convenience.
time_step1 is the first-order difference.
If you want to examine higher-order differences, increase 'order'.

In [None]:
def add_features(df, order=1):
    """Add forward differences of ``time_step`` computed within each breath.

    ``time_step`` is renamed to ``time_step0``. Then, for ``k = 1 .. order``,
    the column ``time_step{k}`` is filled with the difference of
    ``time_step{k-1}`` between the next row and the current row of the same
    ``breath_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``breath_id`` column and a ``time_step`` column (or an
        already renamed ``time_step0``). It is modified in place.
        The backward difference is shifted up over the whole column rather
        than per group, so the rows of each breath should be contiguous and
        in time order.
    order : int, default 1
        Highest difference order to compute. ``order=1`` reproduces the
        previous behaviour of this function.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the added ``time_step{k}`` columns.
    """
    df.rename(columns={'time_step': 'time_step0'}, inplace=True)
    for i in range(order):
        df[f'time_step{i+1}'] = (
            df.groupby('breath_id')[f'time_step{i}']
            .diff()      # backward difference; NaN on the first row of each breath
            .shift(-1)   # move up one row so it becomes a forward difference
            .ffill()     # the last row of a breath has no successor: repeat the previous value
        )
    return df
df = add_features(df)

# let's pick 50000 distinct breathing patterns from train and test randomly.
# A dedicated seeded generator is used because seeding Python's `random` module
# does not seed NumPy, and replace=False prevents the same breath_id from being
# drawn twice (which would leave fewer than 50000 breaths after filtering).
sample_BID = np.random.default_rng(1234).choice(
    df.breath_id.unique(),
    size=min(50000, df.breath_id.nunique()),
    replace=False,
)
# comment the line below if you want to run over the entire data
df = df[df.breath_id.isin(sample_BID)]

In [None]:
!pip install -U scikit-learn
from sklearn.manifold import TSNE
import umap

# For each of the two columns, keep the value found in the first row of every
# consecutive 80-row chunk of df (the reshape fails if the row count is not a
# multiple of 80).
C, R = (np.array(df[name]).reshape(-1, 80)[:, 0] for name in ('C', 'R'))

import re

# Columns named 'time_step' followed by a digit (time_step0, time_step1, ...).
# The previous pattern 'time_step[[0-9]' had a stray '[' inside the character
# class, which also accepted a literal '[' and triggers a FutureWarning
# ("possible nested set") in the re module.
cluster_features = [col for col in df.columns if re.match(r'time_step[0-9]', col)]

In [None]:
def plot_embedding(embedding,embedding_name,label_feature,label_feature_name):
    """Show a 2-D embedding as one scatter panel per distinct label value.

    In every panel three layers are drawn, in this order:

    1. all points, light gray and translucent;
    2. the points whose label equals the panel's value, in colour;
    3. the points with any other label, in darker gray and slightly larger.

    Layer 3 is drawn last on purpose: where points of different labels
    overlap, the gray covers the colour, so only regions containing a single
    label value stay coloured.

    Parameters
    ----------
    embedding : array-like indexed as ``embedding[:, 0]`` and ``embedding[:, 1]``
        Coordinates of the embedded samples, one row per sample.
    embedding_name : str
        Text used as the figure title.
    label_feature : array-like, one entry per row of ``embedding``
        Label of each sample (the notebook passes the per-breath R or C).
    label_feature_name : str
        Prefix of the legend entry; the label value is appended to it.

    Notes
    -----
    The number of panels follows the number of distinct label values (the
    previous fixed 1x3 grid silently ignored any value after the third);
    colours are reused cyclically when there are more values than colours.
    """
    label_feature = np.asarray(label_feature)
    values = np.unique(label_feature)          # sorted distinct labels, computed once
    palette = ['y','k','b']
    point_size = 1
    n_panels = max(len(values), 1)             # keep the grid valid for empty input

    fig = plt.figure()
    for panel, value in enumerate(values):
        selected = label_feature == value      # boolean mask of the highlighted label
        ax = fig.add_subplot(1, n_panels, panel + 1)
        # plot all
        ax.scatter(embedding[:,0], embedding[:,1], s=point_size, c='0.8',
                   label='All', marker='.', alpha=0.25)
        # plot desired layer
        ax.scatter(embedding[selected,0], embedding[selected,1], s=point_size,
                   c=palette[panel % len(palette)],
                   label=label_feature_name + str(value), marker='.', alpha=1)
        # plot other than the desired layer (on top, so mixed regions turn gray)
        ax.scatter(embedding[~selected,0], embedding[~selected,1], s=point_size+1,
                   c='0.5', label='Other', marker='.', alpha=1)
        ax.legend()
    fig.suptitle(f'{embedding_name}')
    fig.set_size_inches(4 * n_panels, 4)       # 12 x 4 for the usual three panels
    plt.show()
    
for step in cluster_features:

    # Lay the column out as a float32 matrix with 80 consecutive rows of df
    # per matrix row; this is the input given to both embedding methods.
    dat = np.array(df[step]).reshape(-1, 80).astype(np.float32)

    tsne = TSNE(
        n_components=2,
        learning_rate='auto',
        init='random',
        perplexity=10,
        random_state=1234,
    ).fit_transform(dat)
    UMP = umap.UMAP(n_neighbors=15, random_state=1234).fit(dat)

    # Same plot order as before: TSNE then UMAP coloured by R, then by C.
    for labels, label_name in ((R, 'R'), (C, 'C')):
        for coords, method in ((tsne, 'TSNE '), (UMP.embedding_, 'UMAP ')):
            plot_embedding(coords, method + step, labels, label_name)

The plots above show that while certain clusters are almost exclusively dominated by one R or C value, many of them have more than two values. Thus it seems that although time_step carries much information about R and C, it is not sufficient to fully distinguish them.