# Pressure / MAE exploration with umap / hdbscan

The general idea of this notebook is to explore where we get high MAE in terms of clusters of the target. We use UMAP and HDBSCAN for dimensionality reduction and clustering. This allows for 2D plots and aggregation of MAE by cluster, in order to prioritize which clusters we have to deal with. Another version of this notebook, exploring u_in and MAE and including some specific changes (masking u_in?), is available here (https://www.kaggle.com/lucasmorin/u-in-mae-exploration-with-umap-hdbscan).

# Base construction

In [None]:
import numpy as np
import pandas as pd

import optuna

import os 
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
# https://www.kaggle.com/c/ventilator-pressure-prediction/discussion/274717

import matplotlib.pyplot as plt
import tensorflow as tf, gc
from tensorflow import keras
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.callbacks import LearningRateScheduler, ReduceLROnPlateau
from tensorflow.keras.optimizers.schedules import ExponentialDecay

from sklearn.metrics import mean_absolute_error as mae
from sklearn.preprocessing import RobustScaler, normalize
from sklearn.model_selection import train_test_split, GroupKFold, KFold

from IPython.display import display

# Notebook-level switches.
# DEBUG: when True, only the first 80*1000 rows of the training file are kept.
# TRAIN_MODEL: defined here but not read by any of the cells shown below.
DEBUG = False
TRAIN_MODEL = False

# Raw competition files, read from the Kaggle input directory.
df_train = pd.read_csv('../input/ventilator-pressure-prediction/train.csv')
df_test = pd.read_csv('../input/ventilator-pressure-prediction/test.csv')
submission = pd.read_csv('../input/ventilator-pressure-prediction/sample_submission.csv')

if DEBUG:
    # Subsample the raw frame that feeds add_features(). The previous code
    # sliced `train`, a name that does not exist yet at this point, so turning
    # DEBUG on raised NameError. 80*1000 rows = 1000 breaths, assuming 80 rows
    # per breath as in the reshape(-1, 80) calls further down.
    # .copy() keeps later column assignments from touching a slice view.
    df_train = df_train.iloc[:80*1000].copy()

def add_features(df):
    """Build the per-time-step feature table from a raw ventilator frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``breath_id``, ``time_step``, ``u_in``,
        ``u_out``, ``R`` and ``C``. The frame is NOT modified; all work is
        done on a copy.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the input columns plus, in this order: ``area``,
        ``u_in_cumsum``, lag/lead columns for shifts 1..4, per-breath maxima,
        differences to the lags, differences to the per-breath max/mean,
        ``cross``/``cross2``; finally ``R``, ``C`` and ``R__C`` are replaced
        by their one-hot encodings (``pd.get_dummies``). Every missing value
        present after the shifts is replaced by 0.

    Notes
    -----
    The column order is part of the contract: downstream cells index
    features by position after scaling.
    """
    # Work on a copy so the caller's frame does not silently grow new columns.
    df = df.copy()
    breath = df['breath_id']

    # Cumulative time_step * u_in and cumulative u_in within each breath.
    df['area'] = (df['time_step'] * df['u_in']).groupby(breath).cumsum()
    df['u_in_cumsum'] = df['u_in'].groupby(breath).cumsum()

    # Lags (shift k) and leads (shift -k) within each breath. The insertion
    # order (u_in lag, u_out lag, u_in lead, u_out lead for k = 1..4) is kept
    # identical to the original hand-written sequence.
    u_in_by_breath = df.groupby('breath_id')['u_in']
    u_out_by_breath = df.groupby('breath_id')['u_out']
    for k in range(1, 5):
        df[f'u_in_lag{k}'] = u_in_by_breath.shift(k)
        df[f'u_out_lag{k}'] = u_out_by_breath.shift(k)
        df[f'u_in_lag_back{k}'] = u_in_by_breath.shift(-k)
        df[f'u_out_lag_back{k}'] = u_out_by_breath.shift(-k)
    # Shifting leaves NaN at the breath edges; fill every NaN with 0.
    df = df.fillna(0)

    # Per-breath statistics, computed once and reused below.
    u_in_by_breath = df.groupby('breath_id')['u_in']
    u_in_max = u_in_by_breath.transform('max')
    df['breath_id__u_in__max'] = u_in_max
    df['breath_id__u_out__max'] = df.groupby('breath_id')['u_out'].transform('max')

    for k in (1, 2):
        df[f'u_in_diff{k}'] = df['u_in'] - df[f'u_in_lag{k}']
        df[f'u_out_diff{k}'] = df['u_out'] - df[f'u_out_lag{k}']

    # Distance of the current u_in to the breath's max / mean
    # (the original computed these two columns twice with identical results).
    df['breath_id__u_in__diffmax'] = u_in_max - df['u_in']
    df['breath_id__u_in__diffmean'] = u_in_by_breath.transform('mean') - df['u_in']

    for k in (3, 4):
        df[f'u_in_diff{k}'] = df['u_in'] - df[f'u_in_lag{k}']
        df[f'u_out_diff{k}'] = df['u_out'] - df[f'u_out_lag{k}']

    df['cross'] = df['u_in'] * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']

    # Treat R, C and their combination as categoricals so that get_dummies
    # one-hot encodes them (it only encodes string/object/category columns).
    df['R'] = df['R'].astype(str)
    df['C'] = df['C'].astype(str)
    df['R__C'] = df['R'] + '__' + df['C']
    return pd.get_dummies(df)

# Engineer features for both splits.
train = add_features(df_train)
test = add_features(df_test)

# One row of 80 pressure values per breath; the target is then removed from
# the feature table together with the two identifier columns.
targets = train['pressure'].to_numpy().reshape(-1, 80)
train = train.drop(columns=['pressure', 'id', 'breath_id'])
test = test.drop(columns=['id', 'breath_id'])

# Scale with statistics fitted on the training rows only.
RS = RobustScaler()
train = RS.fit_transform(train)
test = RS.transform(test)

# Fold the flat (rows, features) matrices into (breaths, 80, features).
n_features = train.shape[-1]
train = train.reshape(-1, 80, n_features)
test = test.reshape(-1, 80, n_features)

# Infer train data to compare

In [None]:
# Settings carried over from the training notebook. Only BATCH_SIZE and
# NUM_FOLDS are read in this cell; EPOCH is not used here.
EPOCH = 300
BATCH_SIZE = 1024
NUM_FOLDS = 10

gpu_strategy = tf.distribute.get_strategy()

with gpu_strategy.scope():
    kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=2021)
    train_preds = []
    for fold, (train_idx, test_idx) in enumerate(kf.split(train, targets)):
        # Only the first fold's checkpoint is used; the loop stops afterwards.
        if fold >= 1:
            break
        K.clear_session()
        print('-'*15, '>', f'Fold {fold+1}', '<', '-'*15)
        checkpoint_filepath = f"folds{fold}.hdf5"
        model_path = '../input/finetune-of-tensorflow-bidirectional-lstm/' + checkpoint_filepath
        model = keras.models.load_model(model_path)
        # Predictions are made on the WHOLE training tensor (the fold indices
        # are not used), then flattened to one value per (breath, time step).
        fold_pred = model.predict(train, batch_size=BATCH_SIZE, verbose=2)
        train_preds.append(fold_pred.squeeze().reshape(-1, 1).squeeze())
        # Free the model before anything else is allocated.
        del model
        gc.collect()

In [None]:
# Average the per-fold predictions once and reuse the result below.
mean_preds = np.mean(train_preds, axis=0)

# Sizes are taken from the data instead of being hard-coded (75450 x 80), so
# the cell keeps working when a subset of the training file is loaded.
n_breaths, seq_len = targets.shape

df = pd.DataFrame({
    'target': targets.flatten(),
    'preds': mean_preds,
    # Sequential 1-based breath counter repeated for every time step
    # (this is a position, not the original breath_id).
    'id': np.repeat(np.arange(1, n_breaths + 1), seq_len),
})
preds_by_id = mean_preds.reshape(-1, seq_len)

# Feature 2 of the scaled tensor equal to -1 marks the rows that are scored.
# NOTE(review): presumably this is the scaled u_out with raw u_out == 0 mapped
# to -1 by RobustScaler — confirm against the column order of add_features.
df['u_out_mask'] = (train[:,:,2] == -1).flatten()
df['error'] = np.abs(df['target']-df['preds'])

# mask() blanks (NaN) every column of the rows outside the mask, including
# 'id', so those rows drop out of the groupby and the index becomes float.
df_masked = df.mask(~df['u_out_mask'])
MAE_id = df_masked.groupby('id')['error'].mean()

# Basic Exploration

In [None]:
# Summary statistics of the per-id mean absolute error.
MAE_id.describe()

In [None]:
# Histogram of log(MAE) per id; the log makes the long right tail readable.
np.log(MAE_id).hist(bins=100)

In [None]:
# Per-id MAE ordered from worst (largest) to best.
MAE_id_sorted = MAE_id.sort_values(ascending=False)

# worst predictions

In [None]:
# Plot the 10 ids with the largest MAE: input, target vs prediction, residual.
for i in MAE_id_sorted.index[:10]:
    # `np.int` was removed in NumPy 1.24 (deprecated since 1.20); the builtin
    # int is the documented replacement.
    ind = int(i)
    # Boolean row selector over all ids; exactly one entry is True.
    ind_labels = (MAE_id.index.values == i)

    # The label says "cluster", but `i` here is an entry of MAE_id's index
    # (one id), not an HDBSCAN cluster label.
    print('cluster:'+str(i) + ' MAE: '+str(np.round(MAE_id_sorted[i],3)))

    plt.figure(figsize=(24,8))

    plt.subplot(1, 3, 1)
    plt.plot(train[ind_labels,:,1].transpose());
    plt.title('u_in')

    plt.subplot(1, 3, 2)
    plt.plot(targets[ind_labels].transpose(), label='target');
    plt.plot(preds_by_id[ind_labels].transpose(), label='preds');
    plt.legend()
    plt.title('pressure')

    plt.subplot(1, 3, 3)
    plt.plot((targets[ind_labels] - preds_by_id[ind_labels]).transpose());
    plt.title('error')

    plt.show()

The LSTM seems to be able to predict weird noise, but has a constant error on some ids. However, such errors seem rather rare. The instance with an MAE of 15 only accounts for 0.0002 of the global MAE. We must find out whether some clusters of outputs contribute more to the global error (lower individual MAE, but a high number of instances).

# UMAP / Hdbscan

In [None]:
# Install hdbscan from a wheel file found under ../input:
# copy it into a local directory and point pip at that directory only
# (--no-index disables the package index, --find-links names the directory).
!mkdir -p /tmp/pip/cache/
!cp ../input/hdbscan0827-whl/hdbscan-0.8.27-cp37-cp37m-linux_x86_64.whl /tmp/pip/cache/
!pip install --no-index --find-links /tmp/pip/cache/ hdbscan

# exploration on pressure - UMAP Embedding - hdbscan clustering

As the idea is to observe the target, we embed pressure with UMAP, then fit HDBSCAN on the embedding. It may be better to fit on the original data.


In [None]:
%%time

import hdbscan
import umap

n=75450

# Switch to do the exploration on u_in
#X = train[:,:,1]
X = targets

reducer = umap.UMAP(random_state=42, n_components=2)
embedding = reducer.fit_transform(X)
clusterer = hdbscan.HDBSCAN(prediction_data=True, min_cluster_size = 50).fit(embedding)
u, counts = np.unique(clusterer.labels_, return_counts=True)

print(u)
print(counts)

In [None]:
# 2-D embedding, one point per embedded row, coloured by its HDBSCAN label.
plt.figure(figsize=(10, 8))
plt.scatter(embedding[:, 0], embedding[:, 1], s=5, c=clusterer.labels_, edgecolors='none', cmap='jet');

This doesn't seem to be a very good clustering... there seem to be a lot of 'outliers'.

# Plot error - log_scale

In [None]:
import matplotlib.colors as colors

# Take the first n entries BY POSITION so that entry k lines up with row k of
# `embedding`. MAE_id has a float index, and a plain integer slice on such a
# Series is ambiguous (label-based vs positional) and deprecated in pandas.
mae_for_plot = MAE_id.iloc[:n]

# Log colour scale clipped to the 5%-95% quantile range of the plotted values.
log_norm = colors.LogNorm(vmin=mae_for_plot.quantile(0.05), vmax=mae_for_plot.quantile(0.95))

plt.figure(figsize=(10, 8))
plt.scatter(embedding[:, 0], embedding[:, 1], s=5, c=mae_for_plot, edgecolors='none', cmap='jet', norm=log_norm);
plt.colorbar();

# MAE by cluster

This is not exact, as the mean of the per-breath MAE is not exactly the evaluation metric (due to u_out not starting at the same time), but it should be fine for exploration.

In [None]:
# Pair every id's MAE with the HDBSCAN label of the same row.
MAE_by_cluster = pd.DataFrame({'cluster': clusterer.labels_, 'MAE': MAE_id})

# Total MAE accumulated inside each cluster (index: sorted cluster labels).
MAE_cluster = MAE_by_cluster.groupby('cluster')['MAE'].sum()

# Subtract a baseline of 0.15 per member, then divide by the total number of
# rows. `counts` comes from np.unique on the same labels, so it is in the same
# sorted-label order as MAE_cluster and the arithmetic lines up by position.
MAE_cluster_global = ((MAE_cluster - counts * 0.15) / counts.sum()).sort_values(ascending=False)

In [None]:
# Excess-MAE contribution per cluster, largest first.
MAE_cluster_global

In [None]:
# For the 10 clusters with the largest contribution: where the cluster sits in
# the embedding, then its u_in curves, pressure curves and residuals.
for i in MAE_cluster_global.index[:10]:
    ind_labels = clusterer.labels_ == i
    print(f'cluster:{i} Global MAE: {np.round(MAE_cluster_global[i], 4)} count: {np.sum(ind_labels)}')

    plt.figure(figsize=(24, 6))

    # Members are drawn larger (size 105 vs 5) and in a different colour.
    plt.subplot(1, 4, 1)
    plt.scatter(embedding[:, 0], embedding[:, 1], s=5+100*ind_labels, c=ind_labels, edgecolors='none', cmap='viridis');
    plt.title('position')

    # Remaining panels: one line per member of the cluster.
    panels = (
        ('u_in', train[ind_labels, :, 1]),
        ('pressure', targets[ind_labels]),
        ('error: target-preds', targets[ind_labels] - preds_by_id[ind_labels]),
    )
    for position, (panel_title, curves) in enumerate(panels, start=2):
        plt.subplot(1, 4, position)
        plt.plot(curves.transpose());
        plt.title(panel_title)

    plt.show()

We have found some clusters with high MAE. Cluster 4, with 0.52 MAE and 360 instances, accounts for 0.002 of the global MAE, while a single instance with 15 MAE accounts for 0.0002 of the global MAE. It appears to be 10 times more important to deal with this cluster than to deal with the highest individual MAE.

Regarding specific clusters and how to deal with them, some error spikes seem concentrated. We might want to try some adapted features.

# Study by R and C

In [None]:
# One (R, C) pair per breath_id. min() simply picks one value per breath;
# presumably R and C are constant within a breath — TODO confirm.
df_rc = df_train[['breath_id','R','C']].groupby('breath_id').min()

# Total number of breaths over ALL (R, C) settings. Each cluster's excess error
# is divided by this number so that the printed "Global MAE" really is a share
# of the overall MAE. The previous code divided by the size of the current
# (R, C) subset, which overstated the contribution.
n_breaths_total = len(df_rc)

for r in [5,20,50]:
    for c in [10,20,50]:

        print('r: '+str(r)+', c: '+str(c))
        ind_rc = (df_rc.R==r) & (df_rc.C==c)
        # Plain boolean array used as a positional row selector.
        # assumes rows of targets/train/preds_by_id/MAE_id follow the same
        # sorted-breath_id order as df_rc — TODO confirm
        rc_mask = ind_rc.values

        X = targets[rc_mask]
        #u_in
        X_train = train[rc_mask,:,1]
        X_targets = targets[rc_mask]
        X_preds = preds_by_id[rc_mask]
        MAE_rc = MAE_id[rc_mask]

        reducer = umap.UMAP(random_state=42, n_components=2)

        # switch to embed on u_in (not tested)
        #embedding = reducer.fit_transform(X_train)
        embedding = reducer.fit_transform(X)

        clusterer = hdbscan.HDBSCAN(prediction_data=True, min_cluster_size = 20).fit(embedding)
        u, counts = np.unique(clusterer.labels_, return_counts=True)

        # Left: embedding coloured by cluster label. Right: coloured by MAE on
        # a fixed log scale (0.01 .. 1) so panels are comparable across (R, C).
        plt.figure(figsize=(20, 8))
        plt.subplot(1, 2, 1)
        plt.scatter(embedding[:, 0], embedding[:, 1], s=5, c=clusterer.labels_, edgecolors='none', cmap='jet');
        plt.title('clusters')
        plt.subplot(1, 2, 2)
        plt.scatter(embedding[:, 0], embedding[:, 1], s=5, c=MAE_rc, edgecolors='none', cmap='jet', norm=colors.LogNorm(vmin=1e-2, vmax=1e0));
        plt.title('MAE')
        plt.colorbar();
        plt.show();

        MAE_by_cluster = pd.DataFrame({'cluster':clusterer.labels_,'MAE':MAE_rc})
        MAE_cluster = MAE_by_cluster.groupby('cluster')['MAE'].sum()
        # Remove a baseline of 0.15 per member, then divide by the global
        # number of breaths. `counts` and MAE_cluster are both in sorted-label
        # order, so the subtraction lines up by position.
        MAE_cluster_global = (MAE_cluster - counts * 0.15)/n_breaths_total
        MAE_cluster_global = MAE_cluster_global.sort_values(ascending=False)

        # Detail plots for the 3 clusters with the largest contribution.
        for i in MAE_cluster_global.index[:3]:
            ind_labels = (clusterer.labels_ == i)
            print('cluster:'+str(i) + ' Global MAE: '+str(np.round(MAE_cluster_global[i],4)) + ' count: '+str(np.sum(ind_labels)))

            plt.figure(figsize=(24,6))
            plt.subplot(1, 4, 1)
            plt.scatter(embedding[:, 0], embedding[:, 1], s=5+100*ind_labels, c=ind_labels, edgecolors='none', cmap='viridis');
            plt.title('position')
            plt.subplot(1, 4, 2)
            plt.plot(X_train[ind_labels,:].transpose());
            plt.title('u_in')
            plt.subplot(1, 4, 3)
            plt.plot(X_targets[ind_labels].transpose());
            plt.title('pressure')
            plt.subplot(1, 4, 4)
            plt.plot((X_targets[ind_labels] - X_preds[ind_labels]).transpose());
            plt.title('error: target-preds')
            plt.show()
        
        
        
        

Some observations :

    - High Errors appear quite clustered for some R&C values. 
    
    - This does seem exploitable, as we identify some clusters that seem to be responsible for more than 0.01 MAE globally.
    
    - There might be some work to get better parameters for hdbscan (the three clusters for r: 5, c: 50 should be one). 
    
    - From the u_in graph and further investigation (coloring by first and last u_in - see below), the weirdest cluster appears to be linked to the first and last values of u_in.

# Role of u_in_first, u_in_last

In [None]:
# One (R, C) pair per breath_id. min() simply picks one value per breath;
# presumably R and C are constant within a breath — TODO confirm.
df_rc = df_train[['breath_id','R','C']].groupby('breath_id').min()

# Single (R, C) setting; the loops are kept so other settings can be added.
for r in [20]:
    for c in [50]:

        print('r: '+str(r)+', c: '+str(c))
        ind_rc = (df_rc.R==r) & (df_rc.C==c)
        # Plain boolean array used as a positional row selector.
        # assumes rows of targets/train/preds_by_id/MAE_id follow the same
        # sorted-breath_id order as df_rc — TODO confirm
        rc_mask = ind_rc.values

        X = targets[rc_mask]
        #u_in
        X_train = train[rc_mask,:,1]
        X_targets = targets[rc_mask]
        X_preds = preds_by_id[rc_mask]

        reducer = umap.UMAP(random_state=42, n_components=2)
        embedding = reducer.fit_transform(X)

        clusterer = hdbscan.HDBSCAN(prediction_data=True, min_cluster_size = 20).fit(embedding)
        u, counts = np.unique(clusterer.labels_, return_counts=True)

        # Left: embedding coloured by cluster label. Right: coloured by MAE on
        # a fixed log scale (0.01 .. 1).
        plt.figure(figsize=(20, 8))
        plt.subplot(1, 2, 1)
        plt.scatter(embedding[:, 0], embedding[:, 1], s=5, c=clusterer.labels_, edgecolors='none', cmap='jet');
        plt.title('clusters')
        plt.subplot(1, 2, 2)
        plt.scatter(embedding[:, 0], embedding[:, 1], s=5, c=MAE_id[rc_mask], edgecolors='none', cmap='jet', norm=colors.LogNorm(vmin=1e-2, vmax=1e0));
        plt.title('MAE')
        plt.colorbar();
        plt.show();

        # Same embedding coloured by the first and the last u_in value of each
        # breath. X_train is (breaths, time steps), so the first time step is
        # column 0. The previous code used column 1, i.e. the SECOND time
        # step, under the 'u_in_first' title.
        plt.figure(figsize=(20, 8))
        plt.subplot(1, 2, 1)
        plt.scatter(embedding[:, 0], embedding[:, 1], s=5, c=X_train[:,0], edgecolors='none', cmap='jet');
        plt.title('u_in_first')
        plt.colorbar();
        plt.subplot(1, 2, 2)
        plt.scatter(embedding[:, 0], embedding[:, 1], s=5, c=X_train[:,-1], edgecolors='none', cmap='jet');
        plt.title('u_in_last')
        plt.colorbar();
        plt.show();

 My idea was to get the first and last values separately. I implemented some dedicated functions (in the OPTIVER competition). But it doesn't exactly seem optimal to build constant time series for an LSTM. Maybe we can build a second head that would take individual time series features. From my time series feature engineering notebook (https://www.kaggle.com/lucasmorin/time-series-agregation-functions) we have the following proof of concept for aggregating individual features:

In [None]:
def get_first(x):
    """Return the first element (by position) of a pandas object."""
    return x.iloc[0]


def get_last(x):
    """Return the last element (by position) of a pandas object."""
    return x.iloc[-1]

# Aggregations to apply per column; the function names become column suffixes.
get_first_fn = [get_first, get_last]
create_feature_dict = {'u_in': get_first_fn}

# One row per breath, with a two-level (column, function) header ...
train_features = df_train.groupby('breath_id').agg(create_feature_dict)
# ... flattened to single names such as 'u_in_get_first'.
train_features.columns = list(map('_'.join, train_features.columns))

train_features.head()