In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from scipy import interpolate
from tqdm.auto import tqdm

My idea here was that perhaps the "magic" had to do with the data being messy. If that were the case, and if I could identify which samples needed to be replaced, I could use a nonparametric model such as a kNN regressor (with k=1, k=3, or so) to 'snap' those positions into place wherever the model fails to do so correctly.

Here, I take the outputs of one of my good single models (not the best one, but a good one) and examine its per-sample MAE sorted in descending order. Then I target the worst predictions for analysis and look at their neighbors.

An interesting note: the time deltas are heterogeneous. Because of this, I first resample everything onto a common time grid before applying the neighbor search. I also restrict the time span searched to the region up to the earliest breath exhale.

# To Proceed

In [None]:
# Load the saved out-of-fold arrays. np.load on an .npz returns an NpzFile that
# holds the archive open until it is closed, so read the arrays inside a `with`
# block instead of leaving the handle dangling for the rest of the session.
with np.load('../input/ventexp55/exp055_oof.npz') as data:
    preds = data['preds']          # model outputs compared against `pressures` below
    masks = data['masks']          # multiplies the errors and normalises the MAE below
    pressures = data['pressures']  # reference values the predictions are scored against
pressures.shape

In [None]:
train = pd.read_csv('../input/ventilator-pressure-prediction/train.csv')


def _leading_steps(series):
    """Reshape a flat column into rows of 80 values and keep the first 33 of each row."""
    return series.values.reshape(-1, 80)[:, :33]


train_uins = _leading_steps(train.u_in)
train_ts = _leading_steps(train.time_step)
train_pressures = _leading_steps(train.pressure)

In [None]:
# Masked absolute error per step, averaged over each sample's masked steps.
abs_err = np.abs(masks * (pressures - preds))
mae = abs_err.sum(axis=-1) / masks.sum(axis=-1)
# Sample indices ordered from largest MAE to smallest.
mae_sort = np.argsort(mae)[::-1]
mae

In [None]:
# Histogram of the per-sample MAE values, followed by the single largest one.
plt.hist(mae, bins=100)
plt.show()
mae[mae_sort[0]]

In [None]:
# Our goal: overall mean MAE next to the mean over samples whose MAE is at most 1.1.
np.mean(mae), np.mean(mae[mae <= 1.1])

Hahaha yeah right. I wish.

In [None]:
# We only need to try and fix the worst 675 predictions to have this
# much of an effect on MAE!
np.sum(mae > 1.1)

# Resampling

I find a good average timestep size and then resample to 1/3 of that so that we have enough resolution to cover samples that aren't nicely aliased by the sampling fs.

In [None]:
# To make time easier, discretise it into buckets. The average gap between
# consecutive time steps is the bucket size (the original note quotes
# 0.03144001960754393); the grid below uses a third of it.
mean_ts_step = np.diff(train_ts, axis=-1).mean()

# Common time grid from 0 up to the largest time step selected by `masks`.
# The mask is cast to bool so this is always a boolean selection: a 0/1 integer
# mask would otherwise be interpreted as row indices and pick the wrong values.
resampled_ts = np.arange(0, train_ts[masks.astype(bool)].max(), mean_ts_step / 3)

n_samples = train_uins.shape[0]
grid_shape = (n_samples, resampled_ts.shape[0])
resampled_uins = np.zeros(grid_shape)
resampled_pressures = np.zeros(grid_shape)
resampled_preds = np.zeros(grid_shape)

for idx_sample in tqdm(range(n_samples)):
    sample_ts = train_ts[idx_sample]

    # Akima interpolation (not a cubic spline) of each signal on the sample's own time axis.
    # uin is the input
    # pressure is the target we hope to clip our predictions to
    interp_uin      = interpolate.Akima1DInterpolator(sample_ts, train_uins[idx_sample])
    interp_pressure = interpolate.Akima1DInterpolator(sample_ts, train_pressures[idx_sample])
    interp_preds    = interpolate.Akima1DInterpolator(sample_ts, preds[idx_sample])

    # NOTE: We need to terminate @ last uout=0 because if we dont, it drops like crazy
    # and the coslerp gets jacked up. Alternatively, we can make judgement call to only match
    # first "n" ts predictions, e.g. up to 0.8227286338806152, and only apply our post
    # processing algorithm to that.
    # Grid points outside a sample's own time range are not extrapolated; later
    # cells check np.isnan on interpolated values for that reason.
    resampled_uins[idx_sample] = interp_uin(resampled_ts)
    resampled_pressures[idx_sample] = interp_pressure(resampled_ts)
    resampled_preds[idx_sample] = interp_preds(resampled_ts)

In [None]:
# Always check, my friends: (number of samples, number of resampled grid points).
np.shape(resampled_uins)

# Top 15 Worst Predictions

In [None]:
# Plot the 15 samples with the largest MAE: resampled curves as lines, the
# original (un-resampled) points as scatter markers.
for i in mae_sort[:15]:
    # Keep the per-sample value in a loop-local name. Assigning it to `mae`
    # would replace the per-sample MAE array with a scalar, so earlier cells
    # that index or histogram `mae` would fail when re-run.
    sample_mae = np.abs(masks[i] * (pressures[i] - preds[i])).sum() / masks[i].sum()

    plt.figure(figsize=(15,5))
    plt.title(f'{sample_mae:.5f}  MAE')

    plt.plot(resampled_ts, resampled_uins[i], linewidth=1, label='uin')
    plt.scatter(train_ts[i], train_uins[i], linewidth=1)

    plt.plot(resampled_ts, resampled_pressures[i], linewidth=1, label='pressure')

    plt.plot(resampled_ts, resampled_preds[i], linewidth=1, label='preds', linestyle='--')
    plt.scatter(train_ts[i], pressures[i], linewidth=1)

    plt.legend()
    plt.show()

Do you notice anything? 

I don't, but my analysis skills suck.

# Good Neighbors

Now we attempt to find the nearest neighbor and visualize its pressure. Is it closer to our target pressure than our prediction? Is there any way we can make that an `f(uin)`?

In [None]:
# Distinct R and C values, plus one (R, C) pair per sample taken from the
# first of each run of 80 rows.
unique_R = train.R.unique()
unique_C = train.C.unique()
rc_per_row = train[['R','C']].values.reshape(-1, 80, 2)
RCs = rc_per_row[:, 0, :]
RCs

In [None]:
# Expect one (R, C) row per sample.
np.shape(RCs)

In [None]:
# Thanks @cdelotte...

from cuml.neighbors import NearestNeighbors

NEIGHBORS = 2  # 2 neighbors so we dont select ourselves
SIZE = 74      # 74 is the earliest an exhale occurs..
RCModels = {}   # (R, C) -> fitted NearestNeighbors model
RCIndices = {}  # (R, C) -> positions of the group's samples in the full arrays
RCFilters = {}  # (R, C) -> boolean mask over all samples

# Only select candidates within our R/C group: one L1 nearest-neighbour index
# per (R, C) pair, fitted on the first SIZE points of the resampled u_in curves.
for R in unique_R:
    for C in unique_C:
        rcfilter = np.all(RCs == np.array([R,C]), axis=1)
        real_indices = np.nonzero(rcfilter)[0]

        # The loop walks the full R x C cross product; a pair that never occurs
        # in the data has nothing to fit, so skip it rather than pass an empty
        # array to the model.
        if real_indices.size == 0:
            continue

        model = NearestNeighbors(n_neighbors=NEIGHBORS, metric='l1')
        model.fit(resampled_uins[rcfilter, :SIZE])
        RCModels[(R,C)] = model
        RCIndices[(R,C)] = real_indices # so we can find them...
        RCFilters[(R,C)] = rcfilter

# Revisit The Top-100 Trash Predictions

In [None]:
SHOW = True

# For each of the 100 worst samples: find its nearest neighbour within the same
# (R, C) group, project the neighbour's pressure onto this sample's time steps,
# and compare the resulting MAE with the model's MAE.
for i in mae_sort[:100]:
    if SHOW:
        plt.figure(figsize=(15,5))

    # Loop-local name on purpose: rebinding `mae` would replace the per-sample
    # MAE array with a scalar and break earlier cells on re-run.
    sample_mae = np.abs(masks[i] * (pressures[i] - preds[i])).sum() / masks[i].sum()

    rc_vals = tuple(RCs[i])
    distances, indices = RCModels[rc_vals].kneighbors(resampled_uins[i,:SIZE].reshape(1,-1))
    # NOTE: If we're grabbing a set, we can use [:,1] instead
    # Column 1 is taken as the first non-self match; this assumes the query
    # itself comes back in column 0 (true when there are no exact duplicates).
    # `np.int` no longer exists in current NumPy; the builtin `int` is equivalent.
    k_dist = np.round(distances[0,1]).astype(int) # dont select self for train set
    k_idx = indices[0,1] # dont select self for train set.
    real_idx = RCIndices[rc_vals][k_idx]  # group-local index -> index into the full arrays

    # We have to sample (interpolate) from kneighbor -> our timestamp space
    # to compute MAE
    interp_k_pressures_proj = interpolate.Akima1DInterpolator(train_ts[real_idx], train_pressures[real_idx])
    k_pressures_proj = interp_k_pressures_proj(train_ts[i])
    # In case we go out of bounds, fill with the original data
    out_of_bounds = np.isnan(k_pressures_proj)
    k_pressures_proj[out_of_bounds] = preds[i, out_of_bounds]
    k_mae = np.abs(masks[i] * (pressures[i] - k_pressures_proj))
    k_mae = k_mae.sum() / masks[i].sum()

    if SHOW:
        # Mean L1 gap over the same first SIZE grid points the neighbour search
        # used. Summing over the whole grid would mix in points the search never
        # saw (and any NaNs past a sample's own time range) while still
        # dividing by SIZE.
        real_dist = np.abs(resampled_uins[i,:SIZE] - resampled_uins[real_idx,:SIZE]).sum() / SIZE
        plt.title(f'{sample_mae:.5f} MAE.  K-Dist: {k_dist},  K-MAE: {k_mae:.5f}.   RealDist {real_dist}')
        plt.plot(resampled_ts, resampled_uins[i], linewidth=1, label='uin')
        plt.plot(resampled_ts, resampled_uins[real_idx], linewidth=1, label='k-uin', c='lightblue')
        plt.scatter(train_ts[i], train_uins[i], linewidth=1)

        plt.plot(resampled_ts, resampled_pressures[i], linewidth=1, label='pressure')
        plt.plot(resampled_ts, resampled_pressures[real_idx], linewidth=1, label='k-pressure', linestyle='-')

        plt.plot(resampled_ts, resampled_preds[i], linewidth=1, label='preds', linestyle='--')
        plt.scatter(train_ts[i], pressures[i], linewidth=1)

        plt.legend()
        plt.show()

# Alright now lets bake some of these in so we can derive thresholds...

Build k=1 NN stats for the worst 675 offenders (the samples with MAE > 1.1)...

In [None]:
SHOW = False

# Number of worst samples to analyse; same figure as the "worst 675
# predictions" count noted earlier in the notebook.
N_WORST = 675

cols = ['orig_mae', 'k_mae', 'dist', 'R', 'C']
results = []  # one [orig_mae, k_mae, dist, R, C] row per analysed sample
for i in tqdm(mae_sort[:N_WORST]):
    if SHOW:
        plt.figure(figsize=(15,5))

    # Loop-local name on purpose: rebinding `mae` would replace the per-sample
    # MAE array with a scalar and break earlier cells on re-run.
    sample_mae = np.abs(masks[i] * (pressures[i] - preds[i])).sum() / masks[i].sum()

    rc_vals = tuple(RCs[i])
    distances, indices = RCModels[rc_vals].kneighbors(resampled_uins[i,:SIZE].reshape(1,-1))
    # NOTE: If we're grabbing a set, we can use [:,1] instead
    # Column 1 is taken as the first non-self match; this assumes the query
    # itself comes back in column 0 (true when there are no exact duplicates).
    k_dist = distances[0,1] / SIZE  # per-point L1 distance; dont select self for train set
    k_idx = indices[0,1] # dont select self for train set.
    real_idx = RCIndices[rc_vals][k_idx]  # group-local index -> index into the full arrays

    # We have to sample (interpolate) from kneighbor -> our timestamp space
    # to compute MAE
    interp_k_pressures_proj = interpolate.Akima1DInterpolator(train_ts[real_idx], train_pressures[real_idx])
    k_pressures_proj = interp_k_pressures_proj(train_ts[i])
    # In case we go out of bounds, fill with the original data
    out_of_bounds = np.isnan(k_pressures_proj)
    k_pressures_proj[out_of_bounds] = preds[i, out_of_bounds]
    k_mae = np.abs(masks[i] * (pressures[i] - k_pressures_proj))
    k_mae = k_mae.sum() / masks[i].sum()

    results.append([
        sample_mae,
        k_mae,
        k_dist,
        rc_vals[0],
        rc_vals[1]
    ])
    if SHOW:
        plt.title(f'{sample_mae:.5f} MAE.  K-Dist: {k_dist},  K-MAE: {k_mae:.5f}')
        plt.plot(resampled_ts, resampled_uins[i], linewidth=1, label='uin')
        plt.plot(resampled_ts, resampled_uins[real_idx], linewidth=1, label='k-uin', c='lightblue')
        plt.scatter(train_ts[i], train_uins[i], linewidth=1)

        plt.plot(resampled_ts, resampled_pressures[i], linewidth=1, label='pressure')
        plt.plot(resampled_ts, resampled_pressures[real_idx], linewidth=1, label='k-pressure', linestyle='-')

        plt.plot(resampled_ts, resampled_preds[i], linewidth=1, label='preds', linestyle='--')
        plt.scatter(train_ts[i], pressures[i], linewidth=1)

        plt.legend()
        plt.show()

In [None]:
# Turn the collected rows into a table, save a copy next to the notebook, and show it.
rez = pd.DataFrame(data=results, columns=cols)
rez.to_csv(path_or_buf='knn_stuff.csv', index=False)
rez

In [None]:
# Distribution of the per-point neighbour distances.
plt.hist(rez['dist'], bins=100)
plt.show()

In [None]:
# For every (R, C) pair, sweep a distance cut-off and compare the mean original
# MAE with the mean nearest-neighbour MAE of the rows below that cut-off.
for R in train.R.unique():
    for C in train.C.unique():
        ds, vs_old, vs_new = [], [], []
        for dist in np.arange(0.1,3, 0.02):
            below_cutoff = (rez.dist<dist)&(rez.R==R)&(rez.C==C)
            look = rez[below_cutoff]
            ds.append(dist)
            vs_old.append(look.orig_mae.mean())
            vs_new.append(look.k_mae.mean())

        plt.figure(figsize=(15,4))
        plt.title(f'k=1 NN MAE for Group: {R}, {C}')
        for curve, curve_label in ((vs_old, 'old'), (vs_new, 'new')):
            plt.plot(ds, curve, label=curve_label)
        plt.legend()
        plt.show()

In [None]:
# Most of the badly predicted samples fall in the R=50 group.
# Unfortunately, that's also where our post-processing algorithm fails the hardest...
rez['R'].value_counts()

Dist < 50 seems to be consistent...

In [None]:
# Original MAE of the analysed samples, in the order they were collected.
plt.plot(rez['orig_mae'])
plt.show()

In [None]:
# Only rows whose original MAE exceeds this value take part in the sweep below.
MIN_ORIG_MAE = 0.8904941219161745

# Same cut-off sweep as before, but grouped by R only and restricted to rows
# above MIN_ORIG_MAE.
for R in train.R.unique():
    ds = []
    vs_old, vs_new = [], []
    for dist in np.arange(0.08,1, 0.01):
        look = rez[(rez.dist<dist)&(rez.R==R)&(rez.orig_mae>MIN_ORIG_MAE)]
        vs_old.append(look.orig_mae.mean())
        vs_new.append(look.k_mae.mean())
        ds.append(dist)
    plt.figure(figsize=(15,4))
    # This loop has no C of its own: the old title printed whatever value `C`
    # still held from an earlier cell, mislabelling every figure.
    plt.title(f'k=1 NN MAE for Group: R={R}')
    plt.plot(ds, vs_old, label='old')
    plt.plot(ds, vs_new, label='new')
    plt.legend()
    plt.show()

In [None]:
# Rows with a neighbour distance under 1 outside the R == 50 group, counted per R.
# (The extra orig_mae > 0.8904941219161745 condition is left switched off.)
close_enough = rez.dist < 1
not_r50 = rez.R != 50
look = rez[close_enough & not_r50]
look.R.value_counts()

Only a handful of guys we want to attempt to patch.