In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under /kaggle/input.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Description 

A collection of ideas and hypotheses I tested while trying to build better features. 

In [None]:
# Read the training CSV into df_train and preview its first rows.
train_csv_path = '/kaggle/input/ventilator-pressure-prediction/train.csv'
df_train = pd.read_csv(train_csv_path)
df_train.head()

In [None]:
# Read the test CSV into df_test and preview its first rows.
test_csv_path = '/kaggle/input/ventilator-pressure-prediction/test.csv'
df_test = pd.read_csv(test_csv_path)
df_test.head()

In [None]:
# Summary statistics for every column of the training set
# (include='all' keeps all columns in the output, whatever their dtype).
df_train.describe(include='all')

In [None]:
# Number of distinct values in each column of the training set.
df_train.nunique()

In [None]:
# Mean / max / min / std of time_step, u_in and pressure for each (C, R) pair,
# computed on the rows with u_out == 0 only.
rows_u_out_zero = df_train[df_train['u_out'] == 0]
summary_stats = ['mean', 'max', 'min', 'std']
(rows_u_out_zero
 .groupby(['C', 'R'])[['time_step', 'u_in', 'pressure']]
 .agg(summary_stats))

# Hysteresis of $\Delta P$

It seems that a change in $u_{in}$ does not produce an instant change in $P$; there is some hysteresis at play here (likely related to $C$ and the internal circuitry). To represent this correctly, we need to find the right lag for $P$.

In [None]:
# Overlay pressure and u_in against id for the rows with id < 500.
first_rows = df_train.query('id < 500')
first_rows.plot.line(x='id', y=['pressure', 'u_in'])

In [None]:
# Breath 928 represents an experiment on oscillation; plot its pressure and
# u_in for the rows with id < 46910.
oscillating_breath = df_train.query('breath_id == 928 and id < 46910')
oscillating_breath.plot.line(x='id', y=['pressure', 'u_in'])

In [None]:
# Plot pressure and u_in for breath 928 together with u_in delayed by 1, 2 and
# 3 rows, to eyeball which delay lines up best with the pressure curve.
breath_928 = df_train.query('breath_id == 928 and id < 46910')
shifted_u_in = {
    f'u_in_shift_{k}': breath_928['u_in'].shift(k)
    for k in (1, 2, 3)
}
(breath_928
 .assign(**shifted_u_in)
 .plot.line(x='id',
            y=['pressure', 'u_in', *shifted_u_in],
            figsize=(15, 8)))

It seems that a shift of two units is appropriate. However, let's use a more robust approach and compare the correlation between the pressure and the shifted $u_{in}$.

In [None]:
# Correlate pressure with u_in delayed by 0-3 rows, on the rows with u_out == 0.
# The delay is applied per breath: the rows of all breaths are stacked in one
# frame, so an ungrouped shift would carry the last u_in values of one breath
# into the first rows of the next breath and contaminate the correlation.
# With the grouped shift, the first k rows of each breath are NaN instead and
# are ignored by .corr().
inspiration = df_train.query('u_out == 0')
u_in_per_breath = inspiration.groupby('breath_id')['u_in']
(inspiration[['u_in', 'pressure']]
 .assign(**{f'u_in_shift_{k}': u_in_per_breath.shift(k) for k in (1, 2, 3)})
 .corr())

We have our highest correlation between the pressure and $u_{in}$ using a shift of two units (why?).

# Impact of R

$R$ (measured in $cmH_2O/l/s$) indicates how restricted the airway is or, equivalently, the $\Delta Pressure$ per $\Delta Flow$: how much the pressure increases for a given change in the flow rate. This value should be somewhat related to $u_{in}$.  

## First try (WRONG)

Define $u_{in}$ as a dimensionless variable that is the percentage the valve is opened. Let's suppose that, at 100%, the valve delivers $F = F_{max} = X \, l/s$, and at 0%, it delivers $F = 0 \, l/s$. Hence, $F$ can be thought of as:

$$ F = u_{in} \cdot F_{max} \quad (l/s)$$

and $R$:

$$ R = \frac{\Delta P}{\Delta F} = \frac{\Delta P}{\Delta u_{in} \cdot F_{max}} $$

If this holds for the dataset, then the plot of $\Delta P$ per $\Delta u_{in}$ should be a straight line, with its slope given by $R \cdot F_{max}$. 

THIS IS WRONG: we can have a decrease in flow (negative $\Delta F$) together with an increase in pressure (positive $\Delta P$); the flow just has to stay positive inside the lung. Let's go for a second try.

## Second Try

Define $u_{in}$ as a dimensionless variable that is the percentage the valve is opened. Let's suppose that, at 100%, the valve delivers $F = F_{max} = X \, l/s$, and at 0%, it delivers $F = 0 \, l/s$. Hence, $F$ can be thought of as:

$$ F = u_{in} \cdot F_{max} \quad (l/s)$$

and $R$:

$$ R = \frac{\Delta P}{ F} = \frac{\Delta P}{u_{in} \cdot F_{max}} $$

If this holds for the dataset, then the plot of $\Delta P$ per $u_{in}$ should be a straight line, with its slope given by $R \cdot F_{max}$. 

## Analysis

Weird: the procedure in First Try seems to lead to a better result (higher correlation) than the one in Second Try. Why?


In [None]:
# Parquet file used to cache the engineered delta / lag feature table.
delta_lags_path = '/kaggle/working/df_delta_lags.parquet'
# Cell output: True when the cache file already exists.
os.path.exists(delta_lags_path)

In [None]:
# Uncomment to delete the cached feature table and force it to be rebuilt:
#os.remove(delta_lags_path)

In [None]:
# We'll create different sizes for the Delta window (pressure and flow).
# Start at 1: a 0-step difference is identically zero, so its correlation with
# any other column is undefined (NaN) and it would only add dead columns.
steps = range(1, 10)
# And different lags for Delta P (0 means "no lag")
lags_pressure = range(0, 5)

In [None]:
# Build (or load from the parquet cache) the per-breath delta / average /
# cumulative-sum / lag features used by the correlation analysis.
# NOTE: the cache is keyed only by its path; delete the file after changing
# `steps` or `lags_pressure`, otherwise the stale columns are loaded.
if os.path.exists(delta_lags_path):
    df_ = pd.read_parquet(delta_lags_path)
else:
    # only analyse the inspiratory phase.
    # .copy() makes df_ an independent frame, so the column assignments below
    # do not write into a filtered slice of df_train (SettingWithCopyWarning).
    df_ = df_train[df_train['u_out'] == 0].copy()

    # Delta windows
    vars_ = ['u_in', 'pressure']
    grp = df_.groupby('breath_id')[vars_]
    u_in_grp = df_.groupby('breath_id')['u_in']
    for step in steps:
        # delta_<var>_<step>: value at a row minus the value `step` rows
        # earlier within the same breath.
        df_step = grp.diff(step)
        df_step.columns = [f'delta_{c}_{step}' for c in df_step.columns]
        df_[df_step.columns] = df_step

        # For Second Try, we'll get the average u_in between steps, for better
        # approx: mean of u_in at a row and at the row `step` rows later within
        # the same breath (u_in only, not pressure).
        # NOTE(review): this window looks forward while the deltas above look
        # backward - confirm that this is intended.
        df_[f'u_in_avg_{step}'] = (u_in_grp.shift(0) + u_in_grp.shift(-step)) / 2

    # we'll also accumulate those diffs (running sum within each breath)
    delta_cols = [c for c in df_.columns if 'delta_' in c]
    df_cumsum = df_.groupby('breath_id')[delta_cols].cumsum()
    df_cumsum.columns = [f'{c}_cumsum' for c in df_cumsum.columns]
    df_[df_cumsum.columns] = df_cumsum

    # Pressure lags: every column whose name contains 'pressure' (raw, deltas
    # and their cumsums) is pulled `lag` rows earlier within each breath.
    # The groupby is built once, before the loop, so already-lagged columns
    # are not lagged again.
    pressure_cols = [c for c in df_.columns if 'pressure' in c]
    grp = df_.groupby('breath_id')[pressure_cols]
    for lag in lags_pressure:
        df_lag = grp.shift(-lag)
        df_lag.columns = [f'{c}_l{lag}' for c in df_lag]
        df_[df_lag.columns] = df_lag

    df_.to_parquet(delta_lags_path)


In [None]:
# Preview the first rows of the feature table.
df_.head()

In [None]:
# List the column labels of the feature table.
df_.columns

Now, having this, we can calculate the correlation between all the $\Delta u_{in}$ columns and the different (lagged) $\Delta P$ columns, to check whether the hypothesis of hysteresis of $\Delta P$ makes sense.

In [None]:
# Correlation matrix over the pressure- and u_in-derived columns, leaving out
# the cumulative-sum features. The extra parentheses matter: `and` binds
# tighter than `or`, so without them only the u_in columns would be filtered
# and every pressure cumsum column would slip into the matrix.
cols = [c for c in df_.columns if (('pressure' in c) or ('u_in' in c)) and ('cumsum' not in c)]
# Pressure-derived subset of `cols` (so it excludes the cumsum columns too).
cols_pressure = [c for c in cols if 'pressure' in c]
corr = df_[cols].corr()

In [None]:
# Pressure-derived columns, excluding the cumulative-sum variants.
cols_pressure = [
    name for name in df_.columns
    if 'pressure' in name and 'cumsum' not in name
]

In [None]:
# Keep the cols_pressure entries as table columns and drop them from the rows,
# so the remaining rows are the entries of `corr` not listed in cols_pressure.
corr[cols_pressure].drop(cols_pressure, axis='index')

In [None]:
# Compare raw u_in, the running sum of its 1-step deltas and pressure for the
# rows with 1000 < id < 2000.
id_window = df_.reset_index().query('id > 1000 and id < 2000')
id_window.plot.line(y=['u_in', 'delta_u_in_1_cumsum', 'pressure'], figsize=(14, 8))

In [None]:
# First try: for each window size, correlate delta u_in with the lagged
# delta-pressure columns of the same window. One table column per step; the
# step suffix is stripped from the row labels so rows line up across steps.
corr_by_step = {}
for step in steps:
    x_col = f'delta_u_in_{step}'
    y_cols = [f'delta_pressure_{step}_l{lag}' for lag in lags_pressure]
    corr_with_x = df_[[x_col] + y_cols].corr().iloc[:, 0]
    corr_with_x.index = [name.replace(f'_{step}', '') for name in corr_with_x.index]
    corr_by_step[f'st_{step}'] = corr_with_x
corr_df = pd.DataFrame(corr_by_step)

print('First try: corrs of delta u_in vs delta_pressure')
corr_df

In [None]:
# Second try (averaged input): for each window size, correlate the average
# u_in of that window with the lagged delta-pressure columns. One table column
# per step; the step suffix is stripped so rows line up across steps.
corr_by_step = {}
for step in steps:
    x_col = f'u_in_avg_{step}'
    y_cols = [f'delta_pressure_{step}_l{lag}' for lag in lags_pressure]
    corr_with_x = df_[[x_col] + y_cols].corr().iloc[:, 0]
    corr_with_x.index = [name.replace(f'_{step}', '') for name in corr_with_x.index]
    corr_by_step[f'st_{step}'] = corr_with_x
corr_df = pd.DataFrame(corr_by_step)

print('Second try: corrs of average u_in versus delta_pressure')
corr_df

In [None]:
# Second try (raw input): for each window size, correlate u_in itself with the
# lagged delta-pressure columns. One table column per step; the step suffix is
# stripped so rows line up across steps.
corr_by_step = {}
for step in steps:
    y_cols = [f'delta_pressure_{step}_l{lag}' for lag in lags_pressure]
    corr_with_u_in = df_[['u_in'] + y_cols].corr().iloc[:, 0]
    corr_with_u_in.index = [name.replace(f'_{step}', '') for name in corr_with_u_in.index]
    corr_by_step[f'st_{step}'] = corr_with_u_in
corr_df = pd.DataFrame(corr_by_step)

print('Second try: corrs of u_in versus delta_pressure')
corr_df

### First try
We can see that the $\Delta P$s with $lag = 2$ have higher correlations with $\Delta u_{in}$ for every step, meaning that the hysteresis hypothesis might be true.

### Second try

However, we found out that we shouldn't be using $\Delta u_{in}$ but $u_{in}$ itself for the analysis. Redoing it that way, we find lower correlations. Why?

### Continuing...

Finally, we can make some plots of $\Delta P$ _versus_ $\Delta u_{in}$ and $u_{in}$ to visualize the (expected) linearity between them.

In [None]:
# Store C and R as categoricals (they are used as facet / hue variables below).
for cat_col in ('C', 'R'):
    df_[cat_col] = df_[cat_col].astype('category')

In [None]:
step = 2
# Regression plot of delta pressure (lag 2) against the window-averaged u_in,
# one panel per (R, C) pair, on a 20,000-row random sample.
# random_state pins the sample so the figure is identical on every run.
g = sns.lmplot(data=df_.sample(20000, random_state=0),
               x=f'u_in_avg_{step}', 
               y=f'delta_pressure_{step}_l2', 
               row='R',
               col='C',
               hue='R')

In [None]:
# Number of rows whose 2-step delta of u_in is exactly zero.
df_['delta_u_in_2'].eq(0).sum()

In [None]:
# Regression plot of delta pressure (lag 2) against delta u_in for the current
# value of `step`, one panel per (R, C) pair. Rows whose delta u_in is exactly
# zero are left out.
delta_x_col = f'delta_u_in_{step}'
delta_y_col = f'delta_pressure_{step}_l2'
rows_with_change = df_.loc[df_[delta_x_col].ne(0)]
g = sns.lmplot(data=rows_with_change,
               x=delta_x_col,
               y=delta_y_col,
               row='R',
               col='C',
               hue='R')

In [None]:
step = 9
# Regression plot of delta pressure (lag 2) against delta u_in for a 9-step
# window, one panel per (R, C) pair; rows with a zero delta u_in are left out.
delta_x_col = f'delta_u_in_{step}'
delta_y_col = f'delta_pressure_{step}_l2'
rows_with_change = df_.loc[df_[delta_x_col].ne(0)]
g = sns.lmplot(data=rows_with_change,
               x=delta_x_col,
               y=delta_y_col,
               row='R',
               col='C',
               hue='R')

In [None]:
step = 1

# Regression plot of delta pressure (lag 2) against delta u_in for a 1-step
# window, one panel per (R, C) pair; rows with a zero delta u_in are left out.
delta_x_col = f'delta_u_in_{step}'
delta_y_col = f'delta_pressure_{step}_l2'
rows_with_change = df_.loc[df_[delta_x_col].ne(0)]
g = sns.lmplot(data=rows_with_change,
               x=delta_x_col,
               y=delta_y_col,
               row='R',
               col='C',
               hue='R')

Our hypothesis seems to hold true, with higher values of $R$ having a bigger slope on the plots.

Now let's calculate this slope

In [None]:
from scipy import stats

# Fit delta pressure (lag 2) against delta u_in separately for every (C, R)
# pair and derive f_max = slope / R from each fit.
# NOTE: `step` is not set here; it is whatever the previous cell left it at.
linreg_x = f'delta_u_in_{step}'
linreg_y = f'delta_pressure_{step}_l2'


def _fit_line(group):
    """Return the linregress result for one (C, R) group as a Series."""
    return pd.Series(stats.linregress(group[linreg_x], group[linreg_y]))


linreg_names = {0: 'slope', 1: 'intercept', 2: 'rvalue', 3: 'pvalue', 4: 'stderr'}
linreg_rows = df_[df_[linreg_x] != 0].dropna(subset=[linreg_x, linreg_y])
linreg = (linreg_rows
          .groupby(['C', 'R'])
          .apply(_fit_line)
          .rename(columns=linreg_names)
          .assign(f_max=lambda fits: fits['slope'] / fits.index.get_level_values('R').astype(int)))

linreg

In [None]:
# f_max as a function of C, one line per R.
fits_flat = linreg.reset_index()
sns.lineplot(x='C', y='f_max', hue='R', data=fits_flat)

In [None]:
# f_max as a function of R, one line per C.
fits_flat = linreg.reset_index()
sns.lineplot(x='R', y='f_max', hue='C', data=fits_flat)

$F_{max}$ seems to vary with $R$, which wasn't in our hypothesis.

In [None]:
# Line plot of delta pressure (2-step, lag 2) against delta u_in (2-step) for
# the breaths with breath_id < 1000: one line per breath, panels by R (rows)
# and C (columns).
early_breaths = df_.loc[df_['breath_id'].lt(1000)]
sns.relplot(data=early_breaths,
            kind='line',
            x='delta_u_in_2',
            y='delta_pressure_2_l2',
            hue='breath_id',
            row='R',
            col='C')

In [None]:
# Scatter of delta pressure (2-step, lag 2) against delta u_in (2-step) for breath 1.
breath_1 = df_.loc[df_['breath_id'].eq(1)]
breath_1.plot.scatter(x='delta_u_in_2', y='delta_pressure_2_l2')

In [None]:
# Scatter of pressure against u_in for breath 39.
breath_39 = df_.loc[df_['breath_id'].eq(39)]
breath_39.plot.scatter(x='u_in', y='pressure')

In [None]:
# Grid of scatter plots for breath 39: pressure, u_in and the 2-step delta of
# u_in (rows) against time_step and the lag-2, 2-step delta pressure (columns).
breath_39 = df_.loc[df_['breath_id'].eq(39)]
pair_x = ['time_step', 'delta_pressure_2_l2']
pair_y = ['pressure', 'u_in', 'delta_u_in_2']
g = sns.PairGrid(data=breath_39, x_vars=pair_x, y_vars=pair_y)
g.map(sns.scatterplot)