In [None]:
import sys
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

#%matplotlib notebook

In [None]:
# Root folder of the competition data. It is defined once so that only this
# line has to change when the notebook is run outside of Kaggle.
DATA_DIR = '/kaggle/input/ventilator-pressure-prediction'

# Paths of the three competition files (same values as before).
file_sub = DATA_DIR + '/sample_submission.csv'
file_train = DATA_DIR + '/train.csv'
file_test = DATA_DIR + '/test.csv'

## Data sanity check 

In [None]:
# Load the training set.
df_train = pd.read_csv(file_train)

# To save memory, the test and submission files are loaded later in the notebook.
# They were inspected in previous versions, as the following commented-out lines show.
#df_test_out  = pd.read_csv(file_test)
#df_sub = pd.read_csv(file_sub)

Let's look at the general information with '.info()' and '.describe()'.

In [None]:
# Column overview of the training set; show_counts=True asks for the non-null counts as well.
df_train.info(show_counts=True)

In [None]:
#df_test_out.info(show_counts=True)

In [None]:
# Summary statistics of the training set columns.
df_train.describe()

Some breath ids contain negative pressure values. We remove those breaths from the training set.

In [None]:
# Remove every breath that contains at least one negative pressure reading.
# Rows with a negative pressure
boole_neg = df_train['pressure'] < 0.
# Ids of the breaths those rows belong to
breath_id_2drop = df_train.loc[boole_neg,'breath_id'].unique().tolist()
# All rows of those breaths. `isin` is vectorised; the previous row-wise
# `apply` with a list membership test scanned the id list once per row.
boole_id2drop = df_train['breath_id'].isin(breath_id_2drop)
# Dropped in place to avoid holding a second copy of the training set in memory.
df_train.drop(index=df_train[boole_id2drop].index, inplace=True)


In [None]:
#df_test_out.describe()

In the next cells we check:
- the regularity of breath_id
- the number of time steps per breath_id
- whether the breath_id numbers are consecutive
- whether the time step is constant

In [None]:
from collections import Counter

def counts_steps(df_):
    """Print a few regularity checks on the breaths of ``df_``.

    Reads the 'breath_id' and 'u_out' columns and prints:
    - the distinct numbers of rows per breath_id,
    - the distinct numbers of rows per breath_id with u_out below 0.5
      (reported as u_out=0) and above 0.5 (reported as u_out=1),
    - a count of the differences between the breath_id of consecutive rows.

    Nothing is returned and ``df_`` is not modified.
    """
    def _distinct_sizes(ids):
        # distinct numbers of rows found per breath_id in the given column
        return set(Counter(ids.tolist()).values())

    breath_ids = df_['breath_id']
    print('unique numbers of steps are {}'.format(_distinct_sizes(breath_ids)))

    # split the rows by the state of u_out (0 / 1)
    valve_state = df_['u_out']
    ids_close = df_.loc[valve_state < 0.5, 'breath_id']
    ids_open = df_.loc[valve_state > 0.5, 'breath_id']
    print('unique numbers of steps with u_out=0 are {}'.format(_distinct_sizes(ids_close)))
    print('unique numbers of steps with u_out=1 are {}'.format(_distinct_sizes(ids_open)))

    # differences between the breath_id of consecutive rows
    print('here we see that breath_id are not consecutive')
    id_jumps = Counter(breath_ids.diff().tolist())
    print(id_jumps, 'nan correspond to the first row')
    
# Run the regularity checks on the training set (results are printed).
counts_steps(df_train)

In [None]:
#do the same for df_test_out
#counts_steps(df_test_out)

In [None]:
# Difference of time_step between each row and the previous one.
df_diff = df_train['time_step'].diff()
# time_step restarts from zero at every breath_id, so the difference taken across
# two breaths is not a real step: only the strictly positive differences are kept.
boole_diff = df_diff.gt(0.0)
df_diff.loc[boole_diff].describe()

- the table has no null values
- R and C have integer values which stay constant during each breath (labelled with breath_id)
- u_out takes the integer values 0 and 1 for closed and open expiratory valve, respectively
- u_in varies in time (given by time_step)
- every breath_id has 80 steps
- the number of steps with u_out=0 (u_out=1) varies between 25 and 32 (48 and 55)
- the breath_id are not consecutive: some of them are missing (for instance 8 and 11, among others)
- the time step is not exactly constant, but nearly constant with a value of ~0.03 seconds

Let's verify that the R and C combinations are the same in the training and test sets.

In [None]:
# Count the rows of the training set for each (R, C) combination.
df_train[['R','C']].value_counts()

In [None]:
# show the same for the df_test_out
#df_test_out[['R','C']].value_counts()

The 'R' and 'C' values represent physical features of the simulated lung, and the resulting pressure depends on them. From the value counts shown above, we see that nine different combinations of these features have been tested, with R=50 and C=10 being the most frequent.

Let's visualize how the variables 'u_in', 'u_out', and 'pressure' change as a function of time ('time_step') for a few breath_ids and different 'C' and 'R'.

In [None]:
def make_df2plot(df, pos):
    """Select one breath per (R, C) combination, e.g. for plotting.

    The (R, C) combinations are visited in the order given by
    ``df[['R','C']].value_counts()``. For each combination, the breath whose
    id is at position ``pos`` among the sorted unique breath_id values of that
    combination is selected and all of its rows are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'R', 'C' and 'breath_id'. It is not modified.
    pos : int
        Position of the breath to pick within each combination.

    Returns
    -------
    pandas.DataFrame
        The selected rows, one combination after the other, with a fresh
        0..n-1 index and dtypes converted by ``convert_dtypes()``. Within a
        breath the rows keep the order they have in ``df``.

    Raises
    ------
    IndexError
        If a combination has too few breaths for ``pos``.
    """
    slices = []
    for R_, C_ in df[['R','C']].value_counts().index:
        df_ = df[(df['R'] == R_) & (df['C'] == C_)]
        # sorted unique ids, so that `pos` does not depend on the row order of df
        id_ = np.sort(df_['breath_id'].unique())[pos]
        slices.append(df_[df_['breath_id'] == id_])

    if not slices:
        # no (R, C) combination at all: empty frame with the input columns
        return pd.DataFrame(columns=df.columns).convert_dtypes()

    # Concatenate once, instead of growing a frame inside the loop.
    return pd.concat(slices, ignore_index=True).convert_dtypes()


In [None]:
# One example breath (position 1) for every (R, C) combination.
df_plot = make_df2plot(df_train, 1)

# The first four columns identify a sample, the last three are the plotted signals.
cols = ['R','C','breath_id','time_step', 'u_in','u_out','pressure']

# Long format: one row per (sample, signal), as needed for the hue of the grid.
df_melted = df_plot[cols].melt(id_vars=cols[:4], value_vars=cols[4:]).convert_dtypes()

# One panel per (R, C) combination, one line per signal.
grid = sns.FacetGrid(
    data=df_melted,
    row='C',
    col="R",
    hue='variable',
    palette="tab10",
    height=2.5,
)
grid.map(sns.lineplot, "time_step", 'value')
grid.add_legend()
plt.show()

# Feature engineering

Here we add a few new variables:
- 'volume', which approximates the air volume inflated during a time step
- 'volume_tot', which approximates the total volume of air inflated into the lung

Needless to say, these are poor approximations. However, they turn out to be correlated with the pressure, i.e. they are useful.

In [None]:
def add_volume_var(df_):
    """Add the columns 'time_step_diff', 'volume' and 'volume_tot' to ``df_``.

    - 'time_step_diff': difference of 'time_step' with the previous row of the
      same breath_id (0 for the first row of each breath),
    - 'volume': 'time_step_diff' multiplied by 'u_in',
    - 'volume_tot': running sum of 'volume' within each breath_id.

    The columns are added to ``df_`` itself, which is also returned.
    """
    by_breath = df_['breath_id']

    step_length = df_['time_step'].groupby(by_breath).diff()
    df_['time_step_diff'] = step_length.fillna(0)

    df_['volume'] = df_['u_in'] * df_['time_step_diff']

    df_['volume_tot'] = df_['volume'].groupby(by_breath).cumsum()
    return df_

# Add the volume columns to the training set (the function returns the frame it was given).
df_train = add_volume_var(df_train)
#df_test_out = add_volume_var(df_test_out)

Now we check the correlations of the variables 'u_in' and 'volume_tot' with 'pressure' under a time shift. We cover the past and the future by 30 time steps each, and visualize the correlations with a graph.

In [None]:
cols = ['u_in','volume_tot','pressure']

# (R, C) combinations to analyse.
list_RC = [[5.0, 10.0],[5.0, 20.0],[5.0, 50.0],
           [20.0, 50.0],[20.0, 20.0],[20.0, 10.0],
           [50.0, 10.0],[50.0, 20.0],[50.0, 50.0]]

# Largest shift, in time steps, examined in each direction.
max_shift = 30

# One record per (R, C, shift, variable): [correlation, shift, R, C, variable name]
list_corr = []

for R_label,C_label in list_RC:

    boole =  (df_train['C'] == C_label) & (df_train['R'] == R_label)
    df2 = df_train.loc[boole,cols].copy()
    # Unshifted pressure of this combination, extracted once instead of at every shift.
    pressure_RC = df2['pressure'].copy()

    # Shifts from -max_shift to +max_shift time steps, both ends included
    # (the previous range(-30, 30) stopped at +29).
    # NOTE(review): shift() acts on all the rows of the combination taken together,
    # so near the start/end of a breath the shifted pressure comes from the
    # neighbouring breath. Left unchanged here because the discussion of the
    # peaks below is based on this computation.
    for i in range(-max_shift, max_shift + 1):
        df2['pressure'] = pressure_RC.shift(i)
        # rows whose shifted pressure is undefined (NaN) are left out
        corr = df2.dropna().corr()
        list_corr.append([corr.loc['u_in','pressure'],i,R_label,C_label,'u_in'])
        list_corr.append([corr.loc['volume_tot','pressure'],i,R_label,C_label,'volume_tot'])

df_corr = pd.DataFrame(list_corr, columns=['correlation','time shift','R','C','variable'])

In [None]:
# Correlation as a function of the time shift:
# one panel per (R, C) combination, one line per variable.
grid = sns.FacetGrid(
    data=df_corr,
    row='C',
    col="R",
    hue='variable',
    palette="tab10",
    height=2.5,
)
grid.map(sns.lineplot, "time shift", 'correlation')
grid.add_legend()
plt.show()

The y-axis shows the correlation between the pressure and the variables, while the x-axis is the shift in time steps: negative values refer to the correlation between the current pressure and the past "u_in" ("volume_tot") value, while positive values refer to the future.
Oscillations of the correlation with the time shift are clearly visible, with peaks at 7, 2, -2, -7, -11, and -13 time-step shifts for high R, while for low R there is a single peak around -15. We use these shifts of 'u_in' and 'volume_tot' as new variables.

In [None]:
def add_vars_shift(df_, list_shift, group_col='breath_id'):
    """Add shifted copies of 'volume_tot' and 'u_in' as new columns.

    For every shift ``i`` in ``list_shift`` two columns are added, named
    'volume_tot<i>' and 'u_in<i>', holding the source column shifted by ``i``
    rows. Positions left without a value by the shift are filled with 0.

    The shift is done separately within each value of ``group_col``, so a row
    never receives a value that belongs to another breath. Shifting the whole
    frame at once (the previous behaviour) filled the first/last rows of every
    breath with values of the neighbouring breath.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Must contain 'volume_tot' and 'u_in'. It is modified in place.
    list_shift : iterable of int
        Shifts, in rows; they can be negative.
    group_col : str or None, default 'breath_id'
        Column that identifies a breath. If it is None, or not a column of
        ``df_``, the shift is applied to the whole frame.

    Returns
    -------
    pandas.DataFrame
        ``df_`` itself, with the new columns.
    """
    by_group = group_col is not None and group_col in df_.columns
    # Grouping keys are read once: the loop only adds columns, it never changes the keys.
    keys = df_[group_col] if by_group else None

    for i in list_shift:
        for base in ('volume_tot', 'u_in'):
            source = df_[base].groupby(keys, sort=False) if by_group else df_[base]
            df_[base + str(i)] = source.shift(i).fillna(0)

    return df_

# Shifts (in time steps) used for the new 'u_in' and 'volume_tot' columns.
list_shift = [7,2,-2,-7,-11,-15]
df_train = add_vars_shift(df_train,list_shift)
#df_test_out = add_vars_shift(df_test_out,list_shift)

Before building a model, let's see how a naive model would perform. This will be the model to beat.
We use a simple model that predicts the pressure of the next time step to be equal to the present one.

In [None]:
from sklearn.metrics import mean_absolute_error

# Naive forecast on the last 20% of the rows:
# the pressure of a row is predicted to be equal to the pressure of the previous row.
y_true = df_train['pressure'][-int(len(df_train)*0.2):]
naive_ypred = y_true.shift(1)
# The first row has no previous value (NaN after the shift). It is left out of the
# score instead of being scored against an arbitrary prediction of 0.
mae = mean_absolute_error(y_true.iloc[1:], naive_ypred.iloc[1:])
print('naive forecast MAE:{:.4f}'.format(mae))

This is the MAE to beat!

# XGBoost model

We chose to use the XGBoost model. After some experimenting, we settled on the parameters shown below.

In [None]:
# Remove the dataframes used only for the plots, to free memory.
# Running this cell a second time raises NameError unless the cells that
# create df_melted and df_corr are run again first.
del df_melted, df_corr
# Load the packages used for the model.
from sklearn.model_selection import train_test_split, cross_val_score, KFold

#from xgboost import XGBRegressor as xgbr
import xgboost as xgb

In [None]:
from sklearn.model_selection import GroupShuffleSplit

df_target = df_train['pressure']
# breath_id is not a feature, but it is needed to split by breath: keep it aside.
breath_groups = df_train['breath_id']
#drop useless variables
df_train.drop(columns=['id','breath_id','pressure'], inplace=True)
#df_test_out.drop(columns=['id','breath_id'], inplace=True)

# Hold out 20% of the *breaths*, not 20% of the rows. With a row-level shuffle the
# rows of one breath end up on both sides of the split, so the validation MAE
# (which also drives early stopping) is measured on breaths the model has partly seen.
# random_state makes the split reproducible from one run to the next.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(splitter.split(df_train, df_target, groups=breath_groups))

X_train, X_test = df_train.iloc[train_idx], df_train.iloc[test_idx]
y_train, y_test = df_target.iloc[train_idx], df_target.iloc[test_idx]

# The helpers of the split are not needed any more.
del breath_groups, train_idx, test_idx

In [None]:
# Parameters passed to xgb.train.
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'mae',
    'colsample_bytree': 0.8,
    'learning_rate': 0.1,
    'max_depth': 12,
    'alpha': 1,
    'lambda': 1,
    'tree_method': 'gpu_hist',
}

def set_DMatrix(X_,y_):
    """Build an xgboost DMatrix from the rows of ``X_`` where 'u_out' equals 0.

    Parameters
    ----------
    X_ : pandas.DataFrame
        Features; must contain the column 'u_out'.
    y_ : pandas.Series
        Labels, filtered with the same boolean mask as ``X_``.

    Returns
    -------
    (xgb.DMatrix, pandas.Series)
        The DMatrix of the selected rows and the boolean mask that selected them.
    """
    boole_uout0 = X_['u_out'].eq(0)
    selected_rows = X_.loc[boole_uout0]
    selected_labels = y_.loc[boole_uout0]
    return xgb.DMatrix(data=selected_rows, label=selected_labels), boole_uout0

# DMatrix of the training and of the validation part (rows with u_out == 0 only),
# together with the boolean masks that selected the rows.
dtrain, boole_train_uout0 = set_DMatrix(X_train, y_train)
dtest, boole_test_uout0 = set_DMatrix(X_test, y_test)

Now we free some memory space and fit the model

In [None]:
# Free memory: the frames are no longer used once the DMatrix objects are built.
del df_train, df_target, X_train, y_train, X_test, y_test

# Fit the model: at most 5000 boosting rounds, stopping early when the MAE on
# `dtest` has not improved for 10 rounds; the metric is printed every 500 rounds.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=5000,
    evals=[(dtest, 'dtest')],
    early_stopping_rounds=10,
    verbose_eval=500,
)

# Prediction for the submission set

In [None]:
# Load the submission set and add the same features as for the training set.
df_test_out = add_vars_shift(add_volume_var(pd.read_csv(file_test)), list_shift)
# Remove the identifier columns, which are not features of the model.
df_test_out = df_test_out.drop(columns=['id','breath_id'])

In [None]:
# Build the DMatrix of the submission set (rows with u_out == 0 only).
# set_DMatrix needs a label: df_test_out['u_out'] is a bogus y, not used for the prediction.
dtest_out, boole_test_out = set_DMatrix(X_=df_test_out, y_=df_test_out['u_out'])

# Predict the pressure of the selected rows with the fitted model.
y_test_out_pred = model.predict(dtest_out)

# Free memory: only the predictions and the row mask are needed from here on.
del df_test_out, dtest_out, dtrain, dtest

In [None]:
# Start from the sample submission file.
df_sub = pd.read_csv(file_sub)
# Write the predictions into the rows selected by boole_test_out (u_out == 0 in the test set).
# All other rows keep the value read from the sample submission file.
df_sub.loc[boole_test_out,'pressure'] = y_test_out_pred
df_sub.to_csv('submission.csv', index = False)
