![](https://www.nhlbi.nih.gov/sites/default/files/inline-images/19-1096-NHLBI-OY1-Q41-ES-Ventilator-Support_900px_dev1.jpg)

Image source: [National Heart, Lung, and Blood Institute (NHLBI)](https://www.nhlbi.nih.gov/health-topics/ventilatorventilator-support)

In [None]:
import numpy as np 
import pandas as pd
import plotly as py
import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected = True)
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import mean_absolute_error

from sklearn.model_selection import KFold, GroupKFold

from sklearn.ensemble import VotingRegressor

import optuna

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
#########################################################
# Read the train, test and sample-submission CSVs from the
# ventilator-pressure-prediction input directory.
train = pd.read_csv('../input/ventilator-pressure-prediction/train.csv')
test = pd.read_csv('../input/ventilator-pressure-prediction/test.csv')
ss = pd.read_csv('../input/ventilator-pressure-prediction/sample_submission.csv')

# Basic information

In [None]:
# Preview the first three rows of the training table.
train.head(3)

In [None]:
# Row counts of both tables.
print(f'Length of TRAIN dataset: {len(train)}')
print(f'Length of TEST dataset: {len(test)}')
print('')

# Missing values per column; the last TRAIN column is left out, as before.
print('Missing values in TRAIN dataset')
for column, n_missing in train.iloc[:, 0:-1].isna().sum().items():
    print(f'{column}: {n_missing}')
print('')

# Report the TEST table's own columns (this section used to re-print the
# TRAIN counts because it looped over `train`).
print('Missing values in TEST dataset')
for column, n_missing in test.isna().sum().items():
    print(f'{column}: {n_missing}')
print('')

print(f'Number of breaths in train dataset: {train["breath_id"].nunique()}')
print(f'Number of breaths in test dataset: {test["breath_id"].nunique()}')

# value_counts() maps breath_id -> number of rows, so its *values* are the
# per-breath row counts. Reading them directly avoids reset_index(), whose
# resulting column names differ between pandas 1.x and 2.x.
rows_per_breath = train["breath_id"].value_counts().unique()
print(f'The number of observations for each breath: {rows_per_breath[0]}')

# EDA

In [None]:
def _countplot_row(frame, columns, first_slot, show_title):
    """Draw one annotated count plot per column into a 2x2 subplot grid.

    frame      -- table whose columns are counted
    columns    -- column names; the k-th one goes to subplot first_slot + k
    first_slot -- 1-based index of the first subplot to use
    show_title -- whether to print the column name above each plot
    """
    for offset, column in enumerate(columns):
        plt.subplot(2, 2, first_slot + offset)
        if show_title:
            plt.title(column, y = 1.2, size = 25, fontname = 'monospace', color = 'black')
        ax = sns.countplot(x = column, data = frame, palette = ['#488a99', '#dbae58', '#4b585c'])
        plt.ylabel('')
        plt.xlabel('')
        plt.xticks(fontname = 'monospace', size = 12)
        plt.yticks([])
        for side in ('right', 'top'):
            ax.spines[side].set_visible(False)
        for side in ('bottom', 'left'):
            ax.spines[side].set_linewidth(1.2)

        # Total bar height, used to turn each count into a percentage.
        total = sum(bar.get_height() for bar in ax.patches)

        for bar in ax.patches:
            count = bar.get_height()
            top_centre = (bar.get_x() + bar.get_width() / 2, bar.get_height())
            # Absolute count, drawn just below the top of the bar.
            ax.annotate(f'{count}', top_centre,
                        ha = 'center', va = 'center',
                        size = 13,
                        xytext = (1, -15),
                        textcoords = 'offset points',
                        fontname = 'monospace', color = 'white')
            # Share of the total, drawn just above the bar.
            ax.annotate(f'{round((count/total) * 100, 1)}%', top_centre,
                        ha = 'center', va = 'center',
                        size = 15,
                        xytext = (1, 13),
                        textcoords = 'offset points',
                        fontname = 'monospace', color = 'black')


fig = plt.figure(figsize = (13, 8))
rc = ['R', 'C']

# Top row: TRAIN (with column titles); bottom row: TEST.
_countplot_row(train, rc, first_slot = 1, show_title = True)
_countplot_row(test, rc, first_slot = 3, show_title = False)

plt.figtext(0.15, 1.1, 'Distribution of lung attributes (R/C)', fontname = 'monospace', size = 30, color = 'black')
plt.figtext(1.03, 0.15, 'TEST', fontname = 'monospace', size = 25, color = 'black', rotation = 90)
plt.figtext(1.03, 0.7, 'TRAIN', fontname = 'monospace', size = 25, color = 'black', rotation = 90)

fig.tight_layout(h_pad = 10)
plt.show()

In [None]:
# One panel per (R, C) combination, each showing u_in, u_out and pressure over
# time for the first breath found with those lung attributes.
fig = plt.figure(figsize = (15, 15))
# r and c stay module-level: the pressure-distribution cell further down reads them.
r, c, plot = [5, 20, 50], [10, 20, 50], 1
for rr in r:
    for cc in c:
        # Look the breath id up by column name rather than by column position.
        br_id = train.query('R == @rr & C == @cc')['breath_id'].iloc[0]
        # Select the breath's rows once and reuse them for all three curves.
        breath = train.query("breath_id == @br_id")

        plt.subplot(3, 3, plot)
        plt.title(f'breath id = {br_id} | R = {rr} | C = {cc}', fontname = 'monospace', size = 14)
        a = sns.lineplot(data = breath, x = "time_step", y = "u_in", color = '#4b585c', linewidth = 2)
        sns.lineplot(data = breath, x = "time_step", y = "u_out", color = '#dbae58', linewidth = 2)
        sns.lineplot(data = breath, x = "time_step", y = "pressure", color = '#488a99', linewidth = 2)
        plt.ylabel('')
        # Axis label typo fixed ('time stemp' -> 'time step').
        plt.xlabel('time step', size = 14, fontname = 'monospace', labelpad = 10)
        plt.xticks(size = 12, fontname = 'monospace')
        plt.yticks(size = 12, fontname = 'monospace')

        for j in ['right', 'top']:
            a.spines[j].set_visible(False)
        for j in ['bottom', 'left']:
            a.spines[j].set_linewidth(1.2)

        plot += 1

# Figure title plus a colour-coded legend for the three curves.
plt.figtext(0.01, 1.08, 'Observations on breaths with all possible lung attributes', fontname = 'monospace', size = 30, color = 'black')
plt.figtext(0.35, 1.03, 'u_in', fontname = 'monospace', size = 27, color = '#4b585c')
plt.figtext(0.45, 1.03, 'u_out', fontname = 'monospace', size = 27, color = '#dbae58')
plt.figtext(0.55, 1.03, 'pressure', fontname = 'monospace', size = 27, color = '#488a99')
fig.tight_layout(h_pad = 3)
plt.show()

In [None]:
# Density of `pressure` on rows where both time_step and u_in are below 1e-6,
# one panel per (R, C) combination, with summary statistics printed as figure text.
# Uses the `r` and `c` value lists defined by the preceding plotting cell.
fig = plt.figure(figsize = (15, 12))
plot = 1
# Filter the full table once per (R, C) pair and keep the result, instead of
# re-running the same query for the plot and for each of the four statistics.
start_pressure = {}
for rr in r:
    for cc in c:
        start_pressure[(rr, cc)] = train.query('time_step < 0.000001 & u_in < 0.000001 & R == @rr & C == @cc')['pressure']

        plt.subplot(3, 3, plot)
        plt.title(f'R = {rr} | C = {cc}', fontname = 'monospace', size = 15, color = 'black')
        a = sns.kdeplot(start_pressure[(rr, cc)], color = '#488a99', shade = True, alpha = 1, linewidth = 1.5, edgecolor = 'black')
        plt.ylabel('')
        plt.xlabel('')
        plt.xticks(size = 12, fontname = 'monospace')
        plt.yticks([])

        for j in ['right', 'top']:
            a.spines[j].set_visible(False)
        for j in ['bottom', 'left']:
            a.spines[j].set_linewidth(1.2)

        plot += 1

# Place the statistics in figure coordinates: y steps down one grid row at a
# time, x steps right one grid column at a time.
y = 1.27
for rr in r:
    y -= 0.333
    x = -0.315
    for cc in c:
        x += 0.333
        subset = start_pressure[(rr, cc)]
        plt.figtext(x, y, f'Min: {round(subset.min(),2)}', fontname = 'monospace', color = 'black')
        plt.figtext(x, y-0.02, f'Max: {round(subset.max(),2)}', fontname = 'monospace')
        plt.figtext(x, y-0.04, f'Mean: {round(subset.mean(),2)}', fontname = 'monospace', color = 'black')
        plt.figtext(x, y-0.06, f'Median: {round(subset.median(),2)}', fontname = 'monospace', color = 'black')

plt.figtext(0.01, 1.08, 'Distribution of pressure depending on lung attributes', fontname = 'monospace', size = 30, color = 'black')

fig.tight_layout(h_pad = 3)
plt.show()

# Modeling

In [None]:
from sklearn.preprocessing import RobustScaler, normalize
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.optimizers.schedules import ExponentialDecay

In [None]:
def features(df):
    """Add per-breath engineered features derived from `u_in` and one-hot encode R and C.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `breath_id`, `time_step`, `u_in`, `R` and `C`.
        Rows of one breath are expected to be in time order (the cumulative,
        lag and window features follow row order within each `breath_id`).

    Returns
    -------
    pandas.DataFrame
        A new frame (the result of `pd.get_dummies`) holding the original
        non-categorical columns, the dummy columns for R and C, and the added
        feature columns. Some window features are NaN at the start of a breath.

    Notes
    -----
    The columns created before the one-hot step (`area`, `u_in_cumsum`, the
    lags, and the string-typed `R`/`C`) are also written onto the frame that
    was passed in.
    """
    # Running sum of time_step * u_in within each breath.
    df['area'] = df['time_step'] * df['u_in']
    df['area'] = df.groupby('breath_id')['area'].cumsum()

    df['u_in_cumsum'] = (df['u_in']).groupby(df['breath_id']).cumsum()

    # Lag within each breath: a plain Series.shift would carry the last u_in
    # values of the previous breath into the first rows of the next one.
    u_in_by_breath = df.groupby('breath_id')['u_in']
    df['u_in_lag2'] = u_in_by_breath.shift(2).fillna(0)
    df['u_in_lag4'] = u_in_by_breath.shift(4).fillna(0)

    # Cast to str so that get_dummies treats R and C as categorical.
    df['R'] = df['R'].astype(str)
    df['C'] = df['C'].astype(str)
    df = pd.get_dummies(df)

    # get_dummies returned a new frame, so regroup on it. The windowed results
    # come back indexed by (breath_id, original index); dropping level 0
    # restores the original index so the assignment aligns row by row.
    u_in_by_breath = df.groupby('breath_id')['u_in']

    ewm = u_in_by_breath.ewm(halflife=10)
    df['ewm_u_in_mean'] = ewm.mean().reset_index(level=0,drop=True)
    df['ewm_u_in_std'] = ewm.std().reset_index(level=0,drop=True)
    # NOTE(review): corr() is called without `other`, i.e. u_in against itself;
    # kept so the feature set (and column count) is unchanged.
    df['ewm_u_in_corr'] = ewm.corr().reset_index(level=0,drop=True)

    rolling = u_in_by_breath.rolling(window=10, min_periods=1)
    df['rolling_10_mean'] = rolling.mean().reset_index(level=0,drop=True)
    df['rolling_10_max'] = rolling.max().reset_index(level=0,drop=True)
    df['rolling_10_std'] = rolling.std().reset_index(level=0,drop=True)

    # expanding(2): at least two observations are required per value.
    expanding = u_in_by_breath.expanding(2)
    df['expand_mean'] = expanding.mean().reset_index(level=0,drop=True)
    df['expand_max'] = expanding.max().reset_index(level=0,drop=True)
    df['expand_std'] = expanding.std().reset_index(level=0,drop=True)

    return df

# Run the same feature engineering on both tables; features() returns a new
# frame, so the module-level names are rebound to the results.
train = features(train)
test = features(test)

In [None]:
# Replace every remaining NaN with 0 (e.g. the values the windowed statistics
# in features() leave undefined at the start of a breath).
train = train.fillna(0)
test = test.fillna(0)

In [None]:
# Pull the target out as a 2-D array with 80 values per row
# (assumes every breath spans exactly 80 consecutive rows — TODO confirm).
targets = train['pressure'].to_numpy().reshape(-1, 80)

# Remove the target and the identifier columns so only model inputs remain.
train = train.drop(columns = ['pressure', 'id', 'breath_id'])
test = test.drop(columns = ['id', 'breath_id'])

In [None]:
# Fit the scaler on the training features only, then apply the same fitted
# transform to the test features. Both names now refer to NumPy arrays.
RS = RobustScaler()
train = RS.fit_transform(train)
test = RS.transform(test)

In [None]:
# Regroup the flat rows into sequences of 80 steps: (n_sequences, 80, n_features).
# `train` is already 3-D when the second line runs, so train.shape[-1] is
# still the feature count.
train = train.reshape(-1, 80, train.shape[-1])
test = test.reshape(-1, 80, train.shape[-1])

In [None]:
# Training hyper-parameters.
EPOCH = 300
BATCH_SIZE = 1024

# Connect to the TPU and build a distribution strategy for it.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

with tpu_strategy.scope():
    # 5-fold split over the first axis of `train` (one entry per 80-step sequence).
    kf = KFold(n_splits = 5, shuffle = True, random_state = 228)
    # One flattened test prediction per fold; averaged after the loop.
    test_preds = []
    for fold, (train_idx, test_idx) in enumerate(kf.split(train, targets)):
        print('-'*15, '>', f'Fold {fold+1}', '<', '-'*15)
        X_train, X_valid = train[train_idx], train[test_idx]
        y_train, y_valid = targets[train_idx], targets[test_idx]
        # A fresh model per fold: four stacked bidirectional LSTMs that keep the
        # time axis, followed by two dense layers giving one value per time step.
        model = keras.models.Sequential([
            keras.layers.Input(shape = train.shape[-2:]),
            keras.layers.Bidirectional(keras.layers.LSTM(400, return_sequences = True)),
            keras.layers.Bidirectional(keras.layers.LSTM(300, return_sequences = True)),
            keras.layers.Bidirectional(keras.layers.LSTM(200, return_sequences = True)),
            keras.layers.Bidirectional(keras.layers.LSTM(100, return_sequences = True)),
            keras.layers.Dense(50, activation = 'selu'),
            keras.layers.Dense(1),
        ])
        model.compile(optimizer = "adam", loss = "mae")

        # decay_steps = 400 * (batches per epoch on a 0.8 training share).
        # NOTE(review): LearningRateScheduler is documented to call its schedule
        # with the epoch index, while decay_steps here is expressed in batches,
        # so the rate would decay far more slowly than that figure suggests —
        # confirm whether the schedule was meant to go to the optimizer instead.
        scheduler = ExponentialDecay(1e-3, 400*((len(train)*0.8)/BATCH_SIZE), 1e-5)
        lr = LearningRateScheduler(scheduler, verbose = 1)

        model.fit(X_train, y_train, validation_data = (X_valid, y_valid), epochs = EPOCH, batch_size = BATCH_SIZE, callbacks = [lr])

        # Flatten the per-sequence predictions to a single 1-D vector for this fold.
        test_preds.append(model.predict(test).squeeze().reshape(-1, 1).squeeze())

In [None]:
# Average the per-fold test predictions. Dividing by the number of collected
# predictions (rather than a literal 5) keeps the mean correct if the number
# of folds is ever changed.
ss['pressure'] = sum(test_preds) / len(test_preds)
# Write the submission file without the index column.
ss.to_csv('lstm.csv', index = False)