# Hybrid Model CNN + ENC_DEC LSTM/GRU

**This model produces ~0.161 MAE loss on the test dataset. The optimization could be investigated much further to reduce overfitting. Training was performed on an external GPU-supported machine.**

In [None]:
#basic imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.callbacks import LearningRateScheduler, ReduceLROnPlateau
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from sklearn.metrics import mean_absolute_error as mae
from sklearn.preprocessing import RobustScaler, normalize
from sklearn.model_selection import train_test_split, GroupKFold, KFold
from tqdm import tqdm
from IPython.display import display
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

In [None]:
#load the train, test and sample submission datasets
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/ventilator-pressure-prediction/train.csv',
        '../input/ventilator-pressure-prediction/test.csv',
        '../input/ventilator-pressure-prediction/sample_submission.csv',
    )
)

#sorted distinct pressure values of the training set: the first/last entries
#give the range, the gap between the two smallest values gives the step
all_pressure = np.sort(train['pressure'].unique())
PRESSURE_MIN, PRESSURE_MAX = all_pressure[0].item(), all_pressure[-1].item()
PRESSURE_STEP = (all_pressure[1] - all_pressure[0]).item()

In [None]:
#show the first rows of the training data
train.head()

# Feature Engineering
The feature engineering process is based almost literally on two public notebooks, with a few small modifications. Due to a pandas 1.15.0 bug I had to turn off the **ewm_u_in_mean** feature. There is an option to use continuous representations of the **R** and **C** values - **if flag == True** (the same flag also turns **ewm_u_in_mean** back on).

***Notebooks:***
* https://www.kaggle.com/dlaststark/gb-vpp-whoppity-dub-dub
* https://www.kaggle.com/cdeotte/ensemble-folds-with-median-0-153

In [None]:
def add_features(df, deg, flag):
    '''
    Build the per-row feature set from the raw ventilator columns.

    df - dataframe to process; it must contain breath_id, time_step, u_in,
         u_out, R and C. The columns created before the first fillna are
         written into the passed frame itself, so always continue with the
         returned frame.
    deg - exclusive upper bound of the lag range: lags 1 .. deg-1 are built.
          Must be >= 3, because the lag-back difference features below need
          lags 1 and 2.
    flag - if truthy, add ewm_u_in_mean and the continuous R, C params
           (RCmul, CRdiv, Ruin, Cuin)

    Returns the frame after pd.get_dummies, with remaining NaNs set to 0.
    The shape of that frame is printed as a side effect.

    NOTE: breath_id__u_in__mean is now a real per-breath mean (it used to be
    mis-aligned and mostly zero). Models trained on the old feature values
    should be retrained.
    '''
    df['area'] = df['time_step'] * df['u_in']
    df['area'] = df.groupby('breath_id')['area'].cumsum()
    df['cross'] = df['u_in'] * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']
    df['time_step_cumsum'] = df.groupby('breath_id')['time_step'].cumsum()
    df['u_in_cumsum'] = df.groupby('breath_id')['u_in'].cumsum()

    # Backward (lag) and forward (lag_back) shifts inside each breath.
    for i in range(1, deg):
        df['u_in_lag{}'.format(i)] = df.groupby('breath_id')['u_in'].shift(i)
        df['u_out_lag{}'.format(i)] = df.groupby('breath_id')['u_out'].shift(i)
        df['u_in_lag_back{}'.format(i)] = df.groupby('breath_id')['u_in'].shift(-i)
        df['u_out_lag_back{}'.format(i)] = df.groupby('breath_id')['u_out'].shift(-i)

    # Shifts leave NaN at the breath borders; from here on df is a new frame.
    df = df.fillna(0)

    if flag:
        # has to be turned off with flag=False due to pandas 1.15.0 bug
        df['ewm_u_in_mean'] = (df
                               .groupby('breath_id')['u_in']
                               .ewm(halflife=9)
                               .mean()
                               .reset_index(level=0, drop=True))

    # Rolling statistics over the last 15 rows of each breath. Dropping the
    # group level restores the original row index so assignment aligns.
    rolling_u_in = (df
                    .groupby('breath_id')['u_in']
                    .rolling(window=15, min_periods=1))
    df['15_in_sum'] = rolling_u_in.sum().reset_index(level=0, drop=True)
    df['15_in_min'] = rolling_u_in.min().reset_index(level=0, drop=True)
    df['15_in_max'] = rolling_u_in.max().reset_index(level=0, drop=True)
    df['15_in_mean'] = rolling_u_in.mean().reset_index(level=0, drop=True)

    # transform() broadcasts the per-breath statistic back onto every row.
    # A plain .mean() is indexed by breath_id and would not align with rows.
    df['breath_id__u_in__max'] = df.groupby('breath_id')['u_in'].transform('max')
    df['breath_id__u_in__mean'] = df.groupby('breath_id')['u_in'].transform('mean')
    #df['breath_id__u_in__min'] = df.groupby(['breath_id'])['u_in'].transform('min')

    for i in range(1, deg):
        df['u_in_diff{}'.format(i)] = df['u_in'] - df['u_in_lag{}'.format(i)]
        df['u_out_diff{}'.format(i)] = df['u_out'] - df['u_out_lag{}'.format(i)]

    df['breath_id__u_in__diffmax'] = df['breath_id__u_in__max'] - df['u_in']
    df['breath_id__u_in__diffmean'] = df['breath_id__u_in__mean'] - df['u_in']

    # Running row count per breath, used for the cumulative mean.
    df['one'] = 1
    df['count'] = df['one'].groupby(df['breath_id']).cumsum()
    df['u_in_cummean'] = df['u_in_cumsum'] / df['count']

    # Global (not grouped) shifts, zeroed where the shifted row belongs to a
    # different breath.
    df['breath_id_lag'] = df['breath_id'].shift(1).fillna(0)
    df['breath_id_lag2'] = df['breath_id'].shift(2).fillna(0)
    df['breath_id_lagsame'] = np.select([df['breath_id_lag'] == df['breath_id']], [1], 0)
    df['breath_id_lag2same'] = np.select([df['breath_id_lag2'] == df['breath_id']], [1], 0)
    df['breath_id__u_in_lag'] = df['u_in'].shift(1).fillna(0)
    df['breath_id__u_in_lag'] = df['breath_id__u_in_lag'] * df['breath_id_lagsame']
    df['breath_id__u_in_lag2'] = df['u_in'].shift(2).fillna(0)
    df['breath_id__u_in_lag2'] = df['breath_id__u_in_lag2'] * df['breath_id_lag2same']

    df['time_step_diff'] = df.groupby('breath_id')['time_step'].diff().fillna(0)

    # These four need lag_back1 and lag_back2, hence deg >= 3.
    df['u_in_lagback_diff1'] = df['u_in'] - df['u_in_lag_back1']
    df['u_out_lagback_diff1'] = df['u_out'] - df['u_out_lag_back1']
    df['u_in_lagback_diff2'] = df['u_in'] - df['u_in_lag_back2']
    df['u_out_lagback_diff2'] = df['u_out'] - df['u_out_lag_back2']

    # R and C become strings so get_dummies treats them as categories.
    df['R'] = df['R'].astype(str)
    df['C'] = df['C'].astype(str)
    df['R__C'] = df['R'] + '__' + df['C']

    if flag:
        r_values = df['R'].astype(float)
        c_values = df['C'].astype(float)
        u_in_values = df['u_in'].astype(float)
        df['RCmul'] = r_values * c_values
        df['CRdiv'] = c_values / r_values
        df['Ruin'] = r_values * u_in_values
        df['Cuin'] = c_values * u_in_values

    df = pd.get_dummies(df)
    print(df.shape)

    df = df.fillna(0)
    return df

In [None]:
#run the feature engineering on both datasets: lags 1..4 (deg=5) and
#flag=False, i.e. without the flag-gated extra features
import gc

train = add_features(train, deg=5, flag=False)
test = add_features(test, deg=5, flag=False)

#free the intermediate frames left behind by the feature engineering
gc.collect()

In [None]:
#check the column names of the engineered and encoded training frame
train.columns

# Scaling and Sample Weights Mechanism
The code below prepares the final form of the model input data by dropping unused columns and scaling with a Robust Scaler. It also includes a simple implementation of the sample-weights idea proposed by **Chris Deotte** in the discussion linked below (turned off here by setting the **do_sample_weights** flag to **False**).

https://www.kaggle.com/c/ventilator-pressure-prediction/discussion/278360

In [None]:
#set targets: the pressure column regrouped into rows of 80 values
targets = train['pressure'].values.reshape(-1, 80)

#identifier and helper columns that must not be fed to the model
helper_columns = ['id', 'breath_id', 'one', 'count',
                  'breath_id_lag', 'breath_id_lag2',
                  'breath_id_lagsame', 'breath_id_lag2same']

#train additionally loses its target column; test has no pressure column to drop
train.drop(['pressure'] + helper_columns, axis=1, inplace=True)
test = test.drop(helper_columns, axis=1)

print(train.shape, test.shape, targets.shape)

In [None]:
### SAMPLE WEIGHTS ###
def get_sample_weight_param(train, targets, u_out_1_weight):
    cols = train.columns.tolist()
    u_out_index = cols.index("u_out")
    cols = cols[u_out_index]
    x_train = train[[cols]].values.reshape((-1, 80, len([cols])))

    # GET SAMPLE WEIGHT
    U_OUT_IDX = cols.index("u_out")
    y_weight = np.ones_like(targets)
    u_out_values = x_train[:,:,U_OUT_IDX]

    #DEFINE U_out == 1 samples weights, if 1 => sampling is turned off
    y_weight[u_out_values==1] = u_out_1_weight
    del x_train
    return y_weight

#set True to do sample weighting during training
do_sample_weights = False

#the weight array is only built when weighting is switched on;
#steps with u_out == 1 then get weight 0.1
if do_sample_weights:
    y_weight = get_sample_weight_param(train, targets, 0.1)

In [None]:
#Scale Data by Robust Scaler: fitted on the training features only,
#then applied unchanged to both datasets
RS = RobustScaler().fit(train)
train = RS.transform(train)
test = RS.transform(test)

#regroup the flat rows into blocks of 80 consecutive rows
#(presumably one breath each - same grouping as used for targets)
n_features = train.shape[-1]
train = train.reshape(-1, 80, n_features)
test = test.reshape(-1, 80, n_features)

In [None]:
#check final dimensions of the scaled inputs and the targets
print(train.shape, test.shape, targets.shape)

# Model Definition

This model is a modification of the Enc/Dec LSTM and GRU model by **DLASTSTARK**. I added an independent CNN/LSTM branch which is then concatenated with the ENC/DEC model output. The training and testing process also includes saving partial results to csv files and the sample weight mechanism defined before.

In [None]:
import time
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.layers import Concatenate, Add, GRU, Conv1D
from tensorflow.keras.layers import Bidirectional, LSTM
from tensorflow.keras.models import Model, load_model
from keras.models import Model

def hybrid_model():
    """Assemble the (uncompiled) hybrid CNN + encoder/decoder LSTM/GRU model.

    The input shape is read from the last two dimensions of the module-level
    `train` array. Three branches are built on that input: a stack of
    bidirectional LSTMs, a pair of bidirectional GRUs fed from the LSTM
    stack, and a Conv1D layer followed by two bidirectional LSTMs. Their
    outputs are concatenated and reduced to one value per step.

    Returns a keras Model named 'Hybrid_Model'.
    """

    def bi_lstm(units):
        # sequence-returning bidirectional LSTM
        return Bidirectional(LSTM(units=units, return_sequences=True))

    def bi_gru(units):
        # sequence-returning bidirectional GRU
        return Bidirectional(GRU(units=units, return_sequences=True))

    inputs = Input(shape=(train.shape[-2:]))

    # stacked bidirectional LSTM branch
    lstm_768 = bi_lstm(768)(inputs)
    lstm_512 = bi_lstm(512)(lstm_768)
    lstm_256 = bi_lstm(256)(lstm_512)

    # GRU branch: the second GRU consumes the sum of the last LSTM output
    # and the first GRU output
    gru_256 = bi_gru(256)(lstm_512)
    gru_128_layer = bi_gru(128)
    gru_128 = gru_128_layer(Add()([lstm_256, gru_256]))

    # independent convolutional branch on the raw input
    conv = Conv1D(128, kernel_size=15, padding='same', activation='relu')(inputs)
    conv_lstm_128 = bi_lstm(128)(conv)
    conv_lstm_64 = bi_lstm(64)(conv_lstm_128)

    # merge the recurrent branches first, then append the CNN branch
    merged = Concatenate(axis=2)([lstm_256, gru_256, gru_128])
    merged = bi_lstm(256)(merged)
    merged = Concatenate(axis=2)([merged, conv_lstm_64])

    # regression head: one output per step
    head = Dense(units=196, activation='selu')(merged)
    head = Dropout(0.01)(head)
    outputs = Dense(units=1)(head)

    return Model(inputs=inputs, outputs=outputs, name='Hybrid_Model')

In [None]:
#build one instance of the hybrid model and print its layer summary
model = hybrid_model()
model.summary()

In [None]:
#plot the structure of the defined model (with tensor shapes and layer
#names) and write the picture to Hybrid_Model.png
plot_model(
    model, 
    to_file='Hybrid_Model.png', 
    show_shapes=True,
    show_layer_names=True
)

In [None]:
#GPU training process
EPOCH = 300
BATCH_SIZE = 512
NUM_FOLDS = 5
#False -> skip training and load the stored fold checkpoints instead
TRAIN_MODEL = False
gpu_strategy = tf.distribute.get_strategy()

if TRAIN_MODEL:
    #output folders for the checkpoints and the per-fold csv logs;
    #exist_ok makes re-running the cell safe
    os.makedirs('./lstm_models/', exist_ok=True)
    os.makedirs('./logs/', exist_ok=True)

with gpu_strategy.scope():
    kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=2021)
    #one flat prediction vector per fold, ensembled later
    test_preds = []
    #NOTE: test_idx holds the validation rows of the fold, not the test set
    for fold, (train_idx, test_idx) in enumerate(kf.split(train, targets)):
        K.clear_session()
        print("{} - Starting {} fold...".format(time.strftime("%Y-%m-%d %H:%M:%S"), fold))
        if TRAIN_MODEL:
            X_train, X_valid = train[train_idx], train[test_idx]
            y_train, y_valid = targets[train_idx], targets[test_idx]

            if do_sample_weights:
                y_w_train, y_w_valid = y_weight[train_idx], y_weight[test_idx]
                #add a trailing axis of size 1 to the weight arrays
                y_w_train = y_w_train.reshape(y_w_train.shape[0], y_w_train.shape[1], 1)
                y_w_valid = y_w_valid.reshape(y_w_valid.shape[0], y_w_valid.shape[1], 1)
                print(y_train.shape, y_w_train.shape)

            checkpoint_filepath = './lstm_models/hybrid_folds_{}.hdf5'.format(fold)
            csv_name = './logs/hybrid_folds_{}.csv'.format(fold)

            model = hybrid_model()
            model.compile(optimizer="adam",
                          #sample_weight_mode="temporal",
                          loss="mae")

            lr = ReduceLROnPlateau(monitor="val_loss", factor=0.75, patience=10, verbose=1)
            es = EarlyStopping(monitor="val_loss", patience=50, verbose=1, mode="min", restore_best_weights=True)
            csv = keras.callbacks.CSVLogger(csv_name, separator=",", append=False)
            sv = keras.callbacks.ModelCheckpoint(
                checkpoint_filepath, monitor='val_loss', verbose=1, save_best_only=True,
                save_weights_only=False, mode='auto', save_freq='epoch',
                options=None
            )

            if do_sample_weights:
                val_data = (X_valid, y_valid, y_w_valid)
                sw = y_w_train
            else:
                val_data = (X_valid, y_valid)
                sw = None

            model.fit(X_train, y_train,
                      validation_data=val_data,
                      sample_weight=sw,
                      epochs=EPOCH,
                      batch_size=BATCH_SIZE,
                      callbacks=[lr, es, sv, csv])

            #Predict with the best checkpoint, like the inference branch does.
            #Depending on the Keras version, restore_best_weights may only
            #take effect when training is stopped early, which would leave
            #the last-epoch weights in the model after a full run.
            if os.path.isfile(checkpoint_filepath):
                model.load_weights(checkpoint_filepath)

            del X_train, X_valid
        else:
            checkpoint_filepath = '../input/hybrid-data/hybrid_folds_{}.hdf5'.format(fold)
            model = keras.models.load_model(checkpoint_filepath)

        #flatten the fold prediction to one value per test row
        fold_pred = model.predict(test, batch_size=BATCH_SIZE, verbose=2)
        test_preds.append(fold_pred.reshape(-1))
        del model
        gc.collect()
        

# Results

In [None]:
from matplotlib.pyplot import figure
from os import listdir
from os.path import isfile, join

In [None]:
def print_results(file):
    """Plot the logged training curves of one fold.

    file - path to a csv log with the columns epoch, val_loss, loss and lr;
           the text after the last underscore of the file name (without
           extension) is shown as the fold id in the title

    Prints the minimum val_loss and shows a figure with val_loss, loss and
    lr * 100 plotted against the epoch.
    """
    df = pd.read_csv(file)
    x = df.epoch.values
    y1 = df.val_loss.values
    y2 = df.loss.values
    #lr is scaled by 100 for plotting, as stated in its legend label
    y3 = df.lr.values * 100
    print('Min value of val_loss is equal {}'.format(df.val_loss.min()))

    plt.figure(figsize=(10, 8))

    plt.plot(x, y1, color='g', linestyle='dashed',
             marker='o', label='val_loss')

    plt.plot(x, y2, color='r', linestyle='dashed',
             marker='o', label="loss")

    plt.plot(x, y3, color='b', linestyle='dashed',
             marker='o', label="lr * 100")

    plt.ylim(0, max(y1))
    plt.xticks(rotation=25)
    plt.yticks(np.arange(0, max(y1) + 0.1, 0.05))
    plt.xlabel('epoch')
    plt.ylabel('value')

    #Take the fold id from the file name itself. Splitting the whole path on
    #'.' depends on how many dots the directory part has and returned the
    #last letter of the extension for paths such as './logs/name_0.csv'.
    stem = os.path.splitext(os.path.basename(file))[0]
    fold_label = stem.rsplit('_', 1)[-1]
    plt.title('Training on fold {}'.format(fold_label), fontsize=10)

    plt.grid()
    plt.legend()
    plt.show()

In [None]:
#get logs files and print graphic results per fold
if TRAIN_MODEL:
    path = './logs/'
else:
    path = '../input/hybrid-data/'

#Keep only the csv logs. The stored submission file is filtered out instead
#of list.remove()-d, because remove() raises ValueError when the file is not
#in the folder (it never is for a fresh ./logs/ directory).
csvs = sorted(
    f for f in listdir(path)
    if isfile(join(path, f))
    and f.endswith('.csv')
    and f != 'submission_median_hybrid.csv'
)

for c in csvs:
    print('-----------------------------------------------')
    file = join(path, c)
    print_results(file)

In [None]:
# ENSEMBLE FOLDS WITH MEDIAN
#one row per fold, one column per test row; the median is taken across folds
fold_predictions = np.vstack(test_preds)
submission["pressure"] = np.median(fold_predictions, axis=0)
submission.to_csv('submission.csv', index=False)