## Config

In [None]:
class Config:
    """Central place for the experiment's tunable settings.

    All values are plain class attributes and are read as ``Config.<name>``.
    """

    # --- run identity / mode ---
    name_v1 = "lstm"
    debug = False            # when True the data-loading cell subsamples breaths
    seeds = [2021]           # first entry seeds numpy / tensorflow and the CV split

    # --- data ---
    target_col = "pressure"

    # --- cross validation ---
    n_fold = 3
    cv_shuffel = False       # spelling kept as-is: other cells read this exact name

    # --- model / optimisation ---
    n_a = 128                # hidden-state size of each LSTM cell
    BATCH_SIZE = 512
    EPOCHS = 300

## Library

In [None]:
import os
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn import model_selection
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import RobustScaler

from lightgbm import LGBMModel
from matplotlib_venn import venn2
from tqdm import tqdm

In [None]:
# import sys
# import importlib
# sys.path.append('../../useful_modules')

# from logger_m import Logger
# import cv_m
# from reduce_memory_m import reduce_mem_usage
# from utilities_m import Util

# Useful modules

In [None]:
import os
import logging
import datetime

class Logger:
    """Write timestamped INFO messages to the console and to ``Experiment.log``.

    The underlying ``logging`` logger is looked up by ``path``, so creating
    several ``Logger`` objects for the same path shares one logger and does
    not duplicate its handlers.
    """

    def __init__(self, path):
        """Attach console and file handlers to the logger named ``path``.

        Args:
            path: Directory that receives ``Experiment.log``; it is also used
                as the logger name.
        """
        self.general_logger = logging.getLogger(path)
        # Build the handlers only when the logger has none yet. Constructing a
        # FileHandler opens the log file right away, so creating one that is
        # never attached would leak an open file handle on every repeated
        # instantiation (e.g. each time the notebook cell is re-run).
        if not self.general_logger.handlers:
            self.general_logger.addHandler(logging.StreamHandler())
            self.general_logger.addHandler(
                logging.FileHandler(os.path.join(path, 'Experiment.log'))
            )
            self.general_logger.setLevel(logging.INFO)

    def info(self, message):
        """Log ``message`` at INFO level, prefixed with the current time."""
        self.general_logger.info('[{}] - {}'.format(self.now_string(), message))

    @staticmethod
    def now_string():
        """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

In [None]:
import numpy as np
import pandas as pd

def reduce_mem_usage(df, verbose=True):
    """Return a frame whose numeric columns use the narrowest dtype that fits.

    Integer columns are cast to the first of int8/int16/int32/int64 whose
    range contains the column's min and max; float columns to the first of
    float16/float32 that does, otherwise float64. Columns whose dtype is not
    one of int16/int32/int64/float16/float32/float64 are passed through
    unchanged. The input frame itself is not modified.

    NOTE: float16 keeps only about three significant decimal digits, so
    downcast float columns can lose precision.

    Args:
        df: Input ``pandas.DataFrame``.
        verbose: When True, print the resulting memory usage and the
            percentage saved.

    Returns:
        A new ``DataFrame`` with the same columns, order and index as ``df``.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    converted = []
    for col in df.columns:
        series = df[col]
        if series.dtypes not in numerics:
            converted.append(series)
            continue

        c_min = series.min()
        c_max = series.max()
        # Bounds are inclusive: with strict comparisons a column touching the
        # int64 limits matched no branch at all and was silently dropped.
        if str(series.dtypes)[:3] == 'int':
            # ``None`` (nothing fits, e.g. min/max are NaN for an empty
            # column) means "keep the column as it is" instead of losing it.
            target = next(
                (t for t in int_types
                 if np.iinfo(t).min <= c_min and c_max <= np.iinfo(t).max),
                None,
            )
        else:
            target = next(
                (t for t in float_types
                 if np.finfo(t).min <= c_min and c_max <= np.finfo(t).max),
                np.float64,
            )
        converted.append(series if target is None else series.astype(target))

    # pd.concat raises on an empty list, so a frame without columns is copied.
    df_out = pd.concat(converted, axis=1) if converted else df.copy()
    if verbose:
        end_mem = df_out.memory_usage().sum() / 1024**2
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(f'Mem. usage decreased to {end_mem:.2f}Mb:  {reduction:.1f}% reduction')
    return df_out

In [None]:
import joblib

class Util:
    """Thin joblib wrapper for persisting and restoring Python objects."""

    @classmethod
    def load(cls, path):
        """Read back and return the object stored at ``path``."""
        restored = joblib.load(path)
        return restored

    @classmethod
    def dump(cls, value, path):
        """Serialize ``value`` to ``path`` with joblib compression enabled."""
        joblib.dump(value, path, compress=True)

In [None]:
from sklearn import model_selection
import numpy as np

class GroupKFold_:
    """Group-aware K-fold splitter with optional shuffling (sklearn-like API).

    The unique group ids are split with ``sklearn.model_selection.KFold``;
    every row then follows its group, so a group never appears on both sides
    of a fold.
    """

    def __init__(self, n_splits=4, shuffle=True, random_state=42):
        """Store the split settings.

        Args:
            n_splits: Number of folds.
            shuffle: Whether the unique group ids are shuffled before splitting.
            random_state: Seed for the shuffle; ignored when ``shuffle`` is off.
        """
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state

    def get_n_splits(self, X=None, y=None, group=None):
        """Return the number of folds (arguments exist for API compatibility)."""
        return self.n_splits

    def split(self, X=None, y=None, group=None):
        """Yield ``(train_idx, val_idx)`` arrays of positional row indices.

        Args:
            X: Unused; accepted for API compatibility.
            y: Unused; accepted for API compatibility.
            group: Group id of every row. A pandas Series is used as-is; any
                other array-like is wrapped in a Series.

        Raises:
            ValueError: If ``group`` is None. Because this is a generator, the
                error surfaces when iteration starts.
        """
        if group is None:
            raise ValueError("The 'group' parameter should not be None.")
        if not hasattr(group, "isin"):
            group = pd.Series(np.asarray(group))

        # KFold only accepts a seed together with shuffle=True. Use a local
        # value instead of overwriting self.random_state, so the configured
        # seed survives a call made with shuffling disabled.
        seed = self.random_state if self.shuffle else None
        kf = model_selection.KFold(n_splits=self.n_splits, shuffle=self.shuffle, random_state=seed)

        unique_ids = group.unique()
        for tr_group_idx, va_group_idx in kf.split(unique_ids):
            # Map the fold's group ids back to the rows that carry them.
            tr_group, va_group = unique_ids[tr_group_idx], unique_ids[va_group_idx]
            train_idx = np.where(group.isin(tr_group))[0]
            val_idx = np.where(group.isin(va_group))[0]
            yield train_idx, val_idx

## SetUp

In [None]:
# Locations: competition input data and the experiment's output folders.
INPUT = "../input/ventilator-pressure-prediction/"
EXP = "./"
EXP_MODEL, EXP_FIG, EXP_PREDS = (
    os.path.join(EXP, sub) for sub in ("model", "fig", "preds")
)

# Create the output folders up front; existing folders are left alone.
for out_dir in (EXP_MODEL, EXP_FIG, EXP_PREDS):
    os.makedirs(out_dir, exist_ok=True)

# Notebook-wide utilities: experiment log, silenced warnings, seaborn theme.
logger = Logger(EXP)
warnings.filterwarnings("ignore")
sns.set(style='whitegrid')

## Load Data

In [None]:
# Read the three competition tables from the input folder.
train, test, sample_submission = (
    pd.read_csv(os.path.join(INPUT, fname))
    for fname in ("train.csv", "test.csv", "sample_submission.csv")
)

if Config.debug:
    # Debug mode: keep only rows of randomly drawn breaths. np.random.choice
    # samples with replacement by default, so at most 100 distinct breaths
    # remain per table.
    np.random.seed(Config.seeds[0])
    train_ids = np.random.choice(train["breath_id"].unique(), 100)
    train = train[train["breath_id"].isin(train_ids)].reset_index(drop=True)
    test_ids = np.random.choice(test["breath_id"].unique(), 100)
    test = test[test["breath_id"].isin(test_ids)].reset_index(drop=True)
    # Keep the submission template aligned with the reduced test set.
    kept_rows = sample_submission["id"].isin(test["id"].tolist())
    sample_submission = sample_submission[kept_rows].reset_index(drop=True)

print(f"Train shape: {train.shape}, test shape: {test.shape}, submission shape: {sample_submission.shape}")

## Feature Engineering

In [None]:
def add_features(df):
    """Return a copy of ``df`` extended with per-breath engineered features.

    Reads the columns ``breath_id``, ``time_step``, ``u_in``, ``u_out``, ``R``
    and ``C``. The lag features shift the whole frame by one or two rows and
    zero out values that would come from a different breath, so they assume
    the rows of a breath are stored contiguously and in time order.

    Added columns:
        area: per-breath cumulative sum of ``time_step * u_in``.
        cross, cross2: ``u_in * u_out`` and ``time_step * u_out``.
        u_in_cumsum, count, u_in_cummean: running sum, running row count and
            running mean of ``u_in`` within a breath (``one`` is a helper).
        breath_id_lag*, u_in_lag, u_in_lag2, u_out_lag2: values one / two rows
            earlier, set to 0 when that row belongs to another breath.
        one-hot columns for ``R``, ``C`` and their concatenation ``RC`` (the
            three source columns are replaced by ``pd.get_dummies``).

    Args:
        df: Input ``pandas.DataFrame``; it is left unmodified.

    Returns:
        A new ``DataFrame`` with the features above.
    """
    # Work on a copy: without it the new columns and the str conversion of
    # R / C would leak into the caller's frame. This costs one extra copy of
    # the input in memory.
    df = df.copy()

    df['area'] = df['time_step'] * df['u_in']
    df['area'] = df.groupby('breath_id')['area'].cumsum()
    df['cross'] = df['u_in'] * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']

    df['u_in_cumsum'] = df['u_in'].groupby(df['breath_id']).cumsum()
    df['one'] = 1
    df['count'] = df['one'].groupby(df['breath_id']).cumsum()
    df['u_in_cummean'] = df['u_in_cumsum'] / df['count']

    # Flags telling whether the row one / two positions earlier belongs to the
    # same breath. The missing leading rows are filled with 0.
    # NOTE(review): this presumes no breath has id 0 - confirm against the data.
    df['breath_id_lag'] = df['breath_id'].shift(1).fillna(0)
    df['breath_id_lag2'] = df['breath_id'].shift(2).fillna(0)
    df['breath_id_lagsame'] = np.select([df['breath_id_lag'] == df['breath_id']], [1], 0)
    df['breath_id_lag2same'] = np.select([df['breath_id_lag2'] == df['breath_id']], [1], 0)

    # Lagged signals, masked so that values never cross a breath boundary.
    df['u_in_lag'] = df['u_in'].shift(1).fillna(0)
    df['u_in_lag'] = df['u_in_lag'] * df['breath_id_lagsame']
    df['u_in_lag2'] = df['u_in'].shift(2).fillna(0)
    df['u_in_lag2'] = df['u_in_lag2'] * df['breath_id_lag2same']
    df['u_out_lag2'] = df['u_out'].shift(2).fillna(0)
    df['u_out_lag2'] = df['u_out_lag2'] * df['breath_id_lag2same']

    # Treat R, C and their combination as categories and one-hot encode them.
    df['R'] = df['R'].astype(str)
    df['C'] = df['C'].astype(str)
    df['RC'] = df['R'] + df['C']
    df = pd.get_dummies(df)
    return df

## Funcs

In [None]:
def gkf(X, group, n_splits, random_state, shuffle):
    """Materialize the group-wise K-fold splits as a list.

    Builds a ``GroupKFold_`` splitter from the given settings and returns all
    ``(train_idx, val_idx)`` pairs it yields for ``group``.
    """
    splitter = GroupKFold_(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
    folds = splitter.split(X, group=group)
    return list(folds)

## Main

In [None]:
# preprocess: engineer features, then shrink dtypes to save memory
print("# ============= # Preprocess # ============= #")
train_x = add_features(train)
test_x = add_features(test)
train_x = reduce_mem_usage(train_x)
test_x = reduce_mem_usage(test_x)
train_y = train_x[['pressure']]

# Target, identifiers and intermediate helper columns are not model inputs.
non_feature_cols = {
    'pressure', 'id', 'breath_id', 'one', 'count',
    'breath_id_lag', 'breath_id_lag2',
    'breath_id_lagsame', 'breath_id_lag2same',
    'u_out_lag2',
}
features = [col for col in train_x.columns if col not in non_feature_cols]

In [None]:
print("# ============= # RobustScaler # ============= #")
# Fit the scaler on the training features only, then apply it to both sets
# and restore the column labels that the scaler's array output drops.
RS = RobustScaler()
train_x = pd.DataFrame(data=RS.fit_transform(train_x[features]), columns=features)
test_x = pd.DataFrame(data=RS.transform(test_x[features]), columns=features)
print(train_x.shape, test_x.shape, sep="\n")

In [None]:
# cv_index = [folds, (train/evaluate), index]
print("# ============= # Cross validation # ============= #")
cv_index = gkf(
    train,
    group=train["breath_id"],
    n_splits=Config.n_fold,
    random_state=Config.seeds[0],
    shuffle=Config.cv_shuffel,
)
# The train and validation index arrays of a fold generally differ in length,
# so np.shape(cv_index) would have to build a ragged array, which raises
# ValueError on NumPy >= 1.24. Report the (folds, parts-per-fold) shape
# directly from the list instead.
print(f"Cv list shape: {(len(cv_index), len(cv_index[0]))}")
print(f"Train len: {len(cv_index[0][0])}")
print(f"Evaluation len: {len(cv_index[0][1])}")

## Building model

In [None]:
import gc
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.callbacks import LearningRateScheduler, ReduceLROnPlateau

from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import RobustScaler, normalize
from sklearn.model_selection import train_test_split, GroupKFold, KFold

In [None]:
def create_model(strategy, tr_x, n_a):
    """Build and compile the bidirectional-LSTM regression network.

    Args:
        strategy: Not used by this function; kept in the signature for callers.
        tr_x: Training array; its last two dimensions define the input shape.
        n_a: Number of units of the LSTM and of the hidden Dense layer.

    Returns:
        A compiled Keras ``Sequential`` model (Adam, learning rate 0.01, MAE loss).
    """
    seed = Config.seeds[0]
    np.random.seed(seed)
    tf.random.set_seed(seed)

    layers = tf.keras.layers
    sequence_input = layers.Input(shape=tr_x.shape[-2:])
    # return_sequences=True: emit the hidden state at every time step.
    recurrent = layers.Bidirectional(layers.LSTM(n_a, return_sequences=True))
    hidden = layers.Dense(n_a, activation='gelu')
    head = layers.Dense(1)

    net = Sequential([sequence_input, recurrent, hidden, head])

    # The original author left the alternative learning-rate value 0.002 as a note.
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
    net.compile(optimizer=optimizer, loss='mae')
    return net

def plot_hist(hist):
    """Plot the training and validation loss curves stored in ``hist.history``."""
    for curve in ("loss", "val_loss"):
        plt.plot(hist.history[curve])
    plt.title("Model Performance")
    plt.ylabel("MAE")
    plt.xlabel("epoch")
    plt.legend(["train", "validation"], loc="upper left")
    plt.show()

In [None]:
# # Detect hardware, return appropriate distribution strategy
# print(tf.version.VERSION)
# try: # detect TPU
#     tpu = None
#     tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
#     tf.config.experimental_connect_to_cluster(tpu)
#     tf.tpu.experimental.initialize_tpu_system(tpu)
#     strategy = tf.distribute.experimental.TPUStrategy(tpu)
# except ValueError: # detect GPU(s) and enable mixed precision
#     strategy = tf.distribute.MirroredStrategy() # works on GPU and multi-GPU
#     policy = tf.keras.mixed_precision.experimental.Policy('mixed_float16')
#     tf.config.optimizer.set_jit(True) # XLA compilation
#     tf.keras.mixed_precision.experimental.set_policy(policy)
#     print('Mixed precision enabled')

# print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
%%time
# Train one model per CV fold, score it on the held-out fold and collect its
# predictions for the test set.
Tx = 80  # time steps per sequence; rows are regrouped into blocks of this size
n_a = Config.n_a  # number of dimensions for the hidden state of each LSTM cell.
test_preds = []
models = []
# NOTE: this rebinds test_x from a DataFrame to a 3-D array, so the cell
# cannot be re-run without re-running the scaling cell first.
test_x = test_x.values.reshape(-1, Tx, test_x[features].shape[-1])

for fold, (train_idx, eval_idx) in enumerate(cv_index):

    K.clear_session()
    print(f"\nFOLD: {fold}")

    # train
    tr_x = train_x.iloc[train_idx]
    tr_y = train["pressure"].iloc[train_idx]
    # evaluating
    vl_x = train_x.iloc[eval_idx]
    vl_y = train["pressure"].iloc[eval_idx]

    # reshape for lstm: (sequences, time steps, features) / (sequences, time steps)
    tr_x = tr_x.values.reshape(-1, Tx, tr_x.shape[-1])
    tr_y = tr_y.values.reshape(-1, Tx)
    vl_x = vl_x.values.reshape(-1, Tx, vl_x.shape[-1])
    vl_y = vl_y.values.reshape(-1, Tx)

    # callbacks
    # NOTE: every fold writes to the same checkpoint path, so a later fold
    # overwrites the file saved by an earlier one.
    checkpoint_filepath = "./checkpoint/checkpoint_sv.hdf5"
    lr = ReduceLROnPlateau(monitor="val_loss", factor=0.55, patience=8, verbose=0)
    es = EarlyStopping(monitor="val_loss", patience=50, verbose=0, mode="min", restore_best_weights=True)
    sv = keras.callbacks.ModelCheckpoint(
        checkpoint_filepath,
        monitor='val_loss',
        verbose=0,
        save_best_only=True,
        save_weights_only=False,
        mode='auto',
        save_freq='epoch',
    )

    # model creation
    model = create_model(None, tr_x, n_a)

    # model train
    history = model.fit(
        tr_x, tr_y,
        validation_data=(vl_x, vl_y),
        epochs=Config.EPOCHS,
        batch_size=Config.BATCH_SIZE,
        verbose="auto",
        callbacks=[lr, es, sv],
    )

    # out-of-fold score; report the real fold number (the original printed a
    # constant 1 for every fold)
    y_true = vl_y.squeeze().reshape(-1, 1)
    y_pred = np.array(model.predict(vl_x, verbose=1, batch_size=Config.BATCH_SIZE)).squeeze().reshape(-1, 1)
    score = mean_absolute_error(y_true, y_pred)
    print(f"OOF MAE Fold {fold}: {score}")

    # free the fold's arrays before predicting on the test set
    del tr_x, tr_y, vl_x, vl_y
    _ = gc.collect()

    models.append(model)
    # one flat prediction vector per fold
    test_preds.append(
        np.array(model.predict(test_x, batch_size=Config.BATCH_SIZE, verbose=1)).reshape(-1)
    )

    plot_hist(history)

    if fold == 0:
        dot_img_file = './fig/model.png'
        tf.keras.utils.plot_model(model, to_file=dot_img_file, show_shapes=True)

In [None]:
# Bounds and step used to snap predictions onto a regular grid.
# NOTE(review): presumably the min / max / spacing of the pressure values seen
# in the training data - confirm where these constants come from.
min_pressure = -1.8957442945646408
max_pressure = 64.82099173863948
diff_pressure = 0.07030215

# Ensemble: element-wise median across the per-fold test predictions.
fold_median = np.median(np.vstack(test_preds), axis=0)
sample_submission["pressure"] = fold_median

# Round to the nearest grid step, then clip into [min_pressure, max_pressure].
grid_steps = np.round((sample_submission.pressure - min_pressure) / diff_pressure)
sample_submission["pressure"] = grid_steps * diff_pressure + min_pressure
sample_submission.pressure = np.clip(sample_submission.pressure, min_pressure, max_pressure)

sample_submission.to_csv('./submission.csv', index=False)
sample_submission.head()

# Bidirectional LSTM model with initial state

In [None]:
# # return_sequences=True --> in each time step returns the hidden state
# # return_state=True --> will provide access to the hidden state output (state_h) and the cell state (state_c).  lstm1, state_h, state_c = LSTM(1, return_state=True)
# def create_desglosed_model(m, Tx, hot_v, n_a):
#     outputs = []
#     X = tf.keras.layers.Input(shape=(Tx, hot_v))
#     a0 = tf.keras.layers.Input(shape=(n_a,), name='a0')
#     c0 = tf.keras.layers.Input(shape=(n_a,), name='c0')
#     forward_h = a0
#     forward_c = c0
#     backward_h = a0
#     backward_c = c0

#     reshaper = tf.keras.layers.Reshape((1, hot_v))  
#     bilstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(n_a, return_state=True), input_shape=(m, Tx, n_a))
#     dense1 = tf.keras.layers.Dense(n_a, activation='gelu')
#     dense2 = tf.keras.layers.Dense(1)
    
#     for t in range(Tx):
#         x = X[:,t,:]
#         x = reshaper(x)
        
#         lstm_out, forward_h, forward_c, backward_h, backward_c = bilstm(x, initial_state=[forward_h, forward_c, backward_h, backward_c])
#         lstm_out = dense1(lstm_out)
#         out = dense2(lstm_out)
#         outputs.append(out) 
    
#     model = tf.keras.models.Model(inputs=[X,a0,c0],outputs=outputs)
#     model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), loss='mae') #0.002
#     return model

In [None]:
# Tx = 80
# n_a = Config.n_a # number of dimensions for the hidden state of each LSTM cell.
# test_preds = []
# models = []
# test_x = test_x.values.reshape(-1, 80, test_x.shape[-1])

# for fold, (train_idx, eval_idx) in enumerate(cv_index):
    
#     K.clear_session()
#     print(f"\nFOLD: {fold}")

#     # train
#     tr_x = train_x.iloc[train_idx]
#     tr_y = train["pressure"].iloc[train_idx]
#     # evaluating
#     vl_x = train_x.iloc[eval_idx]
#     vl_y = train["pressure"].iloc[eval_idx]

#     # reshape for lstm
#     tr_x = tr_x.values.reshape(-1, Tx, tr_x.shape[-1])
#     tr_y = tr_y.values.reshape(-1, Tx)
#     vl_x = vl_x.values.reshape(-1, Tx, vl_x.shape[-1])
#     vl_y = vl_y.values.reshape(-1, Tx)

#     # model creation
#     checkpoint_filepath = f"./checkpoint/checkpoint_sv.hdf5"
#     lr = ReduceLROnPlateau(monitor="val_loss", factor=0.55, patience=8, verbose=0)
#     es = EarlyStopping(monitor="val_loss", patience=50, verbose=0, mode="min", restore_best_weights=True)
#     sv = keras.callbacks.ModelCheckpoint(
#             checkpoint_filepath,
#             monitor='val_loss',
#             verbose=0, 
#             save_best_only=True,
#             save_weights_only=False,
#             mode='auto',
#             save_freq='epoch'
#         )

# #     model = create_model(None, tr_x, n_a)

#     model = create_desglosed_model(len(tr_x), len(tr_x[0]), len(tr_x[0][0]), n_a)

#     # model train
# #     history = model.fit(tr_x, tr_y,
# #                     validation_data=(vl_x, vl_y),
# #                     epochs=Config.EPOCHS,
# #                     batch_size=Config.BATCH_SIZE,
# #                     verbose= "auto",
# #                     callbacks = [lr, es, sv])
#     a0 = np.zeros((len(tr_x), n_a))
#     c0 = np.zeros((len(tr_x), n_a))
#     vl_a0 = np.zeros((len(vl_x), n_a))
#     vl_c0 = np.zeros((len(vl_x), n_a))

#     history = model.fit([tr_x,a0,c0], tr_y,
#                     validation_data=([vl_x,vl_a0,vl_c0], vl_y),
#                     epochs=Config.EPOCHS,
#                     batch_size=Config.BATCH_SIZE,
#                     verbose= "auto",
#                     callbacks = [lr, es, sv])

#     # model predict/test
#     y_true = vl_y.squeeze().reshape(-1, 1)
#     y_pred = np.array(model.predict([vl_x,vl_a0,vl_c0], verbose=1, batch_size=Config.BATCH_SIZE)).squeeze().reshape(-1, 1)
#     score = mean_absolute_error(y_true, y_pred)
#     print(f"OOF MAE Fold {1}: {score}")

#     del tr_x, tr_y, vl_x, vl_y
#     _ = gc.collect()

#     models.append(model)
#     a0 = np.zeros((len(test_x), n_a))
#     c0 = np.zeros((len(test_x), n_a))
#     test_preds.append(np.array(model.predict([test_x,a0,c0], batch_size=Config.BATCH_SIZE, verbose=1)).squeeze().reshape(-1, 1).squeeze())

#     plot_hist(history)

#     if fold == 0:
#         dot_img_file = './fig/model.png'
#         tf.keras.utils.plot_model(model, to_file=dot_img_file, show_shapes=True)

# Save an architecture diagram (with tensor shapes) of the model currently
# bound to `model`.
dot_img_file = './fig/model.png'
tf.keras.utils.plot_model(model, show_shapes=True, to_file=dot_img_file)