In [None]:
from time import time
import psutil

import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import RobustScaler

import tensorflow.keras as ks
from tensorflow.keras import optimizers
import tensorflow as tf

!pip install plot-keras-history
from plot_keras_history import plot_history

from scipy.special import erfinv

import gc
gc.collect()

import warnings
warnings.filterwarnings("ignore")

In [None]:
class NBConfig:
    """Notebook-wide settings, grouped by the stage that reads them."""

    # Seed and fold count for the K-fold split (the seed is also passed to
    # the model builder); "rounding" toggles the optional rounding step that
    # follows scaling.
    general = dict(seed=2022, folds=7, rounding=False)

    # Adam settings: learning rate ("leRa"), decay ("dec") and clip value.
    opt = dict(leRa=0.0025, dec=0.000, clip=100)

    # Number of epochs ("eps") and batch size ("bs") used for fit/predict.
    nn = dict(eps=400, bs=512 + 128 + 64)

    # Loss and metric names handed to the Keras compile step.
    compiler = dict(loss="mae", metric="mae")


# Sanity check: learning rate minus (decay * epochs) must not go negative.
print(
    "Learningrate ok: "
    + str(NBConfig.opt["leRa"] - NBConfig.opt["dec"] * NBConfig.nn["eps"] >= 0)
)

In [None]:
# Locations of the input CSV files.
trainpath = "../input/ventilator-pressure-prediction/train.csv"
testpath = "../input/ventilator-pressure-prediction/test.csv"
samsubpath = "../input/ventilator-pressure-prediction/sample_submission.csv"

# train/test use their "id" column as the index; the sample submission keeps
# "id" as a regular column because it is written back out with it later.
train = pd.read_csv(trainpath, index_col="id")
test = pd.read_csv(testpath, index_col="id")
samSub = pd.read_csv(samsubpath)

In [None]:
# Step between consecutive time stamps inside each breath. The first row of a
# breath has no predecessor, so its (NaN) difference is replaced by 0.
train["timeDiff"] = train.groupby("breath_id")["time_step"].diff().fillna(0)
test["timeDiff"] = test.groupby("breath_id")["time_step"].diff().fillna(0)

In [None]:
%%time
# Per-breath maximum and mean of u_in, broadcast back onto every row of the
# breath (transform keeps the original row count and order).
train["maxu_in"] = train.groupby("breath_id")["u_in"].transform("max")
test["maxu_in"] = test.groupby("breath_id")["u_in"].transform("max")
train["meanu_in"] = train.groupby("breath_id")["u_in"].transform("mean")
test["meanu_in"] = test.groupby("breath_id")["u_in"].transform("mean")

gc.collect()

In [None]:
%%time
def _one_hot_lung_attributes(frame):
    """Cast R and C to strings, add their "R__C" combination and one-hot encode.

    The string columns are written onto ``frame`` itself; the returned object
    is the ``pd.get_dummies`` result built from it.
    """
    frame["R"] = frame["R"].astype("str")
    frame["C"] = frame["C"].astype("str")
    # Both columns are already strings here, so plain concatenation suffices.
    frame["R__C"] = frame["R"] + '__' + frame["C"]
    return pd.get_dummies(frame)


train = _one_hot_lung_attributes(train)
gc.collect()

test = _one_hot_lung_attributes(test)
gc.collect()

In [None]:
def ByBreath(method: str, DF, lags=None, center=False, fillNas=0):
    """Build one family of per-breath features from ``DF``.

    Rows are grouped by ``breath_id`` and the requested transformation is
    applied to ``u_in`` (and, for some methods, ``u_out`` or ``time_step``).

    Parameters
    ----------
    method : str
        Feature family to build. Produced column names (``{c}`` is ``"c"``
        when ``center`` is truthy, otherwise empty; ``{l}`` is the lag):

        * ``"mean"``      -> ``{c}mu_in_l{l}``, ``{c}mu_out_l{l}`` (rolling mean)
        * ``"max"``       -> ``{c}mxu_in_l{l}`` (rolling max)
        * ``"min"``       -> ``{c}miu_in_l{l}`` (rolling min)
        * ``"std"``       -> ``{c}su_in_l{l}`` (rolling std)
        * ``"shift"``     -> ``sftu_in_l{l}``, ``sftu_out_l{l}``
        * ``"diff"``      -> ``du_in_l{l}``
        * ``"log"``       -> ``lgu_in`` (``log1p`` of ``u_in``)
        * ``"cumsum"``    -> ``csu_in``, ``csu_out``
        * ``"area"``      -> ``area`` (cumulative sum of ``time_step * u_in``)
        * ``"centering"`` -> ``cenu_in`` (``u_in`` minus its per-breath mean)
    DF : pandas.DataFrame
        Must contain ``breath_id`` and ``u_in``, plus ``u_out`` / ``time_step``
        for the methods that read them. It is not modified.
    lags : iterable of int, optional
        Window sizes (rolling methods) or periods (shift/diff). Required for
        ``mean``, ``max``, ``min``, ``std``, ``shift`` and ``diff``.
    center : bool, default False
        Forwarded to ``rolling(center=...)``.
    fillNas : scalar, default 0
        Replacement for the NaNs left by rolling windows, shifts and diffs.

    Returns
    -------
    pandas.DataFrame
        Only the newly created feature columns. The index is whatever the
        underlying pandas operation yields and is not re-aligned to ``DF``.
        An unrecognised ``method`` yields an empty frame.

    Raises
    ------
    ValueError
        If a lag-based method is requested without ``lags``. (The previous
        ``sys.exit`` call relied on a ``sys`` import that this notebook does
        not have, so it surfaced as a NameError instead.)
    """
    # Validate before doing any work so a forgotten argument fails fast.
    if method in ("mean", "max", "min", "std", "shift", "diff") and lags is None:
        raise ValueError("specify lags")

    start = time()

    output = pd.DataFrame()
    c = "c" if center else ""

    if method == "mean":
        for l in lags:
            agg = \
            DF[["breath_id", "u_in", "u_out"]].groupby("breath_id").rolling(window=l, center=center).mean().fillna(fillNas)
            output[["{0}mu_in_l{1}".format(c, l), "{0}mu_out_l{1}".format(c, l)]] = agg[["u_in", "u_out"]]
            gc.collect()

    elif method == "max":
        for l in lags:
            agg = \
            DF[["breath_id", "u_in"]].groupby("breath_id").rolling(window=l, center=center).max().fillna(fillNas)
            output[["{0}mxu_in_l{1}".format(c, l)]] = agg[["u_in"]]
            gc.collect()

    elif method == "min":
        for l in lags:
            agg = \
            DF[["breath_id", "u_in"]].groupby("breath_id").rolling(window=l, center=center).min().fillna(fillNas)
            output[["{0}miu_in_l{1}".format(c, l)]] = agg[["u_in"]]
            gc.collect()

    elif method == "std":
        for l in lags:
            agg = \
            DF[["breath_id", "u_in"]].groupby("breath_id").rolling(window=l, center=center).std().fillna(fillNas)
            output["{0}su_in_l{1}".format(c, l)] = agg["u_in"]
            gc.collect()

    elif method == "shift":
        for l in lags:
            agg = \
            DF[["breath_id", "u_in", "u_out"]].groupby("breath_id").shift(l).fillna(fillNas)
            output[["sftu_in_l{0}".format(l), "sftu_out_l{0}".format(l)]] = agg[["u_in", "u_out"]]
            gc.collect()

    elif method == "diff":
        for l in lags:
            agg = \
            DF[["breath_id", "u_in"]].groupby("breath_id").diff(l).fillna(fillNas)
            output["du_in_l{0}".format(l)] = agg["u_in"]
            gc.collect()

    elif method == "log":
        output["lgu_in"] = np.log1p(DF["u_in"].values)
        gc.collect()

    elif method == "cumsum":
        agg = \
        DF[["breath_id", "u_in", "u_out"]].groupby("breath_id").cumsum()
        output[["csu_in", "csu_out"]] = agg[["u_in", "u_out"]]
        gc.collect()

    elif method == "area":
        # Computed directly from the two columns so that no temporary column
        # has to be written onto a slice of DF.
        product = DF["time_step"] * DF["u_in"]
        output["area"] = product.groupby(DF["breath_id"]).cumsum()
        gc.collect()

    elif method == "centering":
        # transform('mean') keeps one value per row, unlike an aggregating mean().
        agg = \
        DF[["u_in", "breath_id"]].groupby("breath_id").transform('mean')
        output["cenu_in"] = DF["u_in"] - agg["u_in"]
        gc.collect()

    end = time()
    print(c + method + " created in " + str(round(end - start)) + " seconds." + "RAM usage: " + str(psutil.virtual_memory()[2]) + "%")
    return output

In [None]:
def assignment(DF, mDF):
    """Return a copy of ``DF`` with every column of ``mDF`` attached by position.

    Parameters
    ----------
    DF : pandas.DataFrame
        Base frame; it is copied, never modified.
    mDF : pandas.DataFrame
        Frame whose columns are appended (or overwrite same-named columns).
        It must have as many rows as ``DF``.

    Returns
    -------
    pandas.DataFrame
        The copy of ``DF`` with the extra columns. Its index is ``DF``'s.
    """
    DF = DF.copy()
    for label in mDF.columns:
        # .values drops mDF's index, so rows are matched purely by position.
        # The label is used as-is: formatting it into a string first broke
        # the lookup for non-string column labels.
        DF[label] = mDF[label].values
    gc.collect()
    return DF

In [None]:
# Feature recipe applied to train and then to test, in this exact order.
# Each entry is (ByBreath method, keyword arguments for that call).
# Experiments that are currently switched off:
#   ("mean", {"lags": [6, 9]}), ("max", {"lags": [9]}),
#   ("min", {"lags": [9]}), ("std", {"lags": [6]})
_feature_recipe = (
    ("area", {}),
    ("mean", {"center": True, "lags": [9]}),
    ("diff", {"lags": [1, 2, 3, 4]}),
    ("log", {}),
    ("shift", {"lags": [-3, -2, -1, 1, 2, 3, 4]}),
    ("cumsum", {}),
    ("centering", {}),
)

for _method, _kwargs in _feature_recipe:
    train = assignment(train, ByBreath(_method, train, **_kwargs))

for _method, _kwargs in _feature_recipe:
    test = assignment(test, ByBreath(_method, test, **_kwargs))

In [None]:
# Quick check that train and test received the same engineered columns.
print(f"train shape is: {train.shape}")
print(f"test shape is: {test.shape}")
train.head()

# ML Approach

Prepare the target, the sorted unique target values, and the list of feature columns:

In [None]:
# Drop the "id" index in favour of a plain positional one.
train.reset_index(drop=True, inplace=True)

target = train["pressure"]
# Sorted distinct training pressures; used later to snap predictions onto
# values that actually occur in the training data.
uniTarg = np.sort(target.unique())

# Every column except the identifiers, the target and the listed u_out
# variants becomes a model input.
# NOTE(review): "sftu_out_l-4" is never created by the shift lags used above;
# the entry is harmless but looks like a leftover.
names = [
    col
    for col in train.columns
    if col not in {
        "breath_id", "u_out", "pressure", "R", "C",
        "sftu_out_l-1", "sftu_out_l-2", "sftu_out_l-3", "sftu_out_l-4",
    }
]
train.head()

**Normalize**

In [None]:
# Fit a single scaler on the train and test feature columns together, then
# replace both frames by their scaled versions (only the `names` columns
# survive; the target was extracted beforehand).
RS = RobustScaler().fit(pd.concat([train[names], test[names]]))
gc.collect()

train = RS.transform(train[names])
test = RS.transform(test[names])

del RS
gc.collect()
print('RAM memory used after scaling:', psutil.virtual_memory()[2], "%")

**Rounder**

In [None]:
def rounder(df, rl):
    """Round the selected features of ``df`` to 5 decimals, in place.

    Parameters
    ----------
    df : numpy.ndarray or pandas.DataFrame
        Data whose features should be rounded. For an array with two or more
        dimensions the features are taken to lie on the last axis.
    rl : iterable
        Feature selectors: integer positions along the last axis for arrays,
        column labels for DataFrames.

    Returns
    -------
    The same object that was passed in, after modification.
    """
    if isinstance(df, np.ndarray) and df.ndim >= 2:
        # Plain `df[i]` on an array selects ROW i, which would round the wrong
        # data when `rl` holds feature positions; index the last axis instead.
        cols = list(rl)
        if cols:
            df[..., cols] = np.round(df[..., cols], decimals=5)
        return df

    # DataFrames (and 1-D arrays) keep the label / element based behaviour.
    for i in rl:
        df[i] = np.round(df[i], decimals=5)
    return df

In [None]:
if NBConfig.general["rounding"] == True:
    # Positions (within `names`) of every feature that is not one of the
    # listed indicator columns; only those are rounded.
    roundings = [
        position
        for position, feature in enumerate(names)
        if feature not in [
            'u_out',
            'R_20', 'R_5', 'R_50',
            'C_10', 'C_20', 'C_50',
            'R__C_20__10', 'R__C_20__20', 'R__C_20__50',
            'R__C_50__10', 'R__C_50__20', 'R__C_50__50',
            'R__C_5__10', 'R__C_5__20', 'R__C_5__50',
        ]
    ]
    train = rounder(train, roundings)
    test = rounder(test, roundings)
    gc.collect()
    print('RAM memory used after rounding:', psutil.virtual_memory()[2], "%")

**Reshape data:**

In [None]:
# Fold the flat row axis into (samples, timesteps, features) with 80 timesteps
# per sample — this assumes every breath contributes exactly 80 consecutive rows.
test = test.reshape((test.shape[0] // 80, 80, -1))
gc.collect()

target = target.to_numpy()
target = target.reshape((target.shape[0] // 80, 80, 1))
gc.collect()

train = train.reshape((train.shape[0] // 80, 80, -1))
gc.collect()

print('RAM memory used after reshaping:', psutil.virtual_memory()[2], "%")
train.shape

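Detect the available hardware and select the matching distribution strategy: a TPU strategy if a TPU is found, otherwise a mirrored (GPU / multi-GPU) strategy with mixed precision enabled.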

In [None]:
print(tf.version.VERSION)
# Try to set up a TPU first. A ValueError raised by any step inside the try
# block is treated as "no TPU available" and routes to the fallback branch.
# NOTE(review): the `experimental` TPUStrategy / mixed-precision entry points
# used below are version-sensitive — confirm they exist in the TensorFlow
# version printed above.
try: # detect TPU
    tpu = None
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError: # no TPU: fall back to GPU(s) and enable mixed precision
    strategy = tf.distribute.MirroredStrategy() # works on GPU and multi-GPU
    policy = tf.keras.mixed_precision.experimental.Policy('mixed_float16')
    tf.config.optimizer.set_jit(True) # XLA compilation
    tf.keras.mixed_precision.experimental.set_policy(policy)
    print('Mixed precision enabled')
gc.collect()
# `strategy` is consumed later by the model builder via strategy.scope().
print("REPLICAS: ", strategy.num_replicas_in_sync)

# LSTM

In [None]:
def LSTM_Model(strategy, dfForShape, seeds, optimizer, loss, metric, bs, cellState = False):
    """Build and compile the stacked bidirectional-LSTM regressor.

    Architecture: three bidirectional LSTM layers (500, 250 and 125 units,
    tanh, returning full sequences), a 250-unit SELU dense layer whose output
    is averaged with the last recurrent output, and a single linear output
    unit applied at every timestep.

    Parameters
    ----------
    strategy : distribution strategy whose ``scope()`` wraps model creation.
    dfForShape : array-like with a 3-element ``shape``; only ``shape[1]``
        (timesteps) and ``shape[2]`` (features) are read.
    seeds : seed applied to both NumPy and TensorFlow before building.
    optimizer, loss, metric : forwarded unchanged to ``compile``.
    bs : batch size; only used to fix the input batch dimension when
        ``cellState`` is true.
    cellState : when true the LSTM layers are created stateful and the input
        gets a fixed batch shape.

    Returns
    -------
    The compiled Keras model (its summary is printed as a side effect).
    """
    with strategy.scope():
        np.random.seed(seeds)
        tf.random.set_seed(seeds)

        timesteps, n_features = dfForShape.shape[1], dfForShape.shape[2]
        if cellState == True:
            # Stateful layers need the batch dimension fixed up front.
            net_input = ks.layers.Input(batch_input_shape=(bs, timesteps, n_features), name="input")
        else:
            net_input = ks.layers.Input(shape=(timesteps, n_features), name="input")

        # Recurrent stack, created in order L1 -> L2 -> L3. (A 'LecunUniform'
        # kernel initializer was tried for these layers and is left disabled.)
        recurrent_out = net_input
        for units, layer_name in ((500, "L1"), (250, "L2"), (125, "L3")):
            lstm = ks.layers.LSTM(
                units,
                activation="tanh",
                return_sequences=True,
                stateful=cellState,
                name=layer_name,
            )
            recurrent_out = ks.layers.Bidirectional(lstm)(recurrent_out)

        dense_out = ks.layers.Dense(
            125*2,
            activation="selu",
            kernel_initializer='LecunUniform',
            name="dense1",
        )(recurrent_out)

        # Element-wise mean of the dense projection and the last recurrent output.
        blended = ks.layers.Average()([dense_out, recurrent_out])

        prediction = ks.layers.Dense(1, activation="linear", name="L_out")(blended)

        m = ks.Model(inputs=net_input, outputs=prediction)
        m.compile(optimizer=optimizer, loss=loss, metrics=metric)

        m.summary()
        gc.collect()
        print('RAM memory used after the configuration of LSTM:', psutil.virtual_memory()[2], "%")

    return m

In [None]:
def lr_schaker(epoch, lr):
    """Learning-rate schedule that nudges the rate upward at fixed epochs.

    At each listed epoch the current rate is multiplied by a small factor
    (between 1.02 and 1.06); at every other epoch it is returned unchanged.

    Parameters
    ----------
    epoch : index of the epoch about to start.
    lr : current learning rate.

    Returns
    -------
    The learning rate to use for this epoch.
    """
    bump_by_epoch = {
        25: 1.05,
        50: 1.04,
        75: 1.03,
        100: 1.02,
        130: 1.02,
        150: 1.02,
        200: 1.03,
        250: 1.04,
        300: 1.05,
        350: 1.06,
    }
    factor = bump_by_epoch.get(epoch)
    if factor is None:
        return lr
    return lr * factor

In [None]:
# Shuffled, seeded K-fold splitter. It is applied later to the reshaped
# (samples, timesteps, features) array, so folds are drawn over whole samples.
kf = KFold(n_splits=NBConfig.general["folds"], shuffle=True, random_state=NBConfig.general["seed"])

In [None]:
# Display-only value; it is not assigned to anything.
# NOTE(review): `train` was already reshaped to (samples, 80, features) in an
# earlier cell, so dividing its first dimension by 80 again looks like a
# leftover from exploration — confirm whether this cell is still wanted.
int(train.shape[0]*4/5/80)

In [None]:
# Cross-validated training: one model per fold; each fold's best checkpoint
# predicts the test set and the per-fold predictions are collected in `preds`.
trainPahse = True  # set to False to skip model building/training in the loop
preds = []  # one flattened test prediction array per fold

# kf.split works on the first axis of `train`, which at this point is the
# reshaped (samples, timesteps, features) array.
for k, (train_index, test_index) in enumerate(kf.split(train)):

    print("Fold: " + str(k+1))
    # Upper case = training part of the fold, lower case = validation part.
    X, x = train[train_index], train[test_index]
    Y, y = target[train_index], target[test_index]
    print("Train data has " + str(X.shape[0]) + " observations" + \
          "\n" + "Test data has " + str(x.shape[0]) + " observations")
    print('RAM memory used after setting train and test:', psutil.virtual_memory()[2], "%")

    # Build, train and predict for this fold (skipped when trainPahse is off).
    if trainPahse == True:
        # Reset Keras' global state before building this fold's model.
        ks.backend.clear_session()

        checkpoint = f"folds_{k}.hdf5"

        # Keep only the best (lowest val_loss) full model of this fold on disk.
        sv = ks.callbacks.ModelCheckpoint(
            checkpoint,
            monitor='val_loss',
            verbose=0,
            save_best_only=True,
            save_weights_only=False,
            mode='min',
            save_freq='epoch',
            options=None
        )

        # Multiply the learning rate by 0.6 after 15 epochs without val_loss improvement.
        lrReducer = ks.callbacks.ReduceLROnPlateau(
            monitor="val_loss",
            factor=0.6,
            patience=15,
            verbose=1,
            mode="min"
        )

        # Stop after 100 epochs without val_loss improvement.
        stop = ks.callbacks.EarlyStopping(
            monitor='val_loss',
            mode='min',
            patience=100,
            verbose=1,
            restore_best_weights=True
        )

        # NOTE(review): the optimizer is created here, outside strategy.scope(),
        # while the model is built and compiled inside it — confirm this is fine
        # for the distribution strategy in use.
        # NOTE(review): `lr` / `decay` are older keyword names — confirm they
        # are accepted by the installed Keras version.
        optimizer = ks.optimizers.Adam(
            lr=NBConfig.opt["leRa"],
            decay=NBConfig.opt["dec"],
            clipvalue=NBConfig.opt["clip"]
        )

        # `test` is passed only so the builder can read the (timesteps, features) shape.
        model = LSTM_Model(
            strategy,
            test,
            NBConfig.general["seed"],
            optimizer,
            NBConfig.compiler["loss"],
            NBConfig.compiler["metric"],
            NBConfig.nn["bs"]
        )

        #TRAINING
        # Two callbacks adjust the learning rate: ReduceLROnPlateau (down) and
        # the lr_schaker schedule (small bumps up at fixed epochs).
        history = model.fit(
            x=X,
            y=Y,
            validation_data=(x, y),
            epochs = NBConfig.nn["eps"],
            batch_size = NBConfig.nn["bs"],
            shuffle = False,
            callbacks=[sv, stop, lrReducer, ks.callbacks.LearningRateScheduler(lr_schaker, verbose=0)],
            verbose=1
        )

        plot_history(history)
        plt.show()

        # Reload the best checkpoint written by `sv` before predicting.
        model = ks.models.load_model('./'+ checkpoint)
        print(checkpoint + " successfully loaded.")

        # Flatten the per-timestep predictions to one value per submission row.
        prediction = np.reshape(model.predict(x=test, batch_size = NBConfig.nn["bs"]), samSub.shape[0])

        preds.append(prediction)

        gc.collect()

## Submission

In [None]:
# Blend the folds: element-wise median across the per-fold prediction arrays.
samSub["pressure"] = np.median(np.array(preds), axis=0)

In [None]:
# Summary statistics of the submission frame, as a quick sanity check of the
# blended predictions.
samSub.describe()

In [None]:
# Raw (not post-processed) submission: only the id and pressure columns, no index.
samSub.to_csv("sampleSubmission.csv", columns=["id", "pressure"], index=False)

In [None]:
%%time
def _nearest_known_pressure(value):
    """Return the entry of ``uniTarg`` with the smallest squared distance to ``value``."""
    return uniTarg[np.argmin((uniTarg - value) ** 2)]


# Post-processing: replace every prediction by the closest pressure value
# that occurs in the training target.
samSub["pressure"] = samSub["pressure"].map(_nearest_known_pressure)

In [None]:
# Post-processed submission (predictions snapped to known pressure values).
samSub.to_csv("sampleSubmissionPP.csv", columns=["id", "pressure"], index=False)
samSub.head()