In [None]:
import pandas as pd
import numpy as np
from tensorflow import keras
import tensorflow as tf
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import RepeatedKFold
from datetime import datetime

## Reading in Data

In [None]:
# Row cap applied to every CSV read below; None reads each file in full.
# Set to ~10k rows for rapid model iteration.
nrows = None

# The same row cap is used for all three files so the test rows and the
# submission template stay the same length.
train_data, test_data, submission = (
    pd.read_csv(csv_path, nrows=nrows)
    for csv_path in (
        "../input/ventilator-pressure-prediction/train.csv",
        "../input/ventilator-pressure-prediction/test.csv",
        "../input/ventilator-pressure-prediction/sample_submission.csv",
    )
)

train_data.head()

## Feature Engineering

Adding some additional features, removing unnecessary features, and normalizing the data using RobustScaler.

In [None]:
def difference_operator(df, feature):
    """Add a finite-difference estimate of d(feature)/d(time_step) to ``df``.

    The new column is named ``f"{feature}_diff"`` and is computed as
    (next value - previous value) / (next time_step - previous time_step).
    At a boundary row the missing neighbour is padded with the nearest
    available one, so the estimate becomes a one-sided difference there.
    Call repeatedly on the produced column to obtain higher derivatives.

    When ``df`` has a ``breath_id`` column, the neighbours are taken inside
    each breath so that the first/last rows of a breath never borrow values
    from the adjacent breath. Without that column the whole frame is treated
    as one sequence.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``feature`` and ``time_step``. Modified in place.
    feature : str
        Name of the column to differentiate.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the extra column added.
    """
    col_name = f"{feature}_diff"

    def _neighbours(column):
        """Return the (next, previous) series for ``column`` with edges padded."""
        values = df[column]
        if "breath_id" in df.columns:
            keys = df["breath_id"]
            # Shift and pad per breath: the padding must not cross a breath boundary.
            following = values.groupby(keys).shift(-1).groupby(keys).ffill()
            preceding = values.groupby(keys).shift(1).groupby(keys).bfill()
        else:
            # .ffill()/.bfill() replace the deprecated fillna(method=...).
            following = values.shift(-1).ffill()
            preceding = values.shift(1).bfill()
        return following, preceding

    next_value, prev_value = _neighbours(feature)
    next_time, prev_time = _neighbours("time_step")

    df[col_name] = (next_value - prev_value) / (next_time - prev_time)

    return df


def add_features(df):
    """Derive per-breath features from ``u_in``/``time_step`` and one-hot encode R and C.

    Columns are added to ``df`` in place up to the one-hot step; the one-hot
    merges then produce a new frame, which is what gets returned. The input
    must contain ``breath_id``, ``u_in``, ``u_out``, ``time_step``, ``R`` and
    ``C``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows; gains the non-one-hot feature columns as a side effect.

    Returns
    -------
    pandas.DataFrame
        Frame with all engineered columns, with ``R``, ``C`` and the helper
        ``R__C`` replaced by their dummy columns.
    """
    u_in_by_breath = df.groupby("breath_id")["u_in"]
    time_by_breath = df.groupby("breath_id")["time_step"]

    # Desc Stats
    for stat in ("mean", "median", "min", "max"):
        df[f"u_in_{stat}"] = u_in_by_breath.transform(stat)
    df["u_in_delta"] = df["u_in_max"] - df["u_in_min"]
    df["first_value_u_in"] = u_in_by_breath.transform("first")
    df["last_value_u_in"] = u_in_by_breath.transform("last")

    # Leads and Lags
    # The shift is per breath; the gap it leaves is then padded over the whole
    # column (.bfill()/.ffill() replace the deprecated fillna(method=...)).
    # A breath shorter than the shift distance would be padded from a
    # neighbouring breath.
    for distance in (1, 2, 3):
        df[f"u_in_lag{distance}"] = u_in_by_breath.shift(distance).bfill()
        df[f"u_in_lag_back{distance}"] = u_in_by_breath.shift(-distance).ffill()

    df["time_lag"] = time_by_breath.shift(1).bfill()
    df["time_lag_back"] = time_by_breath.shift(-1).ffill()

    # Derivatives at the point: first through fourth, each built on the previous one.
    derivative_source = "u_in"
    for _ in range(4):
        difference_operator(df, derivative_source)
        derivative_source = f"{derivative_source}_diff"

    # Area under u_in curve
    # NOTE(review): time_lag_back is the next row's time_step, not a time
    # delta, so this is u_in weighted by time rather than a true integral —
    # confirm this is intended before changing it.
    df["area"] = df["time_lag_back"] * df["u_in"]
    df["area_uout_open"] = df["time_lag_back"] * df["u_in"] * df["u_out"]

    df["tot_area"] = df.groupby("breath_id")["area"].transform("sum")
    df["tot_area_uout_open"] = df.groupby("breath_id")["area_uout_open"].transform("sum")
    df["tot_area_cum_sum"] = df.groupby("breath_id")["area"].cumsum()

    # COMBINE R AND C
    df["R__C"] = df["R"].astype(str) + "__" + df["C"].astype(str)

    # One Hot Encoding of R, C and R__C
    # The dummy columns depend on the values present in this particular frame.
    for categorical in ("R", "C", "R__C"):
        dummies = pd.get_dummies(df[categorical], prefix=categorical)
        df = df.merge(dummies, left_index=True, right_index=True).drop(columns=[categorical])

    # https://www.kaggle.com/c/ventilator-pressure-prediction/discussion/273974
    df["u_in_cumsum"] = df.groupby("breath_id")["u_in"].cumsum()
    return df


def remove_features(df):
    """Drop the target, identifier and ``u_out`` columns from ``df`` in place.

    Columns that are not present are ignored, so the same call works for
    frames with and without a ``pressure`` column.

    Returns the same ``df`` object that was passed in.
    """
    unwanted = ("pressure", "id", "breath_id", "u_out")
    present = [name for name in unwanted if name in df.columns]
    df.drop(columns=present, inplace=True)
    return df


In [None]:
# Keep the target as a 2-D array (one column) before the feature step drops it.
targets = train_data[["pressure"]].to_numpy()

# Engineer features, then drop the target/identifier columns.
train_df = remove_features(add_features(train_data))
test_df = remove_features(add_features(test_data))

# The one-hot columns are generated separately for each frame, so a truncated
# read (nrows) can leave the two frames with different columns. Align the test
# frame to the training layout: missing dummies become 0, extras are dropped.
if list(test_df.columns) != list(train_df.columns):
    test_df = test_df.reindex(columns=train_df.columns, fill_value=0)

train_df.head()

In [None]:
# Learn the scaling statistics from the training features only, then apply
# that same fitted scaler to both sets.
# train_df/test_df are rebound to the scaler's output (presumably NumPy
# arrays, given the slicing below and the positional indexing used later).
RS = RobustScaler()
RS.fit(train_df)
train_df, test_df = RS.transform(train_df), RS.transform(test_df)

train_df[:5]

## Training Setup - KFold Validation


In [None]:
# training params
n_splits = 3
n_repeats = 1
epochs = 100
seed = 42  # fixed so the fold assignment and weight initialisation repeat across runs

np.random.seed(seed)
tf.random.set_seed(seed)

# NOTE(review): the splitter works on individual rows, and breath_id has been
# dropped by this point, so rows of one breath can fall on both sides of a
# split — confirm whether a grouped split is wanted.
kf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=seed)

# Collects one array of test-set predictions per fold.
test_preds = []

## Building and Training the Model

In [None]:
# Train one dense network per fold and keep its test-set predictions.
for fold, (train_idx, valid_idx) in enumerate(kf.split(train_df, targets)):
    # Drop the Keras global state left behind by the previous fold's model so
    # memory does not grow with every model built in this loop.
    keras.backend.clear_session()

    x_train, x_valid = train_df[train_idx], train_df[valid_idx]
    y_train, y_valid = targets[train_idx], targets[valid_idx]

    start_units = 256 # 128 for rapid model development, 512 seems to be the point of diminishing returns
    model = keras.models.Sequential(
        [
            # Only the first layer declares the input width.
            keras.layers.Dense(units=start_units, input_dim=x_train.shape[1], activation="relu"),
            keras.layers.BatchNormalization(),
            keras.layers.Dropout(0.2),
            keras.layers.Dense(units=start_units, activation="relu"),
            keras.layers.BatchNormalization(),
            keras.layers.Dropout(0.2),
            # Integer division: `units` must be an int, and `/` would pass a float.
            keras.layers.Dense(units=start_units // 2, activation="relu"),
            keras.layers.BatchNormalization(),
            keras.layers.Dropout(0.1),
            keras.layers.Dense(units=start_units // 2, activation="relu"),
            keras.layers.BatchNormalization(),
            keras.layers.Dropout(0.1),
            keras.layers.Dense(units=start_units // 4, activation="relu"),
            keras.layers.Dense(units=1, activation="linear"),
        ],
        name=f"fold_{fold}_dnn",
    )

    model.summary()

    optimizer = keras.optimizers.Adam()

    model.compile(optimizer=optimizer, loss="mean_absolute_error")

    # save checkpoints from internal epochs
    # The fold number is part of the name so files from different folds cannot
    # be confused or overwrite each other; {epoch}/{val_loss} are left as
    # placeholders for the callback to fill in.
    # NOTE(review): newer Keras releases may reject the ".hdf5" suffix —
    # confirm before upgrading.
    checkpoint_name = f"checkpoints/fold_{fold}-checkpoints-{{epoch:03d}}--{{val_loss:.5f}}.hdf5"
    checkpoint = keras.callbacks.ModelCheckpoint(
        checkpoint_name,
        monitor="val_loss",
        verbose=1,
        save_best_only=True,
        mode="auto",
    )
    callbacks_list = [checkpoint]

    model.fit(
        x_train,
        y_train,
        validation_data=(x_valid, y_valid),
        epochs=epochs,
        batch_size=1024,
        callbacks=callbacks_list,
        verbose=1,
    )

    # save final model
    # A compact timestamp keeps spaces and colons out of the path.
    saved_at = datetime.now().strftime("%Y%m%d_%H%M%S")
    model.save(f"models/dnn_vp_fold_{fold}_{saved_at}")

    # save preds from final model for the given fold
    # These come from the weights after the last epoch, not from the best
    # checkpoint. reshape(-1) always yields a 1-D array, even for a single row.
    test_preds.append(model.predict(test_df).reshape(-1))

## Building Predictions

Average the per-fold test predictions and write them to the submission CSV.

In [None]:
# Average over the predictions actually collected, not over
# n_splits * n_repeats, so the result stays correct if the training cell was
# re-run or interrupted and the list length differs from that product.
if not test_preds:
    raise RuntimeError("test_preds is empty - run the training cell before building the submission")

submission["pressure"] = np.mean(test_preds, axis=0)
submission.to_csv("submission.csv", index=False)

submission.head()