In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 0.Import libraries and CSV files

In [None]:
# Import helpful libraries
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, GroupKFold, KFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder
from sklearn.preprocessing import RobustScaler, normalize
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.optimizers.schedules import ExponentialDecay

# Load the competition's training and test tables (in that order) into DataFrames.
X_train_full, X_test_full = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/ventilator-pressure-prediction/train.csv",
        "/kaggle/input/ventilator-pressure-prediction/test.csv",
    )
)

# 1.EDA
- Here, we are simply checking for missing values.
- For a fuller EDA, please see my other notebook:
https://www.kaggle.com/shashimo/ventilator-very-simple-eda-for-starter

In [None]:
# Names of the training columns that contain at least one null value.
has_null = X_train_full.isnull().any()
cols_with_missing = has_null[has_null].index.tolist()
print('Missing:',cols_with_missing)

# 2. Feature engineering

In [None]:
def feature_eng(df):
    """Add per-breath ``u_in`` features to ``df`` and drop the identifier columns.

    The frame is modified in place (feature columns are appended, ``id`` and
    ``breath_id`` are removed) and the very same object is returned.

    Added columns, in this order:
        u_in_first, u_in_mean, u_in_median, u_in_last:
            aggregate of ``u_in`` over each ``breath_id``, broadcast back onto
            every row of that breath.
        RCRatio: ``R / C``.
        u_in_shifted: ``u_in`` lagged by two rows within each breath, with the
            resulting gaps back-filled.
        u_in_cumsum: running sum of ``u_in`` within each breath.

    Args:
        df: DataFrame containing at least the columns ``id``, ``breath_id``,
            ``R``, ``C`` and ``u_in``.

    Returns:
        ``df`` itself, with the new columns and without ``id`` / ``breath_id``.
    """
    # Build the grouping once and reuse it for every per-breath feature.
    u_in_by_breath = df.groupby('breath_id')['u_in']

    for stat in ('first', 'mean', 'median', 'last'):
        df[f'u_in_{stat}'] = u_in_by_breath.transform(stat)

    df["RCRatio"] = df.R/df.C

    # .bfill() is the supported spelling of the deprecated fillna(method="backfill").
    # The back-fill runs over the whole column (not per breath): the two leading
    # gaps of a breath are filled from that breath's third row, so a breath with
    # fewer than three rows would pick up a value from the following breath.
    df['u_in_shifted'] = u_in_by_breath.shift(2).bfill()
    df['u_in_cumsum'] = u_in_by_breath.cumsum()

    # Identifiers are not model inputs; drop them last because the grouping needs breath_id.
    df.drop(['id','breath_id'], axis=1, inplace=True)
    return df

# Run the feature engineering on the training frame, then on the test frame.
# feature_eng returns the frame it was given, so each *_fe name refers to the
# same (now modified) object as the corresponding input.
X_train_full_fe, X_test_full_fe = (
    feature_eng(frame) for frame in (X_train_full, X_test_full)
)

# 3.Data preparation for LSTM


In [None]:
# Pull the target out of the feature frame: pop() returns the 'pressure' column
# and removes it from the frame in place. The flat vector is then folded into
# rows of 80 values (assumes the rows come in consecutive groups of 80 - TODO confirm).
pressure = X_train_full_fe.pop('pressure')
y_train_full = pressure.to_numpy().reshape(-1, 80)

In [None]:
# Fit the scaler on the training features, then apply that same fitted scaling
# to both the training and the test features.
RS = RobustScaler().fit(X_train_full_fe)
X_train_full_fe = RS.transform(X_train_full_fe)
X_test_full_fe = RS.transform(X_test_full_fe)

In [None]:
# Fold the flat (rows, features) arrays into (sequences, 80, features) blocks.
n_features = X_train_full_fe.shape[-1]
sequence_shape = (-1, 80, n_features)
X_train_full_fe = X_train_full_fe.reshape(sequence_shape)
X_test_full_fe = X_test_full_fe.reshape(sequence_shape)

# 4.LSTM

In [None]:
EPOCH = 100
BATCH_SIZE = 1024
N_SPLITS=3
# Learning-rate schedule: start at LR_INITIAL and decay continuously so that the
# rate has been multiplied by LR_DECAY_RATE after LR_DECAY_EPOCHS epochs' worth
# of optimizer steps.
LR_INITIAL = 1e-3
LR_DECAY_RATE = 1e-5
LR_DECAY_EPOCHS = 400

kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=0)
test_preds = []  # one flattened test-set prediction vector per fold
for fold, (train_idx, valid_idx) in enumerate(kf.split(X_train_full_fe, y_train_full)):
    print('-'*15, '>', f'Fold {fold+1}', '<', '-'*15)

    # A fresh model is built for every fold; reset Keras' global state first so
    # that state left over from earlier folds does not pile up in memory.
    keras.backend.clear_session()

    X_train, X_valid = X_train_full_fe[train_idx], X_train_full_fe[valid_idx]
    y_train, y_valid = y_train_full[train_idx], y_train_full[valid_idx]

    model = keras.Sequential([
        keras.layers.Input(shape=X_train_full_fe.shape[-2:]),
        keras.layers.Bidirectional(keras.layers.LSTM(200, return_sequences=True)),
        keras.layers.Bidirectional(keras.layers.LSTM(150, return_sequences=True)),
        keras.layers.Bidirectional(keras.layers.LSTM(100, return_sequences=True)),
        keras.layers.Dense(100, activation='relu'),
        keras.layers.Dense(1),
    ])

    # Number of optimizer steps per epoch for this fold (ceiling division), taken
    # from the actual fold size instead of a hard-coded train fraction.
    steps_per_epoch = -(-len(X_train) // BATCH_SIZE)
    scheduler = ExponentialDecay(LR_INITIAL, LR_DECAY_EPOCHS * steps_per_epoch, LR_DECAY_RATE)

    # ExponentialDecay is a function of the optimizer *step*, so it is handed to
    # the optimizer directly. Wrapping it in a LearningRateScheduler callback
    # would evaluate it with the epoch index instead, leaving the rate almost
    # constant because decay_steps is expressed in batches.
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=scheduler), loss='mae')
    model.fit(
        X_train, y_train,
        validation_data=(X_valid, y_valid),
        epochs=EPOCH,
        batch_size=BATCH_SIZE,
    )

    # Flatten the model output into one value per test row.
    test_preds.append(model.predict(X_test_full_fe).reshape(-1))

# 5. Make the submission CSV file

In [None]:
# Average the per-fold test predictions and write them out using the sample
# submission file as the template.
submission = pd.read_csv('../input/ventilator-pressure-prediction/sample_submission.csv')
fold_average = sum(test_preds) / N_SPLITS
submission = submission.assign(pressure=fold_average)
submission.to_csv('submission.csv', index=False)
print('end')