# Ventilator pressure prediction
We'll be using the XGBoost, CatBoost, and LightGBM regressors to predict the pressure. I found that XGBoost worked best for me without any fine-tuning; the other models should also work well if they are tuned properly.

[amitnikhade.com](https://amitnikhade.com)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in files):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# import tensorflow as tf
# # detect and init the TPU
# tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()

# # instantiate a distribution strategy
# tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

# # instantiating the model in the strategy scope creates the model on the TPU


Load data and analyse it

In [None]:
# Load the competition training set (train.csv) from the input directory into a DataFrame.
df = pd.read_csv('../input/ventilator-pressure-prediction/train.csv')

In [None]:
# Bare expression: the notebook renders the training DataFrame as the cell output.
df

In [None]:
# Load the competition test set (test.csv); these are the rows predicted for the submission.
test = pd.read_csv('../input/ventilator-pressure-prediction/test.csv')
# Bare expression: the notebook renders the test DataFrame as the cell output.
test

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Per-column flag: True if the column contains at least one missing value.
df.isna().any()

In [None]:
# Print the column dtypes, non-null counts and memory usage of the training frame.
df.info() 

Split the data into features and label

In [None]:
# Feature matrix: every column of the training frame except the target.
X = df.drop(columns='pressure')

In [None]:
# Regression target: the 'pressure' column of the training frame.
y = df['pressure']

In [None]:
# X = X[:5000]
# y = y[:5000]

In [None]:
# i.shape
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for validation. A fixed random_state makes the
# split, and every score computed from it, reproducible across runs.
# NOTE(review): this is a plain row-wise shuffle; if rows are grouped (e.g.
# several rows per breath), related rows can land on both sides of the split.
# Confirm whether a grouped split is needed.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, random_state=2021)

In [None]:
# !pip install lightgbm --install-option=--gpu
# !pip install lightgbm --install-option=--cuda
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.preprocessing import Normalizer

## XGBoost

In [None]:
# XGBoost regressor trained on the GPU, tracking MAE as its evaluation metric.
xgb_params = dict(
    seed=2021,
    n_estimators=500,
    verbosity=1,
    eval_metric="mae",
    tree_method="gpu_hist",
    gpu_id=0,
)
xgb = XGBRegressor(**xgb_params)
xgb.fit(X_train, y_train)

performance

In [None]:
from numpy import absolute
from pandas import read_csv
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

# 10-fold cross-validation repeated 3 times (30 fits) on the training split.
# NOTE(review): n_jobs=-1 runs the folds in parallel while the estimator is
# configured for a single GPU (gpu_id=0). Confirm the device can serve all
# workers at once.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# The scorer returns negated MAE values; absolute() turns them back into errors.
scores = absolute(
    cross_val_score(xgb, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
)
print('Mean MAE: %.3f (%.3f)' % (scores.mean(), scores.std()))

In [None]:

from sklearn.metrics import mean_absolute_error

# Score the model that was fitted on X_train against the held-out validation
# rows. cross_val_score would clone the estimator and refit it on folds of the
# validation data, so the trained model itself would never be measured.
valid_mae = mean_absolute_error(y_valid, xgb.predict(X_valid))
print('Validation MAE: %.3f' % valid_mae)

In [None]:
# Predict the test rows with the fitted XGBoost model and write them into the
# sample-submission template.
# NOTE(review): assumes the template rows are in the same order as test.csv. Confirm.
pred = xgb.predict(test)
submission_template = '../input/ventilator-pressure-prediction/sample_submission.csv'
sub = pd.read_csv(submission_template).assign(pressure=pred)
sub.to_csv('submission.csv', index=False)

## LGBM

In [None]:
# LightGBM regressor with 10 trees, configured for GPU training with MAE as its metric.
lgbm_params = dict(
    random_state=2021,
    n_estimators=10,
    metric="mae",
    device_type="gpu",
    gpu_platform_id=0,
    gpu_device_id=0,
)
lgbm = LGBMRegressor(**lgbm_params)
lgbm.fit(X_train, y_train)

performance

In [None]:
# 10-fold cross-validation repeated 3 times (30 fits) on the training split.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# The scorer returns negated MAE values; absolute() turns them back into errors.
scores = absolute(
    cross_val_score(lgbm, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
)
print('Mean MAE: %.3f (%.3f)' % (scores.mean(), scores.std()))

In [None]:

from sklearn.metrics import mean_absolute_error

# Score the model that was fitted on X_train against the held-out validation
# rows. cross_val_score would clone the estimator and refit it on folds of the
# validation data, so the trained model itself would never be measured.
valid_mae = mean_absolute_error(y_valid, lgbm.predict(X_valid))
print('Validation MAE: %.3f' % valid_mae)

## CatBoost

In [None]:
# CatBoost regressor: 50 shallow (depth-3) trees trained on GPU device 0,
# tracking MAE as its evaluation metric.
cat_params = dict(
    iterations=50,
    depth=3,
    eval_metric="MAE",
    learning_rate=0.1,
    task_type="GPU",
    devices="0",
)
cat = CatBoostRegressor(**cat_params)
cat.fit(X_train, y_train)

performance

In [None]:
# 10-fold cross-validation repeated 3 times (30 fits) on the training split.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# The scorer returns negated MAE values; absolute() turns them back into errors.
scores = absolute(
    cross_val_score(cat, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
)
print('Mean MAE: %.3f (%.3f)' % (scores.mean(), scores.std()))

In [None]:

from sklearn.metrics import mean_absolute_error

# Score the model that was fitted on X_train against the held-out validation
# rows. cross_val_score would clone the estimator and refit it on folds of the
# validation data, so the trained model itself would never be measured.
valid_mae = mean_absolute_error(y_valid, cat.predict(X_valid))
print('Validation MAE: %.3f' % valid_mae)

## GRU RNN

In [None]:
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, GRU, Dropout, Flatten

def baseline_model():
    """Build and compile the two-layer GRU regressor.

    The network takes inputs of shape (1, 7), stacks two 32-unit GRU layers
    with 20% dropout between them, flattens the sequence output and feeds it
    through a 64-unit ReLU layer into a single output unit. It is compiled
    with mean-absolute-error loss and the Adam optimizer.

    Returns:
        The compiled Sequential model.
    """
    layers = [
        GRU(32, kernel_initializer='normal', activation='relu', return_sequences=True, input_shape=(1, 7)),
        Dropout(0.2),
        GRU(32, kernel_initializer='normal', return_sequences=True, activation='relu'),
        Flatten(),
        Dense(64, kernel_initializer='normal', activation='relu'),
        Dense(1, kernel_initializer='normal'),
    ]
    model = Sequential(layers)
    model.compile(loss='mean_absolute_error', optimizer='adam')
    return model

# Wrap the Keras model so scikit-learn can cross-validate it.
estimator = KerasRegressor(build_fn=baseline_model, batch_size=16, epochs=2, verbose=1) # increase the epoch to around 20 for better result
kfold = KFold(n_splits=2)  # replace the split with 5
# Each row becomes a length-1 sequence: expand_dims inserts the time axis the
# GRU input expects.
results = cross_val_score(estimator, np.expand_dims(X, axis=1), y,scoring='neg_mean_absolute_error', cv=kfold)
# The scorer returns negated MAE values; flip them back so the printed "MAE"
# is a positive error, matching the other model cells.
results = np.absolute(results)
print("Baseline: %.2f (%.2f) MAE" % (results.mean(), results.std()))

In [None]:
# train with the same configuration as below
# from tensorflow.keras.callbacks import EarlyStopping
# early_stopping = EarlyStopping(monitor='val_loss', patience = 3, restore_best_weights=True )
# history = estimator.fit(np.expand_dims(X, axis=1),y,validation_split=0.2, epochs=20,callbacks=[early_stopping])

## GRU TPU

Train with TPU

In [None]:
# from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import KFold
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, LSTM, GRU, Dropout, Flatten

# def baseline_model():
#     with tpu_strategy.scope():
#         model = Sequential()
#         model.add(GRU(32, kernel_initializer='normal', activation='relu',return_sequences=True, input_shape=(1, 7)))
#         model.add(Dropout(0.2))
#         model.add(GRU(32, kernel_initializer='normal', return_sequences=True,activation='relu'))
#         model.add(Flatten())
#         model.add(Dense(64, kernel_initializer='normal', activation='relu'))
#         model.add(Dense(1,kernel_initializer='normal'))
#         model.compile(loss='mean_absolute_error', optimizer='adam')
#         return model

# estimator = KerasRegressor(build_fn=baseline_model, batch_size=16, verbose=1)
# kfold = KFold(n_splits=5)
# results = cross_val_score(estimator, np.expand_dims(X_train, axis=1), y_train,scoring='neg_mean_absolute_error', cv=kfold)
# print("Baseline: %.2f (%.2f) MAE" % (results.mean(), results.std()))

In [None]:
# from tensorflow.keras.callbacks import EarlyStopping
# early_stopping = EarlyStopping(monitor='val_loss', patience = 3, restore_best_weights=True )
# history = estimator.fit(np.expand_dims(X, axis=1),y,validation_split=0.2, epochs=20,callbacks=[early_stopping])

In [None]:
# history.history['loss']

In [None]:
# history.history['val_loss']

In [None]:
# import matplotlib.pyplot as plt
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.title('model loss')
# plt.ylabel('loss')
# plt.xlabel('epoch')
# plt.legend(['train', 'test'], loc='upper left')
# plt.show()

In [None]:
# Thanks!