## DATA LOADING AND LIBRARIES 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split

import xgboost as xgb


from warnings import filterwarnings
filterwarnings('ignore')

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Competition tables. train/test use their first CSV column as the index;
# the sample submission keeps the default integer index.
train = pd.read_csv(
    '../input/ventilator-pressure-prediction/train.csv',
    index_col=0,
)
test = pd.read_csv(
    '../input/ventilator-pressure-prediction/test.csv',
    index_col=0,
)
sample = pd.read_csv(
    '../input/ventilator-pressure-prediction/sample_submission.csv',
)

# FEATURE ENGINEERING

IDEA FROM  - https://www.kaggle.com/artgor/ventilator-pressure-prediction-eda-fe-and-models

In [None]:
def add_breath_features(df):
    """Add the engineered per-breath features used by the model.

    The same function is applied to the train and the test frame so that
    both always carry identically defined features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns ``breath_id``, ``u_in``,
        ``u_out``, ``R`` and ``C``. New columns are written into this frame
        before it is re-created by ``fillna``/``get_dummies``, so always
        continue with the returned frame.

    Returns
    -------
    pandas.DataFrame
        The frame with the feature columns appended, ``R`` and ``C``
        replaced by their one-hot encoded columns, and NaNs filled with 0.
    """
    by_breath = df.groupby('breath_id')

    # rewritten calculation of lag features from this notebook:
    # https://www.kaggle.com/patrick0302/add-lag-u-in-as-new-feat
    df['last_value_u_in'] = by_breath['u_in'].transform('last')
    # Lags are shifted *within* each breath, so the first row of a breath no
    # longer receives the last value of the previous breath; it gets NaN,
    # which the fillna below turns into 0.
    # NOTE(review): assumes the rows of a breath are stored in time order,
    # as the previous un-grouped shift also did.
    df['u_in_lag'] = by_breath['u_in'].shift(1)
    df['u_out_lag'] = by_breath['u_out'].shift(1)
    df = df.fillna(0)

    by_breath = df.groupby('breath_id')

    # max value of u_in and u_out for each breath
    u_in_max = by_breath['u_in'].transform('max')
    df['breath_id__u_in__max'] = u_in_max
    df['breath_id__u_out__max'] = by_breath['u_out'].transform('max')

    # difference between consecutive values; because the lags are computed
    # per breath, these differences never span two breaths
    df['u_in_diff'] = df['u_in'] - df['u_in_lag']
    df['u_out_diff'] = df['u_out'] - df['u_out_lag']

    # difference between the current value of u_in and the max / mean value
    # within the breath
    df['breath_id__u_in__diffmax'] = u_in_max - df['u_in']
    df['breath_id__u_in__diffmean'] = by_breath['u_in'].transform('mean') - df['u_in']

    # OHE: R and C are replaced by R_<value> / C_<value> indicator columns,
    # appended after the existing columns (R columns first, then C columns).
    df = pd.get_dummies(df, columns=['R', 'C'])

    # https://www.kaggle.com/c/ventilator-pressure-prediction/discussion/273974
    df['u_in_cumsum'] = df.groupby('breath_id')['u_in'].cumsum()

    return df


# Identical feature engineering for the train and the test data.
train = add_breath_features(train)
test = add_breath_features(test)

# TRAIN TEST SPLIT

In [None]:
# Containers filled by the cross-validation loop.
scores, models = [], []
feature_importance = pd.DataFrame()

# Model inputs: every column of train except the identifiers and the target.
excluded = ('id', 'breath_id', 'pressure')
columns = [name for name in train.columns if name not in excluded]

y = train['pressure']
X = train[columns]

In [None]:
# Keyword arguments unpacked into xgb.XGBRegressor for every CV fold.
# NOTE(review): 'gpu_hist' presumably needs a GPU-enabled runtime — confirm
# before running on a CPU-only machine.
param = {
    'lambda': 0.026484677457314282,
    'alpha': 0.014849938251506533,
    'colsample_bytree': 1.0,
    'subsample': 0.6,
    'learning_rate': 0.012,
    'max_depth': 17,
    'random_state': 24,
    'min_child_weight': 28,
    'tree_method': 'gpu_hist',
}

In [None]:
from sklearn.model_selection import GroupKFold
import os
import time
from sklearn import metrics


# MODEL TRAINED WITH GROUPKFOLD CROSS-VALIDATION

In [None]:
# 5-fold cross-validation grouped by breath_id: one XGBRegressor is trained
# per fold, and its validation MAE and feature importances are recorded.
folds = GroupKFold(n_splits=5)
fold_splits = folds.split(train, y, groups=train['breath_id'])

for fold_n, (fit_rows, valid_rows) in enumerate(fold_splits):
    print(f'Fold {fold_n} started at {time.ctime()}')

    # X already holds exactly the feature columns, so plain positional
    # row selection is enough.
    X_train, y_train = X.iloc[fit_rows], y.iloc[fit_rows]
    X_valid, y_valid = X.iloc[valid_rows], y.iloc[valid_rows]

    model = xgb.XGBRegressor(**param, n_estimators=5000)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        verbose=1000,
        early_stopping_rounds=100,
    )

    # Score first, then store, so a failing prediction leaves both lists untouched.
    score = metrics.mean_absolute_error(y_valid, model.predict(X_valid))
    models.append(model)
    scores.append(score)

    # Per-fold importances, tagged with a 1-based fold number.
    fold_importance = pd.DataFrame({
        "feature": columns,
        "importance": model.feature_importances_,
        "fold": fold_n + 1,
    })
    feature_importance = pd.concat([feature_importance, fold_importance], axis=0)

print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

In [None]:
# Average the test predictions of all fold models into the submission frame.
# The result is assigned rather than accumulated into sample['pressure'], so
# it does not depend on the column's previous content and re-running this
# cell gives the same answer. The divisor follows the number of trained
# models instead of a hard-coded fold count.
if not models:
    raise RuntimeError('No trained models available; run the cross-validation cell first.')

X_test = test[columns]  # feature selection hoisted out of the loop
pressure_sum = np.zeros(len(X_test), dtype=np.float64)
for model in models:
    pressure_sum += model.predict(X_test)
sample['pressure'] = pressure_sum / len(models)

In [None]:
# Plot the 50 features with the highest importance averaged over the folds.
# feature_importance holds one row per (feature, fold); both the groupby
# mean below and seaborn's barplot already average over folds, so the raw
# values must not be pre-divided by the fold count. Doing so showed 1/5 of
# the real average and shrank the values again on every re-run of the cell.
mean_importance = (
    feature_importance[["feature", "importance"]]
    .groupby("feature")
    .mean()
    .sort_values(by="importance", ascending=False)
)
cols = mean_importance[:50].index

best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

plt.figure(figsize=(16, 12));
# order= makes the bars follow the fold-averaged ranking instead of the
# first occurrence of each feature in a row-wise sort.
sns.barplot(x="importance", y="feature", data=best_features, order=list(cols));
plt.title('XGB Features (avg over folds)');

In [None]:
# Save the submission frame as CSV without writing the index column.
sample.to_csv(path_or_buf='XGBregressor.csv', index=False)