In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install autogluon==0.3.1 --quiet # autogluon==0.2.0

In [None]:
!pip install scikit-learn -U --quiet

In [None]:
# Importing core libraries
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
# Importing AutoGluon
from autogluon.tabular import TabularDataset, TabularPredictor

# Scikit Learn
from sklearn.model_selection import train_test_split
### YOUR FEATURE ENGINEERING GOES HERE

from sklearn.decomposition import FactorAnalysis
from sklearn.preprocessing import StandardScaler
sns.set_style('whitegrid')
plt.style.use("fivethirtyeight")
%matplotlib inline

In [None]:
# Train / test tables come from feather files (the file names suggest they carry
# fold assignments); each has an 'index' column, which is dropped right after loading.
train = pd.read_feather('../input/folds-just-added-in-feather-format/train_folds.ftr')
train = train.drop(columns='index')

test = pd.read_feather('../input/folds-just-added-in-feather-format/test_stratfold.ftr')
test = test.drop(columns='index')

# Template for the submission file written at the end of the notebook.
submission = pd.read_csv('../input/ventilator-pressure-prediction/sample_submission.csv')

In [None]:
# Name of the target column; passed to TabularPredictor(label=...) further down.
label = 'pressure'

## EDA

In [None]:
# R and C settings, from paper https://arxiv.org/pdf/2102.06779.pdf
# A cell only renders its last expression automatically, so both tables are
# passed to display() explicitly; otherwise the R counts would never be shown.
display(
    train["R"].value_counts(),  # 50, 5 and 20
    train["C"].value_counts(),  # 10, 50, 20
)

In [None]:
# Pressure and u_in against time_step for one breath (breath_id == 1, at most 500 rows).
first_breath = train.loc[train["breath_id"] == 1].head(500)
first_breath.plot.line(x='time_step', y=['pressure', 'u_in'])

## Feature Engineering

In [None]:
# rewritten calculation of lag features from this notebook: https://www.kaggle.com/patrick0302/add-lag-u-in-as-new-feat
# some of ideas from this notebook: https://www.kaggle.com/mst8823/google-brain-lightgbm-baseline
train['last_value_u_in'] = train.groupby('breath_id')['u_in'].transform('last')
train['u_in_lag1'] = train.groupby('breath_id')['u_in'].shift(1)
train['u_out_lag1'] = train.groupby('breath_id')['u_out'].shift(1)
train['u_in_lag_back1'] = train.groupby('breath_id')['u_in'].shift(-1)
train['u_out_lag_back1'] = train.groupby('breath_id')['u_out'].shift(-1)
train['u_in_lag2'] = train.groupby('breath_id')['u_in'].shift(2)
train['u_out_lag2'] = train.groupby('breath_id')['u_out'].shift(2)
train['u_in_lag_back2'] = train.groupby('breath_id')['u_in'].shift(-2)
train['u_out_lag_back2'] = train.groupby('breath_id')['u_out'].shift(-2)
train = train.fillna(0)

train['R__C'] = train["R"].astype(str) + '__' + train["C"].astype(str)

# max value of u_in and u_out for each breath
train['breath_id__u_in__max'] = train.groupby(['breath_id'])['u_in'].transform('max')
train['breath_id__u_out__max'] = train.groupby(['breath_id'])['u_out'].transform('max')

# difference between consequitive values
train['u_in_diff1'] = train['u_in'] - train['u_in_lag1']
train['u_out_diff1'] = train['u_out'] - train['u_out_lag1']
train['u_in_diff2'] = train['u_in'] - train['u_in_lag2']
train['u_out_diff2'] = train['u_out'] - train['u_out_lag2']
# from here: https://www.kaggle.com/yasufuminakama/ventilator-pressure-lstm-starter
train.loc[train['time_step'] == 0, 'u_in_diff'] = 0
train.loc[train['time_step'] == 0, 'u_out_diff'] = 0

# difference between the current value of u_in and the max value within the breath
train['breath_id__u_in__diffmax'] = train.groupby(['breath_id'])['u_in'].transform('max') - train['u_in']
train['breath_id__u_in__diffmean'] = train.groupby(['breath_id'])['u_in'].transform('mean') - train['u_in']

# https://www.kaggle.com/c/ventilator-pressure-prediction/discussion/273974
train['u_in_cumsum'] = train.groupby(['breath_id'])['u_in'].cumsum()
train['time_step_cumsum'] = train.groupby(['breath_id'])['time_step'].cumsum()
# https://www.kaggle.com/yasufuminakama/ventilator-pressure-lstm-starter
train['breath_time'] = train['time_step'] - train.groupby('breath_id')['time_step'].shift(1)

In [None]:
# all the same for the test data
test['last_value_u_in'] = test.groupby('breath_id')['u_in'].transform('last')
test['u_in_lag1'] = test.groupby('breath_id')['u_in'].shift(1)
test['u_out_lag1'] = test.groupby('breath_id')['u_out'].shift(1)
test['u_in_lag_back1'] = test.groupby('breath_id')['u_in'].shift(-1)
test['u_out_lag_back1'] = test.groupby('breath_id')['u_out'].shift(-1)
test['u_in_lag2'] = test.groupby('breath_id')['u_in'].shift(2)
test['u_out_lag2'] = test.groupby('breath_id')['u_out'].shift(2)
test['u_in_lag_back2'] = test.groupby('breath_id')['u_in'].shift(-2)
test['u_out_lag_back2'] = test.groupby('breath_id')['u_out'].shift(-2)
test = test.fillna(0)

test['R__C'] = test["R"].astype(str) + '__' + test["C"].astype(str)

test['breath_id__u_in__max'] = test.groupby(['breath_id'])['u_in'].transform('max')
test['breath_id__u_out__max'] = test.groupby(['breath_id'])['u_out'].transform('max')

test['u_in_diff1'] = test['u_in'] - test['u_in_lag1']
test['u_out_diff1'] = test['u_out'] - test['u_out_lag1']
test['u_in_diff2'] = test['u_in'] - test['u_in_lag2']
test['u_out_diff2'] = test['u_out'] - test['u_out_lag2']
test.loc[test['time_step'] == 0, 'u_in_diff'] = 0
test.loc[test['time_step'] == 0, 'u_out_diff'] = 0

test['breath_id__u_in__diffmax'] = test.groupby(['breath_id'])['u_in'].transform('max') - test['u_in']
test['breath_id__u_in__diffmean'] = test.groupby(['breath_id'])['u_in'].transform('mean') - test['u_in']

test['u_in_cumsum'] = test.groupby(['breath_id'])['u_in'].cumsum()
test['time_step_cumsum'] = test.groupby(['breath_id'])['time_step'].cumsum()

test['breath_time'] = test['time_step'] - test.groupby('breath_id')['time_step'].shift(1)

In [None]:
# Replace every remaining missing value with 0 in both frames.
train, test = (frame.fillna(0) for frame in (train, test))

In [None]:
# Preview the first five rows of the engineered training table.
display(train.head(n=5))

## Trying AutoGluon

In [None]:
!mkdir agModels

In [None]:
# breath_id is removed from the table handed to AutoGluon.
train_data = TabularDataset(train.drop(columns="breath_id"))

# The first 100,000 rows of that same table. This is a slice of the training
# rows, not a held-out split.
val_data = TabularDataset(train.drop(columns="breath_id").head(100_000))

In [None]:
# Show the table that will be passed to fit() (the cell's last expression is rendered).
train_data

In [None]:
# --- Training configuration -------------------------------------------------
save_path = 'agModels'  # specifies folder to store trained models
presets = 'best_quality'
metric = 'mean_absolute_error'
hours = 4  # training time budget, converted to seconds below

# Per-model overrides. NOTE: this dict is currently not passed to fit() (the
# `hyperparameters=` argument is disabled), so it has no effect on training.
# Disabled entries kept for reference: 'NN': {'num_epochs': 500}, 'KNN': {},
# 'custom': ['GBM'].
hyperparameters = {
    'GBM': {},
    'CAT': {'iterations': 18000},
    'RF': {},
    'XT': {},
}

time_limit_seconds = int(60 * 60 * hours)

# --- Fit --------------------------------------------------------------------
# groups='kfold' names a column of train_data (presumably the fold assignment
# shipped with the feather file — verify).
predictor = TabularPredictor(
    label=label,
    eval_metric=metric,
    path=save_path,
    groups='kfold',
)
# Disabled fit() options kept for reference: ag_args_fit={'num_gpus': 1},
# hyperparameters=hyperparameters, num_bag_folds=5, num_bag_sets=1.
predictor = predictor.fit(
    train_data,
    presets=presets,
    time_limit=time_limit_seconds,
)

In [None]:
# Collect the summary of the fit; show_plot=True additionally requests a plot.
results = predictor.fit_summary(show_plot=True)

In [None]:
# Score the trained models on val_data.
# NOTE(review): val_data is the first 100,000 rows of the table that was passed
# to fit(), so these scores are in-sample rather than a held-out estimate.
leaderboard = predictor.leaderboard(val_data)

In [None]:
# Build the test table the same way as the training table (breath_id removed).
X_test = TabularDataset(test.drop(columns="breath_id"))
# X_test is already a TabularDataset, so no second wrap is needed; the name
# test_data is kept as an alias for it.
test_data = X_test
test_preds = predictor.predict(test_data)

In [None]:
# Put the predictions into the submission template (pandas aligns them on the
# row index), write the CSV without the index column, and preview the result.
submission = submission.assign(pressure=test_preds)
submission.to_csv("submission.csv", index=False)
submission.head()