## About this notebook

+ using PyTorch Lightning
+ wandb
    + However, I don't know how to hide the API KEY, so I commented it out.
+ Can be run on kaggle, google colab, local machine
+ The features come from a private dataset, but this notebook also writes the loaded data out (`train.csv` / `test.csv`) so you can reuse it :)
+ my LOCAL CV
  + fold0:0.2326
  + fold1:0.221
  + fold2:0.2289
  + fold3:0.2295
  + fold4:0.2263
  + Ensemble Folds with MEDIAN:0.192
  
### Sorry, you may run into a CPU memory error

It may be solved by skipping the out-of-fold (OOF) predictions.

### reference

Thank you for publishing a very educational notebook

+ https://www.kaggle.com/yasufuminakama/ventilator-pressure-lstm-starter
+ https://www.kaggle.com/hirayukis/pytorch-lstm-cv-0-1942-lb-0-193
+ https://www.kaggle.com/artgor/ventilator-pressure-prediction-eda-fe-and-models
+ https://www.kaggle.com/theoviel/deep-learning-starter-simple-lstm
+ https://www.kaggle.com/cdeotte/ensemble-folds-with-median-0-153

## Get env

In [None]:
!nvidia-smi

In [None]:
import sys
import os
# Work out where the notebook runs by checking which platform modules are already loaded.
IN_COLAB = 'google.colab' in sys.modules
IN_KAGGLE = 'kaggle_web_client' in sys.modules
# Neither hosted platform detected -> treat the run as a local machine.
LOCAL = not IN_KAGGLE and not IN_COLAB
print(f'IN_COLAB:{IN_COLAB}, IN_KAGGLE:{IN_KAGGLE}, LOCAL:{LOCAL}')

In [None]:
# # For Colab Download some datasets
# # ================================
# if IN_COLAB:
#     # mount googledrive
#     from google.colab import drive
#     drive.mount('/content/drive')
#     # copy kaggle.json from googledrive
#     ! pip install --upgrade --force-reinstall --no-deps  kaggle > /dev/null
#     ! mkdir ~/.kaggle
#     ! cp "/content/drive/MyDrive/kaggle/kaggle.json" ~/.kaggle/
#     ! chmod 600 ~/.kaggle/kaggle.json
    
#     if not os.path.exists("/content/input/"):
#         !mkdir input
#         !mkdir input/features
#         !kaggle datasets download -d teyosan1229/ventilator-pressure
#         !unzip /content/ventilator-pressure.zip -d input/features
#         !kaggle competitions download -c ventilator-pressure-prediction
#         !unzip /content/ventilator-pressure-prediction.zip -d input

In [None]:
# On Kaggle / Colab, install wandb, pytorch-lightning and torch_optimizer.
# The block is skipped on a local machine.
if IN_KAGGLE or IN_COLAB:
    !pip install --upgrade -q wandb
    !pip install -q pytorch-lightning
    !pip install torch_optimizer

## Import Libraries

In [None]:
# Hide Warning
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# Python Libraries
import os
import math
import random
import glob
import pickle
from collections import defaultdict
from pathlib import Path

# Third party
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline
sns.set(style="whitegrid")

# Utilities and Metrics
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from sklearn.preprocessing import RobustScaler, normalize, QuantileTransformer
from sklearn.metrics import mean_absolute_error #[roc_auc_score, accuracy_score]

# Pytorch 
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau
from torch.optim.optimizer import Optimizer, required
import torch_optimizer as optim

# Pytorch Lightning
import pytorch_lightning as pl
from pytorch_lightning import Callback, seed_everything
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import WandbLogger, CSVLogger

# Weights and Biases Tool
import wandb
#os.environ["WANDB_API_KEY"]='hoge'
#wandb.login()

## Config

In [None]:
class CFG:
    """Experiment configuration, read as plain class attributes throughout the notebook."""
    debug = False  # when True, the data-loading cell reduces `epochs` to 5
    competition='ventilator'
    exp_name = "public"  # used in the Colab/local output directory and the saved .pth file name
    seed = 29
    
    # data
    target_col = 'pressure'
    target_size = 1
    
    # optimizer
    optimizer_name = 'RAdam'#['RAdam', 'sgd']
    lr = 5e-3
    weight_decay = 1e-5
    amsgrad = False  # only read by the 'Adam' branch of get_optimizer
    
    # scheduler
    epochs = 300
    scheduler = 'CosineAnnealingLR'
    T_max = 300  # placeholder: train() overwrites this with (steps per epoch) * epochs
    min_lr = 1e-5
    # criterion
    # loss that ignores the time steps where u_out == 1
    criterion_name = 'CustomLoss1'
    
    # training
    train = True
    inference = True  # when True, train() also writes the submission and OOF predictions
    n_fold = 5
    trn_fold = [0]  # only the folds listed here are trained by train()
    precision = 16 #[16, 32, 64]
    grad_acc = 1  # passed to pl.Trainer as accumulate_grad_batches
    # DataLoader
    # Keyword arguments forwarded to torch DataLoader; the "valid" entry is also used
    # for the test loader. num_workers is raised to 4 after the class when not LOCAL.
    loader = {
        "train": {
            "batch_size": 1024,
            "num_workers": 0,
            "shuffle": True,
            "pin_memory": True,
            "drop_last": True
        },
        "valid": {
            "batch_size": 1024,
            "num_workers": 0,
            "shuffle": False,
            "pin_memory": True,
            "drop_last": False
        }
    }
    # pl
    # Extra keyword arguments unpacked into pl.Trainer(...) inside train().
    trainer = {
        'gpus': 1,
        'progress_bar_refresh_rate': 1,
        'benchmark': False,
        'deterministic': True,
        }
    # LSTM
    # Stored on CustomModel, but the LSTM stack there is written out explicitly as four layers.
    num_layers = 4

    # Categorical / one-hot style inputs. These are NOT passed through the rank-Gauss transform.
    # NOTE: 'u_out' is feature index 0 of feature_cols; index 2 is 'u_out_lag2'.
    cate_cols = ['u_out'] + \
                ['u_out_lag','u_out_lag2','u_out_lag3','u_out_lag_back','u_out_lag_back2','u_out_lag_back3'] + \
                ['R_20', 'R_5', 'R_50', 'C_10', 'C_20', 'C_50', 'RC_2010', 'RC_2020', 'RC_2050', 'RC_5010', 'RC_5020', 'RC_5050', 'RC_510', 'RC_520', 'RC_550']

    # Continuous inputs; each of these columns is rank-Gauss (QuantileTransformer) scaled later.
    cont_cols =['time_step', 'u_in'] + ['area'] + ['cross', 'cross2'] + ['u_in_cumsum', 'u_in_cummean'] + \
               ['u_in_lag','u_in_lag2','u_in_lag3','u_in_lag_back','u_in_lag_back2','u_in_lag_back3'] + \
               ['breath_time', 'u_in_time'] + ['u_out0_mean', 'u_out0_max', 'u_out0_std', 'u_out1_mean', 'u_out1_max', 'u_out1_std'] + \
               ['u_in_lag1_diff', 'u_in_lag2_diff','u_in_lag3_diff', 'u_in_lag4_diff'] + \
               ['u_in_rolling_mean2', 'u_in_rolling_mean4','u_in_rolling_mean10', 'u_in_rolling_max2', 'u_in_rolling_max4', 'u_in_rolling_max10',
                'u_in_rolling_min2', 'u_in_rolling_min4', 'u_in_rolling_min10', 'u_in_rolling_std2', 'u_in_rolling_std4', 'u_in_rolling_std10'] + \
               ['breath_id__u_in__max', 'breath_id__u_out__max']

    # Model input columns, in the order used on the last axis of X / test_X.
    feature_cols = cate_cols + cont_cols
    dense_dim = 512  # width of the MLP output and base size of the LSTM stack in CustomModel
    hidden_size = 512  # used by CustomModel to size the output head
    logit_dim = 512  # stored on CustomModel; not otherwise used in the code shown here
    
# Seed the random number generators through Lightning's helper before anything stochastic runs.
seed_everything(CFG.seed)
# Outside of a local run, switch both loader configs to 4 worker processes.
if not LOCAL:
    CFG.loader["train"].update(num_workers=4)
    CFG.loader["valid"].update(num_workers=4)

In [None]:
# Sanity check: number of model input features and the train loader's worker count.
len(CFG.feature_cols),CFG.loader["train"]["num_workers"]

## Directory & LoadData

In [None]:
# Pick input / feature / output locations for the detected runtime.
# INPUT_DIR and FEAT_DIR are Path objects; OUTPUT_DIR is a plain string.
if IN_KAGGLE:
    INPUT_DIR = Path('../input/ventilator-pressure-prediction')
    FEAT_DIR = Path('../input/ventilator-pressure')
    OUTPUT_DIR = './'
elif IN_COLAB:
    INPUT_DIR = Path('/content/input/')
    FEAT_DIR = Path('/content/input/features/')
    OUTPUT_DIR = f'/content/drive/MyDrive/kaggle/Ventilator Pressure/{CFG.exp_name}/'
# NOTE(review): the local paths are hard-coded to one machine's F: drive; adjust before running locally.
if LOCAL:
    INPUT_DIR = Path("F:/Kaggle/ventilator-pressure-prediction/data/input/")
    FEAT_DIR = Path("F:/Kaggle/ventilator-pressure-prediction/data/input/features/")
    OUTPUT_DIR = f'F:/Kaggle/ventilator-pressure-prediction/data/output/{CFG.exp_name}/'
    
def load_datasets(feats):
    """Read the feather file of every feature group and join them column-wise.

    For each name in ``feats`` this reads ``<name>_train.ftr`` and
    ``<name>_test.ftr`` from ``FEAT_DIR``. All train files are read and
    concatenated first, then all test files.

    Args:
        feats: sequence of feature-group names (file-name prefixes).

    Returns:
        Tuple ``(X_train, X_test)`` of DataFrames concatenated along axis 1.
    """
    def _read_all(suffix):
        # One frame per feature group, glued side by side.
        frames = [pd.read_feather(FEAT_DIR / f'{name}{suffix}') for name in feats]
        return pd.concat(frames, axis=1)

    X_train = _read_all('_train.ftr')
    X_test = _read_all('_test.ftr')
    return X_train, X_test


# Feature groups to load; you can substitute your own engineered feature files here.
feats = ['Base', 'Area', 'Cross', 'U_in_cumsum_mean', 'U_in_Lag', 'U_out_Lag', 'RC_OHE', 'U_out_stat', 'Time', 'U_in_Lag_Diff', 'U_in_Rolling', 'U_inout_max']
df_train, df_test = load_datasets(feats)
# Frame that later receives the out-of-fold predictions next to the ground truth.
# The "fold" column is expected to come from the loaded feature files.
df_oof = df_train[["id","breath_id","u_out", "pressure", "fold"]].copy()

submission = pd.read_csv(INPUT_DIR / "sample_submission.csv")
display(df_train.head())
display(df_test.head())

# exist_ok=True replaces the check-then-create pair: no race between the check and
# the creation, and a no-op when the directory is already there.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Debug runs only shorten training; the two switches below are left disabled.
if CFG.debug:
    CFG.epochs = 5
    #CFG.inference = False
    #df_train = df_train.head(240000)

In [None]:
# Write the loaded train / test frames (raw columns plus features) to CSV in the working directory.
# This runs before the rank-Gauss cell, so the exported values are untransformed.
df_train.to_csv('train.csv',index=False)
df_test.to_csv('test.csv',index=False)

## Utils

In [None]:
# # LINEに通知
# import requests
# def send_line_notification(message):
#     env = ""
#     if IN_COLAB: env = "colab"
#     elif IN_KAGGLE: env = "kaggle"
#     elif LOCAL: env = "local"
        
#     line_token = 'hoge'
#     endpoint = 'https://notify-api.line.me/api/notify'
#     message = f"[{env}]{message}"
#     payload = {'message': message}
#     headers = {'Authorization': 'Bearer {}'.format(line_token)}
#     requests.post(endpoint, data=payload, headers=headers)

## CV Split

In [None]:
# df_train["fold"] = -1
# Fold = GroupKFold(n_splits=CFG.n_fold)
# for n, (train_index, val_index) in enumerate(Fold.split(df_train, df_train[CFG.target_col], groups=df_train.breath_id.values)):
#      df_train.loc[val_index, 'fold'] = int(n)
# df_train['fold'] = df_train['fold'].astype(int)
# df_oof = df_train[["id","breath_id","u_out", "pressure", "fold"]].copy()
# print(df_train.groupby(['fold', 'breath_id']).size())

## Transforms

## Dataset

In [None]:
class TrainDataset(Dataset):
    """Sequence dataset yielding ``(features, u_out, target)`` for one sequence.

    Args:
        X: array indexed as ``[sequence, time step, feature]``.
        y: array of targets indexed as ``[sequence, time step, 1]``
            (the trailing axis is squeezed away in ``__getitem__``).
        u_out_idx: position of the ``u_out`` column on the feature axis of ``X``.
            Defaults to 0 because ``CFG.feature_cols`` starts with ``'u_out'``.
    """
    def __init__(self, X, y, u_out_idx=0):
        self.X = X
        self.y = y
        # The loss masks on u_out, so this has to be the real u_out channel.
        # The former hard-coded index 2 selected 'u_out_lag2' from CFG.feature_cols.
        self.u_out = self.X[:,:,u_out_idx]
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        """Return ``(x, u_out, label)`` as float / int64 / float tensors for sequence ``idx``."""
        x = torch.FloatTensor(self.X[idx])
        # as_tensor converts the float-valued 0/1 flags to int64 explicitly.
        u_out = torch.as_tensor(self.u_out[idx], dtype=torch.long)
        label = torch.FloatTensor(self.y[idx]).squeeze(1)
        return x, u_out, label
    
class TestDataset(Dataset):
    """Sequence dataset for inference; yields only the feature tensor.

    Args:
        X: array indexed as ``[sequence, time step, feature]``.
        u_out_idx: position of the ``u_out`` column on the feature axis of ``X``.
            Defaults to 0 because ``CFG.feature_cols`` starts with ``'u_out'``.
    """
    def __init__(self, X, u_out_idx=0):
        self.X = X
        # Kept as an attribute for parity with TrainDataset; __getitem__ does not return it.
        # The former hard-coded index 2 selected 'u_out_lag2' from CFG.feature_cols.
        self.u_out = self.X[:,:,u_out_idx]
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        """Return the float feature tensor of sequence ``idx``."""
        return torch.FloatTensor(self.X[idx])

## Rankgauss & Reshape
pandas.DataFrameからnumpyに変換、シーケンス分をまとめる

In [None]:
# Rank-Gauss scaling: map every continuous feature to a normal output distribution.
# Each transformer is fitted on the training column only and then applied to the test column.
# NOTE: the columns are overwritten in place, so re-running this cell transforms
# already-transformed values.
for col in tqdm(CFG.cont_cols):
    qt = QuantileTransformer(random_state=0, output_distribution='normal')
    df_train[[col]] = qt.fit_transform(df_train[[col]])
    df_test[[col]] = qt.transform(df_test[[col]])
#display(df_train.head())
#display(df_test.head())
#df_train.describe().T

In [None]:
# Reshape the row-per-time-step frames into [sequence, 80 steps, feature] float32 arrays.
# assumes every sequence occupies exactly 80 consecutive rows — TODO confirm against the data
X = np.float32(df_train[CFG.feature_cols]).reshape(-1, 80, len(CFG.feature_cols))
test_X = np.float32(df_test[CFG.feature_cols]).reshape(-1, 80, len(CFG.feature_cols))
y = np.float32(df_train["pressure"]).reshape(-1, 80, 1)
# Fold ids are reshaped the same way and averaged over the 80 steps, giving one value
# per sequence (float after the mean). Presumably all rows of a sequence share one fold,
# so the mean equals that fold id — verify against the feature files.
Fold = np.int16(df_train["fold"]).reshape(-1, 80, 1)
Fold = Fold.mean(axis=1).flatten()
print(X.shape, y.shape, test_X.shape, Fold.shape)

In [None]:
# Check that a sample comes back as the tuple (X, u_out, y)
ds = TrainDataset(X,y)
for i in range(3):
    print("="*50)
    # ds[0] is the first sample; i selects its x, u_out and label in turn
    print(ds[0][i])
# Drop the temporary dataset; the real ones are built inside DataModule.setup().
del ds

## DataModule

In [None]:
class DataModule(pl.LightningDataModule):
    """Lightning data module built from in-memory numpy arrays.

    Args:
        tr_X, tr_y: training features and targets.
        val_X, val_y: validation features and targets.
        test_X: features used by the test loader (no targets).
        cfg: config object whose ``loader`` dict holds the DataLoader keyword
            arguments under the keys ``'train'`` and ``'valid'``.
    """
    def __init__(self, tr_X, tr_y, val_X, val_y, test_X, cfg):
        super().__init__()
        self.cfg = cfg
        self.train_data, self.train_label = tr_X, tr_y
        self.valid_data, self.valid_label = val_X, val_y
        self.test_data = test_X

    def setup(self, stage=None):
        """Wrap the stored arrays in datasets; ``stage`` is accepted but ignored."""
        self.train_dataset = TrainDataset(self.train_data, self.train_label)
        self.valid_dataset = TrainDataset(self.valid_data, self.valid_label)
        self.test_dataset = TestDataset(self.test_data)

    def _make_loader(self, dataset, split):
        # `split` selects which entry of cfg.loader supplies the DataLoader kwargs.
        loader_kwargs = self.cfg.loader[split]
        return DataLoader(dataset, **loader_kwargs)

    def train_dataloader(self):
        return self._make_loader(self.train_dataset, 'train')

    def val_dataloader(self):
        return self._make_loader(self.valid_dataset, 'valid')

    def test_dataloader(self):
        # The test loader reuses the validation loader settings.
        return self._make_loader(self.test_dataset, 'valid')

## Pytorch Lightning Module

In [None]:
# ====================================================
# model
# ====================================================
class CustomModel(nn.Module):
    """Per-step regressor: MLP encoder, four bidirectional LSTMs, linear head.

    The input is expected as ``[batch, time step, feature]`` with
    ``len(cfg.feature_cols)`` features (the LSTMs are ``batch_first``).
    The LSTM hidden sizes shrink as ``dense_dim``, ``dense_dim//2``,
    ``dense_dim//4``, ``dense_dim//8``. The head emits one value per time
    step, so ``forward`` returns a ``[batch, time step]`` tensor.

    Args:
        cfg: config object providing ``feature_cols``, ``dense_dim``,
            ``hidden_size``, ``num_layers`` and ``logit_dim``. Only
            ``feature_cols`` and ``dense_dim`` determine layer sizes; the other
            three are stored as attributes.
    """
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.dense_dim = cfg.dense_dim
        self.hidden_size = cfg.hidden_size
        self.num_layers = cfg.num_layers
        self.logit_dim = cfg.logit_dim
        self.mlp = nn.Sequential(
            nn.Linear(len(cfg.feature_cols), self.dense_dim // 2),
            nn.ReLU(),
            #nn.Dropout(0.2),
            nn.Linear(self.dense_dim // 2, self.dense_dim),
            nn.ReLU(),
        )
        self.lstm1 = nn.LSTM(self.dense_dim, self.dense_dim,
                            dropout=0., batch_first=True, bidirectional=True)
        self.lstm2 = nn.LSTM(self.dense_dim * 2, self.dense_dim//2,
                            dropout=0., batch_first=True, bidirectional=True)
        self.lstm3 = nn.LSTM(self.dense_dim//2 * 2, self.dense_dim//4,
                            dropout=0., batch_first=True, bidirectional=True)
        self.lstm4 = nn.LSTM(self.dense_dim//4 * 2, self.dense_dim//8,
                            dropout=0., batch_first=True, bidirectional=True)
        # The head consumes lstm4's output (both directions), so its width must follow
        # dense_dim. It was previously sized from hidden_size, which only worked while
        # hidden_size == dense_dim.
        head_dim = self.dense_dim//8 * 2
        self.head = nn.Sequential(
            nn.LayerNorm(head_dim),
            nn.GELU(),
            nn.Linear(head_dim, 1),
        )
        # Recurrent layers (LSTM / GRU): orthogonal init for weight matrices,
        # normal init for the 1-D bias vectors.
        for _, m in self.named_modules():
            if isinstance(m, (nn.LSTM, nn.GRU)):
                print(f'init {m}')
                for param in m.parameters():
                    if len(param.shape) >= 2:
                        nn.init.orthogonal_(param.data)
                    else:
                        nn.init.normal_(param.data)

    def forward(self, x):
        """Return one prediction per time step, shaped ``[batch, time step]``."""
        bs = x.size(0)
        features = self.mlp(x)
        features, _ = self.lstm1(features)
        features, _ = self.lstm2(features)
        features, _ = self.lstm3(features)
        features, _ = self.lstm4(features)
        # The head yields [batch, step, 1]; flatten the trailing singleton axis.
        output = self.head(features).view(bs, -1)
        return output
    
def get_model(cfg):
    """Build the network for ``cfg``; this always returns a ``CustomModel``."""
    return CustomModel(cfg)

# ====================================================
# criterion
# ====================================================
def compute_metric(df, preds):
    """Mean absolute error over the rows where ``u_out == 0``.

    Args:
        df: frame with ``'pressure'`` (ground truth) and ``'u_out'`` columns.
        preds: predictions with the same shape as the ``'pressure'`` column.

    Returns:
        Sum of ``|pressure - preds|`` over rows with weight ``1 - u_out``,
        divided by the sum of those weights.

    Raises:
        AssertionError: if the target, prediction and weight shapes differ.
    """
    target = np.array(df['pressure'].values.tolist())
    weight = 1 - np.array(df['u_out'].values.tolist())

    shapes_agree = target.shape == preds.shape and weight.shape == target.shape
    assert shapes_agree, (target.shape, preds.shape, weight.shape)

    # Rows with u_out == 1 get weight 0 and drop out of both sums.
    weighted_abs_err = weight * np.abs(target - preds)
    return weighted_abs_err.sum() / weight.sum()


class VentilatorLoss(nn.Module):
    """Masked MAE that directly optimizes the competition metric.

    Only time steps with ``u_out == 0`` contribute. The result is one loss
    value per sequence (reduction over the last axis); callers average it.

    Implemented as ``forward`` rather than by overriding ``__call__``, so the
    regular ``nn.Module`` call machinery (hooks etc.) stays intact.
    """
    def forward(self, preds, y, u_out):
        """Return the per-sequence masked MAE.

        Args:
            preds: predictions, reduced over the last axis.
            y: targets with the same shape as ``preds``.
            u_out: 0/1 flags with the same shape; steps flagged 1 are ignored.
        """
        w = 1 - u_out
        abs_err = w * (y - preds).abs()
        # A sequence without any u_out == 0 step has a zero numerator and denominator.
        # Clamping the denominator returns 0 for it instead of NaN, which would
        # otherwise poison the batch mean.
        n_active = w.sum(-1).clamp(min=1)
        mae = abs_err.sum(-1) / n_active

        return mae

def get_criterion():
    """Return the loss module selected by ``CFG.criterion_name``.

    Supported names: ``'BCEWithLogitsLoss'``, ``'CrossEntropyLoss'`` and
    ``'CustomLoss1'`` (the masked MAE ``VentilatorLoss``).

    Raises:
        NotImplementedError: for any other name.
    """
    # A single if/elif chain: with independent `if` statements the trailing `else`
    # belonged only to the last test, so the first two names raised NotImplementedError.
    if CFG.criterion_name == 'BCEWithLogitsLoss':
        criterion = nn.BCEWithLogitsLoss(reduction="mean")
    elif CFG.criterion_name == 'CrossEntropyLoss':
        criterion = nn.CrossEntropyLoss()
    elif CFG.criterion_name == 'CustomLoss1':
        # [reference]https://www.kaggle.com/theoviel/deep-learning-starter-simple-lstm
        criterion = VentilatorLoss()
    else:
        raise NotImplementedError
    return criterion
# ====================================================
# optimizer
# ====================================================
def get_optimizer(model: nn.Module, config: dict):
    """Create the optimizer named by ``config.optimizer_name``.

    Args:
        model: module whose ``parameters()`` are optimized.
        config: settings object read by attribute access (``optimizer_name``,
            ``lr``, ``weight_decay``, plus ``amsgrad`` for Adam), despite
            the ``dict`` annotation.

    Returns:
        ``Adam`` for ``'Adam'``, ``torch_optimizer.RAdam`` for ``'RAdam'``,
        or ``SGD`` (momentum 0.9, Nesterov) for ``'sgd'``.

    Raises:
        NotImplementedError: for any other optimizer name.
    """
    name = config.optimizer_name
    # Reject unknown names first, before anything else on config or model is touched.
    if name not in ('Adam', 'RAdam', 'sgd'):
        raise NotImplementedError

    if name == 'Adam':
        return Adam(model.parameters(),
                    lr=config.lr,
                    weight_decay=config.weight_decay,
                    amsgrad=config.amsgrad)
    if name == 'RAdam':
        return optim.RAdam(model.parameters(),
                           lr=config.lr,
                           weight_decay=config.weight_decay)
    # Only 'sgd' is left at this point.
    return SGD(model.parameters(),
               lr=config.lr,
               momentum=0.9,
               nesterov=True,
               weight_decay=config.weight_decay,)

# ====================================================
# scheduler
# ====================================================
def get_scheduler(optimizer):
    """Create the LR scheduler named by ``CFG.scheduler`` for ``optimizer``.

    Supported names: ``'ReduceLROnPlateau'``, ``'CosineAnnealingLR'`` and
    ``'CosineAnnealingWarmRestarts'``.

    ``CFG.factor``, ``CFG.patience``, ``CFG.eps`` and ``CFG.T_0`` are optional.
    When CFG does not define them, the first three fall back to the values
    PyTorch documents as defaults (0.1, 10, 1e-8) and ``T_0`` falls back to
    ``CFG.T_max``. Without the fallbacks those branches fail with
    AttributeError on a CFG that lacks the attributes.

    Raises:
        NotImplementedError: for any other scheduler name.
    """
    if CFG.scheduler=='ReduceLROnPlateau':
        # factor   : multiplicative decay applied to the learning rate
        # patience : number of non-improving steps tolerated before decaying
        # eps      : minimal change in LR that is still applied
        scheduler = ReduceLROnPlateau(optimizer, mode='min',
                                      factor=getattr(CFG, 'factor', 0.1),
                                      patience=getattr(CFG, 'patience', 10),
                                      verbose=True,
                                      eps=getattr(CFG, 'eps', 1e-8))
    elif CFG.scheduler=='CosineAnnealingLR':
        # T_max   : length of one half cosine period, in scheduler steps
        # eta_min : minimum learning rate
        scheduler = CosineAnnealingLR(optimizer, T_max=CFG.T_max, eta_min=CFG.min_lr, last_epoch=-1)
    elif CFG.scheduler=='CosineAnnealingWarmRestarts':
        # T_0    : number of scheduler steps until the first restart
        # T_mult : factor by which the cycle length grows after each restart
        scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=getattr(CFG, 'T_0', CFG.T_max), T_mult=1, eta_min=CFG.min_lr, last_epoch=-1)
    else:
        raise NotImplementedError
    return scheduler

In [None]:
class Trainer(pl.LightningModule):
    """LightningModule wrapping the network, the masked loss and the optimizer setup.

    Training and validation batches are ``(x, u_out, y)`` tuples; predict and
    test batches are the feature tensor alone.

    The step hooks return only the loss. Predictions and labels used to be
    returned as well and were re-stacked in ``validation_epoch_end`` without
    being used, which only cost memory.

    Args:
        cfg: config object passed to ``get_model`` and ``get_optimizer``.
    """
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.model = get_model(cfg)
        self.criterion = get_criterion()
    
    def forward(self, x):
        output = self.model(x)
        return output

    def _batch_loss(self, batch):
        """Run the model on a labelled batch and return the batch-mean loss."""
        x, u_out, y = batch
        output = self.forward(x)
        # The criterion returns one value per sequence; average over the batch.
        return self.criterion(output, y, u_out).mean()
    
    def training_step(self, batch, batch_idx):
        loss = self._batch_loss(batch)
        
        self.log('train_loss', loss, on_step=True, prog_bar=True, logger=True)
        return {"loss": loss}
    
    def training_epoch_end(self, outputs):
        # Report the current learning rate of the first parameter group once per epoch.
        self.log("lr", self.optimizer.param_groups[0]['lr'], prog_bar=True, logger=True)
    
    def validation_step(self, batch, batch_idx):
        loss = self._batch_loss(batch)
        self.log('val_loss', loss, on_step= True, prog_bar=True, logger=True)
        # Keep a plain float so no tensor is held until the end of the epoch.
        return {"loss": loss.item()}
    
    def validation_epoch_end(self, outputs):
        # Nothing to average when no validation batch was run.
        if not outputs:
            return
        # Unweighted mean of the per-batch losses.
        loss = sum(output['loss'] for output in outputs) / len(outputs)
        
        self.log("val_loss_epoch", loss, prog_bar=True, logger=True)
        
    def predict_step(self, batch, batch_idx):
        x = batch
        output = self.forward(x)
        return output
        
    def test_step(self, batch, batch_idx):
        x = batch       
        output = self.forward(x)
        return output
    
    def configure_optimizers(self):
        """Build the optimizer and a scheduler that is stepped after every optimizer step."""
        self.optimizer = get_optimizer(self, self.cfg)
        self.scheduler = {'scheduler': get_scheduler(self.optimizer),
                          'interval': 'step', # or 'epoch'
                          'frequency': 1}
        return {'optimizer': self.optimizer, 'lr_scheduler': self.scheduler}

## Train

In [None]:
def _predict_flat(trainer, model, data):
    """Predict on ``data`` and return every per-step prediction as one flat numpy array.

    ``data`` is wrapped in a ``TestDataset`` and loaded with the ``'valid'``
    loader settings. The per-batch outputs of ``trainer.predict`` are
    concatenated in one go instead of being split into per-row tensors and
    stacked again.
    """
    loader = DataLoader(TestDataset(data), **CFG.loader['valid'])
    batch_outputs = trainer.predict(model, loader)
    return torch.cat(batch_outputs).flatten().to('cpu').detach().numpy()


def train() -> None:
    """Train every fold listed in ``CFG.trn_fold`` and optionally run inference.

    Per fold:
      * fit on the sequences whose fold id differs from ``fold``, validating on the rest;
      * checkpoint the weights with the lowest ``val_loss`` (plus the last epoch);
      * save the final network weights as ``{exp_name}_fold{fold}.pth``;
      * if ``CFG.inference``: reload the best checkpoint, write
        ``submission_fold{fold}.csv`` and fill the ``pred`` column of ``df_oof``
        for this fold, rewriting ``oof.csv``.

    Side effects: overwrites ``CFG.T_max`` and mutates the module-level
    ``submission`` and ``df_oof`` frames.
    """
    for fold in range(CFG.n_fold):
        if fold not in CFG.trn_fold:
            continue
        print(f"{'='*38} Fold: {fold} {'='*38}")
        # Logger
        #======================================================
        loss_checkpoint = ModelCheckpoint(
            dirpath=OUTPUT_DIR,
            filename=f"best_loss_fold{fold}",
            monitor="val_loss",
            save_last=True,
            save_top_k=1,
            save_weights_only=True,
            mode="min",
        )
        
#         wandb_logger = WandbLogger(
#             project=f'{CFG.competition}',
#             group= f'{CFG.exp_name}',
#             name = f'Fold{fold}',
#             save_dir=OUTPUT_DIR
#         )
        # Sequence-level masks: `Fold` holds one fold id per sequence.
        val_mask = Fold == fold
        trn_mask = Fold != fold
        data_module = DataModule(X[trn_mask], y[trn_mask],
                                 X[val_mask], y[val_mask],
                                 test_X,
                                 CFG
                                )
        data_module.setup()
        
        # The scheduler is stepped once per optimizer step, so T_max is the total
        # number of optimizer steps over the whole run.
        CFG.T_max = int(math.ceil(len(data_module.train_dataloader())/CFG.grad_acc)*CFG.epochs)
        print(f"set schedular T_max {CFG.T_max}")
        #early_stopping_callback = EarlyStopping(monitor='val_loss_epoch',mode="min", patience=5)
        
        trainer = pl.Trainer(
#             logger=wandb_logger,
            callbacks=[loss_checkpoint],#LearningRateMonitor(logging_interval='step'),early_stopping_callback
            default_root_dir=OUTPUT_DIR,
            accumulate_grad_batches=CFG.grad_acc,
            max_epochs=CFG.epochs,
            precision=CFG.precision,
            **CFG.trainer
        )
        # Training
        model = Trainer(CFG)
        trainer.fit(model, data_module)
        # Weights as of the end of training (not necessarily the best checkpoint).
        torch.save(model.model.state_dict(), os.path.join(OUTPUT_DIR, f'{CFG.exp_name}_fold{fold}.pth'))
        
        # Release the training objects before inference to lower peak memory.
        del model, data_module
        
        if CFG.inference:
            # Load the checkpoint with the best validation loss
            best_model = Trainer.load_from_checkpoint(cfg=CFG,checkpoint_path=loss_checkpoint.best_model_path)

            # Test predictions for this fold.
            submission['pressure'] = _predict_flat(trainer, best_model, test_X)
            submission.to_csv(os.path.join(OUTPUT_DIR, f'submission_fold{fold}.csv'),index=False)
            
            # oof: rows of this fold, in the same order as the validation sequences.
            df_oof.loc[df_oof["fold"] == fold, 'pred'] = _predict_flat(trainer, best_model, X[val_mask])
            df_oof.to_csv(os.path.join(OUTPUT_DIR, 'oof.csv'),index=False)
        
        wandb.finish()

        

In [None]:
# Run training for the folds in CFG.trn_fold (plus inference / OOF export when CFG.inference is set).
train()
#send_line_notification("finished")
# train() already calls wandb.finish() after each fold; this is one more call at the very end.
wandb.finish()

In [None]:
# oof = pd.read_csv(OUTPUT_DIR + 'oof.csv')
# cv = compute_metric(oof,oof["pred"])
# print(f"cv:{cv}")

In [None]:
# # median
# submission = pd.read_csv(INPUT_DIR / "sample_submission.csv")
# preds = [pd.read_csv(f'./exp037sub_fold{i}.csv') for i in range(5)]
# print(preds[0].pressure[0],preds[1].pressure[0],preds[2].pressure[0],preds[3].pressure[0],preds[4].pressure[0])
# submission['pressure'] = np.median([preds[0].pressure,preds[1].pressure,preds[2].pressure,preds[3].pressure,preds[4].pressure],axis=0)
# submission.to_csv('./exp037_5fold_median.csv',index=False)
# submission.head()