In [1]:
import sys
from time import time
import numpy as np
import pandas as pd
from pathlib import Path
import lightgbm as lgb
import matplotlib.pyplot as plt 
import seaborn as sns
from tqdm import tqdm
import copy
import wandb
from collections import OrderedDict

from sklearn.metrics import mean_absolute_error
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as torchdata

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.loggers import WandbLogger


In [2]:
sys.path.append('../../src/')
import utils as utils
from utils import Timer

In [3]:
class CFG:
    seed = 42
    exp_num = 6
    local = True
    n_folds = 5
    folds = [0]
    debug = False
    bias = 1000
    epochs = 200

    
    ######################
    # Dataset #
    ######################
    transforms = {
        "train": [{"name": ""}],
        "valid": [{"name": ""}],
        "test": [{"name": ""}]
    }

    ######################
    # Loaders #
    ######################
    loader_params = {
        "train": {
            'batch_size': 128,
            'shuffle': True,
            'num_workers': 8,
            'pin_memory': True,
            'drop_last': True,
        },
        "valid": {
            'batch_size': 32,
            'shuffle': False,
            'num_workers': 8,
            'pin_memory': True,
            'drop_last': False,
        },
        "test": {
            'batch_size': 32,
            'shuffle': False,
            'num_workers': 8,
            'pin_memory': True,
            'drop_last': False,
        }
    }

    ######################
    # Split #
    ######################
    split = "GroupKFold"
    split_params = {
        "n_splits": 5,
    }

    ######################
    # Model #
    ######################
    input_dim = 5

    dense_dim = 512
    lstm_dim = 512
    logit_dim = 512
    num_classes = 1

    ######################
    # Criterion #
    ######################
#     loss_name = "rmspe_loss"
#     loss_params: dict = {}

    ######################
    # Optimizer #
    ######################
    optimizer_name = "Adam"
    optimizer_params = {
        "lr": 0.001
    }

    ######################
    # Scheduler #
    ######################
    scheduler_name = "ReduceLROnPlateau"
    scheduler_params = {
        'factor': 0.2, 
        'patience': 7
    }

In [4]:
utils.set_seed(CFG.seed)

In [5]:
if CFG.local:
    DATA_DIR = Path("/home/knikaido/work/Ventilator-Pressure-Prediction/data/ventilator-pressure-prediction")
    OUTPUT_DIR = Path('./output/')
else:
    DATA_DIR = Path("../input/ventilator-pressure-prediction")
    OUTPUT_DIR = Path('')   

In [6]:
def get_transforms(phase: str):
    transforms = CFG.transforms
    if transforms is None:
        return None
    else:
        if transforms[phase] is None:
            return None
        trns_list = []
        for trns_conf in transforms[phase]:
            trns_name = trns_conf["name"]
            trns_params = {} if trns_conf.get("params") is None else \
                trns_conf["params"]
            if globals().get(trns_name) is not None:
                trns_cls = globals()[trns_name]
                trns_list.append(trns_cls(**trns_params))

        if len(trns_list) > 0:
            return Compose(trns_list)
        else:
            return None
        
        
class Normalize:
    def __call__(self, y: np.ndarray):
        max_vol = np.abs(y).max()
        y_vol = y * 1 / max_vol
        return np.asfortranarray(y_vol)


class Compose:
    def __init__(self, transforms: list):
        self.transforms = transforms

    def __call__(self, y: np.ndarray):
        for trns in self.transforms:
            y = trns(y)
        return y

In [7]:
def compute_metric(preds, trues, u_outs):
    """
    Metric for the problem, as I understood it.
    """
    
    y = trues
    w = 1 - u_outs
    
    assert y.shape == preds.shape and w.shape == y.shape, (y.shape, preds.shape, w.shape)
    
    mae = w * np.abs(y - preds)
    mae = mae.sum() / w.sum()
    
    return mae


class VentilatorLoss(nn.Module):
    """
    Directly optimizes the competition metric
    """
    def __call__(self, preds, y, u_out):
        w = 1 - u_out
        mae = w * (y - preds).abs()
        mae = mae.sum(-1) / w.sum(-1)

        return mae

In [8]:
def get_criterion():
    return VentilatorLoss()

In [9]:
# Custom optimizer
__OPTIMIZERS__ = {}


def get_optimizer(model: nn.Module):
    optimizer_name = CFG.optimizer_name
    if optimizer_name == "SAM":
        base_optimizer_name = CFG.base_optimizer
        if __OPTIMIZERS__.get(base_optimizer_name) is not None:
            base_optimizer = __OPTIMIZERS__[base_optimizer_name]
        else:
            base_optimizer = optim.__getattribute__(base_optimizer_name)
        return SAM(model.parameters(), base_optimizer, **CFG.optimizer_params)

    if __OPTIMIZERS__.get(optimizer_name) is not None:
        return __OPTIMIZERS__[optimizer_name](model.parameters(),
                                              **CFG.optimizer_params)
    else:
        return optim.__getattribute__(optimizer_name)(model.parameters(),
                                                      **CFG.optimizer_params)


def get_scheduler(optimizer):
    scheduler_name = CFG.scheduler_name

    if scheduler_name is None:
        return
    else:
        return optim.lr_scheduler.__getattribute__(scheduler_name)(
            optimizer, **CFG.scheduler_params)

In [10]:
# validation
splitter = getattr(model_selection, CFG.split)(**CFG.split_params)

In [11]:
class VentilatorDataset(torchdata.Dataset):
    def __init__(self, df, train_col):
        if "pressure" not in df.columns:
            df['pressure'] = 0
        self.df = df
        self.groups = df.groupby('breath_id').groups
        self.keys = list(self.groups.keys())
        self.train_col = train_col
        
    def __len__(self):
        return len(self.groups)

    def __getitem__(self, idx):
        indexes = self.groups[self.keys[idx]]
        df_ = self.df.iloc[indexes]
        
        input_ = df_[self.train_col].values
        u_out_ = df_['u_out'].values
        p_ = df_['pressure'].values

        data = {
            "input": input_.astype(np.float32),
            "u_out": u_out_.astype(np.float32),
            "p": p_.astype(np.float32),
        }
        
        return data

In [12]:
class RNNModel(nn.Module):
    def __init__(
        self,
        input_dim=4,
        lstm_dim=256,
        dense_dim=256,
        logit_dim=256,
        num_classes=1,
    ):
        super().__init__()

        self.mlp = nn.Sequential(
            nn.Linear(input_dim, dense_dim // 2),
            nn.ReLU(),
            nn.Linear(dense_dim // 2, dense_dim),
            nn.ReLU(),
        )

        self.lstm = nn.LSTM(dense_dim, lstm_dim, batch_first=True, bidirectional=True)

        self.logits = nn.Sequential(
            nn.Linear(lstm_dim * 2, logit_dim),
            nn.ReLU(),
            nn.Linear(logit_dim, num_classes),
        )

    def forward(self, x):
        features = self.mlp(x)
        features, _ = self.lstm(features)
        pred = self.logits(features)
        return pred

In [13]:
# Learner class(pytorch-lighting)
class Learner(pl.LightningModule):
    def __init__(self, model):
        super().__init__()
        self.model = model
        self.criterion = get_criterion()
    
    def training_step(self, batch, batch_idx):
        d_ = batch
        output = self.model(d_['input'])
        loss = self.criterion(output.view(-1), d_['p'].view(-1), d_['u_out'].view(-1))
        return loss
    
    def validation_step(self, batch, batch_idx):
        d_ = batch
        output = self.model(d_['input'])
        loss = self.criterion(output.view(-1), d_['p'].view(-1), d_['u_out'].view(-1))
        
        self.log(f'Loss/val', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True)
        
        output = OrderedDict({
            "targets": d_['p'].detach(), "preds": output.detach(), "u_outs": d_['u_out'].detach(), "loss": loss.detach()
        })
        return output
    
    def validation_epoch_end(self, outputs):

        targets = torch.cat([o["targets"].view(-1) for o in outputs]).cpu().numpy()
        preds = torch.cat([o["preds"].view(-1) for o in outputs]).cpu().numpy()
        u_outs = torch.cat([o["u_outs"].view(-1) for o in outputs]).cpu().numpy()

        score = get_score(preds, targets, u_outs)
        self.log(f'custom_mae/val', score, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        print(f'epoch = {self.current_epoch}, custom_mae = {score}')

    def configure_optimizers(self):
        optimizer = get_optimizer(self.model)
        scheduler = get_scheduler(optimizer)
        return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "Loss/val"}

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [15]:
def get_score(y_pred, y_true, u_outs):
    return compute_metric(y_pred, y_true, u_outs)


def to_np(input):
    return input.detach().cpu().numpy()

# oof
def evaluate(model, loaders, phase):
    model.eval()
    pred_list = []
    target_list = []
    with torch.no_grad():
        for batch in loaders[phase]:
            d_ = batch
            d_['input'] = d_['input'].to(device)
            output = model(d_['input'])
#             output = nn.Softmax(dim=1)(output)
            pred_list.append(to_np(output))
            target_list.append(to_np(d_['p']))

    pred_list = np.concatenate(pred_list).reshape(-1)
    target_list = np.concatenate(target_list).reshape(-1)
    model.train()
    return pred_list, target_list

In [16]:
train = pd.read_csv(DATA_DIR / 'train.csv')
test = pd.read_csv(DATA_DIR / 'test.csv')
display(train), display(test)

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure
0,1,1,20,50,0.000000,0.083334,0,5.837492
1,2,1,20,50,0.033652,18.383041,0,5.907794
2,3,1,20,50,0.067514,22.509278,0,7.876254
3,4,1,20,50,0.101542,22.808822,0,11.742872
4,5,1,20,50,0.135756,25.355850,0,12.234987
...,...,...,...,...,...,...,...,...
6035995,6035996,125749,50,10,2.504603,1.489714,1,3.869032
6035996,6035997,125749,50,10,2.537961,1.488497,1,3.869032
6035997,6035998,125749,50,10,2.571408,1.558978,1,3.798729
6035998,6035999,125749,50,10,2.604744,1.272663,1,4.079938


Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out
0,1,0,5,20,0.000000,0.000000,0
1,2,0,5,20,0.031904,7.515046,0
2,3,0,5,20,0.063827,14.651675,0
3,4,0,5,20,0.095751,21.230610,0
4,5,0,5,20,0.127644,26.320956,0
...,...,...,...,...,...,...,...
4023995,4023996,125748,20,10,2.530117,4.971245,1
4023996,4023997,125748,20,10,2.563853,4.975709,1
4023997,4023998,125748,20,10,2.597475,4.979468,1
4023998,4023999,125748,20,10,2.631134,4.982648,1


(None, None)

In [17]:
def get_raw_features(input_df, dataType = 'train'):
    colum = ['time_step', 'u_in']

    return input_df[colum]

In [18]:
def get_category_features(input_df, dataType = 'train'):
    output_df = copy.deepcopy(input_df)
    colum = ['R', 'C']
    
#     output_df['R_C'] = output_df['R'] + output_df['C'] * 10

    return output_df[colum].astype('category')

In [19]:
def get_diff_shift_features(input_df, dataType = 'train'):
    
    
    output_df = copy.deepcopy(input_df)
    c_num = input_df.shape[1]
    
    b_id_gby = input_df.groupby(['breath_id'])
    shift_idx = [-2, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    
    for i in shift_idx:
        output_df[f'u_in_diff_{i}'] = b_id_gby['u_in'].diff(i)
        output_df[f'u_in_shift_{i}'] = b_id_gby['u_in'].shift(i)

        output_df[f'time_step_diff_{i}'] = b_id_gby['time_step'].diff(i)
        output_df[f'time_step_shift_{i}'] = b_id_gby['time_step'].shift(i)
    
    return output_df.iloc[:, c_num:]

In [20]:
def get_cum_features(input_df, dataType = 'train'):
    
    output_df = copy.deepcopy(input_df)
    c_num = input_df.shape[1]
    
    b_id_gby = input_df.groupby(['breath_id'])
    
    output_df['u_in_cumsum'] = b_id_gby['u_in'].cumsum()
    output_df['time_step_cumsum'] = b_id_gby['time_step'].cumsum()
    
    return output_df.iloc[:, c_num:]

In [21]:
def get_agg_features(input_df, dataType = 'train'):
    
    output_df = copy.deepcopy(input_df)
    c_num = input_df.shape[1]
    
    # Dict for aggregations

    
    def get_agg_window(create_feature_dict, only_u_out=False):
        
        df_tgt = output_df
        if only_u_out:
            df_tgt = output_df[output_df['u_out'] ==0]
        df_feature = df_tgt.groupby(['breath_id']).agg(create_feature_dict)
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        
        if only_u_out:
            df_feature = df_feature.add_suffix('_u_out')
            
        return df_feature
    
    create_feature_dict = {
        'u_in': [np.max, np.std, np.mean, 'first', 'last'],
    }
    df_agg_feature = get_agg_window(create_feature_dict).reset_index()
    
#     create_feature_dict = {
#         'u_in': [np.max, np.std, np.mean, 'last'],
#     }
#     df_tmp = get_agg_window(create_feature_dict, only_u_out = True).reset_index()
#     df_agg_feature = df_agg_feature.merge(df_tmp, how = 'left', on = 'breath_id')
    
#     df_tmp = get_agg_window(start_time = 2, add_suffix = True).reset_index()
#     df_agg_feature = df_agg_feature.merge(df_tmp, how = 'left', on = 'breath_id')
#     df_tmp = get_agg_window(start_time = 1, add_suffix = True).reset_index()
#     df_agg_feature = df_agg_feature.merge(df_tmp, how = 'left', on = 'breath_id')
#     df_tmp = get_agg_window(end_time = 1, add_suffix = True).reset_index()
#     df_agg_feature = df_agg_feature.merge(df_tmp, how = 'left', on = 'breath_id')
#     df_tmp = get_agg_window(end_time = 2, add_suffix = True).reset_index()
#     df_agg_feature = df_agg_feature.merge(df_tmp, how = 'left', on = 'breath_id')

    output_df = pd.merge(output_df, df_agg_feature, how='left', on='breath_id')
    
    
    return output_df.iloc[:, c_num:]

In [22]:
def get_RC_features(input_df, dataType = 'train'):
    
    output_df = copy.deepcopy(input_df)
    c_num = input_df.shape[1]
        
    output_df['R'] = output_df['R'].astype(str)
    output_df['C'] = output_df['C'].astype(str)
    output_df['RC'] = output_df['R']+output_df['C']
    output_df = pd.get_dummies(output_df)
    
    return output_df.iloc[:, c_num:]

In [23]:
def to_feature(input_df, dataType = 'train'):
    """input_df を特徴量行列に変換した新しいデータフレームを返す.
    """

    processors = [
        get_raw_features,
        get_category_features,
        get_diff_shift_features,
        get_cum_features,
        get_agg_features,
        get_RC_features
    ]

    out_df = pd.DataFrame()

    for func in tqdm(processors, total=len(processors)):
        with Timer(prefix='' + func.__name__ + ' '):
            _df = func(input_df, dataType)

        # 長さが等しいことをチェック (ずれている場合, func の実装がおかしい)
        assert len(_df) == len(input_df), func.__name__
        out_df = pd.concat([out_df, _df], axis=1)
    out_df = utils.reduce_mem_usage(out_df)
    
    return out_df

In [24]:
train_df = to_feature(train, dataType = 'train')
test_df = to_feature(test, dataType = 'test')

 33%|███▎      | 2/6 [00:00<00:00, 10.18it/s]

get_raw_features  0.014[s]
get_category_features  0.150[s]


 33%|███▎      | 2/6 [00:20<00:00, 10.18it/s]

get_diff_shift_features  271.001[s]


 50%|█████     | 3/6 [04:32<05:40, 113.40s/it]

get_cum_features  0.190[s]


 67%|██████▋   | 4/6 [04:33<02:23, 71.57s/it] 

get_agg_features  0.878[s]


 83%|████████▎ | 5/6 [04:35<00:47, 47.39s/it]

get_RC_features  6.720[s]


100%|██████████| 6/6 [04:43<00:00, 47.25s/it]
 33%|███▎      | 2/6 [00:00<00:00, 15.85it/s]

Mem. usage decreased from 2711.25 Mb to 742.57 Mb (72.6% reduction)
get_raw_features  0.009[s]
get_category_features  0.096[s]


 33%|███▎      | 2/6 [00:19<00:00, 15.85it/s]

get_diff_shift_features  179.654[s]


 50%|█████     | 3/6 [03:00<03:45, 75.18s/it]

get_cum_features  0.121[s]


 67%|██████▋   | 4/6 [03:01<01:34, 47.45s/it]

get_agg_features  0.524[s]


 83%|████████▎ | 5/6 [03:02<00:31, 31.40s/it]

get_RC_features  4.489[s]


100%|██████████| 6/6 [03:07<00:00, 31.32s/it]


Mem. usage decreased from 1807.50 Mb to 495.05 Mb (72.6% reduction)


In [25]:
ss = StandardScaler()
ss.fit(train_df)

train_df = pd.DataFrame(ss.transform(train_df), columns=list(train_df.columns))
train_mean = train_df.mean()
train_df = train_df.fillna(train_df.mean())

test_df = pd.DataFrame(ss.transform(test_df), columns=list(test_df.columns))
test_df = test_df.fillna(train_mean)

In [26]:
display(train_df), display(test_df)

Unnamed: 0,time_step,u_in,R,C,u_in_diff_-2,u_in_shift_-2,time_step_diff_-2,time_step_shift_-2,u_in_diff_-1,u_in_shift_-1,...,C_50,RC_2010,RC_2020,RC_2050,RC_5010,RC_5020,RC_5050,RC_510,RC_520,RC_550
0,-1.706609,-0.538776,-0.359072,1.394522,-2.503374e+00,1.240467e+00,-5.629722e-01,-1.703993e+00,-2.511949e+00,8.449263e-01,...,1.435740,-0.295786,-0.299427,2.866523,-0.470539,-0.350621,-0.348926,-0.351859,-0.351026,-0.350883
1,-1.662664,0.823912,-0.359072,1.394522,-5.308007e-01,1.264159e+00,-7.206978e-01,-1.658398e+00,-5.786234e-01,1.157443e+00,...,1.435740,-0.295786,-0.299427,2.866523,-0.470539,-0.350621,-0.348926,-0.351859,-0.351026,-0.350883
2,-1.618480,1.130953,-0.359072,1.394522,-3.576065e-01,1.467412e+00,-8.784233e-01,-1.612640e+00,-5.675740e-02,1.179935e+00,...,1.435740,-0.295786,-0.299427,2.866523,-0.470539,-0.350621,-0.348926,-0.351859,-0.351026,-0.350883
3,-1.574017,1.153051,-0.359072,1.394522,-5.333698e-01,1.619540e+00,-8.521357e-01,-1.567208e+00,-3.633358e-01,1.372891e+00,...,1.435740,-0.295786,-0.299427,2.866523,-0.470539,-0.350621,-0.348926,-0.351859,-0.351026,-0.350883
4,-1.529395,1.342625,-0.359072,1.394522,-2.398601e-01,1.608317e+00,-7.469854e-01,-1.521613e+00,-2.756753e-01,1.517311e+00,...,1.435740,-0.295786,-0.299427,2.866523,-0.470539,-0.350621,-0.348926,-0.351859,-0.351026,-0.350883
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035995,1.562294,-0.434126,1.171893,-0.937525,-5.327896e-02,-4.320053e-01,-2.738087e-01,1.649282e+00,-1.572538e-02,-4.356242e-01,...,-0.696505,-0.295786,-0.299427,-0.348855,2.125220,-0.350621,-0.348926,-0.351859,-0.351026,-0.350883
6035996,1.605641,-0.434199,1.171893,-0.937525,-2.202934e-02,-4.548400e-01,-2.475211e-01,1.693733e+00,-2.550839e-02,-4.302972e-01,...,-0.696505,-0.295786,-0.299427,-0.348855,2.125220,-0.350621,-0.348926,-0.351859,-0.351026,-0.350883
6035997,1.651539,-0.428965,1.171893,-0.937525,-3.732968e-02,-4.380841e-01,-1.686583e-01,1.738184e+00,2.317612e-02,-4.519751e-01,...,-0.696505,-0.295786,-0.299427,-0.348855,2.125220,-0.350621,-0.348926,-0.351859,-0.351026,-0.350883
6035998,1.694886,-0.450263,1.171893,-0.937525,4.515522e-19,-2.216228e-17,2.726380e-15,1.010101e-16,-4.455089e-02,-4.360681e-01,...,-0.696505,-0.295786,-0.299427,-0.348855,2.125220,-0.350621,-0.348926,-0.351859,-0.351026,-0.350883


Unnamed: 0,time_step,u_in,R,C,u_in_diff_-2,u_in_shift_-2,time_step_diff_-2,time_step_shift_-2,u_in_diff_-1,u_in_shift_-1,...,C_50,RC_2010,RC_2020,RC_2050,RC_5010,RC_5020,RC_5050,RC_510,RC_520,RC_550
0,-1.706609,-0.544978,-1.124554,-0.354513,-1.651318e+00,6.126275e-01,1.014283e+00,-1.708896e+00,-1.041172e+00,2.101770e-02,...,-0.696505,-0.295786,-0.299427,-0.348855,-0.470539,-0.350621,-0.348926,-0.351859,2.848793,-0.350883
1,-1.664975,0.014441,-1.124554,-0.354513,-1.549414e+00,1.138217e+00,1.014283e+00,-1.666161e+00,-9.894817e-01,5.614117e-01,...,-0.696505,-0.295786,-0.299427,-0.348855,-0.470539,-0.350621,-0.348926,-0.351859,2.848793,-0.350883
2,-1.623261,0.545366,-1.124554,-0.354513,-1.325054e+00,1.544723e+00,1.014283e+00,-1.623426e+00,-9.132784e-01,1.060374e+00,...,-0.696505,-0.295786,-0.299427,-0.348855,-0.470539,-0.350621,-0.348926,-0.351859,2.848793,-0.350883
3,-1.581587,1.035584,-1.124554,-0.354513,-1.060445e+00,1.876412e+00,1.040571e+00,-1.580773e+00,-7.102472e-01,1.446285e+00,...,-0.696505,-0.295786,-0.299427,-0.348855,-0.470539,-0.350621,-0.348926,-0.351859,2.848793,-0.350883
4,-1.539913,1.414733,-1.124554,-0.354513,-8.377977e-01,2.119567e+00,1.014283e+00,-1.537956e+00,-5.839523e-01,1.761169e+00,...,-0.696505,-0.295786,-0.299427,-0.348855,-0.470539,-0.350621,-0.348926,-0.351859,2.848793,-0.350883
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4023995,1.595442,-0.174842,-0.359072,-0.937525,-4.658715e-02,-1.589231e-01,-5.103970e-01,1.683274e+00,-1.650026e-02,-1.713460e-01,...,-0.696505,3.380826,-0.299427,-0.348855,-0.470539,-0.350621,-0.348926,-0.351859,-0.351026,-0.350883
4023996,1.641339,-0.174552,-0.359072,-0.937525,-4.644624e-02,-1.586114e-01,-4.578218e-01,1.727725e+00,-1.640424e-02,-1.710501e-01,...,-0.696505,3.380826,-0.299427,-0.348855,-0.470539,-0.350621,-0.348926,-0.351859,-0.351026,-0.350883
4023997,1.684687,-0.174261,-0.359072,-0.937525,-4.633293e-02,-1.586114e-01,-6.944102e-01,1.774791e+00,-1.632514e-02,-1.707541e-01,...,-0.696505,3.380826,-0.299427,-0.348855,-0.470539,-0.350621,-0.348926,-0.351859,-0.351026,-0.350883
4023998,1.728034,-0.173970,-0.359072,-0.937525,4.515522e-19,-2.216228e-17,2.726380e-15,1.010101e-16,-1.626321e-02,-1.707541e-01,...,-0.696505,3.380826,-0.299427,-0.348855,-0.470539,-0.350621,-0.348926,-0.351859,-0.351026,-0.350883


(None, None)

In [27]:
train_col = train_df.columns.to_list()

In [28]:
train_df = pd.concat([train_df, train[['id', 'breath_id', 'pressure', 'u_out']]], axis=1)
test_df = pd.concat([test_df, test[['id', 'breath_id', 'u_out']]], axis=1)

In [None]:
oof_total = np.zeros((len(train), CFG.num_classes))
sub_preds = np.zeros((test.shape[0], len(CFG.folds)))
val_idxes = []
models = []
y = train['pressure']
groups = train['breath_id']
gkfold = model_selection.GroupKFold(n_splits=CFG.n_folds)
scores = []
input_dim = len(train_col)

for i, (trn_idx, val_idx) in enumerate(splitter.split(train_df, y, groups)):
    if i not in CFG.folds:
        continue

    trn_df = train_df.loc[trn_idx, :].reset_index(drop=True)
    val_df = train_df.loc[val_idx, :].reset_index(drop=True)
    trn_y = y.values[trn_idx]
    val_y = y.values[val_idx]
    
    
    loaders = {
        phase: torchdata.DataLoader(
            VentilatorDataset(
                df_, train_col
            ),
            **CFG.loader_params[phase])  # type: ignore
        for phase, df_ in zip(["train", "valid", "test"], [trn_df, val_df, test_df])
    }
    
    
    model = RNNModel(
        input_dim=input_dim,
        lstm_dim=CFG.lstm_dim,
        dense_dim=CFG.dense_dim,
        logit_dim=CFG.logit_dim,
        num_classes=CFG.num_classes,
    )
    model_name = model.__class__.__name__
#     break
    
    learner = Learner(model)
    
    # loggers
    RUN_NAME = f'exp{str(CFG.exp_num)}'
    wandb.init(project='Ventilator-Pressure-Prediction', entity='sqrt4kaido', group=RUN_NAME, job_type=RUN_NAME + f'-fold-{i}')
    wandb.run.name = RUN_NAME + f'-fold-{i}'
    wandb_config = wandb.config
    wandb_config.model_name = model_name
    wandb.watch(model)
    
    # callbacks
    callbacks = []
    checkpoint_callback = ModelCheckpoint(
        monitor=f'Loss/val',
        mode='min',
        dirpath=OUTPUT_DIR,
        verbose=False,
        save_weights_only=True,
        filename=f'{model_name}-{learner.current_epoch}-{i}')
    callbacks.append(checkpoint_callback)

#     early_stop_callback = EarlyStopping(
#         monitor='Loss/val',
#         min_delta=0.00,
#         patience=10,
#         verbose=True,
#         mode='min')
#     callbacks.append(early_stop_callback)
    
    loggers = []
    loggers.append(WandbLogger())
    
    trainer = pl.Trainer(
        logger=loggers,
        callbacks=callbacks,
        max_epochs=CFG.epochs,
        default_root_dir=OUTPUT_DIR,
        gpus=1,
#         fast_dev_run=DEBUG,
        deterministic=True,
        benchmark=False,
        )
    
    trainer.fit(learner, train_dataloader=loaders['train'], val_dataloaders=loaders['valid'])
#     trainer.save_checkpoint(OUTPUT_DIR / "last.ckpt")
    print('train done.')
    
    #############
    # validation (to make oof)
    #############
    checkpoint = torch.load(checkpoint_callback.best_model_path)
    learner.load_state_dict(checkpoint['state_dict'])
    
    model = model.to(device)
    oof_pred, oof_target = evaluate(model, loaders, phase="valid")
    models.append(model)
    
    oof_score = get_score(oof_pred, oof_target, val_df['u_out'].values)
    scores.append(oof_score)
    oof_total[val_idx] = oof_pred.reshape(1, -1).T / CFG.bias
    val_idxes.append(val_idx)
    
    print('validate done.')
    print(f'fold = {i}, auc = {oof_score}')
    wandb.log({'CV_score': oof_score})
    
    #############
    # inference
    #############
    test_pred, _ = evaluate(model, loaders, phase="test")
    sub_preds[:, i] = test_pred
    
    print('inference done.')

# test_preds_total = np.array(test_preds_total)


[34m[1mwandb[0m: Currently logged in as: [33msqrt4kaido[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.2 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  rank_zero_deprecation(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type           | Params
---------------------------------------------
0 | model     | RNNModel       | 4.9 M 
1 | criterion | VentilatorLoss | 0     
---------------------------------------------
4.9 M     Trainable params
0         Non-trainable params
4.9 M     Total params
19.512    Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

epoch = 0, custom_mae = 17.4442138671875


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

epoch = 0, custom_mae = 1.0391385555267334


Validating: 0it [00:00, ?it/s]

epoch = 1, custom_mae = 1.0566691160202026


Validating: 0it [00:00, ?it/s]

epoch = 2, custom_mae = 0.8121448159217834


Validating: 0it [00:00, ?it/s]

epoch = 3, custom_mae = 0.7736174464225769


Validating: 0it [00:00, ?it/s]

epoch = 4, custom_mae = 0.7162048816680908


Validating: 0it [00:00, ?it/s]

epoch = 5, custom_mae = 0.7139606475830078


Validating: 0it [00:00, ?it/s]

epoch = 6, custom_mae = 0.6995557546615601


Validating: 0it [00:00, ?it/s]

epoch = 7, custom_mae = 0.760233998298645


Validating: 0it [00:00, ?it/s]

epoch = 8, custom_mae = 0.7307992577552795


Validating: 0it [00:00, ?it/s]

epoch = 9, custom_mae = 0.637686014175415


Validating: 0it [00:00, ?it/s]

epoch = 10, custom_mae = 0.6045182943344116


Validating: 0it [00:00, ?it/s]

epoch = 11, custom_mae = 0.6251944303512573


Validating: 0it [00:00, ?it/s]

epoch = 12, custom_mae = 0.5589180588722229


Validating: 0it [00:00, ?it/s]

epoch = 13, custom_mae = 0.5685568451881409


Validating: 0it [00:00, ?it/s]

epoch = 14, custom_mae = 0.5780256390571594


Validating: 0it [00:00, ?it/s]

epoch = 15, custom_mae = 0.505926787853241


Validating: 0it [00:00, ?it/s]

epoch = 16, custom_mae = 0.5418127775192261


Validating: 0it [00:00, ?it/s]

epoch = 17, custom_mae = 0.515927791595459


Validating: 0it [00:00, ?it/s]

epoch = 18, custom_mae = 0.500109851360321


Validating: 0it [00:00, ?it/s]

epoch = 19, custom_mae = 0.47795191407203674


Validating: 0it [00:00, ?it/s]

epoch = 20, custom_mae = 0.48591455817222595


Validating: 0it [00:00, ?it/s]

epoch = 21, custom_mae = 0.503153920173645


Validating: 0it [00:00, ?it/s]

epoch = 22, custom_mae = 0.46157971024513245


Validating: 0it [00:00, ?it/s]

epoch = 23, custom_mae = 0.4378763437271118


Validating: 0it [00:00, ?it/s]

epoch = 24, custom_mae = 0.4331468641757965


Validating: 0it [00:00, ?it/s]

epoch = 25, custom_mae = 0.45943090319633484


Validating: 0it [00:00, ?it/s]

epoch = 26, custom_mae = 0.4335169196128845


Validating: 0it [00:00, ?it/s]

epoch = 27, custom_mae = 0.41337403655052185


Validating: 0it [00:00, ?it/s]

epoch = 28, custom_mae = 0.43304452300071716


Validating: 0it [00:00, ?it/s]

epoch = 29, custom_mae = 0.3918595016002655


Validating: 0it [00:00, ?it/s]

epoch = 30, custom_mae = 0.4350476861000061


Validating: 0it [00:00, ?it/s]

epoch = 31, custom_mae = 0.41534048318862915


Validating: 0it [00:00, ?it/s]

epoch = 32, custom_mae = 0.3922814726829529


Validating: 0it [00:00, ?it/s]

epoch = 33, custom_mae = 0.3933795690536499


Validating: 0it [00:00, ?it/s]

epoch = 34, custom_mae = 0.45836883783340454


Validating: 0it [00:00, ?it/s]

epoch = 35, custom_mae = 0.4083797037601471


Validating: 0it [00:00, ?it/s]

epoch = 36, custom_mae = 0.3771685063838959


Validating: 0it [00:00, ?it/s]

epoch = 37, custom_mae = 0.3706117272377014


Validating: 0it [00:00, ?it/s]

epoch = 38, custom_mae = 0.36829468607902527


Validating: 0it [00:00, ?it/s]

epoch = 39, custom_mae = 0.37597331404685974


Validating: 0it [00:00, ?it/s]

epoch = 40, custom_mae = 0.3550399839878082


Validating: 0it [00:00, ?it/s]

epoch = 41, custom_mae = 0.3910254240036011


Validating: 0it [00:00, ?it/s]

epoch = 42, custom_mae = 0.3486781418323517


Validating: 0it [00:00, ?it/s]

epoch = 43, custom_mae = 0.36132919788360596


Validating: 0it [00:00, ?it/s]

epoch = 44, custom_mae = 0.35563984513282776


Validating: 0it [00:00, ?it/s]

epoch = 45, custom_mae = 0.41295352578163147


Validating: 0it [00:00, ?it/s]

epoch = 46, custom_mae = 0.3836791515350342


Validating: 0it [00:00, ?it/s]

epoch = 47, custom_mae = 0.3536083698272705


Validating: 0it [00:00, ?it/s]

epoch = 48, custom_mae = 0.3693374991416931


Validating: 0it [00:00, ?it/s]

epoch = 49, custom_mae = 0.3476813733577728


Validating: 0it [00:00, ?it/s]

epoch = 50, custom_mae = 0.3645998537540436


Validating: 0it [00:00, ?it/s]

epoch = 51, custom_mae = 0.33761537075042725


Validating: 0it [00:00, ?it/s]

epoch = 52, custom_mae = 0.32356613874435425


Validating: 0it [00:00, ?it/s]

epoch = 53, custom_mae = 0.37836208939552307


Validating: 0it [00:00, ?it/s]

epoch = 54, custom_mae = 0.3352091610431671


Validating: 0it [00:00, ?it/s]

epoch = 55, custom_mae = 0.33280906081199646


Validating: 0it [00:00, ?it/s]

epoch = 56, custom_mae = 0.393821120262146


Validating: 0it [00:00, ?it/s]

epoch = 57, custom_mae = 0.7374326586723328


Validating: 0it [00:00, ?it/s]

epoch = 58, custom_mae = 0.3534476161003113


Validating: 0it [00:00, ?it/s]

epoch = 59, custom_mae = 0.3347470760345459


Validating: 0it [00:00, ?it/s]

epoch = 60, custom_mae = 0.3876662254333496


Validating: 0it [00:00, ?it/s]

epoch = 61, custom_mae = 0.2912267744541168


Validating: 0it [00:00, ?it/s]

epoch = 62, custom_mae = 0.281217098236084


Validating: 0it [00:00, ?it/s]

epoch = 63, custom_mae = 0.2785777449607849


Validating: 0it [00:00, ?it/s]

epoch = 64, custom_mae = 0.27695396542549133


Validating: 0it [00:00, ?it/s]

epoch = 65, custom_mae = 0.27475684881210327


Validating: 0it [00:00, ?it/s]

epoch = 66, custom_mae = 0.2724531292915344


Validating: 0it [00:00, ?it/s]

epoch = 67, custom_mae = 0.2720367908477783


Validating: 0it [00:00, ?it/s]

epoch = 68, custom_mae = 0.26888325810432434


Validating: 0it [00:00, ?it/s]

epoch = 69, custom_mae = 0.2812081575393677


Validating: 0it [00:00, ?it/s]

epoch = 70, custom_mae = 0.26613399386405945


Validating: 0it [00:00, ?it/s]

epoch = 71, custom_mae = 0.26791027188301086


Validating: 0it [00:00, ?it/s]

epoch = 72, custom_mae = 0.26359450817108154


Validating: 0it [00:00, ?it/s]

epoch = 73, custom_mae = 0.26413413882255554


Validating: 0it [00:00, ?it/s]

epoch = 74, custom_mae = 0.262611448764801


Validating: 0it [00:00, ?it/s]

epoch = 75, custom_mae = 0.26102375984191895


Validating: 0it [00:00, ?it/s]

epoch = 76, custom_mae = 0.2621270716190338


Validating: 0it [00:00, ?it/s]

epoch = 77, custom_mae = 0.2592465877532959


Validating: 0it [00:00, ?it/s]

epoch = 78, custom_mae = 0.2590758502483368


Validating: 0it [00:00, ?it/s]

epoch = 79, custom_mae = 0.2587675452232361


Validating: 0it [00:00, ?it/s]

epoch = 80, custom_mae = 0.2581968903541565


Validating: 0it [00:00, ?it/s]

epoch = 81, custom_mae = 0.2593403160572052


Validating: 0it [00:00, ?it/s]

epoch = 82, custom_mae = 0.2554157078266144


Validating: 0it [00:00, ?it/s]

epoch = 83, custom_mae = 0.2559173107147217


Validating: 0it [00:00, ?it/s]

epoch = 84, custom_mae = 0.25551941990852356


Validating: 0it [00:00, ?it/s]

epoch = 85, custom_mae = 0.2547391951084137


Validating: 0it [00:00, ?it/s]

epoch = 86, custom_mae = 0.2541607916355133


Validating: 0it [00:00, ?it/s]

epoch = 87, custom_mae = 0.27025994658470154


Validating: 0it [00:00, ?it/s]

epoch = 88, custom_mae = 0.2542382478713989


Validating: 0it [00:00, ?it/s]

epoch = 89, custom_mae = 0.2564166188240051


Validating: 0it [00:00, ?it/s]

epoch = 90, custom_mae = 0.2628490626811981


Validating: 0it [00:00, ?it/s]

epoch = 91, custom_mae = 0.2540079951286316


Validating: 0it [00:00, ?it/s]

epoch = 92, custom_mae = 0.25535035133361816


Validating: 0it [00:00, ?it/s]

epoch = 93, custom_mae = 0.2532048523426056


Validating: 0it [00:00, ?it/s]

epoch = 94, custom_mae = 0.25236305594444275


Validating: 0it [00:00, ?it/s]

epoch = 95, custom_mae = 0.2521154582500458


Validating: 0it [00:00, ?it/s]

epoch = 96, custom_mae = 0.25378093123435974


Validating: 0it [00:00, ?it/s]

epoch = 97, custom_mae = 0.2504581809043884


Validating: 0it [00:00, ?it/s]

epoch = 98, custom_mae = 0.25043919682502747


In [None]:
if len(CFG.folds) != CFG.n_folds:

    oof_score = get_score(oof_pred, oof_target, val_df['u_out'].values)
    print(f'MAE {oof_score}')

    oof_df = train.iloc[val_idxes[0], :1]
    oof_df['pressure'] = oof_pred
    oof_df.to_csv(OUTPUT_DIR / f'oof{CFG.exp_num}.csv',index = False)    
else:
    score = get_score(y, oof_total, train['u_out'].values)
    print(f'MAE {score}: folds: {scores}')

    oof_df = pd.DataFrame({'id': train['id'].values, 'pressure':oof_total.reshape(-1)})
    oof_df.to_csv(OUTPUT_DIR / f'oof{CFG.exp_num}.csv',index = False)
oof_df

In [None]:
sub = pd.read_csv(DATA_DIR / 'sample_submission.csv')
sub['pressure'] = np.mean(sub_preds, axis=1)
sub.to_csv(OUTPUT_DIR / f'sub{CFG.exp_num}.csv',index = False)
sub

In [None]:
wandb.init(project='Ventilator-Pressure-Prediction', entity='sqrt4kaido', group=RUN_NAME, job_type='summary')
wandb.run.name = 'summary'
wandb.log({'CV_score': oof_score})
# wandb.save(utils.get_notebook_path())
wandb.finish()