In [1]:
import sys
from time import time
import numpy as np
import pandas as pd
from pathlib import Path
import lightgbm as lgb
import matplotlib.pyplot as plt 
import seaborn as sns
from tqdm import tqdm
import copy
import wandb
from collections import OrderedDict
import tsfresh.feature_extraction.feature_calculators as feature_calculators

from sklearn.metrics import mean_absolute_error
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as torchdata

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.loggers import WandbLogger


In [2]:
sys.path.append('../../src/')
import utils as utils
from utils import Timer

In [3]:
class CFG:
    """Notebook-wide experiment configuration, read through class attributes (e.g. ``CFG.seed``)."""
    seed = 42
    exp_num = 25
    # Selects between the local and the alternative DATA_DIR / OUTPUT_DIR pair.
    local = True
    # NOTE(review): the fold count is also given in `split_params["n_splits"]` below; keep both in sync.
    n_folds = 5
    # Fold indices that are actually trained; other folds are skipped by the training loop.
    folds = [0]
    debug = False
    # NOTE(review): the training cell divides stored OOF predictions by this value, but no visible
    # code multiplies the target by it — confirm it is still wanted.
    bias = 1000
    epochs = 200

    
    ######################
    # Dataset #
    ######################
    # Per-phase transform specs: a list of {"name": ..., "params": ...} dicts resolved by get_transforms.
    # An empty name matches nothing, so no transform is built.
    transforms = {
        "train": [{"name": ""}],
        "valid": [{"name": ""}],
        "test": [{"name": ""}]
    }

    ######################
    # Loaders #
    ######################
    # Keyword arguments forwarded to torch DataLoader for each phase.
    loader_params = {
        "train": {
            'batch_size': 128,
            'shuffle': True,
            'num_workers': 8,
            'pin_memory': True,
            'drop_last': True,
        },
        "valid": {
            'batch_size': 32,
            'shuffle': False,
            'num_workers': 8,
            'pin_memory': True,
            'drop_last': False,
        },
        "test": {
            'batch_size': 32,
            'shuffle': False,
            'num_workers': 8,
            'pin_memory': True,
            'drop_last': False,
        }
    }

    ######################
    # Split #
    ######################
    # Name of a class in sklearn.model_selection plus its constructor kwargs.
    split = "GroupKFold"
    split_params = {
        "n_splits": 5,
    }

    ######################
    # Model #
    ######################
    # NOTE(review): the training cell passes len(train_value_col) as input_dim, so this value
    # does not appear to be used — confirm or remove.
    input_dim = 5

    dense_dim = 512
    lstm_dim = 512
    logit_dim = 512
    num_classes = 1

    ######################
    # Criterion #
    ######################
#     loss_name = "rmspe_loss"
#     loss_params: dict = {}

    ######################
    # Optimizer #
    ######################
    # Looked up by name (custom registry first, then torch.optim) in get_optimizer.
    optimizer_name = "AdamW"
    optimizer_params = {
        "lr": 0.001,
        'weight_decay': 1e-6
    }

    ######################
    # Scheduler #
    ######################
    # Looked up by name in torch.optim.lr_scheduler by get_scheduler; None disables scheduling.
    # NOTE(review): T_max (25) is far below `epochs` (200). The validation log of this run shows
    # custom_mae bottoming out around epochs 25-29 and degrading afterwards, which would be
    # consistent with the cosine schedule's learning rate rising again after T_max — confirm
    # whether T_max should match the number of epochs.
    scheduler_name = "CosineAnnealingLR"
    scheduler_params = {
        'T_max': 25, 
        'eta_min': 1e-6
    }

In [4]:
# Seed the random number generators through the project helper so runs are repeatable.
# NOTE(review): utils.set_seed lives outside this notebook — confirm which libraries it seeds.
utils.set_seed(CFG.seed)

In [5]:
# Pick the input/output locations according to CFG.local.
# NOTE(review): the local DATA_DIR is an absolute, user-specific path, so the notebook only runs
# unchanged on that machine — consider a path relative to the repository or an environment variable.
if CFG.local:
    DATA_DIR = Path("/home/knikaido/work/Ventilator-Pressure-Prediction/data/ventilator-pressure-prediction")
    OUTPUT_DIR = Path('./output/')
else:
    DATA_DIR = Path("../input/ventilator-pressure-prediction")
    OUTPUT_DIR = Path('')   

In [6]:
def get_transforms(phase: str):
    """Build the transform pipeline configured for ``phase`` in ``CFG.transforms``.

    Each entry of ``CFG.transforms[phase]`` is a dict with a ``"name"`` and an optional
    ``"params"`` dict. The name is looked up in this module's globals; entries whose name
    is not found are skipped.

    Args:
        phase: Key into ``CFG.transforms`` (the config defines "train", "valid" and "test").

    Returns:
        A ``Compose`` wrapping the instantiated transforms, or ``None`` when no transforms
        are configured or none of the configured names could be resolved.
    """
    phase_confs = None if CFG.transforms is None else CFG.transforms[phase]
    if phase_confs is None:
        return None

    namespace = globals()
    built = []
    for conf in phase_confs:
        factory = namespace.get(conf["name"])
        kwargs = conf.get("params")
        if factory is None:
            # Unknown transform name: skip it, exactly like an unset transform.
            continue
        built.append(factory(**({} if kwargs is None else kwargs)))

    return Compose(built) if built else None
        
        
class Normalize:
    """Peak-normalise an array so that its largest absolute value becomes 1."""

    def __call__(self, y: np.ndarray):
        """Scale ``y`` by the reciprocal of its maximum absolute value.

        Args:
            y: Array to normalise.

        Returns:
            The scaled values as a Fortran-ordered array. An empty array is returned
            unscaled, and an all-zero array stays all zeros instead of turning into NaN.
        """
        if y.size == 0:
            # .max() of an empty array raises; there is nothing to scale anyway.
            return np.asfortranarray(y)
        max_vol = np.abs(y).max()
        # Guard the silent-signal case: dividing by a zero peak would yield NaN everywhere.
        scale = max_vol if max_vol != 0 else 1
        return np.asfortranarray(y * 1 / scale)


class Compose:
    """Chain several callables into one, applying them in list order."""

    def __init__(self, transforms: list):
        # Callables to apply; each receives the previous one's result.
        self.transforms = transforms

    def __call__(self, y: np.ndarray):
        """Feed ``y`` through every transform in turn and return the final result."""
        result = y
        for step in self.transforms:
            result = step(result)
        return result

In [7]:
def compute_metric(preds, trues, u_outs):
    """Mean absolute error restricted to the steps where ``u_outs`` is 0.

    Each step is weighted by ``1 - u_outs``, so steps with ``u_outs == 1`` contribute
    nothing to either the numerator or the denominator.

    Args:
        preds: Predicted values.
        trues: Ground-truth values, same shape as ``preds``.
        u_outs: Mask values, same shape as ``preds``.

    Returns:
        Sum of weighted absolute errors divided by the sum of the weights.

    Raises:
        AssertionError: If the three inputs do not share one shape.
    """
    weights = 1 - u_outs

    assert trues.shape == preds.shape and weights.shape == trues.shape, (trues.shape, preds.shape, weights.shape)

    weighted_abs_err = weights * np.abs(trues - preds)
    return weighted_abs_err.sum() / weights.sum()


class VentilatorLoss(nn.Module):
    """Masked mean absolute error, mirroring ``compute_metric`` so the metric is optimised directly.

    The computation lives in ``forward`` (rather than overriding ``__call__``) so that the
    regular ``nn.Module`` call path, including registered hooks, stays intact.
    """

    def forward(self, preds, y, u_out):
        """Compute the weighted MAE over the last dimension.

        Args:
            preds: Predicted values.
            y: Target values, broadcastable with ``preds``.
            u_out: Mask values; positions with ``u_out == 1`` get zero weight.

        Returns:
            ``sum(w * |y - preds|) / sum(w)`` along the last dimension with ``w = 1 - u_out``.
            The result is NaN when every weight along that dimension is zero.
        """
        w = 1 - u_out
        mae = w * (y - preds).abs()
        return mae.sum(-1) / w.sum(-1)

In [8]:
def get_criterion():
    """Create the training/validation loss: a fresh ``VentilatorLoss`` instance."""
    criterion = VentilatorLoss()
    return criterion

In [9]:
# Custom optimizer
# Registry of optimizer classes keyed by name; it takes precedence over torch.optim.
__OPTIMIZERS__ = {}


def get_optimizer(model: nn.Module):
    """Instantiate the optimizer named by ``CFG.optimizer_name`` for ``model``'s parameters.

    Names are resolved against ``__OPTIMIZERS__`` first and ``torch.optim`` second, and the
    optimizer is constructed with ``CFG.optimizer_params``.

    Args:
        model: Module whose parameters will be optimised.

    Returns:
        The constructed optimizer.

    Raises:
        AttributeError: If the name is found in neither the registry nor ``torch.optim``.
    """
    def _resolve(name):
        # getattr (not the dunder __getattribute__) is the supported attribute lookup and
        # also honours module-level __getattr__ hooks.
        custom = __OPTIMIZERS__.get(name)
        return custom if custom is not None else getattr(optim, name)

    if CFG.optimizer_name == "SAM":
        # NOTE(review): neither `SAM` nor `CFG.base_optimizer` is defined in the visible
        # notebook, so this branch fails unless both are provided elsewhere.
        base_optimizer = _resolve(CFG.base_optimizer)
        return SAM(model.parameters(), base_optimizer, **CFG.optimizer_params)

    return _resolve(CFG.optimizer_name)(model.parameters(), **CFG.optimizer_params)


def get_scheduler(optimizer):
    """Instantiate the LR scheduler named by ``CFG.scheduler_name``.

    Args:
        optimizer: Optimizer the scheduler will drive.

    Returns:
        The scheduler built from ``torch.optim.lr_scheduler`` with ``CFG.scheduler_params``,
        or ``None`` when ``CFG.scheduler_name`` is ``None``.

    Raises:
        AttributeError: If ``torch.optim.lr_scheduler`` has no class with that name.
    """
    scheduler_name = CFG.scheduler_name
    if scheduler_name is None:
        return None

    # Plain getattr instead of calling the dunder __getattribute__ directly.
    scheduler_cls = getattr(optim.lr_scheduler, scheduler_name)
    return scheduler_cls(optimizer, **CFG.scheduler_params)

In [10]:
# validation
# Build the cross-validation splitter: the class named by CFG.split is taken from
# sklearn.model_selection and constructed with CFG.split_params.
splitter = getattr(model_selection, CFG.split)(**CFG.split_params)

In [11]:
class VentilatorDataset(torchdata.Dataset):
    """Dataset yielding one item per ``breath_id`` group of a feature frame.

    Args:
        df: Frame holding ``breath_id``, ``u_out``, the feature columns and, for labelled
            data, ``pressure``. The frame is never modified; any index is accepted.
        train_value_col: Names of the continuous feature columns.
        train_category_col: Names of the categorical feature columns.
    """

    def __init__(self, df, train_value_col, train_category_col):
        if "pressure" not in df.columns:
            # Unlabelled data: add a dummy target on a new frame so the caller's
            # DataFrame does not silently gain a column.
            df = df.assign(pressure=0)
        self.df = df
        # Positional row numbers per breath. These are valid for .iloc whatever the
        # frame's index labels are (label-based .groups only works with a 0..n-1 index).
        self.groups = df.groupby('breath_id').indices
        # Ascending breath_id order, i.e. the order a sorted groupby exposes.
        self.keys = sorted(self.groups.keys())
        self.train_value_col = train_value_col
        self.train_category_col = train_category_col

    def __len__(self):
        """Number of breaths."""
        return len(self.groups)

    def __getitem__(self, idx):
        """Return the arrays of the ``idx``-th breath.

        Returns:
            Dict with ``input_value`` (float32), ``input_category`` (int),
            ``u_out`` (float32) and ``p`` (float32, the pressure column).
        """
        rows = self.df.iloc[self.groups[self.keys[idx]]]

        return {
            "input_value": rows[self.train_value_col].values.astype(np.float32),
            "input_category": rows[self.train_category_col].values.astype(int),
            "u_out": rows['u_out'].values.astype(np.float32),
            "p": rows['pressure'].values.astype(np.float32),
        }

In [12]:
class RNNModel(nn.Module):
    """Sequence regressor: embedding + MLP features, two Conv1d layers, then a bidirectional LSTM.

    Args:
        input_dim: Number of continuous features per time step.
        lstm_dim: Hidden size of each LSTM direction.
        dense_dim: Output width of the feature MLP.
        logit_dim: Hidden width of the prediction head.
        num_classes: Number of outputs per time step.
        seq_len: Length of every input sequence. It fixes the ``LayerNorm`` size in the
            convolutional block and the reshape in ``forward``; defaults to the
            previously hard-coded 80.
    """

    def __init__(
        self,
        input_dim=4,
        lstm_dim=256,
        dense_dim=256,
        logit_dim=256,
        num_classes=1,
        seq_len=80,
    ):
        super().__init__()
        self.seq_len = seq_len
        rc_emb_dim = 4
        conv_channels = dense_dim + rc_emb_dim

        # NOTE(review): padding_idx=0 pins the embedding of category 0 to zeros, yet
        # get_category_features maps the real combination '5_10' to 0 — confirm this is intended.
        self.rc_emb = nn.Embedding(9, rc_emb_dim, padding_idx=0)

        self.mlp = nn.Sequential(
            nn.Linear(input_dim, dense_dim // 2),
            nn.LayerNorm(dense_dim // 2),
            nn.ReLU(),
            nn.Linear(dense_dim // 2, dense_dim),
            nn.LayerNorm(dense_dim),
            nn.ReLU(),
        )

        # Conv1d works on (batch, channels, time), so LayerNorm here normalises over the time axis.
        self.conv_basic = nn.Sequential(
            nn.Conv1d(in_channels=conv_channels, out_channels=conv_channels, kernel_size=3, padding=1),
            nn.LayerNorm(seq_len),
            nn.ReLU(),
            nn.Conv1d(in_channels=conv_channels, out_channels=conv_channels, kernel_size=3, padding=1),
            nn.LayerNorm(seq_len),
            nn.ReLU(),
        )

        self.lstm = nn.LSTM(conv_channels, lstm_dim, num_layers=2, batch_first=True, bidirectional=True)

        self.logits = nn.Sequential(
            nn.Linear(lstm_dim * 2, logit_dim),
            nn.ReLU(),
            nn.Linear(logit_dim, num_classes),
        )

        # Recurrent-layer initialisation (scheme credited to nakama in the original comment):
        # orthogonal for weight matrices, normal for 1-D parameters. LSTM and GRU share one
        # branch; the former GRU branch referenced an undefined name `init`.
        for m in self.modules():
            if isinstance(m, (nn.LSTM, nn.GRU)):
                print(f'init {m}')
                for param in m.parameters():
                    if len(param.shape) >= 2:
                        nn.init.orthogonal_(param.data)
                    else:
                        nn.init.normal_(param.data)

    def forward(self, cont_seq_x, cate_seq_x):
        """Predict one value set per time step.

        Args:
            cont_seq_x: Continuous features, ``(batch, seq_len, input_dim)``.
            cate_seq_x: Integer category codes for the embedding; reshaped to
                ``(batch, seq_len, -1)`` after the lookup.

        Returns:
            Tensor of shape ``(batch, seq_len, num_classes)``.
        """
        bs = cont_seq_x.size(0)
        rc_emb = self.rc_emb(cate_seq_x).view(bs, self.seq_len, -1)

        features = self.mlp(cont_seq_x)
        features = torch.cat((rc_emb, features), 2)

        # (batch, time, channels) -> (batch, channels, time) for the convolutions, and back.
        features = self.conv_basic(features.permute([0, 2, 1]))
        features, _ = self.lstm(features.permute([0, 2, 1]))

        return self.logits(features)

In [13]:
# Learner class(pytorch-lighting)
class Learner(pl.LightningModule):
    """Lightning wrapper that trains ``model`` with the loss returned by ``get_criterion``.

    Args:
        model: Network called as ``model(input_value, input_category)``.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.criterion = get_criterion()

    def _predict_and_score(self, batch):
        """Run the model on a batch and return ``(raw_output, loss)``; everything is flattened for the loss."""
        output = self.model(batch['input_value'], batch['input_category'])
        loss = self.criterion(output.view(-1), batch['p'].view(-1), batch['u_out'].view(-1))
        return output, loss

    def training_step(self, batch, batch_idx):
        """Return the training loss of one batch."""
        _, loss = self._predict_and_score(batch)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the epoch-aggregated validation loss and hand the batch outputs to ``validation_epoch_end``."""
        output, loss = self._predict_and_score(batch)

        self.log('Loss/val', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True)

        return OrderedDict({
            "targets": batch['p'].detach(),
            "preds": output.detach(),
            "u_outs": batch['u_out'].detach(),
            "loss": loss.detach(),
        })

    def validation_epoch_end(self, outputs):
        """Compute the metric over the whole validation set, then log and print it."""
        targets = torch.cat([o["targets"].view(-1) for o in outputs]).cpu().numpy()
        preds = torch.cat([o["preds"].view(-1) for o in outputs]).cpu().numpy()
        u_outs = torch.cat([o["u_outs"].view(-1) for o in outputs]).cpu().numpy()

        score = get_score(preds, targets, u_outs)
        self.log('custom_mae/val', score, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        print(f'epoch = {self.current_epoch}, custom_mae = {score}')

    def configure_optimizers(self):
        """Return the optimizer, plus the scheduler and its monitored metric when one is configured."""
        optimizer = get_optimizer(self.model)
        scheduler = get_scheduler(optimizer)
        if scheduler is None:
            # get_scheduler returns None when scheduling is disabled; do not hand Lightning
            # an "lr_scheduler": None entry.
            return optimizer
        return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "Loss/val"}

In [14]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [15]:
def get_score(y_pred, y_true, u_outs):
    """Score predictions with ``compute_metric`` (thin wrapper taking predictions first)."""
    score = compute_metric(preds=y_pred, trues=y_true, u_outs=u_outs)
    return score


def to_np(input):
    """Detach a tensor from the autograd graph, move it to the CPU and return it as a NumPy array."""
    return input.detach().cpu().numpy()


# oof
def evaluate(model, loaders, phase):
    """Run ``model`` over ``loaders[phase]`` without gradients and collect flat predictions.

    Args:
        model: Module called as ``model(input_value, input_category)``.
        loaders: Mapping from phase name to an iterable of batch dicts holding
            ``input_value``, ``input_category`` and ``p``.
        phase: Key of the loader to evaluate.

    Returns:
        ``(predictions, targets)`` as 1-D NumPy arrays in loader order.

    The model is switched to eval mode for the pass and afterwards restored to the mode it
    had on entry, also when an error interrupts the loop. Inputs are moved to the device of
    the model's parameters; the notebook-level ``device`` is used only for parameter-free models.
    """
    was_training = model.training
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else device

    pred_list = []
    target_list = []
    model.eval()
    try:
        with torch.no_grad():
            for batch in loaders[phase]:
                values = batch['input_value'].to(run_device)
                categories = batch['input_category'].to(run_device)
                pred_list.append(to_np(model(values, categories)))
                target_list.append(to_np(batch['p']))
    finally:
        model.train(was_training)

    return np.concatenate(pred_list).reshape(-1), np.concatenate(target_list).reshape(-1)

In [16]:
# Load the raw train and test tables from DATA_DIR.
train = pd.read_csv(DATA_DIR / 'train.csv')
test = pd.read_csv(DATA_DIR / 'test.csv')
# A single display() call renders both frames without the cell echoing a `(None, None)` tuple.
display(train, test)

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure
0,1,1,20,50,0.000000,0.083334,0,5.837492
1,2,1,20,50,0.033652,18.383041,0,5.907794
2,3,1,20,50,0.067514,22.509278,0,7.876254
3,4,1,20,50,0.101542,22.808822,0,11.742872
4,5,1,20,50,0.135756,25.355850,0,12.234987
...,...,...,...,...,...,...,...,...
6035995,6035996,125749,50,10,2.504603,1.489714,1,3.869032
6035996,6035997,125749,50,10,2.537961,1.488497,1,3.869032
6035997,6035998,125749,50,10,2.571408,1.558978,1,3.798729
6035998,6035999,125749,50,10,2.604744,1.272663,1,4.079938


Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out
0,1,0,5,20,0.000000,0.000000,0
1,2,0,5,20,0.031904,7.515046,0
2,3,0,5,20,0.063827,14.651675,0
3,4,0,5,20,0.095751,21.230610,0
4,5,0,5,20,0.127644,26.320956,0
...,...,...,...,...,...,...,...
4023995,4023996,125748,20,10,2.530117,4.971245,1
4023996,4023997,125748,20,10,2.563853,4.975709,1
4023997,4023998,125748,20,10,2.597475,4.979468,1
4023998,4023999,125748,20,10,2.631134,4.982648,1


(None, None)

In [17]:
def get_raw_features(input_df, dataType = 'train'):
    """Select the untransformed input columns used as features.

    Args:
        input_df: Frame containing ``time_step``, ``u_in``, ``R`` and ``C``.
        dataType: Not used by this function.

    Returns:
        Those four columns of ``input_df``, in that order.
    """
    raw_columns = ['time_step', 'u_in', 'R', 'C']
    selected = input_df[raw_columns]
    return selected

In [18]:
def get_category_features(input_df, dataType = 'train'):
    """Encode each row's (R, C) combination as one integer code.

    Only the new column is built; the input frame is neither copied nor modified.

    Args:
        input_df: Frame containing the ``R`` and ``C`` columns.
        dataType: Not used by this function.

    Returns:
        Single-column frame ``R_C`` on ``input_df``'s index, holding the code 0-8 of the
        ``"{R}_{C}"`` combination. Combinations missing from the mapping become NaN.
    """
    rc_map = {'5_10': 0, '5_20': 1, '5_50': 2, '20_10': 3, '20_20': 4, '20_50': 5, '50_10': 6, '50_20': 7, '50_50': 8}

    rc_keys = pd.Series(
        [f'{r}_{c}' for r, c in zip(input_df['R'], input_df['C'])],
        index=input_df.index,
    )
    return rc_keys.map(rc_map).to_frame('R_C')

In [19]:
def get_diff_shift_features(input_df, dataType = 'train'):
    """Per-breath lag/lead values of ``u_in`` and ``time_step`` and their differences.

    For every offset ``i`` in ``[-2, -1, 1, 2, 3, 4]`` and each of the two columns:

    * ``{col}_shift_{i}``: the value ``i`` rows earlier within the same breath
      (later for negative ``i``).
    * ``{col}_diff_{i}``: the current value minus that shifted value.

    The difference is taken from the group-wise shift itself (direct subtraction rather
    than ``diff``, as the original comment recommended for speed), so both feature families
    agree even when the rows of a breath are not stored contiguously. Positions whose
    partner row falls outside the breath are NaN.

    Args:
        input_df: Frame containing ``breath_id``, ``u_in`` and ``time_step``.
        dataType: Not used by this function.

    Returns:
        Frame with the 24 new columns on ``input_df``'s index, ordered per offset as
        u_in shift, u_in diff, time_step shift, time_step diff.
    """
    by_breath = input_df.groupby('breath_id')
    shift_idx = [-2, -1, 1, 2, 3, 4]

    features = {}
    for i in shift_idx:
        for col in ('u_in', 'time_step'):
            shifted = by_breath[col].shift(i)
            features[f'{col}_shift_{i}'] = shifted
            features[f'{col}_diff_{i}'] = input_df[col] - shifted

    return pd.DataFrame(features, index=input_df.index)

In [20]:
def get_cum_features(input_df, dataType = 'train'):
    """Running totals of ``u_in`` and ``time_step`` within each breath.

    Only the new columns are built; the input frame is neither copied nor modified.

    Args:
        input_df: Frame containing ``breath_id``, ``u_in`` and ``time_step``.
        dataType: Not used by this function.

    Returns:
        Frame with ``u_in_cumsum`` and ``time_step_cumsum`` on ``input_df``'s index.
    """
    by_breath = input_df.groupby('breath_id')

    return pd.DataFrame(
        {
            'u_in_cumsum': by_breath['u_in'].cumsum(),
            'time_step_cumsum': by_breath['time_step'].cumsum(),
        },
        index=input_df.index,
    )

In [21]:
def get_simple_calc_features(input_df, dataType = 'train'):
    """Per-breath running sum of ``time_step * u_in``.

    Only the new column is built; the input frame is neither copied nor modified.

    Args:
        input_df: Frame containing ``breath_id``, ``time_step`` and ``u_in``.
        dataType: Not used by this function.

    Returns:
        Single-column frame ``area`` on ``input_df``'s index.
    """
    step_area = input_df['time_step'] * input_df['u_in']
    area = step_area.groupby(input_df['breath_id']).cumsum()

    # A squared-u_in feature (`u_in_pow2`) existed here and is currently disabled.
    return area.to_frame('area')

In [22]:
def _first_fft_coefficient(s_, attr):
    """Return attribute ``attr`` of FFT coefficient 0 of ``s_ + 1e-07`` via tsfresh's ``fft_coefficient``.

    ``fft_coefficient`` yields ``(name, value)`` pairs; only one is requested, so the value
    of the first pair is returned.
    """
    request = [{'coeff': 0, 'attr': attr}]
    return list(feature_calculators.fft_coefficient(s_+1e-07, request))[0][1]


def calc_fft_real(s_):
    """Real part of FFT coefficient 0 of ``s_``."""
    return _first_fft_coefficient(s_, 'real')


def calc_fft_imag(s_):
    """Imaginary part of FFT coefficient 0 of ``s_``."""
    return _first_fft_coefficient(s_, 'imag')


def calc_fft_abs(s_):
    """Absolute value of FFT coefficient 0 of ``s_``."""
    return _first_fft_coefficient(s_, 'abs')


def calc_fft_angle(s_):
    """Angle of FFT coefficient 0 of ``s_``."""
    return _first_fft_coefficient(s_, 'angle')

def calc_number_peaks(s_):
    """Return tsfresh's ``number_peaks`` of ``s_`` with its second argument fixed to 3."""
    peak_support = 3
    return feature_calculators.number_peaks(s_, peak_support)

In [23]:
# def calc_registance(df):
#     max_ = np.max(df['u_in'])
#     plato_df = df[(df['time_step'] >= 1.0) & df['time_step'] <= 1.5]

In [24]:
def get_agg_features(input_df, dataType = 'train'):
    """Per-breath summary statistics of ``u_in`` broadcast back onto every row.

    The output names are fixed explicitly through named aggregation. The historical name
    ``u_in_amax`` is kept for the per-breath maximum, so the result no longer depends on
    how the installed NumPy names ``np.max``.

    Args:
        input_df: Frame containing ``breath_id`` and ``u_in``.
        dataType: Not used by this function.

    Returns:
        Frame on ``input_df``'s index with ``u_in_amax``, ``u_in_std`` (pandas' default
        sample standard deviation), ``u_in_mean``, ``u_in_first``, ``u_in_last``, plus
        ``u_in_diffmax`` and ``u_in_diffmean`` (breath maximum / mean minus the row's ``u_in``).
    """
    breath_stats = input_df.groupby('breath_id')['u_in'].agg(
        u_in_amax='max',
        u_in_std='std',
        u_in_mean='mean',
        u_in_first='first',
        u_in_last='last',
    )

    # Broadcast each per-breath statistic to the rows of that breath, keeping the row index.
    breath_ids = input_df['breath_id']
    features = {name: breath_ids.map(breath_stats[name]) for name in breath_stats.columns}

    features['u_in_diffmax'] = features['u_in_amax'] - input_df['u_in']
    features['u_in_diffmean'] = features['u_in_mean'] - input_df['u_in']

    return pd.DataFrame(features, index=input_df.index)

In [25]:
def to_feature(input_df, dataType = 'train'):
    """Convert ``input_df`` into a feature matrix and return it as a new DataFrame.

    Every feature function is called as ``func(input_df, dataType)`` and timed with
    ``Timer``; the resulting frames are joined column-wise in processor order.

    Args:
        input_df: Raw input frame handed to every feature function.
        dataType: Passed through to the feature functions.

    Returns:
        The column-wise concatenation of all feature frames.

    Raises:
        AssertionError: If a feature function returns a different number of rows than
            ``input_df`` (the message is that function's name).
    """
    processors = [
        get_raw_features,
        get_category_features,
        get_simple_calc_features,
        get_diff_shift_features,
        get_cum_features,
        get_agg_features
    ]

    feature_blocks = []

    for func in tqdm(processors, total=len(processors)):
        with Timer(prefix='' + func.__name__ + ' '):
            _df = func(input_df, dataType)

        # Row counts must match; a mismatch means the feature function is implemented incorrectly.
        assert len(_df) == len(input_df), func.__name__
        feature_blocks.append(_df)

    # Join once at the end instead of re-copying a growing frame on every iteration.
    return pd.concat(feature_blocks, axis=1)

In [26]:
# Build the feature matrices for the train and test tables with the same pipeline.
train_df = to_feature(train, dataType = 'train')
test_df = to_feature(test, dataType = 'test')

  0%|          | 0/6 [00:00<?, ?it/s]

get_raw_features  0.027[s]


 33%|███▎      | 2/6 [00:02<00:04,  1.25s/it]

get_category_features  2.399[s]


 50%|█████     | 3/6 [00:02<00:02,  1.17it/s]

get_simple_calc_features  0.193[s]
get_diff_shift_features  11.187[s]


 67%|██████▋   | 4/6 [00:14<00:09,  4.94s/it]

get_cum_features  0.199[s]


 83%|████████▎ | 5/6 [00:15<00:03,  3.51s/it]

get_agg_features  0.953[s]


100%|██████████| 6/6 [00:17<00:00,  2.87s/it]
  0%|          | 0/6 [00:00<?, ?it/s]

get_raw_features  0.018[s]


 33%|███▎      | 2/6 [00:01<00:03,  1.25it/s]

get_category_features  1.537[s]
get_simple_calc_features  0.117[s]


 50%|█████     | 3/6 [00:01<00:01,  1.84it/s]

get_diff_shift_features  6.850[s]


 67%|██████▋   | 4/6 [00:09<00:06,  3.04s/it]

get_cum_features  0.113[s]


 83%|████████▎ | 5/6 [00:09<00:02,  2.16s/it]

get_agg_features  0.550[s]


100%|██████████| 6/6 [00:10<00:00,  1.77s/it]


In [27]:
# Continuous inputs are every feature column except the categorical code 'R_C'.
train_value_col = [name for name in train_df.columns.to_list() if name not in ['R_C']]
# Categorical inputs fed to the embedding layer.
train_category_col = ['R_C']

In [28]:
# Standardise the continuous features with statistics fitted on the training table, then
# fill the remaining NaNs (e.g. shift/diff features at breath boundaries) with the
# training column means.
# NOTE(review): this cell overwrites train_df/test_df, so it cannot be re-run on its own
# ('R_C' is gone after the first run); the scaler is also fitted on all folds at once.
ss = StandardScaler()

# Set the categorical column aside; only the value columns are scaled.
train_category = train_df[train_category_col]
train_df = pd.DataFrame(ss.fit_transform(train_df[train_value_col]), columns=train_value_col)
train_mean = train_df.mean()
train_df = train_df.fillna(train_mean)

# The test table reuses the fitted scaler and the training means.
test_category = test_df[train_category_col]
test_df = pd.DataFrame(ss.transform(test_df[train_value_col]), columns=train_value_col)
test_df = test_df.fillna(train_mean)

In [29]:
# A single display() call renders both frames without the cell echoing a `(None, None)` tuple.
display(train_df, test_df)

Unnamed: 0,time_step,u_in,R,C,area,u_in_pow2,u_in_shift_-2,u_in_diff_-2,time_step_shift_-2,time_step_diff_-2,...,time_step_diff_4,u_in_cumsum,time_step_cumsum,u_in_amax,u_in_std,u_in_mean,u_in_first,u_in_last,u_in_diffmax,u_in_diffmean
0,-1.706609,-0.538775,-0.359072,1.394522,-0.935020,-0.238130,1.239960e+00,-2.503819e+00,-1.703981e+00,-5.670750e-01,...,-1.790277e-15,-0.980690,-1.116536,-0.245401,0.119327,0.513555,-0.550080,0.281162,-0.015391,0.820945
1,-1.662676,0.823348,-0.359072,1.394522,-0.932038,0.105631,1.263865e+00,-5.308005e-01,-1.658426e+00,-7.288073e-01,...,-1.790277e-15,-0.936302,-1.115471,-0.245401,0.119327,0.513555,-0.550080,0.281162,-0.616229,-0.672003
2,-1.618468,1.130480,-0.359072,1.394522,-0.924711,0.277274,1.467130e+00,-3.577016e-01,-1.612622e+00,-8.800655e-01,...,-1.790277e-15,-0.881950,-1.113335,-0.245401,0.119327,0.513555,-0.550080,0.281162,-0.751706,-1.008635
3,-1.574044,1.152777,-0.359072,1.394522,-0.913545,0.291083,1.619080e+00,-5.335696e-01,-1.567182e+00,-8.432008e-01,...,-1.790277e-15,-0.826876,-1.110123,-0.245401,0.119327,0.513555,-0.550080,0.281162,-0.761541,-1.033072
4,-1.529378,1.342362,-0.359072,1.394522,-0.896949,0.415877,1.608516e+00,-2.398765e-01,-1.521650e+00,-7.557113e-01,...,7.451631e-01,-0.765651,-1.105828,-0.245401,0.119327,0.513555,-0.550080,0.281162,-0.845168,-1.240867
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035995,1.563202,-0.434092,1.171893,-0.937525,0.367140,-0.235880,-4.319748e-01,-5.327759e-02,1.648132e+00,-2.612726e-01,...,2.412574e-01,-0.046169,1.893579,-0.334837,-0.430115,-0.438246,-0.313487,-3.249996,-0.153790,0.279152
6035996,1.606751,-0.434183,1.171893,-0.937525,0.385353,-0.235883,-4.548240e-01,-2.202768e-02,1.692762e+00,-2.523388e-01,...,2.455403e-01,-0.042575,1.973871,-0.334837,-0.430115,-0.438246,-0.313487,-3.249996,-0.153750,0.279251
6035997,1.650417,-0.428937,1.171893,-0.937525,0.404680,-0.235665,-4.380590e-01,-3.732884e-02,1.737306e+00,-1.774798e-01,...,2.719253e-01,-0.038811,2.055221,-0.334837,-0.430115,-0.438246,-0.313487,-3.249996,-0.156064,0.273501
6035998,1.693939,-0.450248,1.171893,-0.937525,0.420662,-0.236490,2.584907e-16,7.782393e-17,-1.094350e-16,2.408285e-15,...,2.566442e-01,-0.035738,2.137626,-0.334837,-0.430115,-0.438246,-0.313487,-3.249996,-0.146664,0.296860


Unnamed: 0,time_step,u_in,R,C,area,u_in_pow2,u_in_shift_-2,u_in_diff_-2,time_step_shift_-2,time_step_diff_-2,...,time_step_diff_4,u_in_cumsum,time_step_cumsum,u_in_amax,u_in_std,u_in_mean,u_in_first,u_in_last,u_in_diffmax,u_in_diffmean
0,-1.706609,-0.544978,-1.124554,-0.354513,-0.935020,-0.238137,6.128857e-01,-1.651673e+00,-1.708917e+00,1.020982e+00,...,-1.790277e-15,-0.980891,-1.116536,0.048467,0.367378,0.364698,-0.553396,0.267353,0.290369,0.760954
1,-1.664958,0.014398,-1.124554,-0.354513,-0.933864,-0.180687,1.137916e+00,-1.549065e+00,-1.666179e+00,1.012356e+00,...,-1.790277e-15,-0.962745,-1.115526,0.048467,0.367378,0.364698,-0.553396,0.267353,0.043626,0.147852
2,-1.623282,0.545607,-1.124554,-0.354513,-0.929355,-0.019761,1.544151e+00,-1.324769e+00,-1.623482e+00,1.025706e+00,...,-1.790277e-15,-0.927367,-1.113507,0.048467,0.367378,0.364698,-0.553396,0.267353,-0.190693,-0.434376
3,-1.581604,1.035304,-1.124554,-0.354513,-0.919555,0.220380,1.876616e+00,-1.060282e+00,-1.580758e+00,1.030121e+00,...,-1.790277e-15,-0.876103,-1.110478,0.048467,0.367378,0.364698,-0.553396,0.267353,-0.406700,-0.971107
4,-1.539968,1.414199,-1.124554,-0.354513,-0.903357,0.466612,2.120740e+00,-8.376255e-01,-1.538032e+00,1.020777e+00,...,-1.053881e+00,-0.812548,-1.106440,0.048467,0.367378,0.364698,-0.553396,0.267353,-0.573832,-1.386393
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4023995,1.596511,-0.174948,-0.359072,-0.937525,0.367168,-0.212997,-1.590032e-01,-4.658669e-02,1.683030e+00,-4.998149e-01,...,6.806547e-01,-0.510729,1.922381,-0.027660,-0.551736,-0.843438,-0.156138,0.279443,0.048648,-0.186684
4023996,1.640554,-0.174615,-0.359072,-0.937525,0.428672,-0.212952,-1.587494e-01,-4.644606e-02,1.728091e+00,-4.664415e-01,...,7.232725e-01,-0.498714,2.003492,-0.027660,-0.551736,-0.843438,-0.156138,0.279443,0.048502,-0.187048
4023997,1.684448,-0.174336,-0.359072,-0.937525,0.491030,-0.212914,-1.585319e-01,-4.633274e-02,1.773832e+00,-7.012871e-01,...,6.008123e-01,-0.486691,2.085667,-0.027660,-0.551736,-0.843438,-0.156138,0.279443,0.048378,-0.187355
4023998,1.728391,-0.174099,-0.359072,-0.937525,0.554236,-0.212882,2.584907e-16,7.782393e-17,-1.094350e-16,2.408285e-15,...,5.266806e-01,-0.474660,2.168907,-0.027660,-0.551736,-0.843438,-0.156138,0.279443,0.048274,-0.187615


(None, None)

In [30]:
# Re-attach the categorical code and the id / grouping / mask (and target) columns to the scaled
# features. The concat is column-wise and aligns on the index, so all parts must share the same
# index here.
train_df = pd.concat([train_df, train_category, train[['id', 'breath_id', 'pressure', 'u_out']]], axis=1)
test_df = pd.concat([test_df, test_category, test[['id', 'breath_id', 'u_out']]], axis=1)

In [31]:
# Shrink both frames with the project helper (the logged reduction is 74.4% for each).
# NOTE(review): utils.reduce_mem_usage is defined outside this notebook. A reduction of this size
# suggests floats are downcast aggressively — confirm the precision left for the features and for
# the 'pressure' target is acceptable.
train_df = utils.reduce_mem_usage(train_df)
test_df = utils.reduce_mem_usage(test_df)

Mem. usage decreased from 2026.25 Mb to 518.07 Mb (74.4% reduction)
Mem. usage decreased from 1320.13 Mb to 337.71 Mb (74.4% reduction)


In [32]:
# Per-fold training, out-of-fold (OOF) validation and test inference.
# oof_total holds one prediction row per training row; sub_preds holds one column per trained fold.
oof_total = np.zeros((len(train), CFG.num_classes))
sub_preds = np.zeros((test.shape[0], len(CFG.folds)))
val_idxes = []
models = []
y = train['pressure']
groups = train['breath_id']
gkfold = model_selection.GroupKFold(n_splits=CFG.n_folds)
scores = []
input_dim = len(train_value_col)

for i, (trn_idx, val_idx) in enumerate(splitter.split(train_df, y, groups)):
    if i not in CFG.folds:
        continue

    # The splitter yields positional row numbers, so rows are selected by position.
    trn_df = train_df.iloc[trn_idx].reset_index(drop=True)
    val_df = train_df.iloc[val_idx].reset_index(drop=True)
    trn_y = y.values[trn_idx]
    val_y = y.values[val_idx]

    loaders = {
        phase: torchdata.DataLoader(
            VentilatorDataset(
                df_, train_value_col, train_category_col
            ),
            **CFG.loader_params[phase])  # type: ignore
        for phase, df_ in zip(["train", "valid", "test"], [trn_df, val_df, test_df])
    }

    model = RNNModel(
        input_dim=input_dim,
        lstm_dim=CFG.lstm_dim,
        dense_dim=CFG.dense_dim,
        logit_dim=CFG.logit_dim,
        num_classes=CFG.num_classes,
    )
    model_name = model.__class__.__name__

    learner = Learner(model)

    # loggers
    RUN_NAME = f'exp{str(CFG.exp_num)}'
    wandb.init(project='Ventilator-Pressure-Prediction', entity='sqrt4kaido', group=RUN_NAME, job_type=RUN_NAME + f'-fold-{i}')
    wandb.run.name = RUN_NAME + f'-fold-{i}'
    wandb_config = wandb.config
    wandb_config.model_name = model_name
    wandb.watch(model)

    # callbacks
    # NOTE(review): learner.current_epoch is read once, right here, so the epoch part of the
    # checkpoint file name is a constant — confirm whether Lightning's '{epoch}' template was meant.
    callbacks = []
    checkpoint_callback = ModelCheckpoint(
        monitor='Loss/val',
        mode='min',
        dirpath=OUTPUT_DIR,
        verbose=False,
        save_weights_only=True,
        filename=f'{model_name}-{learner.current_epoch}-{i}')
    callbacks.append(checkpoint_callback)
    # An EarlyStopping callback on 'Loss/val' (patience 10) was disabled for this run.

    loggers = []
    loggers.append(WandbLogger())

    trainer = pl.Trainer(
        logger=loggers,
        callbacks=callbacks,
        max_epochs=CFG.epochs,
        default_root_dir=OUTPUT_DIR,
        gpus=1,
        deterministic=True,
        benchmark=False,
        )

    trainer.fit(learner, train_dataloader=loaders['train'], val_dataloaders=loaders['valid'])
    print('train done.')

    #############
    # validation (to make oof)
    #############
    # Reload the best checkpoint; loading into the learner also updates `model`, which it wraps.
    checkpoint = torch.load(checkpoint_callback.best_model_path)
    learner.load_state_dict(checkpoint['state_dict'])

    model = model.to(device)
    oof_pred, oof_target = evaluate(model, loaders, phase="valid")
    models.append(model)

    # NOTE(review): this assumes the loader's breath order matches the row order of val_df.
    oof_score = get_score(oof_pred, oof_target, val_df['u_out'].values)
    scores.append(oof_score)
    # NOTE(review): predictions are divided by CFG.bias although no visible code multiplies the
    # target by it — confirm the stored OOF scale.
    oof_total[val_idx] = oof_pred.reshape(1, -1).T / CFG.bias
    val_idxes.append(val_idx)

    print('validate done.')
    print(f'fold = {i}, mae = {oof_score}')
    wandb.log({'CV_score': oof_score})

    #############
    # inference
    #############
    test_pred, _ = evaluate(model, loaders, phase="test")
    # sub_preds has one column per entry of CFG.folds, so address it by the fold's position
    # in that list rather than by the fold id (which overflows for e.g. folds = [2]).
    sub_preds[:, CFG.folds.index(i)] = test_pred

    print('inference done.')


init LSTM(516, 512, num_layers=2, batch_first=True, bidirectional=True)


[34m[1mwandb[0m: Currently logged in as: [33msqrt4kaido[0m (use `wandb login --relogin` to force relogin)


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type           | Params
---------------------------------------------
0 | model     | RNNModel       | 12.8 M
1 | criterion | VentilatorLoss | 0     
---------------------------------------------
12.8 M    Trainable params
0         Non-trainable params
12.8 M    Total params
51.145    Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

epoch = 0, custom_mae = 17.436227798461914


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

epoch = 0, custom_mae = 1.0892761945724487


Validating: 0it [00:00, ?it/s]

epoch = 1, custom_mae = 0.9069517254829407


Validating: 0it [00:00, ?it/s]

epoch = 2, custom_mae = 0.7759530544281006


Validating: 0it [00:00, ?it/s]

epoch = 3, custom_mae = 0.6964182257652283


Validating: 0it [00:00, ?it/s]

epoch = 4, custom_mae = 0.7107073664665222


Validating: 0it [00:00, ?it/s]

epoch = 5, custom_mae = 0.6859674453735352


Validating: 0it [00:00, ?it/s]

epoch = 6, custom_mae = 0.706102728843689


Validating: 0it [00:00, ?it/s]

epoch = 7, custom_mae = 0.5727066993713379


Validating: 0it [00:00, ?it/s]

epoch = 8, custom_mae = 0.5962947010993958


Validating: 0it [00:00, ?it/s]

epoch = 9, custom_mae = 0.5710928440093994


Validating: 0it [00:00, ?it/s]

epoch = 10, custom_mae = 0.5459699034690857


Validating: 0it [00:00, ?it/s]

epoch = 11, custom_mae = 0.5681695938110352


Validating: 0it [00:00, ?it/s]

epoch = 12, custom_mae = 0.4823494851589203


Validating: 0it [00:00, ?it/s]

epoch = 13, custom_mae = 0.4388710856437683


Validating: 0it [00:00, ?it/s]

epoch = 14, custom_mae = 0.49887505173683167


Validating: 0it [00:00, ?it/s]

epoch = 15, custom_mae = 0.441950261592865


Validating: 0it [00:00, ?it/s]

epoch = 16, custom_mae = 0.3892747759819031


Validating: 0it [00:00, ?it/s]

epoch = 17, custom_mae = 0.42014938592910767


Validating: 0it [00:00, ?it/s]

epoch = 18, custom_mae = 0.36463868618011475


Validating: 0it [00:00, ?it/s]

epoch = 19, custom_mae = 0.34462693333625793


Validating: 0it [00:00, ?it/s]

epoch = 20, custom_mae = 0.3451935648918152


Validating: 0it [00:00, ?it/s]

epoch = 21, custom_mae = 0.3276730179786682


Validating: 0it [00:00, ?it/s]

epoch = 22, custom_mae = 0.3229241371154785


Validating: 0it [00:00, ?it/s]

epoch = 23, custom_mae = 0.32012760639190674


Validating: 0it [00:00, ?it/s]

epoch = 24, custom_mae = 0.31651705503463745


Validating: 0it [00:00, ?it/s]

epoch = 25, custom_mae = 0.31613680720329285


Validating: 0it [00:00, ?it/s]

epoch = 26, custom_mae = 0.3161861300468445


Validating: 0it [00:00, ?it/s]

epoch = 27, custom_mae = 0.3154596984386444


Validating: 0it [00:00, ?it/s]

epoch = 28, custom_mae = 0.3187609910964966


Validating: 0it [00:00, ?it/s]

epoch = 29, custom_mae = 0.31787794828414917


Validating: 0it [00:00, ?it/s]

epoch = 30, custom_mae = 0.3361486494541168


Validating: 0it [00:00, ?it/s]

epoch = 31, custom_mae = 0.3308703601360321


Validating: 0it [00:00, ?it/s]

epoch = 32, custom_mae = 0.3271249234676361


Validating: 0it [00:00, ?it/s]

epoch = 33, custom_mae = 0.34672436118125916


Validating: 0it [00:00, ?it/s]

epoch = 34, custom_mae = 0.39460837841033936


Validating: 0it [00:00, ?it/s]

epoch = 35, custom_mae = 0.41318583488464355


Validating: 0it [00:00, ?it/s]

epoch = 36, custom_mae = 0.47093573212623596


Validating: 0it [00:00, ?it/s]

epoch = 37, custom_mae = 0.4047570526599884


Validating: 0it [00:00, ?it/s]

epoch = 38, custom_mae = 0.39591583609580994


Validating: 0it [00:00, ?it/s]

epoch = 39, custom_mae = 0.530876100063324


Validating: 0it [00:00, ?it/s]

epoch = 40, custom_mae = 0.511234700679779


Validating: 0it [00:00, ?it/s]

epoch = 41, custom_mae = 0.39931514859199524


Validating: 0it [00:00, ?it/s]

epoch = 42, custom_mae = 0.5063233971595764


Validating: 0it [00:00, ?it/s]

epoch = 43, custom_mae = 0.44342654943466187


Validating: 0it [00:00, ?it/s]

epoch = 44, custom_mae = 0.40235504508018494


Validating: 0it [00:00, ?it/s]

epoch = 45, custom_mae = 0.4384099543094635


Validating: 0it [00:00, ?it/s]

epoch = 46, custom_mae = 0.4694070518016815


Validating: 0it [00:00, ?it/s]

epoch = 47, custom_mae = 0.41220536828041077


Validating: 0it [00:00, ?it/s]

epoch = 48, custom_mae = 0.42521166801452637


Validating: 0it [00:00, ?it/s]

epoch = 49, custom_mae = 0.4015306234359741


Validating: 0it [00:00, ?it/s]

epoch = 50, custom_mae = 0.4447701871395111


Validating: 0it [00:00, ?it/s]

epoch = 51, custom_mae = 0.4231811463832855


Validating: 0it [00:00, ?it/s]

epoch = 52, custom_mae = 0.337711900472641


Validating: 0it [00:00, ?it/s]

epoch = 53, custom_mae = 0.5009693503379822


Validating: 0it [00:00, ?it/s]

epoch = 54, custom_mae = 0.36634138226509094


Validating: 0it [00:00, ?it/s]

epoch = 55, custom_mae = 0.34635433554649353


Validating: 0it [00:00, ?it/s]

epoch = 56, custom_mae = 0.4616020917892456


Validating: 0it [00:00, ?it/s]

epoch = 57, custom_mae = 0.3193846642971039


Validating: 0it [00:00, ?it/s]

epoch = 58, custom_mae = 0.312192440032959


Validating: 0it [00:00, ?it/s]

epoch = 59, custom_mae = 0.38049593567848206


Validating: 0it [00:00, ?it/s]

epoch = 60, custom_mae = 0.2860766351222992


Validating: 0it [00:00, ?it/s]

epoch = 61, custom_mae = 0.30185550451278687


Validating: 0it [00:00, ?it/s]

epoch = 62, custom_mae = 0.2816149890422821


Validating: 0it [00:00, ?it/s]

epoch = 63, custom_mae = 0.2674347758293152


Validating: 0it [00:00, ?it/s]

epoch = 64, custom_mae = 0.2622758746147156


Validating: 0it [00:00, ?it/s]

epoch = 65, custom_mae = 0.2529735267162323


Validating: 0it [00:00, ?it/s]

epoch = 66, custom_mae = 0.3323827385902405


Validating: 0it [00:00, ?it/s]

epoch = 67, custom_mae = 0.23747025430202484


Validating: 0it [00:00, ?it/s]

epoch = 68, custom_mae = 0.23042723536491394


Validating: 0it [00:00, ?it/s]

epoch = 69, custom_mae = 0.22974717617034912


Validating: 0it [00:00, ?it/s]

epoch = 70, custom_mae = 0.2240363210439682


Validating: 0it [00:00, ?it/s]

epoch = 71, custom_mae = 0.22429822385311127


Validating: 0it [00:00, ?it/s]

epoch = 72, custom_mae = 0.22134141623973846


Validating: 0it [00:00, ?it/s]

epoch = 73, custom_mae = 0.2214776873588562


Validating: 0it [00:00, ?it/s]

epoch = 74, custom_mae = 0.220013827085495


Validating: 0it [00:00, ?it/s]

epoch = 75, custom_mae = 0.21999038755893707


Validating: 0it [00:00, ?it/s]

epoch = 76, custom_mae = 0.2198563665151596


Validating: 0it [00:00, ?it/s]

epoch = 77, custom_mae = 0.2200171798467636


Validating: 0it [00:00, ?it/s]

epoch = 78, custom_mae = 0.2200097143650055


Validating: 0it [00:00, ?it/s]

epoch = 79, custom_mae = 0.22082501649856567


Validating: 0it [00:00, ?it/s]

epoch = 80, custom_mae = 0.22359073162078857


Validating: 0it [00:00, ?it/s]

epoch = 81, custom_mae = 0.22896771132946014


Validating: 0it [00:00, ?it/s]

epoch = 82, custom_mae = 0.23393316566944122


Validating: 0it [00:00, ?it/s]

epoch = 83, custom_mae = 0.23215141892433167


Validating: 0it [00:00, ?it/s]

epoch = 84, custom_mae = 0.23663198947906494


Validating: 0it [00:00, ?it/s]

epoch = 85, custom_mae = 0.24491527676582336


Validating: 0it [00:00, ?it/s]

epoch = 86, custom_mae = 0.2435573786497116


Validating: 0it [00:00, ?it/s]

epoch = 87, custom_mae = 0.2509390711784363


Validating: 0it [00:00, ?it/s]

epoch = 88, custom_mae = 0.2534922957420349


Validating: 0it [00:00, ?it/s]

epoch = 89, custom_mae = 0.27384576201438904


Validating: 0it [00:00, ?it/s]

epoch = 90, custom_mae = 0.2893674671649933


Validating: 0it [00:00, ?it/s]

epoch = 91, custom_mae = 0.2805693745613098


Validating: 0it [00:00, ?it/s]

epoch = 92, custom_mae = 0.4176943600177765


Validating: 0it [00:00, ?it/s]

epoch = 93, custom_mae = 0.42692339420318604


Validating: 0it [00:00, ?it/s]

epoch = 94, custom_mae = 0.3047839105129242


Validating: 0it [00:00, ?it/s]

epoch = 95, custom_mae = 0.41017746925354004


Validating: 0it [00:00, ?it/s]

epoch = 96, custom_mae = 0.45210134983062744


Validating: 0it [00:00, ?it/s]

epoch = 97, custom_mae = 0.34834474325180054


Validating: 0it [00:00, ?it/s]

epoch = 98, custom_mae = 0.3031633496284485


Validating: 0it [00:00, ?it/s]

epoch = 99, custom_mae = 0.3047637939453125


Validating: 0it [00:00, ?it/s]

epoch = 100, custom_mae = 0.3665451407432556


Validating: 0it [00:00, ?it/s]

epoch = 101, custom_mae = 0.31160396337509155


Validating: 0it [00:00, ?it/s]

epoch = 102, custom_mae = 0.32052600383758545


Validating: 0it [00:00, ?it/s]

epoch = 103, custom_mae = 0.3238796591758728


Validating: 0it [00:00, ?it/s]

epoch = 104, custom_mae = 0.30396798253059387


Validating: 0it [00:00, ?it/s]

epoch = 105, custom_mae = 0.288652241230011


Validating: 0it [00:00, ?it/s]

epoch = 106, custom_mae = 0.26940852403640747


Validating: 0it [00:00, ?it/s]

epoch = 107, custom_mae = 0.26926666498184204


Validating: 0it [00:00, ?it/s]

epoch = 108, custom_mae = 0.296410471200943


Validating: 0it [00:00, ?it/s]

epoch = 109, custom_mae = 0.27729231119155884


Validating: 0it [00:00, ?it/s]

epoch = 110, custom_mae = 0.8940730690956116


Validating: 0it [00:00, ?it/s]

epoch = 111, custom_mae = 0.2586762309074402


Validating: 0it [00:00, ?it/s]

epoch = 112, custom_mae = 0.25423142313957214


Validating: 0it [00:00, ?it/s]

epoch = 113, custom_mae = 0.23234768211841583


Validating: 0it [00:00, ?it/s]

epoch = 114, custom_mae = 0.23018398880958557


Validating: 0it [00:00, ?it/s]

epoch = 115, custom_mae = 0.22146634757518768


Validating: 0it [00:00, ?it/s]

epoch = 116, custom_mae = 0.21861086785793304


Validating: 0it [00:00, ?it/s]

epoch = 117, custom_mae = 0.21601440012454987


Validating: 0it [00:00, ?it/s]

epoch = 118, custom_mae = 0.21319963037967682


Validating: 0it [00:00, ?it/s]

epoch = 119, custom_mae = 0.21084626019001007


Validating: 0it [00:00, ?it/s]

epoch = 120, custom_mae = 0.20997869968414307


Validating: 0it [00:00, ?it/s]

epoch = 121, custom_mae = 0.20845463871955872


Validating: 0it [00:00, ?it/s]

epoch = 122, custom_mae = 0.20814476907253265
train done.




KeyboardInterrupt: 

In [None]:
# Out-of-fold (OOF) evaluation: compute the MAE via get_score, write the OOF
# predictions to OUTPUT_DIR / 'oof<exp_num>.csv', and display the frame.
# Both branches assign `oof_score`, because the W&B summary cell logs that
# name as the CV score.
if len(CFG.folds) != CFG.n_folds:
    # CFG.folds does not list every fold: score the predictions left in
    # oof_pred / oof_target / val_df by the training loop.
    # NOTE(review): presumably these hold the single trained fold — confirm.
    oof_score = get_score(oof_pred, oof_target, val_df['u_out'].values)
    print(f'MAE {oof_score}')

    # First column of train only (presumably 'id' — TODO confirm), restricted to
    # the rows in val_idxes[0]. The explicit .copy() makes the column assignment
    # below write to an independent frame and avoids SettingWithCopyWarning.
    oof_df = train.iloc[val_idxes[0], :1].copy()
    oof_df['pressure'] = oof_pred
    oof_df.to_csv(OUTPUT_DIR / f'oof{CFG.exp_num}.csv', index=False)
else:
    # Every fold was run: score the OOF predictions accumulated in oof_total.
    oof_score = get_score(y, oof_total, train['u_out'].values)
    score = oof_score  # original name kept for any later cell that reads it
    print(f'MAE {score}: folds: {scores}')

    oof_df = pd.DataFrame({'id': train['id'].values, 'pressure': oof_total.reshape(-1)})
    oof_df.to_csv(OUTPUT_DIR / f'oof{CFG.exp_num}.csv', index=False)
oof_df

In [None]:
# Build the submission file: load the sample submission, replace its 'pressure'
# column with the mean of sub_preds along axis 1, write it to
# OUTPUT_DIR / 'sub<exp_num>.csv', and display the result.
# NOTE(review): assumes sub_preds is 2-D with one column per trained model/fold — confirm.
sub = pd.read_csv(DATA_DIR / 'sample_submission.csv').assign(
    pressure=np.mean(sub_preds, axis=1),
)
sub.to_csv(
    OUTPUT_DIR / f'sub{CFG.exp_num}.csv',
    index=False,
)
sub

In [None]:
# Open a separate W&B run (job_type 'summary', grouped under RUN_NAME), rename it
# to 'summary', log the CV score held in `oof_score`, and close the run.
summary_run = wandb.init(
    project='Ventilator-Pressure-Prediction',
    entity='sqrt4kaido',
    group=RUN_NAME,
    job_type='summary',
)
summary_run.name = 'summary'
summary_run.log({'CV_score': oof_score})
# wandb.save(utils.get_notebook_path())
summary_run.finish()

Exception in thread Thread-8:
Traceback (most recent call last):
  File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.8/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/home/user/.local/lib/python3.8/site-packages/wandb/sdk/wandb_run.py", line 197, in check_network_status
    status_response = self._interface.communicate_network_status()
  File "/home/user/.local/lib/python3.8/site-packages/wandb/sdk/interface/interface.py", line 749, in communicate_network_status
    resp = self._communicate(req, timeout=timeout, local=True)
  File "/home/user/.local/lib/python3.8/site-packages/wandb/sdk/interface/interface.py", line 539, in _communicate
    return self._communicate_async(rec, local=local).get(timeout=timeout)
  File "/home/user/.local/lib/python3.8/site-packages/wandb/sdk/interface/interface.py", line 544, in _communicate_async
    raise Exception("The wandb backend process has shutdown")