In [1]:
import sys
from time import time
import numpy as np
import pandas as pd
from pathlib import Path
import lightgbm as lgb
import matplotlib.pyplot as plt 
import seaborn as sns
from tqdm import tqdm
import copy
import wandb
from collections import OrderedDict

from sklearn.metrics import mean_absolute_error
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as torchdata

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.loggers import WandbLogger


In [2]:
sys.path.append('../../src/')
import utils as utils
from utils import Timer

In [3]:
class CFG:
    """Experiment configuration, read as class attributes by the rest of the notebook."""

    # Passed to utils.set_seed().
    seed = 42
    # Experiment number; used to build the wandb run/group name.
    exp_num = 23
    # Selects between the local and the "../input" data/output directories.
    local = True
    # Number of folds given to the GroupKFold object built in the split cell.
    n_folds = 5
    # Fold indices that are actually trained; other folds are skipped.
    folds = [0]
    # NOTE(review): not referenced in the visible notebook code.
    debug = False
    # Divisor applied to the out-of-fold predictions before they are stored.
    bias = 1000
    # max_epochs for each pl.Trainer.
    epochs = 100

    
    ######################
    # Dataset #
    ######################
    # Per-phase list of transform configs ({"name": ..., "params": ...}) consumed
    # by get_transforms(); names are looked up in globals(), so the empty names
    # below resolve to no transform at all.
    transforms = {
        "train": [{"name": ""}],
        "valid": [{"name": ""}],
        "test": [{"name": ""}]
    }

    ######################
    # Loaders #
    ######################
    # Keyword arguments unpacked into torch.utils.data.DataLoader for each phase.
    loader_params = {
        "train": {
            'batch_size': 128,
            'shuffle': True,
            'num_workers': 8,
            'pin_memory': True,
            'drop_last': True,
        },
        "valid": {
            'batch_size': 32,
            'shuffle': False,
            'num_workers': 8,
            'pin_memory': True,
            'drop_last': False,
        },
        "test": {
            'batch_size': 32,
            'shuffle': False,
            'num_workers': 8,
            'pin_memory': True,
            'drop_last': False,
        }
    }

    ######################
    # Split #
    ######################
    # Name of the sklearn.model_selection splitter class and its constructor kwargs.
    # NOTE(review): n_splits duplicates n_folds above; keep the two in sync.
    split = "GroupKFold"
    split_params = {
        "n_splits": 5,
    }

    ######################
    # Model #
    ######################
    # NOTE(review): the training cell passes len(train_value_col) as input_dim,
    # so this value is not the one the model is built with.
    input_dim = 5

    # Layer widths and output size forwarded to RNNModel.
    dense_dim = 512
    lstm_dim = 512
    logit_dim = 512
    num_classes = 1

    ######################
    # Criterion #
    ######################
#     loss_name = "rmspe_loss"
#     loss_params: dict = {}

    ######################
    # Optimizer #
    ######################
    # Optimizer class name (custom registry first, then torch.optim) and its kwargs.
    optimizer_name = "AdamW"
    optimizer_params = {
        "lr": 0.001,
        'weight_decay': 1e-6
    }

    ######################
    # Scheduler #
    ######################
    # Class name looked up in torch.optim.lr_scheduler (None disables it) and its kwargs.
    scheduler_name = "CosineAnnealingLR"
    scheduler_params = {
        'T_max': 25, 
        'eta_min': 1e-6
    }

In [4]:
# Seed via the project helper; presumably covers python/numpy/torch RNGs — confirm in utils.set_seed.
utils.set_seed(CFG.seed)

In [5]:
# Resolve the input and output directories for the current environment.
# NOTE(review): the local data path is an absolute, user-specific path.
DATA_DIR, OUTPUT_DIR = (
    (
        Path("/home/knikaido/work/Ventilator-Pressure-Prediction/data/ventilator-pressure-prediction"),
        Path('./output/'),
    )
    if CFG.local
    else (
        Path("../input/ventilator-pressure-prediction"),
        Path(''),
    )
)

In [6]:
def get_transforms(phase: str):
    """Build the transform pipeline configured in ``CFG.transforms`` for ``phase``.

    Each config entry is a dict with a ``"name"`` and optional ``"params"``.
    The name is looked up in this module's globals; entries whose name does
    not resolve are skipped.

    Returns:
        A ``Compose`` over the instantiated transforms, or ``None`` when no
        transforms are configured or none of the names resolve.
    """
    all_confs = CFG.transforms
    phase_confs = None if all_confs is None else all_confs[phase]
    if phase_confs is None:
        return None

    built = []
    for conf in phase_confs:
        name = conf["name"]
        kwargs = conf.get("params")
        if kwargs is None:
            kwargs = {}
        factory = globals().get(name)
        if factory is None:
            continue
        built.append(factory(**kwargs))

    return Compose(built) if built else None
        
        
class Normalize:
    """Transform that rescales an array so its largest absolute value is 1."""

    def __call__(self, y: np.ndarray):
        """Return ``y`` divided by ``max(|y|)`` as a Fortran-ordered array.

        Empty and all-zero inputs are returned unscaled: the former has no
        maximum and the latter would otherwise divide by zero and yield NaNs.
        """
        if y.size == 0:
            return np.asfortranarray(y)
        max_vol = np.abs(y).max()
        if max_vol == 0:
            return np.asfortranarray(y)
        y_vol = y * 1 / max_vol
        return np.asfortranarray(y_vol)


class Compose:
    """Chain several transforms into a single callable."""

    def __init__(self, transforms: list):
        # Callables applied in list order.
        self.transforms = transforms

    def __call__(self, y: np.ndarray):
        """Feed ``y`` through every transform in turn and return the result."""
        result = y
        for step in self.transforms:
            result = step(result)
        return result

In [7]:
def compute_metric(preds, trues, u_outs):
    """Mean absolute error over the positions where ``u_outs`` is 0.

    Every element is weighted by ``1 - u_outs``; the weighted absolute errors
    are summed and divided by the sum of the weights. All three arrays must
    share the same shape.
    """
    weights = 1 - u_outs
    shapes = (trues.shape, preds.shape, weights.shape)
    assert trues.shape == preds.shape and weights.shape == trues.shape, shapes

    weighted_abs_err = weights * np.abs(trues - preds)
    return weighted_abs_err.sum() / weights.sum()


class VentilatorLoss(nn.Module):
    """Masked MAE loss that directly optimizes the competition metric.

    Implemented in ``forward`` rather than by overriding ``__call__`` so that
    ``nn.Module``'s own call machinery (e.g. registered hooks) keeps working.
    Calling the instance as ``loss(preds, y, u_out)`` is unchanged.
    """

    def forward(self, preds, y, u_out):
        """Return the absolute error averaged over positions where ``u_out`` is 0.

        The reduction runs over the last dimension, so 1-D inputs give a scalar.
        """
        w = 1 - u_out
        mae = w * (y - preds).abs()
        # NOTE(review): this is NaN if every position along the last dim is masked out.
        mae = mae.sum(-1) / w.sum(-1)

        return mae

In [8]:
def get_criterion():
    """Create the loss object used for both training and validation."""
    criterion = VentilatorLoss()
    return criterion

In [9]:
# Registry of custom optimizer classes, keyed by name; checked before torch.optim.
__OPTIMIZERS__ = {}


def get_optimizer(model: nn.Module):
    """Instantiate the optimizer named by ``CFG.optimizer_name`` for ``model``.

    Names are resolved in ``__OPTIMIZERS__`` first and in ``torch.optim``
    otherwise; ``CFG.optimizer_params`` are passed as keyword arguments.

    NOTE(review): the "SAM" branch reads ``CFG.base_optimizer`` and calls
    ``SAM``; neither is defined in the visible part of this notebook.
    """
    def _resolve(name):
        # Custom registry wins over torch.optim.
        custom = __OPTIMIZERS__.get(name)
        return custom if custom is not None else getattr(optim, name)

    name = CFG.optimizer_name
    if name == "SAM":
        base_optimizer = _resolve(CFG.base_optimizer)
        return SAM(model.parameters(), base_optimizer, **CFG.optimizer_params)

    optimizer_cls = _resolve(name)
    return optimizer_cls(model.parameters(), **CFG.optimizer_params)


def get_scheduler(optimizer):
    """Instantiate the LR scheduler named by ``CFG.scheduler_name``.

    Returns ``None`` when no scheduler name is configured; otherwise the class
    is taken from ``torch.optim.lr_scheduler`` and built with
    ``CFG.scheduler_params``.
    """
    name = CFG.scheduler_name
    if name is None:
        return None

    scheduler_cls = getattr(optim.lr_scheduler, name)
    return scheduler_cls(optimizer, **CFG.scheduler_params)

In [10]:
# validation
# Build the cross-validation splitter: CFG.split names a class in
# sklearn.model_selection and CFG.split_params are its constructor kwargs.
splitter = getattr(model_selection, CFG.split)(**CFG.split_params)

In [11]:
class VentilatorDataset(torchdata.Dataset):
    """Dataset that yields one breath (all rows sharing a ``breath_id``) per item.

    Args:
        df: frame containing ``breath_id``, ``u_out``, the feature columns and,
            optionally, ``pressure``.
        train_value_col: names of the continuous feature columns.
        train_category_col: names of the categorical feature columns.
    """

    def __init__(self, df, train_value_col, train_category_col):
        if "pressure" not in df.columns:
            # assign() returns a new frame, so the caller's DataFrame is not
            # modified when a placeholder target is added (e.g. for test data).
            df = df.assign(pressure=0)
        self.df = df
        # Maps each breath_id to the index labels of its rows.
        self.groups = df.groupby('breath_id').groups
        self.keys = list(self.groups.keys())
        self.train_value_col = train_value_col
        self.train_category_col = train_category_col

    def __len__(self):
        """Number of breaths."""
        return len(self.groups)

    def __getitem__(self, idx):
        """Return the arrays of the ``idx``-th breath as a dict.

        Keys: ``input_value`` (float32), ``input_category`` (int),
        ``u_out`` (float32) and ``p`` (float32 pressure).
        """
        indexes = self.groups[self.keys[idx]]
        # groupby().groups holds index *labels*, so select by label; positional
        # .iloc is only equivalent when the frame has a default RangeIndex.
        df_ = self.df.loc[indexes]

        input_value = df_[self.train_value_col].values
        input_category = df_[self.train_category_col].values

        u_out_ = df_['u_out'].values
        p_ = df_['pressure'].values

        data = {
            "input_value": input_value.astype(np.float32),
            "input_category": input_category.astype(int),
            "u_out": u_out_.astype(np.float32),
            "p": p_.astype(np.float32),
        }

        return data

In [12]:
class RNNModel(nn.Module):
    """Per-time-step regressor: MLP -> Conv1d stack -> bidirectional LSTM -> MLP head.

    Args:
        input_dim: number of continuous features per time step.
        lstm_dim: hidden size of each LSTM direction.
        dense_dim: output width of the feature MLP.
        logit_dim: hidden width of the output head.
        num_classes: number of outputs per time step.
        seq_len: number of time steps per sequence; the LayerNorm layers after
            each convolution normalize over this dimension. Defaults to 80,
            the value that was previously hard-coded.
    """

    def __init__(
        self,
        input_dim=4,
        lstm_dim=256,
        dense_dim=256,
        logit_dim=256,
        num_classes=1,
        seq_len=80,
    ):
        super().__init__()

        rc_emb_dim = 4
        # NOTE(review): padding_idx=0 pins the embedding of category 0 to a zero
        # vector that receives no gradient, while the R_C mapping in this
        # notebook uses 0 for a real category ('5_10') — confirm this is intended.
        self.rc_emb = nn.Embedding(9, rc_emb_dim, padding_idx=0)

        self.mlp = nn.Sequential(
            nn.Linear(input_dim, dense_dim // 2),
            nn.LayerNorm(dense_dim // 2),
            nn.ReLU(),
            nn.Linear(dense_dim // 2, dense_dim),
            nn.LayerNorm(dense_dim),
            nn.ReLU(),
        )

        # Convolutions run over (batch, channels, time); LayerNorm therefore
        # normalizes along the time axis.
        channels = dense_dim + rc_emb_dim
        self.conv_basic = nn.Sequential(
            nn.Conv1d(in_channels=channels, out_channels=channels, kernel_size=3, padding=1),
            nn.LayerNorm(seq_len),
            nn.ReLU(),
            nn.Conv1d(in_channels=channels, out_channels=channels, kernel_size=3, padding=1),
            nn.LayerNorm(seq_len),
            nn.ReLU(),
        )

        self.lstm = nn.LSTM(channels, lstm_dim, num_layers=2, batch_first=True, bidirectional=True)

        self.logits = nn.Sequential(
            nn.Linear(lstm_dim * 2, logit_dim),
            nn.ReLU(),
            nn.Linear(logit_dim, num_classes),
        )

        # Recurrent-layer initialization (credited to nakama in the original
        # comment): orthogonal for weight matrices, normal for bias vectors.
        for _, module in self.named_modules():
            if isinstance(module, (nn.LSTM, nn.GRU)):
                print(f'init {module}')
                for param in module.parameters():
                    if len(param.shape) >= 2:
                        nn.init.orthogonal_(param.data)
                    else:
                        nn.init.normal_(param.data)

    def forward(self, cont_seq_x, cate_seq_x):
        """Predict ``num_classes`` values for every time step.

        Args:
            cont_seq_x: continuous features, (batch, seq_len, input_dim).
            cate_seq_x: integer category ids (values below 9) whose embeddings
                reshape to (batch, seq_len, 4), e.g. (batch, seq_len, 1).

        Returns:
            Tensor of shape (batch, seq_len, num_classes).
        """
        bs, seq_len = cont_seq_x.size(0), cont_seq_x.size(1)
        rc_emb = self.rc_emb(cate_seq_x).view(bs, seq_len, -1)

        features = self.mlp(cont_seq_x)
        features = torch.cat((rc_emb, features), 2)

        # Conv1d expects channels first; the LSTM (batch_first) expects time first.
        features = self.conv_basic(features.permute([0, 2, 1]))
        features, _ = self.lstm(features.permute([0, 2, 1]))

        pred = self.logits(features)
        return pred

In [13]:
# Learner class (pytorch-lightning)
class Learner(pl.LightningModule):
    """LightningModule wrapping ``model`` with the masked-MAE criterion."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.criterion = get_criterion()

    def _forward_loss(self, batch):
        """Run the model on one batch and return ``(predictions, loss)``."""
        output = self.model(batch['input_value'], batch['input_category'])
        # Everything is flattened, so the loss is one masked mean over the whole batch.
        loss = self.criterion(output.view(-1), batch['p'].view(-1), batch['u_out'].view(-1))
        return output, loss

    def training_step(self, batch, batch_idx):
        """Return the training loss for one batch."""
        _, loss = self._forward_loss(batch)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the validation loss and return the tensors needed for epoch-level scoring."""
        preds, loss = self._forward_loss(batch)

        self.log('Loss/val', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True)

        return OrderedDict({
            "targets": batch['p'].detach(),
            "preds": preds.detach(),
            "u_outs": batch['u_out'].detach(),
            "loss": loss.detach(),
        })

    def validation_epoch_end(self, outputs):
        """Compute the competition metric over all validation batches and log it."""
        targets = torch.cat([o["targets"].view(-1) for o in outputs]).cpu().numpy()
        preds = torch.cat([o["preds"].view(-1) for o in outputs]).cpu().numpy()
        u_outs = torch.cat([o["u_outs"].view(-1) for o in outputs]).cpu().numpy()

        score = get_score(preds, targets, u_outs)
        self.log('custom_mae/val', score, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        print(f'epoch = {self.current_epoch}, custom_mae = {score}')

    def configure_optimizers(self):
        """Return the optimizer, plus the LR scheduler when one is configured."""
        optimizer = get_optimizer(self.model)
        scheduler = get_scheduler(optimizer)
        if scheduler is None:
            # get_scheduler() returns None when CFG.scheduler_name is None;
            # Lightning rejects a dict whose "lr_scheduler" entry is None.
            return optimizer
        return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "Loss/val"}

In [14]:
# Device used by evaluate() and for the final model.to(); CUDA when available, else CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [15]:
def get_score(y_pred, y_true, u_outs):
    """Score predictions with the competition metric (see ``compute_metric``)."""
    return compute_metric(y_pred, y_true, u_outs)


def to_np(input):
    """Detach a tensor, move it to the CPU and convert it to a numpy array."""
    return input.detach().cpu().numpy()


# oof
def evaluate(model, loaders, phase):
    """Run ``model`` over ``loaders[phase]`` without gradients.

    Inputs are moved to the device the model's parameters live on (falling
    back to the global ``device`` for a parameter-free model). The model is
    put in eval mode for the pass and its previous train/eval mode is restored
    afterwards, even if inference raises.

    Returns:
        ``(predictions, targets)`` as flattened 1-D numpy arrays; targets are
        the batches' ``"p"`` entries.
    """
    was_training = model.training
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    model.eval()
    pred_list = []
    target_list = []
    try:
        with torch.no_grad():
            for batch in loaders[phase]:
                input_value = batch['input_value'].to(target_device)
                input_category = batch['input_category'].to(target_device)
                output = model(input_value, input_category)
                pred_list.append(to_np(output))
                target_list.append(to_np(batch['p']))
    finally:
        model.train(was_training)

    pred_list = np.concatenate(pred_list).reshape(-1)
    target_list = np.concatenate(target_list).reshape(-1)
    return pred_list, target_list

In [24]:
# Load the competition train and test tables.
train = pd.read_csv(DATA_DIR / 'train.csv')
test = pd.read_csv(DATA_DIR / 'test.csv')
# Pseudo-labelling: the predictions saved by experiment 20 become the
# "pressure" of the test rows, which are appended to train further down.
sub_20 = pd.read_csv('../20/output/sub20.csv')
# Assignment aligns on the index — presumably sub20.csv has the same row
# order as test.csv; TODO confirm.
test['pressure'] = sub_20['pressure']
# Test rows with R == 50 are excluded from the pseudo-labelled set.
test = test[test['R'] != 50]
# The trailing tuple of display() return values is what renders as "(None, None)".
display(train), display(test)

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure
0,1,1,20,50,0.000000,0.083334,0,5.837492
1,2,1,20,50,0.033652,18.383041,0,5.907794
2,3,1,20,50,0.067514,22.509278,0,7.876254
3,4,1,20,50,0.101542,22.808822,0,11.742872
4,5,1,20,50,0.135756,25.355850,0,12.234987
...,...,...,...,...,...,...,...,...
6035995,6035996,125749,50,10,2.504603,1.489714,1,3.869032
6035996,6035997,125749,50,10,2.537961,1.488497,1,3.869032
6035997,6035998,125749,50,10,2.571408,1.558978,1,3.798729
6035998,6035999,125749,50,10,2.604744,1.272663,1,4.079938


Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure
0,1,0,5,20,0.000000,0.000000,0,6.244868
1,2,0,5,20,0.031904,7.515046,0,5.935369
2,3,0,5,20,0.063827,14.651675,0,7.033539
3,4,0,5,20,0.095751,21.230610,0,7.486939
4,5,0,5,20,0.127644,26.320956,0,9.006235
...,...,...,...,...,...,...,...,...
4023995,4023996,125748,20,10,2.530117,4.971245,1,36.236553
4023996,4023997,125748,20,10,2.563853,4.975709,1,34.818909
4023997,4023998,125748,20,10,2.597475,4.979468,1,35.760990
4023998,4023999,125748,20,10,2.631134,4.982648,1,36.524254


(None, None)

In [25]:
# Build the per-fold train/validation row positions, grouped by breath_id so
# that all rows of a breath fall on the same side of a split.
y = train['pressure']
groups = train['breath_id']
# NOTE(review): gkfold is not used below; the split comes from `splitter`,
# which is configured through CFG.split / CFG.split_params.
gkfold = model_selection.GroupKFold(n_splits=CFG.n_folds)
trn_idxes = []
val_idxes = []
for i, (trn_idx, val_idx) in enumerate(splitter.split(train, y, groups)):
    trn_idxes.append(trn_idx)
    val_idxes.append(val_idx)

In [26]:
# Row count of the real training data, recorded before pseudo-labelled test rows are appended.
ori_train_len = len(train)

In [27]:
# Append the pseudo-labelled test rows after the real training rows; the index
# is reset so the fold indices computed earlier still address the original rows.
train = pd.concat([train, test]).reset_index(drop=True)
y = train['pressure']
# Re-read the test set so inference runs on the unfiltered file without the pseudo-label column.
test = pd.read_csv(DATA_DIR / 'test.csv')

In [28]:
# Keep the train indices of the real data only; the training cell uses them for its second stage.
ori_trn_idxes = copy.deepcopy(trn_idxes)

In [30]:
# Row positions of the pseudo-labelled test rows (everything after the real train rows).
pseudo_idx = np.arange(ori_train_len, len(train))
# Rebuild from ori_trn_idxes instead of extending in place, so re-running this
# cell never appends the pseudo rows twice, and cover every fold selected in
# CFG.folds rather than only fold 0.
for fold in CFG.folds:
    trn_idxes[fold] = np.hstack((ori_trn_idxes[fold], pseudo_idx))

In [31]:
def get_raw_features(input_df, dataType = 'train'):
    """Return the columns used as-is: time_step, u_in, R and C.

    ``dataType`` is accepted for a uniform processor signature and is unused.
    """
    raw_columns = ['time_step', 'u_in', 'R', 'C']
    return input_df.loc[:, raw_columns]

In [32]:
def get_category_features(input_df, dataType = 'train'):
    """Encode every (R, C) pair as one integer category in column ``R_C``.

    The pair is formatted as ``"<R>_<C>"`` and mapped to 0..8; pairs missing
    from the table become NaN. ``dataType`` is unused.
    """
    rc_map = {'5_10': 0, '5_20': 1, '5_50': 2, '20_10': 3, '20_20': 4, '20_50': 5, '50_10': 6, '50_20': 7, '50_50': 8}

    rc_key = input_df['R'].astype(str) + '_' + input_df['C'].astype(str)
    return rc_key.map(rc_map).to_frame('R_C')

In [33]:
def get_diff_shift_features(input_df, dataType = 'train'):
    """Per-breath lag/lead values of ``u_in`` and ``time_step`` and their differences.

    For every offset ``i`` in ``[-2, -1, 1, 2, 3, 4]`` and each of the two
    columns this adds:

    * ``<col>_shift_<i>``: the value ``i`` rows earlier within the same breath
      (later for negative ``i``), NaN where no such row exists;
    * ``<col>_diff_<i>``: the current value minus that shifted value.

    The difference is taken directly against the group-wise shift, which does
    not depend on the frame having a default RangeIndex or on a breath's rows
    being contiguous. ``dataType`` is unused.

    Returns:
        A frame with only the new columns, indexed like ``input_df``.
    """
    b_id_gby = input_df.groupby(['breath_id'])
    shift_idx = [-2, -1, 1, 2, 3, 4]

    features = {}
    for i in shift_idx:
        # Column order (u_in shift, u_in diff, time_step shift, time_step diff)
        # is kept stable because downstream code selects features by position-free
        # names but displays them in this order.
        for col in ('u_in', 'time_step'):
            shifted = b_id_gby[col].shift(i)
            features[f'{col}_shift_{i}'] = shifted
            features[f'{col}_diff_{i}'] = input_df[col] - shifted

    return pd.DataFrame(features, index=input_df.index)

In [34]:
def get_cum_features(input_df, dataType = 'train'):
    """Running sums of ``u_in`` and ``time_step`` within each breath.

    Returns a frame with columns ``u_in_cumsum`` and ``time_step_cumsum``,
    indexed like ``input_df``. ``dataType`` is unused.
    """
    per_breath = input_df.groupby(['breath_id'])

    return pd.DataFrame({
        'u_in_cumsum': per_breath['u_in'].cumsum(),
        'time_step_cumsum': per_breath['time_step'].cumsum(),
    })

In [35]:
def get_simple_calc_features(input_df, dataType = 'train'):
    """Column ``area``: the per-breath running sum of ``time_step * u_in``.

    ``dataType`` is unused.
    """
    weighted = input_df['time_step'] * input_df['u_in']
    running = weighted.groupby(input_df['breath_id']).cumsum()
    return running.to_frame('area')

In [36]:
def get_agg_features(input_df, dataType = 'train'):
    """Per-breath aggregates of ``u_in``, broadcast back onto every row.

    Adds ``u_in_amax``, ``u_in_std``, ``u_in_mean``, ``u_in_first`` and
    ``u_in_last`` (computed over rows with ``0 <= time_step <= 3.0``), plus
    ``u_in_diffmax`` / ``u_in_diffmean``: the breath maximum / mean minus the
    row's own ``u_in``.

    Output column names are spelled out explicitly so they do not depend on
    how the installed numpy/pandas name ``np.max``-style callables, and the
    result keeps ``input_df``'s index. ``dataType`` is unused.
    """
    # Output name -> (source column, pandas aggregation).
    agg_spec = {
        'u_in_amax': ('u_in', 'max'),
        'u_in_std': ('u_in', 'std'),
        'u_in_mean': ('u_in', 'mean'),
        'u_in_first': ('u_in', 'first'),
        'u_in_last': ('u_in', 'last'),
    }

    def get_agg_window(start_time=0, end_time=3.0, add_suffix = False):
        """Aggregate the rows whose time_step lies in [start_time, end_time], per breath."""
        in_window = (input_df['time_step'] >= start_time) & (input_df['time_step'] <= end_time)
        df_feature = input_df[in_window].groupby(['breath_id']).agg(**agg_spec)

        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(start_time) + '_' + str(end_time))

        return df_feature

    per_breath = get_agg_window()
    # Further windows (e.g. start_time=1 or 2, end_time=1 or 2, add_suffix=True)
    # were tried in the original notebook and left disabled.

    # Broadcast one aggregate row per breath onto every input row; breaths with
    # no row inside the window get NaN, as a left merge would give.
    output_df = per_breath.reindex(input_df['breath_id'].to_numpy())
    output_df.index = input_df.index

    u_in = input_df['u_in'].to_numpy()
    output_df['u_in_diffmax'] = output_df['u_in_amax'].to_numpy() - u_in
    output_df['u_in_diffmean'] = output_df['u_in_mean'].to_numpy() - u_in

    return output_df

In [37]:
def to_feature(input_df, dataType = 'train'):
    """Return a new DataFrame holding the feature matrix built from ``input_df``.

    Every processor is called as ``func(input_df, dataType)`` under a ``Timer``
    and must return one row per input row; the results are concatenated
    column-wise in processor order.
    """

    processors = [
        get_raw_features,
        get_category_features,
        get_simple_calc_features,
        get_diff_shift_features,
        get_cum_features,
        get_agg_features
    ]

    feature_frames = []

    for func in tqdm(processors, total=len(processors)):
        with Timer(prefix='' + func.__name__ + ' '):
            _df = func(input_df, dataType)

        # Lengths must match; a mismatch means func's implementation is wrong.
        assert len(_df) == len(input_df), func.__name__
        feature_frames.append(_df)

    # Concatenate once instead of growing a DataFrame inside the loop, which
    # re-copies every previously added column on each iteration.
    out_df = pd.concat(feature_frames, axis=1)
#     out_df = utils.reduce_mem_usage(out_df)

    return out_df

In [38]:
# Build the feature matrices; `train` here already includes the pseudo-labelled test rows.
train_df = to_feature(train, dataType = 'train')
test_df = to_feature(test, dataType = 'test')

 17%|█▋        | 1/6 [00:00<00:01,  4.19it/s]

get_raw_features  0.199[s]


 33%|███▎      | 2/6 [00:03<00:08,  2.07s/it]

get_category_features  3.299[s]


 50%|█████     | 3/6 [00:03<00:03,  1.29s/it]

get_simple_calc_features  0.234[s]
get_diff_shift_features  16.689[s]


 67%|██████▋   | 4/6 [00:21<00:15,  7.70s/it]

get_cum_features  0.291[s]


 83%|████████▎ | 5/6 [00:22<00:05,  5.36s/it]

get_agg_features  1.812[s]


100%|██████████| 6/6 [00:25<00:00,  4.26s/it]
  0%|          | 0/6 [00:00<?, ?it/s]

get_raw_features  0.019[s]


 50%|█████     | 3/6 [00:01<00:01,  1.85it/s]

get_category_features  1.563[s]
get_simple_calc_features  0.102[s]
get_diff_shift_features  6.781[s]


 67%|██████▋   | 4/6 [00:08<00:06,  3.01s/it]

get_cum_features  0.120[s]


 83%|████████▎ | 5/6 [00:09<00:02,  2.14s/it]

get_agg_features  0.777[s]


100%|██████████| 6/6 [00:10<00:00,  1.79s/it]


In [39]:
# Every feature column except the categorical R_C is treated as continuous.
# NOTE(review): run this only right after to_feature(); later cells overwrite
# train_df and add id / breath_id / pressure / u_out, which a re-run of this
# cell would then pick up as model inputs.
train_value_col = [i for i in train_df.columns.to_list() if i not in ['R_C']]
train_category_col = ['R_C']

In [40]:
# Standardize the continuous features with statistics fitted on train
# (which includes the pseudo-labelled test rows).
ss = StandardScaler()

# Set the categorical column aside; the scaled frame below gets a fresh default index.
train_category = train_df[train_category_col]
train_df = pd.DataFrame(ss.fit_transform(train_df[train_value_col]), columns=train_value_col)
# Column means of the scaled train features, used to fill the NaNs left by the shift/diff features.
train_mean = train_df.mean()
train_df = train_df.fillna(train_df.mean())

# Test uses the scaler and fill values fitted on train.
test_category = test_df[train_category_col]
test_df = pd.DataFrame(ss.transform(test_df[train_value_col]), columns=train_value_col)
test_df = test_df.fillna(train_mean)

In [41]:
# Visual check of the scaled features; the tuple of display() results renders as "(None, None)".
display(train_df), display(test_df)

Unnamed: 0,time_step,u_in,R,C,area,u_in_shift_-2,u_in_diff_-2,time_step_shift_-2,time_step_diff_-2,u_in_shift_-1,...,time_step_diff_4,u_in_cumsum,time_step_cumsum,u_in_amax,u_in_std,u_in_mean,u_in_first,u_in_last,u_in_diffmax,u_in_diffmean
0,-1.706713,-0.543500,-0.147367,1.363534,-0.938988,1.141787e+00,-2.573241e+00,-1.704294e+00,-5.167295e-01,7.719222e-01,...,3.141453e-15,-0.994527,-1.116580,-0.289037,0.048447,0.410407,-0.570847,0.236488,-0.047503,0.781452
1,-1.662875,0.751864,-0.147367,1.363534,-0.936140,1.164414e+00,-5.479401e-01,-1.658837e+00,-6.828853e-01,1.069197e+00,...,3.141453e-15,-0.952558,-1.115517,-0.289037,0.048447,0.410407,-0.570847,0.236488,-0.643404,-0.639676
2,-1.618762,1.043944,-0.147367,1.363534,-0.929143,1.356808e+00,-3.702544e-01,-1.613133e+00,-8.382806e-01,1.090778e+00,...,3.141453e-15,-0.901169,-1.113386,-0.289037,0.048447,0.410407,-0.570847,0.236488,-0.777769,-0.960113
3,-1.574434,1.065147,-0.147367,1.363534,-0.918480,1.500631e+00,-5.507826e-01,-1.567790e+00,-8.004076e-01,1.274278e+00,...,3.141453e-15,-0.849096,-1.110181,-0.289037,0.048447,0.410407,-0.570847,0.236488,-0.787523,-0.983375
4,-1.529865,1.245441,-0.147367,1.363534,-0.902633,1.490631e+00,-2.493072e-01,-1.522357e+00,-7.105252e-01,1.411453e+00,...,6.945245e-01,-0.791208,-1.105895,-0.289037,0.048447,0.410407,-0.570847,0.236488,-0.870463,-1.181174
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8453195,1.589268,-0.197504,-0.147367,-0.960264,0.304493,-1.823550e-01,-5.089554e-02,1.675413e+00,-4.476298e-01,-1.940091e-01,...,6.285537e-01,-0.550178,1.915740,-0.072711,-0.599609,-0.874032,-0.194235,0.234475,0.016010,-0.177704
8453196,1.633216,-0.197188,-0.147367,-0.960264,0.363225,-1.821148e-01,-5.075119e-02,1.720376e+00,-4.133436e-01,-1.937383e-01,...,6.721377e-01,-0.538818,1.996675,-0.072711,-0.599609,-0.874032,-0.194235,0.234475,0.015864,-0.178050
8453197,1.677015,-0.196922,-0.147367,-0.960264,0.422771,-1.819089e-01,-5.063486e-02,1.766020e+00,-6.546125e-01,-1.935092e-01,...,5.469013e-01,-0.527450,2.078672,-0.072711,-0.599609,-0.874032,-0.194235,0.234475,0.015742,-0.178342
8453198,1.720862,-0.196697,-0.147367,-0.960264,0.483128,4.295256e-16,4.516785e-17,-2.302015e-17,1.276866e-15,-1.933129e-01,...,4.710889e-01,-0.516074,2.161731,-0.072711,-0.599609,-0.874032,-0.194235,0.234475,0.015638,-0.178589


Unnamed: 0,time_step,u_in,R,C,area,u_in_shift_-2,u_in_diff_-2,time_step_shift_-2,time_step_diff_-2,u_in_shift_-1,...,time_step_diff_4,u_in_cumsum,time_step_cumsum,u_in_amax,u_in_std,u_in_mean,u_in_first,u_in_last,u_in_diffmax,u_in_diffmean
0,-1.706713,-0.549399,-0.964471,-0.379315,-0.938988,5.482508e-01,-1.698514e+00,-1.709220e+00,1.114763e+00,-1.106258e-02,...,3.141453e-15,-0.994717,-1.116580,0.002920,0.287993,0.269508,-0.574017,0.220322,0.255744,0.724347
1,-1.665152,-0.017439,-0.964471,-0.379315,-0.937884,1.045201e+00,-1.593186e+00,-1.666573e+00,1.105901e+00,5.030959e-01,...,3.141453e-15,-0.977560,-1.115573,0.002920,0.287993,0.269508,-0.574017,0.220322,0.011028,0.140740
2,-1.623566,0.487735,-0.964471,-0.379315,-0.933579,1.429709e+00,-1.362947e+00,-1.623969e+00,1.119616e+00,9.770753e-01,...,3.141453e-15,-0.944110,-1.113558,0.002920,0.287993,0.269508,-0.574017,0.220322,-0.221365,-0.413480
3,-1.581979,0.953432,-0.964471,-0.379315,-0.924220,1.744393e+00,-1.091452e+00,-1.581337e+00,1.124152e+00,1.343809e+00,...,3.141453e-15,-0.895640,-1.110535,0.002920,0.287993,0.269508,-0.574017,0.220322,-0.435597,-0.924390
4,-1.540432,1.313757,-0.964471,-0.379315,-0.908752,1.975460e+00,-8.628954e-01,-1.538703e+00,1.114552e+00,1.643947e+00,...,-1.145304e+00,-0.835549,-1.106506,0.002920,0.287993,0.269508,-0.574017,0.220322,-0.601356,-1.319699
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4023995,1.589268,-0.197504,-0.147367,-0.960264,0.304493,-1.823550e-01,-5.089554e-02,1.675413e+00,-4.476298e-01,-1.940091e-01,...,6.285537e-01,-0.550178,1.915740,-0.072711,-0.599609,-0.874032,-0.194235,0.234475,0.016010,-0.177704
4023996,1.633216,-0.197188,-0.147367,-0.960264,0.363225,-1.821148e-01,-5.075119e-02,1.720376e+00,-4.133436e-01,-1.937383e-01,...,6.721377e-01,-0.538818,1.996675,-0.072711,-0.599609,-0.874032,-0.194235,0.234475,0.015864,-0.178050
4023997,1.677015,-0.196922,-0.147367,-0.960264,0.422771,-1.819089e-01,-5.063486e-02,1.766020e+00,-6.546125e-01,-1.935092e-01,...,5.469013e-01,-0.527450,2.078672,-0.072711,-0.599609,-0.874032,-0.194235,0.234475,0.015742,-0.178342
4023998,1.720862,-0.196697,-0.147367,-0.960264,0.483128,4.295256e-16,4.516785e-17,-2.302015e-17,1.276866e-15,-1.933129e-01,...,4.710889e-01,-0.516074,2.161731,-0.072711,-0.599609,-0.874032,-0.194235,0.234475,0.015638,-0.178589


(None, None)

In [42]:
# Re-attach the categorical feature plus the id / target / mask columns.
# The column-wise concat aligns on the index, so every frame here must carry
# the same default 0..n-1 index.
train_df = pd.concat([train_df, train_category, train[['id', 'breath_id', 'pressure', 'u_out']]], axis=1)
test_df = pd.concat([test_df, test_category, test[['id', 'breath_id', 'u_out']]], axis=1)

In [43]:
# Shrink the frames with the project helper.
# NOTE(review): presumably this downcasts numeric dtypes; confirm in
# utils.reduce_mem_usage that id and pressure keep enough precision.
train_df = utils.reduce_mem_usage(train_df)
test_df = utils.reduce_mem_usage(test_df)

Mem. usage decreased from 2773.19 Mb to 709.42 Mb (74.4% reduction)
Mem. usage decreased from 1289.43 Mb to 330.03 Mb (74.4% reduction)


In [44]:
def _build_loaders(trn_df, val_df):
    """Create the train/valid/test DataLoaders for one training stage."""
    frames = {"train": trn_df, "valid": val_df, "test": test_df}
    return {
        phase: torchdata.DataLoader(
            VentilatorDataset(frame, train_value_col, train_category_col),
            **CFG.loader_params[phase])  # type: ignore
        for phase, frame in frames.items()
    }


def _build_trainer(loggers, callbacks):
    """Create a single-GPU, deterministic Trainer limited to CFG.epochs epochs."""
    return pl.Trainer(
        logger=loggers,
        callbacks=callbacks,
        max_epochs=CFG.epochs,
        default_root_dir=OUTPUT_DIR,
        gpus=1,
#         fast_dev_run=DEBUG,
        deterministic=True,
        benchmark=False,
        )


# Out-of-fold predictions (one row per train row) and test predictions
# (one column per trained fold).
oof_total = np.zeros((len(train), CFG.num_classes))
sub_preds = np.zeros((test.shape[0], len(CFG.folds)))
models = []
scores = []
input_dim = len(train_value_col)

for i, (trn_idx, val_idx) in enumerate(zip(trn_idxes, val_idxes)):
    if i not in CFG.folds:
        continue
    # sub_preds has one column per entry of CFG.folds, so address it by the
    # fold's position in that list rather than by the fold id itself.
    fold_col = CFG.folds.index(i)

    #############
    # stage 1: real train rows + pseudo-labelled test rows
    #############
    trn_df = train_df.loc[trn_idx, :].reset_index(drop=True)
    val_df = train_df.loc[val_idx, :].reset_index(drop=True)
    loaders = _build_loaders(trn_df, val_df)

    model = RNNModel(
        input_dim=input_dim,
        lstm_dim=CFG.lstm_dim,
        dense_dim=CFG.dense_dim,
        logit_dim=CFG.logit_dim,
        num_classes=CFG.num_classes,
    )
    model_name = model.__class__.__name__

    learner = Learner(model)

    # loggers
    RUN_NAME = f'exp{str(CFG.exp_num)}'
    wandb.init(project='Ventilator-Pressure-Prediction', entity='sqrt4kaido', group=RUN_NAME, job_type=RUN_NAME + f'-fold-{i}')
    wandb.run.name = RUN_NAME + f'-fold-{i}'
    wandb_config = wandb.config
    wandb_config.model_name = model_name
    wandb.watch(model)

    # callbacks
    # NOTE(review): learner.current_epoch is evaluated once, here, so the file
    # name does not vary by epoch — confirm whether an "{epoch}" template was intended.
    checkpoint_callback = ModelCheckpoint(
        monitor='Loss/val',
        mode='min',
        dirpath=OUTPUT_DIR,
        verbose=False,
        save_weights_only=True,
        filename=f'{model_name}-{learner.current_epoch}-{i}')
    callbacks = [checkpoint_callback]

    loggers = [WandbLogger()]

    trainer = _build_trainer(loggers, callbacks)
    trainer.fit(learner, train_dataloader=loaders['train'], val_dataloaders=loaders['valid'])
    trainer.save_checkpoint(OUTPUT_DIR / "last.ckpt")

    #############
    # stage 2: continue from the last stage-1 weights on the real train rows only
    #############
    trn_df = train_df.loc[ori_trn_idxes[i], :].reset_index(drop=True)

    # A fresh Trainer sharing the same callbacks, so the checkpoint callback
    # keeps tracking the best 'Loss/val' across both stages.
    trainer = _build_trainer(loggers, callbacks)
    loaders = _build_loaders(trn_df, val_df)

    checkpoint = torch.load(str(OUTPUT_DIR / "last.ckpt"))
    learner.load_state_dict(checkpoint['state_dict'])

    trainer.fit(learner, train_dataloader=loaders['train'], val_dataloaders=loaders['valid'])

    print('train done.')

    #############
    # validation (to make oof)
    #############
    checkpoint = torch.load(checkpoint_callback.best_model_path)
    learner.load_state_dict(checkpoint['state_dict'])

    model = model.to(device)
    oof_pred, oof_target = evaluate(model, loaders, phase="valid")
    models.append(model)

    oof_score = get_score(oof_pred, oof_target, val_df['u_out'].values)
    scores.append(oof_score)
    # NOTE(review): the stored OOF values are divided by CFG.bias although the
    # target is not scaled by it anywhere in this notebook — confirm this is intended.
    oof_total[val_idx] = oof_pred.reshape(-1, 1) / CFG.bias

    print('validate done.')
    print(f'fold = {i}, custom_mae = {oof_score}')
    wandb.log({'CV_score': oof_score})

    #############
    # inference
    #############
    test_pred, _ = evaluate(model, loaders, phase="test")
    sub_preds[:, fold_col] = test_pred

    print('inference done.')

# test_preds_total = np.array(test_preds_total)


init LSTM(516, 512, num_layers=2, batch_first=True, bidirectional=True)


[34m[1mwandb[0m: Currently logged in as: [33msqrt4kaido[0m (use `wandb login --relogin` to force relogin)


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type           | Params
---------------------------------------------
0 | model     | RNNModel       | 12.8 M
1 | criterion | VentilatorLoss | 0     
---------------------------------------------
12.8 M    Trainable params
0         Non-trainable params
12.8 M    Total params
51.143    Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

epoch = 0, custom_mae = 17.361391067504883


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

epoch = 0, custom_mae = 0.897667646408081


Validating: 0it [00:00, ?it/s]

epoch = 1, custom_mae = 0.7779513001441956


Validating: 0it [00:00, ?it/s]

epoch = 2, custom_mae = 0.7104186415672302


Validating: 0it [00:00, ?it/s]

epoch = 3, custom_mae = 0.7473292946815491


Validating: 0it [00:00, ?it/s]

epoch = 4, custom_mae = 0.6357897520065308


Validating: 0it [00:00, ?it/s]

epoch = 5, custom_mae = 0.6172882318496704


Validating: 0it [00:00, ?it/s]

epoch = 6, custom_mae = 0.6262355446815491


Validating: 0it [00:00, ?it/s]

epoch = 7, custom_mae = 0.5429548621177673


Validating: 0it [00:00, ?it/s]

epoch = 8, custom_mae = 0.5245142579078674


Validating: 0it [00:00, ?it/s]

epoch = 9, custom_mae = 0.4957989752292633


Validating: 0it [00:00, ?it/s]

epoch = 10, custom_mae = 0.4389912188053131


Validating: 0it [00:00, ?it/s]

epoch = 11, custom_mae = 0.4629458785057068


Validating: 0it [00:00, ?it/s]

epoch = 12, custom_mae = 0.4582303762435913


Validating: 0it [00:00, ?it/s]

epoch = 13, custom_mae = 0.38532859086990356


Validating: 0it [00:00, ?it/s]

epoch = 14, custom_mae = 0.365160197019577


Validating: 0it [00:00, ?it/s]

epoch = 15, custom_mae = 0.3496358394622803


Validating: 0it [00:00, ?it/s]

epoch = 16, custom_mae = 0.3538823127746582


Validating: 0it [00:00, ?it/s]

epoch = 17, custom_mae = 0.31744593381881714


Validating: 0it [00:00, ?it/s]

epoch = 18, custom_mae = 0.3118184506893158


Validating: 0it [00:00, ?it/s]

epoch = 19, custom_mae = 0.29620006680488586


Validating: 0it [00:00, ?it/s]

epoch = 20, custom_mae = 0.285791277885437


Validating: 0it [00:00, ?it/s]

epoch = 21, custom_mae = 0.2758193910121918


Validating: 0it [00:00, ?it/s]

epoch = 22, custom_mae = 0.27286678552627563


Validating: 0it [00:00, ?it/s]

epoch = 23, custom_mae = 0.2694127857685089


Validating: 0it [00:00, ?it/s]

epoch = 24, custom_mae = 0.2682937681674957


Validating: 0it [00:00, ?it/s]

epoch = 25, custom_mae = 0.26792025566101074


Validating: 0it [00:00, ?it/s]

epoch = 26, custom_mae = 0.26822033524513245


Validating: 0it [00:00, ?it/s]

epoch = 27, custom_mae = 0.2693110406398773


Validating: 0it [00:00, ?it/s]

epoch = 28, custom_mae = 0.2685270607471466


Validating: 0it [00:00, ?it/s]

epoch = 29, custom_mae = 0.2716972827911377


Validating: 0it [00:00, ?it/s]

epoch = 30, custom_mae = 0.2758670449256897


Validating: 0it [00:00, ?it/s]

epoch = 31, custom_mae = 0.28469979763031006


Validating: 0it [00:00, ?it/s]

epoch = 32, custom_mae = 0.2928757965564728


Validating: 0it [00:00, ?it/s]

epoch = 33, custom_mae = 0.3006034195423126


Validating: 0it [00:00, ?it/s]

epoch = 34, custom_mae = 0.3322243094444275


Validating: 0it [00:00, ?it/s]

epoch = 35, custom_mae = 0.31573250889778137


Validating: 0it [00:00, ?it/s]

epoch = 36, custom_mae = 0.34831148386001587


Validating: 0it [00:00, ?it/s]

epoch = 37, custom_mae = 0.31528398394584656


Validating: 0it [00:00, ?it/s]

epoch = 38, custom_mae = 0.3638623058795929


Validating: 0it [00:00, ?it/s]

epoch = 39, custom_mae = 0.34396225214004517


Validating: 0it [00:00, ?it/s]

epoch = 40, custom_mae = 0.4207165539264679


Validating: 0it [00:00, ?it/s]

epoch = 41, custom_mae = 0.389479398727417


Validating: 0it [00:00, ?it/s]

epoch = 42, custom_mae = 0.4630466401576996


Validating: 0it [00:00, ?it/s]

epoch = 43, custom_mae = 0.3964826166629791


Validating: 0it [00:00, ?it/s]

epoch = 44, custom_mae = 0.3991454839706421


Validating: 0it [00:00, ?it/s]

epoch = 45, custom_mae = 0.41795697808265686


Validating: 0it [00:00, ?it/s]

epoch = 46, custom_mae = 0.38112199306488037


Validating: 0it [00:00, ?it/s]

epoch = 47, custom_mae = 0.369878888130188


Validating: 0it [00:00, ?it/s]

epoch = 48, custom_mae = 0.3639485538005829


Validating: 0it [00:00, ?it/s]

epoch = 49, custom_mae = 0.3814595937728882




KeyboardInterrupt: 

In [None]:
# Score the out-of-fold (OOF) predictions and write them to oof{exp_num}.csv.
# Both branches bind `oof_score`, because a later cell logs that name to W&B.
oof_path = OUTPUT_DIR / f'oof{CFG.exp_num}.csv'

if len(CFG.folds) != CFG.n_folds:
    # Not every fold was trained: score the predictions/targets that are
    # currently in scope.
    # NOTE(review): presumably `oof_pred`, `oof_target` and `val_df` are left
    # over from the training loop; only `val_idxes[0]` is exported — confirm
    # this is intended when more than one (but not all) folds are run.
    oof_score = get_score(oof_pred, oof_target, val_df['u_out'].values)
    print(f'MAE {oof_score}')

    # Take the first column of the validation rows. `.copy()` makes the frame
    # independent of `train`, so adding a column is not a chained assignment
    # on a slice (avoids SettingWithCopyWarning).
    oof_df = train.iloc[val_idxes[0], :1].copy()
    oof_df['pressure'] = oof_pred
else:
    # Every fold was trained: score the full OOF array against the targets.
    # `score` is kept for backward compatibility; `oof_score` is bound as well
    # so the W&B summary cell does not hit a NameError (or log a stale value).
    oof_score = score = get_score(y, oof_total, train['u_out'].values)
    print(f'MAE {score}: folds: {scores}')

    oof_df = pd.DataFrame({'id': train['id'].values, 'pressure': oof_total.reshape(-1)})

# Same destination for both branches.
oof_df.to_csv(oof_path, index=False)
oof_df

In [None]:
# Build the submission file: start from the sample submission, replace its
# `pressure` column with the mean of `sub_preds` along axis 1, and save it as
# sub{exp_num}.csv.
# NOTE(review): presumably `sub_preds` holds one column per trained fold/model
# and one row per test sample — confirm against the inference loop.
sample_submission_path = DATA_DIR / 'sample_submission.csv'
submission_path = OUTPUT_DIR / f'sub{CFG.exp_num}.csv'

sub = pd.read_csv(sample_submission_path)
sub = sub.assign(pressure=np.mean(sub_preds, axis=1))
sub.to_csv(submission_path, index=False)
sub

In [None]:
# Log the overall CV score to a separate 'summary' run inside this
# experiment's W&B group.
# The run is used as a context manager so it is always finished, even if
# logging raises (e.g. `oof_score` is not defined); previously an exception
# before `wandb.finish()` left the run open.
with wandb.init(
    project='Ventilator-Pressure-Prediction',
    entity='sqrt4kaido',
    group=RUN_NAME,
    job_type='summary',
) as summary_run:
    summary_run.name = 'summary'
    summary_run.log({'CV_score': oof_score})
    # Notebook upload is intentionally disabled:
    # wandb.save(utils.get_notebook_path())

Exception in thread Thread-7:
Traceback (most recent call last):
  File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
Exception in thread Thread-8:
Traceback (most recent call last):
  File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
        self.run()
self.run()
  File "/usr/lib/python3.8/threading.py", line 870, in run
  File "/usr/lib/python3.8/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/home/user/.local/lib/python3.8/site-packages/wandb/sdk/wandb_run.py", line 197, in check_network_status
    status_response = self._interface.communicate_network_status()
    self._target(*self._args, **self._kwargs)
  File "/home/user/.local/lib/python3.8/site-packages/wandb/sdk/wandb_run.py", line 215, in check_status
  File "/home/user/.local/lib/python3.8/site-packages/wandb/sdk/interface/interface.py", line 749, in communicate_network_status
    status_response = self._interface.communicate_stop_status()
  File "