# Train classification

Most sharing code train this dataset as a regression task.

But in this code, I train as a classification task.

I encode the target value pressure to 950 classes and calculate CrossEntropy Loss.

In [1]:
from IPython.core.display import display, HTML 
display(HTML("<style>.container { width:100% !important; }</style>")) 

In [1]:
import gc
import os
import sys
import copy
import random
import wandb
import math
from pathlib import Path
from collections import OrderedDict

import numpy as np
import pandas as pd
import scipy.signal
from scipy import signal

from sklearn.model_selection import GroupKFold
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import AdamW
from transformers import get_cosine_schedule_with_warmup
from sklearn.preprocessing import RobustScaler, QuantileTransformer

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.loggers import WandbLogger

device = torch.device("cuda")

In [2]:
sys.path.append('../../src/')
import utils as utils
from utils import Timer

In [3]:
class CFG:
    exp_num = 56
    n_folds = 5
    folds = [0]
    seed = 777
    local = True
    
    lr = 0.0015
    epochs = 50
    emb_dim = 64
    hidden_dim = 256
    weight_decay = 0.1  
    
    ######################
    # Loaders #
    ######################
    loader_params = {
        "train": {
            'batch_size': 128,
            'shuffle': True,
            'num_workers': 8,
            'pin_memory': True,
            'drop_last': True,
        },
        "valid": {
            'batch_size': 32,
            'shuffle': False,
            'num_workers': 8,
            'pin_memory': True,
            'drop_last': False,
        },
        "test": {
            'batch_size': 32,
            'shuffle': False,
            'num_workers': 8,
            'pin_memory': True,
            'drop_last': False,
        }
    }


In [4]:
utils.set_seed(CFG.seed)    

In [5]:
if CFG.local:
    DATA_DIR = Path("/home/knikaido/work/Ventilator-Pressure-Prediction/data/ventilator-pressure-prediction")
    OUTPUT_DIR = Path('./output/')
else:
    DATA_DIR = Path("../input/ventilator-pressure-prediction")
    OUTPUT_DIR = Path('')   

In [6]:
def loss_fn(y_pred, y_true, u_outs):
#     loss = nn.L1Loss()(y_pred.reshape(-1), y_true.reshape(-1))
    w = 1 - u_outs.reshape(-1)
    loss = nn.CrossEntropyLoss(reduction = 'none')(y_pred.reshape(-1, 950), y_true.reshape(-1)).reshape(-1)
    loss = loss * w
    loss = loss.sum() / w.sum()
    return loss

def compute_metric(preds, trues, u_outs):
    """
    Metric for the problem, as I understood it.
    """
    
    y = trues
    w = 1 - u_outs
    
    assert y.shape == preds.shape and w.shape == y.shape, (y.shape, preds.shape, w.shape)
    
    mae = w * np.abs(y - preds)
    mae = mae.sum() / w.sum()
    
    return mae

def get_score(y_pred, y_true, u_outs):
    return compute_metric(y_pred, y_true, u_outs)

In [7]:
class VentilatorDataset(Dataset):
    
    def __init__(self, df, train_value_col, train_category_col, label_dic=None):
        self.dfs = [_df for _, _df in df.groupby("breath_id")]
        self.label_dic = label_dic
        
    def __len__(self):
        return len(self.dfs)
    
    def __getitem__(self, item):
        df = self.dfs[item]
        X = df[train_category_col + train_value_col].values
        u_out = df['u_out'].values
        y = df['pressure'].values
        if self.label_dic is None:
            label = [-1]
        else:
            label = [self.label_dic[i] for i in y]

        d = {
            "X": torch.tensor(X).float(),
            "u_out": torch.tensor(u_out).long(),
            "y" : torch.tensor(label).long(),
        }
        return d

In [8]:
class VentilatorModel(nn.Module):
    
    def __init__(self, input_dim):
        super(VentilatorModel, self).__init__()
        self.rc_emb = nn.Embedding(9, 4, padding_idx=0)

        self.seq_emb = nn.Sequential(
            nn.Linear(4+input_dim, CFG.emb_dim),
            nn.LayerNorm(CFG.emb_dim)
        )
        
        self.lstm = nn.LSTM(CFG.emb_dim, CFG.hidden_dim, batch_first=True, bidirectional=True, dropout=0.1, num_layers=4)

        self.head = nn.Sequential(
            nn.Linear(CFG.hidden_dim * 2, CFG.hidden_dim * 2),
            nn.LayerNorm(CFG.hidden_dim * 2),
            nn.ReLU(),
            nn.Linear(CFG.hidden_dim * 2, 950),
        )
        
        # Encoder
        initrange = 0.1
        self.rc_emb.weight.data.uniform_(-initrange, initrange)
        
        # LSTM
        for n, m in self.named_modules():
            if isinstance(m, nn.LSTM):
                print(f'init {m}')
                for param in m.parameters():
                    if len(param.shape) >= 2:
                        nn.init.orthogonal_(param.data)
                    else:
                        nn.init.normal_(param.data)

    def forward(self, X, y=None):
        # embed
        bs = X.shape[0]
        rc_emb = self.rc_emb(X[:,:,0].long()).view(bs, 80, -1)

        seq_x = torch.cat((rc_emb, X[:, :, 1:]), 2)
        emb_x = self.seq_emb(seq_x)
        
        out, _ = self.lstm(emb_x, None) 
        logits = self.head(out)

        return logits

In [9]:
# Learner class(pytorch-lighting)
class Learner(pl.LightningModule):
    def __init__(self, model, num_train_steps, num_warmup_steps, target_dic_inv, unique_targets):
        super().__init__()
        self.model = model
        self.criterion = loss_fn
        self.num_train_steps = num_train_steps
        self.num_warmup_steps = num_warmup_steps
        self.target_dic_inv = target_dic_inv
        self.unique_targets = unique_targets
    
    def training_step(self, batch, batch_idx):
        d_ = batch
        out = self.model(d_['X'])
        loss = self.criterion(out, d_['y'], d_['u_out'])
        
        self.log(f'Loss/train', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True)
        sch = self.lr_schedulers()
        sch.step()
        return loss
    
    def validation_step(self, batch, batch_idx):
        d_ = batch
        out = self.model(d_['X'])
        loss = self.criterion(out, d_['y'], d_['u_out'])
        
        self.log(f'Loss/val', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True)
        
        pred = out.reshape(-1, 950).softmax(1)
        pred = torch.sum(torch.tensor(self.unique_targets).to(device) *  pred, axis=1)
        target = torch.tensor([[self.target_dic_inv[j.item()] for j in i] for i in d_['y']])
        output = OrderedDict({
            "targets": target.detach(), "preds": pred.detach(), "u_outs": d_['u_out'].detach(), "loss": loss.detach()
        })
        return output
    
    def validation_epoch_end(self, outputs):

        targets = torch.cat([o["targets"].view(-1) for o in outputs]).cpu().numpy()
        preds = torch.cat([o["preds"].view(-1) for o in outputs]).cpu().numpy()
        u_outs = torch.cat([o["u_outs"].view(-1) for o in outputs]).cpu().numpy()

        score = get_score(preds, targets, u_outs)
        self.log(f'custom_mae/val', score, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        print(f'epoch = {self.current_epoch}, custom_mae = {score}')

    def configure_optimizers(self):
#         optimizer = get_optimizer(self.model)
#         scheduler = get_scheduler(optimizer)
        optimizer = AdamW(self.model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay)
        scheduler = get_cosine_schedule_with_warmup(optimizer, 
                                                    num_warmup_steps=self.num_warmup_steps, 
                                                    num_training_steps=self.num_train_steps)

        return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "Loss/val"}

In [10]:
def valid_loop(model, loader, target_dic_inv):
    losses, predicts = [], []
    model.eval()
    for d in loader:
        with torch.no_grad():
            out = model(d['X'].to(device))
            loss = loss_fn(out, d['y'].to(device), d['u_out'].to(device))
        pred = out.reshape(-1, 950).softmax(1)
        pred = torch.sum(torch.tensor(unique_targets).to(device) *  pred, axis=1)
        losses.append(loss.item())
        predicts.append(pred.cpu().numpy())

    return np.array(losses).mean(), np.concatenate(predicts)

def test_loop(model, loader, target_dic_inv):
    predicts = []
    model.eval()
    for d in loader:
        with torch.no_grad():
            out = model(d['X'].to(device))
        pred = out.reshape(-1, 950).softmax(1)
        pred = torch.sum(torch.tensor(unique_targets).to(device) *  pred, axis=1)
        predicts.append(pred.cpu().numpy())

    return np.concatenate(predicts)

In [11]:
train = pd.read_csv(DATA_DIR / 'train.csv')
test = pd.read_csv(DATA_DIR / 'test.csv')
sub_df = pd.read_csv(DATA_DIR / "sample_submission.csv")

In [12]:
display(train), display(test)

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure
0,1,1,20,50,0.000000,0.083334,0,5.837492
1,2,1,20,50,0.033652,18.383041,0,5.907794
2,3,1,20,50,0.067514,22.509278,0,7.876254
3,4,1,20,50,0.101542,22.808822,0,11.742872
4,5,1,20,50,0.135756,25.355850,0,12.234987
...,...,...,...,...,...,...,...,...
6035995,6035996,125749,50,10,2.504603,1.489714,1,3.869032
6035996,6035997,125749,50,10,2.537961,1.488497,1,3.869032
6035997,6035998,125749,50,10,2.571408,1.558978,1,3.798729
6035998,6035999,125749,50,10,2.604744,1.272663,1,4.079938


Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out
0,1,0,5,20,0.000000,0.000000,0
1,2,0,5,20,0.031904,7.515046,0
2,3,0,5,20,0.063827,14.651675,0
3,4,0,5,20,0.095751,21.230610,0
4,5,0,5,20,0.127644,26.320956,0
...,...,...,...,...,...,...,...
4023995,4023996,125748,20,10,2.530117,4.971245,1
4023996,4023997,125748,20,10,2.563853,4.975709,1
4023997,4023998,125748,20,10,2.597475,4.979468,1
4023998,4023999,125748,20,10,2.631134,4.982648,1


(None, None)

In [13]:
def get_raw_features(input_df, dataType = 'train'):
    colum = ['time_step', 'u_in', 'u_out']

    return input_df[colum]

In [14]:
def get_category_features(input_df, dataType = 'train'):
    output_df = copy.deepcopy(input_df)
    colum = ['R_C']
    rc_map = {'5_10': 0, '5_20': 1, '5_50': 2, '20_10': 3, '20_20': 4, '20_50': 5, '50_10': 6, '50_20': 7, '50_50': 8}
    
    output_df['R_C'] = [f'{r}_{c}' for r, c in zip(output_df['R'], output_df['C'])]
    output_df['R_C'] = output_df['R_C'].map(rc_map)

    return output_df[colum]

In [15]:
def get_simple_calc_features(input_df, dataType = 'train'):
    output_df = copy.deepcopy(input_df)
    c_num = input_df.shape[1]
    
    output_df['time_delta'] = output_df.groupby('breath_id')['time_step'].diff().fillna(0)
    output_df['delta'] = output_df['time_delta'] * output_df['u_in']
    output_df['area'] = output_df.groupby('breath_id')['delta'].cumsum()

    output_df['cross']= output_df['u_in']*output_df['u_out']
    output_df['cross2']= output_df['time_step']*output_df['u_out']
    
    output_df['u_in_cumsum'] = (output_df['u_in']).groupby(output_df['breath_id']).cumsum()
    output_df['one'] = 1
    output_df['count'] = (output_df['one']).groupby(output_df['breath_id']).cumsum()
    output_df['u_in_cummean'] =output_df['u_in_cumsum'] / output_df['count']

    
    output_df = output_df.drop(['count','one'], axis=1)
    
    return output_df.iloc[:, c_num:]

In [16]:
def get_diff_shift_features(input_df, dataType = 'train'):
    
    output_df = copy.deepcopy(input_df)
    c_num = input_df.shape[1]
    
    USE_LAG = [-2, -1, 1, 2, 3, 4]
    
    for lag in USE_LAG:
        output_df[f'breath_id_lag{lag}']=output_df['breath_id'].shift(lag).fillna(0)
        output_df[f'breath_id_lag{lag}same']=np.select([output_df[f'breath_id_lag{lag}']==output_df['breath_id']], [1], 0)

        # u_in 
        output_df[f'u_in_lag_{lag}'] = output_df['u_in'].shift(lag).fillna(0) * output_df[f'breath_id_lag{lag}same']
        output_df[f'u_in_diff_{lag}'] = output_df['u_in'] - output_df[f'u_in_lag_{lag}']
        output_df[f'u_out_lag_{lag}'] = output_df['u_out'].shift(lag).fillna(0) * output_df[f'breath_id_lag{lag}same']

        # breath_time
    output_df[f'time_step_lag_{1}'] = output_df['time_step'].shift(1).fillna(0) * output_df[f'breath_id_lag{1}same']
    output_df[f'time_step_diff_{1}'] = output_df['time_step'] - output_df[f'time_step_lag_{1}']

    drop_columns = ['time_step_lag_1']
    drop_columns += [f'breath_id_lag{i}' for i in USE_LAG]
    drop_columns += [f'breath_id_lag{i}same' for i in USE_LAG]
    output_df = output_df.drop(drop_columns, axis=1)

    # fill na by zero
    output_df = output_df.fillna(0)
    
    return output_df.iloc[:, c_num:]

In [17]:
def get_agg_features(input_df, dataType = 'train'):
    
    output_df = copy.deepcopy(input_df)
    c_num = input_df.shape[1]
    
    # Dict for aggregations
    create_feature_dict = {
        'u_in': [np.max, np.mean],
    }
    
    def get_agg_window(start_time=0, end_time=3.0, add_suffix = False):
        
        df_tgt = output_df[(output_df['time_step'] >= start_time) & (output_df['time_step'] <= end_time)]
        df_feature = df_tgt.groupby(['breath_id']).agg(create_feature_dict)
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(start_time) + '_' + str(end_time))
            
        return df_feature
    
    df_agg_feature = get_agg_window().reset_index()
    
#     df_tmp = get_agg_window(start_time = 2, add_suffix = True).reset_index()
#     df_agg_feature = df_agg_feature.merge(df_tmp, how = 'left', on = 'breath_id')
#     df_tmp = get_agg_window(start_time = 1, add_suffix = True).reset_index()
#     df_agg_feature = df_agg_feature.merge(df_tmp, how = 'left', on = 'breath_id')
#     df_tmp = get_agg_window(end_time = 1, add_suffix = True).reset_index()
#     df_agg_feature = df_agg_feature.merge(df_tmp, how = 'left', on = 'breath_id')
#     df_tmp = get_agg_window(end_time = 2, add_suffix = True).reset_index()
#     df_agg_feature = df_agg_feature.merge(df_tmp, how = 'left', on = 'breath_id')

    output_df = pd.merge(output_df, df_agg_feature, how='left', on='breath_id')
    
    output_df['u_in_diffmax'] = output_df['u_in_amax'] - output_df['u_in']
    output_df['u_in_diffmean'] = output_df['u_in_mean'] - output_df['u_in']
    
#     output_df = output_df.drop(['u_in_amax','u_in_mean'], axis=1)
    
    return output_df.iloc[:, c_num:]

In [18]:
def lowpass(x, samplerate, fp, fs, gpass, gstop):
    fn = samplerate / 2   #ナイキスト周波数
    wp = fp / fn  #ナイキスト周波数で通過域端周波数を正規化
    ws = fs / fn  #ナイキスト周波数で阻止域端周波数を正規化
    N, Wn = signal.buttord(wp, ws, gpass, gstop)  #オーダーとバターワースの正規化周波数を計算
    b, a = signal.butter(N, Wn, "low")            #フィルタ伝達関数の分子と分母を計算
    y = signal.filtfilt(b, a, x)                  #信号に対してフィルタをかける
    return y  

In [19]:
fp = 5 # 通過域端周波数[Hz]
fs = 10 # 阻止域端周波数[Hz]
gpass = 3 # 通過域端最大損失[dB]
gstop = 40 # 阻止域端最小損失[dB]
samplerate = 100
def lowpass_filter(series):
    return lowpass(series, samplerate, fp, fs, gpass, gstop)

In [20]:
def get_filter_features(input_df, dataType = 'train'):
    
    output_df = copy.deepcopy(input_df)
    c_num = input_df.shape[1]
    
    def get_agg_window(start_time=0, end_time=3.0, add_suffix = False):
        
        df_tgt = output_df[(output_df['time_step'] >= start_time) & (output_df['time_step'] <= end_time)]
        df_feature = df_tgt.groupby(['breath_id'])['u_in'].apply(lowpass_filter)
        df_feature.name = 'u_in_filter'
                    
        return df_feature
    
    df_agg_feature = get_agg_window().reset_index()
    df_agg_feature = df_agg_feature.explode("u_in_filter").reset_index(drop=True)
    
    USE_LAG = [-2, -1, 1, 2, 3, 4]
    
    for lag in USE_LAG:
        df_agg_feature[f'breath_id_lag{lag}']=df_agg_feature['breath_id'].shift(lag).fillna(0)
        df_agg_feature[f'breath_id_lag{lag}same']=np.select([df_agg_feature[f'breath_id_lag{lag}']==df_agg_feature['breath_id']], [1], 0)

        # u_in 
        df_agg_feature[f'u_in_filter_lag_{lag}'] = df_agg_feature['u_in_filter'].shift(lag).fillna(0) * df_agg_feature[f'breath_id_lag{lag}same']
        df_agg_feature[f'u_in_filter_diff_{lag}'] = df_agg_feature['u_in_filter'] - df_agg_feature[f'u_in_filter_lag_{lag}']

    drop_columns = [f'breath_id_lag{i}' for i in USE_LAG]
    drop_columns += [f'breath_id_lag{i}same' for i in USE_LAG]
    df_agg_feature = df_agg_feature.drop(drop_columns, axis=1)
    df_agg_feature = df_agg_feature.fillna(0)
    
    return df_agg_feature.iloc[:, 1:]

In [21]:
def get_sqrt_features(input_df, dataType = 'train'):
    
    output_df = copy.deepcopy(input_df)
    c_num = input_df.shape[1]
    
    output_df['u_in_sqrt'] = output_df['u_in'].apply(lambda x: np.sqrt(x))
    output_df['u_in_sqrt_cumsum'] = output_df.groupby('breath_id')['u_in_sqrt'].cumsum()
    
    USE_LAG = [-2, -1, 1, 2, 3, 4]
    
    for lag in USE_LAG:
        output_df[f'breath_id_lag{lag}']=output_df['breath_id'].shift(lag).fillna(0)
        output_df[f'breath_id_lag{lag}same']=np.select([output_df[f'breath_id_lag{lag}']==output_df['breath_id']], [1], 0)

        # u_in 
        output_df[f'u_in_sqrt_filter_lag_{lag}'] = output_df['u_in_sqrt'].shift(lag).fillna(0) * output_df[f'breath_id_lag{lag}same']
        output_df[f'u_in_sqrt_filter_diff_{lag}'] = output_df['u_in_sqrt'] - output_df[f'u_in_sqrt_filter_lag_{lag}']

    drop_columns = [f'breath_id_lag{i}' for i in USE_LAG]
    drop_columns += [f'breath_id_lag{i}same' for i in USE_LAG]
    output_df = output_df.drop(drop_columns, axis=1)
    output_df = output_df.fillna(0)
    
    return output_df.iloc[:, c_num:]

In [22]:
def get_vib_features(input_df, dataType = 'train'):
    
    output_df = copy.deepcopy(input_df)
    c_num = input_df.shape[1]
    
    output_df['u_out_diff'] = output_df['u_out'].diff()
    output_df['u_out_diff'].fillna(0, inplace=True)
    output_df['u_out_diff'].replace(-1, 0, inplace=True)
    uout1_df = output_df[output_df['u_out_diff']==1]
    
    first_df = output_df.loc[0::80,:]
    first_0_dict = dict(zip(first_df['id'], [0]*len(uout1_df)))

    output_df['u_in_diff'] = output_df['u_in'].diff()
    output_df['diff_sign'] = np.sign(output_df['u_in_diff'])
    output_df['sign_diff'] = output_df['diff_sign'].diff()
    output_df['tmp'] = output_df['id'].map(first_0_dict) # put 0, the 80row cycle
    output_df.iloc[0::80, output_df.columns.get_loc('sign_diff')] = output_df.iloc[0::80, output_df.columns.get_loc('tmp')]

    # Count the number of inversions, so take the absolute value and sum
    output_df['sign_diff'] = abs(output_df['sign_diff']) 
    sign_diff_dict = output_df.groupby('breath_id')['sign_diff'].sum().to_dict()
    output_df['diff_vib'] = output_df['breath_id'].map(sign_diff_dict)
    
    return output_df['diff_vib']

In [23]:
def to_feature(input_df, dataType = 'train'):
    """input_df を特徴量行列に変換した新しいデータフレームを返す.
    """

    processors = [
        get_raw_features,
        get_simple_calc_features,
        get_diff_shift_features,
        get_agg_features,
        get_category_features,
        get_filter_features,
        get_sqrt_features
    ]

    out_df = pd.DataFrame()

    for func in tqdm(processors, total=len(processors)):
        with Timer(prefix='' + func.__name__ + ' '):
            _df = func(input_df, dataType)

        # 長さが等しいことをチェック (ずれている場合, func の実装がおかしい)
        assert len(_df) == len(input_df), func.__name__
        out_df = pd.concat([out_df, _df], axis=1)
#     out_df = utils.reduce_mem_usage(out_df)
    
    return out_df

In [24]:
train_df = to_feature(train, dataType = 'train')
test_df = to_feature(test, dataType = 'test')

  0%|          | 0/7 [00:00<?, ?it/s]

get_raw_features  0.021[s]
get_simple_calc_features  12.110[s]
get_diff_shift_features  2.153[s]
get_agg_features  1.038[s]
get_category_features  2.382[s]
get_filter_features  41.261[s]
get_sqrt_features  5.544[s]


  0%|          | 0/7 [00:00<?, ?it/s]

get_raw_features  0.014[s]
get_simple_calc_features  8.089[s]
get_diff_shift_features  1.267[s]
get_agg_features  0.623[s]
get_category_features  1.528[s]
get_filter_features  27.345[s]
get_sqrt_features  3.568[s]


In [25]:
train_category_col = ['R_C']
train_value_col = [i for i in train_df.columns.to_list() if i not in train_category_col]

In [26]:
norm_features = train_value_col
norm_features = sorted(list(set(train_value_col) - set(['u_out'])), key=norm_features.index)
def norm_scale(train_df, test_df):
    scaler = RobustScaler()
    all_u_in = np.vstack([train_df[norm_features].values, test_df[norm_features].values])
    scaler.fit(all_u_in)
    train_df[norm_features] = scaler.transform(train_df[norm_features].values)
    test_df[norm_features] = scaler.transform(test_df[norm_features].values)
    return train_df, test_df

In [27]:
train_df, test_df = norm_scale(train_df, test_df)

In [28]:
train_df = utils.reduce_mem_usage(train_df)
test_df = utils.reduce_mem_usage(test_df)

Mem. usage decreased from 2809.11 Mb to 690.77 Mb (75.4% reduction)
Mem. usage decreased from 1872.74 Mb to 460.51 Mb (75.4% reduction)


In [29]:
train_df = pd.concat([train_df, train[['id', 'breath_id', 'pressure']]], axis=1)
test_df = pd.concat([test_df, test[['id', 'breath_id']]], axis=1)
test_df['pressure'] = train_df['pressure'].values[-1]

In [30]:
unique_targets = sorted(train_df['pressure'].unique().tolist())
target_dic = {v:i for i, v in enumerate(sorted(train_df['pressure'].unique().tolist()))}
target_dic_inv = {v: k for k, v in target_dic.items()}

In [31]:
display(train_df), display(test_df)

Unnamed: 0,time_step,u_in,u_out,time_delta,delta,area,cross,cross2,u_in_cumsum,u_in_cummean,...,u_in_sqrt_filter_diff_1,u_in_sqrt_filter_lag_2,u_in_sqrt_filter_diff_2,u_in_sqrt_filter_lag_3,u_in_sqrt_filter_diff_3,u_in_sqrt_filter_lag_4,u_in_sqrt_filter_diff_4,id,breath_id,pressure
0,-5.199219,-0.705078,0,-5.199219,-5.199219,-5.199219,-5.199219,-5.199219,-2.421875,-2.386719,...,1.313477,-5.199219,1.064453,-5.199219,0.899414,-5.199219,0.801758,1,1,5.837492
1,-2.054688,1.299805,0,0.122742,1.328125,-1.705078,-5.199219,-5.199219,-1.765625,0.221191,...,2.304688,-5.199219,2.093750,-5.199219,2.060547,-5.199219,2.039062,2,1,5.907794
2,-1.839844,1.403320,0,0.288330,1.438477,-1.396484,-5.199219,-5.199219,-1.455078,0.575195,...,1.482422,-0.621094,2.117188,-5.199219,2.115234,-5.199219,2.099609,3,1,7.876254
3,-1.687500,1.411133,0,0.634766,1.448242,-1.141602,-5.199219,-5.199219,-1.192383,0.711914,...,0.613281,1.307617,1.240234,-0.591309,2.083984,-5.199219,2.105469,4,1,11.742872
4,-1.564453,1.470703,0,1.088867,1.506836,-0.936035,-5.199219,-5.199219,-0.983398,0.813477,...,1.282227,1.416016,1.067383,1.305664,1.226562,-0.552734,2.101562,5,1,12.234987
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035995,1.671875,-0.498779,1,-0.012794,-0.483887,0.397461,0.270996,1.673828,0.368408,-0.388916,...,0.584473,-0.449219,0.586914,-0.416748,0.460938,-0.382812,0.370361,6035996,125749,3.869032
6035996,1.810547,-0.499023,1,-0.054321,-0.484375,0.401855,0.270752,1.806641,0.372559,-0.398682,...,-0.766113,-0.438477,0.419189,-0.417480,0.466309,-0.383545,0.377441,6035997,125749,3.869032
6035997,1.924805,-0.488770,1,0.012169,-0.474854,0.406738,0.275879,1.919922,0.376953,-0.408447,...,0.583984,-0.428223,0.418945,-0.406738,0.465820,-0.384033,0.480713,6035998,125749,3.798729
6035998,2.042969,-0.535156,1,-0.067993,-0.520508,0.410400,0.247803,2.041016,0.380371,-0.418457,...,-1.148438,-0.428223,-0.898438,-0.395996,-0.821289,-0.372803,-0.741699,6035999,125749,4.079938


Unnamed: 0,time_step,u_in,u_out,time_delta,delta,area,cross,cross2,u_in_cumsum,u_in_cummean,...,u_in_sqrt_filter_diff_1,u_in_sqrt_filter_lag_2,u_in_sqrt_filter_diff_2,u_in_sqrt_filter_lag_3,u_in_sqrt_filter_diff_3,u_in_sqrt_filter_lag_4,u_in_sqrt_filter_diff_4,id,breath_id,pressure
0,-5.199219,-5.199219,0,-5.199219,-5.199219,-5.199219,-5.199219,-5.199219,-5.199219,-5.199219,...,-0.447266,-5.199219,-0.414307,-5.199219,-0.407471,-5.199219,-0.397949,1,0,3.869032
1,-2.144531,0.867676,0,-0.609863,0.874023,-1.914062,-5.199219,-5.199219,-1.990234,-0.710938,...,2.126953,-5.199219,1.911133,-5.199219,1.857422,-5.199219,1.793945,2,0,3.869032
2,-1.899414,1.189453,0,-0.520508,1.187500,-1.667969,-5.199219,-5.199219,-1.710938,-0.000099,...,1.815430,-5.199219,2.037109,-5.199219,2.001953,-5.199219,1.966797,3,0,3.869032
3,-1.740234,1.373047,0,-0.520020,1.378906,-1.394531,-5.199219,-5.199219,-1.422852,0.369873,...,1.674805,0.868164,1.792969,-5.199219,2.097656,-5.199219,2.082031,4,0,3.869032
4,-1.616211,1.492188,0,-0.650879,1.488281,-1.125000,-5.199219,-5.199219,-1.137695,0.592773,...,1.526367,1.191406,1.689453,0.873047,1.799805,-5.199219,2.156250,5,0,3.869032
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4023995,1.783203,0.601562,1,0.202637,0.538086,-0.351807,1.688477,1.780273,-0.347900,-1.128906,...,-0.065613,0.595703,-0.050262,0.601562,-0.053833,0.594727,-0.051025,4023996,125748,3.869032
4023996,1.906250,0.628418,1,0.188110,0.541016,-0.326904,1.788086,1.895508,-0.325928,-1.114258,...,-0.093079,0.625000,-0.076782,0.631836,-0.081970,0.625977,-0.079468,4023997,125748,3.869032
4023997,2.011719,0.647461,1,0.099731,0.520508,-0.301514,1.873047,2.011719,-0.303223,-1.100586,...,-0.111511,0.647461,-0.095032,0.651855,-0.100586,0.646973,-0.099060,4023998,125748,3.869032
4023998,2.173828,0.665527,1,0.128052,0.534180,-0.276367,1.965820,2.164062,-0.279785,-1.087891,...,-0.125977,0.666992,-0.110596,0.671387,-0.116394,0.665527,-0.114136,4023999,125748,3.869032


(None, None)

In [None]:
oof = np.zeros(len(train_df))
test_preds_lst = []
input_dim = len(train_value_col)
train_df['pred'] = 0

gkf = GroupKFold(n_splits=CFG.n_folds).split(train_df, train_df.pressure, groups=train_df.breath_id)
for fold, (_, valid_idx) in enumerate(gkf):
    train_df.loc[valid_idx, 'fold'] = fold
    
for i, fold in enumerate(range(CFG.n_folds)):
    if i not in CFG.folds:
        continue
    print(f'Fold-{fold}')
    
    trn_df = train_df.query(f"fold!={fold}").reset_index(drop=True)
    val_df = train_df.query(f"fold=={fold}").reset_index(drop=True)
    
    loaders = {
        phase: DataLoader(
            VentilatorDataset(
                df_, train_value_col, train_category_col, target_dic
            ),
            **CFG.loader_params[phase])  # type: ignore
        for phase, df_ in zip(["train", "valid", "test"], [trn_df, val_df, test_df])
    }
    

    model = VentilatorModel(input_dim)
    model_name = model.__class__.__name__
    

    num_train_steps = int(len(loaders['train']) * CFG.epochs)
    num_warmup_steps = int(num_train_steps / 10)
    learner = Learner(model, num_train_steps, num_warmup_steps, target_dic_inv, unique_targets)
    
    # loggers
    RUN_NAME = f'exp{str(CFG.exp_num)}'
    wandb.init(project='Ventilator-Pressure-Prediction', entity='sqrt4kaido', group=RUN_NAME, job_type=RUN_NAME + f'-fold-{i}')
    wandb.run.name = RUN_NAME + f'-fold-{i}'
    wandb_config = wandb.config
    wandb_config.model_name = model_name
    wandb.watch(model)
    
    # callbacks
    callbacks = []
    checkpoint_callback = ModelCheckpoint(
        monitor=f'custom_mae/val',
        mode='min',
        dirpath=OUTPUT_DIR,
        verbose=False,
        save_weights_only=True,
        filename=f'{model_name}-{learner.current_epoch}-{i}')
    callbacks.append(checkpoint_callback)
    
    loggers = []
    loggers.append(WandbLogger())
    
    trainer = pl.Trainer(
        logger=loggers,
        callbacks=callbacks,
        max_epochs=CFG.epochs,
        default_root_dir=OUTPUT_DIR,
        gpus=1,
#         fast_dev_run=DEBUG,
        deterministic=True,
        benchmark=False,
        )
    
    trainer.fit(learner, train_dataloader=loaders['train'], val_dataloaders=loaders['valid'])
#     trainer.save_checkpoint(OUTPUT_DIR / "last.ckpt")
    print('train done.')

    ############
    # validation (to make oof)
    #############
    checkpoint = torch.load(checkpoint_callback.best_model_path)
    learner.load_state_dict(checkpoint['state_dict'])
    model = model.to(device)
    
    valid_loss, valid_predict = valid_loop(model, loaders['valid'], target_dic_inv)
    valid_score_mask = compute_metric(valid_predict, val_df['pressure'].values, val_df['u_out'].values)
    print(f"fold = {fold}, valid mask score = {valid_score_mask}:")
    train_df.loc[train_df['fold'] == fold, 'pred'] = valid_predict
    train_df.loc[train_df['fold'] == fold, ['id', 'pred']].to_csv(OUTPUT_DIR / f"oof_f{fold}.csv", index=None)
    
    
    #############
    # inference
    #############

    test_preds = test_loop(model, loaders['test'], target_dic_inv)
    test_preds_lst.append(test_preds)
    
    sub_df['pressure'] = test_preds
    sub_df.to_csv(OUTPUT_DIR / f"sub_f{fold}.csv", index=None)
    
wandb.finish()

Fold-0
init LSTM(64, 256, num_layers=4, batch_first=True, dropout=0.1, bidirectional=True)


[34m[1mwandb[0m: Currently logged in as: [33msqrt4kaido[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.4 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type            | Params
------------------------------------------
0 | model | VentilatorModel | 6.1 M 
------------------------------------------
6.1 M     Trainable params
0         Non-trainable params
6.1 M     Total params
24.583    Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

epoch = 0, custom_mae = 14.739346742507172


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

epoch = 0, custom_mae = 2.3428055221765187


Validating: 0it [00:00, ?it/s]

epoch = 1, custom_mae = 1.3684774388780478


Validating: 0it [00:00, ?it/s]

epoch = 2, custom_mae = 0.9401840421627322


Validating: 0it [00:00, ?it/s]

epoch = 3, custom_mae = 0.7756997043857263


Validating: 0it [00:00, ?it/s]

epoch = 4, custom_mae = 0.8054295704072865


Validating: 0it [00:00, ?it/s]

epoch = 5, custom_mae = 0.6554383768002591


Validating: 0it [00:00, ?it/s]

epoch = 6, custom_mae = 0.5099353259775065


Validating: 0it [00:00, ?it/s]

epoch = 7, custom_mae = 0.49170323787622405


Validating: 0it [00:00, ?it/s]

epoch = 8, custom_mae = 0.6005833600209335


Validating: 0it [00:00, ?it/s]

epoch = 9, custom_mae = 0.4490593494770285


Validating: 0it [00:00, ?it/s]

epoch = 10, custom_mae = 0.4256645468973222


Validating: 0it [00:00, ?it/s]

epoch = 11, custom_mae = 0.5042535415084246


Validating: 0it [00:00, ?it/s]

epoch = 12, custom_mae = 0.3861288543639601


Validating: 0it [00:00, ?it/s]

epoch = 13, custom_mae = 0.3906744265522585


Validating: 0it [00:00, ?it/s]

epoch = 14, custom_mae = 0.36383862179334525


Validating: 0it [00:00, ?it/s]

epoch = 15, custom_mae = 0.4028886071633889


Validating: 0it [00:00, ?it/s]

epoch = 16, custom_mae = 0.6420740523763412


Validating: 0it [00:00, ?it/s]

epoch = 17, custom_mae = 0.3539619647105969


Validating: 0it [00:00, ?it/s]

epoch = 18, custom_mae = 0.3817360889289093


Validating: 0it [00:00, ?it/s]

epoch = 19, custom_mae = 0.33020681783981165


Validating: 0it [00:00, ?it/s]

epoch = 20, custom_mae = 0.35478753648384564


Validating: 0it [00:00, ?it/s]

epoch = 21, custom_mae = 0.30584965716839163


Validating: 0it [00:00, ?it/s]

epoch = 22, custom_mae = 0.2809033520854086


Validating: 0it [00:00, ?it/s]

epoch = 23, custom_mae = 0.28745540984485524


Validating: 0it [00:00, ?it/s]

epoch = 24, custom_mae = 0.3400802906768724


Validating: 0it [00:00, ?it/s]

epoch = 25, custom_mae = 0.2681843607097175


Validating: 0it [00:00, ?it/s]

epoch = 26, custom_mae = 0.268857190997987


Validating: 0it [00:00, ?it/s]

epoch = 27, custom_mae = 0.28845146969547236


Validating: 0it [00:00, ?it/s]

epoch = 28, custom_mae = 0.25806381164717784


Validating: 0it [00:00, ?it/s]

epoch = 29, custom_mae = 0.269201924290377


Validating: 0it [00:00, ?it/s]

epoch = 30, custom_mae = 0.250872564956892


Validating: 0it [00:00, ?it/s]

epoch = 31, custom_mae = 0.26373704153082334


Validating: 0it [00:00, ?it/s]

epoch = 32, custom_mae = 0.23682147804712983


Validating: 0it [00:00, ?it/s]

epoch = 33, custom_mae = 0.23680401912667237


Validating: 0it [00:00, ?it/s]

epoch = 34, custom_mae = 0.23319161750692086


Validating: 0it [00:00, ?it/s]

epoch = 35, custom_mae = 0.21663786127622173


Validating: 0it [00:00, ?it/s]

epoch = 36, custom_mae = 0.2110460280415689


Validating: 0it [00:00, ?it/s]

epoch = 37, custom_mae = 0.21155406112642416


Validating: 0it [00:00, ?it/s]

epoch = 38, custom_mae = 0.20248509689650834


Validating: 0it [00:00, ?it/s]

epoch = 39, custom_mae = 0.20409091961121456


Validating: 0it [00:00, ?it/s]

epoch = 40, custom_mae = 0.1999325876465703


Validating: 0it [00:00, ?it/s]

epoch = 41, custom_mae = 0.19655344722210738


Validating: 0it [00:00, ?it/s]

epoch = 42, custom_mae = 0.19360751962251896


Validating: 0it [00:00, ?it/s]

epoch = 43, custom_mae = 0.19366909718899272
train done.




In [35]:
valid_score_mask = compute_metric(train_df['pred'].values, train_df['pressure'].values, train_df['u_out'].values)
print("CV:", valid_score_mask)

CV: 0.16472824073225262


In [None]:
oof_df = train_df.loc[:, ['id', 'pred']]
oof_df.to_csv(OUTPUT_DIR / "oof_total.csv", index=None)

In [None]:
sub_df['pressure'] = np.stack(test_preds_lst).mean(0)
sub_df.to_csv(OUTPUT_DIR / "submission_mean.csv", index=None)

sub_df['pressure'] = np.median(np.stack(test_preds_lst), axis=0)
sub_df.to_csv(OUTPUT_DIR / "submission_median.csv", index=None)

# Post Processing: https://www.kaggle.com/snnclsr/a-dummy-approach-to-improve-your-score-postprocess
unique_pressures = train_df["pressure"].unique()
sorted_pressures = np.sort(unique_pressures)
total_pressures_len = len(sorted_pressures)

def find_nearest(prediction):
    insert_idx = np.searchsorted(sorted_pressures, prediction)
    if insert_idx == total_pressures_len:
        # If the predicted value is bigger than the highest pressure in the train dataset,
        # return the max value.
        return sorted_pressures[-1]
    elif insert_idx == 0:
        # Same control but for the lower bound.
        return sorted_pressures[0]
    lower_val = sorted_pressures[insert_idx - 1]
    upper_val = sorted_pressures[insert_idx]
    return lower_val if abs(lower_val - prediction) < abs(upper_val - prediction) else upper_val

sub_df = pd.read_csv(OUTPUT_DIR / "submission_mean.csv")
sub_df["pressure"] = sub_df["pressure"].apply(find_nearest)
sub_df.to_csv(OUTPUT_DIR / "submission_mean_pp.csv", index=None)

sub_df = pd.read_csv(OUTPUT_DIR / "submission_median.csv")
sub_df["pressure"] = sub_df["pressure"].apply(find_nearest)
sub_df.to_csv(OUTPUT_DIR / "submission_median_pp.csv", index=None)