In [1]:
import sys
from time import time
import numpy as np
import pandas as pd
from pathlib import Path
import lightgbm as lgb
import matplotlib.pyplot as plt 
import seaborn as sns
from tqdm import tqdm
import copy
import wandb
from collections import OrderedDict

from sklearn.metrics import mean_absolute_error
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler, RobustScaler

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as torchdata

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.loggers import WandbLogger


In [2]:
sys.path.append('../../src/')
import utils as utils
from utils import Timer

In [3]:
class CFG:
    seed = 42
    exp_num = 7
    local = True
    n_folds = 5
    folds = [0]
    debug = False
    bias = 1000
    epochs = 200

    
    ######################
    # Dataset #
    ######################
    transforms = {
        "train": [{"name": ""}],
        "valid": [{"name": ""}],
        "test": [{"name": ""}]
    }

    ######################
    # Loaders #
    ######################
    loader_params = {
        "train": {
            'batch_size': 128,
            'shuffle': True,
            'num_workers': 8,
            'pin_memory': True,
            'drop_last': True,
        },
        "valid": {
            'batch_size': 32,
            'shuffle': False,
            'num_workers': 8,
            'pin_memory': True,
            'drop_last': False,
        },
        "test": {
            'batch_size': 32,
            'shuffle': False,
            'num_workers': 8,
            'pin_memory': True,
            'drop_last': False,
        }
    }

    ######################
    # Split #
    ######################
    split = "GroupKFold"
    split_params = {
        "n_splits": 5,
    }

    ######################
    # Model #
    ######################
    input_dim = 5

    dense_dim = 512
    lstm_dim = 512
    logit_dim = 512
    num_classes = 1

    ######################
    # Criterion #
    ######################
#     loss_name = "rmspe_loss"
#     loss_params: dict = {}

    ######################
    # Optimizer #
    ######################
    optimizer_name = "Adam"
    optimizer_params = {
        "lr": 0.001
    }

    ######################
    # Scheduler #
    ######################
    scheduler_name = "ReduceLROnPlateau"
    scheduler_params = {
        'factor': 0.2, 
        'patience': 7
    }

In [4]:
utils.set_seed(CFG.seed)

In [5]:
if CFG.local:
    DATA_DIR = Path("/home/knikaido/work/Ventilator-Pressure-Prediction/data/ventilator-pressure-prediction")
    OUTPUT_DIR = Path('./output/')
else:
    DATA_DIR = Path("../input/ventilator-pressure-prediction")
    OUTPUT_DIR = Path('')   

In [6]:
def get_transforms(phase: str):
    transforms = CFG.transforms
    if transforms is None:
        return None
    else:
        if transforms[phase] is None:
            return None
        trns_list = []
        for trns_conf in transforms[phase]:
            trns_name = trns_conf["name"]
            trns_params = {} if trns_conf.get("params") is None else \
                trns_conf["params"]
            if globals().get(trns_name) is not None:
                trns_cls = globals()[trns_name]
                trns_list.append(trns_cls(**trns_params))

        if len(trns_list) > 0:
            return Compose(trns_list)
        else:
            return None
        
        
class Normalize:
    def __call__(self, y: np.ndarray):
        max_vol = np.abs(y).max()
        y_vol = y * 1 / max_vol
        return np.asfortranarray(y_vol)


class Compose:
    def __init__(self, transforms: list):
        self.transforms = transforms

    def __call__(self, y: np.ndarray):
        for trns in self.transforms:
            y = trns(y)
        return y

In [7]:
def init_layer(layer):
    nn.init.xavier_uniform_(layer.weight)

    if hasattr(layer, "bias"):
        if layer.bias is not None:
            layer.bias.data.fill_(0.)


def init_bn(bn):
    bn.bias.data.fill_(0.)
    bn.weight.data.fill_(1.0)


def init_weights(model):
    classname = model.__class__.__name__
    if classname.find("Conv2d") != -1:
        nn.init.xavier_uniform_(model.weight, gain=np.sqrt(2))
        model.bias.data.fill_(0)
    elif classname.find("BatchNorm") != -1:
        model.weight.data.normal_(1.0, 0.02)
        model.bias.data.fill_(0)
    elif classname.find("GRU") != -1:
        for weight in model.parameters():
            if len(weight.size()) > 1:
                nn.init.orghogonal_(weight.data)
    elif classname.find("Linear") != -1:
        model.weight.data.normal_(0, 0.01)
        model.bias.data.zero_()

In [8]:
def compute_metric(preds, trues, u_outs):
    """
    Metric for the problem, as I understood it.
    """
    
    y = trues
    w = 1 - u_outs
    
    assert y.shape == preds.shape and w.shape == y.shape, (y.shape, preds.shape, w.shape)
    
    mae = w * np.abs(y - preds)
    mae = mae.sum() / w.sum()
    
    return mae


class VentilatorLoss(nn.Module):
    """
    Directly optimizes the competition metric
    """
    def __call__(self, preds, y, u_out):
        w = 1 - u_out
        mae = w * (y - preds).abs()
        mae = mae.sum(-1) / w.sum(-1)

        return mae

In [9]:
def get_criterion():
    return VentilatorLoss()

In [10]:
# Custom optimizer
__OPTIMIZERS__ = {}


def get_optimizer(model: nn.Module):
    optimizer_name = CFG.optimizer_name
    if optimizer_name == "SAM":
        base_optimizer_name = CFG.base_optimizer
        if __OPTIMIZERS__.get(base_optimizer_name) is not None:
            base_optimizer = __OPTIMIZERS__[base_optimizer_name]
        else:
            base_optimizer = optim.__getattribute__(base_optimizer_name)
        return SAM(model.parameters(), base_optimizer, **CFG.optimizer_params)

    if __OPTIMIZERS__.get(optimizer_name) is not None:
        return __OPTIMIZERS__[optimizer_name](model.parameters(),
                                              **CFG.optimizer_params)
    else:
        return optim.__getattribute__(optimizer_name)(model.parameters(),
                                                      **CFG.optimizer_params)


def get_scheduler(optimizer):
    scheduler_name = CFG.scheduler_name

    if scheduler_name is None:
        return
    else:
        return optim.lr_scheduler.__getattribute__(scheduler_name)(
            optimizer, **CFG.scheduler_params)

In [11]:
# validation
splitter = getattr(model_selection, CFG.split)(**CFG.split_params)

In [12]:
class VentilatorDataset(torchdata.Dataset):
    def __init__(self, df, train_col):
        if "pressure" not in df.columns:
            df['pressure'] = 0
        self.df = df
        self.groups = df.groupby('breath_id').groups
        self.keys = list(self.groups.keys())
        self.train_col = train_col
        
    def __len__(self):
        return len(self.groups)

    def __getitem__(self, idx):
        indexes = self.groups[self.keys[idx]]
        df_ = self.df.iloc[indexes]
        
        input_ = df_[self.train_col].values
        u_out_ = df_['u_out'].values
        p_ = df_['pressure'].values

        data = {
            "input": input_.astype(np.float32),
            "u_out": u_out_.astype(np.float32),
            "p": p_.astype(np.float32),
        }
        
        return data

In [33]:
class RNNModel(nn.Module):
    def __init__(
        self,
        input_dim=4,
        lstm_dim=256,
        dense_dim=256,
        logit_dim=256,
        num_classes=1,
    ):
        super().__init__()

        self.mlp = nn.Sequential(
            nn.Linear(input_dim, dense_dim // 2),
            nn.ReLU(),
            nn.Linear(dense_dim // 2, dense_dim),
            nn.ReLU(),
        )

        self.lstm = nn.LSTM(dense_dim, lstm_dim, batch_first=True, bidirectional=True)

        self.logits = nn.Sequential(
            nn.Linear(lstm_dim * 2, logit_dim),
            nn.ReLU(),
            nn.Linear(logit_dim, num_classes),
        )
        
        self.init_weight()

    def init_weight(self):
        init_weights(self.mlp)
        init_weights(self.logits)

    def forward(self, x):
        features = self.mlp(x)
        features, _ = self.lstm(features)
        pred = self.logits(features)
        return pred

In [14]:
# Learner class(pytorch-lighting)
class Learner(pl.LightningModule):
    def __init__(self, model):
        super().__init__()
        self.model = model
        self.criterion = get_criterion()
    
    def training_step(self, batch, batch_idx):
        d_ = batch
        output = self.model(d_['input'])
        loss = self.criterion(output.view(-1), d_['p'].view(-1), d_['u_out'].view(-1))
        return loss
    
    def validation_step(self, batch, batch_idx):
        d_ = batch
        output = self.model(d_['input'])
        loss = self.criterion(output.view(-1), d_['p'].view(-1), d_['u_out'].view(-1))
        
        self.log(f'Loss/val', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True)
        
        output = OrderedDict({
            "targets": d_['p'].detach(), "preds": output.detach(), "u_outs": d_['u_out'].detach(), "loss": loss.detach()
        })
        return output
    
    def validation_epoch_end(self, outputs):

        targets = torch.cat([o["targets"].view(-1) for o in outputs]).cpu().numpy()
        preds = torch.cat([o["preds"].view(-1) for o in outputs]).cpu().numpy()
        u_outs = torch.cat([o["u_outs"].view(-1) for o in outputs]).cpu().numpy()

        score = get_score(preds, targets, u_outs)
        self.log(f'custom_mae/val', score, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        print(f'epoch = {self.current_epoch}, custom_mae = {score}')

    def configure_optimizers(self):
        optimizer = get_optimizer(self.model)
        scheduler = get_scheduler(optimizer)
        return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "Loss/val"}

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [16]:
def get_score(y_pred, y_true, u_outs):
    return compute_metric(y_pred, y_true, u_outs)


def to_np(input):
    return input.detach().cpu().numpy()

# oof
def evaluate(model, loaders, phase):
    model.eval()
    pred_list = []
    target_list = []
    with torch.no_grad():
        for batch in loaders[phase]:
            d_ = batch
            d_['input'] = d_['input'].to(device)
            output = model(d_['input'])
#             output = nn.Softmax(dim=1)(output)
            pred_list.append(to_np(output))
            target_list.append(to_np(d_['p']))

    pred_list = np.concatenate(pred_list).reshape(-1)
    target_list = np.concatenate(target_list).reshape(-1)
    model.train()
    return pred_list, target_list

In [17]:
def extract_25(df):
    return df.iloc[:25]

In [18]:
train = pd.read_csv(DATA_DIR / 'train.csv')
test = pd.read_csv(DATA_DIR / 'test.csv')
# train = train.groupby('breath_id').apply(extract_25).reset_index(drop=True)
# test = test.groupby('breath_id').apply(extract_25).reset_index(drop=True)
display(train), display(test)

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure
0,1,1,20,50,0.000000,0.083334,0,5.837492
1,2,1,20,50,0.033652,18.383041,0,5.907794
2,3,1,20,50,0.067514,22.509278,0,7.876254
3,4,1,20,50,0.101542,22.808822,0,11.742872
4,5,1,20,50,0.135756,25.355850,0,12.234987
...,...,...,...,...,...,...,...,...
6035995,6035996,125749,50,10,2.504603,1.489714,1,3.869032
6035996,6035997,125749,50,10,2.537961,1.488497,1,3.869032
6035997,6035998,125749,50,10,2.571408,1.558978,1,3.798729
6035998,6035999,125749,50,10,2.604744,1.272663,1,4.079938


Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out
0,1,0,5,20,0.000000,0.000000,0
1,2,0,5,20,0.031904,7.515046,0
2,3,0,5,20,0.063827,14.651675,0
3,4,0,5,20,0.095751,21.230610,0
4,5,0,5,20,0.127644,26.320956,0
...,...,...,...,...,...,...,...
4023995,4023996,125748,20,10,2.530117,4.971245,1
4023996,4023997,125748,20,10,2.563853,4.975709,1
4023997,4023998,125748,20,10,2.597475,4.979468,1
4023998,4023999,125748,20,10,2.631134,4.982648,1


(None, None)

In [19]:
def get_raw_features(input_df, dataType = 'train'):
    colum = ['time_step', 'u_in']

    return input_df[colum]

In [20]:
def get_simple_calc_features(input_df, dataType = 'train'):
    
    output_df = copy.deepcopy(input_df)
    c_num = input_df.shape[1]
    
    output_df['cross']= output_df['u_in'] * output_df['u_out']
    output_df['cross2']= output_df['time_step'] * output_df['u_out']
    
    output_df['u_out_feature'] = output_df['u_out']
    
    return output_df.iloc[:, c_num:]

In [21]:
def get_category_features(input_df, dataType = 'train'):
    output_df = copy.deepcopy(input_df)
    colum = ['R', 'C']
    
#     output_df['R_C'] = output_df['R'] + output_df['C'] * 10

    return output_df[colum].astype('category')

In [22]:
def get_diff_shift_features(input_df, dataType = 'train'):
    
    output_df = copy.deepcopy(input_df)
    c_num = input_df.shape[1]
    
    b_id_gby = input_df.groupby(['breath_id'])
    shift_idx = [-2, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    
    for i in shift_idx:
        output_df[f'u_in_diff_{i}'] = b_id_gby['u_in'].diff(i)
        output_df[f'u_in_shift_{i}'] = b_id_gby['u_in'].shift(i)

        output_df[f'time_step_diff_{i}'] = b_id_gby['time_step'].diff(i)
        output_df[f'time_step_shift_{i}'] = b_id_gby['time_step'].shift(i)
        
#     output_df['area_u_in'] = output_df['u_in'] * output_df['time_step_diff_1']
#     output_df['area_u_in_abs'] = output_df['u_in_diff_1'] * output_df['time_step_diff_1']
#     output_df['uin_in_time'] = output_df['u_in_diff_1'] / output_df['time_step_diff_1']
    
    return output_df.iloc[:, c_num:]

In [23]:
def get_cum_features(input_df, dataType = 'train'):
    
    output_df = copy.deepcopy(input_df)
    c_num = input_df.shape[1]
    
    b_id_gby = input_df.groupby(['breath_id'])
    
    output_df['u_in_cumsum'] = b_id_gby['u_in'].cumsum()
    output_df['time_step_cumsum'] = b_id_gby['time_step'].cumsum()
    
    
    return output_df.iloc[:, c_num:]

In [24]:
def get_agg_features(input_df, dataType = 'train'):
    
    output_df = copy.deepcopy(input_df)
    c_num = input_df.shape[1]
    
    # Dict for aggregations
    create_feature_dict = {
        'u_in': [np.max, np.std, np.mean, 'first', 'last'],
    }
    
    def get_agg_window(start_time=0, end_time=3.0, add_suffix = False):
        
        df_tgt = output_df[(output_df['time_step'] >= start_time) & (output_df['time_step'] <= end_time)]
        df_feature = df_tgt.groupby(['breath_id']).agg(create_feature_dict)
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(start_time) + '_' + str(end_time))
            
        return df_feature
    
    df_agg_feature = get_agg_window().reset_index()
    
#     df_tmp = get_agg_window(start_time = 2, add_suffix = True).reset_index()
#     df_agg_feature = df_agg_feature.merge(df_tmp, how = 'left', on = 'breath_id')
#     df_tmp = get_agg_window(start_time = 1, add_suffix = True).reset_index()
#     df_agg_feature = df_agg_feature.merge(df_tmp, how = 'left', on = 'breath_id')
#     df_tmp = get_agg_window(end_time = 1, add_suffix = True).reset_index()
#     df_agg_feature = df_agg_feature.merge(df_tmp, how = 'left', on = 'breath_id')
#     df_tmp = get_agg_window(end_time = 2, add_suffix = True).reset_index()
#     df_agg_feature = df_agg_feature.merge(df_tmp, how = 'left', on = 'breath_id')

    output_df = pd.merge(output_df, df_agg_feature, how='left', on='breath_id')
    
    
    return output_df.iloc[:, c_num:]

In [25]:
def to_feature(input_df, dataType = 'train'):
    """input_df を特徴量行列に変換した新しいデータフレームを返す.
    """

    processors = [
        get_raw_features,
#         get_simple_calc_features,
        get_category_features,
        get_diff_shift_features,
        get_cum_features,
        get_agg_features
    ]

    out_df = pd.DataFrame()

    for func in tqdm(processors, total=len(processors)):
        with Timer(prefix='' + func.__name__ + ' '):
            _df = func(input_df, dataType)

        # 長さが等しいことをチェック (ずれている場合, func の実装がおかしい)
        assert len(_df) == len(input_df), func.__name__
        out_df = pd.concat([out_df, _df], axis=1)
#     out_df = utils.reduce_mem_usage(out_df)
    
    return out_df

In [26]:
train_df = to_feature(train, dataType = 'train')
test_df = to_feature(test, dataType = 'test')

 40%|████      | 2/5 [00:00<00:00, 10.05it/s]

get_raw_features  0.014[s]
get_category_features  0.153[s]


 40%|████      | 2/5 [00:20<00:00, 10.05it/s]

get_diff_shift_features  273.907[s]


 60%|██████    | 3/5 [04:35<03:49, 114.61s/it]

get_cum_features  0.191[s]


 80%|████████  | 4/5 [04:36<01:12, 72.33s/it] 

get_agg_features  1.149[s]


100%|██████████| 5/5 [04:38<00:00, 55.74s/it]
 40%|████      | 2/5 [00:00<00:00, 16.72it/s]

get_raw_features  0.010[s]
get_category_features  0.089[s]


 40%|████      | 2/5 [00:15<00:00, 16.72it/s]

get_diff_shift_features  182.492[s]


 60%|██████    | 3/5 [03:03<02:32, 76.36s/it]

get_cum_features  0.123[s]


 80%|████████  | 4/5 [03:04<00:48, 48.19s/it]

get_agg_features  0.712[s]


100%|██████████| 5/5 [03:05<00:00, 37.13s/it]


In [27]:
ss = StandardScaler()
ss.fit(train_df)

train_df = pd.DataFrame(ss.transform(train_df), columns=list(train_df.columns))
train_mean = train_df.mean()
train_df = train_df.fillna(train_df.mean())

test_df = pd.DataFrame(ss.transform(test_df), columns=list(test_df.columns))
test_df = test_df.fillna(train_mean)

In [28]:
train_df = utils.reduce_mem_usage(train_df)
test_df = utils.reduce_mem_usage(test_df)

Mem. usage decreased from 2717.01 Mb to 679.25 Mb (75.0% reduction)
Mem. usage decreased from 1811.34 Mb to 452.84 Mb (75.0% reduction)


In [29]:
display(train_df), display(test_df)

Unnamed: 0,time_step,u_in,R,C,u_in_diff_-2,u_in_shift_-2,time_step_diff_-2,time_step_shift_-2,u_in_diff_-1,u_in_shift_-1,...,u_in_shift_10,time_step_diff_10,time_step_shift_10,u_in_cumsum,time_step_cumsum,u_in_amax,u_in_std,u_in_mean,u_in_first,u_in_last
0,-1.707031,-0.538574,-0.359131,1.394531,-2.503906,1.240234,-0.566895,-1.704102,-2.511719,0.844238,...,-0.000000,-0.000000,-0.000000,-0.980469,-1.116211,-0.245361,0.119324,0.513672,-0.550293,0.28125
1,-1.663086,0.823242,-0.359131,1.394531,-0.530762,1.263672,-0.729004,-1.658203,-0.578613,1.157227,...,-0.000000,-0.000000,-0.000000,-0.936523,-1.115234,-0.245361,0.119324,0.513672,-0.550293,0.28125
2,-1.618164,1.130859,-0.359131,1.394531,-0.357666,1.466797,-0.879883,-1.612305,-0.056763,1.179688,...,-0.000000,-0.000000,-0.000000,-0.881836,-1.113281,-0.245361,0.119324,0.513672,-0.550293,0.28125
3,-1.574219,1.152344,-0.359131,1.394531,-0.533691,1.619141,-0.843262,-1.567383,-0.363281,1.373047,...,-0.000000,-0.000000,-0.000000,-0.826660,-1.110352,-0.245361,0.119324,0.513672,-0.550293,0.28125
4,-1.529297,1.342773,-0.359131,1.394531,-0.239868,1.608398,-0.755859,-1.521484,-0.275635,1.516602,...,-0.000000,-0.000000,-0.000000,-0.765625,-1.105469,-0.245361,0.119324,0.513672,-0.550293,0.28125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035995,1.563477,-0.434082,1.171875,-0.937500,-0.053284,-0.431885,-0.261230,1.648438,-0.015732,-0.435547,...,-0.454102,0.284668,1.535156,-0.046173,1.893555,-0.334717,-0.430176,-0.438232,-0.313477,-3.25000
6035996,1.606445,-0.434082,1.171875,-0.937500,-0.022034,-0.454834,-0.252441,1.692383,-0.025513,-0.430176,...,-0.455078,0.272217,1.584961,-0.042572,1.973633,-0.334717,-0.430176,-0.438232,-0.313477,-3.25000
6035997,1.650391,-0.428955,1.171875,-0.937500,-0.037323,-0.437988,-0.177490,1.737305,0.023163,-0.451904,...,-0.445801,0.265381,1.634766,-0.038818,2.054688,-0.334717,-0.430176,-0.438232,-0.313477,-3.25000
6035998,1.694336,-0.450195,1.171875,-0.937500,0.000000,0.000000,0.000000,-0.000000,-0.044556,-0.436035,...,-0.451172,0.258545,1.684570,-0.035736,2.136719,-0.334717,-0.430176,-0.438232,-0.313477,-3.25000


Unnamed: 0,time_step,u_in,R,C,u_in_diff_-2,u_in_shift_-2,time_step_diff_-2,time_step_shift_-2,u_in_diff_-1,u_in_shift_-1,...,u_in_shift_10,time_step_diff_10,time_step_shift_10,u_in_cumsum,time_step_cumsum,u_in_amax,u_in_std,u_in_mean,u_in_first,u_in_last
0,-1.707031,-0.544922,-1.125000,-0.354492,-1.651367,0.612793,1.020508,-1.708984,-1.041016,0.020981,...,-0.000000,-0.000000,-0.000000,-0.980957,-1.116211,0.048462,0.367432,0.364746,-0.553223,0.267334
1,-1.665039,0.014397,-1.125000,-0.354492,-1.548828,1.137695,1.012695,-1.666016,-0.989258,0.561523,...,-0.000000,-0.000000,-0.000000,-0.962891,-1.115234,0.048462,0.367432,0.364746,-0.553223,0.267334
2,-1.623047,0.545410,-1.125000,-0.354492,-1.325195,1.543945,1.025391,-1.623047,-0.913574,1.060547,...,-0.000000,-0.000000,-0.000000,-0.927246,-1.113281,0.048462,0.367432,0.364746,-0.553223,0.267334
3,-1.582031,1.035156,-1.125000,-0.354492,-1.060547,1.876953,1.030273,-1.581055,-0.710449,1.445312,...,-0.000000,-0.000000,-0.000000,-0.875977,-1.110352,0.048462,0.367432,0.364746,-0.553223,0.267334
4,-1.540039,1.414062,-1.125000,-0.354492,-0.837402,2.121094,1.020508,-1.538086,-0.583984,1.761719,...,-0.000000,-0.000000,-0.000000,-0.812500,-1.106445,0.048462,0.367432,0.364746,-0.553223,0.267334
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4023995,1.596680,-0.174927,-0.359131,-0.937500,-0.046600,-0.159058,-0.499756,1.682617,-0.016495,-0.171387,...,-0.199341,0.649902,1.567383,-0.510742,1.922852,-0.027664,-0.551758,-0.843262,-0.156128,0.279541
4023996,1.640625,-0.174561,-0.359131,-0.937500,-0.046448,-0.158691,-0.466553,1.728516,-0.016403,-0.171143,...,-0.197754,0.667480,1.617188,-0.498779,2.003906,-0.027664,-0.551758,-0.843262,-0.156128,0.279541
4023997,1.684570,-0.174316,-0.359131,-0.937500,-0.046326,-0.158569,-0.701172,1.773438,-0.016327,-0.170898,...,-0.196289,0.646484,1.667969,-0.486572,2.085938,-0.027664,-0.551758,-0.843262,-0.156128,0.279541
4023998,1.728516,-0.174072,-0.359131,-0.937500,0.000000,0.000000,0.000000,-0.000000,-0.016266,-0.170654,...,-0.195068,0.651367,1.717773,-0.474609,2.167969,-0.027664,-0.551758,-0.843262,-0.156128,0.279541


(None, None)

In [30]:
train_col = train_df.columns.to_list()

In [31]:
train_df = pd.concat([train_df, train[['id', 'breath_id', 'pressure', 'u_out']]], axis=1)
test_df = pd.concat([test_df, test[['id', 'breath_id', 'u_out']]], axis=1)

In [None]:
oof_total = np.zeros((len(train), CFG.num_classes))
sub_preds = np.zeros((test.shape[0], len(CFG.folds)))
val_idxes = []
models = []
y = train['pressure']
groups = train['breath_id']
gkfold = model_selection.GroupKFold(n_splits=CFG.n_folds)
scores = []
input_dim = len(train_col)

for i, (trn_idx, val_idx) in enumerate(splitter.split(train_df, y, groups)):
    if i not in CFG.folds:
        continue

    trn_df = train_df.loc[trn_idx, :].reset_index(drop=True)
    val_df = train_df.loc[val_idx, :].reset_index(drop=True)
    trn_y = y.values[trn_idx]
    val_y = y.values[val_idx]
    
    
    loaders = {
        phase: torchdata.DataLoader(
            VentilatorDataset(
                df_, train_col
            ),
            **CFG.loader_params[phase])  # type: ignore
        for phase, df_ in zip(["train", "valid", "test"], [trn_df, val_df, test_df])
    }
    
    
    model = RNNModel(
        input_dim=input_dim,
        lstm_dim=CFG.lstm_dim,
        dense_dim=CFG.dense_dim,
        logit_dim=CFG.logit_dim,
        num_classes=CFG.num_classes,
    )
    model_name = model.__class__.__name__
#     break
    
    learner = Learner(model)
    
    # loggers
    RUN_NAME = f'exp{str(CFG.exp_num)}'
    wandb.init(project='Ventilator-Pressure-Prediction', entity='sqrt4kaido', group=RUN_NAME, job_type=RUN_NAME + f'-fold-{i}')
    wandb.run.name = RUN_NAME + f'-fold-{i}'
    wandb_config = wandb.config
    wandb_config.model_name = model_name
    wandb.watch(model)
    
    # callbacks
    callbacks = []
    checkpoint_callback = ModelCheckpoint(
        monitor=f'Loss/val',
        mode='min',
        dirpath=OUTPUT_DIR,
        verbose=False,
        save_weights_only=True,
        filename=f'{model_name}-{learner.current_epoch}-{i}')
    callbacks.append(checkpoint_callback)

#     early_stop_callback = EarlyStopping(
#         monitor='Loss/val',
#         min_delta=0.00,
#         patience=10,
#         verbose=True,
#         mode='min')
#     callbacks.append(early_stop_callback)
    
    loggers = []
    loggers.append(WandbLogger())
    
    trainer = pl.Trainer(
        logger=loggers,
        callbacks=callbacks,
        max_epochs=CFG.epochs,
        default_root_dir=OUTPUT_DIR,
        gpus=1,
#         fast_dev_run=DEBUG,
        deterministic=True,
        benchmark=False,
        )
    
    trainer.fit(learner, train_dataloader=loaders['train'], val_dataloaders=loaders['valid'])
#     trainer.save_checkpoint(OUTPUT_DIR / "last.ckpt")
    print('train done.')
    
    #############
    # validation (to make oof)
    #############
    checkpoint = torch.load(checkpoint_callback.best_model_path)
    learner.load_state_dict(checkpoint['state_dict'])
    
    model = model.to(device)
    oof_pred, oof_target = evaluate(model, loaders, phase="valid")
    models.append(model)
    
    oof_score = get_score(oof_pred, oof_target, val_df['u_out'].values)
    scores.append(oof_score)
    oof_total[val_idx] = oof_pred.reshape(1, -1).T / CFG.bias
    val_idxes.append(val_idx)
    
    print('validate done.')
    print(f'fold = {i}, auc = {oof_score}')
    wandb.log({'CV_score': oof_score})
    
    #############
    # inference
    #############
    test_pred, _ = evaluate(model, loaders, phase="test")
    sub_preds[:, i] = test_pred
    
    print('inference done.')

# test_preds_total = np.array(test_preds_total)


[34m[1mwandb[0m: Currently logged in as: [33msqrt4kaido[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.2 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  rank_zero_deprecation(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type           | Params
---------------------------------------------
0 | model     | RNNModel       | 4.9 M 
1 | criterion | VentilatorLoss | 0     
---------------------------------------------
4.9 M     Trainable params
0         Non-trainable params
4.9 M     Total params
19.499    Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

epoch = 0, custom_mae = 17.438940048217773


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

epoch = 0, custom_mae = 1.3108384609222412


Validating: 0it [00:00, ?it/s]

epoch = 1, custom_mae = 0.9729192852973938


Validating: 0it [00:00, ?it/s]

epoch = 2, custom_mae = 0.8000661134719849


Validating: 0it [00:00, ?it/s]

epoch = 3, custom_mae = 0.7715524435043335


Validating: 0it [00:00, ?it/s]

epoch = 4, custom_mae = 0.7468957901000977


Validating: 0it [00:00, ?it/s]

epoch = 5, custom_mae = 0.729373037815094


Validating: 0it [00:00, ?it/s]

epoch = 6, custom_mae = 0.8333085179328918


Validating: 0it [00:00, ?it/s]

epoch = 7, custom_mae = 0.6714468598365784


Validating: 0it [00:00, ?it/s]

epoch = 8, custom_mae = 0.6342003345489502


Validating: 0it [00:00, ?it/s]

epoch = 9, custom_mae = 0.6134984493255615


Validating: 0it [00:00, ?it/s]

epoch = 10, custom_mae = 0.6071126461029053


Validating: 0it [00:00, ?it/s]

epoch = 11, custom_mae = 0.568688154220581


Validating: 0it [00:00, ?it/s]

epoch = 12, custom_mae = 0.5967393517494202


Validating: 0it [00:00, ?it/s]

epoch = 13, custom_mae = 0.5665075778961182


Validating: 0it [00:00, ?it/s]

epoch = 14, custom_mae = 0.5472763776779175


Validating: 0it [00:00, ?it/s]

epoch = 15, custom_mae = 0.5139713287353516


Validating: 0it [00:00, ?it/s]

epoch = 16, custom_mae = 0.5596718788146973


Validating: 0it [00:00, ?it/s]

epoch = 17, custom_mae = 0.5046927332878113


Validating: 0it [00:00, ?it/s]

epoch = 18, custom_mae = 0.5137667655944824


Validating: 0it [00:00, ?it/s]

epoch = 19, custom_mae = 0.5469162464141846


Validating: 0it [00:00, ?it/s]

epoch = 20, custom_mae = 0.5049368739128113


Validating: 0it [00:00, ?it/s]

epoch = 21, custom_mae = 0.46642592549324036


Validating: 0it [00:00, ?it/s]

epoch = 22, custom_mae = 0.46191322803497314


Validating: 0it [00:00, ?it/s]

epoch = 23, custom_mae = 0.4935760498046875


Validating: 0it [00:00, ?it/s]

epoch = 24, custom_mae = 0.4807301461696625


Validating: 0it [00:00, ?it/s]

epoch = 25, custom_mae = 0.4553607106208801


Validating: 0it [00:00, ?it/s]

epoch = 26, custom_mae = 0.5010672211647034


Validating: 0it [00:00, ?it/s]

epoch = 27, custom_mae = 0.4638870358467102


Validating: 0it [00:00, ?it/s]

epoch = 28, custom_mae = 0.47851312160491943


Validating: 0it [00:00, ?it/s]

epoch = 29, custom_mae = 0.4751971662044525


Validating: 0it [00:00, ?it/s]

epoch = 30, custom_mae = 0.45067793130874634


Validating: 0it [00:00, ?it/s]

epoch = 31, custom_mae = 0.43086904287338257


Validating: 0it [00:00, ?it/s]

epoch = 32, custom_mae = 0.4574492871761322


Validating: 0it [00:00, ?it/s]

epoch = 33, custom_mae = 0.3863672614097595


Validating: 0it [00:00, ?it/s]

epoch = 34, custom_mae = 0.37219762802124023


Validating: 0it [00:00, ?it/s]

epoch = 35, custom_mae = 0.39196664094924927


Validating: 0it [00:00, ?it/s]

epoch = 36, custom_mae = 0.39592236280441284


Validating: 0it [00:00, ?it/s]

epoch = 37, custom_mae = 0.37039411067962646


Validating: 0it [00:00, ?it/s]

epoch = 38, custom_mae = 0.37233754992485046


Validating: 0it [00:00, ?it/s]

epoch = 39, custom_mae = 0.3890504837036133


Validating: 0it [00:00, ?it/s]

epoch = 40, custom_mae = 0.40487125515937805


Validating: 0it [00:00, ?it/s]

epoch = 41, custom_mae = 0.42767882347106934


Validating: 0it [00:00, ?it/s]

epoch = 42, custom_mae = 0.5584889054298401


Validating: 0it [00:00, ?it/s]

epoch = 43, custom_mae = 0.47452786564826965


Validating: 0it [00:00, ?it/s]

epoch = 44, custom_mae = 0.37464144825935364


Validating: 0it [00:00, ?it/s]

epoch = 45, custom_mae = 0.5262590646743774


Validating: 0it [00:00, ?it/s]

epoch = 46, custom_mae = 0.3912554085254669


Validating: 0it [00:00, ?it/s]

epoch = 47, custom_mae = 0.3538174033164978


Validating: 0it [00:00, ?it/s]

epoch = 48, custom_mae = 0.3414541482925415


Validating: 0it [00:00, ?it/s]

epoch = 49, custom_mae = 0.3346276879310608


Validating: 0it [00:00, ?it/s]

epoch = 50, custom_mae = 0.32609954476356506


Validating: 0it [00:00, ?it/s]

epoch = 51, custom_mae = 0.31818294525146484


Validating: 0it [00:00, ?it/s]

epoch = 52, custom_mae = 0.3109351396560669


Validating: 0it [00:00, ?it/s]

epoch = 53, custom_mae = 0.3075571656227112


Validating: 0it [00:00, ?it/s]

epoch = 54, custom_mae = 0.30641263723373413


Validating: 0it [00:00, ?it/s]

epoch = 55, custom_mae = 0.30292996764183044


Validating: 0it [00:00, ?it/s]

epoch = 56, custom_mae = 0.2973991930484772


Validating: 0it [00:00, ?it/s]

epoch = 57, custom_mae = 0.3019489049911499


Validating: 0it [00:00, ?it/s]

epoch = 58, custom_mae = 0.2919975817203522


Validating: 0it [00:00, ?it/s]

epoch = 59, custom_mae = 0.33063679933547974


Validating: 0it [00:00, ?it/s]

epoch = 60, custom_mae = 0.2891703248023987


Validating: 0it [00:00, ?it/s]

epoch = 61, custom_mae = 0.286363810300827


Validating: 0it [00:00, ?it/s]

epoch = 62, custom_mae = 0.2926090955734253


Validating: 0it [00:00, ?it/s]

epoch = 63, custom_mae = 0.2834514081478119


Validating: 0it [00:00, ?it/s]

epoch = 64, custom_mae = 0.2907533347606659


Validating: 0it [00:00, ?it/s]

epoch = 65, custom_mae = 0.28214097023010254


Validating: 0it [00:00, ?it/s]

epoch = 66, custom_mae = 0.2879953980445862


Validating: 0it [00:00, ?it/s]

epoch = 67, custom_mae = 0.2773836851119995


Validating: 0it [00:00, ?it/s]

epoch = 68, custom_mae = 0.2771270275115967


Validating: 0it [00:00, ?it/s]

epoch = 69, custom_mae = 0.2758003771305084


Validating: 0it [00:00, ?it/s]

epoch = 70, custom_mae = 0.2759953737258911


Validating: 0it [00:00, ?it/s]

epoch = 71, custom_mae = 0.2724487781524658


Validating: 0it [00:00, ?it/s]

epoch = 72, custom_mae = 0.2761112153530121


Validating: 0it [00:00, ?it/s]

epoch = 73, custom_mae = 0.2752234935760498


Validating: 0it [00:00, ?it/s]

epoch = 74, custom_mae = 0.27148380875587463


Validating: 0it [00:00, ?it/s]

epoch = 75, custom_mae = 0.2724889814853668


Validating: 0it [00:00, ?it/s]

epoch = 76, custom_mae = 0.280648410320282


Validating: 0it [00:00, ?it/s]

epoch = 77, custom_mae = 0.267954021692276


Validating: 0it [00:00, ?it/s]

epoch = 78, custom_mae = 0.27915918827056885


Validating: 0it [00:00, ?it/s]

epoch = 79, custom_mae = 0.27450987696647644


Validating: 0it [00:00, ?it/s]

epoch = 80, custom_mae = 0.27094876766204834


Validating: 0it [00:00, ?it/s]

epoch = 81, custom_mae = 0.271668404340744


Validating: 0it [00:00, ?it/s]

epoch = 82, custom_mae = 0.26690831780433655


Validating: 0it [00:00, ?it/s]

epoch = 83, custom_mae = 0.2686610519886017


Validating: 0it [00:00, ?it/s]

epoch = 84, custom_mae = 0.27480548620224


Validating: 0it [00:00, ?it/s]

epoch = 85, custom_mae = 0.2648048400878906


Validating: 0it [00:00, ?it/s]

epoch = 86, custom_mae = 0.26745396852493286


Validating: 0it [00:00, ?it/s]

epoch = 87, custom_mae = 0.26156049966812134


Validating: 0it [00:00, ?it/s]

epoch = 88, custom_mae = 0.2671048045158386


Validating: 0it [00:00, ?it/s]

epoch = 89, custom_mae = 0.2662467658519745


Validating: 0it [00:00, ?it/s]

epoch = 90, custom_mae = 0.2624320685863495


In [None]:
if len(CFG.folds) != CFG.n_folds:

    oof_score = get_score(oof_pred, oof_target, val_df['u_out'].values)
    print(f'MAE {oof_score}')

    oof_df = train.iloc[val_idxes[0], :1]
    oof_df['pressure'] = oof_pred
    oof_df.to_csv(OUTPUT_DIR / f'oof{CFG.exp_num}.csv',index = False)    
else:
    score = get_score(y, oof_total, train['u_out'].values)
    print(f'MAE {score}: folds: {scores}')

    oof_df = pd.DataFrame({'id': train['id'].values, 'pressure':oof_total.reshape(-1)})
    oof_df.to_csv(OUTPUT_DIR / f'oof{CFG.exp_num}.csv',index = False)
oof_df

In [None]:
sub = pd.read_csv(DATA_DIR / 'sample_submission.csv')
sub['pressure'] = np.mean(sub_preds, axis=1)
sub.to_csv(OUTPUT_DIR / f'sub{CFG.exp_num}.csv',index = False)
sub

In [None]:
wandb.init(project='Ventilator-Pressure-Prediction', entity='sqrt4kaido', group=RUN_NAME, job_type='summary')
wandb.run.name = 'summary'
wandb.log({'CV_score': oof_score})
# wandb.save(utils.get_notebook_path())
wandb.finish()