# Probabilistic ensemble

This notebook builds on top of the [VPP classification solution](https://www.kaggle.com/takamichitoda/ventilator-train-classification). Please, make yourself familiar with it first.

Here I want to present a simple way of ensembling the per-fold classification models that appears to perform slightly better than the median ensemble. 
Recall that classifiers produce not only the most likely class, but also probabilities for all other classes via the softmax operator. So, under the assumption that the probabilities generated by classifiers trained on different training folds are independent, we can predict the true pressure class as

$$pressure\_class=\arg\max\limits_{pressure\_class\_i} \prod_{k}{P_{fold_k}(pressure\_class\_i)}$$

Where $P_{fold_k}(pressure\_class\_i)$ is the softmax probability that the k-th fold's model assigns to the i-th pressure class. 

## A note on numerical stability

As you can see, the estimated class probability is proportional to the product of multiple probabilities. The value of this product can quickly become too small for reliable floating-point calculation. We use two tricks to fix it:

1. $\arg\max\limits_{i} {x_i}=\arg\max\limits_{i} {\log(x_i)}$ because the log function is monotonically increasing. So the above expression becomes:

$$\arg\max\limits_{pressure\_class\_i} \prod_{k}{P_{fold_k}(pressure\_class\_i)}=\arg\max\limits_{pressure\_class\_i} \sum_{k}{\log(P_{fold_k}(pressure\_class\_i))}$$

_This is the final formula we'll use to calculate our ensemble probabilities._

2. Use `float64` instead of `float32` to increase floating point resolution

## Results

Performance is only slightly better than the median (-0.0002 PL), but you might be able to increase it by manipulating hyperparameters/number of folds. 

# VPP classification 

The following is the inference part of the VPP classification solution, extracted from takamichitoda's code: https://www.kaggle.com/takamichitoda/ventilator-train-classification
Please upvote it if you find it eye-opening!


In [None]:
import gc
import os
import random
import wandb
import math

import numpy as np
import pandas as pd

from sklearn.model_selection import GroupKFold
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import AdamW
from transformers import get_cosine_schedule_with_warmup
from sklearn.preprocessing import RobustScaler

# Prefer the GPU, but fall back to the CPU so the notebook can still be
# executed and debugged on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
class config:
    """Static experiment settings, read as class attributes (never instantiated)."""

    # Experiment identity and I/O locations.
    EXP_NAME = "exp080_conti_rc"
    INPUT = "/kaggle/input/ventilator-pressure-prediction"
    OUTPUT = "/kaggle/working"

    # Cross-validation and seeding.
    N_FOLD = 5
    SEED = 0

    # Optimisation and model-size hyperparameters.
    LR = 5e-3
    N_EPOCHS = 50
    EMBED_SIZE = 64
    HIDDEN_SIZE = 256
    BS = 512
    WEIGHT_DECAY = 1e-3

    # Number of lag steps generated for each lagged signal.
    USE_LAG = 4

    # Per-time-step inputs. R_cate / C_cate are fed as plain numeric columns;
    # the separate categorical-feature list of the original experiment is
    # disabled.
    CONT_FEATURES = [
        'u_in', 'u_out', 'time_step',
        'u_in_cumsum', 'u_in_cummean', 'area', 'cross', 'cross2',
        'R_cate', 'C_cate',
    ]

    # Lag-derived inputs (the "_back" look-ahead variants are disabled).
    LAG_FEATURES = (
        ['breath_time']
        + [f'u_in_lag_{i}' for i in range(1, USE_LAG + 1)]
        + [f'u_in_time{i}' for i in range(1, USE_LAG + 1)]
        + [f'u_out_lag_{i}' for i in range(1, USE_LAG + 1)]
    )

    # Column order of the feature matrix handed to the model.
    ALL_FEATURES = CONT_FEATURES + LAG_FEATURES

    NOT_WATCH_PARAM = ['INPUT']

In [None]:
class VentilatorDataset(Dataset):
    """Dataset that yields one breath (all rows sharing a `breath_id`) per item.

    Args:
        df: frame containing `breath_id`, `pressure` and every column listed
            in `config.ALL_FEATURES`.
        label_dic: optional mapping from a pressure value to its class index.
            When omitted, items carry the placeholder label `[-1]`.
    """

    def __init__(self, df, label_dic=None):
        # Keep the per-breath sub-frames in the order `groupby` yields them.
        self.dfs = [group for _, group in df.groupby("breath_id")]
        self.label_dic = label_dic

    def __len__(self):
        """Number of breaths."""
        return len(self.dfs)

    def __getitem__(self, item):
        """Return `{"X": float feature tensor, "y": long label tensor}` for one breath."""
        breath = self.dfs[item]
        features = breath[config.ALL_FEATURES].values
        # `pressure` is read even when no label mapping is supplied, so the
        # column has to exist in the frame in both modes.
        pressures = breath['pressure'].values

        if self.label_dic is not None:
            label = [self.label_dic[p] for p in pressures]
        else:
            label = [-1]

        return {
            "X": torch.tensor(features).float(),
            "y": torch.tensor(label).long(),
        }

In [None]:
class VentilatorModel(nn.Module):
    """Bidirectional-LSTM classifier that scores pressure classes per time step.

    Pipeline: linear + LayerNorm embedding of the input features, a 4-layer
    bidirectional LSTM (`batch_first=True`), then an MLP head that emits one
    logit per pressure class for every time step.

    Args:
        n_classes: number of pressure classes produced by the head. Defaults
            to 950, the value the original implementation hard-coded, so
            existing checkpoints load unchanged.
    """

    def __init__(self, n_classes=950):
        super().__init__()
        self.n_classes = n_classes

        # The categorical embeddings of the original experiment are disabled;
        # every feature goes through the same linear embedding.
        n_inputs = len(config.CONT_FEATURES) + len(config.LAG_FEATURES)
        self.seq_emb = nn.Sequential(
            nn.Linear(n_inputs, config.EMBED_SIZE),
            nn.LayerNorm(config.EMBED_SIZE),
        )

        self.lstm = nn.LSTM(config.EMBED_SIZE, config.HIDDEN_SIZE, batch_first=True, bidirectional=True, dropout=0.0, num_layers=4)

        # The LSTM is bidirectional, hence the doubled hidden width.
        self.head = nn.Sequential(
            nn.Linear(config.HIDDEN_SIZE * 2, config.HIDDEN_SIZE * 2),
            nn.LayerNorm(config.HIDDEN_SIZE * 2),
            nn.ReLU(),
            nn.Linear(config.HIDDEN_SIZE * 2, n_classes),
        )

        # Custom LSTM init: orthogonal for weight matrices, standard normal
        # for the 1-D bias vectors.
        for n, m in self.named_modules():
            if isinstance(m, nn.LSTM):
                print(f'init {m}')
                for param in m.parameters():
                    if len(param.shape) >= 2:
                        nn.init.orthogonal_(param.data)
                    else:
                        nn.init.normal_(param.data)

    def forward(self, X, y=None):
        """Score every time step of every sequence in the batch.

        Args:
            X: float tensor whose last dimension holds the input features
               (batch-first layout, as required by the LSTM configuration).
            y: optional integer class targets; when given, the cross-entropy
               loss is computed as well.

        Returns:
            `(logits, loss)` where `loss` is None when `y` is None.
        """
        emb_x = self.seq_emb(X)
        out, _ = self.lstm(emb_x, None)
        logits = self.head(out)

        loss = None if y is None else self.loss_fn(logits, y)
        return logits, loss

    def loss_fn(self, y_pred, y_true):
        """Cross-entropy over all time steps, flattened across batch and sequence."""
        # Take the class count from the logits instead of hard-coding it so
        # the loss stays correct for any `n_classes`.
        flat_logits = y_pred.reshape(-1, y_pred.shape[-1])
        return nn.CrossEntropyLoss()(flat_logits, y_true.reshape(-1))
    
    
# Build one model instance right away (this also prints the "init ..." line
# emitted by the constructor). The name `model` is rebound for every
# checkpoint in the ensemble-loading cell further down.
model = VentilatorModel()

In [None]:
def test_loop(model, loader, target_dic_inv):
    """Predict a pressure value for every time step served by `loader`.

    Args:
        model: callable returning `(logits, loss)`; logits carry the class
            scores in dimension 2.
        loader: iterable of batches, each a dict with the features under "X".
        target_dic_inv: mapping from class index to pressure value.

    Returns:
        1-D numpy array with the predictions of all batches, flattened in
        batch order.
    """
    predicts = []
    model.eval()
    with torch.no_grad():
        for d in loader:
            out, _ = model(d['X'].to(device))
            # Bring the whole batch of class indices to the host in a single
            # transfer; calling `.item()` per element would force one
            # device-to-host synchronisation per time step.
            class_ids = out.argmax(2).cpu().tolist()
            predicts.append(torch.tensor([[target_dic_inv[j] for j in row] for row in class_ids]))

    return torch.vstack(predicts).numpy().reshape(-1)

In [None]:
def add_feature(df):
    """Add per-breath cumulative and interaction features.

    New columns, in this order: `time_delta`, `delta`, `area`, `cross`,
    `cross2`, `u_in_cumsum`, `u_in_cummean`.

    Args:
        df: frame with `breath_id`, `time_step`, `u_in` and `u_out` columns.
            It is modified in place.

    Returns:
        The same frame, with the feature columns added.
    """
    # Time elapsed since the previous row of the same breath (0 for the first).
    df['time_delta'] = df.groupby('breath_id')['time_step'].diff().fillna(0)
    df['delta'] = df['time_delta'] * df['u_in']
    # Running sum of u_in * dt within each breath.
    df['area'] = df.groupby('breath_id')['delta'].cumsum()

    df['cross'] = df['u_in'] * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']

    df['u_in_cumsum'] = df['u_in'].groupby(df['breath_id']).cumsum()
    # cumcount() is 0-based, so +1 gives the number of rows seen so far in the
    # breath. This replaces the former `one` / `count` scratch columns, which
    # were left behind in the caller's frame.
    rows_so_far = df.groupby('breath_id').cumcount() + 1
    df['u_in_cummean'] = df['u_in_cumsum'] / rows_so_far
    return df

def add_lag_feature(df):
    """Add lagged `u_in` / `u_out` features and `breath_time`.

    For every lag in 1..config.USE_LAG the columns `u_in_lag_{lag}`,
    `u_in_time{lag}` (current u_in minus the lagged one) and
    `u_out_lag_{lag}` are created. Shifts run over the whole frame rather
    than per breath; values pulled from a different breath are zeroed with
    the `breath_id_lag{lag}same` 0/1 mask.

    The input frame is modified in place (columns are added to it); the
    returned frame is a new object produced by `drop` / `fillna` without the
    helper columns.

    NOTE(review): `time_step_lag` uses a shift of 1 but is masked with
    `breath_id_lag{lag}same`, where `lag` is the value left over from the
    loop (config.USE_LAG), not 1. This looks unintentional, but the published
    checkpoints were presumably trained on features computed exactly this
    way - confirm before changing it. With config.USE_LAG == 0 the loop never
    binds `lag` and that line raises NameError.
    """
    # https://www.kaggle.com/kensit/improvement-base-on-tensor-bidirect-lstm-0-173
    for lag in range(1, config.USE_LAG+1):
        # breath_id of the row `lag` positions earlier (0 where none exists),
        # and a 0/1 flag telling whether that row belongs to the same breath.
        df[f'breath_id_lag{lag}']=df['breath_id'].shift(lag).fillna(0)
        df[f'breath_id_lag{lag}same']=np.select([df[f'breath_id_lag{lag}']==df['breath_id']], [1], 0)

        # u_in: lagged value (zeroed across breath boundaries) and its
        # difference to the current value
        df[f'u_in_lag_{lag}'] = df['u_in'].shift(lag).fillna(0) * df[f'breath_id_lag{lag}same']
        #df[f'u_in_lag_{lag}_back'] = df['u_in'].shift(-lag).fillna(0) * df[f'breath_id_lag{lag}same']
        df[f'u_in_time{lag}'] = df['u_in'] - df[f'u_in_lag_{lag}']
        #df[f'u_in_time{lag}_back'] = df['u_in'] - df[f'u_in_lag_{lag}_back']
        df[f'u_out_lag_{lag}'] = df['u_out'].shift(lag).fillna(0) * df[f'breath_id_lag{lag}same']
        #df[f'u_out_lag_{lag}_back'] = df['u_out'].shift(-lag).fillna(0) * df[f'breath_id_lag{lag}same']

    # breath_time: time_step minus the previous row's time_step. The mask is
    # the one of the LAST loop iteration (see NOTE in the docstring).
    df['time_step_lag'] = df['time_step'].shift(1).fillna(0) * df[f'breath_id_lag{lag}same']
    df['breath_time'] = df['time_step'] - df['time_step_lag']

    # Remove the helper columns; only the features themselves are returned.
    drop_columns = ['time_step_lag']
    drop_columns += [f'breath_id_lag{i}' for i in range(1, config.USE_LAG+1)]
    drop_columns += [f'breath_id_lag{i}same' for i in range(1, config.USE_LAG+1)]
    df = df.drop(drop_columns, axis=1)

    # fill na by zero
    df = df.fillna(0)
    return df

# Lookup tables turning raw R / C values (and their sum / product) into
# integer category ids. The id is the position of the value in each list;
# the lists are not required to be sorted (2500 precedes 1000 on purpose of
# keeping the existing ids).
c_dic = {10: 0, 20: 1, 50: 2}
r_dic = {5: 0, 20: 1, 50: 2}
rc_sum_dic = dict(zip([15, 25, 30, 40, 55, 60, 70, 100], range(8)))
rc_dot_dic = dict(zip([50, 100, 200, 250, 400, 500, 2500, 1000], range(8)))


def add_category_features(df):
    """Append `C_cate`, `R_cate`, `RC_sum` and `RC_dot` id columns to `df`.

    The frame is modified in place and returned. Values missing from the
    lookup tables map to NaN.
    """
    r_plus_c = df['R'] + df['C']
    r_times_c = df['R'] * df['C']

    df['C_cate'] = df['C'].map(c_dic)
    df['R_cate'] = df['R'].map(r_dic)
    df['RC_sum'] = r_plus_c.map(rc_sum_dic)
    df['RC_dot'] = r_times_c.map(rc_dot_dic)
    return df

# Columns that get rescaled: every model input.
norm_features = config.CONT_FEATURES + config.LAG_FEATURES


def norm_scale(train_df, test_df):
    """Rescale `norm_features` in both frames with one shared RobustScaler.

    The scaler is fitted on the train and test rows stacked together and then
    applied to each frame in place.

    Returns:
        `(train_df, test_df)` - the same objects that were passed in.
    """
    stacked = np.vstack([train_df[norm_features].values, test_df[norm_features].values])
    scaler = RobustScaler().fit(stacked)
    for frame in (train_df, test_df):
        frame[norm_features] = scaler.transform(frame[norm_features].values)
    return train_df, test_df

In [None]:
# Load the competition data.
train_df = pd.read_csv(f"{config.INPUT}/train.csv")
test_df = pd.read_csv(f"{config.INPUT}/test.csv")
sub_df = pd.read_csv(f"{config.INPUT}/sample_submission.csv")
# NOTE(review): `oof`, `test_preds_lst` and `sub_df` are not referenced again
# in the visible part of this notebook - presumably leftovers from the
# training notebook.
oof = np.zeros(len(train_df))
test_preds_lst = []

# Class index <-> pressure value. Classes are the distinct training pressures
# in ascending order; this must be built before any feature engineering so it
# reflects the raw `pressure` column.
target_dic = {v:i for i, v in enumerate(sorted(train_df['pressure'].unique().tolist()))}
target_dic_inv = {v: k for k, v in target_dic.items()}

# Tag every training row with its validation fold; grouping by breath_id
# keeps all rows of a breath in the same fold.
gkf = GroupKFold(n_splits=config.N_FOLD).split(train_df, train_df.pressure, groups=train_df.breath_id)
for fold, (_, valid_idx) in enumerate(gkf):
    train_df.loc[valid_idx, 'fold'] = fold

# Feature engineering, applied identically to train and test, followed by a
# scaler fitted on both frames together.
train_df = add_feature(train_df)
test_df = add_feature(test_df)
train_df = add_lag_feature(train_df)
test_df = add_lag_feature(test_df)
train_df = add_category_features(train_df)
test_df = add_category_features(test_df)
train_df, test_df = norm_scale(train_df, test_df)

# VentilatorDataset.__getitem__ reads df['pressure'] unconditionally, so the
# test frame gets a placeholder column.
test_df['pressure'] = -1
test_dset = VentilatorDataset(test_df)
# shuffle=False / drop_last=False: predictions must cover every breath, in
# dataset order, so they can be written straight into the submission.
test_loader = DataLoader(test_dset, batch_size=config.BS,
                         pin_memory=True, shuffle=False, drop_last=False, num_workers=os.cpu_count())



In [None]:
# Ascending grid of every distinct pressure seen in training; predictions are
# snapped onto this grid during post-processing.
unique_pressures = train_df.pressure.unique()
sorted_pressures = np.sort(unique_pressures)
total_pressures_len = sorted_pressures.size

In [None]:

def find_nearest(prediction):
    """Snap `prediction` to the closest value in `sorted_pressures`.

    Values beyond either end of the grid are clamped to the nearest end. When
    `prediction` is exactly halfway between two grid values, the upper one is
    returned.
    """
    pos = np.searchsorted(sorted_pressures, prediction)

    # Above the largest known pressure: clamp to the maximum.
    if pos == total_pressures_len:
        return sorted_pressures[-1]
    # At or below the smallest known pressure: clamp to the minimum.
    if pos == 0:
        return sorted_pressures[0]

    below, above = sorted_pressures[pos - 1], sorted_pressures[pos]
    if abs(below - prediction) < abs(above - prediction):
        return below
    return above


# Implementation of probabilistic ensemble 

### 1. Load models of all 5 folds

In [None]:
from glob import glob 

# glob() yields paths in arbitrary order; sorting fixes the order in which the
# per-fold log-probabilities are summed, which keeps runs reproducible.
model_paths = sorted(glob('/kaggle/input/ventilator-train-classification/exp080_conti_rc/ventilator_f*_best_model.bin'))
if not model_paths:
    # Fail here with a clear message instead of later, inside the ensemble
    # loop, with an obscure tensor-stacking error.
    raise FileNotFoundError("no 'ventilator_f*_best_model.bin' checkpoints found")

models = []
for model_path in model_paths:
    model = VentilatorModel()
    # map_location loads the weights straight onto the target device, so
    # checkpoints saved on a GPU can also be opened where `device` is the CPU.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    models.append(model)

In [None]:
# Class indices present in the training targets and how often each occurs.
classes, class_freq = np.unique(train_df['pressure'].map(target_dic).to_numpy(), return_counts=True)

In [None]:
# Empirical class distribution of the training targets (sums to 1).
class_proba = class_freq / class_freq.sum()

### 2. Calculate probabilistic pressure class & run postprocessing

In [None]:
def test_loop_pred(models, loader, target_dic_inv, class_proba=None):
    """Predict pressures with the probabilistic (log-probability-sum) ensemble.

    For every time step the per-class log-probabilities of all models are
    summed and the class with the largest total is selected.

    Args:
        models: non-empty sequence of models, each returning `(logits, loss)`
            with the class scores in dimension 2.
        loader: iterable of batches, each a dict with the features under "X".
        target_dic_inv: mapping from class index to pressure value.
        class_proba: optional per-class probabilities (tensor, numpy array or
            sequence). When given, their logarithm is added to the summed
            scores as one extra term.

    Returns:
        1-D numpy array with the predictions of all batches, flattened in
        batch order.

    Raises:
        ValueError: if `models` is empty.
    """
    if not models:
        raise ValueError("test_loop_pred needs at least one model")
    for model in models:
        model.eval()

    log_prior = None
    if class_proba is not None:
        # Accept numpy input and put the prior on the same device and dtype
        # as the scores it is added to.
        log_prior = torch.log(torch.as_tensor(class_proba, dtype=torch.float64, device=device))

    predicts = []
    with torch.no_grad():
        for d in tqdm(loader):
            X = d['X'].to(device)
            log_score = None
            for model in models:
                logits, _ = model(X)
                # log_softmax in float64 is the numerically stable form of
                # log(softmax(x)); the naive float32 version can underflow to
                # log(0) = -inf for unlikely classes.
                log_p = torch.log_softmax(logits.double(), dim=2)
                log_score = log_p if log_score is None else log_score + log_p
            if log_prior is not None:
                # Broadcasts over the leading (batch, step) dimensions.
                log_score = log_score + log_prior
            # One device-to-host transfer per batch instead of one `.item()`
            # synchronisation per time step.
            class_ids = log_score.argmax(2).cpu().tolist()
            predicts.append(torch.tensor([[target_dic_inv[j] for j in row] for row in class_ids]))

    return torch.vstack(predicts).numpy().reshape(-1)



In [None]:
# Start from the sample submission so the id column and row order are right.
df_submission = pd.read_csv('/kaggle/input/ventilator-pressure-prediction/sample_submission.csv')
# Ensemble prediction for every test row (no class prior is passed).
df_submission['pressure'] = test_loop_pred(models, test_loader, target_dic_inv)
# Post-processing: snap each prediction onto the grid of pressures seen in
# training.
df_submission['pressure'] = df_submission['pressure'].apply(find_nearest)
df_submission.to_csv('submission.csv', index=False)