In [None]:
import os
# Swap the preinstalled transformers package for the build found in the local wheel directory
# (--no-index / --find-links make pip resolve from ../input/nbme-pip-wheels instead of the network).
# NOTE(review): os.system return codes are ignored here, so a failed install only shows up
# later as an import error.
os.system('pip uninstall -y transformers')
os.system('python -m pip install --no-index --find-links=../input/nbme-pip-wheels transformers')

In [None]:
## The following is necessary if you want to use the fast tokenizer for deberta v2 or v3
import shutil
from pathlib import Path
# Overlay patched tokenizer sources onto the installed transformers package.
# NOTE(review): the site-packages location is hard-coded for a Python 3.7 conda layout.
transformers_path = Path("/opt/conda/lib/python3.7/site-packages/transformers")
input_dir = Path("../input/deberta-v2-3-fast-tokenizer")
deberta_v2_path = transformers_path / "models" / "deberta_v2"

# Step 1: convert_slow_tokenizer.py goes into the package root, replacing any existing copy.
convert_file = input_dir / "convert_slow_tokenizer.py"
conversion_path = transformers_path / convert_file.name
if conversion_path.exists():
    conversion_path.unlink()
shutil.copy(convert_file, transformers_path)

# Step 2: the DeBERTa v2 tokenizer modules go into models/deberta_v2.
# A source name that starts with "deberta" has that word stripped at the destination,
# so "deberta__init__.py" is installed as "__init__.py"; the other files keep their names.
for filename in ('tokenization_deberta_v2.py',
                 'tokenization_deberta_v2_fast.py',
                 "deberta__init__.py"):
    source_name = str(filename)
    target_name = source_name.replace("deberta", "") if source_name.startswith("deberta") else filename
    filepath = deberta_v2_path / target_name
    if filepath.exists():
        filepath.unlink()

    shutil.copy(input_dir / filename, filepath)

In [None]:
# =================================
# Library
# =================================
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import DataCollatorWithPadding
%env TOKENIZERS_PARALLELISM=true
from transformers.models.deberta_v2 import DebertaV2TokenizerFast
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
import pandas as pd
import numpy as np
from tqdm import tqdm
from torch.cuda.amp import autocast, GradScaler
import gc
import ast
import itertools
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# =================================
# Constant
# =================================
# Input CSV locations. All four files live in the same competition dataset directory.
# Fixed: SUB_PATH used to point at a different dataset's sample submission, and
# FEATURE_PATH used "./input" (relative to the working directory) instead of "../input".
SUB_PATH = "../input/nbme-score-clinical-patient-notes/sample_submission.csv"
TEST_PATH = "../input/nbme-score-clinical-patient-notes/test.csv"
FEATURE_PATH = "../input/nbme-score-clinical-patient-notes/features.csv"
PATIENT_NOTES_PATH = "../input/nbme-score-clinical-patient-notes/patient_notes.csv"

In [None]:
# =================================
# Settings
# =================================
# Blend weights for the four character-probability sources combined at the end of the
# notebook: w1 -> ex038, w2 -> ex041, w3 -> ex051, w4 -> ex141.
w1, w2, w3, w4 = 0.25, 0.1, 0.35, 0.3

# Per-case settings keyed by case_num. Element [0] is the probability threshold read by
# get_results_raw_pp; element [1] is not read anywhere in this notebook
# (presumably the validation score obtained at that threshold — TODO confirm).
th_dict = {
    0: [0.49, 0.8976772190005388],
    1: [0.525, 0.9064990886585599],
    2: [0.475, 0.8517623923219974],
    3: [0.485, 0.9260359498514676],
    4: [0.53, 0.9248769561757685],
    5: [0.47, 0.8364175195561528],
    6: [0.47, 0.906836587356394],
    7: [0.53, 0.8801618303571429],
    8: [0.485, 0.9256547241005716],
    9: [0.53, 0.9290001463914508],
}

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG_ex038:
    """Inference settings for experiment ex038 (deberta-v3-large)."""

    # Width that per-token prediction arrays are zero-padded to.
    max_len = 512
    batch_size = 24
    # Directory used for both the tokenizer and the backbone/config loading.
    tokenizer_path = "../input/deberta-v3-large-model/deberta-v3-large"
    # Checkpoint prefix; "_{fold}.pth" is appended per fold.
    model_path = "../input/ex038-save/ex038"
    n_fold = 5
    
class CFG_ex041:
    """Inference settings for experiment ex041 (deberta-v2-xlarge)."""

    # Width that per-token prediction arrays are zero-padded to.
    max_len = 512
    batch_size = 16
    # Directory used for both the tokenizer and the backbone/config loading.
    tokenizer_path = "../input/deberta/v2-xlarge"
    # Checkpoint prefix; "_{fold}.pth" is appended per fold.
    model_path = "../input/ex041-save/ex041"
    n_fold = 5
    
class CFG_ex051:
    """Inference settings for experiment ex051 (deberta-v2-xxlarge)."""

    # Width that per-token prediction arrays are zero-padded to.
    max_len = 512
    batch_size = 8
    # Directory used for both the tokenizer and the backbone loading.
    tokenizer_path = "../input/deberta/v2-xxlarge"
    # NOTE(review): the ex051 inference cell lists its checkpoint files explicitly and
    # does not read model_path.
    model_path = "../input/nbme-ex051/ex051"
    # Number of folds that are loaded and averaged.
    n_fold = 4

In [None]:
# ====================================================
# Function
# ====================================================
def get_char_probs(texts, predictions, tokenizer):
    """Spread per-token probabilities over the characters each token covers.

    Args:
        texts: sequence of strings.
        predictions: per-text sequences of token probabilities, paired positionally
            with the tokens the tokenizer produces for that text.
        tokenizer: callable returning a mapping with an 'offset_mapping' list of
            (start, end) character spans when called with return_offsets_mapping=True.

    Returns:
        A list with one float array per text (length == len(text)). Characters not
        covered by any token span stay 0; texts without a matching prediction row
        stay all-zero.
    """
    char_probs = [np.zeros(len(text)) for text in texts]
    for char_prob, text, token_probs in zip(char_probs, texts, predictions):
        encoding = tokenizer(text,
                             add_special_tokens=True,
                             return_offsets_mapping=True)
        # zip stops at the shorter sequence, so surplus tokens or surplus predictions are ignored.
        for span, prob in zip(encoding['offset_mapping'], token_probs):
            char_prob[span[0]:span[1]] = prob
    return char_probs

def get_results_raw_pp(char_probs, case_nums, th_dict):
    """Threshold character probabilities per case and encode the hits as span strings.

    For each (char_prob, case_num) pair the threshold is th_dict[int(case_num)][0].
    Indices at or above it are shifted by +1, grouped into runs of consecutive
    values, and written as "first last" joined with ';'. No hit gives "".
    """
    results = []
    for char_prob, case_num in zip(char_probs, case_nums):
        threshold = th_dict[int(case_num)][0]
        positions = np.where(char_prob >= threshold)[0] + 1
        # positions is strictly increasing, so a run continues exactly when the next
        # value is one more than the current run's last value.
        runs = []
        for pos in positions:
            if runs and pos - runs[-1][1] == 1:
                runs[-1][1] = pos
            else:
                runs.append([pos, pos])
        results.append(";".join(f"{first} {last}" for first, last in runs))
    return results

def get_results(char_probs, th=0.5):
    """Threshold character probabilities with one global cut-off and encode span strings.

    Indices with probability >= th are shifted by +1. When the first shifted index
    is 1, a 0 is prepended so the span starts at 0. Consecutive indices are merged
    into "first last" pairs joined with ';'. No hit gives "".
    """
    def _encode(positions):
        # Consecutive values share the same (value - rank) difference.
        pieces = []
        for _, members in itertools.groupby(enumerate(positions), key=lambda pair: pair[1] - pair[0]):
            values = [value for _, value in members]
            pieces.append(f"{values[0]} {values[-1]}")
        return ";".join(pieces)

    results = []
    for char_prob in char_probs:
        positions = np.where(char_prob >= th)[0] + 1
        if len(positions) > 0 and positions[0] == 1:
            positions = np.concatenate([np.array([0]), positions], axis=0)
        results.append(_encode(positions))
    return results


def get_predictions(results):
    """Parse span strings ("s e;s e;...") into lists of [start, end] integer pairs.

    An empty string yields an empty list for that entry.
    """
    predictions = []
    for result in results:
        if result == "":
            predictions.append([])
            continue
        spans = []
        for chunk in result.split(';'):
            fields = chunk.split()
            spans.append([int(fields[0]), int(fields[1])])
        predictions.append(spans)
    return predictions

def get_results_from_preds_list(preds):
    """Serialize lists of spans back to strings: values joined by ' ', spans joined by ';'."""
    return [
        ';'.join(' '.join(str(value) for value in span) for span in pred)
        for pred in preds
    ]

In [None]:
def postprocess(texts, preds):
    """Apply rule-based boundary corrections (pp1-pp8) to predicted [start, end] spans.

    Args:
        texts: sequence of note strings, indexable by row.
        preds: per-row lists of [start, end] integer pairs.

    Returns:
        preds_pp, a shallow copy of ``preds``.

    Notes:
        * The outer list is copied but the inner [start, end] lists are shared when
          ``preds`` is a list, so corrections are written in place and are visible
          through ``preds`` too. The control flow depends on this: after pp1 rewrites
          the first start from 1 to 0, the span loop reads start == 0 and stops for
          that row. This is kept unchanged.
        * The "## +0.00xxx" remarks are the original author's annotations
          (presumably the score gain of each rule — TODO confirm).
        * Changes from the previous version: the function no longer reads the
          notebook-level ``char_probs`` variable (it was stored in an unused local),
          the pp7 handler catches ``Exception`` instead of using a bare ``except``,
          and the repeated pp4/pp6 checks are expressed as loops over the same
          characters in the same order.
        * NOTE(review): the text[start], text[start+1] and text[start-2] lookups are
          not bounds-checked.
    """
    from nltk.tokenize import word_tokenize
    preds_pp = preds.copy()
    tk0 = tqdm(range(len(preds_pp)), total=len(preds_pp))
    for raw_idx in tk0:
        pred = preds[raw_idx]
        text = texts[raw_idx]
        if len(pred) != 0:
            # pp1: a prediction whose first index is 1 is corrected to start at 0 ## +0.00123
            if pred[0][0] == 1:
                preds_pp[raw_idx][0][0] = 0
            for p_index in range(len(pred)):
                start, end = pred[p_index]
                # A span starting at 0 ends processing for this row.
                if start == 0:
                    break
                # pp2: when start == end, move start back by one (and stop for this row) ## +0.00012
                if start == end:
                    preds_pp[raw_idx][p_index][0] = start - 1
                    break
                # pp3: when the start character is a newline, move start forward by one ## +0.00032
                if text[start] == '\n':
                    preds_pp[raw_idx][p_index][0] = start + 1
                    start = start + 1
                # pp4: e.g. "1-2" is sometimes predicted as "-2"; pull the leading digit in ## +0.00001
                for separator in ('-', '/'):
                    if text[start-1].isdigit() and text[start] == separator and text[start+1].isdigit():
                        preds_pp[raw_idx][p_index][0] = start - 1
                        start = start - 1
                # pp5: e.g. "67" is sometimes predicted as "7"; pull the preceding digit in ## +0.00001
                if text[start-1].isdigit() and text[start].isdigit():
                    preds_pp[raw_idx][p_index][0] = start - 1
                    start = start - 1
                # pp6: when the character two before start is one of these symbols and the
                # character right before start is not a space, move start back by one ## +0.00001
                # Checks run in this order and each one sees the start left by the previous one.
                for symbol in (',', '-', '\"', ':', '(', ')'):
                    if text[start-2] == symbol and text[start-1] != ' ':
                        preds_pp[raw_idx][p_index][0] = start - 1
                        start = start - 1
                # pp7: generic fix for a word split such as "heart" -> "h" + "eart" ## +0.00050
                try:
                    text_token = word_tokenize(text[start-1:end])
                    first = text[start:end].split()[0]
                    if first not in text_token:
                        for t in text_token:
                            # Extend start to the beginning of the token that ends with `first`.
                            if (first == t[-len(first):]) and (t[0].isalpha()):
                                sub = len(t) - len(first)
                                preds_pp[raw_idx][p_index][0] = start - sub
                                start = start - sub
                                break
                except Exception:
                    # Best effort: any tokenizer or indexing failure skips pp7 for this span.
                    pass
                # pp8: end corrections ## +0.00001
                if text[end-1:end] == '.':
                    preds_pp[raw_idx][p_index][1] = end - 1
                    end = end - 1
                if text[end-1:end] == '-' and text[end:end+1].isnumeric():
                    preds_pp[raw_idx][p_index][1] = end + 1
                    end = end + 1
    return preds_pp

In [None]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(CFG, text, feature_text,tokenizer):
    """Encode a (text, feature_text) pair and convert every field to a long tensor.

    No max_length or padding is applied here; sequences keep their natural length.
    ``CFG`` is accepted but not read. The mapping returned by the tokenizer is
    modified in place and returned.
    """
    inputs = tokenizer.encode_plus(
        text,
        feature_text,
        add_special_tokens=True,
        return_offsets_mapping=False,
    )
    for key in list(inputs.keys()):
        inputs[key] = torch.tensor(inputs[key], dtype=torch.long)
    return inputs

class TestDataset(Dataset):
    """Dataset yielding tokenized (pn_history, feature_text) pairs for inference.

    Constructor order is (df, tokenizer, CFG). The frame must provide
    'feature_text' and 'pn_history' columns.
    """

    def __init__(self,df,tokenizer,CFG):
        texts_feature = df['feature_text'].values
        texts_note = df['pn_history'].values
        self.feature_texts = texts_feature
        self.pn_historys = texts_note
        self.tokenizer = tokenizer
        self.CFG = CFG

    def __len__(self):
        return len(self.feature_texts)

    def __getitem__(self, item):
        # The note text is the first segment, the feature text the second.
        note = self.pn_historys[item]
        feature = self.feature_texts[item]
        return prepare_input(self.CFG, note, feature, self.tokenizer)


    
class custom_model_ex038(nn.Module):
    """Backbone loaded from CFG_ex038.tokenizer_path plus a per-token head.

    Head: LayerNorm(1024) -> Dropout(0.2) -> Linear(1024, 512) -> LayerNorm(512)
    -> ReLU -> Dropout(0.2) -> Linear(512, 1), applied to the backbone's
    "last_hidden_state". The output is a raw score; no sigmoid is applied here.
    """

    def __init__(self):
        super().__init__()
        backbone_path = CFG_ex038.tokenizer_path
        self.model = AutoModel.from_pretrained(backbone_path)
        self.config = AutoConfig.from_pretrained(backbone_path)
        self.dropout1 = nn.Dropout(p=0.2)
        self.ln1 = nn.LayerNorm(1024)
        self.linear1 = nn.Linear(1024, 512)
        self.ln2 = nn.LayerNorm(512)
        self.relu = nn.ReLU()
        self.dropout2 = nn.Dropout(p=0.2)
        self.linear2 = nn.Linear(512, 1)
        # Only the two linear layers of the head are re-initialised.
        for head_layer in (self.linear1, self.linear2):
            self._init_weights(head_layer)

    def _init_weights(self, module):
        """Initialise one module: normal(0, initializer_range) weights, zero bias / padding row."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, ids, mask,token_type):
        """Run the backbone on (ids, mask, token_type) and apply the head to every position."""
        hidden = self.model(ids, mask, token_type)["last_hidden_state"]
        head = (self.ln1, self.dropout1, self.linear1,
                self.ln2, self.relu, self.dropout2, self.linear2)
        for layer in head:
            hidden = layer(hidden)
        return hidden

    
class custom_model_ex041(nn.Module):
    """Backbone loaded from CFG_ex041.tokenizer_path plus a per-token head.

    Head: LayerNorm(1536) -> Dropout(0.2) -> Linear(1536, 768) -> LayerNorm(768)
    -> ReLU -> Dropout(0.2) -> Linear(768, 1), applied to the backbone's
    "last_hidden_state". The output is a raw score; no sigmoid is applied here.
    """

    def __init__(self):
        super().__init__()
        backbone_path = CFG_ex041.tokenizer_path
        self.model = AutoModel.from_pretrained(backbone_path)
        self.config = AutoConfig.from_pretrained(backbone_path)
        self.dropout1 = nn.Dropout(p=0.2)
        self.ln1 = nn.LayerNorm(1536)
        self.linear1 = nn.Linear(1536, 768)
        self.ln2 = nn.LayerNorm(768)
        self.relu = nn.ReLU()
        self.dropout2 = nn.Dropout(p=0.2)
        self.linear2 = nn.Linear(768, 1)
        # Only the two linear layers of the head are re-initialised.
        for head_layer in (self.linear1, self.linear2):
            self._init_weights(head_layer)

    def _init_weights(self, module):
        """Initialise one module: normal(0, initializer_range) weights, zero bias / padding row."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, ids, mask,token_type):
        """Run the backbone on (ids, mask, token_type) and apply the head to every position."""
        hidden = self.model(ids, mask, token_type)["last_hidden_state"]
        head = (self.ln1, self.dropout1, self.linear1,
                self.ln2, self.relu, self.dropout2, self.linear2)
        for layer in head:
            hidden = layer(hidden)
        return hidden
    
    
class custom_model_ex051(nn.Module):
    """Backbone loaded from CFG_ex051.tokenizer_path plus a per-token head.

    Head: LayerNorm(1536) -> Dropout(0.2) -> Linear(1536, 768) -> LayerNorm(768)
    -> ReLU -> Dropout(0.2) -> Linear(768, 1), applied to the backbone's
    "last_hidden_state". Unlike the ex038/ex041 models, the head layers keep
    PyTorch's default initialisation. The output is a raw score; no sigmoid is
    applied here.
    """

    def __init__(self):
        super().__init__()
        self.model = AutoModel.from_pretrained(CFG_ex051.tokenizer_path)
        self.dropout1 = nn.Dropout(p=0.2)
        self.ln1 = nn.LayerNorm(1536)
        self.linear1 = nn.Linear(1536, 768)
        self.ln2 = nn.LayerNorm(768)
        self.relu = nn.ReLU()
        self.dropout2 = nn.Dropout(p=0.2)
        self.linear2 = nn.Linear(768, 1)

    def forward(self, ids, mask,token_type):
        """Run the backbone on (ids, mask, token_type) and apply the head to every position."""
        hidden = self.model(ids, mask, token_type)["last_hidden_state"]
        head = (self.ln1, self.dropout1, self.linear1,
                self.ln2, self.relu, self.dropout2, self.linear2)
        for layer in head:
            hidden = layer(hidden)
        return hidden
    

In [None]:
# Input CSV locations used by the loading cell below. These assignments replace any
# earlier values bound to the same four names.
SUB_PATH = "../input/nbme-score-clinical-patient-notes/sample_submission.csv"
TEST_PATH = "../input/nbme-score-clinical-patient-notes/test.csv"
FEATURE_PATH = "../input/nbme-score-clinical-patient-notes/features.csv"
PATIENT_NOTES_PATH = "../input/nbme-score-clinical-patient-notes/patient_notes.csv"

In [None]:
# ===========================================
# main
# ===========================================
# Read the test, sample-submission and features CSVs from the configured paths.
# NOTE(review): `submission` is not read again in the cells shown; the final cell
# writes its output from `test`.
test = pd.read_csv(TEST_PATH)
submission = pd.read_csv(SUB_PATH)
features = pd.read_csv(FEATURE_PATH)
def preprocess_features(features):
    """Overwrite 'feature_text' at row label 27 and return the same (mutated) frame.

    The frame is modified in place; the return value is the object passed in.
    """
    row_label, column = 27, 'feature_text'
    features.loc[row_label, column] = "Last-Pap-smear-1-year-ago"
    return features
# Patch the feature text (preprocess_features mutates and returns the frame it is given),
# then read the patient-notes table.
features = preprocess_features(features)
patient_notes = pd.read_csv(PATIENT_NOTES_PATH)

In [None]:
# Left-join the features table on (feature_num, case_num) and the patient notes on
# (pn_num, case_num); how='left' keeps every row of `test`. Later cells read the
# resulting 'feature_text' and 'pn_history' columns.
# NOTE(review): `test` is rebound in place, so re-running this cell merges again.
test = test.merge(features, on=['feature_num', 'case_num'], how='left')
test = test.merge(patient_notes, on=['pn_num', 'case_num'], how='left')

In [None]:
# Sort the test rows by token length.
# Lengths are measured with the ex038 tokenizer on the (pn_history, feature_text) pair;
# the same row order is then used for every model
# (presumably so that batches padded to 'longest' contain similar lengths — TODO confirm).
token_len = []
tokenizer = DebertaV2TokenizerFast.from_pretrained(CFG_ex038.tokenizer_path)
for f,p in zip(test['feature_text'].values,test['pn_history'].values):
    inputs =  tokenizer.encode_plus(p, f, 
                           add_special_tokens=True,
                           return_offsets_mapping=False)
    token_len.append(len(inputs["input_ids"]))
test["token_len"] = token_len
# reset_index(drop=True) renumbers the rows 0..n-1 in the sorted order.
test = test.sort_values(by="token_len").reset_index(drop=True)
display(test)

In [None]:
# ================================================
# exp051 deberta-v2-xxlarge
# ================================================
# Fold-averaged per-token probabilities: one row per test example, zero-padded to max_len.
sub_preds051 = np.zeros((len(test),CFG_ex051.max_len,1))
tokenizer051 = DebertaV2TokenizerFast.from_pretrained(CFG_ex051.tokenizer_path)
test_dataset = TestDataset(test, tokenizer051,CFG_ex051)
# shuffle=False keeps predictions aligned with the rows of `test`;
# every batch is padded to its own longest sequence.
test_loader = DataLoader(test_dataset, 
                         batch_size=CFG_ex051.batch_size,
                         shuffle=False, 
                         collate_fn=DataCollatorWithPadding(tokenizer=tokenizer051, padding='longest'),
                         pin_memory=True, drop_last=False)
# NOTE(review): five checkpoints are listed, but the loop below only loads the first
# CFG_ex051.n_fold (4) of them, so ex051_4.pth is never used.
model_list = ["../input/nbme-ex051/ex051_0.pth",
              "../input/nbme-ex051/ex051_1.pth",
              "../input/nbme-ex051-2/ex051_2.pth",
              "../input/nbme-ex051/ex051_3.pth",
              "../input/nbme-ex051-2/ex051_4.pth",]
for fold in tqdm(range(CFG_ex051.n_fold)):
    model = custom_model_ex051()
    # Deserialize on the CPU first; model.to(device) below moves the weights.
    model.load_state_dict(torch.load(model_list[fold], map_location="cpu"))
    model.to(device)
    model.eval()
    
    # Collect the padded batches and concatenate once, instead of re-copying a
    # growing array after every batch.
    batch_preds = []
    with torch.no_grad():  
        for d in test_loader:
            ids = d['input_ids'].to(device)
            mask = d['attention_mask'].to(device)
            token_type = d["token_type_ids"].to(device)
            with autocast():
                outputs = model(ids,mask,token_type)
            # Clip to max_len so an over-long batch cannot produce a negative pad width
            # (positions past max_len are dropped), then zero-pad on the right.
            outputs = outputs.sigmoid().detach().cpu().numpy()[:, :CFG_ex051.max_len]
            padding = np.zeros([len(outputs), CFG_ex051.max_len - outputs.shape[1], 1])
            batch_preds.append(np.concatenate([outputs, padding], axis=1))
    if batch_preds:
        test_preds_ = np.concatenate(batch_preds, axis=0)
    else:
        # Empty loader: keep the shape compatible with sub_preds051.
        test_preds_ = np.zeros((0, CFG_ex051.max_len, 1))
    
    torch.cuda.empty_cache()
    sub_preds051 += test_preds_ / CFG_ex051.n_fold
    del model,test_preds_,batch_preds; gc.collect()
print(sub_preds051)
del test_dataset,test_loader
gc.collect()

In [None]:
# ================================================
# exp038 deberta-v3-large
# ================================================
# Fold-averaged per-token probabilities: one row per test example, zero-padded to max_len.
sub_preds038 = np.zeros((len(test),CFG_ex038.max_len,1))
tokenizer038 = DebertaV2TokenizerFast.from_pretrained(CFG_ex038.tokenizer_path)
test_dataset = TestDataset(test, tokenizer038,CFG_ex038)
# shuffle=False keeps predictions aligned with the rows of `test`;
# every batch is padded to its own longest sequence.
test_loader = DataLoader(test_dataset, 
                         batch_size=CFG_ex038.batch_size,
                         shuffle=False, 
                         collate_fn=DataCollatorWithPadding(tokenizer=tokenizer038, padding='longest'),
                         pin_memory=True, drop_last=False)
for fold in tqdm(range(CFG_ex038.n_fold)):
    model = custom_model_ex038()
    # Deserialize on the CPU first; model.to(device) below moves the weights.
    model.load_state_dict(torch.load(CFG_ex038.model_path + f"_{fold}.pth", map_location="cpu"))
    model.to(device)
    model.eval()
    
    # Collect the padded batches and concatenate once, instead of re-copying a
    # growing array after every batch.
    batch_preds = []
    with torch.no_grad():  
        for d in test_loader:
            ids = d['input_ids'].to(device)
            mask = d['attention_mask'].to(device)
            token_type = d["token_type_ids"].to(device)
            with autocast():
                outputs = model(ids,mask,token_type)
            # Clip to max_len so an over-long batch cannot produce a negative pad width
            # (positions past max_len are dropped), then zero-pad on the right.
            outputs = outputs.sigmoid().detach().cpu().numpy()[:, :CFG_ex038.max_len]
            padding = np.zeros([len(outputs), CFG_ex038.max_len - outputs.shape[1], 1])
            batch_preds.append(np.concatenate([outputs, padding], axis=1))
    if batch_preds:
        test_preds_ = np.concatenate(batch_preds, axis=0)
    else:
        # Empty loader: keep the shape compatible with sub_preds038.
        test_preds_ = np.zeros((0, CFG_ex038.max_len, 1))
    
    torch.cuda.empty_cache()
    sub_preds038 += test_preds_ / CFG_ex038.n_fold
    del model,test_preds_,batch_preds; gc.collect()
print(sub_preds038)
del test_dataset,test_loader
gc.collect()

In [None]:
# ================================================
# exp041 deberta-v2-xlarge
# ================================================
# Fold-averaged per-token probabilities: one row per test example, zero-padded to max_len.
sub_preds041 = np.zeros((len(test),CFG_ex041.max_len,1))
tokenizer041 = DebertaV2TokenizerFast.from_pretrained(CFG_ex041.tokenizer_path)
test_dataset = TestDataset(test, tokenizer041,CFG_ex041)
# shuffle=False keeps predictions aligned with the rows of `test`;
# every batch is padded to its own longest sequence.
test_loader = DataLoader(test_dataset, 
                         batch_size=CFG_ex041.batch_size,
                         shuffle=False, 
                         collate_fn=DataCollatorWithPadding(tokenizer=tokenizer041, padding='longest'),
                         pin_memory=True, drop_last=False)
for fold in tqdm(range(CFG_ex041.n_fold)):
    model = custom_model_ex041()
    # Deserialize on the CPU first; model.to(device) below moves the weights.
    model.load_state_dict(torch.load(CFG_ex041.model_path + f"_{fold}.pth", map_location="cpu"))
    model.to(device)
    model.eval()
    
    # Collect the padded batches and concatenate once, instead of re-copying a
    # growing array after every batch.
    batch_preds = []
    with torch.no_grad():  
        for d in test_loader:
            ids = d['input_ids'].to(device)
            mask = d['attention_mask'].to(device)
            token_type = d["token_type_ids"].to(device)
            with autocast():
                outputs = model(ids,mask,token_type)
            # Clip to max_len so an over-long batch cannot produce a negative pad width
            # (positions past max_len are dropped), then zero-pad on the right.
            outputs = outputs.sigmoid().detach().cpu().numpy()[:, :CFG_ex041.max_len]
            padding = np.zeros([len(outputs), CFG_ex041.max_len - outputs.shape[1], 1])
            batch_preds.append(np.concatenate([outputs, padding], axis=1))
    if batch_preds:
        test_preds_ = np.concatenate(batch_preds, axis=0)
    else:
        # Empty loader: keep the shape compatible with sub_preds041.
        test_preds_ = np.zeros((0, CFG_ex041.max_len, 1))
    
    torch.cuda.empty_cache()
    sub_preds041 += test_preds_ / CFG_ex041.n_fold
    del model,test_preds_,batch_preds; gc.collect()
print(sub_preds041)
del test_dataset,test_loader
gc.collect()

In [None]:
# Project each model's per-token probabilities onto the characters of the note text,
# using that model's own tokenizer for the offsets.
# reshape([-1,512]) drops the trailing singleton axis; 512 equals max_len in all three CFG_ex* classes.
# NOTE(review): get_char_probs tokenizes the note alone, while the models were fed the
# (note, feature) pair — this assumes the note tokens occupy the same leading positions
# in both encodings; confirm.
char_probs038 = get_char_probs(test['pn_history'].values, sub_preds038.reshape([-1,512]), tokenizer038)
char_probs041 = get_char_probs(test['pn_history'].values, sub_preds041.reshape([-1,512]), tokenizer041)
char_probs051 = get_char_probs(test['pn_history'].values, sub_preds051.reshape([-1,512]), tokenizer051)

In [None]:
# ===========================
# nakama ex141
# ===========================
# ====================================================
# CFG
# ====================================================
class CFG:
    """Settings for the ex141 (deberta-v3-large) inference section.

    ``tokenizer`` and ``target_size`` are attached to this class at runtime by
    later cells. Several fields (max_len, seed, n_fold, target_sizes) are not
    read by the inference cells shown in this notebook.
    """

    num_workers = 4
    # Directory holding the saved config, the tokenizer folder and the fold checkpoints.
    path = "../input/nbme-debertav3large-exp141/"
    config_path = path + 'config.pth'
    # Model identifier; also used (with '/' replaced by '-') in checkpoint file names.
    model = "microsoft/deberta-v3-large"
    batch_size = 24
    fc_dropout = 0.
    max_len = 315
    seed = 42
    n_fold = 4
    # Folds to load, paired positionally with the loss each fold was trained with.
    trn_fold = [0, 1, 2, 3]
    losses = ['bce', 'bce', 'bce', 'bce']
    target_sizes = [1, 1, 1, 1]

In [None]:
# ====================================================
# tokenizer
# ====================================================
# DeBERTa v2/v3 models load their tokenizer through DebertaV2TokenizerFast explicitly;
# every other model name goes through AutoTokenizer. Both read CFG.path + 'tokenizer/'.
if any(tag in CFG.model for tag in ("deberta-v2", "deberta-v3")):
    CFG.tokenizer = DebertaV2TokenizerFast.from_pretrained(CFG.path+'tokenizer/')
else:
    CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.path+'tokenizer/')

In [None]:
# Lower-cased copy of the test frame for the ex141 section; `test` itself is left untouched.
test_n = test.copy().assign(
    feature_text=lambda frame: frame['feature_text'].str.lower(),
    pn_history=lambda frame: frame['pn_history'].str.lower(),
)

In [None]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text, feature_text):
    """Encode a (text, feature_text) pair with cfg.tokenizer and convert fields to long tensors.

    No max_length or padding is applied here; sequences keep their natural length.
    The mapping returned by the tokenizer is modified in place and returned.

    NOTE: this 3-argument definition replaces the earlier 4-argument prepare_input
    once this cell has run.
    """
    inputs = cfg.tokenizer(
        text,
        feature_text,
        add_special_tokens=True,
        return_offsets_mapping=False,
    )
    for key in list(inputs.keys()):
        inputs[key] = torch.tensor(inputs[key], dtype=torch.long)
    return inputs


class TestDataset(Dataset):
    """Dataset yielding tokenized (pn_history, feature_text) pairs via cfg.tokenizer.

    Constructor order is (cfg, df). NOTE: this definition replaces the earlier
    TestDataset, whose constructor took (df, tokenizer, CFG); cells written for
    the earlier class will fail if re-run after this one.
    """

    def __init__(self, cfg, df):
        self.cfg = cfg
        texts_feature = df['feature_text'].values
        texts_note = df['pn_history'].values
        self.feature_texts = texts_feature
        self.pn_historys = texts_note

    def __len__(self):
        return len(self.feature_texts)

    def __getitem__(self, item):
        # The note text is the first segment, the feature text the second.
        note = self.pn_historys[item]
        feature = self.feature_texts[item]
        return prepare_input(self.cfg, note, feature)

In [None]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    """Transformer backbone with dropout and a single linear layer applied per position.

    Args:
        cfg: settings object; reads cfg.model, cfg.fc_dropout and cfg.target_size.
        config_path: when given, the model config is loaded from this file with
            torch.load; otherwise it is fetched with AutoConfig for cfg.model.
        pretrained: load pretrained backbone weights when True, otherwise build the
            backbone from the config only.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        # NOTE(review): torch.load unpickles the file; only use config files you trust.
        self.config = (
            torch.load(config_path)
            if config_path is not None
            else AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        )
        self.model = (
            AutoModel.from_pretrained(cfg.model, config=self.config)
            if pretrained
            else AutoModel.from_config(self.config)
        )
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, self.cfg.target_size)
        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Initialise one module: normal(0, initializer_range) weights, zero bias / padding row."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Return the first element of the backbone output for the keyword inputs."""
        return self.model(**inputs)[0]

    def forward(self, inputs):
        """Apply dropout and the linear layer to the backbone features."""
        return self.fc(self.fc_dropout(self.feature(inputs)))

In [None]:
# ====================================================
# inference
# ====================================================
def inference_fn(test_loader, model, device, loss='bce'):
    """Run the model over a loader and return per-token probabilities as one 2-D array.

    Args:
        test_loader: iterable of dict-like batches of tensors; must support len().
        model: module called as model(inputs); moved to ``device`` and set to eval mode.
        device: target device for the model and every batch tensor.
        loss: 'bce' applies a sigmoid to the output; 'ce' applies a softmax over the
            last axis and keeps the probability at index 1.

    Returns:
        Array of shape (n_examples, max_len), where max_len is the longest batch
        width seen; shorter batches are zero-padded on the right. An empty loader
        yields an array of shape (0, 0).

    Raises:
        ValueError: if ``loss`` is neither 'bce' nor 'ce'.
    """
    if loss not in ('bce', 'ce'):
        # Previously an unknown loss produced no predictions and failed later on max([]).
        raise ValueError(f"unsupported loss: {loss!r} (expected 'bce' or 'ce')")
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        if loss == 'bce':
            batch_probs = y_preds.sigmoid().to('cpu').numpy()
        else:
            # Slice with 1:2 to keep the trailing axis, so the padding step below sees
            # the same (batch, seq, 1) layout as in the 'bce' case.
            batch_probs = y_preds.softmax(2).to('cpu').numpy()[:, :, 1:2]
        preds.append(batch_probs)
    if not preds:
        return np.zeros((0, 0))
    max_len = max(pred.shape[1] for pred in preds)
    padded = []
    for pred in preds:
        p = np.zeros((pred.shape[0], max_len, 1))
        p[:, :pred.shape[1], :] = pred
        padded.append(p)
    return np.concatenate(padded).reshape((-1, max_len))

In [None]:
test_dataset = TestDataset(CFG, test_n)
# shuffle=False keeps predictions aligned with the rows of test_n.
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         collate_fn=DataCollatorWithPadding(tokenizer=CFG.tokenizer, padding='longest'),
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
# One entry per fold; each entry is a list of per-example character-probability arrays.
predictions = []
for fold, loss in zip(CFG.trn_fold, CFG.losses):
    # CustomModel reads cfg.target_size for the width of its output layer.
    CFG.target_size = 1 if loss == 'bce' else 2
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    state = torch.load(CFG.path+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device, loss)
    char_probs = get_char_probs(test_n['pn_history'].values, prediction, CFG.tokenizer)
    predictions.append(char_probs)
    del model, state, prediction, char_probs; gc.collect()
    torch.cuda.empty_cache()
# Average the folds example by example. The per-example arrays have different lengths
# (one value per character), so the nested list is ragged; averaging it in one
# np.mean call depends on ragged-array creation, which newer NumPy rejects.
predictions = [np.mean(per_fold, axis=0) for per_fold in zip(*predictions)]
# Print a summary instead of the full list of arrays.
print(f"ex141: averaged {len(CFG.trn_fold)} folds over {len(predictions)} examples")

In [None]:
# Weighted blend of the four character-probability sources, one array per test row
# (ex038 * w1 + ex041 * w2 + ex051 * w3 + ex141 * w4).
# NOTE(review): the ex141 arrays were built from lower-cased text; the element-wise sum
# assumes lower-casing left every note's length unchanged — confirm.
char_probs = []
for i in range(len(char_probs038)):
    char_probs.append(char_probs038[i] * w1 + 
                      char_probs041[i] * w2 + 
                      char_probs051[i] * w3 + 
                      predictions[i] * w4)
case_nums = test["case_num"].values
# Threshold per case (th_dict), parse the span strings into [start, end] pairs,
# apply the rule-based corrections, then serialize back to span strings.
results = get_results_raw_pp(char_probs, case_nums, th_dict=th_dict)
results = get_predictions(results)
results_postprocess = postprocess(test['pn_history'].values, results)
results_postprocess = get_results_from_preds_list(results_postprocess)

In [None]:
# Attach the span strings to the (token-length-sorted) test frame and write the
# id/location columns to submission.csv without the index column.
test['location'] = results_postprocess
display(test.head())
test[['id', 'location']].to_csv('submission.csv', index=False)