- Training notebook: https://www.kaggle.com/hengzheng/training-qa-roberta-base-5-folds
- Based on: https://www.kaggle.com/tomohiroh/nbme-bert-for-beginners

In [None]:
import warnings
warnings.simplefilter('ignore')

import os
import gc

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm, trange

import torch
from transformers import AutoModel, AutoTokenizer, AutoConfig

# config

In [None]:
# --- input locations (Kaggle dataset mounts) ---
# Directory the competition CSV files are read from.
ROOT = "../input/nbme-score-clinical-patient-notes"
# Local copy of the pretrained encoder/tokenizer passed to `from_pretrained`.
MODEL_NAME = "../input/roberta-base"
# Directory holding the per-fold fine-tuned checkpoints (`nbme_{fold}.pth`).
MODELS_PATH = "../input/training-qa-roberta-base-5-folds"

# --- inference settings ---
# Number of checkpoints whose predictions are averaged.
N_FOLDS = 5
# Rows per DataLoader batch.
BATCH_SIZE = 16

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
# Bare expression so the notebook cell displays the chosen device.
DEVICE

# helper functions

In [None]:
def create_test_df():
    """Assemble the inference table from the competition CSV files.

    Reads ``features.csv``, ``patient_notes.csv`` and ``test.csv`` from
    ``ROOT``, left-joins the notes and then the features onto the test rows,
    and normalises the ``feature_text`` column.

    Both joins are left merges without an explicit ``on=``, so pandas joins on
    whatever column names the two frames have in common.

    Returns:
        pandas.DataFrame: the test rows enriched with the note and feature
        columns, with ``feature_text`` cleaned up.
    """
    features = pd.read_csv(f"{ROOT}/features.csv")
    # Hard-coded override of one feature description (row label 27);
    # presumably corrects a typo in the raw file -- confirm against the data.
    features.loc[27, 'feature_text'] = "Last-Pap-smear-1-year-ago"

    patient_notes = pd.read_csv(f"{ROOT}/patient_notes.csv")
    test_rows = pd.read_csv(f"{ROOT}/test.csv")

    frame = test_rows.merge(patient_notes, how="left").merge(features, how="left")

    # "-OR-" becomes ";-" first, then every remaining hyphen becomes a space,
    # so "a-OR-b-c" turns into "a; b c".
    frame["feature_text"] = [
        raw.replace("-OR-", ";-").replace("-", " ")
        for raw in frame["feature_text"]
    ]

    return frame

In [None]:
class NBMETestData(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes (feature_text, pn_history) pairs on demand.

    Args:
        data: DataFrame with ``feature_text`` and ``pn_history`` columns.
        tokenizer: callable tokenizer; it is invoked with the two texts and
            must return an encoding that exposes ``input_ids``,
            ``attention_mask``, ``offset_mapping`` and ``sequence_ids()``.
        max_length: length every pair is padded / truncated to. Defaults to
            416, the value that used to be hard-coded.
    """

    def __init__(self, data, tokenizer, max_length=416):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, offset_mapping, sequence_ids)``
        as NumPy arrays for the row at *position* ``idx``."""
        # A DataLoader samples positions 0..len-1, so index by position.
        # Label-based `.loc` breaks on frames whose index is not 0..n-1
        # (e.g. after filtering).
        example = self.data.iloc[idx]
        tokenized = self.tokenizer(
            example["feature_text"],
            example["pn_history"],
            truncation="only_second",  # never cut the feature text, only the note
            max_length=self.max_length,
            padding="max_length",
            return_offsets_mapping=True,
        )

        input_ids = np.array(tokenized["input_ids"])
        attention_mask = np.array(tokenized["attention_mask"])
        offset_mapping = np.array(tokenized["offset_mapping"])
        # The float cast turns the `None` entries (tokens belonging to neither
        # text) into NaN, which gives a purely numeric array.
        sequence_ids = np.array(tokenized.sequence_ids()).astype("float16")

        return input_ids, attention_mask, offset_mapping, sequence_ids

In [None]:
class NBMEModel(torch.nn.Module):
    """Per-token binary classifier: encoder -> dropout -> one-logit linear head.

    The encoder and its config are both loaded from ``MODEL_NAME``. Attribute
    names (``backbone``, ``dropout``, ``classifier``) determine the state-dict
    keys, so they must match the checkpoints being loaded.
    """

    def __init__(self):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(MODEL_NAME)
        self.config = AutoConfig.from_pretrained(MODEL_NAME)
        self.dropout = torch.nn.Dropout(0.2)
        hidden_size = self.config.hidden_size
        self.classifier = torch.nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return one logit per token; the trailing size-1 axis is squeezed away."""
        encoder_out = self.backbone(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # Element 0 of the encoder output; for Hugging Face encoders this is
        # documented as the per-token last hidden state (not the pooled vector).
        token_states = encoder_out[0]
        token_states = self.dropout(token_states)
        return self.classifier(token_states).squeeze(-1)

In [None]:
def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``; maps logits to probabilities."""
    return 1/(1 + np.exp(-z))


def get_location_predictions(preds, offset_mapping, sequence_ids, test=False):
    """Turn per-token logits into character spans, one result per example.

    Consecutive tokens whose probability exceeds 0.5 are merged into a single
    span running from the first token's start offset to the last token's end
    offset.

    Args:
        preds: iterable of per-example logit arrays (one logit per token).
        offset_mapping: per-example sequences of ``(start, end)`` character
            offsets, aligned with ``preds``.
        sequence_ids: per-example sequence ids aligned with ``preds``. Only
            tokens whose id equals 1 (the second text of the pair) are
            considered; ``None``, NaN and 0 are all skipped.
        test: when true, each example is rendered as a ``"start end; start end"``
            string; otherwise as a list of ``(start, end)`` tuples.

    Returns:
        list: one entry per example, in the format selected by ``test``.
    """
    all_predictions = []
    for pred, offsets, seq_ids in zip(preds, offset_mapping, sequence_ids):
        probs = sigmoid(pred)
        spans = []
        start_idx = None
        end_idx = None
        for p, o, s_id in zip(probs, offsets, seq_ids):
            # Keep only tokens of the second sequence. Comparing against 1
            # also rejects NaN, which is what `None` becomes once the ids have
            # been cast to a float array; an `is None` test alone would let
            # special/padding tokens with (0, 0) offsets corrupt the spans.
            if s_id is None or s_id != 1:
                continue
            if p > 0.5:
                if start_idx is None:
                    start_idx = o[0]
                end_idx = o[1]
            elif start_idx is not None:
                spans.append((start_idx, end_idx))
                start_idx = None
        # Flush a span that is still open when the tokens run out; without
        # this, a prediction reaching the final token would be lost.
        if start_idx is not None:
            spans.append((start_idx, end_idx))

        if test:
            all_predictions.append("; ".join(f"{s} {e}" for s, e in spans))
        else:
            all_predictions.append(spans)
    return all_predictions

# loading test and tokenizer

In [None]:
# Tokenizer comes from the same local directory as the encoder weights.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Build the inference table and wrap it so rows are tokenized on access.
test = create_test_df()
test_ds = NBMETestData(test, tokenizer)

# Order must be preserved and no row dropped: predictions are later written
# back onto `test` row by row.
test_dl = torch.utils.data.DataLoader(
    test_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,
    drop_last=False,
    pin_memory=True,
)

# infer

In [None]:
# Running sum of per-token logits over folds; averaged after the loop.
all_preds = None
# Offsets and sequence ids are identical for every fold, so they are
# collected only once (during fold 0).
offsets = []
seq_ids = []

model = None
for fold in range(N_FOLDS):
    # Drop the previous fold's model *before* building the next one, so that
    # two sets of weights are never resident at the same time.
    model = None
    gc.collect()

    model = NBMEModel().to(DEVICE)
    model.load_state_dict(torch.load(f'{MODELS_PATH}/nbme_{fold}.pth', map_location=DEVICE))
    model.eval()  # disable dropout for inference

    preds = []
    with torch.no_grad():
        for batch in tqdm(test_dl):
            input_ids = batch[0].to(DEVICE)
            attention_mask = batch[1].to(DEVICE)
            logits = model(input_ids, attention_mask)
            preds.append(logits.cpu().numpy())
            if fold == 0:                  # only in the first fold
                offset_mapping = batch[2]
                sequence_ids = batch[3]
                offsets.append(offset_mapping.numpy())
                seq_ids.append(sequence_ids.numpy())

    # `astype` returns a fresh array, so the in-place accumulation below never
    # aliases the per-fold buffer.
    preds = np.concatenate(preds, axis=0).astype(np.float32)
    if all_preds is None:
        all_preds = preds
    else:
        all_preds += preds
    torch.cuda.empty_cache()


all_preds /= N_FOLDS
# Keep the example axis explicit. A bare `.squeeze()` would also remove it
# when there is exactly one test row, and the row-wise span decoding that
# follows would then iterate over scalars instead of examples.
all_preds = all_preds.reshape(len(all_preds), -1)

offsets = np.concatenate(offsets, axis=0)
seq_ids = np.concatenate(seq_ids, axis=0)

print(all_preds.shape, offsets.shape, seq_ids.shape)

In [None]:
# Decode the fold-averaged logits into "start end; start end" strings,
# one per test row.
location_preds = get_location_predictions(
    all_preds,
    offsets,
    seq_ids,
    test=True,
)

# Attach the predictions and write only the two columns that are submitted.
test["location"] = location_preds
submission_columns = ["id", "location"]
test[submission_columns].to_csv("submission.csv", index=False)

# Read the file back so the notebook shows what was actually written.
pd.read_csv("submission.csv").head()