## This is the partner notebook to [my training notebook](https://www.kaggle.com/nbroad/qa-ner-hybrid-train-nbme)

(Version 26) 5 folds of deberta-v3-base on leakless-training, annotation corrections, no pseudolabels, AND blank annotations. CV/LB is now 0.86/0.86  
(Version 25) 5 folds of deberta-v3-base on leakless-training, annotation corrections, and no pseudolabels.  
(Version 24) 5 folds of deberta-v3-base on leakless-training, no annotation corrections, and some pseudolabels.  
(Version 23) Same as Version 21 except biobert/pubmedbert instead of bert-base, leakless-training, annotation corrections, and some pseudolabels.  
(Version 21) 1 fold of bert-base, albert-v2-base, electra-base, deberta-v3-base, roberta-base (similar to versions 11-13) but with corrected training and decoding. I added some extra logic to `add_to_char_preds` because it was skipping over spaces, which caused answers to be split apart at the spaces. (lb .838)  
(Version 17) 5 folds of deberta-v3-base trained for 5 epochs (lb .826) (with fixed training and decoding)
##### **Versions 14/15/16 were trained with a bug related to the offset mapping. This is fixed in Version 17.**
(Version 15) 5 folds of deberta-v3-base trained for 3 epochs  (lb .659)  
(Version 14) 5 folds of deberta-v3-base trained for 5 epochs (lb .671)  

(Version 11-13) This is using 1 fold of bert-base, albert-v2-base, electra-base, deberta-v3-base, roberta-base. This is just an experiment and there is plenty of submission time to do at least 5 folds of each!  (lb .797)

(Version 10) This is using a 5 fold roberta base model so the score will likely go up when using a large model. Please feel free to leave any comments or questions! (lb .809)

In [None]:
# The following is necessary if you want to use the fast tokenizer for deberta v2 or v3
# This must be done before importing transformers
import importlib.util
import shutil
from pathlib import Path

# Locate the installed package instead of assuming a fixed interpreter version.
# `find_spec` on a top-level name only locates the package; it does not import it,
# so the "patch before import" requirement above still holds.
_transformers_spec = importlib.util.find_spec("transformers")
if _transformers_spec is not None and _transformers_spec.submodule_search_locations:
    transformers_path = Path(list(_transformers_spec.submodule_search_locations)[0])
else:
    # Fall back to the location this notebook was originally written against.
    transformers_path = Path("/opt/conda/lib/python3.7/site-packages/transformers")

input_dir = Path("../input/deberta-v2-3-fast-tokenizer")

convert_file = input_dir / "convert_slow_tokenizer.py"
conversion_path = transformers_path / convert_file.name
deberta_v2_path = transformers_path / "models" / "deberta_v2"

# (replacement file, file inside the installed package that it overwrites)
_patches = [(convert_file, conversion_path)]
_patches += [
    (input_dir / filename, deberta_v2_path / filename)
    for filename in ['tokenization_deberta_v2.py', 'tokenization_deberta_v2_fast.py']
]

for _replacement, _target in _patches:
    # Remove the shipped file first, then drop the replacement in its place.
    if _target.exists():
        _target.unlink()
    shutil.copy(_replacement, _target)

In [None]:
from dataclasses import dataclass
from functools import partial
import itertools
from typing import Any

import torch
import pandas as pd
import numpy as np

from datasets import Dataset
from transformers import (
    Trainer,
    TrainingArguments,
    DataCollatorForTokenClassification,
    AutoTokenizer,
    AutoConfig,
    logging,
)
from transformers.file_utils import ModelOutput


logging.set_verbosity(logging.WARNING)

In [None]:
class CFG:
    """Inference settings read by the cells below."""

    # One checkpoint directory per fold; `{fold}` is filled in with the fold index.
    model_path = "../input/5-deb-v3/fold{fold}"
    n_folds = 5

    # Tokenization settings: maximum sequence length and the number of worker
    # processes handed to `Dataset.map`.
    max_length = 512
    num_proc = 2

    # Arguments handed to the `Trainer` that runs prediction.
    args = TrainingArguments(
        output_dir=".",
        per_device_eval_batch_size=64,
        dataloader_num_workers=2,
    )

In [None]:
test_df = pd.read_csv("../input/nbme-score-clinical-patient-notes/test.csv")
notes_df = pd.read_csv("../input/nbme-score-clinical-patient-notes/patient_notes.csv")
feats_df = pd.read_csv("../input/nbme-score-clinical-patient-notes/features.csv")

# Left joins keep every test row. No `on=` is given, so pandas joins on
# whatever column names the two frames have in common.
merged = test_df.merge(notes_df, how="left").merge(feats_df, how="left")

# Ordering rows by note length groups similarly sized texts together, so
# batches should need less padding --> faster inference.
merged["length"] = merged["pn_history"].map(len)
merged = merged.sort_values(by="length")

merged.head()

In [None]:
def process_feature_text(text):
    """Return `text` with "-OR-" turned into " or " and every other "-" into a space."""
    # Order matters: "-OR-" has to be handled before the bare hyphens it contains.
    for old, new in (("-OR-", " or "), ("-", " ")):
        text = text.replace(old, new)
    return text

def tokenize(examples, tokenizer, max_length):
    """
    Encode (feature_text, pn_history) pairs and attach per-row sequence ids.

    Args:
        examples: batch mapping with "feature_text" and "pn_history" entries.
        tokenizer: callable tokenizer whose result supports item access and
            exposes `sequence_ids(row_index)`.
        max_length: maximum encoded length; only the second text is truncated.

    Returns:
        The tokenizer result with an extra "sequence_ids" entry holding one
        `sequence_ids(i)` list per encoded row.
    """
    encoded = tokenizer(
        examples["feature_text"],
        examples["pn_history"],
        padding=True,
        max_length=max_length,
        truncation="only_second",
        return_offsets_mapping=True,
    )
    n_rows = len(encoded["input_ids"])
    encoded["sequence_ids"] = list(map(encoded.sequence_ids, range(n_rows)))
    return encoded

def _clean_feature_text(example):
    """Return the normalized "feature_text" for a single dataset row."""
    return {"feature_text": process_feature_text(example["feature_text"])}


# Build the dataset from the merged frame, then normalize the feature text row by row.
ds = Dataset.from_pandas(merged).map(_clean_feature_text, num_proc=CFG.num_proc)

In [None]:
@dataclass
class TokenClassifierOutput(ModelOutput):
    """
    Output container returned by the custom token-classification `forward` in this file.

    Args:
        loss (*optional*):
            Classification loss. The `forward` defined in this file never computes one,
            so it is always `None` there.
        logits:
            Per-token scores. The `forward` defined in this file stores the classifier
            output *after* a sigmoid, so despite the field name these are values in
            (0, 1) rather than raw logits. Presumably shaped
            `(batch_size, sequence_length, 1)`, since the classifier head has a single
            output unit — TODO confirm.
    """

    loss: Any = None
    logits: Any = None


# Functions that are similar across all models
def __init__(self, config):
    """
    Shared constructor installed on the dynamically created model class.

    Builds the backbone from `self.ModelClass`, stores it under the attribute
    named by `self.backbone_name`, and adds a linear head with one output unit.
    """
    super(self.PreTrainedModel, self).__init__(config)

    # Only "bert" and "roberta" backbones are built with the pooling layer
    # switched off; every other model type is constructed without extra kwargs.
    if config.model_type in {"bert", "roberta"}:
        backbone = self.ModelClass(config, add_pooling_layer=False)
    else:
        backbone = self.ModelClass(config)
    setattr(self, self.backbone_name, backbone)

    self.classifier = torch.nn.Linear(config.hidden_size, 1)


def forward(
    self,
    input_ids=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    labels=None,
):
    """
    Shared forward pass installed on the dynamically created model class.

    Runs the backbone stored under `self.backbone_name`, applies the linear
    head to the first element of its output, and returns the sigmoid of that
    result in the `logits` field. `labels` is accepted but not used; `loss` is
    always `None`.
    """
    backbone = getattr(self, self.backbone_name)
    hidden_states = backbone(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
    )[0]

    token_probs = self.classifier(hidden_states).sigmoid()

    return TokenClassifierOutput(loss=None, logits=token_probs)

In [None]:
def get_model(config, init=False):
    """
    Build a token-classification model class matching `config`.

    The `transformers` classes `<Prefix>PreTrainedModel` and `<Prefix>Model` are
    looked up by name, where `<Prefix>` is the config class name minus its last
    six characters (the length of "config"). A new class "CustomModel" is then
    derived from the former, using this file's `__init__` and `forward`.

    Args:
        config: model configuration; its class name and `model_type` drive the lookup.
        init: when true, return `CustomModel(config)`; otherwise return the class.
    """

    def _from_transformers(attr):
        # Resolve `attr` on the transformers package by name.
        return getattr(__import__("transformers", fromlist=[attr]), attr)

    prefix = type(config).__name__[: -len("config")]

    # Bart is looked up with a lower-case "t": BartPretrainedModel.
    base_suffix = "PretrainedModel" if prefix == "Bart" else "PreTrainedModel"
    base_cls = _from_transformers(f"{prefix}{base_suffix}")
    backbone_cls = _from_transformers(f"{prefix}Model")

    custom_cls = type(
        "CustomModel",
        (base_cls,),
        {"__init__": __init__, "forward": forward},
    )

    custom_cls._keys_to_ignore_on_load_unexpected = [r"pooler"]
    custom_cls._keys_to_ignore_on_load_missing = [r"position_ids"]

    custom_cls.PreTrainedModel = base_cls
    custom_cls.ModelClass = backbone_cls

    # changes deberta-v2 --> deberta
    custom_cls.backbone_name = (
        "deberta" if "deberta" in config.model_type else config.model_type
    )

    return custom_cls(config) if init else custom_cls


def get_pretrained(model_name_or_path, config, **kwargs):
    """
    Load weights into the custom model class built for `config`.

    Args:
        model_name_or_path: forwarded as `pretrained_model_name_or_path`.
        config: model configuration, used both to build the class and for loading.
        **kwargs: passed through unchanged to `from_pretrained`.
    """
    custom_cls = get_model(config, init=False)
    load_kwargs = dict(
        pretrained_model_name_or_path=model_name_or_path,
        config=config,
    )
    return custom_cls.from_pretrained(**load_kwargs, **kwargs)

In [None]:
def get_char_preds(preds, dataset):
    """
    Spread token-level predictions over the characters of each note.

    Args:
        preds: one sequence of per-token predictions per example.
        dataset: mapping-like object with "offset_mapping", "sequence_ids" and
            "pn_history" columns, aligned with `preds`.

    Returns:
        A list with one float array per example, as long as that example's
        "pn_history" text. Characters not covered by any kept token stay 0.
    """
    offset_col = dataset["offset_mapping"]
    seq_id_col = dataset["sequence_ids"]
    text_col = dataset["pn_history"]

    all_predictions = []
    for token_scores, token_offsets, token_seq_ids, note in zip(
        preds, offset_col, seq_id_col, text_col
    ):
        char_scores = np.zeros(len(note))
        for score, span, seq_id in zip(token_scores, token_offsets, token_seq_ids):
            # Only tokens whose sequence id is neither None nor 0 are written out.
            if seq_id is not None and seq_id != 0:
                first, last = span
                char_scores[first:last] = score
        all_predictions.append(char_scores)

    return all_predictions

# https://www.kaggle.com/code/yasufuminakama?scriptVersionId=87264998&cellId=11
def get_results(char_probs, th=0.5):
    """
    Turn per-character probabilities into span strings.

    Args:
        char_probs: iterable of 1-D probability arrays, one per example.
        th: characters with probability >= `th` are selected.

    Returns:
        One string per example: runs of consecutive selected positions written
        as "first last" and joined with ";". Positions are the selected array
        indices plus one. An example with no selected character gives "".
    """
    results = []
    for char_prob in char_probs:
        positions = np.where(char_prob >= th)[0] + 1

        spans = []
        run_first = run_last = None
        for pos in positions:
            if run_last is not None and pos == run_last + 1:
                # Still inside the current run of consecutive positions.
                run_last = pos
                continue
            if run_first is not None:
                spans.append(f"{run_first} {run_last}")
            run_first = run_last = pos
        if run_first is not None:
            spans.append(f"{run_first} {run_last}")

        results.append(";".join(spans))
    return results

In [None]:
%env TOKENIZERS_PARALLELISM=false

# Running sum of character-level predictions, one array per note in `ds`.
all_char_preds = [np.zeros(len(t)) for t in ds["pn_history"]]

for fold in range(CFG.n_folds):

    # This gets reset each loop for some reason
    logging.set_verbosity(logging.WARNING)

    model_path = CFG.model_path.format(fold=fold)

    config = AutoConfig.from_pretrained(model_path)
    print(f"Getting predictions for {config._name_or_path}")

    if "deberta-v2" in config.model_type:
        from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast
        tokenizer = DebertaV2TokenizerFast.from_pretrained(model_path)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_path)

    tokenized_ds = ds.map(
        partial(tokenize, tokenizer=tokenizer, max_length=CFG.max_length),
        batched=True,
        num_proc=CFG.num_proc,
    )

    model = get_model(config=config, init=True)
    # map_location="cpu" lets a checkpoint that was saved from GPU tensors load
    # on a machine without a GPU.
    state_dict = torch.load(model_path + "/pytorch_model.bin", map_location="cpu")
    model.load_state_dict(state_dict)

    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding="longest")

    trainer = Trainer(
        model=model,
        args=CFG.args,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    results = trainer.predict(tokenized_ds)

    char_preds = get_char_preds(results.predictions, tokenized_ds)
    all_char_preds = [x + y for x, y in zip(all_char_preds, char_preds)]

    # Drop this fold's references before emptying the cache; otherwise the weights
    # are still reachable and remain allocated while the next fold is being built.
    del trainer, model, state_dict
    torch.cuda.empty_cache()

# Average the summed predictions over the folds.
all_char_preds = [cp / CFG.n_folds for cp in all_char_preds]

In [None]:
# Convert the averaged character probabilities into "start end;start end" strings.
location_predictions = get_results(all_char_preds)

# Take the ids from `ds`: `all_char_preds` is built in the row order of `ds`, and this
# avoids depending on `tokenized_ds`, which only exists as a leftover of the last
# fold-loop iteration.
submission_df = pd.DataFrame(data={
            "id": ds["id"],
            "location": location_predictions
        })

submission_df.to_csv("submission.csv", index=False)
display(submission_df.head())