# Intro
Competition home page: https://www.kaggle.com/competitions/nbme-score-clinical-patient-notes

This notebook is a starter kit for transfer learning with pre-trained Transformers models.

We employ the code from the following references.

* Hugging Face - Fine tune NER models: https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb

* Hugging Face - Create a dataset loading script: https://huggingface.co/docs/datasets/dataset_script
 
* Hugging Face - dataset loading script template: https://github.com/huggingface/datasets/blob/master/datasets/wnut_17/wnut_17.py

For training, we use the CoNLL-format data produced by https://www.kaggle.com/code/crischir/nbme2-conll-via-spacy. This data is loaded by the utility script under usr/lib, which is imported from GitHub.

For prediction, we use the Transformers pipeline to load the fine-tuned model:
https://huggingface.co/docs/transformers/pipeline_tutorial

Finally, we use displaCy to visualize the results.

# Import Libraries

In [None]:
%%capture
!pip install seqeval

In [None]:
import os
import random
import numpy as np
import pandas as pd
import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification, pipeline
from datasets import load_dataset, load_metric
from spacy import displacy

# Sets WANDB_DISABLED for this process -- presumably so the Trainer does not
# try to log to Weights & Biases; verify against the transformers version in use.
os.environ["WANDB_DISABLED"] = "true"

# Utilities

In [None]:
def tokenize_and_align_labels(task, examples, tokenizer, label_all_tokens=True):
    """Tokenize pre-split words and attach label ids aligned to the sub-tokens.

    Args:
        task: Prefix of the tag column; word labels are read from
            ``examples[f"{task}_tags"]``.
        examples: Batch mapping holding a ``"tokens"`` entry (one word list per
            example) and the tag column named above.
        tokenizer: Callable invoked with ``truncation=True`` and
            ``is_split_into_words=True``; its result must expose
            ``word_ids(batch_index=...)`` and accept item assignment.
        label_all_tokens: When true, every sub-token of a word gets the word's
            label; when false, only the first sub-token does and the remaining
            ones get -100.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry containing one
        list of label ids per example.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def align(word_labels, word_ids):
        # Pair each position with the word id of the position just before it;
        # the very first position has no predecessor, hence the leading None.
        preceding = [None] + list(word_ids)[:-1]
        aligned = []
        for current, before in zip(word_ids, preceding):
            if current is None:
                # Special tokens carry no word id; -100 marks them as ignored
                # by the loss function.
                aligned.append(-100)
            elif label_all_tokens or current != before:
                # First sub-token of a word, or any sub-token when all are labelled.
                aligned.append(word_labels[current])
            else:
                # Continuation sub-token that should not contribute a label.
                aligned.append(-100)
        return aligned

    encoded["labels"] = [
        align(word_labels, encoded.word_ids(batch_index=row))
        for row, word_labels in enumerate(examples[f"{task}_tags"])
    ]
    return encoded


def compute_metrics(p, label_list):
    """Score token-level predictions with the ``seqeval`` metric.

    Args:
        p: Pair ``(predictions, labels)``. The predictions are reduced with an
            argmax over axis 2; labels equal to -100 mark positions to skip.
        label_list: Sequence mapping a label id to its tag name.

    Returns:
        Dict with ``precision``, ``recall``, ``f1`` and ``accuracy`` taken from
        the metric's ``overall_*`` results.
    """
    scores, gold = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold):
        # Keep only the positions whose reference label is not the ignore index.
        kept = [
            (predicted_id, gold_id)
            for predicted_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([label_list[predicted_id] for predicted_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    seqeval = load_metric("seqeval")
    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary


def getDatasets(datasetScript, task, tokenizer, split=False, debug=False):
    """Load the dataset, tokenize it, and return train/eval sets plus label names.

    Args:
        datasetScript: Path or name passed to ``load_dataset``.
        task: Prefix of the tag column (``f"{task}_tags"``) holding the labels.
        tokenizer: Tokenizer handed to ``tokenize_and_align_labels``.
        split: When true, the tokenized ``"train"`` split is divided 80/20 and
            the held-out 20% is used for evaluation; when false, the dataset's
            own ``"validation"`` split is used.
        debug: When true, both sets are shuffled (seed 42) and cut down to at
            most 100 examples each.

    Returns:
        Tuple ``(train_dataset, eval_dataset, labelList)``.
    """
    datasets = load_dataset(datasetScript)
    labelList = datasets["train"].features[f"{task}_tags"].feature.names
    tokenizedDatasets = datasets.map(
        lambda batch: tokenize_and_align_labels(
            task=task,
            examples=batch,
            tokenizer=tokenizer,
            label_all_tokens=True,
        ),
        batched=True,
    )
    if split:
        tokenizedDatasets = tokenizedDatasets["train"].train_test_split(test_size=0.2)
        eval_dataset = tokenizedDatasets["test"]
    else:
        eval_dataset = tokenizedDatasets["validation"]
    train_dataset = tokenizedDatasets["train"]

    if debug:
        # Clamp the subset size to the split length: selecting range(100) on a
        # split with fewer than 100 rows would request indices that do not exist.
        debugSize = 100
        train_dataset = train_dataset.shuffle(seed=42).select(
            range(min(debugSize, len(train_dataset)))
        )
        eval_dataset = eval_dataset.shuffle(seed=42).select(
            range(min(debugSize, len(eval_dataset)))
        )

    return train_dataset, eval_dataset, labelList


def runTrainer(cfg):
    """Fine-tune a token-classification model as described by ``cfg`` and save it.

    Args:
        cfg: Configuration object providing ``modelCheckpoint``, ``datasetScript``,
            ``task``, ``split``, ``debug``, ``batchSize``, ``outdir`` and
            ``saveModelName``.

    Side effects:
        Trains the model, saves train and eval metrics through the Trainer, and
        saves the final model under ``os.path.join(cfg.outdir, cfg.saveModelName)``.
    """
    # Set datasets
    tokenizer = AutoTokenizer.from_pretrained(cfg.modelCheckpoint)
    # The label alignment step calls word_ids() on the tokenizer output, which
    # is why a fast tokenizer is required here.
    assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
    train_dataset, eval_dataset, label_list = getDatasets(
        datasetScript=cfg.datasetScript,
        task=cfg.task,
        tokenizer=tokenizer,
        split=cfg.split,
        debug=cfg.debug,
    )

    # Set trainer
    # Store the id <-> tag-name mapping in the model config so the saved model
    # reports real tag names instead of generic LABEL_i placeholders.
    id2label = dict(enumerate(label_list))
    label2id = {name: idx for idx, name in id2label.items()}
    model = AutoModelForTokenClassification.from_pretrained(
        cfg.modelCheckpoint,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id,
    )
    model_name = cfg.modelCheckpoint.split("/")[-1]
    args = TrainingArguments(
        output_dir=os.path.join(cfg.outdir, f"{model_name}-finetuned-{cfg.task}"),
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=cfg.batchSize,
        per_device_eval_batch_size=cfg.batchSize,
        num_train_epochs=3,
        weight_decay=0.01,
    )
    data_collator = DataCollatorForTokenClassification(tokenizer)
    trainer = Trainer(
        model,
        args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=lambda x: compute_metrics(x, label_list),
    )

    # Train
    trainOutput = trainer.train()
    print("Training completed.")
    trainer.save_metrics(split="train", metrics=trainOutput.metrics)
    trainer.save_metrics(split="eval", metrics=trainer.evaluate())
    print("Evaluation completed.")
    # NOTE: no prediction on a separate test split is run here.
    trainer.save_model(output_dir=os.path.join(cfg.outdir, cfg.saveModelName))
    print("Model saved.")

    
class CFG:
    """Settings consumed by the training and prediction cells of this notebook."""

    # --- model and data ---
    # Checkpoint loaded for both the tokenizer and the model.
    modelCheckpoint = "distilbert-base-uncased"
    # Dataset loading script handed to load_dataset.
    datasetScript = "../usr/lib/nbmedatasetloadingscript/nbmedatasetloadingscript.py"
    # Tag-column prefix ("<task>_tags"); also part of the training output dir name.
    task = "ner"

    # --- training ---
    # Per-device batch size, used for both training and evaluation.
    batchSize = 16
    # True: hold out 20% of "train" for evaluation; False: use the "validation" split.
    split = False
    # True: train/evaluate on a small shuffled subset only.
    # WARNING: disabling this may cause CUDA OOM!
    debug = True

    # --- output ---
    # Base directory for training output and the saved model.
    outdir = "./"
    # Sub-directory (under outdir) the fine-tuned model is saved to.
    saveModelName = "mytrfmodel"


# Train

In [None]:
# Work on an instance rather than the class itself, so the overrides below
# stay local to this run and do not rewrite the CFG class defaults.
cfg = CFG()
cfg.split = True
cfg.debug = True
runTrainer(cfg)

# Predict

In [None]:
task = "token-classification"
# Build the path exactly as runTrainer does when saving, so the model is still
# found when cfg.outdir is changed from its default.
modelName = os.path.join(cfg.outdir, cfg.saveModelName)
tokenizer = AutoTokenizer.from_pretrained(modelName)
model = AutoModelForTokenClassification.from_pretrained(modelName)

# Token-classification pipeline over the fine-tuned model; "simple" aggregation
# asks the pipeline to merge token-level predictions into entity groups.
nlp = pipeline(
    task=task,
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

Note: when predicted entity labels appear as generic `LABEL_i` names, `LABEL_i` corresponds to `label_list[i]`.

In [None]:
# Draw one random patient note and run the NER pipeline on its history text.
notes = pd.read_csv("../input/nbme-score-clinical-patient-notes/patient_notes.csv")
sample = notes.sample(1)
text = sample["pn_history"].values[0]
doc = nlp(text)

# Keep only the span boundaries and the entity group, exposing the latter
# under the "label" key used in the manual-mode entity records below.
df = pd.DataFrame(doc, columns=["start", "end", "entity_group"]).rename(
    columns={"entity_group": "label"}
)

ex = [
    {
        "text": text,
        "ents": df.to_dict("records"),
        "title": None,
    }
]
displacy.render(ex, style="ent", manual=True)