In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input mount,
# then print them one per line.
input_files = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Training of base model:
This is my version of training a baseline model using only the Hugging Face `Trainer`, a custom `compute_loss` function, and `datasets`. It should help people get familiar with the Hugging Face ecosystem without worrying about internal details.

Huge thanks to @nbroad for his notebook https://www.kaggle.com/code/nbroad/qa-ner-hybrid-train-nbme; most of the snippets here are inspired by his work. Check it out if you haven't already.

Check out the inference notebook [0.867]: https://www.kaggle.com/code/raghavendrakotala/inference-deberta-trainer-compute-loss-datasets

### **I Hope this helps in enhancing more NLP skills in your DS journey. Don't forget to upvote if you find it useful :), Thanks!**

In [None]:

from torch.utils.data import Dataset, DataLoader
import pdb
import torch
from torch import cuda
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_recall_fscore_support
import datasets
from functools import partial
from ast import literal_eval
from datetime import datetime
import gc


import warnings
warnings.filterwarnings('ignore')

### Load data and set the config : we will run for 5 folds, 5 epochs

In [None]:
# Run-wide settings: checkpoint path, tokenisation length, batch sizes,
# optimisation hyper-parameters and cross-validation layout.
config = dict(
    model_name='../input/deberta-v3-base/deberta-v3-base/',
    max_length=512,
    train_batch_size=8,
    valid_batch_size=16,
    epochs=5,
    learning_rate=2e-05,
    max_grad_norm=10,
    warmup=0.1,
    grad_acc=8,
    model_save_path="deberta-trained",
    folds=5,
    seed=42,
    num_proc=2,
    # Use the GPU when torch can see one, otherwise fall back to the CPU.
    device='cuda' if cuda.is_available() else 'cpu',
)

In [None]:
# The three competition tables are read from the same Kaggle input folder.
NBME_DIR = "/kaggle/input/nbme-score-clinical-patient-notes"

df_features = pd.read_csv(f"{NBME_DIR}/features.csv")
df_patients = pd.read_csv(f"{NBME_DIR}/patient_notes.csv")
df_train = pd.read_csv(f"{NBME_DIR}/train.csv")

In [None]:
df_patients.head()

In [None]:
df_patients['pn_num'].nunique(), df_patients['case_num'].nunique()

In [None]:
df_train.head()

In [None]:
df_patients.head()

### Load tokenizer and clean the data

While cleaning, we do not remove rows with empty annotations: doing so removes true positive cases and leaves the CV and LB results out of sync ([ref](http://kaggle.com/competitions/nbme-score-clinical-patient-notes/discussion/318224)).

In [None]:
!pip uninstall -q -y transformers

In [None]:
!pip install git+https://github.com/huggingface/transformers.git -qq

In [None]:
from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForTokenClassification


tokenizer = AutoTokenizer.from_pretrained(config['model_name'])

In [None]:
def pre_process_data(df_train, patients=None, features=None):
    """Parse the annotation/location strings and attach note and feature text.

    Args:
        df_train: Frame with string columns ``annotation`` and ``location``,
            each holding a Python-literal list.
        patients: Frame left-merged onto the training rows first (on the
            columns the two frames share). Defaults to the notebook-level
            ``df_patients``.
        features: Frame left-merged afterwards, again on shared columns.
            Defaults to the notebook-level ``df_features``.

    Returns:
        A new frame containing the parsed ``anno_list`` / ``loc_list`` columns
        plus the merged columns. ``df_train`` itself is not modified, so the
        cell can be re-run safely.
    """
    patients = df_patients if patients is None else patients
    features = df_features if features is None else features

    # Work on a copy so the caller's frame does not silently gain columns.
    parsed = df_train.copy()

    # The sample prints use positional access and are skipped for empty frames.
    if len(parsed):
        print(f"before converting annotations of type :{type(parsed.annotation.iloc[0])}, {parsed.annotation.iloc[0]}, location of type: {type(parsed.location.iloc[0])}, {parsed.location.iloc[0]}")
    parsed['anno_list'] = [literal_eval(x) for x in parsed.annotation]
    parsed['loc_list'] = [literal_eval(x) for x in parsed.location]
    if len(parsed):
        # Report the converted columns; the raw string columns are unchanged.
        print(f"after converting annotations of type :{type(parsed.anno_list.iloc[0])}, {parsed.anno_list.iloc[0]}, location of type: {type(parsed.loc_list.iloc[0])}, {parsed.loc_list.iloc[0]}")
    print(f"column names of df_train : {parsed.columns}")

    merged = parsed.merge(patients, how='left')
    print(f"column names of df_train after merging with patietns: {merged.columns}")
    merged = merged.merge(features, how='left')
    print(f"column names of df_train after merging with features: {merged.columns}")
    return merged

In [None]:
merged = pre_process_data(df_train)
merged.shape

In [None]:
# Manual corrections for rows whose annotations are incorrect.
# Each entry is (row label in `merged`, corrected annotation list, corrected
# location list); both lists are written as strings and parsed further down.
annotation_fixes = [
    (338, '["father heart attack"]', '["764 783"]'),
    (621, '["for the last 2-3 months", "over the last 2 months"]', '["77 100", "398 420"]'),
    (655, '["no heat intolerance", "no cold intolerance"]', '["285 292;301 312", "285 287;296 312"]'),
    (1262, '["mother thyroid problem"]', '["551 557;565 580"]'),
    (1265, '[\'felt like he was going to "pass out"\']', '["131 135;181 212"]'),
    (1396, '["stool , with no blood"]', '["259 280"]'),
    (1591, '["diarrhoe non blooody"]', '["176 184;201 212"]'),
    (1615, '["diarrhea for last 2-3 days"]', '["249 257;271 288"]'),
    (1664, '["no vaginal discharge"]', '["822 824;907 924"]'),
    (1714, '["started about 8-10 hours ago"]', '["101 129"]'),
    (1929, '["no blood in the stool"]', '["531 539;549 561"]'),
    (2134, '["last sexually active 9 months ago"]', '["540 560;581 593"]'),
    (2191, '["right lower quadrant pain"]', '["32 57"]'),
    (2553, '["diarrhoea no blood"]', '["308 317;376 384"]'),
    (3124, '["sweating"]', '["549 557"]'),
    (3858, '["previously as regular", "previously eveyr 28-29 days", "previously lasting 5 days", "previously regular flow"]', '["102 123", "102 112;125 141", "102 112;143 157", "102 112;159 171"]'),
    (4373, '["for 2 months"]', '["33 45"]'),
    (4763, '["35 year old"]', '["5 16"]'),
    (4782, '["darker brown stools"]', '["175 194"]'),
    (4908, '["uncle with peptic ulcer"]', '["700 723"]'),
    (6016, '["difficulty falling asleep"]', '["225 250"]'),
    (6192, '["helps to take care of aging mother and in-laws"]', '["197 218;236 260"]'),
    (6380, '["No hair changes", "No skin changes", "No GI changes", "No palpitations", "No excessive sweating"]', '["480 482;507 519", "480 482;499 503;512 519", "480 482;521 531", "480 482;533 545", "480 482;564 582"]'),
    (6562, '["stressed due to taking care of her mother", "stressed due to taking care of husbands parents"]', '["290 320;327 337", "290 320;342 358"]'),
    (6862, '["stressor taking care of many sick family members"]', '["288 296;324 363"]'),
    (7022, '["heart started racing and felt numbness for the 1st time in her finger tips"]', '["108 182"]'),
    (7422, '["first started 5 yrs"]', '["102 121"]'),
    (8876, '["No shortness of breath"]', '["481 483;533 552"]'),
    (9027, '["recent URI", "nasal stuffines, rhinorrhea, for 3-4 days"]', '["92 102", "123 164"]'),
    (9938, '["irregularity with her cycles", "heavier bleeding", "changes her pad every couple hours"]', '["89 117", "122 138", "368 402"]'),
    (9973, '["gaining 10-15 lbs"]', '["344 361"]'),
    (10513, '["weight gain", "gain of 10-16lbs"]', '["600 611", "607 623"]'),
    (11551, '["seeing her son knows are not real"]', '["386 400;443 461"]'),
    (11677, '["saw him once in the kitchen after he died"]', '["160 201"]'),
    (12124, '["tried Ambien but it didnt work"]', '["325 337;349 366"]'),
    (12279, '["heard what she described as a party later than evening these things did not actually happen"]', '["405 459;488 524"]'),
    (12289, '["experienced seeing her son at the kitchen table these things did not actually happen"]', '["353 400;488 524"]'),
    (13238, '["SCRACHY THROAT", "RUNNY NOSE"]', '["293 307", "321 331"]'),
    (13297, '["without improvement when taking tylenol", "without improvement when taking ibuprofen"]', '["182 221", "182 213;225 234"]'),
    (13299, '["yesterday", "yesterday"]', '["79 88", "409 418"]'),
    (13845, '["headache global", "headache throughout her head"]', '["86 94;230 236", "86 94;237 256"]'),
    (14083, '["headache generalized in her head"]', '["56 64;156 179"]'),
]

# Write the corrected strings into the two list columns, row by row.
for row_label, fixed_anno, fixed_loc in annotation_fixes:
    merged.loc[row_label, "anno_list"] = fixed_anno
    merged.loc[row_label, "loc_list"] = fixed_loc

# The corrections above are strings; parse them so every cell is a list again.
# Cells that are not strings are left as they are.
for list_column in ("anno_list", "loc_list"):
    merged[list_column] = [
        literal_eval(cell) if isinstance(cell, str) else cell
        for cell in merged[list_column]
    ]

In [None]:
merged.shape

In [None]:
# Keep only rows that have note text. The explicit `.copy()` makes the result an
# independent frame, so the in-place column assignments in later cells do not
# operate on a slice of the previous frame.
merged = merged.loc[merged['pn_history'].notnull()].copy()
merged.shape

In [None]:
def clean_data(merged):
    """Normalise the feature text and lowercase the patient notes.

    ``feature_text`` has ``"-OR-"`` replaced by ``";-"``, then every ``"-"``
    replaced by a space, and is lowercased. ``pn_history`` is lowercased.

    Rows with empty annotations are deliberately NOT removed here: as noted in
    the competition discussions, dropping them would remove true positives.

    Args:
        merged: Frame with string columns ``feature_text`` and ``pn_history``.

    Returns:
        A cleaned copy of ``merged``; the input frame is left untouched.
    """
    # Work on a copy so re-running the cell never sees half-cleaned input.
    merged = merged.copy()

    def _count_or_rows(frame):
        # Literal (non-regex) match; missing values count as "no match".
        has_or = frame['feature_text'].str.contains('-OR-', regex=False, na=False)
        return frame[has_or].shape

    def _sample_note(frame):
        # Second note, as in the original logging; None when there is none.
        return frame['pn_history'].values[1] if len(frame) > 1 else None

    print(f"before clearning: count of '-OR-' in feature text: {_count_or_rows(merged)} and its shape {merged.shape}")
    merged['feature_text'] = (
        merged['feature_text']
        .str.replace("-OR-", ';-', regex=False)
        .str.replace("-", " ", regex=False)
        .str.lower()
    )
    print(f"after clearning: count of '-OR-' in feature text: {_count_or_rows(merged)} and its shape {merged.shape}")

    print(f"before clearning: lower pn_history {_sample_note(merged)} and its shape {merged.shape}")
    merged['pn_history'] = merged['pn_history'].str.lower()
    # This message is printed after lowercasing, so it is labelled "after".
    print(f"after clearning: lower pn_history {_sample_note(merged)} and its shape {merged.shape}")
    return merged

In [None]:
merged = clean_data(merged)

In [None]:
merged.head()

In [None]:
merged.shape

### Do the KFold validation and load the data into data loaders.

[ref](http://kaggle.com/competitions/nbme-score-clinical-patient-notes/discussion/305599): this strategy is followed to avoid data leakage between folds.


In [None]:
# Imported here because only this cell needs the grouped splitter.
from sklearn.model_selection import StratifiedGroupKFold

# Stratify on the clinical case while keeping every patient note (pn_num) in
# exactly one fold. Rows that share a pn_num share the same note text, so a
# plain StratifiedKFold would put the same note into both the training and the
# validation side of a fold.
skf = StratifiedGroupKFold(n_splits=config['folds'], random_state=config['seed'], shuffle=True)

merged["fold"] = -1
fold_col = merged.columns.get_loc("fold")

for fold, (_, val_idx) in enumerate(
    skf.split(merged, y=merged["case_num"], groups=merged["pn_num"])
):
    # `split` yields positional indices, so assign by position; label-based
    # `.loc` would hit the wrong rows once the index is no longer 0..n-1.
    merged.iloc[val_idx, fold_col] = fold

counts = merged.groupby(["fold", "pn_num"], as_index=False).count()

# If the number of (fold, pn_num) rows equals the number of unique pn_num,
# then each pn_num is in exactly one fold.
print(counts.shape, counts.pn_num.nunique(), counts.case_num.unique(), merged['pn_num'].nunique())
assert counts.shape[0] == merged['pn_num'].nunique(), "a pn_num appears in more than one fold"
assert (merged["fold"] >= 0).all(), "some rows were not assigned to a fold"
merged['fold'].value_counts()

In [None]:
# Take the row labelled 35 as a worked example for the cells below.
first = merged.loc[35]

example = dict(
    feature_text=first.feature_text,
    pn_history=first.pn_history,
    loc_list=first.loc_list,
    annotation_list=first.anno_list,
)

# Show each field name, its value, and a separator line.
separator = '=' * 10
for field_name, field_value in example.items():
    print(field_name)
    print(field_value)
    print(separator)

In [None]:
def loc_list_to_tuples(loc_list):
    """Flatten location strings into a list of ``(start, end)`` int pairs.

    Every element of ``loc_list`` is a string with one or more ``"start end"``
    spans separated by ``;``. Spans are returned in the order they appear.
    A span that does not split into exactly two parts raises ``ValueError``.
    """
    segments = (
        segment.split()
        for entry in loc_list
        for segment in entry.split(";")
    )
    return [(int(begin), int(finish)) for begin, finish in segments]

# Show the raw location strings, the parsed integer spans, and the note text
# that each span selects.
print(example['loc_list'])
example_loc_ints = loc_list_to_tuples(example['loc_list'])
print(example_loc_ints)
for span_start, span_end in example_loc_ints:
    print(example['pn_history'][span_start:span_end])

In [None]:
def tokenize_and_label(example):
    """Tokenize one (feature_text, pn_history) pair and build per-token labels.

    The feature text is the first sequence and the patient note the second;
    only the note may be truncated, and the output is padded to
    ``config['max_length']``.

    Labels, one float per token:
        * ``-100.0`` for tokens whose sequence id is ``None`` or ``0``
          (special/padding tokens and the feature-text part), so that no loss
          is calculated on them;
        * ``1.0`` for note tokens whose character offsets overlap any
          annotated span;
        * ``0.0`` for every other note token.

    The -100 masking is applied to every example, including those with an
    empty ``loc_list``. Previously it was skipped for such rows, which left
    padding and feature-text tokens labelled ``0.0``.

    Args:
        example: Mapping with ``feature_text``, ``pn_history`` and ``loc_list``.

    Returns:
        The tokenizer output with ``location``, ``sequence_ids`` and ``labels``
        added.
    """
    tokenized_inputs = tokenizer(example['feature_text'],
                                 example['pn_history'],
                                 truncation='only_second',
                                 max_length=config['max_length'],
                                 padding='max_length',
                                 return_offsets_mapping=True)
    tokenized_inputs['location'] = loc_list_to_tuples(example['loc_list'])
    tokenized_inputs['sequence_ids'] = tokenized_inputs.sequence_ids()

    labels = []
    for seq_id, (token_start, token_end) in zip(
        tokenized_inputs["sequence_ids"], tokenized_inputs["offset_mapping"]
    ):
        if seq_id is None or seq_id == 0:
            # don't calculate loss on question part or special tokens
            labels.append(-100.0)
            continue

        # A token is positive when it overlaps any annotated character span.
        overlaps = any(
            token_start <= label_start < token_end
            or token_start < label_end <= token_end
            or label_start <= token_start < label_end
            for label_start, label_end in tokenized_inputs["location"]
        )
        labels.append(1.0 if overlaps else 0.0)  # labels should be float

    tokenized_inputs["labels"] = labels

    return tokenized_inputs

In [None]:
tokenized_inputs = tokenize_and_label(example)
tokenized_inputs.keys()

In [None]:
merged = merged[["pn_history", "feature_text", "loc_list", "fold"]]
merged.head()

In [None]:
def convert_to_dataset(merged):
    """Build a tokenized ``datasets.Dataset`` from the merged frame.

    Every row is passed through ``tokenize_and_label`` using
    ``config['num_proc']`` worker processes. ``input_ids``,
    ``attention_mask`` and ``labels`` are then set to torch format, with
    ``output_all_columns=True`` for the remaining columns.
    """
    raw_dataset = datasets.Dataset.from_pandas(merged)
    print(f"keys before applying tokenization: {raw_dataset[0].keys()}")

    tokenized = raw_dataset.map(tokenize_and_label, num_proc=config['num_proc'])
    tokenized.set_format(
        type='torch',
        columns=['input_ids', 'attention_mask', 'labels'],
        output_all_columns=True,
    )
    print(f"keys after applying tokenization: {tokenized[0].keys()}")
    return tokenized

In [None]:
dataset_mapped = convert_to_dataset(merged)

In [None]:
dataset_mapped.save_to_disk('./processed_data/')

In [None]:
dataset_mapped

In [None]:
dataset_mapped[0]['offset_mapping']

### Load the model, set the training arguments, wandb logging and metrics.


In [None]:
model = AutoModelForTokenClassification.from_pretrained(config['model_name'], num_labels=1)

In [None]:
# Keyword arguments for the Hugging Face TrainingArguments, grouped by purpose.
# NOTE(review): the seed is hard-coded to 18 here although config['seed'] is 42,
# and config['max_grad_norm'] is not passed on; confirm that this is intended.
training_kwargs = dict(
    # which phases to run
    do_train=True,
    do_eval=True,
    do_predict=True,
    # schedule and batch sizes
    num_train_epochs=config['epochs'],
    per_device_train_batch_size=config['train_batch_size'],
    per_device_eval_batch_size=config['valid_batch_size'],
    gradient_accumulation_steps=config['grad_acc'],
    group_by_length=True,
    # fp16=True,
    # optimisation
    learning_rate=config['learning_rate'],
    weight_decay=0.01,
    warmup_ratio=config['warmup'],
    seed=18,
    # evaluation, logging and checkpointing
    evaluation_strategy='epoch',
    logging_strategy="epoch",
    save_strategy="no",
    save_total_limit=1,
    report_to='wandb',
)

# 'test_trainer' is the output directory handed to the Trainer.
args = TrainingArguments('test_trainer', **training_kwargs)

In [None]:
# Configure Weights & Biases only when the training arguments report to it.
if "wandb" in args.report_to:
    # IPython shell escape: upgrades wandb in the running environment.
    !pip install -U wandb -qq
    import wandb
    from kaggle_secrets import UserSecretsClient

    # The API key is fetched from the Kaggle secret named "wandb" instead of
    # being written into the notebook.
    user_secrets = UserSecretsClient()
    wandb_key = user_secrets.get_secret("wandb")
    
    # Project name and a timestamped run-group name; the training loop reads
    # WANDB_RUN_GROUP back when it starts each run.
    os.environ["WANDB_PROJECT"] = "NBME"
    os.environ["WANDB_RUN_GROUP"] = "DEBERTA_MLM_fine-tune" + datetime.now().strftime(
        "%Y-%m-%d %H:%M"
    )
    wandb.login(key=wandb_key)

In [None]:
def kaggle_metrics(eval_prediction, dataset):
    """Character-level precision, recall and F1 for ``compute_metrics``.

    Bind ``dataset`` with ``functools.partial`` so the result can be handed to
    the trainer as a one-argument ``compute_metrics`` callable.

    For every example, a boolean mask over the note's characters is built from
    the gold ``location`` spans and another from the predicted spans. The
    first character of a predicted span is cleared again when it is
    whitespace, is not at position 0, and the character before it is not
    predicted. All masks are concatenated and scored as a binary problem.

    Returns:
        Dict with keys ``precision``, ``recall`` and ``f1``.
    """
    predicted_spans = get_location_predictions(eval_prediction.predictions, dataset)

    flat_truth, flat_pred = [], []
    rows = zip(predicted_spans, dataset["location"], dataset["pn_history"])
    for spans, gold_spans, note in rows:
        note_len = len(note)

        # Gold character mask.
        truth_mask = np.zeros((note_len), dtype=bool)
        for gold_start, gold_end in gold_spans:
            truth_mask[gold_start:gold_end] = 1

        # Predicted character mask.
        pred_mask = np.zeros((note_len))
        for span_start, span_end in spans:
            pred_mask[span_start:span_end] = 1
            starts_on_stray_space = (
                note[span_start].isspace()
                and span_start > 0
                and not pred_mask[span_start - 1]
            )
            if starts_on_stray_space:
                pred_mask[span_start] = 0

        flat_truth.extend(truth_mask)
        flat_pred.extend(pred_mask)

    scores = precision_recall_fscore_support(flat_truth, flat_pred, average="binary")
    precision, recall, f1 = scores[0], scores[1], scores[2]

    return {"precision": precision, "recall": recall, "f1": f1}

import pdb
def get_location_predictions(preds, dataset):
    """Turn token logits into character-level ``(start, end)`` spans.

    ``preds`` is passed through a sigmoid; for every example the tokens are
    walked together with the matching ``offset_mapping`` and ``sequence_ids``
    entries from ``dataset``. One list of spans is returned per example.
    """
    probabilities = torch.sigmoid(torch.tensor(preds))
    per_example = zip(
        probabilities, dataset["offset_mapping"], dataset["sequence_ids"]
    )
    return [
        _collect_spans(token_probs, token_offsets, token_seq_ids)
        for token_probs, token_offsets, token_seq_ids in per_example
    ]


def _collect_spans(token_probs, token_offsets, token_seq_ids):
    """Merge runs of tokens scoring above 0.5 into ``(start, end)`` spans."""
    spans = []
    span_start = None
    span_end = None
    for prob, offset, seq_id in zip(token_probs, token_offsets, token_seq_ids):
        # Tokens with sequence id None or 0 are skipped without closing a span.
        in_second_sequence = seq_id is not None and seq_id != 0
        if not in_second_sequence:
            continue

        if prob > 0.5:
            # Open a span on the first positive token, then keep extending it.
            if span_start is None:
                span_start = offset[0]
            span_end = offset[1]
            continue

        # A non-positive token closes the currently open span, if any.
        if span_start is not None:
            spans.append((span_start, span_end))
            span_start = None

    # Flush a span that is still open at the end of the sequence.
    if span_start is not None:
        spans.append((span_start, span_end))
    return spans

# compute_metrics = partial(kaggle_metrics, dataset=dataset_mapped['test'])

### Override the compute_loss function of trainer class

Since this is a binary classification task and the `Trainer` class does not allow 1 as the number of target labels for token classification, I had to override `compute_loss` to adapt it to this use case.

In [None]:
class BinaryClassificationTrainer(Trainer):
    """Trainer whose loss is a masked binary cross-entropy on one logit per token."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Binary cross-entropy over the positions that are not masked out.

        Args:
            model: The model to run.
            inputs: Batch dict; ``labels`` is popped from it before the
                forward pass.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted so the override also works with
                Trainer versions that pass this argument. It is not used; the
                loss is the mean over the unmasked positions of the batch.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Per-element loss first, so masked positions can be dropped before averaging.
        loss_fct = torch.nn.BCEWithLogitsLoss(reduction="none")
        loss = loss_fct(logits.view(-1, self.model.config.num_labels),
                        labels.float().view(-1, self.model.config.num_labels))

        # Keep only positions whose label is greater than -1.
        loss = torch.masked_select(loss, labels.view(-1, 1) > -1).mean()
        return (loss, outputs) if return_outputs else loss

In [None]:
dataset_mapped.shape

### Run on all folds and log the metrics to wandb

In [None]:
for fold in range(config['folds']):
    print(f"current training fold: {fold}")
    # Rows of the current fold are held out for evaluation; the rest train.
    train_dataset = dataset_mapped.filter(lambda x:x['fold'] != fold, num_proc=config['num_proc'])
    eval_dataset = dataset_mapped.filter(lambda x:x['fold'] == fold, num_proc=config['num_proc'])
    print(f"train_dataset shape {train_dataset.shape}, eval dataset shape{eval_dataset.shape}")
    compute_metrics = partial(kaggle_metrics, dataset=eval_dataset)

    # Load a fresh copy of the pretrained weights for every fold. Reusing one
    # model object across folds would start each later fold from weights that
    # were already trained on that fold's validation rows.
    fold_model = AutoModelForTokenClassification.from_pretrained(config['model_name'], num_labels=1)

    if "wandb" in args.report_to:
        # One wandb run per fold, grouped under the shared run group.
        wandb_config = {**args.__dict__, "fold": fold}
        wandb.init(config=wandb_config, group=os.environ["WANDB_RUN_GROUP"])

    trainer = BinaryClassificationTrainer(model=fold_model,
                                          args=args,
                                          train_dataset=train_dataset,
                                          eval_dataset=eval_dataset,
                                          tokenizer=tokenizer,
                                          compute_metrics=compute_metrics)
    trainer.train()
    trainer.save_model(f"fold_{fold}")
    if "wandb" in args.report_to:
        wandb.finish()

    # Drop this fold's references before collecting, so the memory can be
    # released before the next fold loads its model.
    del trainer, fold_model
    gc.collect()
    torch.cuda.empty_cache()

### I hope you learnt a new way of using the Trainer class from Hugging Face. Upvote if you find it useful. Happy learning!