## NBME Competition Notebook

### Notebook Features
- HuggingFace API, PyTorch
- Sequence Classification
- Binary Token Classification for character multi-span

## 0. Load Dependencies
Load the `datasets` library offline. The method is described in:
- https://www.kaggle.com/code/samuelepino/pip-downloading-packages-to-your-local-machine/notebook?scriptVersionId=29576961

In [None]:
!ls ../input/nbme-pre-trained-models/datasets

In [None]:
!pip install datasets --no-index --find-links=file:///kaggle/input/nbme-pre-trained-models/datasets

In [None]:
import re
import numpy as np 
import pandas as pd
import tensorflow as tf

import os
import torch
from torch import nn

from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification
from transformers import DataCollatorWithPadding, DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer

# Directory that holds the competition CSV files.
path = '../input/nbme-score-clinical-patient-notes'

train = pd.read_csv(f'{path}/train.csv')
features = pd.read_csv(f'{path}/features.csv')
pns = pd.read_csv(f'{path}/patient_notes.csv')

test = pd.read_csv(f'{path}/test.csv')
submission = pd.read_csv(f'{path}/sample_submission.csv')

# Report the row count and the column index of every table that was read.
for table in (train, features, pns, test, submission):
    print(len(table), table.columns)

print()
# Use the first CUDA device when one is available, otherwise the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
print("Current Device: ", device)
if use_cuda:
    print("Number of CUDA device: ", torch.cuda.device_count())
    print("Device name: ", torch.cuda.get_device_name(0))

In [None]:
# Character length of each feature text, measured after stripping surrounding whitespace.
len_feature_text = features["feature_text"].map(lambda feature: len(feature.strip()))
longest, shortest = max(len_feature_text), min(len_feature_text)
print(f"Character Length of feature text: MAX - {longest}, MIN - {shortest}")

# Character length of each raw patient note.
len_pn_history = pns["pn_history"].map(len)
longest, shortest = max(len_pn_history), min(len_pn_history)
print(f"Character Length of patient note: MAX - {longest}, MIN - {shortest}")

# Each annotation cell is split on ';' and the length of every piece is measured;
# the per-row maximum and minimum are then reduced over the whole column.
annotation_part_lengths = train["annotation"].map(
    lambda annotation: [len(part) for part in annotation.split(';')])
max_len_annotation = annotation_part_lengths.map(max)
min_len_annotation = annotation_part_lengths.map(min)
longest, shortest = max(max_len_annotation), min(min_len_annotation)
print(f"Character Length of annotation: MAX - {longest}, MIN - {shortest}")

## 1. Prepare Datasets

In [None]:
tokenizer = AutoTokenizer.from_pretrained("../input/nbme-pre-trained-models/tokenizer")


def tokenize(text):
    """Return the tokenizer's token strings for `text`, without special tokens."""
    tokens = tokenizer.tokenize(text, add_special_tokens=False)
    return tokens


# Show the tokens produced for the first patient note.
example_text = pns['pn_history'].iloc[0]
print("Example Tokens: \n", tokenize(example_text))

### 1-1. Prepare sequence training dataset.

In [None]:
# Build the sequence-classification frame: tokenized patient note -> case number.
# `rename` without `inplace` returns a new frame, so the column assignment below
# never writes through a slice of `pns` (avoids pandas' SettingWithCopyWarning).
seq_train = pns[['pn_history', 'case_num']].rename(
    columns={'pn_history': 'sequence', 'case_num': 'labels'})
seq_train['sequence'] = seq_train['sequence'].apply(tokenize)

token_len = seq_train['sequence'].apply(len)
print(f'Token length: MAX - {max(token_len)}, MIN - {min(token_len)}')

num_labels = pns["case_num"].nunique()
print(f"Number of Cases: {num_labels}")

# Shuffle, then hold out roughly 1% of the rows for validation.
# The split is positional (`iloc`) with one integer boundary: `.loc` slices by
# label and includes both endpoints, so float-bounded `.loc` slices could put
# the boundary row into both subsets. `+ 1` keeps the validation size of the
# previous inclusive slice.
seq_train_shuffled = seq_train.sample(frac=1).reset_index(drop=True)
num_seq_valid = int(len(seq_train_shuffled) * 0.01) + 1
seq_valid = seq_train_shuffled.iloc[:num_seq_valid]
seq_train = seq_train_shuffled.iloc[num_seq_valid:]

print(f'Length of Train data: {len(seq_train)}')
print(f'Length of Validation data: {len(seq_valid)}')

display(seq_train)

### 1-2. Prepare token classification training dataset.
First, define functions to encode from character indices to token labels and decode the other way round.

In [None]:
def idx2token_label(text, tokens, location):
    """Convert character spans ('location') into one 0/1 label per token.

    A token is labelled 1 when the character position at which it starts lies
    inside one of the spans; every other token is labelled 0.

    Token positions are reconstructed by walking `text`: each token advances
    the cursor by its own length and any whitespace that follows is skipped.
    This assumes WordPiece-style tokens, where a leading '##' marks a
    continuation piece and is not part of the text.

    Args:
        text: The original string that `tokens` were produced from.
        tokens: Token strings of `text`, in order.
        location: String containing zero or more "start end" integer pairs,
            e.g. "['6 11', '20 25;30 34']". Pairs may appear in any order.

    Returns:
        A list of ints (0 or 1) with the same length as `tokens`.
    """
    token_label = [0] * len(tokens)

    # Sorting lets a single forward pass over the text serve every span, even
    # when the pairs are not listed in text order.
    spans = sorted(
        tuple(int(number) for number in pair.split())
        for pair in re.findall(r'\d+ \d+', location)
    )
    if not spans:
        return token_label

    text_len = len(text)
    num_spans = len(spans)
    char_idx, span_idx = 0, 0
    for token_idx, token in enumerate(tokens):
        if char_idx >= text_len:
            break
        # A span that ends at or before the cursor can never match again.
        while span_idx < num_spans and spans[span_idx][1] <= char_idx:
            span_idx += 1
        if span_idx == num_spans:
            break
        # The current span ends after the cursor, so only its start is checked.
        if spans[span_idx][0] <= char_idx:
            token_label[token_idx] = 1

        # Strip only the '##' continuation marker; a literal '#' in the text
        # is a real character and must advance the cursor.
        is_continuation = token.startswith('##') and len(token) > 2
        char_idx += (len(token) - 2) if is_continuation else len(token)
        while char_idx < text_len and text[char_idx] in " \t\n\r\f\v":
            char_idx += 1
    return token_label


def token_label2idx(text, tokens, token_label):
    """Convert per-token 0/1 labels back into character spans.

    Every maximal run of tokens labelled 1 becomes one "start end" pair, where
    `start` is the position of the first character of the run and `end` is the
    position just past its last character (trailing whitespace excluded).

    Token positions are reconstructed by walking `text`: each token advances
    the cursor by its own length and any whitespace that follows is skipped.
    This assumes WordPiece-style tokens, where a leading '##' marks a
    continuation piece and is not part of the text.

    Args:
        text: The original string that `tokens` were produced from.
        tokens: Token strings of `text`, in order.
        token_label: One label per token; a value equal to 1 marks a token as
            part of a span. Extra trailing labels are ignored.

    Returns:
        The spans joined with ';' (e.g. "6 11;20 25"), or "" when no token is
        labelled 1.
    """
    char_indices = []
    text_len = len(text)
    char_idx = 0
    span_start = None  # start offset of the run currently open, if any
    span_end = 0       # end offset of the last labelled token seen

    for token, label in zip(tokens, token_label):
        if char_idx >= text_len:
            break
        if label == 1:
            if span_start is None:
                span_start = char_idx
        elif span_start is not None:
            char_indices.append(f'{span_start} {span_end}')
            span_start = None

        # Strip only the '##' continuation marker; a literal '#' in the text
        # is a real character and must advance the cursor.
        is_continuation = token.startswith('##') and len(token) > 2
        char_idx += (len(token) - 2) if is_continuation else len(token)
        if label == 1:
            # Record the end before skipping whitespace so the span stops at
            # the token itself, however many blanks follow it.
            span_end = min(char_idx, text_len)
        while char_idx < text_len and text[char_idx] in " \t\n\r\f\v":
            char_idx += 1

    if span_start is not None:
        char_indices.append(f'{span_start} {span_end}')

    return ';'.join(char_indices)


# Round-trip check on rows 50-69: encode each location into token labels,
# decode the labels again and print both next to the annotation text.
tmp = train.merge(pns, on='pn_num')
for idx in range(50, 70):
    row = tmp.iloc[idx]
    print(row.at['annotation'])
    example_text = row.at['pn_history']
    example_idx = row.at['location']
    example_tokens = tokenize(example_text)
    print(f'Example Location: {example_idx}')
    ex_token_label = idx2token_label(example_text, example_tokens, example_idx)
    print(f'Example Token Label: {ex_token_label}')
    decoded = token_label2idx(example_text, example_tokens, ex_token_label)
    print(f'Example Decoding: {decoded}\n')
del tmp

Prepare the training dataset as a DataFrame.

In [None]:
def get_tokens_and_labels(df):
    """Build the inputs (and, when available, targets) of the token classifier.

    `df` is joined with the patient notes (`pns`, on 'pn_num') and the feature
    table (`features`, on 'feature_num'). For every row:

    * 'tokens' is the feature text as a single leading entry followed by the
      tokens of the patient note;
    * 'tags' (only when `df` has a 'location' column) holds one label per entry
      of 'tokens': 0 for the feature text, then the labels that
      `idx2token_label` derives from the note, its tokens and the location.

    Labels are computed from the note tokens alone. The feature text is not part
    of the note, so including it would shift every character offset and leave
    'tags' one entry longer than 'tokens'.

    Args:
        df: DataFrame with at least 'id', 'pn_num' and 'feature_num' columns,
            and optionally 'location'.

    Returns:
        DataFrame with columns 'id', 'features', 'text', 'tokens', plus 'tags'
        and 'location' when `df` has a 'location' column.
    """
    df = df.merge(pns, on='pn_num')
    df = df.merge(features, on='feature_num')
    df = df.rename(columns={'feature_text': 'features', 'pn_history': 'text'})

    note_tokens = df['text'].apply(tokenize)
    df['tokens'] = df['features'].apply(lambda feature: [feature]) + note_tokens
    if 'location' in df:
        tags = [
            [0] + idx2token_label(note, tokens, location)
            for note, tokens, location in zip(df['text'], note_tokens, df['location'])
        ]
        # An explicit object Series keeps each tag list as a single cell.
        df['tags'] = pd.Series(tags, index=df.index, dtype=object)
        return df[['id', 'features', 'text', 'tokens', 'tags', 'location']]
    return df[['id', 'features', 'text', 'tokens']]

tag_train = get_tokens_and_labels(train)
tag_test = get_tokens_and_labels(test)

# Shuffle, then hold out roughly 3% of the rows for validation.
# The split is positional (`iloc`) with one integer boundary: `.loc` slices by
# label and includes both endpoints, so float-bounded `.loc` slices could put
# the boundary row into both subsets. `+ 1` keeps the validation size of the
# previous inclusive slice.
tag_train_shuffled = tag_train.sample(frac=1).reset_index(drop=True)
num_tag_valid = int(len(tag_train_shuffled) * 0.03) + 1
tag_valid = tag_train_shuffled.iloc[:num_tag_valid]
tag_train = tag_train_shuffled.iloc[num_tag_valid:]

print(f'Length of Train data: {len(tag_train)}')
print(f'Length of Validation data: {len(tag_valid)}')
print(f'Length of Test data: {len(tag_test)}')
display(tag_train)

## 2. Task 1 - Sequence Classification

In [None]:
# Wrap the train and validation DataFrames as HuggingFace Dataset objects.
train_seq_dataset, valid_seq_dataset = (
    Dataset.from_pandas(frame) for frame in (seq_train, seq_valid)
)

In [None]:
# Look at the first training example before and after tokenization.
example_input = train_seq_dataset['sequence'][0]
example_tokenized = tokenizer(example_input, is_split_into_words=True)

for caption, value in (("Example input: \n", example_input),
                       ("Example tokenized : \n", example_tokenized.input_ids)):
    print(caption, value)

In [None]:
# Tokenize Dataset for training
def prepare_class_features(examples):
    """Encode a batch of pre-split sequences and attach their class labels.

    `examples["sequence"]` is passed to the tokenizer as already-split words
    with truncation enabled; `examples["labels"]` is copied into the result
    under the 'labels' key.
    """
    encoded = tokenizer(
        examples["sequence"],
        is_split_into_words=True,
        truncation=True,
    )
    encoded['labels'] = examples['labels']
    return encoded

# Encode both splits in batches; the original columns are dropped so that only
# the tokenizer output and the labels remain.
tokenized_seq_train, tokenized_seq_valid = (
    dataset.map(prepare_class_features,
                batched=True,
                remove_columns=dataset.column_names)
    for dataset in (train_seq_dataset, valid_seq_dataset)
)

In [None]:
# Load the pretrained sequence classifier with one output per case number,
# then move it to the current device.
seq_model = AutoModelForSequenceClassification.from_pretrained(
    "../input/nbme-pre-trained-models/seq_model",
    num_labels=num_labels,
)
seq_model = seq_model.to(device)
# To print the model structure, evaluate `seq_model` on its own in a cell.

In [None]:
batch_size = 16
epochs = 5

# Training options for the sequence-classification task; evaluation and
# checkpointing both happen once per epoch.
seq_training_options = dict(
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    save_strategy="epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.01,
    report_to="none",
    fp16=True,  # half precision
)
args = TrainingArguments("./train/nbme-case", **seq_training_options)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=seq_model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_seq_train,
    eval_dataset=tokenized_seq_valid,
    tokenizer=tokenizer,
)

In [None]:
def print_summary(result):
    """Print the runtime and throughput of a finished training run.

    Args:
        result: Object with a `metrics` mapping that contains the keys
            'train_runtime' and 'train_samples_per_second'.
    """
    metrics = result.metrics
    print(f"Total Time: {metrics['train_runtime']:.2f}s")
    # Throughput is a rate, so it is printed without a seconds suffix.
    print(f"Samples/second: {metrics['train_samples_per_second']:.2f}")

In [None]:
# Run the training loop of the current `trainer` and print the runtime
# metrics of the returned result.
result = trainer.train()
print_summary(result)

## 3. Task 2 - Token Tagging

In [None]:
# Wrap the train, validation and test DataFrames as HuggingFace Dataset objects.
train_tag_dataset, valid_tag_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (tag_train, tag_valid, tag_test)
)

In [None]:
# Look at the first training example: raw text, word list, encoded ids and tags.
example_input = train_tag_dataset['tokens'][0]
example_tokenized = tokenizer(example_input, is_split_into_words=True)

for caption, value in (("Example text: \n", train_tag_dataset['text'][0]),
                       ("\nExample input: \n", example_input),
                       ("\nExample tokenized : \n", example_tokenized.input_ids),
                       ("\nExample label: \n", train_tag_dataset['tags'][0])):
    print(caption, value)

In [None]:
# Prepare dataset for training.
# Word-level tags are expanded to sub-token labels through the encoding's word ids.
def prepare_tag_features(examples):
    """Encode a batch of word lists and align their tags with the sub-tokens.

    `examples["tokens"]` is encoded as pre-split words, truncated and padded to
    the maximum length. For each example, the first sub-token of every word
    receives that word's entry from `examples["tags"]`; special tokens, padding
    and the remaining sub-tokens of a word receive -100.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        padding="max_length",
        truncation=True,
    )

    def align(word_ids, word_tags):
        """Return one label per sub-token position for a single example."""
        aligned = []
        last_word = None
        for word in word_ids:
            starts_new_word = word is not None and word != last_word
            aligned.append(word_tags[word] if starts_new_word else -100)
            last_word = word
        return aligned

    tokenized_inputs["labels"] = [
        align(tokenized_inputs.word_ids(batch_index=row), word_tags)
        for row, word_tags in enumerate(examples["tags"])
    ]
    return tokenized_inputs

# Encode both splits in batches; the original columns are dropped so that only
# the tokenizer output and the aligned labels remain.
tokenized_tag_train, tokenized_tag_valid = (
    dataset.map(prepare_tag_features,
                batched=True,
                remove_columns=dataset.column_names)
    for dataset in (train_tag_dataset, valid_tag_dataset)
)

In [None]:
# Load the pretrained token classifier with two output labels, then move it to
# the current device.
token_model = AutoModelForTokenClassification.from_pretrained(
    "../input/nbme-pre-trained-models/token_model",
    num_labels=2,
)
token_model = token_model.to(device)

In [None]:
# Load previous sequence model weight into token model.
# Every parameter of the sequence model's classification head is removed by its
# 'classifier.' prefix instead of by two fixed names, so heads that use other
# parameter names do not raise KeyError. The entries are deleted from the
# mapping returned by `state_dict()`, which does not modify `seq_model` itself.
state_dict = seq_model.state_dict()
for head_key in [key for key in state_dict if key.startswith('classifier.')]:
    del state_dict[head_key]

# strict=False: keys that exist in only one of the two models are reported
# in the returned value instead of raising.
token_model.load_state_dict(state_dict, strict=False)

In [None]:
class NbmeTrainer(Trainer):
    """Trainer whose loss is cross-entropy over the flattened token logits."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the token-classification loss for one batch.

        Args:
            model: The model to run; called as `model(**inputs)`.
            inputs: Batch mapping that contains a "labels" entry.
            return_outputs: When true, return `(loss, outputs)` instead of
                only the loss.
            num_items_in_batch: Accepted so that Trainer versions which pass
                this keyword can call the override; it is not used.

        Returns:
            The loss, or a `(loss, outputs)` tuple when `return_outputs` is true.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Labels follow the logits' device rather than a module-level global.
        # Positions labelled -100 match CrossEntropyLoss's default ignore_index.
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(
            logits.view(-1, self.model.config.num_labels),
            labels.view(-1).to(logits.device),
        )
        return (loss, outputs) if return_outputs else loss

In [None]:
batch_size = 16
epochs = 7

# Training options for the token-tagging task; evaluation and checkpointing
# both happen once per epoch.
tag_training_options = dict(
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    save_strategy="epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.01,
    report_to="none",
    fp16=True,  # half precision
)
args = TrainingArguments("./nbme-tag", **tag_training_options)

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

trainer = NbmeTrainer(
    model=token_model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_tag_train,
    eval_dataset=tokenized_tag_valid,
    tokenizer=tokenizer,
)

In [None]:
# Run the training loop of the current `trainer` and print the runtime
# metrics of the returned result.
result = trainer.train()
print_summary(result)

## 4. Evaluate Samples

In [None]:
# Get prediction values from the last model.
# Only the `predictions` attribute of the returned output is kept; the
# evaluation printout takes an argmax over its last axis.
eval_outputs = trainer.predict(tokenized_tag_valid)
eval_pred = eval_outputs.predictions

In [None]:
# Print up to `index` examples from the evaluation data.
index = 50

for idx in range(min(index, len(valid_tag_dataset))):
    example = valid_tag_dataset[idx]
    eval_text = example['text']
    eval_tokens = example['tokens']
    subword_labels = np.argmax(eval_pred[idx], axis=-1)

    # Predictions have one entry per sub-token position (special tokens and
    # padding included), while `token_label2idx` expects one label per word.
    # Re-encode the words with the settings used to build the model inputs and
    # keep the prediction of the first sub-token of each word; words that were
    # truncated away keep label 0.
    word_ids = tokenizer(
        eval_tokens,
        truncation=True,
        padding="max_length",
        is_split_into_words=True).word_ids()
    eval_labels = [0] * len(eval_tokens)
    previous_word_idx = None
    for word_idx, subword_label in zip(word_ids, subword_labels):
        if word_idx is not None and word_idx != previous_word_idx:
            eval_labels[word_idx] = int(subword_label)
        previous_word_idx = word_idx

    # Entry 0 of the word list is the feature text, which is not part of the
    # patient note, so it is skipped when mapping labels back to characters.
    eval_indices = token_label2idx(eval_text, eval_tokens[1:], eval_labels[1:])
    result = eval_indices if eval_indices else "0 0"
    eval_true = example['location']

    print(f"Prediction: {result} / True value: {eval_true}")

## 5. Predict on Test Dataset

In [None]:
# Prepare dataset for prediction.
def prepare_test_features(examples):
    """Encode a batch of word lists for inference.

    `examples["tokens"]` is encoded as pre-split words, truncated and padded to
    the maximum length. No labels are attached.
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        padding="max_length",
        truncation=True,
    )
    return encoded

# Encode the test set in batches and drop its original columns.
test_columns = test_dataset.column_names
tokenized_test_dataset = test_dataset.map(
    prepare_test_features,
    remove_columns=test_columns,
    batched=True,
)

In [None]:
# Run the current `trainer` on the encoded test set; only the `predictions`
# attribute of the returned output is kept.
test_output = trainer.predict(tokenized_test_dataset)
test_pred = test_output.predictions

In [None]:
predictions = []

for idx, example_pred in enumerate(test_pred):
    example = test_dataset[idx]
    test_text = example['text']
    test_tokens = example['tokens']
    subword_labels = np.argmax(example_pred, axis=-1)

    # Predictions have one entry per sub-token position (special tokens and
    # padding included), while `token_label2idx` expects one label per word.
    # Re-encode the words with the settings used to build the model inputs and
    # keep the prediction of the first sub-token of each word; words that were
    # truncated away keep label 0.
    word_ids = tokenizer(
        test_tokens,
        truncation=True,
        padding="max_length",
        is_split_into_words=True).word_ids()
    test_labels = [0] * len(test_tokens)
    previous_word_idx = None
    for word_idx, subword_label in zip(word_ids, subword_labels):
        if word_idx is not None and word_idx != previous_word_idx:
            test_labels[word_idx] = int(subword_label)
        previous_word_idx = word_idx

    # Entry 0 of the word list is the feature text, which is not part of the
    # patient note, so it is skipped when mapping labels back to characters.
    test_indices = token_label2idx(test_text, test_tokens[1:], test_labels[1:])
    predictions.append(test_indices)

In [None]:
# Store the predicted spans in the 'location' column, write the submission
# file without the index, and show the table.
submission = submission.assign(location=predictions)
submission.to_csv('submission.csv', index=False)
display(submission)