# NBME / DeBERTa Inference | TensorFlow

## Imports

In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np 
import pandas as pd 
import os
import gc
import ast
import spacy
import random
import itertools
import matplotlib.pyplot as plt
from typing import List, Tuple

import tensorflow as tf
import tensorflow_addons as tfa



from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer, AutoConfig, TFAutoModel,AutoModel

## Config

In [None]:
# ---------- Model ----------
MODEL_NAME = 'microsoft/deberta-base'
TOKENIZER_PATH = "microsoft/deberta-base_tokenizer"
MAX_LEN = 512

# ---------- Training ----------
BATCH_SIZE = 8
EPOCHS = 12
LEARNING_RATE = 2e-5
CLIP_NORM = 1000

# ---------- Dataset ----------
seed = 42
n_fold = 5
trn_fold = list(range(5))  # fold ids: 0, 1, 2, 3, 4

# ---------- Debug ----------
# When enabled, the run is shrunk to one fold and fewer epochs.
debug = False
if debug:
    EPOCHS, trn_fold = 5, [0]

## Tokenizer

In [None]:
# Load the tokenizer from the local dataset directory.
tokenizer_dir = '../input/nbme-deberta/microsoft/deberta-base_tokenizer'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)

In [None]:

def micro_f1(preds, truths):
    """
    F1 score over the pooled elements of every sample (micro averaging).

    Args:
        preds (list of lists of ints): Binary predictions, one sequence per sample.
        truths (list of lists of ints): Binary ground truths, one sequence per sample.

    Returns:
        float: f1 score.
    """
    # Flatten all samples into one long vector each, then score once.
    flat_preds = np.concatenate(preds)
    flat_truths = np.concatenate(truths)
    return f1_score(flat_truths, flat_preds)


def spans_to_binary(spans, length=None):
    """
    Converts spans to a binary array indicating whether each character is in the span.

    Args:
        spans (list of lists of two ints): Spans given as [start, end] pairs;
            positions start..end-1 are marked.
        length (int, optional): Size of the output array. When omitted, the
            largest offset found in `spans` is used (0 if `spans` is empty).

    Returns:
        np array [length]: Binarized spans.
    """
    if length is None:
        # np.max raises on an empty sequence, so an empty span list maps to
        # an empty array instead of crashing.
        length = int(np.max(spans)) if len(spans) else 0
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1
    return binary


def span_micro_f1(preds, truths):
    """
    Micro f1 on spans.

    Each sample's spans are binarized over a common length (the largest
    offset appearing in either its prediction or its truth) and the pooled
    binary vectors are scored with `micro_f1`.

    Args:
        preds (list of lists of two ints): Prediction spans.
        truths (list of lists of two ints): Ground truth spans.

    Returns:
        float: f1 score.
    """
    binarized_preds, binarized_truths = [], []
    for pred_spans, truth_spans in zip(preds, truths):
        # A sample with neither predicted nor true spans contributes nothing.
        if len(pred_spans) == 0 and len(truth_spans) == 0:
            continue
        pred_end = np.max(pred_spans) if len(pred_spans) else 0
        truth_end = np.max(truth_spans) if len(truth_spans) else 0
        size = max(pred_end, truth_end)
        binarized_preds.append(spans_to_binary(pred_spans, size))
        binarized_truths.append(spans_to_binary(truth_spans, size))
    return micro_f1(binarized_preds, binarized_truths)

In [None]:
def get_score(y_true, y_pred):
    """Return the span-level micro F1 computed by `span_micro_f1`.

    The arguments are forwarded positionally: `y_true` is passed as
    `span_micro_f1`'s first argument (`preds`) and `y_pred` as its second
    (`truths`).
    """
    return span_micro_f1(y_true, y_pred)

def create_labels_for_scoring(df):
    """
    Build ground-truth span lists from the `location` column of `df`.

    Rows are walked by position, so the frame may carry any index (including
    the repeated labels left behind by concatenating several frames).

    Side effect: writes a `location_for_create_labels` column holding, per
    row, `[]` when the row has no locations or a one-element list with the
    locations joined by ';' (example: ['0 1', '3 4'] -> ['0 1;3 4']).

    Args:
        df (pd.DataFrame): Frame whose `location` column holds lists of
            location strings such as '0 1' or '0 1;3 4'.

    Returns:
        list of lists of [start, end]: One list of integer spans per row.
    """
    joined_locations = []
    truths = []
    for locations in df['location'].values:
        if not locations:
            joined_locations.append([])
            truths.append([])
            continue
        joined = ';'.join(locations)
        joined_locations.append([joined])
        spans = []
        for loc in (segment.split() for segment in joined.split(';')):
            spans.append([int(loc[0]), int(loc[1])])
        truths.append(spans)
    # Assigning a plain list is positional, so no index alignment is involved.
    df['location_for_create_labels'] = joined_locations
    return truths


def get_char_probs(texts, predictions, tokenizer):
    """
    Spread per-token predictions over the characters each token covers.

    Args:
        texts: Sequence of strings.
        predictions: Per text, a sequence of per-token values, paired in order
            with the offsets the tokenizer returns for that text.
        tokenizer: Callable returning a mapping with an 'offset_mapping' entry
            of (start, end) character offsets.

    Returns:
        list of np arrays: One array per text, of length len(text); characters
        not covered by any paired token stay at 0.
    """
    results = [np.zeros(len(t)) for t in texts]
    for char_prob, text, token_preds in zip(results, texts, predictions):
        offsets = tokenizer(
            text,
            add_special_tokens=True,
            return_offsets_mapping=True,
        )['offset_mapping']
        # zip stops at the shorter of the two sequences.
        for (char_start, char_end), token_pred in zip(offsets, token_preds):
            char_prob[char_start:char_end] = token_pred
    return results


def get_results(char_probs, th=0.5):
    """
    Turn per-character probabilities into location strings.

    For every array, the indices whose value is >= `th` are shifted by one,
    grouped into runs of consecutive integers, and each run is written as
    "first last"; runs are joined with ';'. An array with no index above the
    threshold yields the empty string.

    Args:
        char_probs: Iterable of 1-D np arrays.
        th (float): Threshold.

    Returns:
        list of str: One location string per input array.
    """
    results = []
    for char_prob in char_probs:
        shifted = np.where(char_prob >= th)[0] + 1
        runs = []
        for pos in shifted:
            if runs and pos == runs[-1][1] + 1:
                # Extends the current run of consecutive positions.
                runs[-1][1] = pos
            else:
                runs.append([pos, pos])
        results.append(";".join(f"{first} {last}" for first, last in runs))
    return results


def get_predictions(results):
    """
    Parse location strings into lists of integer spans.

    Args:
        results (list of str): Strings such as "0 3;5 9"; "" means no span.

    Returns:
        list of lists of [start, end]: One span list per input string.
    """
    predictions = []
    for result in results:
        if result == "":
            predictions.append([])
            continue
        spans = []
        for chunk in result.split(';'):
            parts = chunk.split()
            spans.append([int(parts[0]), int(parts[1])])
        predictions.append(spans)
    return predictions

## OOF Threshold Search

In [None]:

# Load the out-of-fold predictions of every fold.
oof_frames = [
    pd.read_pickle(f'../input/nbme-debertabase/oof_fold{fold}.pkl')
    for fold in range(n_fold)
]

# ignore_index=True gives the combined frame a fresh 0..n-1 row index.
# create_labels_for_scoring addresses rows with df.loc[i] for i in
# range(len(df)), which needs exactly that index; keeping each fold's own
# index would leave repeated or missing labels.
oof_df = pd.concat(oof_frames, ignore_index=True)

truths = create_labels_for_scoring(oof_df)
char_probs = get_char_probs(oof_df['pn_history'].values,
                            oof_df[list(range(MAX_LEN))].values,
                            tokenizer)

# Scan thresholds and keep the one with the best span-level F1.
best_th = 0.5
best_score = 0.
for th in np.arange(0.45, 0.55, 0.01):
    th = np.round(th, 2)
    results = get_results(char_probs, th=th)
    preds = get_predictions(results)
    score = get_score(preds, truths)
    if best_score < score:
        best_th = th
        best_score = score
    print(f"th: {th}  score: {score:.5f}")
print(f"best_th: {best_th}  score: {best_score:.5f}")

## Data Loading

### train.csv

In [None]:
train = pd.read_csv('../input/nbme-score-clinical-patient-notes/train.csv')
# Parse the string cells of these columns into Python objects.
for list_column in ('annotation', 'location'):
    train[list_column] = train[list_column].map(ast.literal_eval)
print(f"train.shape: {train.shape}")
train.head()

### features.csv

In [None]:
features_csv = '../input/nbme-score-clinical-patient-notes/features.csv'
features = pd.read_csv(features_csv)
print(f"features.shape: {features.shape}")
features.head()

### patient_notes.csv

In [None]:
patient_notes_csv = '../input/nbme-score-clinical-patient-notes/patient_notes.csv'
patient_notes = pd.read_csv(patient_notes_csv)
print(f"patient_notes.shape: {patient_notes.shape}")
patient_notes.head()

### Merging

In [None]:
# Attach the feature rows and the patient note rows to every training row.
train = (
    train
    .merge(features, on=['feature_num', 'case_num'], how='left')
    .merge(patient_notes, on=['pn_num', 'case_num'], how='left')
)
# Number of entries in each row's annotation list.
train['annotation_length'] = train['annotation'].map(len)
print(f"train.shape: {train.shape}")
train.head()


# Debug runs work on a 50-row sample.
if debug:
    train = train.sample(n=50, random_state=0).reset_index(drop=True)

## Annotations Visualization

In [None]:
idx = 5910

locations = train.loc[idx, 'location']
pn_history = train.loc[idx, 'pn_history']

# One displaCy entity per "start end" pair; a location string may hold
# several pairs separated by ';'.
ents = []
for location in locations:
    for piece in location.split(';'):
        bounds = piece.split()
        ents.append({
            'start': int(bounds[0]),
            'end': int(bounds[1]),
            "label": "Annotation",
        })

doc = {'text': pn_history, "ents": ents}

colors = {"Annotation": "linear-gradient(0deg, #888, #eeaaaa)"}
options = {"colors": colors}
spacy.displacy.render(doc, style="ent", options=options, manual=True, jupyter=True);

## Preprocess the data

In [None]:
# ------------------------- prepare_location ------------------------------
def prepare_location(locations: List[str]) -> List[Tuple[int, int]]:
    """
    Parse location strings into a flat list of (start, end) tuples.

    Args:
        locations: Strings holding one or more "start end" pairs separated
            by ';', e.g. ['0 1', '3 4;5 6'].

    Returns:
        One (start, end) tuple of ints per pair, in order of appearance.
    """
    location_tuple_list = []
    for location in locations:
        for segment in location.split(';'):
            bounds = segment.split()
            if not bounds:
                # Blank segment (e.g. from a trailing ';'): nothing to parse.
                continue
            location_tuple_list.append((int(bounds[0]), int(bounds[1])))

    return location_tuple_list
# ------------------------- prepare_input ------------------------------
def prepare_input(pn_history: str, feature_text: str) -> Tuple[np.ndarray, np.ndarray]:
    """
    Tokenize a (pn_history, feature_text) pair for the model.

    The pair is padded to MAX_LEN and, if it is longer, truncated to MAX_LEN
    so that every returned array has the fixed length the model inputs are
    declared with.

    Args:
        pn_history: Patient note text (first sequence).
        feature_text: Feature description (second sequence).

    Returns:
        Tuple of (input_ids, attention_mask) numpy arrays.
    """
    tokens = tokenizer(
        pn_history,
        feature_text,
        max_length=MAX_LEN,
        padding="max_length",
        # Requested explicitly so an over-long pair cannot yield arrays
        # longer than MAX_LEN.
        truncation=True,
        add_special_tokens=True,
    )

    input_ids = tokens['input_ids']
    attention_mask = tokens["attention_mask"]
    return (np.array(input_ids), np.array(attention_mask))
# ------------------------- prepare_labels ------------------------------
# Thanks yasufuminakama 
# https://www.kaggle.com/yasufuminakama/nbme-deberta-base-baseline-train
def prepare_labels(pn_history, annotation_length, location_list):
    """
    Build a per-token label vector for one note: zeros (no entity) and
    ones (entity).

    Args:
        pn_history: Note text; tokenized on its own and padded to MAX_LEN.
        annotation_length: When 0, the label vector is returned all zeros.
        location_list: Location strings, parsed with `prepare_location`.

    Returns:
        np array with one entry per tokenizer offset.
    """
    offsets = tokenizer(
        pn_history,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding="max_length",
        return_offsets_mapping=True
    )["offset_mapping"]
    n_tokens = len(offsets)
    label = np.zeros(n_tokens)
    if annotation_length == 0:
        return np.array(label)

    for start, end in prepare_location(location_list):
        # Token just before the first token (past position 0) whose start
        # offset lies beyond the span start; -1 when there is none.
        start_idx = next(
            (k - 1 for k in range(1, n_tokens) if start < offsets[k][0]),
            -1,
        )
        # One past the first token whose end offset reaches the span end;
        # -1 when there is none.
        end_idx = next(
            (k + 1 for k in range(n_tokens) if end <= offsets[k][1]),
            -1,
        )
        if start_idx == -1:
            start_idx = end_idx
        if start_idx != -1 and end_idx != -1:
            label[start_idx:end_idx] = 1

    return np.array(label)

In [None]:
# metrics
class F1Score(tf.keras.metrics.Metric):
    """Keras metric that delegates to a micro-averaged `tfa.metrics.F1Score`.

    Targets and predictions are reshaped to (-1, MAX_LEN) before being
    handed to the wrapped metric.
    """

    def __init__(self, name='f1', **kwargs):
        super().__init__(name=name, **kwargs)
        self.f1 = tfa.metrics.F1Score(num_classes=2, average='micro', threshold=0.50)

    def update_state(self, y_true, y_pred, sample_weight=None):
        # sample_weight is accepted but not forwarded to the wrapped metric.
        flat_shape = (-1, MAX_LEN)
        flat_true = tf.reshape(y_true, flat_shape)
        flat_pred = tf.reshape(y_pred, flat_shape)
        self.f1.update_state(flat_true, flat_pred)

    def reset_state(self):
        self.f1.reset_state()

    def result(self):
        return self.f1.result()
    
# Metrics passed to model.compile: F1, recall and precision at threshold 0.5.
f1_metric = F1Score()
recall_metric = tf.keras.metrics.Recall(thresholds=[0.5])
precision_metric = tf.keras.metrics.Precision(thresholds=[0.5])
metrics = [f1_metric, recall_metric, precision_metric]

In [None]:
# create_model
def create_model() -> tf.keras.Model:
    """
    Build the token-level classifier.

    Two int32 inputs of length MAX_LEN (token ids and attention mask) feed
    the pretrained backbone loaded from the local dataset directory; the
    backbone's first output goes through Dropout(0.2) and a one-unit sigmoid
    Dense layer.
    """
    token_input = tf.keras.layers.Input(
        shape=(MAX_LEN,), name='input_tokens', dtype=tf.int32
    )
    mask_input = tf.keras.layers.Input(
        shape=(MAX_LEN,), name='attention_mask', dtype=tf.int32
    )

    backbone = TFAutoModel.from_pretrained('../input/nbme-deberta/microsoft/deberta-base/model')

    hidden = backbone(token_input, attention_mask=mask_input)[0]
    hidden = tf.keras.layers.Dropout(0.2)(hidden)
    token_probs = tf.keras.layers.Dense(1, activation='sigmoid')(hidden)

    return tf.keras.Model(inputs=[token_input, mask_input], outputs=token_probs)

model = create_model()
model.summary()

# Compile with the configured learning rate and gradient-norm clipping.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=LEARNING_RATE,
    clipnorm=CLIP_NORM,
)
loss = tf.keras.losses.BinaryCrossentropy(reduction="none")
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

## Inference

In [None]:
# Read the test rows and attach feature and patient note rows to each.
test = (
    pd.read_csv('../input/nbme-score-clinical-patient-notes/test.csv')
    .merge(features, on=['feature_num', 'case_num'], how='left')
    .merge(patient_notes, on=['pn_num', 'case_num'], how='left')
)
test.head()

## Test Dataset

In [None]:
def create_data(dataframe: pd.DataFrame, train=True):
    """
    Tokenize every row of `dataframe` and, in train mode, build its labels.

    Args:
        dataframe: Frame with `pn_history` and `feature_text` columns; in
            train mode also `annotation_length` and `location`.
        train: When False, no labels are built and the returned label list
            is empty.

    Returns:
        Tuple of ({"input_ids": [...], "attention_mask": [...]}, labels),
        each list holding one entry per row.
    """
    notes = dataframe["pn_history"].values
    feature_texts = dataframe["feature_text"].values
    if train:
        annotation_lengths = dataframe['annotation_length'].values
        locations = dataframe['location'].values

    input_ids, attention_mask, labels = [], [], []
    for row, (note, feature) in enumerate(zip(notes, feature_texts)):
        ids, mask = prepare_input(note, feature)
        input_ids.append(ids)
        attention_mask.append(mask)
        if train:
            labels.append(
                prepare_labels(note, annotation_lengths[row], locations[row])
            )
    return {"input_ids": input_ids, "attention_mask": attention_mask}, labels
    

In [None]:
# train=False skips label creation, so test_labels is an empty list.
test_data, test_labels = create_data(test, train=False)

## Functions

In [None]:
# Thanks yasufuminakama 
# https://www.kaggle.com/yasufuminakama/nbme-deberta-base-baseline-train

def get_char_probs(texts, predictions, tokenizer):
    """
    Spread per-token predictions over the characters each token covers.

    Args:
        texts: Sequence of strings.
        predictions: Per text, a sequence of per-token values, paired in order
            with the offsets the tokenizer returns for that text.
        tokenizer: Callable returning a mapping with an 'offset_mapping' entry
            of (start, end) character offsets.

    Returns:
        list of np arrays: One array per text, of length len(text); characters
        not covered by any paired token stay at 0.
    """
    results = [np.zeros(len(t)) for t in texts]
    for char_prob, text, token_preds in zip(results, texts, predictions):
        offsets = tokenizer(
            text,
            add_special_tokens=True,
            return_offsets_mapping=True,
        )['offset_mapping']
        # zip stops at the shorter of the two sequences.
        for (char_start, char_end), token_pred in zip(offsets, token_preds):
            char_prob[char_start:char_end] = token_pred
    return results


def get_results(char_probs, th=0.5):
    """
    Turn per-character probabilities into location strings.

    For every array, the indices whose value is >= `th` are shifted by one,
    grouped into runs of consecutive integers, and each run is written as
    "first last"; runs are joined with ';'. An array with no index above the
    threshold yields the empty string.

    Args:
        char_probs: Iterable of 1-D np arrays.
        th (float): Threshold.

    Returns:
        list of str: One location string per input array.
    """
    results = []
    for char_prob in char_probs:
        shifted = np.where(char_prob >= th)[0] + 1
        runs = []
        for pos in shifted:
            if runs and pos == runs[-1][1] + 1:
                # Extends the current run of consecutive positions.
                runs[-1][1] = pos
            else:
                runs.append([pos, pos])
        results.append(";".join(f"{first} {last}" for first, last in runs))
    return results

In [None]:
# The tokenized test inputs are the same for every fold, so build the arrays once.
test_inputs = (
    np.asarray(test_data['input_ids']),
    np.asarray(test_data['attention_mask']),
)

fold_char_probs = []
for fold in trn_fold:
    model.load_weights(f'../input/nbme-debertabase/model_deberta_fold{fold}.h5')
    pred = model.predict(test_inputs)
    pred = pred.reshape(len(test), MAX_LEN)
    fold_char_probs.append(get_char_probs(test['pn_history'].values, pred, tokenizer))
    del pred; gc.collect()

# Average the folds note by note. Each note's array has that note's own
# character length, so the nested list is ragged and cannot be reduced with a
# single np.mean(..., axis=0) on NumPy versions that reject ragged input.
preds = [np.mean(per_note, axis=0) for per_note in zip(*fold_char_probs)]

results = get_results(preds, th=best_th)

## Submission

In [None]:
sample_submission_csv = '../input/nbme-score-clinical-patient-notes/sample_submission.csv'
submission = pd.read_csv(sample_submission_csv)
# Assigned by position — assumes the sample submission rows are in the same
# order as `test`; TODO confirm.
submission['location'] = results
display(submission.head())
submission.loc[:, ['id', 'location']].to_csv('submission.csv', index=False)

## References

1. https://huggingface.co/course/chapter1/4?fw=tf
1. https://www.microsoft.com/en-us/research/publication/deberta-decoding-enhanced-bert-with-disentangled-attention-2/
1. https://github.com/microsoft/DeBERTa
1. https://huggingface.co/course/chapter1/1
1. https://www.kaggle.com/yasufuminakama/nbme-deberta-base-baseline-train
1. https://www.kaggle.com/huchlatymon/nbme-eda-deberta-train-cv-0-85
1. https://colab.research.google.com/drive/1pH9NKhAHT40ygOOf4bwld8j221r9SRq_?usp=sharing#scrollTo=SzCwE_mHkPxj
1. https://tensorexamples.com/2020/07/27/Using-the-tf.data.Dataset.html