In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
import pandas as pd 
import nltk
import spacy
from wordcloud import WordCloud, STOPWORDS
from transformers import AutoTokenizer, AutoConfig, TFAutoModel
import tensorflow as tf
import plotly.express as px
from tqdm.auto import tqdm
nlp = spacy.load("en_core_web_sm")
from typing import List, Tuple
import keras.backend as K
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import tensorflow_addons as tfa
import random
import warnings
import matplotlib

warnings.filterwarnings("ignore")

import os

In this notebook I show a simple solution to the **NBME** competition using **HuggingFace** auto classes with DeBERTa (`microsoft/deberta-base`). For data loading and model building I use TensorFlow on top of HuggingFace. The model is trained for up to 20 epochs with the Adam optimizer, using early stopping and learning-rate reduction on plateau.

In [None]:
# Data consts: every competition CSV is read from the same Kaggle input directory
INPUT_DIR = "/kaggle/input/nbme-score-clinical-patient-notes"
TRAIN_PATH = f"{INPUT_DIR}/train.csv"
TEST_PATH = f"{INPUT_DIR}/test.csv"
FEATURES_PATH = f"{INPUT_DIR}/features.csv"
PATIENT_NOTES_PATH = f"{INPUT_DIR}/patient_notes.csv"
SAMPLE_SUBMISSION_PATH = f"{INPUT_DIR}/sample_submission.csv"

# Model consts
MODEL_NAME = 'microsoft/deberta-base'  # pretrained checkpoint name; note that it contains a "/"
DATA_PATH = "../input/" + MODEL_NAME
SEQUENCE_LENGTH = 512  # fixed token length used for model inputs and labels
TOKENIZER_PATH = MODEL_NAME + '_tokenizer'  # directory the tokenizer and config are saved to
BATCH_SIZE = 8
AUTOTUNE = tf.data.AUTOTUNE
EPOCHS = 20
MODEL_CHECKPOINT = "model.h5"
LEARNING_RATE = 2e-5
CLIP_NORM = 1000  # passed to the optimizer as `clipnorm`

In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Exports ``PYTHONHASHSEED`` to the environment and seeds Python's
    ``random`` module, NumPy's global generator and TensorFlow's global
    generator with the same value.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)
    
SEED = 42
seed_everything(SEED)

# Loading training data

**features.csv** - The rubric of features (or key concepts) for each clinical case.
* feature_num - A unique identifier for each feature.
* case_num - A unique identifier for each case.
* feature_text - A description of the feature.

In [None]:
features_df = pd.read_csv(FEATURES_PATH)
print(f"Number of rows in features dataframe {len(features_df)}")
features_df.head()

In [None]:
print(f"Number of unique cases {len(features_df['case_num'].unique())}")

**patient_notes.csv** - A collection of about 40,000 Patient Note history portions. The patient notes in the test set are not included in the public version of this file.
* pn_num - A unique identifier for each patient note.
* case_num - A unique identifier for the clinical case a patient note represents.
* pn_history - The text of the encounter as recorded by the test taker

In [None]:
patient_notes_df = pd.read_csv(PATIENT_NOTES_PATH)
print(f"Number of rows in patient notes dataframe {len(patient_notes_df)}")
patient_notes_df.head()

**train.csv** - Feature annotations for 1000 of the patient notes, 100 for each of ten cases.
* id - Unique identifier for each patient note / feature pair.
* pn_num - The patient note annotated in this row.
* feature_num - The feature annotated in this row.
* case_num - The case to which this patient note belongs.
* annotation - The text(s) within a patient note indicating a feature. A feature may be indicated multiple times within a single note.
* location - Character spans indicating the location of each annotation within the note. Multiple spans may be needed to represent an annotation, in which case the spans are delimited by a semicolon ;.

In [None]:
train_df = pd.read_csv(TRAIN_PATH)
print(f"Number of rows train dataframe {len(train_df)}")
train_df.head()

In [None]:
print(f"There are {len(train_df.pn_num.unique())} unique patients in this dataset")

In [None]:
print(f"There are {len(train_df.feature_num.unique())} classes of features which are NER classes in this dataset")

In [None]:
print(f"There are {len(train_df.case_num.unique())} cases")

In [None]:
from typing import List, Optional

def get_sample(df: pd.DataFrame, cols: Optional[List[str]] = None) -> None:
    """Print the first value of one randomly chosen row of ``df``.

    Args:
        df: Frame to sample from. An empty frame prints nothing.
        cols: Optional list of column names to restrict the row to before
            printing. When omitted (or empty) the whole row is used, so the
            value of the first column is printed.

    Returns:
        None. The sampled value is written to stdout.
    """
    if len(df) == 0:
        # np.random.randint(0) raises ValueError, so bail out before sampling.
        return

    idx = np.random.randint(len(df))
    row = df.iloc[idx]
    sample = row[cols] if cols else row

    if not sample.empty:
        print(sample.values[0])
    
def length_distribution(series: pd.Series, series_name: str, nbins: int = 100) -> None:
    """Print the mean item length of ``series`` and plot a histogram of the lengths.

    Args:
        series: Items whose ``len()`` is measured (every item must support ``len``).
        series_name: Human-readable name used in the printed message and the
            x-axis title.
        nbins: Number of histogram bins passed to plotly.
    """
    lengths = [len(text) for text in series]
    print(f"Average length of {series_name} - {np.mean(lengths)}")
    fig = px.histogram(x=lengths, nbins=nbins)
    fig.update_layout(template="plotly_white")
    fig.update_xaxes(title=f"Length of {series_name}")
    fig.show()
    
def create_word_cloud(series: pd.Series) -> None:
    """Draw a word cloud built from all texts in ``series``.

    The texts are joined with single spaces into one document, the
    ``wordcloud`` stop-word list is applied, and the result is shown on a
    matplotlib figure with the axes hidden.

    Args:
        series: Series of strings to visualise.
    """
    corpus = " ".join(series)
    cloud_options = dict(
        stopwords=STOPWORDS,
        background_color="white",
        contour_color="green",
        width=1500,
        height=750,
        max_font_size=256,
        contour_width=2,
    )
    cloud = WordCloud(**cloud_options)
    cloud.generate(corpus)

    _, axis = plt.subplots(figsize=(14,10))
    axis.imshow(cloud, interpolation="bilinear")
    axis.set_axis_off()
    plt.show()
    
def distribution_plot(df: pd.DataFrame, group_by_col_name: str, agg_col_name: str, title: str) -> None:
    """Show a bar chart of how many ``agg_col_name`` values each group contains.

    Args:
        df: Source frame.
        group_by_col_name: Column to group by; its distinct values become the bars.
        agg_col_name: Column whose non-null values are counted per group.
        title: Figure title, centred above the plot.
    """
    count = df.groupby(group_by_col_name).agg({agg_col_name: "count"})
    fig = px.bar(
        data_frame=count,
        x=count.index,
        y=agg_col_name,
        color=agg_col_name,
        color_continuous_scale="teal"
    )

    # Take the ticks from the groups that actually exist instead of assuming
    # exactly ten groups numbered 0-9.
    group_values = list(count.index)

    fig.update_layout(
        title={
            'text': title,
            'x': 0.5,
            'y': 0.95,
            'xanchor': 'center',
            'yanchor': 'top'
        },
        xaxis={
            "tickmode": "array",
            "tickvals": group_values,
            "ticktext": [f"case {number}" for number in group_values]
        },
        template='plotly_white'
    )
    fig.show()

# Patient notes visualization
* Random patient note
* Word cloud
* Patient note length distribution
* Count of patient notes per case distribution

In [None]:
get_sample(patient_notes_df, ["pn_history"])

In [None]:
create_word_cloud(patient_notes_df['pn_history'])

In [None]:
length_distribution(patient_notes_df['pn_history'], "patient notes", 100)

In [None]:
distribution_plot(patient_notes_df, "case_num", "pn_history", "Count of patient notes per case")

# Annotations visualization
* Random annotation
* Word cloud
* Annotation length distribution
* Count of annotations per case distribution

In [None]:
get_sample(train_df, ['annotation'])

In [None]:
create_word_cloud(train_df['annotation'])

In [None]:
length_distribution(train_df['annotation'], "Train annotations", 100)

In [None]:
distribution_plot(train_df, 'case_num', "annotation", "Count of annotations per case")

# Feature text visualization
* Random feature text
* Word cloud
* Feature text length distribution
* Count of feature texts per case distribution

In [None]:
get_sample(features_df, ['feature_text'])

In [None]:
create_word_cloud(features_df['feature_text'])

In [None]:
length_distribution(features_df['feature_text'], "Feature text", 200)

In [None]:
distribution_plot(features_df, "case_num", "feature_text", "Count of feature texts per case")

# Sample patient
Let's take a look at a random patient.

In [None]:
print(f"Unique patients in the dataset - {len(train_df['pn_num'].unique())}")

In [None]:
random_patient = np.random.choice(train_df['pn_num'].unique())

patient_df = train_df[train_df['pn_num'] == random_patient]
print(f"DataFrame for patient {random_patient}")
patient_df

In [None]:
print("Patient annotations: \n")
print("\n".join(patient_df['annotation'].values))

In [None]:
print("Patient notes: \n")
print(patient_notes_df["pn_history"][patient_notes_df["pn_num"] == random_patient].values[0])

# Annotations visualization

In [None]:
# Collect the character spans annotated for the selected patient and render
# them on top of the patient's note with displaCy's manual entity mode.
locations = patient_df['location']
start_pos = []
end_pos = []

# Strip the list punctuation and turn ";" into a space so that only
# space-separated numbers remain in each location string.
cleanup_table = str.maketrans({"[": None, "]": None, "'": None, ",": None, ";": " "})

for raw_location in locations:
    cleaned = raw_location.translate(cleanup_table)
    if not cleaned:
        continue
    numbers = cleaned.split(" ")
    # Even positions hold span starts, odd positions hold span ends.
    start_pos.extend(numbers[0::2])
    end_pos.extend(numbers[1::2])

ents = [
    {
        'start': int(start),
        'end' : int(end_pos[position]),
        "label" : "Annotation"
    }
    for position, start in enumerate(start_pos)
]

is_selected_patient = patient_notes_df["pn_num"] == random_patient
patient_history = patient_notes_df.loc[is_selected_patient, "pn_history"].item()

doc = {'text' : patient_history, "ents" : ents}

colors = {"Annotation": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
options = {"colors": colors}
spacy.displacy.render(doc, style="ent", options=options , manual=True, jupyter=True);

**NER visualization**

In [None]:
doc = nlp(patient_history)
spacy.displacy.render(doc, style='ent', jupyter=True)

# NER & Hugging Face

**NER** - Named entity recognition processes text by locating and classifying entities. A basic NER system only locates certain entities, for example the names of people in a text, while a more advanced NER system also assigns a class to each located span, so instead of only locating names it would also label the span as "Name". In this competition we are only required to find the locations of entities.


**HuggingFace** is a popular Python library containing implementations of various transformer models, predefined tokenizers and even datasets.
It supports both TensorFlow and PyTorch and has an easy-to-use, well-documented API.

Check out the details on HuggingFace [website](https://huggingface.co/)

In [None]:
# Load the pretrained tokenizer and model config, then store both in the
# same local directory (TOKENIZER_PATH).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
config = AutoConfig.from_pretrained(MODEL_NAME)

tokenizer.save_pretrained(TOKENIZER_PATH)
config.save_pretrained(TOKENIZER_PATH)

In [None]:
def create_model() -> tf.keras.Model:
    """Build the per-token binary classifier on top of the pretrained backbone.

    The model takes two int32 inputs of length ``SEQUENCE_LENGTH`` (token ids
    and attention mask), runs them through the ``MODEL_NAME`` backbone, and
    applies dropout followed by a single-unit sigmoid ``Dense`` layer to the
    first element of the backbone output.

    Returns:
        An uncompiled ``tf.keras.Model`` with inputs
        ``[input_tokens, attention_mask]``.
    """
    input_shape = (SEQUENCE_LENGTH,)
    input_tokens = tf.keras.layers.Input(shape=input_shape, dtype=tf.int32)
    attention_mask = tf.keras.layers.Input(shape=input_shape, dtype=tf.int32)

    backbone_config = AutoConfig.from_pretrained(MODEL_NAME, output_hidden_states=True)
    backbone = TFAutoModel.from_pretrained(MODEL_NAME, config=backbone_config)

    # Element 0 of the backbone output is used as the token-level features
    # (presumably the last hidden state -- confirm against the model docs).
    token_features = backbone(input_tokens, attention_mask=attention_mask)[0]
    token_features = tf.keras.layers.Dropout(0.2)(token_features)
    token_probabilities = tf.keras.layers.Dense(1, activation='sigmoid')(token_features)

    return tf.keras.Model(
        inputs=[input_tokens, attention_mask],
        outputs=token_probabilities,
    )

In [None]:
model = create_model()
model.summary()

# Dataset tokenization

In [None]:
import ast  # local to this cell: only needed to parse the annotation column

# Attach the feature description and the full patient note to every
# annotated (patient note, feature) row.
train = train_df.merge(features_df, on=['feature_num', 'case_num'], how='left')
train = train.merge(patient_notes_df, on=['pn_num', 'case_num'], how='left')

# `annotation` is read from CSV as text. Assuming it holds the string form of
# a Python list (e.g. "[]" when nothing is annotated -- TODO confirm against
# train.csv), len() of the raw string counts characters and is never 0.
# Parse the list first so that annotation_length is the number of annotations.
train['annotation_length'] = train['annotation'].apply(
    lambda text: len(ast.literal_eval(text))
)

train.head()

In [None]:
def create_inputs(pn_history: str, feature_text: str) -> Tuple[np.ndarray, np.ndarray]:
    """
    Tokenize a (patient note, feature text) pair for the model.

    The pair is encoded as one sequence with special tokens and padded to
    SEQUENCE_LENGTH.

    Args:
        pn_history: Patient note text (first segment of the pair).
        feature_text: Feature description (second segment of the pair).

    Returns:
        Tuple ``(input_ids, attention_mask)`` of numpy arrays.
    """
    tokens = tokenizer(
        pn_history,
        feature_text,
        max_length=SEQUENCE_LENGTH,
        padding="max_length",
        # Padding alone does not shorten an over-long pair, which would break
        # the fixed (SEQUENCE_LENGTH,) shape the dataloader declares. Truncate
        # only the note so that the feature text is always kept intact.
        truncation="only_first",
        add_special_tokens=True,
    )

    return (np.array(tokens['input_ids']), np.array(tokens["attention_mask"]))

In [None]:
def decode_location(location: str) -> List[Tuple[int, int]]:
    """
    Decode the ['ab cd ...'] format of location annotations from the dataset
    and return the list of (start, end) tuples.

    Spans inside one entry may be separated by ";" and entries by ",";
    any amount of whitespace around the separators is accepted.

    Args:
        location: Raw value of the ``location`` column.

    Returns:
        List of ``(start, end)`` integer tuples; empty when no span is present.

    Raises:
        ValueError: If a token is not an integer or the number of integers
            is odd (a span is missing its end).
    """
    cleaned = location
    for char in "[]',":
        cleaned = cleaned.replace(char, '')
    cleaned = cleaned.replace(";", ' ')

    # split() without an argument drops the empty strings that consecutive
    # separators (e.g. "; ") would otherwise produce and that int() rejects.
    numbers = [int(token) for token in cleaned.split()]
    if len(numbers) % 2 != 0:
        raise ValueError(f"Odd number of span boundaries in location: {location!r}")

    return list(zip(numbers[0::2], numbers[1::2]))

In [None]:
# https://www.kaggle.com/yasufuminakama/nbme-deberta-base-baseline-train

def create_labels(pn_history, annotation_length, location_list):
    """
    Create the per-token label vector for one patient note: zeros (no entity)
    and ones (entity).

    The note is tokenized on its own (without the feature text) and padded to
    SEQUENCE_LENGTH; character spans from ``location_list`` are mapped onto
    token positions through the tokenizer's offset mapping.

    Args:
        pn_history: Patient note text.
        annotation_length: Value of the ``annotation_length`` column; when it
            is 0 the location string is not decoded and all labels stay 0.
        location_list: Raw ``location`` string, decoded with ``decode_location``.

    Returns:
        Numpy array of 0/1 labels, one per token position.
    """
    tokenized = tokenizer(
        pn_history,
        add_special_tokens=True,
        max_length=SEQUENCE_LENGTH,
        padding="max_length",
        # Padding alone does not shorten an over-long note; truncate so the
        # label vector always has SEQUENCE_LENGTH entries.
        truncation=True,
        return_offsets_mapping=True
    )

    offset_mapping = tokenized["offset_mapping"]
    label = np.zeros(len(offset_mapping))
    if annotation_length != 0:
        locations = decode_location(location_list)
        for location in locations:
            start_idx, end_idx = -1, -1
            start, end = location
            for idx in range(len(offset_mapping)):
                # First token that begins after `start`: the span begins in
                # the token just before it.
                if (start_idx == -1) and (start < offset_mapping[idx][0]):
                    start_idx = idx - 1
                # First token whose end reaches `end`: exclusive slice bound.
                if (end_idx == -1) and (end <= offset_mapping[idx][1]):
                    end_idx = idx + 1
            if start_idx == -1:
                # No token begins after `start`; fall back to end_idx, which
                # makes the slice below empty.
                start_idx = end_idx
            if (start_idx != -1) and (end_idx != -1):
                label[start_idx:end_idx] = 1

    return np.array(label)

In [None]:
# Pull the fields of the first training row out as standalone samples.
sample_pn_history, sample_annot_len, sample_loc, sample_feature_text = (
    train[column].values[0]
    for column in ("pn_history", "annotation_length", "location", "feature_text")
)

In [None]:
print("Sample pn_history\n")
print(sample_pn_history)

In [None]:
print("Sample annotation length\n")
print(sample_annot_len)

In [None]:
print("Sample location\n")
print(sample_loc)

In [None]:
print("Sample feature text\n")
print(sample_feature_text)

# Optimized dataloader

In [None]:
# Hold out 20% of the rows for validation. Passing random_state makes the
# split independent of how much global NumPy RNG state earlier cells used.
# NOTE(review): the split is row-wise, so rows of the same patient note can
# land in both subsets; `train` is also rebound here, so re-running this cell
# splits the already-reduced frame again.
train, valid = train_test_split(
    train[['pn_history', 'feature_text','annotation_length', 'location']],
    test_size=0.2,
    random_state=SEED
)

In [None]:
def get_dataset_generator(dataframe: pd.DataFrame):
    """Yield one training example per row of ``dataframe``.

    Args:
        dataframe: Frame with the columns ``pn_history``, ``feature_text``,
            ``annotation_length`` and ``location``.

    Yields:
        ``((input_ids, attention_mask), labels)`` where the inputs come from
        ``create_inputs`` and the labels from ``create_labels``.
    """
    rows = zip(
        dataframe["pn_history"].values,
        dataframe["feature_text"].values,
        dataframe['annotation_length'].values,
        dataframe['location'].values,
    )

    for note, feature, n_annotations, raw_location in rows:
        inputs, masks = create_inputs(note, feature)
        labels = create_labels(note, n_annotations, raw_location)
        yield (inputs, masks), labels

In [None]:
def get_dataloader(dataset_generator) -> tf.data.Dataset:
    """Wrap a generator factory in a batched, prefetching ``tf.data.Dataset``.

    Args:
        dataset_generator: Zero-argument callable returning a generator of
            ``((input_ids, attention_mask), labels)`` examples, each int32
            array of length ``SEQUENCE_LENGTH``.

    Returns:
        Dataset batched by ``BATCH_SIZE`` with ``AUTOTUNE`` prefetching.
    """
    def sequence_spec(name):
        # All three tensors share the same shape and dtype; only the name differs.
        return tf.TensorSpec(shape=(SEQUENCE_LENGTH,), dtype=tf.dtypes.int32, name=name)

    signature = (
        (sequence_spec("inputs"), sequence_spec("attention_masks")),
        sequence_spec("labels"),
    )
    dataset = tf.data.Dataset.from_generator(dataset_generator, output_signature=signature)
    return dataset.batch(BATCH_SIZE).prefetch(AUTOTUNE)

# Training

In [None]:
# Stop training once validation loss has not improved by at least 1e-5 for
# 4 consecutive epochs, and roll back to the best weights seen.
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', 
    min_delta=1e-5, 
    patience=4, 
    verbose=1,
    mode='auto', 
    restore_best_weights=True
)

# Lower the learning rate after 2 epochs without a val_loss improvement of at
# least 0.001. `factor` multiplies the current rate (new_lr = lr * factor);
# the previous value of 1e-5 would have taken 2e-5 down to 2e-10, effectively
# freezing training. 0.1 is the Keras default.
rlr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', 
    factor=0.1, 
    patience=2, 
    mode='auto', 
    min_delta=0.001
)

In [None]:
class F1Score(tf.keras.metrics.Metric):
    """Keras metric that delegates to a micro-averaged ``tfa.metrics.F1Score``.

    Targets and predictions are reshaped to ``(-1, SEQUENCE_LENGTH)`` before
    being handed to the wrapped metric, which thresholds predictions at 0.5.
    """

    def __init__(self, name='f1', **kwargs):
        super().__init__(name=name, **kwargs)
        self.f1 = tfa.metrics.F1Score(num_classes=2, average='micro', threshold=0.50)

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate one batch; ``sample_weight`` is accepted but not used."""
        flat_shape = (-1, SEQUENCE_LENGTH)
        self.f1.update_state(
            tf.reshape(y_true, flat_shape),
            tf.reshape(y_pred, flat_shape),
        )

    def reset_state(self):
        """Reset the wrapped metric's accumulated state."""
        self.f1.reset_state()

    def result(self):
        """Return the current value of the wrapped metric."""
        return self.f1.result()

In [None]:
# Explicit names keep the history keys stable ("recall", "precision"):
# unnamed Keras metrics get a numeric suffix (e.g. "recall_1") when this cell
# is executed again in the same kernel, which breaks lookups by key.
metrics = [
    F1Score(), 
    tf.keras.metrics.Recall(thresholds=[0.5], name="recall"), 
    tf.keras.metrics.Precision(thresholds=[0.5], name="precision")
]

callbacks = [rlr, es]
optimizer = tf.keras.optimizers.Adam(LEARNING_RATE, clipnorm=CLIP_NORM)
# reduction="none": the loss object itself does not reduce over the batch.
loss = tf.keras.losses.BinaryCrossentropy(reduction="none")

In [None]:
# Compile the model with the optimizer, loss and metrics defined above.
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# Each loader receives a zero-argument factory so that a fresh generator can
# be created whenever the dataset is iterated again.
train_loader = get_dataloader(lambda: get_dataset_generator(train))
valid_loader = get_dataloader(lambda: get_dataset_generator(valid))

history = model.fit(
    train_loader,
    epochs=EPOCHS,
    validation_data=valid_loader,
    callbacks=callbacks,
)

In [None]:
# MODEL_NAME contains a "/", so the weights file lives in a sub-directory.
# Create it explicitly instead of relying on an earlier cell having done so.
weights_path = f"{MODEL_NAME}.h5"
os.makedirs(os.path.dirname(weights_path) or ".", exist_ok=True)
model.save_weights(weights_path)

In [None]:
def plot_history():
    """Plot the curves recorded in the global ``history`` object.

    Draws four stacked panels (loss, precision, recall, F1 score), each with
    the training and validation series. The loss panel also marks the lowest
    training and validation loss.
    """
    records = history.history
    loss, val_loss = records["loss"], records["val_loss"]

    # (legend name, panel title, training series, validation series)
    metric_panels = [
        ("precision", 'Precision', records["precision"], records["val_precision"]),
        ("recall", 'Recall', records["recall"], records["val_recall"]),
        ("F1 score", "F1 score", records["f1"], records["val_f1"]),
    ]

    fig, ax = plt.subplots(4, figsize=(10,25))

    lowest_points = [(min(curve), curve) for curve in (loss, val_loss)]

    loss_ax = ax[0]
    loss_ax.plot(loss, label="Train loss")
    loss_ax.plot(val_loss, label="Valid loss")
    for lowest, curve in lowest_points:
        # Mark the epoch at which this curve reached its minimum.
        loss_ax.scatter(y=lowest, x=curve.index(lowest))
    loss_ax.title.set_text('Loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss value')
    loss_ax.legend(loc="lower left")

    for panel, (legend_name, panel_title, train_values, valid_values) in zip(ax[1:], metric_panels):
        panel.plot(train_values, label=f"Train {legend_name}")
        panel.plot(valid_values, label=f"Valid {legend_name}")
        panel.set_xlabel('Epoch')
        panel.set_ylabel(f"{panel_title} value")
        panel.title.set_text(panel_title)
        panel.legend(loc="lower right")

    plt.show()

In [None]:
plot_history()