In [1]:
from itertools import combinations

import dill as pickle
import evaluate
import numpy as np
import pandas as pd
import scipy.sparse as sp
import torch
from datasets import Dataset
from gensim.models.keyedvectors import KeyedVectors
from ipymarkup import show_span_line_markup
from more_itertools import chunked
from peft import LoraConfig, PeftConfig, PeftModel, get_peft_model
from sentence_transformers import InputExample, SentenceTransformer, losses, models
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    pipeline,
)

from snomed_graph import *
from constants import id2label, label2id


2024-02-27 10:55:14.318425: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-27 10:55:14.344066: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-27 10:55:14.344086: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-27 10:55:14.344092: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-27 10:55:14.348240: I tensorflow/core/platform/cpu_feature_g

In [30]:
random_seed = 10  # For reproducibility
max_seq_len = 512  # Maximum sequence length for (BERT-based) encoders
cer_model_path = "best_yikuan8-Clinical-Longformer-noLoRA_fold4"
kb_embedding_model_id = ("sentence-transformers/all-MiniLM-L6-v2") # base model for concept encoder
use_LoRA = False  # Whether to use a LoRA to fine-tune the CER model
torch.manual_seed(random_seed)
assert torch.cuda.is_available()

In [31]:
if use_LoRA:
    config = PeftConfig.from_pretrained(cer_model_path)

    cer_model = AutoModelForTokenClassification.from_pretrained(
        pretrained_model_name_or_path=config.base_model_name_or_path,
        num_labels=3,
        id2label=id2label,
        label2id=label2id,
    )
    cer_model = PeftModel.from_pretrained(cer_model, cer_model_path)
else:
    cer_model = AutoModelForTokenClassification.from_pretrained(
        pretrained_model_name_or_path=cer_model_path,
        num_labels=3,
        id2label=id2label,
        label2id=label2id,
    )

cer_tokenizer = AutoTokenizer.from_pretrained(cer_model_path)

In [87]:
# If using the adaptor, ignore the warning:
# "The model 'PeftModelForTokenClassification' is not supported for token-classification."
# The PEFT model is wrapped just fine and will work within the pipeline.
# N.B. moving model to CPU makes inference slower, but enables us to feed the pipeline
# directly with strings.
cer_pipeline = pipeline(
    task="token-classification",
    model=cer_model,
    tokenizer=cer_tokenizer,
    aggregation_strategy="simple",
    device="cpu",
)

In [88]:
notes_df = pd.read_csv("data/training_notes.csv").set_index("note_id")
annotations_df = pd.read_csv("data/train_annotations.csv").set_index("note_id")

training_notes_df, test_notes_df = train_test_split(
    notes_df, test_size=32, random_state=random_seed
)
test_annotations_df = annotations_df.loc[test_notes_df.index]

In [89]:
test_annotations_df

Unnamed: 0_level_0,start,end,concept_id
note_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12204158-DS-10,182,193,91936005
12204158-DS-10,196,201,1003755004
12204158-DS-10,239,258,267036007
12204158-DS-10,299,312,91602002
12204158-DS-10,318,328,264957007
...,...,...,...
12986424-DS-6,4697,4705,368208006
12986424-DS-6,4707,4714,439470001
12986424-DS-6,4747,4757,247347003
12986424-DS-6,4823,4828,72670004


In [90]:
from sklearn.model_selection import KFold

fFold = KFold(5, shuffle=True, random_state=42)
note_ids = notes_df.index.unique().to_series()
fold_split = list(fFold.split(note_ids))[4]

In [91]:
test_note_ids = note_ids.iloc[fold_split[1]]

In [92]:
test_note_ids = test_note_ids.index.to_list()

In [93]:
notes_df["text"][0]

  notes_df["text"][0]


' \nName:  ___                  Unit No:   ___\n \nAdmission Date:  ___              Discharge Date:   ___\n \nDate of Birth:  ___             Sex:   M\n \nService: SURGERY\n \nAllergies: \nPenicillins\n \nAttending: ___.\n \nChief Complaint:\nBiliary pancreatitis\n \nMajor Surgical or Invasive Procedure:\n___: Laparoscopic cholecystectomy\n\n \nHistory of Present Illness:\nMr. ___ is a ___ man who had severe biliary \npancreatitis resulting in pancreatic necrosis for which he was \ntreated with nasojejunal feedings and pancreatic rest.  He had \ninitially had multisystem organ failure, which improved. Mr. \n___ has a large postnecrotic pseudocyst, which has been \ndrained through a minimally invasive approach into his GI tract. \n He has some debris, but this is not currently infected. The \npatient was followed by Dr. ___ in his ___ \nclinic to discuss cholecystectomy. After discussion of all \nrisks, benefits and possible outcomes, patient was scheduled for \nelective cholecystectom

In [94]:
cer_string = "Mr. ___ is a ___ man who had severe biliary \npancreatitis resulting in pancreatic necrosis for which he was \ntreated with nasojejunal feedings and pancreatic rest."
tokens = cer_tokenizer(cer_string, return_tensors="pt")
cer_model(**tokens)

LongformerTokenClassifierOutput(loss=None, logits=tensor([[[ 7.1441, -3.1399, -3.7972],
         [ 6.7422, -2.9281, -3.7073],
         [ 7.0214, -3.2133, -3.5900],
         [ 7.2215, -3.0807, -3.6904],
         [ 7.2497, -3.0971, -3.7509],
         [ 7.3136, -3.3825, -3.6830],
         [ 7.1121, -2.7195, -3.8693],
         [ 6.8077, -2.5195, -4.1473],
         [ 6.9402, -2.9003, -3.8868],
         [ 6.2262, -1.8107, -3.9503],
         [ 4.7152,  0.0207, -3.6280],
         [-4.1640,  5.6739, -1.5900],
         [-4.4639,  4.5339,  0.2508],
         [-2.7900, -2.7628,  6.9566],
         [-2.6114, -3.1640,  7.1332],
         [-2.8681, -2.7780,  6.9722],
         [-3.0501, -2.7380,  6.9105],
         [-2.8660, -2.8045,  6.8297],
         [-2.9778, -2.5184,  6.7543],
         [ 5.5428, -2.9113, -2.3258],
         [ 6.0834, -2.8958, -2.8365],
         [-4.1106,  5.7273, -1.3854],
         [-3.7238,  5.3728, -1.3896],
         [-3.1829, -2.0638,  5.9434],
         [-2.9313, -2.5812,  6.6004],


In [112]:
# Visualise the predicted clinical entities against the actual annotated entities.
# N.B. only the first 512 tokens of the note will contain predicted spans.
# Not run due to sensitivity of MIMIC-IV notes

def is_pos_in_span(pos, span):
    return span[0] <= pos < span[1]

for note_id in test_note_ids[:1]:
    text = notes_df.loc[note_id].text

    # +1 to offset the [CLS] token which will have been added by the tokenizer
    predicted_annotations = [
        (span["start"], span["end"], "PRED") for span in cer_pipeline(text)
    ]

    # combine spans that are sequential (with no space between them)
    for i in range(len(text)):
        for j in range(len(predicted_annotations)-1):
            if is_pos_in_span(i, predicted_annotations[j]) and is_pos_in_span(i+1, predicted_annotations[j+1]):
                predicted_annotations[j] = (predicted_annotations[j][0], predicted_annotations[j+1][1], "PRED")
                predicted_annotations.pop(j+1)
                break
    
    # remove all invert spans
    for i in range(len(predicted_annotations)-1):
        if predicted_annotations[i][1] >= predicted_annotations[i+1][0]:
            predicted_annotations.pop(i)
            break

    gt_annotations = [
        (row.start, row.end, "GT") for row in annotations_df.loc[note_id].itertuples()
    ]

    show_span_line_markup(text, predicted_annotations + gt_annotations)

In [None]:
note_id = "12986424-DS-6"
text = test_notes_df.loc[note_id].text[512:]

# +1 to offset the [CLS] token which will have been added by the tokenizer
predicted_annotations = [
    (span["start"] + 1, span["end"], "PRED") for span in cer_pipeline(text)
]

gt_annotations = [
    (row.start-512, row.end-512, "GT") if row.start > 512 else None for row in test_annotations_df.loc[note_id].itertuples()
]

gt_annotations = list(set(gt_annotations))

gt_annotations.remove(None)

show_span_line_markup(text, predicted_annotations + gt_annotations)