In [None]:
import pickle
import re
from collections import Counter
from datetime import datetime
from pathlib import Path

from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from tqdm.notebook import tqdm

from discharge_summaries.preprocessing.preprocess_snomed import Snomed
from discharge_summaries.schemas.mimic import Record

In [None]:
# --- Input / output locations ------------------------------------------------
# All data lives in the "data" directory next to the current working directory.
DATA_DIR = Path.cwd().parent / "data"
TRAINING_DATASET_PATH = DATA_DIR / "train_all_ds.pkl"

# Minute-resolution timestamp captured when this cell runs; it is embedded in
# the annotated-dataset file name.
TIMESTAMP = datetime.now().strftime("%Y_%m_%d_%H_%M")
TRAINING_ANNO_DATASET_PATH = DATA_DIR / f"train_anno_all_{TIMESTAMP}.pkl"

# SNOMED CT release folder, stored alongside the datasets.
SNOMED_PATH = DATA_DIR / "SnomedCT_InternationalRF2_PRODUCTION_20230731T120000Z"

# --- Tunables ----------------------------------------------------------------
RANDOM_SEED = 23
SPACY_MODEL = "en_core_sci_md"
MAX_SEGMENT_TOKEN_LENGTH = 128
HF_MODEL_NAME = "roberta-base"

In [None]:
# Load the pickled training records and rebuild each entry as a Record.
# SECURITY: pickle.load can execute arbitrary code while unpickling, so
# TRAINING_DATASET_PATH must only ever point at a trusted, locally produced file.
with open(TRAINING_DATASET_PATH, "rb") as in_file:
    dataset = [Record(**record) for record in pickle.load(in_file)]
# Display the number of loaded records.
len(dataset)

Preprocessing SNOMED CT for MedCAT

In [None]:
# Build the SNOMED wrapper from the release directory in SNOMED_PATH (the
# constructor is given the path as a plain string).
sowmed = Snomed(str(SNOMED_PATH))
# NOTE(review): uk_ext is switched on even though SNOMED_PATH names an
# "InternationalRF2" release — confirm the flag is intended for this release.
sowmed.uk_ext = True

In [None]:
# Convert the SNOMED release into a concept table (presumably a pandas
# DataFrame, given how it is indexed below) and preview its first rows.
df = sowmed.to_concept_df()
df.head()

In [None]:
# List the distinct values of the "description_type_ids" column; the type
# names kept further down are chosen from these.
df["description_type_ids"].unique()

In [None]:
# Values of the "description_type_ids" column that should be kept.
filter_type_names = {
    "disorder",
    "finding",
    "morphologic abnormality",
    "organism",
    "physical object",
    "clinical drug",
    "medicinal product form",
    "procedure",
    "product",
}
# Fail fast on a misspelled or absent type name. The column's distinct values
# are computed once (not once per name) and the assertion message lists the
# offending names so the failure is actionable.
available_type_names = set(df["description_type_ids"].unique())
missing_type_names = filter_type_names - available_type_names
assert not missing_type_names, (
    "Type names not found in df['description_type_ids']: "
    f"{sorted(missing_type_names)}"
)

In [None]:
# Keep only the rows whose description type is one of the selected names,
# then display how many rows remain.
has_selected_type = df["description_type_ids"].isin(filter_type_names)
df = df.loc[has_selected_type]
len(df)

In [None]:
# Restrict to rows whose name_status is "A". (An earlier variant of this cell
# filtered on the 'finding' and 'disorder' description types instead.)
# The displayed tuple is (number of rows, number of distinct concept IDs).
has_status_a = df["name_status"] == "A"
df_subset = df.loc[has_status_a]
len(df_subset), len(df_subset["cui"].unique())

In [None]:
# Preview the first ten rows of the filtered subset.
df_subset.head(10)

In [None]:
# Take only the tokenizer from spaCy's English language class; it is used
# below to tokenize both the SNOMED names and the clinical text.
tokenizer_spacy = English().tokenizer

In [None]:
# Phrase matcher that compares tokens on their lowercase form. Every concept
# ID (cui) is registered as a match key whose patterns are all of the names
# listed for that concept in df_subset.
snomed_matcher = PhraseMatcher(tokenizer_spacy.vocab, attr="LOWER")
for cui, group_df in tqdm(df_subset.groupby("cui")):
    name_patterns = list(tokenizer_spacy.pipe(group_df["name"]))
    snomed_matcher.add(cui, name_patterns)

In [None]:
# Sanity check: run the matcher on a known phrase and display the key
# (concept ID) of the first match.
doc = tokenizer_spacy("heart attack")
matches = snomed_matcher(doc)
# Each match is a (match_id, start, end) tuple; the vocab's string store maps
# the match_id hash back to the key the pattern was registered under.
first_match_id = matches[0][0]
tokenizer_spacy.vocab.strings[first_match_id]

In [None]:
# Weakly label every physician-note sentence of the first 1000 documents:
#   POSITIVE - the sentence shares at least one matched concept ID with the
#              document's discharge-summary BHC text,
#   NEGATIVE - the sentence has matched concepts but none occur in the BHC,
#   (skipped) - the sentence has no matched concept at all; only counted in
#              no_cuis.

# Sentence splitter: break on a newline that is followed by a character other
# than a space or lowercase letter, or on whitespace preceded by one of the
# characters in [?|!|.].
# NOTE(review): inside a character class "|" is a literal pipe rather than
# alternation, so text is also split after "|" — confirm this is intended.
sentence_splitter = re.compile("\n(?=[^ a-z])|(?<=[?|!|.])\\s")

no_cuis = 0  # number of sentences without any matched concept
dataset_annotations = []  # one list of sentence annotations per document
for doc in tqdm(dataset[:1000]):
    doc_annotations = []

    # Concept IDs matched in this document's BHC text (the labelling target).
    bhc_cui_ids = {
        match_id
        for match_id, _, _ in snomed_matcher(tokenizer_spacy(doc.discharge_summary.bhc))
    }

    for note in doc.physician_notes:
        # Blank lines separate sections; each section is then cut into sentences.
        for section in note.text.split("\n\n"):
            for sentence in sentence_splitter.split(section):
                sentence_cui_ids = {
                    match_id
                    for match_id, _, _ in snomed_matcher(tokenizer_spacy(sentence))
                }
                if not sentence_cui_ids:
                    no_cuis += 1
                    continue

                if sentence_cui_ids.isdisjoint(bhc_cui_ids):
                    label = "NEGATIVE"
                else:
                    label = "POSITIVE"
                doc_annotations.append(
                    {"text": sentence, "labels": label, "cui_ids": sentence_cui_ids}
                )

    # Append exactly once per document, after all of its notes are processed.
    # Appending inside the note loop would add the same list once per note
    # (duplicating its sentences in later statistics) and add nothing for a
    # document without notes, breaking the positional pairing with `dataset`.
    dataset_annotations.append(doc_annotations)

In [None]:
# Collect the label of every annotated sentence into one flat list (labels are
# coerced to str), then display how many there are.
flattened_dataset_labels = []
for per_doc_annotations in dataset_annotations:
    flattened_dataset_labels.extend(
        str(annotation["labels"]) for annotation in per_doc_annotations
    )
len(flattened_dataset_labels)

In [None]:
# Total sentences seen: those that received a label plus those skipped because
# no concept was matched in them.
num_sentences = len(flattened_dataset_labels) + no_cuis

In [None]:
# Print each label's share of all sentences, most frequent label first,
# followed by the share of sentences that had no matched concept.
label_counts = Counter(flattened_dataset_labels)
for label, count in label_counts.most_common():
    print(f"{label} {count / num_sentences}")
print(f"NONE {no_cuis / num_sentences}")

In [None]:
# Accumulate, over all documents, how many BHC concept IDs there are and how
# many of them also occur in the document's annotated note sentences.
num_matches = 0
num_bhc_cuis = 0

# NOTE(review): zip pairs documents and annotation lists by position, so this
# relies on dataset_annotations holding exactly one entry per document in
# dataset order — verify against the cell that builds it.
for doc, docs_annotations in tqdm(zip(dataset, dataset_annotations)):
    # Every concept ID matched in any annotated sentence of this document.
    note_cui_ids = set()
    for sentence_annotation in docs_annotations:
        note_cui_ids.update(sentence_annotation["cui_ids"])

    # Concept IDs matched in the document's BHC text; the first item of each
    # match tuple is the match id.
    bhc_doc = tokenizer_spacy(doc.discharge_summary.bhc)
    bhc_cui_ids = {match[0] for match in snomed_matcher(bhc_doc)}

    num_bhc_cuis += len(bhc_cui_ids)
    num_matches += len(note_cui_ids & bhc_cui_ids)

In [None]:
# Fraction of BHC concept IDs (summed over documents) that also appear among
# the concept IDs of the annotated note sentences.
num_matches / num_bhc_cuis