https://htmlpreview.github.io/?https://github.com/CogStack/MedCATtutorials/blob/main/notebooks/specialised/Preprocessing_SNOMED_CT.html

In [None]:
import logging
import pickle
from pathlib import Path

from medcat.utils.preprocess_snomed import Snomed
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from tqdm.notebook import tqdm

from discharge_summaries.schemas.mimic import Record

In [None]:
# Filesystem locations, resolved relative to the parent of the current working
# directory: the SNOMED CT RF2 release folder and the directory holding models.
SNOMED_PATH = Path.cwd().parent.joinpath(
    "data", "SnomedCT_InternationalRF2_PRODUCTION_20230731T120000Z"
)
MODEL_DIR = Path.cwd().parent.joinpath("models")

# Emit INFO-level (and above) log records from the root logger.
logging.basicConfig(level=logging.INFO)

In [None]:
# Data directory and the pickled training split that lives inside it.
DATA_DIR = Path.cwd().parent.joinpath("data")
GT_DATA_PATH = DATA_DIR.joinpath("train.pkl")

# Zipped MedCAT model pack stored under the sibling "models" directory.
MODEL_PATH = Path.cwd().parent.joinpath(
    "models", "mc_modelpack_snomed_int_16_mar_2022_25be3857ba34bdd5.zip"
)

In [None]:
# Unpickle the stored entries, then build one Record per entry by expanding it
# as keyword arguments.
# NOTE(review): pickle.load can execute code embedded in the file; only point
# this at a trusted train.pkl.
with open(GT_DATA_PATH, "rb") as in_file:
    raw_records = pickle.load(in_file)
gt_dataset = [Record(**fields) for fields in raw_records]
len(gt_dataset)

Preprocessing SNOMED CT for MedCAT

In [None]:
# Point MedCAT's SNOMED preprocessor at the release folder (passed as a string).
# NOTE(review): `sowmed` looks like a typo for `snomed`; the name is reused by
# the cell that calls to_concept_df(), so a rename has to cover both places.
sowmed = Snomed(str(SNOMED_PATH))
# NOTE(review): SNOMED_PATH names an International RF2 release, yet the
# UK-extension flag is switched on here - confirm this is intended.
sowmed.uk_ext = True

In [None]:
# Build the concept table from the SNOMED release and preview its first rows.
df = sowmed.to_concept_df()
df.head()

In [None]:
# List the distinct values present in the "description_type_ids" column.
df["description_type_ids"].unique()

In [None]:
# Values of the "description_type_ids" column whose rows should be kept.
# (Presumably SNOMED semantic tags - confirm against MedCAT's preprocess_snomed.)
filter_type_names = {
    "disorder",
    "finding",
    "morphologic abnormality",
    "organism",
    "physical object",
    "clinical drug",
    "medicinal product form",
    "procedure",
    "product",
}

# Guard against typos in the set above: every requested name must occur in the
# table. The distinct column values are computed once (rather than once per
# name) and the failure message lists exactly which names were not found.
available_type_names = set(df["description_type_ids"].unique())
missing_type_names = filter_type_names - available_type_names
assert not missing_type_names, (
    f"Unknown description_type_ids: {sorted(missing_type_names)}"
)

In [None]:
# Restrict the concept table to rows whose type is one of the selected names,
# then report how many rows remain.
wanted_type_mask = df["description_type_ids"].isin(filter_type_names)
df = df[wanted_type_mask]
len(df)

In [None]:
# Disabled alternative that narrows by type instead of by name status:
# df_subset = df[df['description_type_ids'].isin(['finding', 'disorder'])]

# Keep only rows whose name_status equals "A".
# NOTE(review): presumably these are synonym rows rather than fully specified
# names - confirm against MedCAT's preprocess_snomed.
status_a_mask = df["name_status"] == "A"
df_subset = df[status_a_mask]

# (number of rows kept, number of distinct CUIs among them)
len(df_subset), len(df_subset["cui"].unique())

In [None]:
# Preview the first ten rows of the filtered subset.
df_subset.head(10)

In [None]:
# Take only the tokenizer of a blank English pipeline; the other cells call it
# to turn raw text into spaCy docs for phrase matching.
tokenizer = English().tokenizer

In [None]:
# Build a phrase matcher that compares tokens on their lower-cased form and
# register, under each CUI, every name listed for that CUI in df_subset.
matcher = PhraseMatcher(tokenizer.vocab, attr="LOWER")
for cui, name_rows in tqdm(df_subset.groupby("cui")):
    name_patterns = list(tokenizer.pipe(name_rows["name"]))
    matcher.add(cui, name_patterns)

In [None]:
# Sanity check: run the phrase matcher over a short phrase and show the
# matched spans.
matches = matcher(tokenizer("code status"), as_spans=True)
matches

In [None]:
# Match a mixed-case phrase and show the label of the first matched span.
# NOTE: the [0] index raises IndexError when the matcher returns no spans.
matches = matcher(tokenizer("this patient had a Cardiac infarction"), as_spans=True)
matches[0].label_

In [None]:
# Input and cache locations, resolved relative to the parent of the current
# working directory.
DATA_DIR = Path.cwd().parent.joinpath("data")
TRAINING_DATASET_PATH = DATA_DIR.joinpath("train.pkl")
DATASET_NOTE_CUI_CACHE_PATH = DATA_DIR.joinpath("dataset_note_cui_cache.json")

# Zipped MedCAT model pack stored under the sibling "models" directory.
MODEL_PATH = Path.cwd().parent.joinpath(
    "models", "mc_modelpack_snomed_int_16_mar_2022_25be3857ba34bdd5.zip"
)

# Miscellaneous run settings.
RANDOM_SEED = 23
LOG_FILE = "./medcat.log"

In [None]:
# Load the pickled training entries and build one Record per entry by expanding
# it as keyword arguments, then report how many records were loaded.
# NOTE(review): pickle.load can execute code embedded in the file; only point
# this at a trusted train.pkl.
with open(TRAINING_DATASET_PATH, "rb") as in_file:
    dataset = [Record(**record) for record in pickle.load(in_file)]
len(dataset)

In [None]:
# Sanity check: show the spans the phrase matcher finds for a lay term.
para_matches = matcher(tokenizer("heart attack"), as_spans=True)
para_matches

In [None]:
# Mean number of characters of physician-note text per record.
# For each record, the text of every note whose category is exactly
# "Physician " (trailing space included) is joined with a blank line and
# measured. The separator is "\n\n", the same one the heading-matching loop
# uses to build its note text; the previous "/n/n" was a typo that added two
# extra characters per separator.
# Raises ZeroDivisionError when `dataset` is empty.
sum(
    len(
        "\n\n".join(
            note.text for note in doc.physician_notes if note.category in {"Physician "}
        )
    )
    for doc in dataset
) / len(dataset)

In [None]:
# For the first ten records, tally how each discharge-summary paragraph heading
# relates to the physician notes of the same record. Every non-empty heading
# lands in exactly one bucket, tested in this order:
#   cui_hits        - the heading has matched CUIs and all occur in the notes
#   partial_cui_hit - some, but not all, of the heading's CUIs occur in the notes
#   strict_match    - no shared CUI, but the lower-cased heading is a substring
#                     of the lower-cased note text
#   no_match        - none of the above
num_headings = 0
num_matches = 0  # initialised here but never updated in this cell
cui_hits = []
partial_cui_hit = []
strict_match = []
no_match = []
for doc in tqdm(dataset[:10]):
    # Lower-cased text of every note whose category is exactly "Physician ".
    physician_texts = [
        note.text.lower()
        for note in doc.physician_notes
        if note.category in {"Physician "}
    ]
    doc_note_text = "\n\n".join(physician_texts)
    doc_note_matches = matcher(tokenizer(doc_note_text), as_spans=True)
    doc_note_cuis = {span.label_ for span in doc_note_matches}

    for para in doc.discharge_summary.bhc_paragraphs:
        heading = para.heading
        if not heading:
            continue
        num_headings += 1

        para_matches = matcher(tokenizer(heading), as_spans=True)
        para_cuis = {span.label_ for span in para_matches}
        shared_cuis = para_cuis & doc_note_cuis

        # Pick the destination list first, then append once.
        if para_cuis and shared_cuis == para_cuis:
            bucket = cui_hits
        elif shared_cuis:
            bucket = partial_cui_hit
        elif heading.lower() in doc_note_text:
            bucket = strict_match
        else:
            bucket = no_match
        bucket.append(heading)

In [None]:
# Convert the bucket sizes into fractions of all headings seen.
# Every division raises ZeroDivisionError when num_headings is 0.
matched_heading_count = len(cui_hits) + len(partial_cui_hit) + len(strict_match)
total_hit_rate = matched_heading_count / num_headings
cui_hit_rate = len(cui_hits) / num_headings
partial_cui_hit_rate = len(partial_cui_hit) / num_headings
strict_match_rate = len(strict_match) / num_headings
no_match_rate = len(no_match) / num_headings

(
    total_hit_rate,
    cui_hit_rate,
    partial_cui_hit_rate,
    strict_match_rate,
    no_match_rate,
)

In [None]:
# Sanity check: show the spans the phrase matcher finds in a longer,
# capitalised clinical term.
matches = matcher(tokenizer("ST Elevation Myocardial Infarction"), as_spans=True)
matches

In [None]:
# Print the label of each span currently held in `matches`, one per line.
for ent in matches:
    print(ent.label_)

In [None]:
# Show the headings that fell into the no_match bucket.
no_match

In [None]:
# if not para_cuis:
#     not_annotated.append(para.heading)
# elif para_cuis.issubset(doc_note_cuis):
#     complete_match.append(para.heading)
# elif para_cuis.intersection(doc_note_cuis):
#     partial_match.append(para.heading)
#     matched_cuis = para_cuis.intersection(doc_note_cuis)
#     if matched_cuis(direct_match_ids).intersection():
#         heading_match.append(para.heading)
# else:
#     no_match.append(para.heading)