https://htmlpreview.github.io/?https://github.com/CogStack/MedCATtutorials/blob/main/notebooks/specialised/Preprocessing_SNOMED_CT.html

In [None]:
import logging
import pickle
import re
from collections import Counter
from pathlib import Path

from medcat.utils.preprocess_snomed import Snomed
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from tqdm.notebook import tqdm

from discharge_summaries.schemas.mimic import Record

In [None]:
# Locations are resolved relative to the parent of the current working directory.
SNOMED_PATH = Path.cwd().parent.joinpath(
    "data", "SnomedCT_InternationalRF2_PRODUCTION_20230731T120000Z"
)
MODEL_DIR = Path.cwd().parent.joinpath("models")
# Show INFO-level (and above) log records in the notebook output.
logging.basicConfig(level=logging.INFO)

In [None]:
# Pickled ground-truth records live in the shared data directory.
DATA_DIR = Path.cwd().parent.joinpath("data")
GT_DATA_PATH = DATA_DIR.joinpath("train.pkl")

# Model pack archive (zip) stored under the sibling "models" directory.
MODEL_PATH = Path.cwd().parent.joinpath(
    "models", "mc_modelpack_snomed_int_16_mar_2022_25be3857ba34bdd5.zip"
)

In [None]:
# NOTE: pickle.load can execute arbitrary code while loading; only use it on
# files that were produced by this project.
with GT_DATA_PATH.open("rb") as in_file:
    # Each unpickled entry is expanded as keyword arguments into a Record.
    gt_dataset = [Record(**fields) for fields in pickle.load(in_file)]
# Number of ground-truth records loaded.
len(gt_dataset)

Preprocessing SNOMED CT for MedCAT

In [None]:
# Point MedCAT's SNOMED preprocessor at the release directory configured in SNOMED_PATH.
# NOTE(review): `sowmed` is presumably a typo for `snomed`; the following cell
# also uses this name, so it is left unchanged here.
sowmed = Snomed(str(SNOMED_PATH))
# NOTE(review): uk_ext is switched on although SNOMED_PATH names an
# "InternationalRF2" release — confirm this flag is intended.
sowmed.uk_ext = True

In [None]:
# Build the concept table from the SNOMED release and preview its first rows.
df = sowmed.to_concept_df()
df.head()

In [None]:
# List the distinct values present in the description_type_ids column.
df["description_type_ids"].unique()

In [None]:
# Disabled alternative filter (restrict by description type instead):
# df_subset = df[df['description_type_ids'].isin(['finding', 'disorder'])]
# Active filter: keep only the rows whose name_status is "A".
df_subset = df.loc[df["name_status"].eq("A")]
# (number of rows kept, number of distinct CUIs among them)
len(df_subset), df_subset["cui"].nunique(dropna=False)

In [None]:
# Preview the first ten rows of the filtered name table.
df_subset.head(10)

In [None]:
# Tokenizer taken from a blank spaCy English pipeline; it is used both to build
# the matcher patterns and to tokenize the headings being matched.
tokenizer = English().tokenizer

In [None]:
# Phrase matcher comparing on the lower-cased token text. One entry is registered
# per CUI, holding a tokenized pattern for every name that df_subset lists for it.
matcher = PhraseMatcher(tokenizer.vocab, attr="LOWER")
for concept_id, concept_rows in tqdm(df_subset.groupby("cui")):
    name_docs = list(tokenizer.pipe(concept_rows["name"]))
    matcher.add(concept_id, name_docs)

In [None]:
# Sanity check: match a single known term and show the label (the key it was
# registered under, i.e. a CUI) of the first hit.
matches = matcher(tokenizer("stroke"), as_spans=True)
# Raises IndexError if the term produced no match at all.
matches[0].label_

In [None]:
# Measure how many non-empty paragraph headings contain at least one name known
# to the phrase matcher, and collect the headings for which nothing matched.
num_hits = 0
num_examples = 0
misses = set()
for sample in tqdm(gt_dataset):
    for para in sample.discharge_summary.bhc_paragraphs:
        if not para.heading:
            continue
        # Drop everything before the first ASCII letter; headings without any
        # letter are kept unchanged.
        heading_match = re.search(r"[a-zA-Z]", para.heading)
        clean_heading = (
            para.heading[heading_match.start() :] if heading_match else para.heading
        )
        matches = matcher(tokenizer(clean_heading), as_spans=True)
        num_examples += 1
        if matches:
            num_hits += 1
        else:
            # Normalised so that case/whitespace variants collapse to one entry.
            misses.add(clean_heading.lower().strip())

# Hit rate over all evaluated headings; NaN when there was nothing to evaluate
# (avoids a ZeroDivisionError on an empty dataset).
num_hits / num_examples if num_examples else float("nan")

In [None]:
# Show the lower-cased, stripped headings for which the matcher found nothing.
misses

In [None]:
# Preview the full (unfiltered) concept table again.
df.head()

In [None]:
# Check whether the abbreviation "uti" on its own matches any registered name.
matcher(tokenizer("uti"), as_spans=True)

In [None]:
# List the concept names containing "UTI" as a standalone word (any letter case).
# A word-boundary regex is used rather than the space-padded literal " UTI " so
# that names starting or ending with the abbreviation, or with it adjacent to
# punctuation, are listed as well. na=False keeps the mask strictly boolean if
# a name is missing.
df[df["name"].str.contains(r"\bUTI\b", case=False, regex=True, na=False)]

In [None]:
# Rows whose name_status is "P", indexed by CUI so that a row can be looked up
# directly from a match label.
df_p_names = df.loc[df["name_status"].eq("P")].set_index("cui")
len(df_p_names)

In [None]:
# For every matcher hit in a paragraph heading, collect the description_type_ids
# value of the matched CUI's row in df_p_names.
type_ids = [
    df_p_names.loc[match.label_].description_type_ids
    for sample in tqdm(gt_dataset)
    for para in sample.discharge_summary.bhc_paragraphs
    # Skip missing/empty headings, consistent with the hit-rate loop: the
    # tokenizer needs a string and an empty heading cannot match anything.
    if para.heading
    for match in matcher(tokenizer(para.heading), as_spans=True)
]

In [None]:
# Tally how often each description_type_ids value was matched, most frequent first.
counter = Counter(type_ids)
counter.most_common()

In [None]:
# (headings evaluated, distinct unmatched headings, headings with a match).
# len(misses) counts distinct normalised headings, so it can be smaller than
# num_examples - num_hits.
num_examples, len(misses), num_hits

In [None]:
# Same set of unmatched headings as displayed after the hit-rate loop; it has
# not been modified since.
misses

In [None]:
# Disabled experiment (not executed): token-level Matcher as an alternative to
# the PhraseMatcher used above.
# from spacy.matcher import Matcher

# token_matcher = Matcher(tokenizer.vocab)

In [None]:
# Disabled experiment (not executed): register one fuzzy, lower-cased token
# pattern per name for every CUI.
# for cui, group_df in tqdm(df_subset.groupby("cui")):
#     pattern = [[{"LOWER": {"FUZZY": token.text}} for token in pattern_doc]
#     for pattern_doc in tokenizer.pipe(group_df["name"])]
#     token_matcher.add(cui, pattern)

In [None]:
# Disabled experiment (not executed): the heading hit-rate loop repeated with
# token_matcher in place of the phrase matcher, and without heading clean-up.
# num_hits = 0
# num_examples = 0
# misses = set()
# for sample in tqdm(gt_dataset):
#     for para in sample.discharge_summary.bhc_paragraphs:
#         if not para.heading:
#             continue
#         clean_heading = para.heading
#         # heading_match = re.search(r'[a-zA-Z]', para.heading)
#         # clean_heading = para.heading[heading_match.start():] if heading_match else para.heading
#         # clean_heading = clean_heading.replace('/', ' ')
#         matches = token_matcher(tokenizer(clean_heading), as_spans=True)
#         # if not(any(word in clean_heading.lower() for word in {"fen", "communication", "access", "code"})):
#         num_examples += 1
#         if matches:
#             num_hits += 1
#         else:
#             misses.add(clean_heading.lower().strip())

# num_hits / num_examples