In [None]:
from pathlib import Path

import pandas as pd
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from tqdm.notebook import tqdm

from discharge_summaries.preprocessing.preprocess_snomed import Snomed

In [None]:
# All inputs live under a `data` directory that sits next to the notebook's
# working directory.
DATA_DIR = Path.cwd().parent / "data"

# MIMIC-III v1.4 tables as laid out by the physionet.org download.
MIMIC_III_DIR = DATA_DIR / "physionet.org" / "files" / "mimiciii" / "1.4"

# Terminology snapshot folder of the 2023-07-31 SNOMED CT international release.
SNOMED_DIR = DATA_DIR.joinpath(
    "SnomedCT_InternationalRF2_PRODUCTION_20230731T120000Z",
    "Snapshot",
    "Terminology",
)

# Raw RF2 files handed to the Snomed loader.
description_file = SNOMED_DIR / "sct2_Description_Snapshot-en_INT_20230731.txt"
relation_file = SNOMED_DIR / "sct2_Relationship_Snapshot_INT_20230731.txt"

In [None]:
# Build the project's Snomed helper from the raw description and relationship
# snapshot files configured above.
snomed = Snomed.load_from_raw_snomed_files(description_file, relation_file)

In [None]:
# Physician notes, one row per note.
physician_notes_df = pd.read_csv(MIMIC_III_DIR / "physician_notes.csv")

# Item dictionary, restricted to the id -> label mapping needed below.
items_df = pd.read_csv(MIMIC_III_DIR / "D_ITEMS.csv", usecols=["ITEMID", "LABEL"])

# Procedure events with the item label attached. The join is inner, so events
# whose ITEMID has no entry in the item dictionary are dropped.
procedures_df = pd.read_csv(MIMIC_III_DIR / "PROCEDUREEVENTS_MV.csv").merge(
    items_df[["ITEMID", "LABEL"]],
    how="inner",
    on="ITEMID",
)

In [None]:
# Use the second distinct admission id (in order of first appearance in the
# notes table) as the worked example for the rest of the notebook.
distinct_hadm_ids = physician_notes_df["HADM_ID"].unique()
sample_hadm_id = distinct_hadm_ids[1]
sample_hadm_id

In [None]:
# Keep only the notes written during the sample admission.
is_sample_admission_note = physician_notes_df["HADM_ID"] == sample_hadm_id
sample_physician_notes = physician_notes_df[is_sample_admission_note]
sample_physician_notes

In [None]:
# Labelled procedure events recorded for the same sample admission.
is_sample_admission_procedure = procedures_df["HADM_ID"] == sample_hadm_id
sample_procedures = procedures_df.loc[is_sample_admission_procedure]
sample_procedures

In [None]:
# Collect the child CUIs of two SNOMED CT root concepts and merge them into a
# single procedure vocabulary.
# NOTE(review): presumably 71388002 is "Procedure" and 129125009 is
# "Procedure with explicit context" - confirm against the release files.
procedure_root_cuis = snomed.get_child_cuis("71388002")
procedure_context_root_cuis = snomed.get_child_cuis("129125009")
procedure_cuis = procedure_root_cuis.union(procedure_context_root_cuis)
len(procedure_cuis)

In [None]:
# `English`, `PhraseMatcher` and `tqdm` are imported in the notebook's first
# (imports) cell so that all dependencies are visible in one place.

# A bare `English()` pipeline is enough here: only its tokenizer is used.
tokenizer_spacy = English().tokenizer

# Match on the LOWER token attribute so synonym lookup is case-insensitive.
# `attr` is passed by keyword on purpose: in spaCy 2.x the second positional
# parameter of PhraseMatcher was the deprecated `max_length`, so a positional
# "LOWER" would silently leave the matcher on the default ORTH attribute.
snomed_matcher = PhraseMatcher(tokenizer_spacy.vocab, attr="LOWER")

In [None]:
# Index the synonym table by concept id so each concept's names can be pulled
# out in one lookup.
grouped_synonyms = snomed.synonyms_df.groupby("cui")

# Register every procedure concept with the matcher, keyed by its CUI, using
# the tokenized form of each of its synonym names as the patterns.
for cui in tqdm(procedure_cuis):
    synonym_names = grouped_synonyms.get_group(cui)["name"].values
    synonym_patterns = list(tokenizer_spacy.pipe(synonym_names))
    snomed_matcher.add(cui, synonym_patterns)

In [None]:
# Smoke test: a single obvious procedure word should hit the matcher.
doc = tokenizer_spacy("operation")
matches = snomed_matcher(doc)

# The first element of a match is the hashed key; the vocab's string store
# turns it back into the CUI string it was registered under.
first_match_key = matches[0][0]
first_match_cui = tokenizer_spacy.vocab.strings[first_match_key]
snomed.get_preferred_term(first_match_cui)

In [None]:
# Concatenate all notes of the sample admission into one text, separated by a
# blank line, so they can be matched in a single pass.
sample_note_texts = sample_physician_notes["TEXT"].tolist()
joined_sample_physician_notes = "\n\n".join(sample_note_texts)

In [None]:
# Tokenize the concatenated notes and run the SNOMED phrase matcher over them.
doc = tokenizer_spacy(joined_sample_physician_notes)
matches = snomed_matcher(doc)

# Show the preferred term of the first match. Free-text notes are not
# guaranteed to mention any registered procedure synonym, in which case
# `matches` is empty; display nothing (None) instead of raising IndexError so
# the notebook still runs top to bottom.
(
    snomed.get_preferred_term(tokenizer_spacy.vocab.strings[matches[0][0]])
    if matches
    else None
)

In [None]:
# Print (rather than display) the last note of the sample admission so its
# line breaks render as written.
last_note_text = sample_physician_notes["TEXT"].iloc[-1]
print(last_note_text)