In [None]:
import random
from pathlib import Path

import pandas as pd
import tiktoken
from dotenv import load_dotenv
from spacy.lang.en import English

from discharge_summaries.preprocessing.preprocess_snomed import Snomed
from discharge_summaries.structured_data_extractors.mimic import (
    MIMICStructuredDataExtractor,
)

In [None]:
# Call python-dotenv's loader before the rest of the notebook runs.
# NOTE(review): presumably this populates os.environ from a local .env file
# (e.g. API credentials) — confirm which variables are expected.
load_dotenv()

In [None]:
# Directory holding the MIMIC-III 1.4 files, located under the parent of the
# current working directory.
MIMIC_III_DIR = Path.cwd().parent.joinpath(
    "data", "physionet.org", "files", "mimiciii", "1.4"
)
# Presumably Azure OpenAI deployment settings; neither constant is referenced
# in the cells visible in this notebook.
# NOTE(review): Azure's model family is usually spelled "gpt-35-turbo-16k" —
# confirm that "gpt-3-turbo-16k" matches the actual deployment name.
AZURE_ENGINE = "gpt-3-turbo-16k"
AZURE_API_VERSION = "2023-07-01-preview"
# tiktoken encoding object; not to be confused with the lowercase spaCy
# `tokenizer` that a later cell creates.
TOKENIZER = tiktoken.get_encoding("cl100k_base")
# Directory holding the SNOMED data, located under the parent of the current
# working directory.
SNOMED_DIR = Path.cwd().parent.joinpath("data", "snomed")

In [None]:
# Read the physician-note and discharge-summary CSVs stored in the MIMIC-III
# directory into DataFrames.
physician_notes_df = pd.read_csv(MIMIC_III_DIR.joinpath("physician_notes.csv"))
discharge_summary_df = pd.read_csv(MIMIC_III_DIR.joinpath("discharge_summaries.csv"))

In [None]:
# Build the project's MIMIC structured-data extractor over the MIMIC-III
# directory; later cells call its `complete_prsb_discharge_summary` method and
# read its `prescriptions_df` table.
structured_data_extractor = MIMICStructuredDataExtractor(MIMIC_III_DIR)

In [None]:
# Reproducibly pick one admission to inspect: shuffle the distinct HADM_ID
# values in place with a fixed seed (23), then take the entry at index 5.
hadm_ids = discharge_summary_df["HADM_ID"].unique()
seeded_rng = random.Random(23)
seeded_rng.shuffle(hadm_ids)
hadm_id = hadm_ids[5]
hadm_id

In [None]:
# Keep only the physician-note rows whose HADM_ID matches the selected
# admission, then show how many there are.
physician_notes_hadm_id_df = physician_notes_df.loc[
    physician_notes_df["HADM_ID"].eq(hadm_id)
]
physician_notes_hadm_id_df.shape[0]

In [None]:
# Ask the extractor for the selected admission's discharge summary; later cells
# read its `medications_and_medical_devices` section.
# NOTE(review): presumably a PRSB-structured summary, judging by the method
# name — confirm against the extractor's definition.
structured_data_summary = structured_data_extractor.complete_prsb_discharge_summary(
    hadm_id
)

In [None]:
# spaCy tokenizer taken from a blank `English` language object. Later cells pass
# it to the SNOMED phrase matcher and use it to tokenize text. It is a different
# object from the tiktoken `TOKENIZER` constant defined in the config cell.
tokenizer = English().tokenizer

In [None]:
# Prescription rows whose HADM_ID matches the selected admission.
hadm_id_drugs = structured_data_extractor.prescriptions_df.loc[
    lambda prescriptions: prescriptions["HADM_ID"] == hadm_id
]

In [None]:
# Show this admission's rows for one specific drug, ordered by STARTDATE.
(
    hadm_id_drugs
    .loc[hadm_id_drugs["DRUG"].eq("HYDROmorphone (Dilaudid)")]
    .sort_values(by="STARTDATE")
)

In [None]:
# Display the summary's `medication_change_summary_cluster` field for manual
# inspection.
structured_data_summary.medications_and_medical_devices.medication_change_summary_cluster

In [None]:
# Distinct DRUG values found in this admission's prescription rows.
# NOTE: despite its name, `hadm_id_drugs_list` is a set; the sorted list below
# is only displayed, not stored.
hadm_id_drugs_list = set(hadm_id_drugs["DRUG"].to_list())
sorted(hadm_id_drugs_list)

In [None]:
# Display the summary's `medication_change_summary_cluster` field again.
# NOTE(review): this is the same expression already displayed by an earlier
# cell in this notebook; one of the two is likely redundant.
structured_data_summary.medications_and_medical_devices.medication_change_summary_cluster

In [None]:
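# Disabled experiment: map each drug name to the label of a SNOMED match.
# NOTE: `whole_snomed_matcher` is only created in a later cell, so this cannot
# run at this position in the notebook without reordering.
# drug_to_cui = {drug: int(span.label_) for drug in hadm_id_drugs_list for span in whole_snomed_matcher(tokenizer(drug), as_spans=True)}
# drug_to_cui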

In [None]:
# Print the TEXT of every physician note for the selected admission, one after
# another, for manual reading.
for note in physician_notes_hadm_id_df["TEXT"].tolist():
    print(note)

In [None]:
# Load the project's `Snomed` object from the local SNOMED directory.
snomed = Snomed.load(SNOMED_DIR)

In [None]:
# Build a phrase matcher from the SNOMED data for the "SNOMED CT Concept" entry,
# using the spaCy tokenizer.
# NOTE(review): presumably this covers every concept under the SNOMED root —
# confirm against `Snomed.get_phrase_matcher`.
whole_snomed_matcher = snomed.get_phrase_matcher({"SNOMED CT Concept"}, tokenizer)

In [None]:
# Quick manual check of the matcher on a single drug name; the cell displays
# whatever the matcher returns for the tokenized text.
doc = tokenizer("Paracetamol")
whole_snomed_matcher(doc)

In [None]:
# for drug in hadm_id_drugs_list:
#     drug_phrase_matcher = snomed.get_phrase_matcher({drug}, tokenizer)
#     for note in physician_notes_hadm_id_df["TEXT"]:
#         if drug_phrase_matcher(tokenizer(note), as_spans=True):
#             print(drug)

In [None]:
# for drug in hadm_id_drugs_list:
#     for note in physician_notes_hadm_id_df["TEXT"]:
#         if drug.lower() in note.lower():
#             print(drug)