## Extract BHCs

1. Load Discharge Dataset
2. Extract BHCs
3. Run Snomed Phrase Matcher + evaluate misses
4. Tune Snomed Phrase Matcher
5. Save BHCs and Tuned Snomed Phrase Matcher

In [None]:
import json
import pickle
import re
from collections import Counter
from pathlib import Path
from typing import List, Tuple

import pandas as pd
from tqdm.notebook import tqdm

from discharge_summaries.schemas.mimic import BHC, ProblemSection
from discharge_summaries.snomed.lookup import SnomedLookup

In [None]:
# All data lives under ``<parent of the current working directory>/data``.
# MIMIC-III v1.4 files, laid out as in a physionet.org download.
MIMIC_III_DIR = Path.cwd().parent.joinpath(
    "data", "physionet.org", "files", "mimiciii", "1.4"
)
# Destination for the extracted BHCs (written at the end of the notebook).
BHC_FPATH = MIMIC_III_DIR.joinpath("BHCS.json")

# SNOMED resources: the pickled phrase matcher loaded below and the tuned copy
# saved at the end of the notebook.
SNOMED_DIR = Path.cwd().parent.joinpath("data", "snomed")
PHRASE_MATCHER_FPATH = SNOMED_DIR.joinpath("snomed_phrase_matcher.pkl")
TUNED_PHRASE_MATCHER_FPATH = SNOMED_DIR.joinpath("tuned_snomed_phrase_matcher.pkl")

## 1. Load Discharge Summaries

In [None]:
# Read the discharge summaries table; the "BHC" and "HADM_ID" columns are the
# ones consumed further down.
discharge_summary_df = pd.read_csv(
    MIMIC_III_DIR.joinpath("discharge_summaries_mimic.csv")
)

In [None]:
# Eyeball the first five BHC texts, each followed by a row of asterisks.
for text in discharge_summary_df["BHC"].iloc[:5]:
    print(text, "*" * 80, sep="\n")

## 2. Extract BHCs

In [None]:
def get_paragraph_splitting_regexes() -> Tuple[re.Pattern, re.Pattern]:
    """Build the two regexes used to break a BHC into headed paragraphs.

    A heading is: optional leading non-letter characters (bullets, numbering),
    then text starting with a letter and staying on one line, terminated by
    ``-`` or ``:`` followed by a space or newline.

    Returns:
        A ``(paragraph_split_regex, heading_grouping_regex)`` tuple. The first
        matches a blank line that is followed by a heading (the heading itself
        is only looked ahead at, so it is kept in the split result). The second
        matches a paragraph from its start and captures the heading in group 1
        and the rest of the paragraph, newlines included, in group 2.
    """
    leading_non_letters = "[^A-Za-z]*"
    heading_body = "[A-Za-z][^\n]*?"  # lazy, so the first delimiter ends the heading
    delimiter = "[-:][ \n]"
    remainder = ".*"

    heading_lookahead = "(?=" + leading_non_letters + heading_body + delimiter + ")"
    splitter = re.compile("\n\n" + heading_lookahead)

    grouper = re.compile(
        "^" + leading_non_letters + "(" + heading_body + ")" + delimiter
        + "(" + remainder + ")",
        flags=re.DOTALL,  # let the paragraph text span multiple lines
    )
    return splitter, grouper


def extract_bhcs_from_discharge_summaries(
    discharge_summary_df: pd.DataFrame,
) -> List[BHC]:
    """Split every row's BHC text into a preamble and headed problem sections.

    The ``BHC`` text is split into paragraphs at blank lines that are followed
    by a heading. The first paragraph is taken as the assessment-and-plan
    preamble when it has no heading, or when its heading contains
    "assessment" or "a/p" (case-insensitive); otherwise the preamble is empty
    and the first paragraph is treated as a problem section. Every remaining
    paragraph becomes a ``ProblemSection``; a paragraph without a recognisable
    heading gets ``heading=""`` and its whole stripped text as ``text``.

    Args:
        discharge_summary_df: Table with a ``BHC`` column (converted with
            ``str``) and a ``HADM_ID`` column (converted with ``int``).

    Returns:
        One ``BHC`` per row, in row order.
    """
    paragraph_split_regex, heading_grouping_regex = get_paragraph_splitting_regexes()
    bhcs = []
    for _, discharge_summary in tqdm(
        discharge_summary_df.iterrows(), total=len(discharge_summary_df)
    ):
        full_text = str(discharge_summary["BHC"])
        paragraphs = paragraph_split_regex.split(full_text)

        first_match = heading_grouping_regex.match(paragraphs[0])
        first_heading = first_match.group(1).strip().lower() if first_match else ""
        first_is_preamble = (
            first_match is None
            or "assessment" in first_heading
            or "a/p" in first_heading
        )
        assessment_and_plan = paragraphs[0] if first_is_preamble else ""
        # Skip the first paragraph based on the classification above, not on
        # whether its text is non-empty: when the text starts with a blank line
        # before the first heading, the split yields an empty leading paragraph
        # that must not be emitted as a heading-less problem section.
        problem_paragraphs = paragraphs[1:] if first_is_preamble else paragraphs

        problem_sections = []
        for para in problem_paragraphs:
            match = heading_grouping_regex.match(para)
            if match:
                heading, text = match.group(1).strip(), match.group(2).strip()
            else:
                heading, text = "", para.strip()
            problem_sections.append(ProblemSection(heading=heading, text=text))

        bhcs.append(
            BHC(
                hadm_id=int(discharge_summary["HADM_ID"]),
                full_text=full_text,
                assessment_and_plan=assessment_and_plan,
                problem_sections=problem_sections,
            )
        )
    return bhcs

In [None]:
# Parse every discharge summary into a BHC object (one per row).
bhcs = extract_bhcs_from_discharge_summaries(
    discharge_summary_df=discharge_summary_df
)

In [None]:
# A BHC is kept only if it has at least one problem section and every section
# has a non-empty heading; all others are set aside for inspection.
valid_bhcs, incorrect_format_bhcs = [], []
for bhc in bhcs:
    destination = (
        valid_bhcs
        if bhc.problem_sections
        and all(section.heading for section in bhc.problem_sections)
        else incorrect_format_bhcs
    )
    destination.append(bhc)
# Cell output: fraction and absolute number of BHCs that were kept.
len(valid_bhcs) / len(bhcs), len(valid_bhcs)

In [None]:
# Show the raw text of the first ten rejected BHCs, separated by asterisk rows.
for bhc in incorrect_format_bhcs[:10]:
    print(bhc.full_text, "*" * 80, sep="\n")

In [None]:
# Show how the first five accepted BHCs were parsed: raw text, the
# assessment-and-plan preamble, then each problem section's heading and text.
for bhc in valid_bhcs[:5]:
    print("Full Text:", bhc.full_text, "---", sep="\n")
    print("Assessment and Plan:", bhc.assessment_and_plan, "Sections", sep="\n")
    for section in bhc.problem_sections:
        print("---", section.heading, section.text, sep="\n")
    print("*" * 80)

## 3. Run 'Vanilla' Snomed Phrase Matcher

In [None]:
# Load the previously built ("vanilla") SNOMED phrase matcher.
# NOTE: unpickling executes arbitrary code, so this file must come from a
# trusted source (here it is a locally stored project artifact).
phrase_matcher_bytes = PHRASE_MATCHER_FPATH.read_bytes()
snomed_phrase_matcher = pickle.loads(phrase_matcher_bytes)

In [None]:
# Flatten all problem-section headings of the accepted BHCs into one list.
headings = [
    section.heading for bhc in valid_bhcs for section in bhc.problem_sections
]
# Run the matcher over every heading and collect (lower-cased) the headings for
# which it returned nothing.
headings_snomed_spans = snomed_phrase_matcher.pipe(headings)
missed_headings = []
for heading, cuis in zip(headings, headings_snomed_spans):
    if not cuis:
        missed_headings.append(heading.lower())
# Cell output: fraction of headings with at least one match.
1 - (len(missed_headings) / len(headings))

In [None]:
# List the missed headings that occur at least 15 times, most frequent first.
for value, count in Counter(missed_headings).most_common():
    if count < 15:
        continue
    print(f"{value}: {count} times")

## 4. Tune Snomed Phrase Matcher

In [None]:
# Load the SNOMED lookup from SNOMED_DIR. It is used below to print preferred
# terms (cui_to_preferred_term) and is passed to the matcher's add_parent_cui.
snomed_lookup = SnomedLookup.load(SNOMED_DIR)

Extra parent CUIs that do not fall under the previously defined parent CUIs

In [None]:
# Additional parent CUIs to register with the matcher. In the synonym list
# further down they are the CUIs mapped to "PPX", "Access" and "FEN"/"Nutrition".
extra_parent_cuis = {169443000, 311788003, 384760004}
# Sanity check: show the preferred term of each extra parent CUI.
preferred_terms = [
    snomed_lookup.cui_to_preferred_term[cui] for cui in extra_parent_cuis
]
print(preferred_terms)

In [None]:
# Register each extra parent CUI with the phrase matcher (tqdm shows progress).
# NOTE(review): presumably add_parent_cui also adds the terms of the parent's
# descendants found via snomed_lookup — confirm against the matcher's code.
for parent_cui in tqdm(extra_parent_cuis):
    snomed_phrase_matcher.add_parent_cui(parent_cui, snomed_lookup)

Add synonyms for the obvious misses: any disorder/finding heading that occurs at least 15 times in the train set

In [None]:
# (SNOMED CUI, heading synonym) pairs for frequent headings that the vanilla
# matcher missed. The next cell lower-cases each synonym and registers it with
# the matcher under str(cui). One CUI may appear with several synonyms
# (e.g. "Pulmonary", "Respiratory", "Resp", "Pulm").
# Commented-out ('heading', count) entries appear to be frequent misses that
# were deliberately left without a mapping.
cui_and_missing_synonyms = [
    (365870005, "Code"),
    (311788003, "Access"),  # CUI is also added as a parent cui
    (384760004, "FEN"),  # CUI is also added as a parent cui
    (118231006, "Communication"),
    (301113001, "Rhythm"),
    (106063007, "Pump"),
    (169443000, "PPX"),  # CUI is also added as a parent cui
    (73211009, "Diabetes"),
    (102957003, "Neuro"),
    (251015000, "Coronaries"),
    (36456004, "Dispo"),
    (384760004, "Nutrition"),
    (106063007, "CV"),
    (903081000000107, "Contact"),
    (36456004, "Disposition"),
    (118231006, "Comm"),
    (160931000119108, "Transaminitis"),
    (106048009, "Pulmonary"),  # ? (marked with "?" by the original author)
    (44054006, "DM2"),
    (299691001, "Heme"),
    (49436004, "Afib"),
    (401314000, "NSTEMI"),
    (106176003, "Endocrine"),
    (118238000, "Renal"),  # ? (marked with "?" by the original author)
    (19943007, "Cirrhosis"),
    (106048009, "Respiratory"),
    (116367006, "Psych"),
    (299691001, "Hematology"),
    (419284004, "AMS"),
    (301095005, "Cardiac"),
    (74474003, "GIB"),
    (166603001, "Elevated LFTs"),
    (106048009, "Resp"),
    (44054006, "DMII"),
    (301120008, "EKG changes"),
    # unmapped: ('micu course', 30),
    (444931001, "Elevated troponin"),
    (106176003, "Endo"),
    (191480000, "ETOH withdrawal"),
    (37372002, "UGIB"),
    # unmapped: ('goals of care', 77),
    # unmapped: ('last name (un)', 25),
    (401303003, "STEMI"),
    (235856003, "ESLD"),
    # unmapped: ('anticoagulation', 24),
    (398137007, "CRI"),
    (106048009, "Pulm"),
    (233604007, "PNA"),
    (106063007, "Cardiovascular"),
    (284465006, "Social"),
    (405729008, "BRBPR"),  # 5 letter acronym
    (237840007, "Anion gap"),  # 3 different options here so chose parent
    (721104000, "Urosepsis"),
    # NOTE(review): "Delerium" is presumably the misspelling as it appears in
    # the headings, kept on purpose — confirm before "correcting" it.
    (2776000, "Delerium"),
]

In [None]:
# Register every extra synonym with the matcher's underlying phrase matcher:
# the lower-cased synonym is run through the matcher's own nlp pipeline and the
# resulting doc(s) are added as patterns under the CUI (as a string key).
for cui, missing_synonym in cui_and_missing_synonyms:
    synonym_docs = list(snomed_phrase_matcher._nlp.pipe([missing_synonym.lower()]))
    snomed_phrase_matcher._phrase_matcher.add(str(cui), synonym_docs)

Re-evaluate misses with new fixes

In [None]:
# Re-run the (now tuned) matcher over the same headings and collect, lower-cased,
# the headings that still get no match.
headings_snomed_spans_v2 = snomed_phrase_matcher.pipe(headings)
missed_headings_v2 = []
for heading, cuis in zip(headings, headings_snomed_spans_v2):
    if not cuis:
        missed_headings_v2.append(heading.lower())
# Cell output: fraction of headings with at least one match after tuning.
1 - len(missed_headings_v2) / len(headings)

In [None]:
# List the headings still missed at least 15 times, most frequent first.
for value, count in Counter(missed_headings_v2).most_common():
    if count < 15:
        continue
    print(f"{value}: {count} times")

In [None]:
# for bhc in valid_bhcs[:1000]:
#     for idx, para in enumerate(bhc.problem_sections):
#         if para.heading.lower() == "cri":
#             print(idx, para)

## 5. Save BHCs and tuned snomed phrase matcher to file

In [None]:
# Serialise the accepted BHCs as a JSON list of dicts and write it to BHC_FPATH.
serialised_bhcs = json.dumps([bhc.dict() for bhc in valid_bhcs])
BHC_FPATH.write_text(serialised_bhcs)

In [None]:
# Pickle the tuned phrase matcher (extra parent CUIs and synonyms included) and
# write it next to the original one.
tuned_matcher_bytes = pickle.dumps(snomed_phrase_matcher)
TUNED_PHRASE_MATCHER_FPATH.write_bytes(tuned_matcher_bytes)