In [None]:
import json
from datetime import datetime
from pathlib import Path
from typing import Dict

import openpyxl
import pandas as pd
from docx import Document
from openpyxl.styles import Font

from discharge_summaries.schemas.mimic import PhysicianNote
from discharge_summaries.schemas.rcp_guidelines import RCPGuidelines
from discharge_summaries.utils.deduplicate import deduplicate_physician_notes

In [None]:
# All paths are resolved relative to the directory the notebook is run from.
# Root of the GPT outputs; the export loop walks its entries, one per admission.
GPT_OUTPUT_DIR = Path.cwd() / "output_eval_v3"
# Root under which the per-admission Excel and Word files are written.
HUMAN_OUTPUT_DIR = Path.cwd() / "output_human_v3"
# Only referenced by a commented-out template export further down.
EXAMPLE_DIR = Path.cwd() / "examples"

# Location of the MIMIC-III v1.4 files, one level above the working directory.
MIMIC_III_DIR = (
    Path.cwd().parent / "data" / "physionet.org" / "files" / "mimiciii" / "1.4"
)
PHYSICIAN_NOTE_FPATH = MIMIC_III_DIR / "physician_notes_mimic.csv"
# NOTE(review): not used by the cells visible here (the export loop iterates
# GPT_OUTPUT_DIR instead) - confirm whether this list is still needed.
HADM_IDS = [154417, 115949, 103411, 157928, 179134]

In [None]:
# Create the output root up front; exist_ok=True keeps re-runs from failing.
HUMAN_OUTPUT_DIR.mkdir(exist_ok=True)

In [None]:
# Load all physician notes once; they are filtered by HADM_ID when each
# admission's Word document is built.
notes_df = pd.read_csv(PHYSICIAN_NOTE_FPATH)

## Helper functions

In [None]:
def response_json_to_df(response_json: Dict) -> pd.DataFrame:
    """Flatten a nested discharge-summary dict into Section/Field/Value rows.

    Each top-level key is a section. Two section shapes are expanded:

    * dict of ``field -> str`` or ``field -> list``: one row per string, one
      row per list item, or a single blank-valued row for an empty list;
    * list of dicts of ``field -> str``: one row per entry, with the item's
      index appended to the field name (``"<field> <idx>"``).

    Sections holding any other type contribute no data rows. A blank
    ``["", "", ""]`` separator row is appended after every section.

    Args:
        response_json: Mapping of section name to section content.

    Returns:
        DataFrame with columns ``Section``, ``Field`` and ``Value``.

    Raises:
        NotImplementedError: If a field value or list item has an
            unsupported type.
    """
    rows = []
    for section, fields_and_values in response_json.items():
        if isinstance(fields_and_values, dict):
            for field, value in fields_and_values.items():
                if isinstance(value, str):
                    rows.append([section, field, value])
                elif isinstance(value, list):
                    if not value:
                        # Keep the field visible even when it has no items.
                        rows.append([section, field, ""])
                    for item in value:
                        rows.append([section, field, item])
                else:
                    raise NotImplementedError(
                        f"Unsupported value type {type(value).__name__} "
                        f"for field {field!r} in section {section!r}"
                    )
        elif isinstance(fields_and_values, list):
            if not fields_and_values:
                # An empty list section has no field of its own. Emit a blank
                # field rather than reading the loop variable of a previous
                # section, which is unbound if no dict section came first.
                rows.append([section, "", ""])
            for item_idx, item in enumerate(fields_and_values):
                if isinstance(item, dict):
                    for field, value in item.items():
                        if isinstance(value, str):
                            rows.append([section, f"{field} {item_idx}", value])
                        else:
                            raise NotImplementedError(
                                f"Unsupported value type {type(value).__name__} "
                                f"for field {field!r} in section {section!r}"
                            )
                else:
                    raise NotImplementedError(
                        f"Unsupported item type {type(item).__name__} "
                        f"in section {section!r}"
                    )
        # Blank separator row between sections.
        rows.append(["", "", ""])
    return pd.DataFrame(rows, columns=["Section", "Field", "Value"])

In [None]:
def format_rows_df(rows_df: pd.DataFrame) -> pd.DataFrame:
    """Prepare the flattened rows for display in the review spreadsheet.

    * Only the first row of each section keeps its ``Section`` label.
    * Only the first row of each field *within a section* keeps its ``Field``
      label, so a field name reused by a later section is still labelled.
    * Missing values become empty strings.
    * Labels are humanised: underscores become spaces and words are
      title-cased.

    The input frame is left untouched; a formatted copy is returned.

    Args:
        rows_df: Frame with string ``Section``, ``Field`` and ``Value``
            columns.

    Returns:
        A new DataFrame with the same columns and row order.
    """
    formatted_df = rows_df.copy()

    # Compute both masks before blanking anything: the field mask needs the
    # original section labels to tell sections apart.
    repeated_section = formatted_df.duplicated(subset=["Section"], keep="first")
    repeated_field = formatted_df.duplicated(
        subset=["Section", "Field"], keep="first"
    )
    formatted_df.loc[repeated_section, "Section"] = ""
    formatted_df.loc[repeated_field, "Field"] = ""

    formatted_df = formatted_df.fillna("")
    for label_col in ("Section", "Field"):
        formatted_df[label_col] = formatted_df[label_col].map(
            lambda label: label.replace("_", " ").title()
        )
    return formatted_df


def fill_empty_values(rows_df: pd.DataFrame) -> pd.DataFrame:
    """Mark labelled rows that have no value as not found.

    Rows whose ``Field`` is non-empty but whose ``Value`` is the empty string
    get the placeholder text. Rows with a blank ``Field`` are left alone.
    The frame is modified in place and the same object is returned.
    """
    has_field_label = rows_df["Field"] != ""
    has_blank_value = rows_df["Value"] == ""
    needs_placeholder = has_field_label & has_blank_value
    rows_df.loc[needs_placeholder, "Value"] = "Information not found in notes"
    return rows_df

In [None]:
def format_excel(ws):
    """Lay out a discharge-summary worksheet for human evaluation, in place.

    Inserts a title row above the existing content, adds the evaluation
    column headings on row 2 starting at column E, bolds column A and both
    header rows, sets column widths and enables text wrapping in every cell.

    Args:
        ws: The openpyxl worksheet to modify.
    """
    ws.insert_rows(1)
    ws.cell(row=1, column=1, value="GPT Discharge Summary")
    ws.cell(row=1, column=5, value="Evaluation")

    eval_headings = [
        "Missed- Severe",
        "Missed- Minor",
        "Added- Hallucination",
        "Added- Not relevant",
        "Explanation of Error",
        "Comments",
    ]
    for idx, eval_heading in enumerate(eval_headings):
        ws.cell(row=2, column=5 + idx, value=eval_heading)

    for cell in ws["A"]:
        cell.font = Font(bold=True, sz=11)
    # Style the header rows after column A so their larger sizes win for A1
    # and A2. Iterate real cells (not values_only) so the font can be set.
    for header_row, font_size in ((2, 12), (1, 14)):
        for row in ws.iter_rows(min_row=header_row, max_row=header_row):
            for cell in row:
                cell.font = Font(bold=True, sz=font_size)

    for column in ws.columns:
        ws.column_dimensions[column[0].column_letter].width = 40
    # Column C is given twice the width of the other columns.
    ws.column_dimensions["C"].width = 80

    for row in ws.iter_rows():
        for cell in row:
            cell.alignment = openpyxl.styles.Alignment(wrap_text=True)

In [None]:
def write_response_json_to_excel(response_json_fpath: Path, excel_fpath: Path):
    """Convert one discharge-summary JSON file into a formatted Excel sheet.

    The JSON is loaded, passed through ``RCPGuidelines`` and turned back into
    a dict, flattened to rows, formatted, and written to ``excel_fpath``. The
    workbook is then reopened so the evaluation layout can be applied and
    saved over the same file.

    Args:
        response_json_fpath: Path of the discharge-summary JSON to read.
        excel_fpath: Path of the ``.xlsx`` file to create or overwrite.
    """
    raw_summary = json.loads(response_json_fpath.read_text())
    summary_fields = RCPGuidelines(**raw_summary).dict()

    review_table = fill_empty_values(
        format_rows_df(response_json_to_df(summary_fields))
    )
    review_table.to_excel(excel_fpath, index=False)

    # Styling is applied with openpyxl after pandas has written the data.
    workbook = openpyxl.load_workbook(excel_fpath)
    format_excel(workbook.active)
    workbook.save(excel_fpath)

In [None]:
def hadm_id_notes_df_to_word(notes_df, hadm_id, deduplicate=False) -> Document:
    """Build a Word document containing the physician notes of one admission.

    Args:
        notes_df: Notes table with ``HADM_ID``, ``DESCRIPTION``, ``CHARTTIME``
            and ``TEXT`` columns.
        hadm_id: Admission identifier compared against ``HADM_ID``.
        deduplicate: When true, the notes are passed through
            ``deduplicate_physician_notes`` before being written.

    Returns:
        The document: a level-1 title followed by, for each note, a level-2
        heading (title and timestamp as ``DD-MM-YYYY HH:MM:SS``) and the note
        text as a paragraph.
    """
    admission_rows = notes_df[notes_df["HADM_ID"] == hadm_id]

    physician_notes = []
    for _, note_row in admission_rows.iterrows():
        physician_notes.append(
            PhysicianNote(
                hadm_id=note_row["HADM_ID"],
                title=note_row["DESCRIPTION"],
                timestamp=note_row["CHARTTIME"],
                text=note_row["TEXT"],
            )
        )
    if deduplicate:
        physician_notes = deduplicate_physician_notes(physician_notes)

    doc = Document()
    doc.add_heading(f"Physician Notes Patient ID {hadm_id}", level=1)

    for note in physician_notes:
        # Timestamps are parsed as "YYYY-MM-DD HH:MM:SS" and shown day-first.
        charted_at = datetime.strptime(note.timestamp, "%Y-%m-%d %H:%M:%S")
        heading_stamp = charted_at.strftime("%d-%m-%Y %H:%M:%S")
        doc.add_heading(f"{note.title}: {heading_stamp}", level=2)
        doc.add_paragraph(note.text)

    return doc

In [None]:
# One-off repair of the stored GPT output for admission 179544: its
# "GP_practice" section is overwritten with an empty GP name and the file is
# rewritten in place.
# NOTE(review): presumably the original section failed RCPGuidelines
# validation - confirm. This cell raises if the file does not exist.
faulty_json_path = GPT_OUTPUT_DIR / "179544" / "discharge_summary.json"
faulty_json = json.loads(faulty_json_path.read_text())
faulty_json["GP_practice"] = {"GP_name": ""}
faulty_json_path.write_text(json.dumps(faulty_json, indent=4))

## Write to file

In [None]:
# For every admission directory under GPT_OUTPUT_DIR, write the evaluation
# spreadsheet and the physician-notes Word document into HUMAN_OUTPUT_DIR.
# Sorting gives a stable processing order across runs.
for hadm_id_output_dir in sorted(GPT_OUTPUT_DIR.iterdir()):
    # iterdir() also yields stray files (e.g. .DS_Store); only directories
    # hold a discharge summary.
    if not hadm_id_output_dir.is_dir():
        continue
    hadm_id = hadm_id_output_dir.stem

    human_output_hadm_id_dir = HUMAN_OUTPUT_DIR / hadm_id
    human_output_hadm_id_dir.mkdir(exist_ok=True)
    # Optional: also write a blank template for the reviewer.
    # write_response_json_to_excel(
    #     (EXAMPLE_DIR / "blank.json"),
    #     (human_output_hadm_id_dir / f"template_{hadm_id}.xlsx"),
    # )

    write_response_json_to_excel(
        (hadm_id_output_dir / "discharge_summary.json"),
        (human_output_hadm_id_dir / f"discharge_summary_{hadm_id}.xlsx"),
    )

    # The directory name is a string; HADM_ID is matched as an integer.
    doc = hadm_id_notes_df_to_word(notes_df, int(hadm_id), deduplicate=False)
    doc.save(human_output_hadm_id_dir / f"physician_notes_{hadm_id}.docx")

    # Optional: also write a deduplicated version of the notes.
    # doc = hadm_id_notes_df_to_word(notes_df, int(hadm_id), deduplicate=True)
    # doc.save(
    #     human_output_hadm_id_dir / f"physician_notes_deduplicated_{hadm_id}.docx"
    # )