In [None]:
import json
import re
from pathlib import Path
from typing import Dict, List

import openpyxl
import PyPDF2
import tiktoken

from discharge_summaries.schemas.prsb_guidelines import (
    ArrayElement,
    ClusterElement,
    Element,
    RecordElement,
    Row,
    Section,
    clean_text,
    to_camel_case,
)

In [None]:
# Every guideline artefact lives in a "guidelines" folder that sits beside the
# current working directory (i.e. inside the parent of the kernel's cwd).
GUIDELINES_DIR = Path.cwd().parent.joinpath("guidelines")

# Input: the guideline spreadsheet loaded with openpyxl further down.
GUIDELINES_EXCEL_PATH = GUIDELINES_DIR.joinpath(
    "eDischarge-Summary-v2.1-1st-Feb-21.xlsx"
)

# Output: the JSON schema written by the final cell of this notebook.
GUIDELINES_PYDANTIC_MODEL_PATH = GUIDELINES_DIR.joinpath(
    "eDischarge-Summary-v2.1-1st-Feb-21_pydantic.json"
)

# Input: the implementation guidance PDF parsed with PyPDF2 further down.
GUIDELINES_IMPLEMENTATION_PDF_PATH = GUIDELINES_DIR.joinpath(
    "eDischarge-Summary-Maintenance-Release-Implementation-Guidance-Report-v2.1-23.1.19.pdf"
)
# Tokenizer for the "cl100k_base" encoding; in this notebook it is only used to
# count how many tokens the serialised JSON schema occupies.
TOKENIZER = tiktoken.get_encoding("cl100k_base")

In [None]:
# Open the guideline workbook and read "Sheet1" as plain cell values.
# The original note on this cell states that the first 4 rows are headers.
# NOTE(review): openpyxl's min_row is 1-based and inclusive, so min_row=4 skips
# only worksheet rows 1-3 and row 4 is kept - confirm this is intended.
workbook = openpyxl.load_workbook(GUIDELINES_EXCEL_PATH)
sheet = workbook["Sheet1"]
rows = [row for row in sheet.iter_rows(min_row=4, values_only=True)]

In [None]:
def get_cluster_rows(rows: List[Row]) -> List[Row]:
    """Return the rows that make up the body of a cluster.

    ``rows[0]`` is taken to be the cluster header (e.g. ``medication_item_cluster``).
    The body is every following row up to, but excluding, the matching tail row
    whose name is ``end_of_<header name>`` (e.g.
    ``end_of_medication_item_cluster``). When no tail row is present the body
    runs to the last row supplied.

    Raises:
        ValueError: if ``rows`` holds fewer than two rows, i.e. there is nothing
            after the header (or no header at all).
    """
    if len(rows) < 2:
        raise ValueError(f"Only the header of a cluster was found. {rows}")

    header, *following = rows
    tail_name = f"end_of_{header.name}"

    body = []
    for candidate in following:
        if candidate.name == tail_name:
            # Tail marker reached: it is not part of the body.
            return body
        body.append(candidate)
    return body


def rows_to_elements(rows: List[Row]) -> List[Element]:
    """Turn a flat run of spreadsheet rows into a (possibly nested) element list.

    Rows whose ``do_not_use`` attribute is not exactly ``False`` are discarded
    first. The remaining rows are scanned in order, and each row is classified
    by the first rule below that matches:

    * description contains ``"record entry"``: a ``RecordElement`` whose items
      are built from *all* remaining rows; scanning ends afterwards.
    * name ends with ``"item_entry"``: a ``RecordElement`` whose items are the
      body of the cluster starting on the next row.
    * cardinality starts with ``"0 to many"``: an ``ArrayElement``.
    * name ends with ``"cluster"``: a ``ClusterElement`` built from the body of
      the cluster starting on this row.
    * otherwise: a plain ``Element``.

    Args:
        rows: Rows of one section, or of one cluster body when called recursively.

    Returns:
        The parsed elements, in row order.
    """
    usable_rows = [candidate for candidate in rows if candidate.do_not_use is False]
    parsed: List[Element] = []
    position = 0

    while position < len(usable_rows):
        current = usable_rows[position]

        if "record entry" in current.description:
            element = RecordElement(
                name=current.name,
                description=current.description,
                items=rows_to_elements(usable_rows[position + 1 :]),
            )
            # A record entry owns the whole rest of the section.
            consumed = len(usable_rows) - position
        elif current.name.endswith("item_entry"):
            body = get_cluster_rows(usable_rows[position + 1 :])
            element = RecordElement(
                name=current.name,
                description=current.description,
                items=rows_to_elements(body),
            )
            # Skip the title row, the cluster head row and the cluster tail row.
            consumed = len(body) + 3
        elif current.cardinality.startswith("0 to many"):
            element = ArrayElement(
                name=current.name,
                description=current.description,
            )
            consumed = 1
        elif current.name.endswith("cluster"):
            body = get_cluster_rows(usable_rows[position:])
            element = ClusterElement(
                name=current.name,
                description=current.description,
                elements=rows_to_elements(body),
            )
            # Skip the cluster head row and the cluster tail row.
            consumed = len(body) + 2
        else:
            element = Element(
                name=current.name,
                description=current.description,
            )
            consumed = 1

        parsed.append(element)
        position += consumed

    return parsed


def rows_to_section(section_rows: List[Row]) -> Section:
    """Build a ``Section`` from the rows belonging to one section of the sheet.

    The row at index 1 provides the section's name and description, and every
    row from index 3 onwards is parsed into the section's elements. The rows at
    index 0 and 2 are not read.

    Raises:
        IndexError: if ``section_rows`` has fewer than two rows.
    """
    header = section_rows[1]
    return Section(
        name=header.name,
        description=header.description,
        elements=rows_to_elements(section_rows[3:]),
    )


def rows_to_sections(rows: List[List[str]]) -> List[Section]:
    """Split the worksheet rows into sections and parse each into a ``Section``.

    A row whose cells are all ``None`` separates one section from the next.
    Consecutive separator rows, and separator rows at the very start or end of
    ``rows``, are ignored: an empty group of rows is never passed to
    ``rows_to_section``, which indexes into the group and would raise
    ``IndexError`` on an empty one.

    Args:
        rows: Raw worksheet rows, one sequence of cell values per row.

    Returns:
        One ``Section`` per non-empty group of rows, in worksheet order.
    """
    section_models = []
    section_rows: List[Row] = []
    for row in rows:
        if all(element is None for element in row):
            # Separator row: close the current group only if it collected rows.
            if section_rows:
                section_models.append(rows_to_section(section_rows))
                section_rows = []
        else:
            section_rows.append(Row.from_record(row))
    # The last group has no trailing separator to close it.
    if section_rows:
        section_models.append(rows_to_section(section_rows))
    return section_models

In [None]:
# Parse the worksheet rows into Section objects and display them as the cell output.
sections = rows_to_sections(rows)
sections

In [None]:
# Combine the schema fragment of every section into one flat object schema.
# Sections are merged in order, so if two sections emit the same key the later
# section's value replaces the earlier one.
json_schema_dict: Dict = dict(
    type="object",
    properties={
        property_name: property_schema
        for section in sections
        for property_name, property_schema in section.to_json_schema_dict().items()
    },
)

In [None]:
# Token count of the serialised schema before the PDF guidance text is appended.
len(TOKENIZER.encode(json.dumps(json_schema_dict)))

## PDF Parsing

In [None]:
# Open the implementation guidance PDF; its pages are read in the next cell.
reader = PyPDF2.PdfReader(GUIDELINES_IMPLEMENTATION_PDF_PATH)

In [None]:
def _clean_pdf_line(raw_line: str) -> str:
    """Strip PDF footer fragments from one extracted line and tidy its spacing."""
    # NOTE(review): once the adjacent string literals are concatenated, the
    # second and third alternatives of this pattern are identical.
    without_footer = re.sub(
        (
            "(PRSB eDischarge Summary  – Implementation Guidance  V2.1)|(January 2019 "
            r" Page \d+  )|(January 2019  Page \d+  )"
        ),
        "",
        raw_line,
    ).strip()
    # Collapse runs of spaces, then repair a word that extraction split in two.
    single_spaced = re.sub(" {2,}", " ", without_footer)
    return re.sub("reco rd", "record", single_spaced)


# Every non-empty cleaned line of the PDF, in page order.
text = [
    cleaned
    for page in reader.pages
    for cleaned in map(_clean_pdf_line, page.extract_text().split("\n"))
    if cleaned
]

In [None]:
# Map each "N.N Title" heading to the body text that follows it.
# The lines are walked backwards so that, when a heading is found, the index of
# the next heading (or the end of the document) is already known and bounds the
# section body. If the same heading title occurs more than once, the occurrence
# nearest the start of the document is processed last and therefore wins.
last_heading_idx = len(text)
heading_to_text = {}
for heading_idx, line in enumerate(reversed(text)):
    if re.match(r"^\d+\.\d+ [A-Za-z ]+$", line):
        # heading_idx counts from the end, so this slice starts on the line just
        # after the heading and stops before the following heading.
        section_text = "\n".join(text[len(text) - heading_idx : last_heading_idx])
        # Drop "N.N.N " sub-heading numbers. The dots are escaped so that only
        # literal dotted numbering is removed; an unescaped "." matches any
        # character and would also delete plain digit runs such as "12345 ".
        section_text = re.sub(r"\d+\.\d+\.\d+ ", "", section_text)
        # Re-join a line break that is followed by a lowercase letter.
        section_text = re.sub("\n(?=[a-z])", " ", section_text)
        section_text = clean_text(section_text)
        # Strip the leading "N.N " numbering from the heading itself.
        heading = to_camel_case(re.sub(r"^\d+\.\d+ ", "", line))

        heading_to_text[heading] = section_text
        last_heading_idx = len(text) - heading_idx - 1

In [None]:
# Append the PDF guidance text to the description of every schema property whose
# key matches a parsed heading.
# NOTE: this mutates json_schema_dict in place, so running the cell again appends
# the same guidance a second time.
for property_heading, property_body in json_schema_dict["properties"].items():
    if property_heading not in heading_to_text:
        continue
    guidance_text = heading_to_text[property_heading]
    property_body["description"] = property_body["description"] + f"\n{guidance_text}"

In [None]:
# Token count of the serialised schema after the PDF guidance text was appended.
len(TOKENIZER.encode(json.dumps(json_schema_dict)))

In [None]:
# Serialise the final schema with 4-space indentation, write it next to the
# guideline sources, and show the output path as the cell result.
schema_json = json.dumps(json_schema_dict, indent=4)
GUIDELINES_PYDANTIC_MODEL_PATH.write_text(schema_json)
GUIDELINES_PYDANTIC_MODEL_PATH