In [None]:
import json
import re
from pathlib import Path
from typing import Dict, List

import openpyxl
import PyPDF2

In [None]:
# Every guideline artefact lives in a "guidelines" folder under the parent of
# the current working directory.
GUIDELINES_DIR = Path.cwd().parent.joinpath("guidelines")

# Spreadsheet that the schema is generated from.
GUIDELINES_EXCEL_PATH = GUIDELINES_DIR.joinpath(
    "eDischarge-Summary-v2.1-1st-Feb-21.xlsx"
)
# Implementation guidance PDF whose text is split into sections further down.
GUIDELINES_IMPLEMENTATION_PDF_PATH = GUIDELINES_DIR.joinpath(
    "eDischarge-Summary-Maintenance-Release-Implementation-Guidance-Report-v2.1-23.1.19.pdf"
)
# Destination for the generated schema when it is written to disk.
GUIDELINES_JSON_PATH = GUIDELINES_DIR.joinpath(
    "eDischarge-Summary-v2.1-1st-Feb-21_schema.json"
)

In [None]:
# Open the guideline workbook and pick its "Sheet1" worksheet.
sheet = openpyxl.load_workbook(GUIDELINES_EXCEL_PATH)["Sheet1"]
# Materialise every row from spreadsheet row 4 onwards; values_only=True yields
# the cell values themselves rather than cell objects.
rows = [*sheet.iter_rows(min_row=4, values_only=True)]

In [None]:
def row_list_to_dict(row: List[str]) -> Dict[str, str]:
    """Map the first six cells of a spreadsheet row onto named string fields.

    Args:
        row: Cell values of one row, in the column order name, description,
            cardinality, data type, values, extract-from-free-text flag.
            Rows shorter than six cells are accepted; cells beyond the sixth
            are ignored.

    Returns:
        A dict with the keys ``name``, ``description``, ``cardinality``,
        ``data_type``, ``values`` and ``extract_from_free_text``. Falsy or
        missing cells become ``""``; every other value is converted with
        ``str`` and stripped of surrounding whitespace.
    """
    field_names = (
        "name",
        "description",
        "cardinality",
        "data_type",
        "values",
        "extract_from_free_text",
    )
    # Pad short rows so a missing trailing cell reads as empty instead of
    # raising IndexError.
    cells = list(row[: len(field_names)])
    cells += [None] * (len(field_names) - len(cells))
    # str() lets truthy non-text cells (e.g. numbers) through; calling .strip()
    # on them directly would raise AttributeError.
    return {
        field: str(cell).strip() if cell else ""
        for field, cell in zip(field_names, cells)
    }


def element_rows_to_json_schema(element_rows: List[List[str]]) -> Dict:
    """Convert the element rows of one section into schema property definitions.

    If the first row has an empty ``values`` field and its description contains
    "record entry", the rows are treated as a repeating group: the result is a
    single array property named after that first row, whose ``items`` are built
    recursively from the remaining rows. Otherwise each fully populated row
    becomes a string property; rows with any empty field are printed and
    skipped.

    Args:
        element_rows: Raw spreadsheet rows describing the elements. May be
            empty.

    Returns:
        Mapping of element name to its schema fragment; ``{}`` when there are
        no rows.
    """
    # A section without elements, or a record entry without children, has
    # nothing to describe. Without this guard ``element_rows[0]`` raises
    # IndexError.
    if not element_rows:
        return {}

    first_element_row = row_list_to_dict(element_rows[0])
    is_record_entry = (
        first_element_row["values"] == ""
        and "record entry" in first_element_row["description"]
    )
    if is_record_entry:
        # NOTE(review): ``items`` receives a bare name -> schema mapping here,
        # whereas JSON Schema expects ``items`` to be a schema itself (e.g.
        # {"type": "object", "properties": ...}). Left as-is to keep the
        # emitted shape stable - confirm with consumers before changing.
        return {
            first_element_row["name"]: {
                "description": first_element_row["description"],
                "type": "array",
                "items": element_rows_to_json_schema(element_rows[1:]),
            }
        }

    elements_schema: Dict = {}
    for element_row_list in element_rows:
        element_row = row_list_to_dict(element_row_list)
        if any(field == "" for field in element_row.values()):
            # Incomplete rows are reported and left out of the schema.
            print(element_row)
            continue
        # Filtering on extract_from_free_text == "Y" is currently disabled, so
        # every complete row is included.
        elements_schema[element_row["name"]] = {
            # row_list_to_dict already strips the description.
            "description": element_row["description"],
            "type": "string",
        }
    return elements_schema


def create_section_json_schema_from_rows(section_rows: List[List[str]]) -> Dict:
    """Build the schema entry for one section from its block of spreadsheet rows.

    The block layout is positional: the row at index 1 describes the section
    itself, index 2 is the element header row, and every row after that is an
    element. The row at index 0 is not read.

    Args:
        section_rows: Consecutive non-blank rows belonging to one section.

    Returns:
        A single-entry dict mapping the section name to an object schema, or
        ``{}`` when the block is too short to contain a section row.
    """
    SECTION_ROW = 1
    ELEMENT_HEADER_ROW = 2

    # Empty or one-row blocks have no section row; indexing them would raise
    # IndexError. Callers already skip a falsy result.
    if len(section_rows) <= SECTION_ROW:
        return {}

    section_row = row_list_to_dict(section_rows[SECTION_ROW])
    # Elements start right after the header row; the slice is simply empty
    # when the block stops at (or before) the header.
    element_rows = section_rows[ELEMENT_HEADER_ROW + 1 :]

    # Filtering on section_row["extract_from_free_text"] == "Y" is currently
    # disabled, so every section is emitted.
    return {
        section_row["name"]: {
            "type": "object",
            "description": section_row["description"],
            "properties": element_rows_to_json_schema(element_rows),
        }
    }


def create_schema_from_rows(rows: List[List[str]]) -> Dict:
    """Assemble the top-level object schema from all spreadsheet rows.

    Rows are grouped into sections, with a fully blank row (every cell
    ``None``) acting as the separator. Each group is converted with
    ``create_section_json_schema_from_rows`` and merged into ``properties``.

    Args:
        rows: All data rows of the sheet, in order.

    Returns:
        ``{"type": "object", "properties": {...}}`` with one property per
        section.
    """
    section_schema: Dict = {}

    def add_section(group: List[List[str]]) -> None:
        # Leading, consecutive or trailing blank rows yield empty groups;
        # passing those on would make the section builder index into nothing.
        if not group:
            return
        section = create_section_json_schema_from_rows(group)
        if section:
            section_schema.update(section)

    section_rows: List[List[str]] = []
    for row in rows:
        if all(cell is None for cell in row):
            add_section(section_rows)
            section_rows = []
        else:
            section_rows.append(row)
    # The last section is not followed by a separator row.
    add_section(section_rows)

    return {"type": "object", "properties": section_schema}

In [None]:
# Build the complete JSON schema from the spreadsheet rows loaded earlier.
schema = create_schema_from_rows(rows)
# Persisting the schema is disabled; uncomment to write it to GUIDELINES_JSON_PATH.
# GUIDELINES_JSON_PATH.write_text(json.dumps(schema, indent=4))

In [None]:
# Open the implementation guidance PDF; its pages are text-extracted in the next cell.
reader = PyPDF2.PdfReader(GUIDELINES_IMPLEMENTATION_PDF_PATH)

In [None]:
# Page furniture repeated in the extracted text: the running document header
# and the dated page-number footer.
# The token between "Summary" and "Implementation" is matched as any run of
# non-space characters because the literal dash previously embedded in the
# pattern was mis-encoded and could not match correctly decoded text.
# The footer alternative appeared twice in the old pattern; once is enough.
_PAGE_FURNITURE_RE = re.compile(
    r"(PRSB eDischarge Summary +\S+ +Implementation Guidance +V2\.1)"
    r"|(January 2019  Page \d+  )"
)


def _clean_pdf_line(line: str) -> str:
    """Remove page furniture, collapse runs of spaces and repair "reco rd"."""
    line = _PAGE_FURNITURE_RE.sub("", line).strip()
    line = re.sub(" {2,}", " ", line)
    return re.sub("reco rd", "record", line)


# One entry per non-empty cleaned line, in document order across all pages.
text = [
    line
    for line in (
        _clean_pdf_line(raw_line)
        for page in reader.pages
        for raw_line in page.extract_text().split("\n")
    )
    if line
]

In [None]:
# Split the cleaned guidance text into sections keyed by their heading title.
# A heading is a line of the form "4.<n> <Title>" where the title consists of
# letters and spaces only.
# The lines are walked backwards: when a heading is found, everything between
# it and the previously found heading (the next one in document order) is that
# section's body.
last_heading_idx = len(text)  # exclusive end of the section being collected
heading_to_text = {}
for heading_idx, line in enumerate(reversed(text)):
    if re.match(r"^4\.\d+ [A-Za-z ]+$", line):
        # heading_idx counts from the end of the list; convert it to a
        # forward index into ``text``.
        heading_line_idx = len(text) - heading_idx - 1
        section_text = "\n".join(text[heading_line_idx + 1 : last_heading_idx])
        # Drop three-part clause numbers such as "4.1.2 ". The dots are
        # escaped: an unescaped "." matches any character and would also
        # delete space- or slash-separated number runs from the body text.
        section_text = re.sub(r"\d+\.\d+\.\d+ ", "", section_text)
        # A line starting with a lowercase letter is joined onto the previous
        # line as a continuation.
        section_text = re.sub("\n(?=[a-z])", " ", section_text)
        # Key the section by its title without the leading "4.<n> " number.
        heading = re.sub(r"^\d+\.\d+ ", "", line)
        heading_to_text[heading] = section_text
        last_heading_idx = heading_line_idx
        # Echo each detected heading line for a visual check.
        print(text[last_heading_idx])

In [None]:
# List the section titles extracted from the PDF; the section bodies are not
# needed here, so iterate over the keys only.
for heading_name in heading_to_text:
    print(heading_name)

In [None]:
# PDF section titles that have no identically named section in the schema.
set(heading_to_text).difference(schema["properties"])

In [None]:
# Section names present in the generated schema, for comparison with the PDF headings.
schema["properties"].keys()

In [None]:
# Pretty-print the whole schema for inspection.
print(json.dumps(schema, indent=4))

In [None]:
# Show the path the schema JSON is written to when the write step is enabled.
GUIDELINES_JSON_PATH