In [None]:
from docx import Document
from docx.shared import Inches
from docx.oxml import parse_xml


def getText(filename):
    """Return the text of all paragraphs of a Word file, joined by newlines.

    Args:
        filename: Value handed to ``Document`` to open the file.

    Returns:
        One string with the ``text`` of every paragraph in document order,
        separated by ``'\\n'``.
    """
    paragraphs = Document(filename).paragraphs
    return '\n'.join(paragraph.text for paragraph in paragraphs)

# Word document to read.
filename = 'Textbausteine TOM-App aktuell.docx'

# Read the document text and swap every '„' for '“' in a single expression.
text = getText(filename).replace('„', '“')

<built-in method capitalize of str object at 0x00000268C5D76010>


In [None]:
import pandas as pd
import json
from docx import Document
import re
from collections import defaultdict


# -------- Step 2: Load Interaction Data --------
def load_interaction_data(file_path):
    """Load interaction groups and their severity levels from an Excel workbook.

    The first sheet is scanned top to bottom. A row with a value in "IA-Nr"
    opens a new interaction group; every row with a value in "Spezialitäten"
    adds one medication pair to the most recently opened group. Afterwards
    the level of each group is looked up in the sheet "Tabelle2" by matching
    its "IA-Nummer" column and reading "Stufe" from the first matching row.

    Args:
        file_path: Workbook location, passed unchanged to ``pandas.ExcelFile``.

    Returns:
        A list of dicts, one per group, with the keys ``IA_number`` (int),
        ``category``, ``pairs`` (list of dicts with ``medications`` and
        ``description``) and ``interaction_level`` (int, or ``None`` when no
        usable level is found).
    """
    # The context manager closes the workbook handle even if parsing fails.
    with pd.ExcelFile(file_path) as xl:
        interaction_sheet = xl.parse(xl.sheet_names[0])
        severity_sheet = xl.parse("Tabelle2")

    interactions = []
    current_ia = None
    for _, row in interaction_sheet.iterrows():
        if pd.notna(row.get("IA-Nr")):
            category = row.get("Interaktion", "")
            current_ia = {
                "IA_number": int(row["IA-Nr"]),
                # Empty cells are read as NaN, which is not valid JSON once
                # serialised; fall back to "" like the description below.
                "category": category if pd.notna(category) else "",
                "pairs": [],
            }
            interactions.append(current_ia)
        # Pair rows that appear before the first "IA-Nr" row have no group
        # to attach to and are skipped.
        if pd.notna(row.get("Spezialitäten")) and current_ia is not None:
            medications = row["Spezialitäten"].split(" - ")
            description = row.get("Interaktionsbeschreibung", "")
            current_ia["pairs"].append({
                "medications": [med.strip() for med in medications],
                "description": description.strip() if isinstance(description, str) else "",
            })

    for ia in interactions:
        levels = severity_sheet.loc[
            severity_sheet["IA-Nummer"] == ia["IA_number"], "Stufe"
        ]
        level = None
        if not levels.empty and pd.notna(levels.iloc[0]):
            try:
                level = int(levels.iloc[0])
            except (TypeError, ValueError):
                # Cell content that cannot be converted counts as "no level".
                level = None
        ia["interaction_level"] = level
    return interactions

# NOTE(review): called with an empty path and the return value is discarded;
# presumably a leftover scratch call that fails when the workbook is opened —
# confirm and remove if so.
load_interaction_data("")

In [None]:

# -------- Step 2: Load Interaction Data --------
def load_interaction_data(file_path):
    """Load interaction groups and their severity levels from an Excel workbook.

    The first sheet is scanned top to bottom. A row with a value in "IA-Nr"
    opens a new interaction group; every row with a value in "Spezialitäten"
    adds one medication pair to the most recently opened group. Afterwards
    the level of each group is looked up in the sheet "Tabelle2" by matching
    its "IA-Nummer" column and reading "Stufe" from the first matching row.

    Args:
        file_path: Workbook location, passed unchanged to ``pandas.ExcelFile``.

    Returns:
        A list of dicts, one per group, with the keys ``IA_number`` (int),
        ``category``, ``pairs`` (list of dicts with ``medications`` and
        ``description``) and ``interaction_level`` (int, or ``None`` when no
        usable level is found).
    """
    # The context manager closes the workbook handle even if parsing fails.
    with pd.ExcelFile(file_path) as xl:
        interaction_sheet = xl.parse(xl.sheet_names[0])
        severity_sheet = xl.parse("Tabelle2")

    interactions = []
    current_ia = None
    for _, row in interaction_sheet.iterrows():
        if pd.notna(row.get("IA-Nr")):
            category = row.get("Interaktion", "")
            current_ia = {
                "IA_number": int(row["IA-Nr"]),
                # Empty cells are read as NaN, which is not valid JSON once
                # serialised; fall back to "" like the description below.
                "category": category if pd.notna(category) else "",
                "pairs": [],
            }
            interactions.append(current_ia)
        # Pair rows that appear before the first "IA-Nr" row have no group
        # to attach to and are skipped.
        if pd.notna(row.get("Spezialitäten")) and current_ia is not None:
            medications = row["Spezialitäten"].split(" - ")
            description = row.get("Interaktionsbeschreibung", "")
            current_ia["pairs"].append({
                "medications": [med.strip() for med in medications],
                "description": description.strip() if isinstance(description, str) else "",
            })

    for ia in interactions:
        levels = severity_sheet.loc[
            severity_sheet["IA-Nummer"] == ia["IA_number"], "Stufe"
        ]
        level = None
        if not levels.empty and pd.notna(levels.iloc[0]):
            try:
                level = int(levels.iloc[0])
            except (TypeError, ValueError):
                # Cell content that cannot be converted counts as "no level".
                level = None
        ia["interaction_level"] = level
    return interactions

# -------- Step 3: Parse Word Document --------
def parse_word_doc(doc_path):
    """Classify the paragraphs of a Word document into a structured dict.

    Every non-empty paragraph (stripped of surrounding whitespace) is run
    through one ordered if/elif chain; the first branch that matches handles
    the line and all later branches are skipped for it. Several branches
    depend on ``current_section``, which is set by earlier heading lines.

    Args:
        doc_path: Value handed to ``Document`` to open the file.

    Returns:
        The ``sections`` dict built below. Medication descriptions are
        stored under ``"medication_descriptions"`` as the dicts returned by
        ``parse_med_description``.
    """
    doc = Document(doc_path)
    lines = [p.text.strip() for p in doc.paragraphs if p.text.strip()]
    sections = {
        "no_known_interactions": None,
        "unapproved_note": None,
        "supplements_note": None,
        "consultation_recommendations": {},
        "interaction_levels": {},
        "specific_interactions": [],
        "food_interactions": [],
        "intake_guidance": {"general": None, "exceptions": []},
        "generics": {"available": [], "not_available": [], "epilepsy_caution": False},
        "contact": None,
        "double_medication": None,
        # NOTE(review): nothing in this function ever writes to "indications".
        "indications": defaultdict(list),
        "medication_descriptions": {}
    }

    # Parser state carried from one line to the next.
    current_section = None
    # NOTE(review): current_indication is assigned below but never read.
    current_indication = None
    # Name of the medication whose description lines are being collected.
    current_med = None
    # Description lines gathered for current_med so far.
    temp_desc = []

    for line in lines:
        # --- Fixed notes, recognised by a characteristic phrase ---
        if "Es liegt keine bekannte Wechselwirkung" in line:
            sections["no_known_interactions"] = line
        elif "nicht um ein in der Schweiz zugelassenes Medikament" in line:
            sections["unapproved_note"] = line
        elif "Nahrungsergänzungsmittel" in line and "nicht berücksichtigt" in line:
            sections["supplements_note"] = line
        # --- Consultation recommendations, keyed by the line prefix ---
        elif line.startswith("IA 1 und 2"):
            sections["consultation_recommendations"]["IA_1_2"] = line
        elif line.startswith("IA 3 und 4"):
            sections["consultation_recommendations"]["IA_3_4"] = line
        elif line.startswith("IA 5/6/7"):
            sections["consultation_recommendations"]["IA_5_6_7"] = line
        elif "Wechselwirkung der Klasse" in line:
            # Captures a single-digit class and the text in the parentheses
            # that follow it; lines without that shape are dropped.
            match = re.search(r"Wechselwirkung der Klasse (\d) \((.+?)\)", line)
            if match:
                sections["interaction_levels"][match.group(1)] = match.group(2)
        elif "Methotrexat und Folsäure" in line:
            current_section = "specific_interactions"
        elif current_section == "specific_interactions" and "Metoject" in line:
            # Medication names and class are hard-coded; only the note text
            # comes from the document.
            sections["specific_interactions"].append({
                "medications": ["Metoject", "Acidum folicum"],
                "class": 4,
                "note": line
            })
            current_section = None
        elif "Nahrungsmittel-Interaktionen" in line:
            current_section = "food_interactions"
        # NOTE(review): this branch matches every line that reaches it while
        # the section is active, so no branch further down is evaluated until
        # "Einnahmehinweise" appears or an earlier branch changes the section.
        elif current_section == "food_interactions":
            if "während Therapie mit" in line:
                substances = re.findall(r"Keine[n]? (.*?) während Therapie", line)
                # Prefer a parenthesised medication list; otherwise take the
                # rest of the line after "mit …".
                meds = re.findall(r"mit … ?\((.*?)\)", line)
                if not meds:
                    meds = re.findall(r"mit …(.*)", line)
                if substances and meds:
                    sections["food_interactions"].append({
                        "substances": [s.strip() for s in substances[0].split(",")],
                        "affected_medications": [m.strip() for m in meds[0].split(",")]
                    })
            elif "Einnahmehinweise" in line:
                current_section = "intake_guidance"
        # NOTE(review): like the branch above, this one takes every line that
        # reaches it and never resets current_section itself, so the branches
        # below stay unreachable until an earlier branch changes the section.
        elif current_section == "intake_guidance":
            if "unabhängig vom Essen" in line:
                sections["intake_guidance"]["general"] = line
            elif "Milchprodukten" in line:
                sections["intake_guidance"]["exceptions"].append(line)
        elif "Generika" in line:
            current_section = "generics"
        # NOTE(review): also a catch-all while the section is active.
        elif current_section == "generics":
            # NOTE(review): this check cannot succeed here — any line
            # containing "Generika im Handel" also contains "Generika" and was
            # already taken by the branch above, so "available" stays empty.
            if "Generika im Handel" in line:
                sections["generics"]["available"].append(line)
            elif "kein Generikum" in line:
                sections["generics"]["not_available"].append(line)
            elif "Epilepsie" in line:
                sections["generics"]["epilepsy_caution"] = True
        elif "Telefonnummer" in line:
            sections["contact"] = line
        elif "Doppelmedikation" in line:
            current_section = "double_medication"
        elif current_section == "double_medication" and "keine" in line.lower():
            sections["double_medication"] = line
        elif re.match(r"^[A-ZÄÖÜ]{5,}$", line):  # Capital divider like BBBBB
            # A divider closes the medication being collected, if any.
            if current_med and temp_desc:
                sections["medication_descriptions"][current_med] = parse_med_description(temp_desc)
            current_med = None
            temp_desc = []
        # Exact match against the known indication headings.
        elif line in ["Arthrose", "Asthma", "Augen", "Blutdruck", "Blutverdünner",
                      "Blutzucker / Diabetes", "Cholesterinsenker", "Entzündliche Erkrankungen (Morbus Crohn, Rheumatoide Arthritis etc.)",
                      "Epilepsie", "Hormonersatz Wechseljahre", "Magen (PPI / Antazida)",
                      "Psychische Erkrankungen / Depressionen", "Schlafmittel", "Schmerzmittel"]:
            current_section = "indications"
            current_indication = line
        # A line of fewer than six words made only of letters, digits,
        # whitespace, "-" and "/" is treated as a medication name: it closes
        # the previous medication and starts a new one.
        elif re.match(r"^[A-ZÄÖÜa-zäöü0-9\s\-/]+$", line) and len(line.split()) < 6:
            if current_med and temp_desc:
                sections["medication_descriptions"][current_med] = parse_med_description(temp_desc)
            current_med = line
            temp_desc = []
        # Any other line belongs to the description of the current medication.
        elif current_med:
            temp_desc.append(line)

    # Flush the medication still open when the document ends.
    if current_med and temp_desc:
        sections["medication_descriptions"][current_med] = parse_med_description(temp_desc)

    return sections

def parse_med_description(desc_lines):
    """Split a medication's text lines into a description and a usage line.

    The first line containing "Einnahme" becomes the usage line; all other
    lines, in their original order, are joined with single spaces to form the
    description. The input list is left unmodified.

    Args:
        desc_lines: List of text lines collected for one medication.

    Returns:
        A dict with ``"description"`` (str, possibly empty) and ``"usage"``
        (the matching line, or ``None`` when no line contains "Einnahme").
    """
    # Work on a copy so the caller's list is not changed as a side effect.
    remaining = list(desc_lines)
    usage_index = next(
        (index for index, text in enumerate(remaining) if "Einnahme" in text),
        None,
    )
    usage_line = None
    if usage_index is not None:
        # Only the first match is taken out; later matches stay in the text.
        usage_line = remaining.pop(usage_index)
    return {
        "description": " ".join(remaining),
        "usage": usage_line,
    }

# -------- Step 4: Combine and Export --------
def combine_to_json(atc, interactions, structured_notes, out_path):
    """Bundle the three data sets into one JSON document and write it to disk.

    Args:
        atc: Stored under the key ``"ATC_catalogue"``.
        interactions: Stored under the key ``"interactions"``.
        structured_notes: Stored under the key ``"structured_guidance"``.
        out_path: Destination file; opened for writing as UTF-8, so existing
            content is replaced.
    """
    payload = dict(
        ATC_catalogue=atc,
        interactions=interactions,
        structured_guidance=structured_notes,
    )
    # Two-space indentation; non-ASCII characters are written unescaped.
    with open(out_path, mode='w', encoding='utf-8') as out_file:
        json.dump(payload, out_file, ensure_ascii=False, indent=2)

# -------- Step 5: Main Execution --------
# Runs the pipeline: load both workbooks, parse the Word document, then write
# everything into a single JSON file.
if __name__ == "__main__":
    # NOTE(review): load_atc_excel is not defined in the visible part of this
    # file; presumably it lives in another cell/module — confirm before running.
    atc_data = load_atc_excel("ATC-Code sortierte Textbausteine aktuell.xlsx")
    interaction_data = load_interaction_data("Interaktionen nach IA-Nummern.xlsx")
    structured_notes = parse_word_doc("Textbausteine TOM-App aktuell.docx")
    # Output is written relative to the current working directory.
    combine_to_json(atc_data, interaction_data, structured_notes, "med_data_final.json")