# Visualization of results

This Python script generates an interactive HTML visualization for comparing named entity annotations (\<persName>, \<placeName>) between a French TEI-encoded source file and its German translation. It produces an HTML file with side-by-side aligned paragraphs, enabling users to hover over entities to trace corresponding references.


In [2]:
from lxml import etree
from html import escape

def detect_namespace(xml_string):
    """Return the default (unprefixed) namespace URI of an XML document.

    The document is parsed from its UTF-8 encoded form and the namespace map
    of the root element is consulted under the ``None`` key, which is where
    lxml stores the default namespace. ``None`` is returned when the root
    element declares no default namespace.
    """
    document_root = etree.fromstring(xml_string.encode("utf-8"))
    default_namespace = document_root.nsmap.get(None)
    return default_namespace

def extract_refs_and_wrap(xml_string, lang="fr", namespace_uri=None):
    """Serialize every paragraph of an XML document with its entities marked up.

    Each ``persName`` / ``placeName`` element inside a ``p`` element receives
    three extra attributes -- ``class="entity"``, ``data-ref`` and ``title``
    (both set to the element's ``ref`` value, or ``""`` when it has none) --
    and the paragraph is then serialized to a string.

    Args:
        xml_string: The XML document as text.
        lang: Language tag. Used for fallback paragraph ids; for ``"de"``
            a leading ``[P.…]`` marker in the paragraph text is also
            accepted as the id when ``xml:id`` is absent.
        namespace_uri: Namespace of the elements to look up, or ``None``
            to search for un-namespaced elements.

    Returns:
        dict mapping paragraph id to the serialized paragraph markup. When
        two paragraphs resolve to the same id, the later one wins.
    """
    parser = etree.XMLParser(remove_blank_text=True)
    root = etree.fromstring(xml_string.encode("utf-8"), parser)

    # With a namespace, every step of the XPath needs the "tei:" prefix;
    # without one, plain element names are used and no prefix map is passed.
    ns_map = {"tei": namespace_uri} if namespace_uri else None
    prefix = "tei:" if namespace_uri else ""
    entity_xpath = f".//{prefix}persName | .//{prefix}placeName"

    data = {}
    for i, p in enumerate(root.xpath(f".//{prefix}p", namespaces=ns_map)):
        pid = p.attrib.get("{http://www.w3.org/XML/1998/namespace}id")
        if lang == "de" and not pid:
            # Fall back to a "[P.…]" marker at the very start of the paragraph.
            first_text = (p.text or "").strip()
            if first_text.startswith("[P.") and "]" in first_text:
                pid = first_text.split("]")[0].strip("[")
        if not pid:
            pid = f"{lang}-unnamed-{i}"

        for tag in p.xpath(entity_xpath, namespaces=ns_map):
            ref = tag.attrib.get("ref", "")
            tag.set("class", "entity")
            tag.set("data-ref", ref)
            tag.set("title", ref)

        # with_tail=False: serialize the paragraph element only. The default
        # also emits the text that follows </p>, which belongs to the parent
        # (and, for a nested paragraph, is already part of the outer one).
        data[pid] = etree.tostring(p, encoding="unicode", with_tail=False)
    return data

def compare_refs_only(fr_xml, de_xml, fr_ns=None, de_ns=None):
    """Compare the entity ``ref`` values of two documents paragraph by paragraph.

    Both documents are parsed, and for every ``p`` element the sets of
    non-empty ``ref`` attributes on ``persName`` and ``placeName``
    descendants are collected. Paragraphs are keyed by ``xml:id``; for the
    German document a leading ``[P.…]`` text marker is used when the id is
    absent, and ``"<lang>-unnamed-<index>"`` is the last resort.

    Returns:
        dict mapping each paragraph id found in either document to a dict
        with the keys ``missing_pers`` / ``missing_place`` (refs only in the
        French paragraph) and ``extra_pers`` / ``extra_place`` (refs only in
        the German paragraph); every value is a set.
    """
    xml_id_attr = "{http://www.w3.org/XML/1998/namespace}id"

    def collect(document, lang, namespace_uri):
        # Parse one document and gather its refs per paragraph id.
        tree_root = etree.fromstring(
            document.encode("utf-8"),
            etree.XMLParser(remove_blank_text=True),
        )

        if namespace_uri:
            prefixes, step = {"tei": namespace_uri}, ".//tei:"
        else:
            prefixes, step = None, ".//"

        per_paragraph = {}
        for index, para in enumerate(tree_root.xpath(step + "p", namespaces=prefixes)):
            key = para.attrib.get(xml_id_attr)
            if not key and lang == "de":
                lead = (para.text or "").strip()
                if lead.startswith("[P.") and "]" in lead:
                    key = lead.split("]")[0].strip("[")
            key = key or f"{lang}-unnamed-{index}"

            found = {}
            for label, element_name in (("pers", "persName"), ("place", "placeName")):
                found[label] = {
                    node.attrib.get("ref")
                    for node in para.xpath(step + element_name, namespaces=prefixes)
                    if node.attrib.get("ref")
                }
            per_paragraph[key] = found
        return per_paragraph

    french = collect(fr_xml, "fr", fr_ns)
    german = collect(de_xml, "de", de_ns)

    result = {}
    for pid in sorted(french.keys() | german.keys()):
        fr_entry = french.get(pid, {})
        de_entry = german.get(pid, {})
        fr_people, de_people = fr_entry.get("pers", set()), de_entry.get("pers", set())
        fr_places, de_places = fr_entry.get("place", set()), de_entry.get("place", set())

        result[pid] = {
            "missing_pers": fr_people - de_people,
            "extra_pers": de_people - fr_people,
            "missing_place": fr_places - de_places,
            "extra_place": de_places - fr_places,
        }
    return result

def _flag_entities(markup, refs, flag):
    """Return *markup* with the CSS class *flag* added to entities whose data-ref is in *refs*."""
    for ref in refs:
        # Attribute values in serialized XML carry &, <, > and " in escaped
        # form, so the search needle has to be escaped the same way.
        safe_ref = escape(ref, quote=False).replace('"', "&quot;")
        needle = f'data-ref="{safe_ref}"'
        tagged = f'class="entity" {needle}'
        if tagged in markup:
            # Rewrite the existing class attribute. Appending a second
            # class="…" would be ignored: HTML keeps the first duplicate.
            markup = markup.replace(tagged, f'class="entity {flag}" {needle}')
        else:
            # Markup that has no class attribute yet: add class and title.
            markup = markup.replace(
                needle, f'{needle} class="entity {flag}" title="{safe_ref}"'
            )
    return markup


def build_interactive_html(fr_paras, de_paras, mismatches, output_file):
    """Write a side-by-side HTML report of French and German paragraphs.

    Entities whose ref occurs only in the French paragraph get the extra
    class ``missing`` in the French column; entities whose ref occurs only in
    the German paragraph get the extra class ``extra`` in the German column.
    A small script highlights all entities sharing a ``data-ref`` on hover.

    Args:
        fr_paras: dict mapping paragraph id to French paragraph markup.
        de_paras: dict mapping paragraph id to German paragraph markup.
        mismatches: dict mapping paragraph id to a dict with the set-valued
            keys ``missing_pers``, ``missing_place``, ``extra_pers`` and
            ``extra_place``.
        output_file: Path of the HTML file to write (UTF-8).

    Note:
        Only the paragraph ids of ``fr_paras`` are rendered; a paragraph that
        exists solely in ``de_paras`` does not appear in the report. The
        paragraph markup is inserted verbatim (it is expected to be markup),
        only the paragraph id is HTML-escaped.
    """
    style = """
    <style>
    body { font-family: sans-serif; padding: 1em; }
    .container { display: flex; gap: 40px; }
    .col { width: 48%; vertical-align: top; }
    .entity { background-color: #d0f0d0; padding: 2px; border-radius: 4px; transition: background-color 0.2s ease, outline 0.2s ease; }
    .missing { background-color: #fdd !important; }
    .extra { background-color: #ddf !important; }
    .entity:hover { cursor: pointer; }
    h2 { border-bottom: 1px solid #ccc; }
    </style>
    """

    script = """
    <script>
    document.querySelectorAll(".entity").forEach(el => {
      el.addEventListener("mouseenter", () => {
        const ref = el.dataset.ref;
        const matches = document.querySelectorAll(`.entity[data-ref="${ref}"]`);
        matches.forEach(match => {
          match.style.outline = "2px solid green";
          match.style.backgroundColor = "#d0ffd0"; // light green
        });
      });

      el.addEventListener("mouseleave", () => {
        const ref = el.dataset.ref;
        const matches = document.querySelectorAll(`.entity[data-ref="${ref}"]`);
        matches.forEach(match => {
          match.style.outline = "";
          if (match.classList.contains("missing")) {
            match.style.backgroundColor = "#fdd";
          } else if (match.classList.contains("extra")) {
            match.style.backgroundColor = "#ddf";
          } else {
            match.style.backgroundColor = "#d0f0d0";
          }
        });
      });
    });
    </script>
    """

    html = ["<html><head><meta charset='utf-8'>", style, "</head><body>"]
    html.append("<h1>Interactive TEI Entity Comparison</h1>")

    for pid in sorted(fr_paras):
        fr = fr_paras.get(pid, "")
        de = de_paras.get(pid, "")
        miss = mismatches.get(pid, {})

        # "Missing" refs exist only on the French side, so that is the only
        # place where they can be highlighted; "extra" refs only in German.
        missing_refs = miss.get("missing_pers", set()) | miss.get("missing_place", set())
        extra_refs = miss.get("extra_pers", set()) | miss.get("extra_place", set())
        fr = _flag_entities(fr, missing_refs, "missing")
        de = _flag_entities(de, extra_refs, "extra")

        html.append(f"<h2>Paragraph {escape(pid)}</h2>")
        html.append(f"<div class='container paragraph' data-pid='{escape(pid)}'>")
        html.append(f"<div class='col'><b>🇫🇷 French:</b><br>{fr}</div>")
        html.append(f"<div class='col'><b>🇩🇪 German:</b><br>{de}</div>")
        html.append("</div><hr>")

    html.append(script)
    html.append("</body></html>")

    with open(output_file, "w", encoding="utf-8") as f:
        f.write("\n".join(html))

    print(f"✅ Interactive report saved to: {output_file}")

# === MAIN ===
if __name__ == "__main__":
    # Read the French source and the annotated German translation as UTF-8 text.
    with open("input/french_5-105.xml", "r", encoding="utf-8") as f:
        fr_xml = f.read()
    with open("output/gpt/german_annotated_5-105.xml", "r", encoding="utf-8") as f:
        de_xml = f.read()

    # Each document may declare its own default namespace (or none at all).
    fr_ns, de_ns = detect_namespace(fr_xml), detect_namespace(de_xml)

    # Per-paragraph markup with entities wrapped for display.
    fr_paragraphs = extract_refs_and_wrap(fr_xml, lang="fr", namespace_uri=fr_ns)
    de_paragraphs = extract_refs_and_wrap(de_xml, lang="de", namespace_uri=de_ns)

    # Per-paragraph differences between the two sets of entity refs.
    mismatches = compare_refs_only(fr_xml, de_xml, fr_ns, de_ns)

    report_path = "html/tei_entity_alignment_gpt.html"
    build_interactive_html(fr_paragraphs, de_paragraphs, mismatches, output_file=report_path)


✅ Interactive report saved to: html/tei_entity_alignment_gpt_test.html
