# Load Libraries

In [None]:
!pip install --upgrade httpx

In [1]:
from lxml import etree
import unicodedata
import re
from difflib import SequenceMatcher
from collections import defaultdict, Counter
import os
import anthropic
import openai

# Annotate with LLM

This Python script automatically transfers named entity annotations from a French TEI/XML document to its corresponding German translation using OpenAI’s GPT-4o model (`gpt-4o`). It aligns the two documents paragraph by paragraph and annotates the German text with TEI tags such as <persName>, <placeName>, <bibl> and <date>, and also transfers the <anchor> facsimile pointers, based on the French source.

In [11]:
import os
import openai
from lxml import etree

# === STEP 1: OpenAI Setup ===
# SECURITY: the API key is read from the OPENAI_API_KEY environment variable
# instead of being written into the notebook. A key that has ever been stored
# in plain text in a shared file must be treated as leaked and revoked.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it before running this cell."
    )
client = openai.OpenAI(api_key=api_key)

# Input documents, output file, and the folder for model output that could
# not be merged into the German tree.
# NOTE(review): the French input is named "5-105" while the German input and
# the validation cells use "5-50" -- confirm that this is the intended file.
french_path = "input/french_5-105.xml"
german_path = "input/german_5-50.xml"
output_path = "output/gpt/all_annotations/german_annotated_5-50.xml"
debug_folder = "output/gpt/all_annotations/debug"
# The debug folder lives inside the output folder, so this call creates both.
os.makedirs(debug_folder, exist_ok=True)

ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

# === Load TEI Documents ===
french_tree = etree.parse(french_path)
german_tree = etree.parse(german_path)

# Every TEI <p> below <text>, in document order.
french_paragraphs = french_tree.xpath('//tei:text//tei:p', namespaces=ns)
german_paragraphs = german_tree.xpath('//tei:text//tei:p', namespaces=ns)

def clean_gpt_output(text):
    """Strip surrounding whitespace and an enclosing Markdown code fence.

    The fence lines are removed only when the trimmed text starts with a
    fence line AND its last line is a fence line as well; otherwise the
    trimmed text is returned unchanged.
    """
    trimmed = text.strip()
    if not trimmed.startswith("```"):
        return trimmed

    rows = trimmed.splitlines()
    fenced_on_both_ends = rows[0].startswith("```") and rows[-1].startswith("```")
    if not fenced_on_both_ends:
        return trimmed

    # Drop the opening and closing fence lines, keep what is in between.
    inner = "\n".join(rows[1:-1])
    return inner.strip()


# === GPT Annotation Function ===
def annotate_german_with_gpt(french_xml, german_text):
    """Ask the chat model to copy the French TEI annotations onto the German text.

    Args:
        french_xml: Serialized French paragraph including its TEI markup.
        german_text: Plain text of the corresponding German paragraph.

    Returns:
        The model's reply with surrounding whitespace removed.
    """
    # Fixed task description sent with every paragraph.
    task_description = (
        "You are an expert in TEI/XML and named entity annotation.\n\n"
        "Given a paragraph in French (with named entities encoded using TEI tags like <persName>, <placeName> and <bibl>) "
        "and its German translation (not annotated), annotate exactly the same entities in the German text "
        "by wrapping them in the corresponding TEI tags. Encode persons, places, works (<bibl>), dates (<date>) and transfer also the pointer to the facsimile (<anchor corresp=\"#facs_142\" xml:id=\"normalized_facs_142\"/>). "
        "ONLY encode these entities and structural things and nothing else; also map the correct ID from the @ref-Attribute. Sometimes the @ref can contain multiple IDs. It is important to not encode additional entities. "
        "Do not change the German text itself.\n\n"
        "Return only a single valid TEI XML element, starting with <p> and ending with </p>. "
        "Do not include any commentary, code blocks, or explanations. The result must be valid XML.\n\n"
    )
    # Paragraph-specific part of the prompt.
    paragraph_pair = (
        f"French TEI/XML:\n{french_xml}\n\n"
        f"German Text:\n{german_text}\n\n"
        "Output (annotated German <p> element only):"
    )

    system_message = {
        "role": "system",
        "content": "You are a TEI/XML expert aligning named entity annotations and structural annotations from a French source to a German translation.",
    }
    user_message = {"role": "user", "content": task_description + paragraph_pair}

    completion = client.chat.completions.create(
        model="gpt-4o",  # or gpt-o4-mini if you're using that one intentionally
        messages=[system_message, user_message],
        temperature=0,
        max_tokens=4000,
    )

    answer = completion.choices[0].message.content
    return answer.strip()

# === Annotate Paragraphs ===
XML_ID_ATTR = '{http://www.w3.org/XML/1998/namespace}id'


def is_valid_xml_snippet(snippet):
    """Return True if *snippet* looks like one element wrapped in <p>...</p>.

    This is a cheap textual check of the outer wrapper only; well-formedness
    is verified afterwards by actually parsing the snippet.
    """
    stripped = snippet.strip()
    opens_with_p = stripped.startswith("<p") and stripped[2:3] in (">", " ", "\t", "\r", "\n")
    return opens_with_p and stripped.endswith("</p>")


def _save_debug_output(xml_id, content):
    """Write rejected model output to the debug folder for manual inspection."""
    with open(os.path.join(debug_folder, f"{xml_id}.txt"), "w", encoding="utf-8") as f:
        f.write(content)


failures = []

# zip() pairs paragraphs by position and stops at the shorter list, so only
# this many paragraphs are actually processed.
pair_count = min(len(french_paragraphs), len(german_paragraphs))
if len(french_paragraphs) != len(german_paragraphs):
    print(
        f"⚠️ Paragraph counts differ (FR: {len(french_paragraphs)}, "
        f"DE: {len(german_paragraphs)}); only the first {pair_count} pairs are aligned."
    )

for i, (fr_p, de_p) in enumerate(zip(french_paragraphs, german_paragraphs)):
    xml_id = de_p.attrib.get(XML_ID_ATTR, f'p{i}')
    print(f"🔍 Aligning paragraph {xml_id}...")

    fr_xml = etree.tostring(fr_p, encoding="unicode")
    de_text = "".join(de_p.itertext()).strip()

    # Only the API call is guarded by the broad handler: a failing request
    # must not abort the whole run, but programming errors in the steps
    # below should not be reported as "GPT call failed".
    try:
        annotated_de_raw = annotate_german_with_gpt(fr_xml, de_text)
    except Exception as e:
        print(f"❌ GPT call failed for {xml_id}: {e}")
        failures.append((xml_id, str(e)))
        continue

    annotated_de = clean_gpt_output(annotated_de_raw)

    if not is_valid_xml_snippet(annotated_de):
        print(f"⚠️ Skipping invalid XML for {xml_id}: Not wrapped in <p>")
        failures.append((xml_id, "Output not wrapped in <p>"))
        _save_debug_output(xml_id, annotated_de)
        continue

    try:
        new_p = etree.fromstring(annotated_de.encode("utf-8"))
    except etree.XMLSyntaxError as parse_error:
        print(f"⚠️ XML parsing failed for {xml_id}: {parse_error}")
        failures.append((xml_id, f"Invalid XML: {parse_error}"))
        _save_debug_output(xml_id, annotated_de)
        continue

    # Keep the paragraph addressable: the validation cells select paragraphs
    # by @xml:id, so restore it if the model dropped it.
    if XML_ID_ATTR in de_p.attrib and XML_ID_ATTR not in new_p.attrib:
        new_p.set(XML_ID_ATTR, de_p.attrib[XML_ID_ATTR])
    # The tail (text/whitespace after </p>) belongs to the old element and
    # would leave the tree together with it, so carry it over.
    new_p.tail = de_p.tail
    de_p.getparent().replace(de_p, new_p)

# === Save Annotated German TEI ===
german_tree.write(output_path, encoding="utf-8", xml_declaration=True, pretty_print=True)
print(f"\n✅ Annotated German TEI saved to: {output_path}")

# === Report Failures ===
if failures:
    print("\nThese paragraphs failed:")
    for fid, reason in failures:
        print(f"- {fid}: {reason}")

print(f"\n✅ Finished: {pair_count - len(failures)} succeeded, {len(failures)} failed out of {pair_count} paragraphs.")


🔍 Aligning paragraph p0...
🔍 Aligning paragraph p1...
🔍 Aligning paragraph p2...
🔍 Aligning paragraph p3...
🔍 Aligning paragraph p4...
🔍 Aligning paragraph p5...
🔍 Aligning paragraph p6...
🔍 Aligning paragraph p7...
🔍 Aligning paragraph p8...
🔍 Aligning paragraph p9...
🔍 Aligning paragraph p10...
🔍 Aligning paragraph p11...
🔍 Aligning paragraph p12...
🔍 Aligning paragraph p13...
🔍 Aligning paragraph p14...
🔍 Aligning paragraph p15...
🔍 Aligning paragraph p16...
🔍 Aligning paragraph p17...
🔍 Aligning paragraph p18...
🔍 Aligning paragraph p19...
🔍 Aligning paragraph p20...
🔍 Aligning paragraph p21...
🔍 Aligning paragraph p22...
🔍 Aligning paragraph p23...
🔍 Aligning paragraph p24...
🔍 Aligning paragraph p25...
🔍 Aligning paragraph p26...
🔍 Aligning paragraph p27...
🔍 Aligning paragraph p28...
🔍 Aligning paragraph p29...
🔍 Aligning paragraph p30...
🔍 Aligning paragraph p31...
🔍 Aligning paragraph p32...
🔍 Aligning paragraph p33...
🔍 Aligning paragraph p34...
🔍 Aligning paragraph p35...
🔍 

# Validate with LLM - extra and missing annotations

This script compares named entity annotations between two TEI/XML documents: a French source and an annotated German translation. It checks for discrepancies in the @ref attributes of <persName> and <placeName> elements, reporting mismatches, missing or extra references, and elements with empty @ref values.

The comparison is done paragraph by paragraph, using their @xml:id attributes.

## Persons and places

In [12]:
# === Files and namespace ===
# French source (reference annotations) and the German file to be checked.
french_file = "input/french_5-50.xml"
german_file = "output/gpt/all_annotations/german_annotated_5-50.xml"
# Prefix map used by all XPath queries in this cell.
ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

# === Extract data ===
def extract_entity_refs_and_text(tree):
    """Collect the @ref pointers of <persName>/<placeName> per paragraph.

    Only <p> elements that carry an @xml:id are considered. A @ref value may
    hold several whitespace-separated pointers; each pointer is recorded on
    its own so that token order and spacing inside @ref cannot cause false
    mismatches between two documents.

    Args:
        tree: Parsed TEI document.

    Returns:
        A 4-tuple ``(entity_map, text_map, empty_ref_log, valid_entity_counter)``:
        ``entity_map`` maps paragraph id -> set of pointers,
        ``text_map`` maps paragraph id -> stripped paragraph text,
        ``empty_ref_log`` lists elements whose @ref is missing or blank,
        ``valid_entity_counter`` counts elements with a non-blank @ref per tag.
    """
    entity_map = {}
    text_map = {}
    empty_ref_log = []
    valid_entity_counter = Counter()
    for p in tree.xpath('//tei:p[@xml:id]', namespaces=ns):
        xml_id = p.attrib.get('{http://www.w3.org/XML/1998/namespace}id')
        refs = set()
        for e in p.xpath('.//tei:persName|.//tei:placeName', namespaces=ns):
            # Local name without the "{namespace}" prefix.
            tag = e.tag.split('}')[-1]
            # Missing, empty and whitespace-only @ref all yield no pointers.
            pointers = (e.attrib.get('ref') or "").split()
            if not pointers:
                empty_ref_log.append({
                    'paragraph': xml_id,
                    'element': tag,
                    'text': "".join(e.itertext()).strip()
                })
            else:
                refs.update(pointers)
                valid_entity_counter[tag] += 1
        entity_map[xml_id] = refs
        text_map[xml_id] = "".join(p.itertext()).strip()
    return entity_map, text_map, empty_ref_log, valid_entity_counter

# === Parse both documents ===
french_tree, german_tree = etree.parse(french_file), etree.parse(german_file)

(french_entities, french_texts,
 french_empty_refs, french_entity_count) = extract_entity_refs_and_text(french_tree)
german_entities, german_texts, german_empty_refs, _ = extract_entity_refs_and_text(german_tree)

# === Compare refs ===
# Only paragraphs whose id occurs in both documents are compared.
common_ids = french_entities.keys() & german_entities.keys()

discrepancies = []
for para_id in sorted(common_ids):
    fr_refs, de_refs = french_entities[para_id], german_entities[para_id]
    extra = sorted(de_refs - fr_refs)
    missing = sorted(fr_refs - de_refs)
    if not (extra or missing):
        continue
    discrepancies.append({
        'paragraph': para_id,
        'extra_in_german': extra,
        'missing_in_german': missing,
        'french_text': french_texts.get(para_id, ""),
        'german_text': german_texts.get(para_id, "")
    })

# Number of paragraphs (not refs) affected by each kind of difference.
count_missing = sum(1 for entry in discrepancies if entry['missing_in_german'])
count_extra = sum(1 for entry in discrepancies if entry['extra_in_german'])

# === Summary figures ===
total_paragraphs = len(common_ids)
total_discrepant = len(discrepancies)
total_french_empty = len(french_empty_refs)
total_german_empty = len(german_empty_refs)

# Count empty @ref entries per element type
def summarize_empty_refs(ref_list):
    """Return a Counter of the 'element' values found in *ref_list*."""
    return Counter(entry['element'] for entry in ref_list)

# === Summary report ===
summary_lines = [
    "=== 🔍 Annotation Summary Report ===",
    f"Total aligned paragraphs:         {total_paragraphs}",
    f"Paragraphs with missing refs:     {count_missing}",
    f"Paragraphs with extra refs:       {count_extra}",
    f"Paragraphs with any discrepancies:{total_discrepant}",
    "",
    "Valid French entities with @ref:",
    f"  👤 persName:  {french_entity_count['persName']}",
    f"  📍 placeName: {french_entity_count['placeName']}",
    "",
    f"Empty @ref attributes (FR):       {total_french_empty}",
    f"Empty @ref attributes (DE):       {total_german_empty}",
    f"Empty @ref breakdown (FR): {summarize_empty_refs(french_empty_refs)}",
    f"Empty @ref breakdown (DE): {summarize_empty_refs(german_empty_refs)}",
    "====================================\n",
]
print("\n".join(summary_lines))

# === Discrepancy details ===
if not discrepancies:
    print("✅ No discrepancies in entity annotations.")
else:
    print("=== Entity Annotation Discrepancies ===")
    for d in discrepancies:
        print(f"\n📌 Paragraph {d['paragraph']}:")
        if d['extra_in_german']:
            print(f"  ➕ Extra in German:  {d['extra_in_german']}")
        if d['missing_in_german']:
            print(f"  ❌ Missing in German: {d['missing_in_german']}")
        # Show at most the first 400 characters of each paragraph for context.
        for heading, text_key in (("  🇫🇷 French text:\n ", 'french_text'),
                                  ("  🇩🇪 German text:\n ", 'german_text')):
            print(heading, d[text_key][:400], "...\n")

# === Empty refs ===
if not (french_empty_refs or german_empty_refs):
    print("✅ No empty @ref attributes found.")
else:
    print("\n=== Elements with Empty @ref Attributes ===")
    for language, entries in (("FR", french_empty_refs), ("DE", german_empty_refs)):
        for item in entries:
            print(f"[{language}] Paragraph {item['paragraph']} — <{item['element']}>: '{item['text']}' has no @ref")


=== 🔍 Annotation Summary Report ===
Total aligned paragraphs:         46
Paragraphs with missing refs:     7
Paragraphs with extra refs:       0
Paragraphs with any discrepancies:7

Valid French entities with @ref:
  👤 persName:  366
  📍 placeName: 48

Empty @ref attributes (FR):       0
Empty @ref attributes (DE):       1
Empty @ref breakdown (FR): Counter()
Empty @ref breakdown (DE): Counter({'persName': 1})

=== Entity Annotation Discrepancies ===

📌 Paragraph P.14:
  ❌ Missing in German: ['#P403']
  🇫🇷 French text:
  Cette nombreuse compagnie ne s’arrêta pas longtemps. Ils
               partirent et j’en restai inconsolable. Je voulais me former sur le modèle de
                  ma belle-mère. Je voulais avoir de l’esprit et des
               airs de cour, et je ne savais comment m’y prendre pour m’en acquérir. Ah ! parents
               trop indulgents que ne démêliez-vous mes sentiments et ne mettiez-vou ...

  🇩🇪 German text:
  Die Gesellschaft verweilte nicht lang. Sie reis

## Works, dates, relations, anchors

In [13]:
from lxml import etree
from collections import Counter

# === Files and namespace ===
# French source (reference annotations) and the German file to be checked.
french_file = "input/french_5-50.xml"
german_file = "output/gpt/all_annotations/german_annotated_5-50.xml"
# Prefix map used by all XPath queries in this cell.
ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

# === Extract data ===
def extract_meta_refs_and_text(tree):
    """Collect one identifying value per <bibl>/<seg>/<anchor>/<date> per paragraph.

    For each element the first non-empty attribute among @ana, @n, @when,
    @when-iso, @value and @corresp (in that order) is used, and only the
    first whitespace-separated token of it is recorded.

    Returns:
        ``(entity_map, text_map, empty_ref_log, valid_entity_counter)`` where
        ``entity_map`` maps paragraph id -> set of recorded tokens,
        ``text_map`` maps paragraph id -> stripped paragraph text,
        ``empty_ref_log`` lists elements without a usable attribute value and
        ``valid_entity_counter`` counts elements with a usable value per tag.
    """
    id_key = '{http://www.w3.org/XML/1998/namespace}id'
    attribute_order = ('ana', 'n', 'when', 'when-iso', 'value', 'corresp')

    entity_map, text_map = {}, {}
    empty_ref_log = []
    valid_entity_counter = Counter()

    for para in tree.xpath('//tei:p[@xml:id]', namespaces=ns):
        xml_id = para.attrib.get(id_key)
        found = set()
        for node in para.xpath('.//tei:bibl | .//tei:seg | .//tei:anchor | .//tei:date', namespaces=ns):
            local_name = node.tag.split('}')[-1]
            values = (node.attrib.get(name) for name in attribute_order)
            chosen = next((value for value in values if value), None)
            node_text = "".join(node.itertext()).strip()
            tokens = chosen.split() if chosen else []
            if tokens:
                found.add(tokens[0])
                valid_entity_counter[local_name] += 1
            else:
                empty_ref_log.append({
                    'paragraph': xml_id,
                    'element': local_name,
                    'text': node_text
                })
        entity_map[xml_id] = found
        text_map[xml_id] = "".join(para.itertext()).strip()
    return entity_map, text_map, empty_ref_log, valid_entity_counter

# === Parse both documents ===
french_tree, german_tree = etree.parse(french_file), etree.parse(german_file)

(french_entities, french_texts,
 french_empty_refs, french_entity_count) = extract_meta_refs_and_text(french_tree)
(german_entities, german_texts,
 german_empty_refs, german_entity_count) = extract_meta_refs_and_text(german_tree)

# === Compare refs ===
# Only paragraphs whose id occurs in both documents are compared.
common_ids = french_entities.keys() & german_entities.keys()

discrepancies = []
for para_id in sorted(common_ids):
    fr_refs, de_refs = french_entities[para_id], german_entities[para_id]
    extra = sorted(de_refs - fr_refs)
    missing = sorted(fr_refs - de_refs)
    if not (extra or missing):
        continue
    discrepancies.append({
        'paragraph': para_id,
        'extra_in_german': extra,
        'missing_in_german': missing,
        'french_text': french_texts.get(para_id, ""),
        'german_text': german_texts.get(para_id, "")
    })

# Number of paragraphs (not refs) affected by each kind of difference.
count_missing = sum(1 for entry in discrepancies if entry['missing_in_german'])
count_extra = sum(1 for entry in discrepancies if entry['extra_in_german'])

# === Summary figures ===
total_paragraphs = len(common_ids)
total_discrepant = len(discrepancies)
total_french_empty = len(french_empty_refs)
total_german_empty = len(german_empty_refs)

# Count entries without a usable attribute value per element type
def summarize_empty_refs(ref_list):
    """Return a Counter of the 'element' values found in *ref_list*."""
    return Counter(entry['element'] for entry in ref_list)

# === Report ===
print("\n".join([
    "=== 🔍 Meta Entity Summary Report ===",
    f"Total aligned paragraphs:         {total_paragraphs}",
    f"Paragraphs with missing refs:     {count_missing}",
    f"Paragraphs with extra refs:       {count_extra}",
    f"Paragraphs with any discrepancies:{total_discrepant}",
    "",
]))
# Per-tag element counts, first for the French then for the German document.
for language, counts in (("French", french_entity_count), ("German", german_entity_count)):
    print(f"Valid {language} meta entities:")
    for tag in ['bibl', 'anchor', 'seg', 'date']:
        print(f"  📄 {tag}: {counts[tag]}")
    print()
print("\n".join([
    f"Empty @ana/@n/@when attributes (FR):    {total_french_empty}",
    f"Empty @ana/@n/@when attributes (DE):    {total_german_empty}",
    f"Empty ref breakdown (FR): {summarize_empty_refs(french_empty_refs)}",
    f"Empty ref breakdown (DE): {summarize_empty_refs(german_empty_refs)}",
    "====================================\n",
]))

# === Discrepancy details ===
if not discrepancies:
    print("✅ No discrepancies in meta annotations.")
else:
    print("=== Meta Element Annotation Discrepancies ===")
    for d in discrepancies:
        print(f"\n📌 Paragraph {d['paragraph']}:")
        if d['extra_in_german']:
            print(f"  ➕ Extra in German:  {d['extra_in_german']}")
        if d['missing_in_german']:
            print(f"  ❌ Missing in German: {d['missing_in_german']}")
        # Show at most the first 400 characters of each paragraph for context.
        for heading, text_key in (("  🇫🇷 French text:\n ", 'french_text'),
                                  ("  🇩🇪 German text:\n ", 'german_text')):
            print(heading, d[text_key][:400], "...\n")

# === Empty references ===
if not (french_empty_refs or german_empty_refs):
    print("✅ No empty @ana/@n/@when attributes found.")
else:
    print("\n=== Elements with Empty @ref/@ana/@n/@when ===")
    for language, entries in (("FR", french_empty_refs), ("DE", german_empty_refs)):
        for item in entries:
            print(f"[{language}] Paragraph {item['paragraph']} — <{item['element']}>: '{item['text']}' has no @ana, @n or @when")


=== 🔍 Meta Entity Summary Report ===
Total aligned paragraphs:         46
Paragraphs with missing refs:     4
Paragraphs with extra refs:       8
Paragraphs with any discrepancies:11

Valid French meta entities:
  📄 bibl: 6
  📄 anchor: 0
  📄 seg: 6
  📄 date: 2

Valid German meta entities:
  📄 bibl: 6
  📄 anchor: 11
  📄 seg: 1
  📄 date: 2

Empty @ana/@n/@when attributes (FR):    0
Empty @ana/@n/@when attributes (DE):    1
Empty ref breakdown (FR): Counter()
Empty ref breakdown (DE): Counter({'date': 1})

=== Meta Element Annotation Discrepancies ===

📌 Paragraph P.10:
  ➕ Extra in German:  ['#facs_12']
  🇫🇷 French text:
  Mon père s’était remarié à 
                  une très belle femme
                de Berlin, d’une des plus illustres familles
               et des premières en charges et en faveur. Elle n’avait que dix-huit ans lorsque
                  mon père
                l’épousa. Il venait de temps en
               temps chez ma tante et l’on voulait m’inspirer du
        

# Inspect the annotated german TEI/XML-File

This script analyzes the automatically annotated TEI/XML document to identify all unique XML element types that appear within \<p> (paragraph) elements. It is useful for understanding the variety of tags used in the annotated text, particularly when verifying or analyzing entity annotations, because LLMs sometimes use different tags than expected (for example \<lem> and \<rdg> instead of two \<rdg> inside an \<app>).

In [14]:
# Load the XML content
# NOTE(review): this reads the "claude" output folder, whereas the annotation
# cell in this notebook writes to "output/gpt/..." -- confirm the intended file.
with open("output/claude/all_annotations/german_annotated_5-50.xml", "rb") as f:
    tree = etree.parse(f)

# TEI namespace (adjust if needed)
namespaces = {'tei': 'http://www.tei-c.org/ns/1.0'}

# Find all <p> elements
p_elements = tree.xpath("//tei:p", namespaces=namespaces)

# Collect all unique element tags used inside <p> elements
unique_tags = set()

for p in p_elements:
    # Restrict the walk to real elements: comments and processing
    # instructions are descendants too, but their .tag is not a string,
    # so calling .split() on it would raise.
    for el in p.iterdescendants(tag=etree.Element):
        # Remove namespace prefix if present
        tag = el.tag.split('}')[-1]
        unique_tags.add(tag)

# Print results
print("Unique elements inside <p> elements:")
for tag in sorted(unique_tags):
    print(f"- <{tag}>")


Unique elements inside <p> elements:
- <app>
- <bibl>
- <date>
- <milestone>
- <persName>
- <placeName>
- <rdg>
- <seg>


# Compare input and output text without annotations

This script compares the textual content of paragraphs between the plain original German TEI file and its annotated counterpart (with named entities wrapped in TEI tags). It checks whether the annotations introduced unintended changes to the text and identifies missing or extra words in the annotated version.

In [10]:
# === Settings ===
# Plain German input and the annotated German file whose text is compared with it.
# NOTE(review): this reads the "claude" output folder, whereas the annotation
# cell in this notebook writes to "output/gpt/..." -- confirm the intended file.
original_file = "input/german_5-50.xml"
annotated_file = "output/claude/all_annotations/german_annotated_5-50.xml"
ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
# Local names of the TEI elements that are unwrapped before the text is compared.
tags_to_strip = {"persName", "placeName", "bibl"}

# === Normalization function ===
def normalize(text):
    """Return *text* in a canonical form for comparison.

    Applies NFKC normalization, maps no-break spaces to spaces and
    typographic quotes to ASCII quotes, removes page markers of the form
    ``[P.<digits>]`` and collapses every run of whitespace to one space.
    """
    replacements = str.maketrans({
        "\u00A0": " ",
        "“": '"',
        "”": '"',
        "„": '"',
        "’": "'",
        "‘": "'",
    })
    folded = unicodedata.normalize("NFKC", text).translate(replacements)
    without_markers = re.sub(r"\[P\.\d+\]", "", folded)  # drop markers like [P.xx]
    return " ".join(without_markers.split())

# === Analyze differences ===
def analyze_diff(original_text, annotated_text):
    """Return the words that differ between two texts after normalization.

    Both texts are normalized and split on whitespace, then aligned with
    ``SequenceMatcher``.

    Returns:
        ``(missing, extra)``: words present only in the original text and
        words present only in the annotated text, in order of occurrence.
    """
    source_words = normalize(original_text).split()
    target_words = normalize(annotated_text).split()

    missing, extra = [], []
    opcodes = SequenceMatcher(None, source_words, target_words).get_opcodes()
    for operation, src_start, src_end, dst_start, dst_end in opcodes:
        # A "replace" contributes to both lists; "equal" contributes to neither.
        if operation in ("delete", "replace"):
            missing.extend(source_words[src_start:src_end])
        if operation in ("insert", "replace"):
            extra.extend(target_words[dst_start:dst_end])

    return missing, extra

# === Load TEI files ===
original_tree, annotated_tree = etree.parse(original_file), etree.parse(annotated_file)
original_paragraphs = original_tree.xpath('//tei:p', namespaces=ns)
annotated_paragraphs = annotated_tree.xpath('//tei:p', namespaces=ns)

# === Run the analysis ===
# Fully qualified names of the elements that are unwrapped before comparing.
qualified_tags = [f"{{{ns['tei']}}}{tag}" for tag in tags_to_strip]
results = []

# Paragraphs are paired by position; numbering in the report starts at 1.
paragraph_pairs = zip(original_paragraphs, annotated_paragraphs)
for i, (orig_p, anno_p) in enumerate(paragraph_pairs, start=1):
    orig_text = "".join(orig_p.itertext())

    # Work on a re-parsed copy so the loaded tree itself is not modified.
    anno_copy = etree.fromstring(etree.tostring(anno_p))
    etree.strip_tags(anno_copy, *qualified_tags)
    anno_text = "".join(anno_copy.itertext())

    # Only paragraphs whose normalized texts differ are recorded.
    if normalize(orig_text) == normalize(anno_text):
        continue
    missing, extra = analyze_diff(orig_text, anno_text)
    results.append({
        "paragraph": i,
        "missing_in_annotated": missing,
        "extra_in_annotated": extra
    })

# === Show results ===
if not results:
    print("✅ Keine inhaltlichen Unterschiede gefunden.")
else:
    print("=== Detaillierte Unterschiede ===\n")
    # Heading and result key for each of the two word lists.
    sections = (
        ("  ❌ Fehlende Wörter:", "missing_in_annotated"),
        ("  ➕ Zusätzliche Wörter:", "extra_in_annotated"),
    )
    for r in results:
        print(f"📍 Paragraph {r['paragraph']}")
        for heading, key in sections:
            words = r[key]
            if words:
                print(heading)
                print("   ", ", ".join(words))
        print()


=== Detaillierte Unterschiede ===

📍 Paragraph 7
  ❌ Fehlende Wörter:
    Jesus,
  ➕ Zusätzliche Wörter:
    Jesus, Sauveur, ,

📍 Paragraph 37
  ❌ Fehlende Wörter:
    beschenken, .
  ➕ Zusätzliche Wörter:
    beschenken.

📍 Paragraph 46
  ❌ Fehlende Wörter:
    glauben.Ich
  ➕ Zusätzliche Wörter:
    glauben., Ich, von, Schwerin

