# Load Libraries

In [None]:
!pip install --upgrade httpx

In [2]:
from lxml import etree
import unicodedata
import re
from difflib import SequenceMatcher
from collections import defaultdict, Counter
import os
import anthropic

# Annotate with LLM

This Python script automatically transfers named entity annotations from a French TEI/XML document to its corresponding German translation using Anthropic’s Claude LLM (claude-sonnet-4-20250514). It aligns paragraph-by-paragraph and annotates the German text with TEI entity tags \<persName> and \<placeName> based on the French source.

In [3]:
import openai
from lxml import etree

# === STEP 1: Claude Setup ===
# Read the API key from the environment so that no credential is ever stored in
# the notebook itself (notebooks are routinely shared and committed).
api_key = os.environ.get("ANTHROPIC_API_KEY")
if not api_key:
    # Fail fast: without a key every single request below would fail anyway.
    raise RuntimeError(
        "No API key found. Set the ANTHROPIC_API_KEY environment variable before running this cell."
    )
client = anthropic.Anthropic(api_key=api_key)


# Input documents, output document and the folder that receives rejected model responses.
french_path = "input/french_5-105.xml"
german_path = "input/german_5-105.xml"
output_path = "output/claude/german_annotated_5-105.xml"
debug_folder = "output/claude/debug"
# Creating the debug folder also creates its parent, the folder output_path points into.
os.makedirs(debug_folder, exist_ok=True)

# Prefix-to-URI map used by every XPath query below.
ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

# === Load TEI Documents ===
french_tree = etree.parse(french_path)
german_tree = etree.parse(german_path)

# All <p> elements below <text>, in document order; the two lists are later
# paired by position.
french_paragraphs = french_tree.xpath('//tei:text//tei:p', namespaces=ns)
german_paragraphs = german_tree.xpath('//tei:text//tei:p', namespaces=ns)

# === XML sanity check ===
def is_valid_xml_snippet(text):
    """Cheap textual pre-check that ``text`` is wrapped in a single ``<p>`` element.

    This does not parse the XML; it only verifies that, after stripping
    surrounding whitespace, the string opens with a ``<p`` start tag and closes
    with ``</p>``.

    Args:
        text: The raw string returned by the model.

    Returns:
        True if the string starts with a ``<p>`` start tag (with or without
        attributes) and ends with ``</p>``, otherwise False.
    """
    text = text.strip()
    # The tag name must be exactly "p": a bare startswith("<p") would also accept
    # other elements whose name merely begins with "p" (<persName>, <placeName>, <pb/>).
    opens_with_p = re.match(r"<p[\s>]", text) is not None
    return opens_with_p and text.endswith("</p>")

# === Claude Annotation Function ===
def annotate_german_with_claude(french_xml, german_text):
    """Ask Claude to transfer the entity annotations of a French paragraph to its German translation.

    Uses the module-level ``client`` to send a single, non-streaming request.

    Args:
        french_xml: The serialized French ``<p>`` element including its TEI markup.
        german_text: The plain (un-annotated) text of the corresponding German paragraph.

    Returns:
        The text of the first content block of the model response, stripped of
        surrounding whitespace. The caller is responsible for validating it.

    Raises:
        Any exception raised by ``client.messages.create`` propagates unchanged;
        an empty response raises ``IndexError``.
    """
    # Adjacent string literals are concatenated verbatim, so every sentence must
    # carry its own trailing space; otherwise the model receives run-together
    # text such as "annotation.Given" or "correctID".
    prompt = (
        "You are an expert in TEI/XML and named entity annotation. "
        "Given a paragraph in French (with named entities encoded using the TEI tags <persName> and <placeName>) "
        "and its German translation (not annotated), annotate exactly the same entities in the German text "
        "by wrapping them in the corresponding TEI tags. Encode persons and places. "
        "ONLY encode these entities and structural things and nothing else; also map the correct "
        "ID from the @ref attribute. Sometimes the @ref can contain multiple IDs. "
        "It is important to not encode additional entities. "
        "Do not change the German text itself. "
        "Return only a single valid TEI XML element, starting with <p> and ending with </p>. "
        "Also return the correct xml:id of the paragraph. "
        "Do not include any commentary, code blocks, or explanations. The result must be valid XML.\n\n"
        f"French TEI/XML:\n{french_xml}\n\n"
        f"German Text:\n{german_text}\n\n"
        "Output (annotated German <p> element only):"
    )

    message = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=10000,
        # Temperature 0 keeps the annotation as repeatable as the API allows.
        temperature=0.0,
        system="You are a TEI/XML expert aligning named entity annotations from a French source to a German translation.",
        messages=[
            {"role": "user", "content": prompt}
        ]
    )

    return message.content[0].text.strip()

# === Annotate Paragraphs ===
def _save_debug_output(xml_id, raw_output):
    """Write a rejected model response to the debug folder for manual inspection."""
    try:
        with open(os.path.join(debug_folder, f"{xml_id}.txt"), "w", encoding="utf-8") as f:
            f.write(raw_output)
    except OSError as write_error:
        # A failed debug dump must not abort the whole annotation run.
        print(f"Could not write debug file for {xml_id}: {write_error}")


failures = []   # (paragraph id, reason) for every paragraph that could not be annotated
succeeded = 0   # paragraphs that were actually replaced in the German tree

# zip() below stops at the shorter list, so a count mismatch would otherwise go
# unnoticed: the pairing is purely positional and surplus paragraphs are skipped.
if len(french_paragraphs) != len(german_paragraphs):
    print(
        f"Warning: paragraph counts differ (French: {len(french_paragraphs)}, "
        f"German: {len(german_paragraphs)}). Paragraphs are paired by position; "
        "surplus paragraphs are left untouched."
    )

for i, (fr_p, de_p) in enumerate(zip(french_paragraphs, german_paragraphs)):
    xml_id = de_p.attrib.get('{http://www.w3.org/XML/1998/namespace}id', f'p{i}')
    print(f"🔍 Aligning paragraph {xml_id}...")

    # with_tail=False: the whitespace/text following </p> is not part of the paragraph.
    fr_xml = etree.tostring(fr_p, encoding="unicode", with_tail=False)
    # Only the plain text of the German paragraph is sent; any markup inside it is dropped.
    de_text = "".join(de_p.itertext()).strip()

    try:
        annotated_de = annotate_german_with_claude(fr_xml, de_text)

        if not is_valid_xml_snippet(annotated_de):
            print(f"Skipping invalid XML for {xml_id}: Not wrapped in <p>")
            failures.append((xml_id, "Output not wrapped in <p>"))
            _save_debug_output(xml_id, annotated_de)
            continue

        try:
            new_p = etree.fromstring(annotated_de.encode("utf-8"))
            # NOTE(review): if the model omits the TEI xmlns declaration, the parsed
            # element is in no namespace — spot-check the written file.
            # The German paragraph's own attributes (notably xml:id) are authoritative;
            # do not rely on the model echoing them back.
            for attr_name, attr_value in de_p.attrib.items():
                new_p.set(attr_name, attr_value)
            # replace() keeps the tail attached to the removed element, so carry it
            # over explicitly to keep the text/whitespace that followed the paragraph.
            new_p.tail = de_p.tail
            de_p.getparent().replace(de_p, new_p)
            succeeded += 1
        except Exception as parse_error:
            print(f"XML parsing failed for {xml_id}: {parse_error}")
            failures.append((xml_id, f"Invalid XML from Claude: {parse_error}"))
            _save_debug_output(xml_id, annotated_de)
            continue

    except Exception as e:
        # Per-paragraph safety net: record the failure and move on to the next paragraph.
        print(f"❌ Claude call failed for {xml_id}: {e}")
        failures.append((xml_id, str(e)))
        continue

# === Save Annotated German TEI ===
german_tree.write(output_path, encoding="utf-8", xml_declaration=True, pretty_print=True)
print(f"\nAnnotated German TEI saved to: {output_path}")

# === Report Failures ===
if failures:
    print("\nThese paragraphs failed:")
    for fid, reason in failures:
        print(f"- {fid}: {reason}")

# Report the paragraphs that were really replaced, not "total minus failures":
# paragraphs skipped because of a count mismatch are not successes.
print(f"\nFinished: {succeeded} succeeded, {len(failures)} failed out of {len(german_paragraphs)} paragraphs.")


🔍 Aligning paragraph P.5...
🔍 Aligning paragraph P.6...
🔍 Aligning paragraph P.7...
🔍 Aligning paragraph P.8...
🔍 Aligning paragraph P.9...
🔍 Aligning paragraph P.10...
🔍 Aligning paragraph P.11...
🔍 Aligning paragraph P.12...
🔍 Aligning paragraph P.13...
🔍 Aligning paragraph P.14...
🔍 Aligning paragraph P.15...
🔍 Aligning paragraph P.16...
🔍 Aligning paragraph P.17...
🔍 Aligning paragraph P.18...
🔍 Aligning paragraph P.19...
🔍 Aligning paragraph P.20...
🔍 Aligning paragraph P.21...
🔍 Aligning paragraph P.22...
🔍 Aligning paragraph P.23...
🔍 Aligning paragraph P.24...
🔍 Aligning paragraph P.25...
🔍 Aligning paragraph P.26...
🔍 Aligning paragraph P.27...
🔍 Aligning paragraph P.28...
🔍 Aligning paragraph P.29...
🔍 Aligning paragraph P.30...
🔍 Aligning paragraph P.31...
🔍 Aligning paragraph P.32...
🔍 Aligning paragraph P.33...
🔍 Aligning paragraph P.34...
🔍 Aligning paragraph P.35...
🔍 Aligning paragraph P.36...
🔍 Aligning paragraph P.37...
🔍 Aligning paragraph P.38...
🔍 Aligning paragrap

# Inspect the annotated German TEI/XML file

This script analyzes the automatically annotated TEI/XML document to identify all unique XML element types that appear within \<p> (paragraph) elements. It is useful for understanding the variety of tags used in the annotated text, particularly when verifying or analyzing entity annotations, because LLMs sometimes use different tags than the source – for example \<lem> and \<rdg> instead of two \<rdg> inside an \<app>.

In [7]:
# Load the XML content
with open("output/claude/german_annotated_5-105.xml", "rb") as f:
    tree = etree.parse(f)

# TEI namespace (adjust if needed)
namespaces = {'tei': 'http://www.tei-c.org/ns/1.0'}

# Find all <p> elements
p_elements = tree.xpath("//tei:p", namespaces=namespaces)

# Collect all unique element tags used inside <p> elements
unique_tags = set()

for p in p_elements:
    for el in p.iterdescendants():
        # iterdescendants() also yields comments and processing instructions,
        # whose .tag is not a string; skip them instead of crashing on them.
        if not isinstance(el.tag, str):
            continue
        # Local name without the "{namespace}" prefix
        unique_tags.add(etree.QName(el).localname)

# Print results
print("Unique elements inside <p> elements:")
for tag in sorted(unique_tags):
    print(f"- <{tag}>")


Unique elements inside <p> elements:
- <anchor>
- <bibl>
- <hi>
- <persName>
- <placeName>
