In [2]:
# Script to transfer entities from a Named-entity recognition (NER) processed file to the original file.
# This corrects unintended changes to the XML document that may have occurred during the TEI processing.
# The NER-processed file is referred to as the "edited file".

# pip install beautifulsoup4 lxml   (bs4 provides BeautifulSoup; lxml backs its 'xml' parser)
import os
import re
import copy
from bs4 import BeautifulSoup

# Directory paths used by the batch run at the end of this script
# NER-processed ("edited") TEI files whose entity tags are to be transferred
edited_dir = 'test_data/TEI-XML_NER/error/Amtsblatt/'
# Original TEI files, looked up under the same file name as the edited file
original_dir = 'test_data/TEI-XML/Amtsblatt/'
# Destination directory for the merged documents
output_dir = 'test_data/postprocessed/'


## Helper functions

# Helper function to check if an entity is nested within another entity
def is_nested(tag):
    """Tell whether ``tag`` lies inside a place, person or organisation entity.

    The ancestor chain is walked upwards with ``find_parent()``. ``True`` is
    returned as soon as an ancestor named ``placeName``, ``persName`` or
    ``orgName`` is met, ``False`` once the chain is exhausted.
    """
    entity_names = ('placeName', 'persName', 'orgName')

    ancestor = tag.find_parent()
    while ancestor:
        has_entity_name = ancestor.name in entity_names
        if has_entity_name and ancestor != tag:
            return True
        ancestor = ancestor.find_parent()

    return False


### Remove all nested entities from the list that are already contained within a parent entity
def filter_nested_entities(entities):
    """Return the entities that are not contained in another entity.

    The input order is kept; every element for which ``is_nested`` reports
    ``True`` is left out of the returned list.
    """
    return [candidate for candidate in entities if not is_nested(candidate)]


def remove_entity_tags_in_str(text):
    """Strip ``placeName``/``persName``/``orgName`` tags from ``text``.

    Only the start and end tags are dropped; the text they enclose stays
    where it is.
    """
    # One pattern per entity element; each covers the start tag (including
    # any attributes) as well as the end tag.
    entity_tag_patterns = (
        r'</?placeName[^>]*>',
        r'</?persName[^>]*>',
        r'</?orgName[^>]*>',
    )

    stripped = text
    for tag_pattern in entity_tag_patterns:
        stripped = re.sub(tag_pattern, '', stripped)

    return stripped


def get_text_for_lookbehind(entity, removeEntityTags = True):
    """Return the serialized text that directly precedes ``entity`` in its parent.

    The result serves as look-behind context when the entity's text is
    searched for in the original document.

    Args:
        entity: Parsed element offering ``find_parent()``; the parent must
            expose its child nodes as ``contents``.
        removeEntityTags: When true (default), ``placeName``/``persName``/
            ``orgName`` tags are stripped from the preceding text and its
            last 30 characters are returned. When false, the tags are kept
            and the last 20 characters are returned.

    Returns:
        The look-behind text, or ``""`` when the entity has no parent or is
        not found among the parent's children.
    """
    parent_element = entity.find_parent()
    if not parent_element:
        return ""

    # Collect the siblings in front of this very node. Identity (`is`) is
    # required here: searching for the entity's markup as a substring stops
    # at the first textually identical entity, so every later duplicate in
    # the same parent would receive the context of the first one.
    preceding_parts = []
    for child in parent_element.contents:
        if child is entity:
            break
        preceding_parts.append(str(child))
    else:
        # The entity is not a child of the reported parent; no context.
        return ""

    text_before_entity = ''.join(preceding_parts)

    # NOTE(review): str() of a text node may differ from its escaped
    # serialization (e.g. '&' vs '&amp;') -- confirm that look-behinds
    # containing such characters still match the serialized original body.

    if not removeEntityTags:
        # Keep the entity tags and use the last 20 characters as context.
        return text_before_entity[-20:]

    # Drop the entity tags and use the last 30 characters as context.
    return remove_entity_tags_in_str(text_before_entity)[-30:]
    

### Prepare the search text by removing all entities, so that the text matches the text in the original file
def prepare_search_text(entity):
    """Return the inner markup of ``entity`` with nested entity tags removed.

    Every ``placeName``/``persName``/``orgName`` element inside ``entity`` is
    unwrapped (its tag is dropped, its content kept), and the entity's own
    tag is left out of the result, so the returned string is the text as it
    is expected to appear in the entity-free original document.

    Note:
        ``entity`` is modified in place by the unwrapping; pass a copy when
        the tree it belongs to must stay intact.
    """
    # Remove entities nested inside this entity but retain their content.
    for inner_entity in entity.find_all(['placeName', 'persName', 'orgName']):
        inner_entity.unwrap()

    # decode_contents() serializes the children the way str(tag) serializes
    # its inside, i.e. with '&', '<' and '>' in text escaped. Joining
    # str(child) instead yields raw text for text nodes, which cannot match
    # the escaped serialization of the original body that is searched.
    return entity.decode_contents()

    
###Insert ---DONE--- in each word in the replacement text to prevent the search term from being found again
def insert_done_in_every_word(sentence):
    """Insert the marker ``---DONE---`` into the middle of every piece of ``sentence``.

    The sentence is split at single spaces and at ``<...>`` tags. The
    separators are kept as pieces of their own and receive a marker as well,
    as do empty pieces between adjacent separators. Removing every
    ``---DONE---`` from the result yields ``sentence`` again.

    Args:
        sentence: Text (possibly containing markup) to be marked.

    Returns:
        ``sentence`` with one marker inserted at the midpoint of each piece.
    """
    marker = "---DONE---"

    # The capturing group makes re.split() return the separators (tags and
    # spaces) too, so joining the pieces reproduces the full input.
    pieces = re.split(r'(<[^>]*>| )', sentence)

    return ''.join(
        piece[:len(piece) // 2] + marker + piece[len(piece) // 2:]
        for piece in pieces
    )

    
## Extract entities from the edited file and insert them into the original file (only within <body>)
def merge_entities(original_xml, edited_xml):
    """Copy the entity tags of the edited document into the original document.

    Only content inside ``<body>`` is processed. For every outermost
    ``placeName``/``persName``/``orgName`` element of the edited body, the
    entity's text is searched for in the serialized original body, anchored
    by the text that precedes the entity in the edited document, and the
    first match is replaced by the entity's markup. Whitespace found between
    the anchor and the match is kept as it is in the original.

    Two rounds are run:

    1. The anchor has entity tags stripped. Inserted markup is temporarily
       salted with ``---DONE---`` so that it cannot be matched again.
    2. Entities not placed in round 1 are retried with an anchor that keeps
       the entity tags, because their context may contain entities that
       round 1 has inserted meanwhile.

    Args:
        original_xml: XML source of the original document.
        edited_xml: XML source of the NER-processed ("edited") document.

    Returns:
        The serialized original document with the entities merged into its
        body. When either document has no ``<body>``, the original document
        is returned serialized without any entity changes.
    """
    # Parse the original and edited XML
    original_soup = BeautifulSoup(original_xml, 'xml')
    edited_soup = BeautifulSoup(edited_xml, 'xml')

    # Extract <body> content from both documents
    original_body = original_soup.find('body')
    edited_body = edited_soup.find('body')

    # Nothing to merge unless <body> exists in both documents
    if not (original_body and edited_body):
        return str(original_soup)

    # All entities of the edited XML (places, people, organizations), reduced
    # to those that are not nested within another entity
    entities = edited_body.find_all(['placeName', 'persName', 'orgName'])
    non_nested_entities = filter_nested_entities(entities)

    # Entities that could not be placed in round 1
    unreplaced_entities = []

    original_body_str = str(original_body)

    # Round 1: anchor without entity tags, replacement salted with markers
    for entity in non_nested_entities:
        # prepare_search_text() modifies its argument, hence the copy
        search_text = prepare_search_text(copy.deepcopy(entity))
        lookbehind_text = get_text_for_lookbehind(entity)

        # "---DONE---" inside the replacement prevents the inserted text
        # from being found again by a later search of this round
        replacement = insert_done_in_every_word(str(entity))

        original_body_str, count = _replace_first_in_context(
            original_body_str, lookbehind_text, search_text, replacement
        )
        if count == 0:
            unreplaced_entities.append(entity)

    # Remove the "---DONE---" markers again
    original_body_str = original_body_str.replace("---DONE---", "")

    # Round 2: retry the remaining entities with an anchor that keeps the
    # entity tags. No markers are inserted here, since this round exists to
    # find matches whose context already contains inserted entity tags.
    for entity in unreplaced_entities:
        search_text = prepare_search_text(copy.deepcopy(entity))
        lookbehind_text = get_text_for_lookbehind(entity, removeEntityTags = False)

        original_body_str, _ = _replace_first_in_context(
            original_body_str, lookbehind_text, search_text, str(entity)
        )

    # Replace the old <body> with the new modified one in the original document
    original_body.replace_with(BeautifulSoup(original_body_str, 'xml').body)

    return str(original_soup)


def _replace_first_in_context(body_str, lookbehind_text, search_text, replacement):
    """Replace the first ``search_text`` that follows ``lookbehind_text``.

    Returns ``(new_body_str, count)`` exactly like ``re.subn``.
    """
    # Group 1 captures whitespace between anchor and search text so that it
    # can be written back instead of being swallowed by the substitution.
    context_pattern = (
        r'(?<=' + re.escape(lookbehind_text) + r')(\s*)' + re.escape(search_text)
    )

    # A callable replacement is inserted verbatim. A plain string would be
    # treated as a template in which backslashes in the entity text are
    # interpreted as escapes or group references.
    return re.subn(
        context_pattern,
        lambda match: match.group(1) + replacement,
        body_str,
        count=1,
    )


## Process the documents: read them, transfer the entities, save the result to a new file
### Create the target directory if it does not exist yet; exist_ok avoids the
### gap between a separate existence check and the creation
os.makedirs(output_dir, exist_ok=True)

### Iterate over the directory containing the NER documents
for filename in os.listdir(edited_dir):
    ### Only XML files are considered
    if not filename.endswith('.xml'):
        continue

    ### Path of the edited XML
    edited_file_path = os.path.join(edited_dir, filename)
    ### Path of the original XML
    original_file_path = os.path.join(original_dir, filename)

    ### Without a matching original there is nothing to merge into, so skip
    ### the file before reading anything
    if not os.path.exists(original_file_path):
        print(f"Original file not found for {filename}")
        continue

    ### Read the edited XML into edited_xml
    with open(edited_file_path, 'r', encoding='utf-8') as file:
        edited_xml = file.read()

    ### Read the original XML into original_xml
    with open(original_file_path, 'r', encoding='utf-8') as file:
        original_xml = file.read()

    ### Result
    result = merge_entities(original_xml, edited_xml)

    ### Save the result as an XML document in the postprocessed directory
    output_file_path = os.path.join(output_dir, filename)
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write(result)

    print(f"Processed and saved {filename} to {output_file_path}")




Processed and saved ABl_1980__S__104-105_.xml to test_data/postprocessed/ABl_1980__S__104-105_.xml
Processed and saved ABl_1980__S__1008-1016_.xml to test_data/postprocessed/ABl_1980__S__1008-1016_.xml
Processed and saved ABl_1980__S__1024-1028_.xml to test_data/postprocessed/ABl_1980__S__1024-1028_.xml
