In [2]:
""" Script for correcting unintended text modifications during Named Entity Recognition (NER) 

This script addresses issues that may arise during the NER process 
according to the TEI standard, mostly unintended text duplications. 
An example of such a modification is:
>>> (...) Seminar der Univer<lb break="no" facs="#facs_290_r33"/>sität</orgName>sität</cell>
The script corrects this modification to:
>>> (...) Seminar der Univer<lb break="no" facs="#facs_290_r33"/>sität</orgName></cell>

The NER process checks the documents and detects errors. Documents with 
errors are saved in an /error folder. This script takes the NER-processed 
files with errors (hereafter referred to as the "edited file") and extracts
all TEI entities with some context. It then inserts the entities into the 
original file (before NER) using search and replace.

The script accepts XML files (.xml) as input.

Requirements:
- This script requires that `beautifulsoup4` and `lxml` are installed in the Python environment where you are running this script.

Installation of beautifulsoup4:
- To install beautifulsoup4, run the following command in your command line:
  ```bash
  pip install beautifulsoup4
  ```

Installation of lxml:
- To install lxml, run the following command in your command line:
  ```bash
  pip install lxml
  ```
"""

# pip install beautifulsoup4
# pip install lxml
import os
import re
import copy
import difflib
import xml.etree.ElementTree as ET

import stats

from bs4 import BeautifulSoup
from lxml import etree
from difflib import ndiff


# Paths for the various directories (relative paths, so they are resolved against
# the current working directory). Files in edited_dir and original_dir are paired
# by identical filename; the merged result is written to output_dir under that name.
edited_dir = 'test_data/TEI-XML_NER/error/Amtsblatt/' # Directory containing NER-processed files with errors
original_dir = 'test_data/TEI-XML/Amtsblatt/' # Directory containing the original files
output_dir = 'test_data/postprocessed/' # Output directory for the merged files generated by this script


def is_nested(entity):
    """ Tell whether an entity tag sits inside another entity tag.

    Walks upwards through the ancestors of ``entity`` and stops at the first
    ancestor that is itself a <placeName>, <persName> or <orgName> element.

    Args:
        entity (BeautifulSoup Tag): The entity tag to inspect.

    Returns:
        bool: True if some ancestor is an entity tag, False otherwise.
    """

    entity_tag_names = {'placeName', 'persName', 'orgName'}

    ancestor = entity.find_parent()
    while ancestor:
        is_entity_tag = ancestor.name in entity_tag_names
        if is_entity_tag and ancestor != entity:
            return True
        # Not an entity tag: keep climbing towards the document root
        ancestor = ancestor.find_parent()

    return False


def filter_nested_entities(entities):
    """ Keep only the outermost entities of a list of entity tags.

    An entity that lies inside another entity (as decided by ``is_nested``)
    is dropped, because it is already contained in its parent entity.

    Args:
        entities (list of BeautifulSoup Tags): List of entity tags from the edited XML.

    Returns:
        list of BeautifulSoup Tags: The entities that are not nested, in their original order.
    """

    return [candidate for candidate in entities if not is_nested(candidate)]

def count_entities(text):
    """ Count the TEI entity elements (placeName, persName, orgName) in a string.

    Opening and closing tags are counted separately and the sum is halved with
    integer division, since every entity normally has a start and an end tag.
    Both kinds of tags are counted so that a missing start or end tag shows up
    in the total; an odd total is rounded down.

    Args:
        text (str): XML content that may contain TEI entity tags.

    Returns:
        int: Number of entity tags found, divided by two (floor).
    """

    tag_patterns = (
        r'</?placeName[^>]*>',
        r'</?persName[^>]*>',
        r'</?orgName[^>]*>',
    )

    # Sum the matches of every pattern (start and end tags alike)
    tag_total = sum(len(re.findall(pattern, text)) for pattern in tag_patterns)

    return tag_total // 2
    

def remove_entity_tags_in_str(text):
    """ Strip every TEI entity tag (placeName, persName, orgName) from a string.

    Only the start and end tags are deleted; the text between them is kept.

    Args:
        text (str): The input string that may contain TEI entity tags.

    Returns:
        str: The input string without any TEI entity tags.
    """

    entity_tag_patterns = (
        r'</?placeName[^>]*>',
        r'</?persName[^>]*>',
        r'</?orgName[^>]*>',
    )

    stripped = text
    # Apply one substitution per entity type, one after the other
    for pattern in entity_tag_patterns:
        stripped = re.sub(pattern, '', stripped)

    return stripped


def get_text_for_lookbehind(entity, removeEntityTags = True):
    """ Extract the text directly before the entity inside its parent element, used for regex lookbehind.

    The serialized siblings that precede ``entity`` in its parent are joined
    and the tail of that string is returned. The entity is located by object
    identity, so an entity whose markup occurs several times in the same
    parent gets the context of its own position (a plain string search would
    always return the context of the first occurrence).

    Args:
        entity (BeautifulSoup Tag): The entity tag from which to extract the lookbehind text.
        removeEntityTags (bool, optional): Whether to remove entity tags from the lookbehind text. Defaults to True.

    Returns:
        str: The last 30 characters before the entity with entity tags removed
        (if removeEntityTags is true), or the last 20 characters including
        entity tags (otherwise). An empty string if the entity has no parent
        or cannot be located in its parent.
    """

    parent_element = entity.find_parent()
    if not parent_element:
        return ""

    # Collect the serialized siblings up to (but excluding) the entity itself
    preceding_parts = []
    found_entity = False
    for content in parent_element.contents:
        if content is entity:
            found_entity = True
            break
        preceding_parts.append(str(content))

    if found_entity:
        text_before_child_with_entities = ''.join(preceding_parts)
    else:
        # Entity is not a direct child by identity: fall back to a textual
        # search, but never slice with -1 when the markup is not found.
        parent_text = ''.join(preceding_parts)
        index_of_child = parent_text.find(str(entity))
        if index_of_child == -1:
            return ""
        text_before_child_with_entities = parent_text[:index_of_child]

    if not removeEntityTags:
        # return 20 characters before entity tag as lookbehind text
        return text_before_child_with_entities[-20:]

    text_before_child_without_entities = remove_entity_tags_in_str(text_before_child_with_entities)

    # return 30 characters before entity tag as lookbehind text
    return text_before_child_without_entities[-30:]
    

def prepare_search_text(entity):
    """ Build the text to search for in the original file (before NER).

    All entity tags nested inside ``entity`` are unwrapped and the outer
    entity tag itself is left out, so the result is the entity's inner markup
    without any entity tags. Note that ``entity`` is modified in place by the
    unwrapping.

    Args:
        entity (BeautifulSoup Tag): The entity whose text is being prepared.

    Returns:
        str: The serialized contents of the entity, without entity tags.
    """

    nested_entities = entity.find_all(['placeName', 'persName', 'orgName'])
    for nested in nested_entities:
        # unwrap() drops the tag itself but leaves its children in place
        nested.unwrap()

    # Serializing only the children omits the outer entity tag as well
    return ''.join(map(str, entity.contents))

    
def insert_done_in_every_word(sentence):
    """ Insert the marker `---DONE---` into the middle of every piece of the
    replacement string to prevent repeated matches during search-and-replace.

    The string is split at spaces and at markup tags; the separators are kept
    as pieces of their own. Every piece (words, tags, single spaces and also
    empty pieces between adjacent separators) receives one marker at its
    midpoint.

    Args:
        sentence (str): The sentence or entity text in which to insert the marker.

    Returns:
        str: The modified sentence with `---DONE---` inserted.
    """

    marker = "---DONE---"

    # The capturing group makes re.split return the separators too
    pieces = re.split(r'(<[^>]*>| )', sentence)

    marked_pieces = []
    for piece in pieces:
        midpoint = len(piece) // 2
        marked_pieces.append(f"{piece[:midpoint]}{marker}{piece[midpoint:]}")

    return ''.join(marked_pieces)


def validate_result(filename, original_xml, edited_xml, postprocessed_xml):
    """ Validate a merged document and record the findings in the statistics.

    The following checks are performed, in this order:

    1. Content integrity: the postprocessed XML with all entity tags removed
       is compared with the original XML (attributes sorted, whitespace
       collapsed). The outcome ('Yes'/'No') is written to the statistics.
    2. Differences: a line diff between the NER-processed ("edited") XML and
       the postprocessed XML is written to the statistics and additionally
       saved as an HTML file in the ``diffs`` directory.
    3. Entity counts: the number of entities before and after postprocessing
       and their difference are written to the statistics.

    Args:
        filename (str): Name of the processed file; used as statistics key and for the diff file name.
        original_xml (str): The XML content of the original file (before NER).
        edited_xml (str): The XML content of the NER-processed file with errors.
        postprocessed_xml (str): The merged XML content produced by this script.

    Returns:
        None

    Raises:
        xml.etree.ElementTree.ParseError: If one of the XML strings is not well-formed.
    """

    def sort_attributes(elem):
        # Give every element a deterministic attribute order so that two
        # documents differing only in attribute order serialize identically.
        elem.attrib = dict(sorted(elem.attrib.items()))
        for child in elem:
            sort_attributes(child)

    def serialize_with_sorted_attributes(xml_string):
        # Parse, sort all attributes and serialize back to a string
        root = ET.fromstring(xml_string)
        sort_attributes(root)
        return ET.tostring(root, encoding='utf-8', method='xml').decode('utf-8')

    def normalize_xml(xml_string):
        # Canonical single-line form used for the equality check
        normalized_string = serialize_with_sorted_attributes(xml_string)
        normalized_string = re.sub(r">\s+<", "><", normalized_string)
        # Collapsing all whitespace runs also covers tabs and newlines
        return re.sub(r"\s+", " ", normalized_string)

    def prepare_for_diff(xml_string):
        # Line-preserving form used for the diffs (line breaks are kept)
        str_for_diff = serialize_with_sorted_attributes(xml_string)
        str_for_diff = re.sub(r"[\t]+", " ", str_for_diff)  # Remove tabs
        str_for_diff = re.sub(r'^\s+', '', str_for_diff, flags=re.MULTILINE)  # Remove leading whitespace
        # Drop the namespace prefix that ElementTree adds when serializing
        return re.sub("ns0:", "", str_for_diff)

    # --- 1. Content integrity: original <-> postprocessed without entity tags ---
    postprocessed_without_entities = remove_entity_tags_in_str(postprocessed_xml)
    normalized_original = normalize_xml(original_xml)
    normalized_postprocessed = normalize_xml(postprocessed_without_entities)

    if normalized_original == normalized_postprocessed:
        print("Die XML-Dateien sind gleich.")
        stats.write_to_statistics(filename,'Content integrity (Original <-> Postprocessed)', 'Yes')
    else:
        print("Die XML-Dateien sind unterschiedlich.")
        stats.write_to_statistics(filename,'Content integrity (Original <-> Postprocessed)', 'No')

    # --- 2. Differences: NER-processed (edited) <-> postprocessed ---
    edited_lines = prepare_for_diff(edited_xml).splitlines()
    postprocessed_lines = prepare_for_diff(postprocessed_xml).splitlines()

    # Keep only added, removed and hint lines; unchanged lines are dropped
    changed_lines = [
        line for line in ndiff(edited_lines, postprocessed_lines)
        if line.startswith(('+', '-', '?'))
    ]
    stats.write_to_statistics(filename,'Changes without tabs/whitespaces (ner-processed <-> postprocessed)', '"{}"'.format('\n'.join(changed_lines)))

    # Side-by-side HTML diff of the same two documents.
    # NOTE(review): the column titles say 'Original'/'Edited' although the
    # left side is the NER-processed file and the right side the
    # postprocessed file; kept unchanged so existing reports stay comparable.
    html_diff = difflib.HtmlDiff().make_file(edited_lines, postprocessed_lines, fromdesc='Original', todesc='Edited')

    diff_dir = 'diffs'
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(diff_dir, exist_ok=True)

    html_filename = f'diff_{os.path.splitext(filename)[0]}.html'
    diff_path = os.path.join(f'{diff_dir}/', html_filename)

    with open(diff_path, 'w', encoding='utf-8') as f:
        f.write(html_diff)

    print(f"HTML-Diff gespeichert in {diff_path}")

    print("------------------------------ postprocessed_xml ohne Entity Tags: -------------------------")
    print(filename)
    print("------------------------------ original_xml: -------------------------")

    # TODO: validate that the XML syntax of the result is correct.
    # TODO: can it be checked whether all errors were corrected?

    # --- 3. Entity counts: NER-processed (edited) <-> postprocessed ---
    entities_before_postprocessing = count_entities(edited_xml)
    entities_after_postprocessing = count_entities(postprocessed_xml)
    stats.write_to_statistics(filename,'Number of Entities Before Processing', entities_before_postprocessing)
    stats.write_to_statistics(filename,'Number of Entities After Processing', entities_after_postprocessing)

    # Entities present in the NER-processed file but absent after postprocessing
    missing_entities = entities_before_postprocessing - entities_after_postprocessing
    stats.write_to_statistics(filename,'Missing Entities', missing_entities)
    
def merge_entities(original_xml, edited_xml):
    """ Merge the named entities from the edited XML into the original XML (only within <body>).

    Every non-nested entity of the edited document is located in the
    serialized <body> of the original document via a regex consisting of a
    lookbehind (text preceding the entity) and the entity's text without
    entity tags, and is replaced by the entity markup. Two rounds are run:

    1. Longer lookbehind with entity tags removed. Inserted markup is
       temporarily salted with `---DONE---` markers so that it cannot be
       matched again; the markers are removed after the round.
    2. For entities not placed in round one: shorter lookbehind that keeps
       entity tags, to match context that already contains inserted entities.

    The replacement text is inserted literally (via a callable), so
    backslashes in the entity markup are not interpreted as regex
    replacement escapes or group references.

    If either document has no <body>, the original document is returned
    without any entities merged.

    Args:
        original_xml (str): The XML content of the original file.
        edited_xml (str): The XML content of the NER-processed file with errors.

    Returns:
        str: The original XML content with the corrected entities inserted.
    """

    def build_context_pattern(lookbehind_text, search_text):
        # Literal lookbehind, optional whitespace, then the literal entity text
        return r'(?<=' + re.escape(lookbehind_text) + r')\s*' + re.escape(search_text)

    # Parse the original and edited XML
    original_soup = BeautifulSoup(original_xml, 'xml')
    edited_soup = BeautifulSoup(edited_xml, 'xml')

    # Extract <body> content from both documents
    original_body = original_soup.find('body')
    edited_body = edited_soup.find('body')

    # Ensure <body> exists in both documents
    if original_body and edited_body:

        # Find all entities in the edited XML (places, people, organizations)
        entities = edited_body.find_all(['placeName', 'persName', 'orgName'])

        # Remove all entities that are already nested within another entity
        non_nested_entities = filter_nested_entities(entities)

        # List to store entities that couldn't be replaced in round one
        unreplaced_entities = []

        original_body_str = str(original_body)

        # Round one using greater context (lookbehind) and ignoring entity tags
        for entity in non_nested_entities:

            # prepare_search_text unwraps nested tags in place, hence the copy
            searchText = prepare_search_text(copy.deepcopy(entity))
            text_for_lookbehind = get_text_for_lookbehind(entity)

            # Insert "---DONE---" in replacement text to prevent re-matching
            replaceText = insert_done_in_every_word(str(entity))

            context_pattern = build_context_pattern(text_for_lookbehind, searchText)

            # A callable replacement is used so the text is taken verbatim;
            # a string would be parsed as a template (e.g. "\1", "\g<0>").
            original_body_str, count = re.subn(
                context_pattern, lambda _match: replaceText, original_body_str, count=1
            )

            if count == 0:
                # If no replacement was made, retry this entity in round two
                unreplaced_entities.append(entity)

        # Remove "---DONE---" markers
        original_body_str = original_body_str.replace("---DONE---", "")

        # Round two uses a shorter context and considers the presence of entity tags.
        # A possible reason for the previous lookbehind not matching could be that an entity
        # has already been inserted into the original file, causing the current lookbehind
        # to fail when trying to match with the next entity's surrounding text.
        for entity in unreplaced_entities:

            searchText = prepare_search_text(copy.deepcopy(entity))
            text_for_lookbehind = get_text_for_lookbehind(entity, removeEntityTags = False)

            # Do not insert "---DONE---" in this round. The purpose of the second round is to find entities
            # that were missed in the first round due to already inserted entity tags in the original document.
            replaceText = str(entity)

            context_pattern = build_context_pattern(text_for_lookbehind, searchText)

            original_body_str = re.sub(
                context_pattern, lambda _match: replaceText, original_body_str, count=1
            )

        # Replace the old <body> with the modified version in the original document
        original_body.replace_with(BeautifulSoup(original_body_str, 'xml').body)

    return str(original_soup)


def main():
    """ Run the postprocessing over every XML file in the edited directory.

    For each ``.xml`` file in ``edited_dir`` the original file of the same
    name is looked up in ``original_dir``. If it exists, the entities of the
    edited file are merged into the original, the result is validated, written
    to ``output_dir`` and the statistics are saved. Files without an original
    counterpart are reported and skipped.
    """

    # Ensure the output directory exists, create it if necessary
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for filename in os.listdir(edited_dir):

        # Only XML documents are processed
        if not filename.endswith('.xml'):
            continue

        edited_file_path = os.path.join(edited_dir, filename)
        original_file_path = os.path.join(original_dir, filename)

        with open(edited_file_path, 'r', encoding='utf-8') as edited_file:
            edited_xml = edited_file.read()

        # Without the original document there is nothing to merge into
        if not os.path.exists(original_file_path):
            print(f"Original file not found for {filename}")
            continue

        with open(original_file_path, 'r', encoding='utf-8') as original_file:
            original_xml = original_file.read()

        # Merge entities from the edited file into the original file
        merged_xml = merge_entities(original_xml, edited_xml)

        # Validation runs before the result is written to disk
        validate_result(filename, original_xml, edited_xml, merged_xml)

        # Save the merged result to the output directory as an XML document
        output_file_path = os.path.join(output_dir, filename)
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(merged_xml)

        # Statistics are saved after every file
        stats.save_statistics_to_csv()
        print(f"Processed and saved {filename} to {output_file_path}")

    print("All files processed")

# Let the magic happen: executing this cell processes every .xml file in edited_dir
main()


Die XML-Dateien sind gleich.
HTML-Diff gespeichert in diffs/diff_ABl_1981__S__186_.html
------------------------------ postprocessed_xml ohne Entity Tags: -------------------------
ABl_1981__S__186_.xml
------------------------------ original_xml: -------------------------
Statistics saved to statistics.csv
Processed and saved ABl_1981__S__186_.xml to test_data/postprocessed/ABl_1981__S__186_.xml
Die XML-Dateien sind gleich.
HTML-Diff gespeichert in diffs/diff_ABl_1980__S__700-704_.html
------------------------------ postprocessed_xml ohne Entity Tags: -------------------------
ABl_1980__S__700-704_.xml
------------------------------ original_xml: -------------------------
Statistics saved to statistics.csv
Processed and saved ABl_1980__S__700-704_.xml to test_data/postprocessed/ABl_1980__S__700-704_.xml
Die XML-Dateien sind gleich.
HTML-Diff gespeichert in diffs/diff_ABl_1982__S__1247-1255_.html
------------------------------ postprocessed_xml ohne Entity Tags: -----------------------

KeyboardInterrupt: 

In [1]:
import importlib  # provides reload()
import stats  # the project's stats module (stats.py)

# Make changes to the module here...

# Now reload the module
importlib.reload(stats)  # reload so the running kernel uses the edited module

<module 'stats' from '/workspaces/postprocessing_NER/stats.py'>

In [6]:
# Install xmldiff into the environment of the running kernel.
# The explicit %pip magic does not depend on IPython's automagic setting.
%pip install xmldiff

Collecting xmldiff
  Downloading xmldiff-2.7.0-py3-none-any.whl.metadata (9.0 kB)
Downloading xmldiff-2.7.0-py3-none-any.whl (43 kB)
Installing collected packages: xmldiff
Successfully installed xmldiff-2.7.0
Note: you may need to restart the kernel to use updated packages.
