# In [1]:
import xml.etree.ElementTree as ET

def clean_xml(file_path, *, allowed_chars=None, output_path=None):
    """
    Strip disallowed characters from the text content of an XML file.

    The file is parsed with ``xml.etree.ElementTree``. For every element, the
    ``text``, the ``tail`` and each attribute value are reduced to the
    characters in the allow-list. Tag names and attribute names are left
    untouched. The tree is then serialized as ISO-8859-1 with an XML
    declaration.

    Args:
        file_path: Path of the XML file to read.
        allowed_chars: Optional iterable of single characters to keep (a
            string works). When omitted, ASCII letters, digits, the space
            character and ``. , ! ? ' -`` are kept. Newlines and tabs are
            not part of this default, so they are removed as well.
        output_path: Optional destination path. When omitted, the cleaned
            document overwrites ``file_path``.

    Raises:
        xml.etree.ElementTree.ParseError: If the input is not well-formed XML.
        OSError: If the input cannot be read or the output cannot be written.
    """
    if allowed_chars is None:
        allowed_chars = (
            "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 .,!?'-"
        )
    # Build the lookup set once so each membership test is O(1).
    keep = frozenset(allowed_chars)

    def filter_text(text):
        return ''.join(c for c in text if c in keep)

    tree = ET.parse(file_path)
    root = tree.getroot()

    for elem in root.iter():
        # text/tail may be None (or empty); there is nothing to filter then.
        if elem.text:
            elem.text = filter_text(elem.text)
        if elem.tail:
            elem.tail = filter_text(elem.tail)
        # Only existing keys are reassigned, so the dict size never changes
        # while it is being iterated.
        for name, value in elem.attrib.items():
            elem.attrib[name] = filter_text(value)

    destination = file_path if output_path is None else output_path
    tree.write(destination, encoding='ISO-8859-1', xml_declaration=True)

# Example usage: clean the summaries file in place.
# NOTE: this is a machine-specific absolute path; adjust it before running.
file_path = "C:/Users/shouv/Desktop/Research/NIST/Second_round/Summaries/summaries.xml"

# Guarded so that importing this module does not rewrite the file on disk;
# the call still runs when the file is executed as a script or notebook cell.
if __name__ == "__main__":
    clean_xml(file_path)
