In [23]:
import xml.etree.ElementTree as ET
import json

# Location of the source XML document
xml_file = "/Users/senthil/Desktop/Senthil/myTesting/Project_Creation/xmltoner/9781003373148.xml"

# Accumulator for the NER records (one single-key dict per usable <xref>)
ner_data = []

# Load the document and take its root node
tree = ET.parse(xml_file)
root = tree.getroot()

# Visit every <xref> element in document order
for xref in root.iter("xref"):

    # Direct text of the <xref>, trimmed; missing text is treated as empty
    label = (xref.text or "").strip()

    # Nothing to annotate when the element carries no visible text
    if not label:
        continue

    # The ref-type attribute serves as the entity label (None when absent)
    ref_type = xref.get("ref-type")

    # Record shape: {<label text>: {"entities": [[start, end, ref-type]]}},
    # where the single span covers the whole label text
    span = [0, len(label), ref_type]
    ner_data.append({label: {"entities": [span]}})

# Persist the collected records as pretty-printed JSON
with open("ner_data.json", "w") as out_file:
    json.dump(ner_data, out_file, indent=4)


In [21]:
import xml.etree.ElementTree as ET
import json

# Define the XML file path
# NOTE: absolute, machine-specific path; adjust it when running elsewhere.
xml_file = "/Users/senthil/Desktop/Senthil/myTesting/Project_Creation/xmltoner/9781003327684.xml"


def _collect_xref_spans(elem, offset, entities):
    """Record the exact character span of every <xref> inside ``elem``.

    The traversal mirrors ``Element.itertext()``: an element's own text comes
    first, then each child's subtree followed by that child's tail. ``offset``
    is therefore always the index, within ``"".join(paragraph.itertext())``,
    at which ``elem.text`` begins, so spans are computed from the document
    structure instead of being guessed with a substring search.

    Args:
        elem: Element whose subtree is scanned.
        offset: Index in the paragraph text where ``elem.text`` starts.
        entities: List extended in place with ``[start, end, ref_type]``.

    Returns:
        The index just past the text of ``elem``'s subtree (``elem.tail`` is
        not included; the caller accounts for it).
    """
    # itertext() contributes nothing for comment / processing-instruction nodes
    if elem.tag is not None and not isinstance(elem.tag, str):
        return offset

    text = elem.text or ""

    if elem.tag == "xref":
        label = text.strip()
        # Skip <xref> elements without visible text (no zero-length entities)
        if label:
            # Leading whitespace is excluded from the annotated span
            start = offset + (len(text) - len(text.lstrip()))
            entities.append([start, start + len(label), elem.get("ref-type")])

    offset += len(text)

    for child in elem:
        offset = _collect_xref_spans(child, offset, entities)
        offset += len(child.tail or "")

    return offset


# Create an empty list to store the NER annotated data
ner_data = []

# Parse the XML file
tree = ET.parse(xml_file)

# Get the root element of the XML file
root = tree.getroot()

# Iterate through all the <p> elements in the XML file
for para_elem in root.iter("p"):

    # Get the text content of the <p> element, including any child elements
    para_text = "".join(para_elem.itertext())

    # Spans of all <xref> elements in this paragraph, in document order;
    # offsets index into para_text exactly as it is stored below
    entities = []
    _collect_xref_spans(para_elem, 0, entities)

    # Prepare the NER annotated data in JSON format
    if entities:
        ner_entry = {
            "text": para_text,
            "entities": entities
        }

        # Add the NER annotated data to the list
        ner_data.append(ner_entry)

# Write the NER annotated data to a JSON file.
# ensure_ascii=False emits raw non-ASCII characters, so the file encoding is
# pinned to UTF-8 instead of relying on the platform's locale default.
with open("ner_data.json", "w", encoding="utf-8") as f:
    json.dump(ner_data, f, indent=4, ensure_ascii=False)