In [None]:
import xml.etree.ElementTree as ET

input_str = '<para alignment="JUSTIFY (3)" ele-id="1DF65D10" space_after="6.0"> The diversity of photopigments, light-harvesting protein complexes, and their different regulatory mechanisms contrasts with the high homology of the photosynthetic RCs within the phylogenetic classification (Blankenship, 2010). Light-harvesting or antenna systems have evolved to adapt to a variable intensity and spectral qualities of light in diverse environments and are divided into inner and outer antenna systems (Table 2). The inner antenna is an essential component of RCs. In contrast, the outer antenna systems include multiple photopigment-bound protein families having different pigments and pigment-protein complexes. The light-harvesting complexes (LHCs) in plants and other eukaryotic photosynthetic organisms belong to a single protein family and bind 15-22 chlorophylls. LHCs make up the outer antenna system and are composed of three membrane-spanning proteins. Cyanobacteria lack members of the LHC family and instead use a soluble antenna, comprising phycobiliproteins or six-transmembrane Chl-binding proteins (CBP) as their main light-harvesting system (Chen<emphasis role="cs_italic"><emphasis role="italic"> et al.</emphasis></emphasis> 2008; Mirkovic <emphasis role="cs_italic"><emphasis role="italic">et al</emphasis></emphasis>. 2017). Three important parameters decide optimisation and efficiency of antenna systems, 1, the distance between photopigments; 2, the orientation of photopigments;  and 3, spectral overlap between the pigments. </para><para>senthil</para>'

# Create an XML element for the root
root = ET.Element('root')

# Define the custom entities with their respective start and end indices
entities = [
    {
        "text": "2010",
        "start-index": 223,
        "end-index": 227,
        "label": "REF_CITE"
    },
    {
        "text": "Table 2",
        "start-index": 422,
        "end-index": 429,
        "label": "TABLE_CITE"
    },
    {
        "text": "Chen et al. 2008",
        "start-index": 1078,
        "end-index": 1094,
        "label": "REF_CITE"
    },
    {
        "text": "Mirkovic et al. 2017",
        "start-index": 1096,
        "end-index": 1116,
        "label": "REF_CITE"
    }
]

# Create a new XML element for the para tag
para_element = ET.Element('para')
para_element.text = input_str

# Loop through each entity and create an XML element for each one
for entity in entities:
    start_index, end_index = entity['start-index'], entity['end-index']
    entity_element = ET.Element(entity['label'])
    entity_text = entity['text']
    entity_element.text = entity_text
    entity_element.set('start_index', str(start_index))
    entity_element.set('end_index', str(end_index))
    para_element.insert(start_index, entity_element)

# Append the modified para tag to the root element
root.append(para_element)

# Create the XML file
xml_string = ET.tostring(root)
with open('output.xml', 'wb') as f:
    f.write(xml_string)


In [None]:
import xml.etree.ElementTree as ET

input_str = '<para alignment="JUSTIFY (3)" ele-id="1DF65D10" space_after="6.0"> The diversity of photopigments, light-harvesting protein complexes, and their different regulatory mechanisms contrasts with the high homology of the photosynthetic RCs within the phylogenetic classification (Blankenship, 2010). Light-harvesting or antenna systems have evolved to adapt to a variable intensity and spectral qualities of light in diverse environments and are divided into inner and outer antenna systems (Table 2). The inner antenna is an essential component of RCs. In contrast, the outer antenna systems include multiple photopigment-bound protein families having different pigments and pigment-protein complexes. The light-harvesting complexes (LHCs) in plants and other eukaryotic photosynthetic organisms belong to a single protein family and bind 15-22 chlorophylls. LHCs make up the outer antenna system and are composed of three membrane-spanning proteins. Cyanobacteria lack members of the LHC family and instead use a soluble antenna, comprising phycobiliproteins or six-transmembrane Chl-binding proteins (CBP) as their main light-harvesting system (Chen<emphasis role="cs_italic"><emphasis role="italic"> et al.</emphasis></emphasis> 2008; Mirkovic <emphasis role="cs_italic"><emphasis role="italic">et al</emphasis></emphasis>. 2017). Three important parameters decide optimisation and efficiency of antenna systems, 1, the distance between photopigments; 2, the orientation of photopigments;  and 3, spectral overlap between the pigments. </para>'

# Parse the input string as an XML element tree
root = ET.fromstring(input_str)

# Define the custom entities with their respective start and end indices
entities = [
    {
        "text": "2010",
        "start-index": 223,
        "end-index": 227,
        "label": "REF_CITE"
    },
    {
        "text": "Table 2",
        "start-index": 422,
        "end-index": 429,
        "label": "TABLE_CITE"
    },
    {
        "text": "Chen et al. 2008",
        "start-index": 1078,
        "end-index": 1094,
        "label": "REF_CITE"
    },
    {
        "text": "Mirkovic et al. 2017",
        "start-index": 1096,
        "end-index": 1116,
        "label": "REF_CITE"
    }
]

# Loop through each entity and wrap the corresponding text with a label as an XML tag
for entity in entities:
    entity_text = entity['text']
    start_index = entity['start-index']
    end_index = entity['end-index']
    label = entity['label']
    # Adjust start and end indices to account for any XML tags in the input string
    start_index += input_str[:start_index].count('<') + input_str[:start_index].count('>')
    end_index += input_str[:end_index].count('<') + input_str[:end_index].count('>')
    input_str = input_str[:start_index] + f'<{label}>{entity_text}</{label}>' + input_str[end_index:]

# Try parsing the modified XML string and catch any errors
try:
    root = ET.fromstring(input_str)
    # Write the XML object to a file
    ET.ElementTree(root).write('output.xml')
except ET.ParseError as e:
    print(f"Modified XML string:\n{input_str}\n")
    print(f"ParseError: {e}")


In [2]:
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET


# Placeholder input string and entities
input_str = '<document><para alignment="JUSTIFY (3)" ele-id="1DF65D10" space_after="6.0"> The diversity of photopigments, light-harvesting protein complexes, and their different regulatory mechanisms contrasts with the high homology of the photosynthetic RCs within the phylogenetic classification (Blankenship, 2010). Light-harvesting or antenna systems have evolved to adapt to a variable intensity and spectral qualities of light in diverse environments and are divided into inner and outer antenna systems (Table 2). The inner antenna is an essential component of RCs. In contrast, the outer antenna systems include multiple photopigment-bound protein families having different pigments and pigment-protein complexes. The light-harvesting complexes (LHCs) in plants and other eukaryotic photosynthetic organisms belong to a single protein family and bind 15-22 chlorophylls. LHCs make up the outer antenna system and are composed of three membrane-spanning proteins. Cyanobacteria lack members of the LHC family and instead use a soluble antenna, comprising phycobiliproteins or six-transmembrane Chl-binding proteins (CBP) as their main light-harvesting system (Chen<emphasis role="cs_italic"><emphasis role="italic"> et al.</emphasis></emphasis> 2008; Mirkovic <emphasis role="cs_italic"><emphasis role="italic">et al</emphasis></emphasis>. 2017). Three important parameters decide optimisation and efficiency of antenna systems, 1, the distance between photopigments; 2, the orientation of photopigments;  and 3, spectral overlap between the pigments. </para><para>senthil</para></document>'

entities = [
    {
        "text": "2010",
        "start-index": 223,
        "end-index": 227,
        "label": "REF_CITE"
    },
    {
        "text": "Table 2",
        "start-index": 422,
        "end-index": 429,
        "label": "TABLE_CITE"
    },
    {
        "text": "Chen et al. 2008",
        "start-index": 1078,
        "end-index": 1094,
        "label": "REF_CITE"
    },
    {
        "text": "Mirkovic et al. 2017",
        "start-index": 1096,
        "end-index": 1116,
        "label": "REF_CITE"
    }
]

# Parse the input XML string with Beautiful Soup
soup = BeautifulSoup(input_str, 'xml')

# Extract the text content of each element
for elem in soup.find_all():
    elem_text = ''.join(list(elem.itertext()))
    elem.string = elem_text

# Loop through each entity and wrap the corresponding text with a label as an XML tag
for entity in entities:
    entity_text = entity['text']
    start_index = entity['start-index']
    end_index = entity['end-index']
    label = entity['label']
    # Adjust start and end indices to account for any XML tags in the input string
    start_index += input_str[:start_index].count('<') + input_str[:start_index].count('>')
    end_index += input_str[:end_index].count('<') + input_str[:end_index].count('>')
    input_str = input_str[:start_index] + f'<{label}>{entity_text}</{label}>' + input_str[end_index:]

# Try parsing the modified XML string and catch any errors
try:
    root = ET.fromstring(input_str)
    # Write the XML object to a file
    ET.ElementTree(root).write('output.xml')
except ET.ParseError as e:
    print(f"Modified XML string:\n{input_str}\n")
    print(f"ParseError: {e}")


TypeError: 'NoneType' object is not callable