# XML to JSON conversion
Goal: ease data processing and cope with irregular output formatting (e.g. unpaired or missing tags).

In [1]:
import re

In [2]:
# Pattern matching any opening or closing tag of the known set;
# group 1 is the optional "/" and group 2 is the tag name.
next_tag = re.compile("<(/?)(PER|ACT|LOC|CARDINAL|TITLE|FT|ENTRY)>")
test_str = "sfsss</PER>dsfsf<ENTRY>"
m = next_tag.search(test_str)
# Explore what a match object exposes (displayed as a single tuple).
(
    len(m.group(0)),          # length of the matched text
    m.start(0),               # offset where the match starts
    m.end(0),                 # offset just past the match
    m.end(0) - m.start(0),    # same length, computed from the offsets
    m.span(0),                # (start, end) pair
    m.endpos,                 # end limit of the search
    m.group(0),               # whole matched text
    m.group(1),               # "/" for a closing tag, "" otherwise
    m.regs,                   # spans of the whole match and of each group
)


(6, 5, 11, 6, (5, 11), 23, '</PER>', '/', ((5, 11), (6, 7), (7, 10)))

In [3]:
# Resume the search right after the previous match to reach the following tag.
m = next_tag.search(test_str, m.end(0))
# Display the end offset of the new match and its captured groups (slash, tag name).
m.end(0), m.groups()

(23, ('', 'ENTRY'))

In [4]:
# List the public (non-dunder) attributes of the match object.
[attr_name for attr_name in dir(m) if not attr_name.startswith("__")]

['end',
 'endpos',
 'expand',
 'group',
 'groupdict',
 'groups',
 'lastgroup',
 'lastindex',
 'pos',
 're',
 'regs',
 'span',
 'start',
 'string']

In [5]:
# A string containing no known tag makes search() return None.
next_tag.search("sf") is None

True

In [6]:
next_tag_scanner = re.compile("<(?P<closing>/?)(?P<tag>PER|ACT|LOC|CARDINAL|TITLE|FT|ENTRY)>")
def search_next_tag(string: str, start: int) -> tuple[str, bool, int]|None:
    '''returns None or tag, startpos, endpos, is_closing'''
    m = next_tag_scanner.search(string, start)
    if m is None:
        return None
    return m.group("tag"), m.start(0), m.end(0), m.group("closing") == "/"

In [7]:
# Walk through every tag of the test string, printing its name and whether it is a closing tag.
pos = 0
while True:
    match = search_next_tag(test_str, pos)
    if not match:
        # No further tag: scanning is over.
        break
    tag, _startpos, endpos, closing = match
    print(tag, closing)
    # Resume right after the tag just reported.
    pos = endpos

PER True
ENTRY False


In [21]:
# State definitions
STATE_EXPECT_OPENING_TAG, STATE_EXPECT_CLOSING_TAG = range(2)

def xml_to_dict(xml_path: str) -> list[list[tuple[str, str]]]:
    """Parse a pseudo-XML file (list of tagged lines) into entries, coping with unpaired and missing tags.

    It works by scanning the file line by line, looking for tags.
    When an <ENTRY> or an </ENTRY> tag is found, the entry being parsed is added to the list of entries, any field being parsed is discarded, and the parser goes back to waiting for an opening tag.
    When an opening tag like <PER>, <ACT> and so on is found, we start slurping text until another tag is found.
    If the next tag properly closes the one we just read, then we finish slurping the text before the new tag, add the field (tag, text) to the entry, and finally start waiting for a new opening tag.
    If the next tag is incorrect, we discard the text and tag we started collecting, and wait for a new opening tag.
    Text chunks are right-stripped and joined with a single space, except after a trailing hyphen where they are joined directly; empty chunks are ignored.
    When the end of the file is reached, the entry being parsed (if any) is added to the list even if no final ENTRY tag was seen.
    Some normalization and fixes should be performed before parsing each line (project to latin script, deal with easy mistakes like forgotten digits around a <CARDINAL>\\d+</CARDINAL> construct) but none is implemented yet.

    Args:
        xml_path: Path of the pseudo-XML file to read (decoded as UTF-8).

    Raises:
        RuntimeError: When some bug reveals itself (field committed without a type, unknown parser state).

    Returns:
        list[list[tuple[str,str]]]: list of entries. Entries are lists of tuple (tag name, value), like `[('PER', 'Cantagrel'), ('ACT', 'architecte')]`. This should ease splitting when required.
    """
    all_entries: list[list[tuple[str, str]]] = []
    current_entry: list[tuple[str, str]] = []
    current_field_type: str | None = None
    current_field_value: str = ""

    # Helper functions, capturing variables from main function context
    def _reinit_field(tag: str | None) -> None:
        """Start a fresh field of type `tag` (None means no field is open)."""
        nonlocal current_field_type, current_field_value
        current_field_type = tag
        current_field_value = ""

    def _commit_entry() -> None:
        """Store the current entry if it has fields, dropping any unfinished field."""
        nonlocal current_entry
        # An uncommitted field at this point has unpaired tags or is noise: drop it.
        _reinit_field(None)
        if current_entry:
            all_entries.append(current_entry)
        current_entry = []

    def _commit_field() -> None:
        """Append the current (type, value) field to the current entry."""
        if current_field_type is None:
            raise RuntimeError("Must not commit field with unknown type")
        current_entry.append((current_field_type, current_field_value))
        _reinit_field(None)

    def _append_to_field_value(string: str) -> None:
        """Add a chunk of text to the value of the field being collected."""
        nonlocal current_field_value
        chunk = string.rstrip()
        if not chunk:
            # Nothing to add; returning early avoids a spurious separator space.
            return
        # Chunks are space-separated, unless the previous one ends with a hyphen.
        if current_field_value and not current_field_value.endswith("-"):
            current_field_value += " "
        current_field_value += chunk

    # main loop
    state = STATE_EXPECT_OPENING_TAG
    with open(xml_path, encoding="utf-8") as xml_file:
        for line in xml_file:
            start_pos = 0

            while True:
                # TODO rewrite the string:
                # - project to latin (char-level fast substitution, see DAS22 code)
                # - renormalize "(\d)*<CARDINAL>(\d)+</CARDINAL>(\d)*" into "<CARDINAL>\1\2\3</CARDINAL>"
                match = search_next_tag(line, start_pos)
                if not match:
                    # No tag until end of line: the remaining text only matters
                    # while a field is open (every newly opened field starts empty).
                    if state == STATE_EXPECT_CLOSING_TAG:
                        _append_to_field_value(line[start_pos:])
                    break
                # else: we found a tag
                tag_name, tag_start_pos, tag_end_pos, tag_is_closing = match
                prev_start_pos = start_pos
                # Update the loop variable now to avoid mistakes:
                # in all cases, the search continues after the current tag.
                start_pos = tag_end_pos

                # No matter which state we are in, we flush each time we get an ENTRY tag, opening or closing (until better results)
                if tag_name == "ENTRY":
                    _commit_entry()
                    # The unfinished field (if any) was dropped, so the next
                    # tag must be handled as a potential opening tag.
                    state = STATE_EXPECT_OPENING_TAG
                    continue

                if state == STATE_EXPECT_OPENING_TAG:
                    if tag_is_closing:
                        # Unexpected closing tag: drop current field and keep waiting.
                        # TODO should be logged to detect code problem in NER output generation
                        _reinit_field(None)
                    else:
                        # We have a proper, non-ENTRY opening tag.
                        _reinit_field(tag_name)
                        state = STATE_EXPECT_CLOSING_TAG

                elif state == STATE_EXPECT_CLOSING_TAG:
                    if tag_is_closing and tag_name == current_field_type:
                        # Matching closing tag: gather the remaining chars between the
                        # previous starting point and the start of the tag, then store the field.
                        _append_to_field_value(line[prev_start_pos:tag_start_pos])
                        _commit_field()
                    else:
                        # Unexpected opening tag and/or unpaired tag: drop the field.
                        # TODO should be logged to detect code problem in NER output generation
                        _reinit_field(None)
                    # Either way, a new opening tag is expected next.
                    state = STATE_EXPECT_OPENING_TAG

                else:
                    raise RuntimeError(f"Unknown state value: {state}")

    # End of file: keep the entry being built even without a final ENTRY tag.
    _commit_entry()
    return all_entries

In [23]:
# Run the parser on the sample file; the cell displays the returned list of entries.
xml_to_dict("output-sample/Didot_1853b-3:85-SAMPLE.xml")

[[('PER', 'Sédillot'), ('ACT', 'negociant')],
 [('PER', 'Halphen (Germ.'), ('ACT', 'juge au tribunal de commerce')],
 [('PER', 'Larrouy et Baillieux'), ('ACT', 'commissionnaires')],
 [('PER', 'Paccard (B.), Dufour et Cie'), ('ACT', 'banquiers')],
 [('PER', 'Bailleux, de la maison Larroux et Baillieux')],
 [('PER', 'Cazenave (Alphée)'), ('ACT', 'mě-decin')],
 [('PER', 'Cantagrel'), ('ACT', 'architecte')],
 [('PER', 'de Clansayes'),
  ('ACT', 'něgociant'),
  ('LOC', 'cité Tré-vise'),
  ('CARDINAL', '6')],
 [('PER', 'Allegri (B.) et comp.'),
  ('ACT', 'né-gociants'),
  ('ACT', 'banquicrs')],
 [('PER', 'Gaillard (I.) et Cie NC'), ('ACT', 'commiss. en sucre intiq')]]