In [1]:
from collections import defaultdict
from functools import partial
from lxml import etree
from unidecode import unidecode
import Levenshtein as lev
import numpy as np
import regex

### Getting tags that looks like `<article-id>`, `<aff>` and `<contrib>`

In [2]:
document_regex_searchers = {k: regex.compile(v).search for k, v in [
    ("article-id", r"/(?:front){e<=1}/.*/(?:article-id){e<=2}$"),
    ("contrib", r"/(?:front){e<=1}/(?:article-meta){e<=4}/.*/(?:contrib){e<=2}$"),
    ("aff", r"/(?:front){e<=1}/(?:article-meta){e<=4}/.*/(?:aff){e<=1}$"),
]}

In [3]:
def etree_tag_line_gen(root, start=""):
    start += "/" + root.tag
    yield start, root
    for node in root.iterchildren():
        yield from etree_tag_line_gen(node, start)

In [4]:
def get_article_id_contrib_aff(element):
    result = defaultdict(list)
    for tag_line, element in etree_tag_line_gen(element):
        for k, searcher in document_regex_searchers.items():
            if searcher(tag_line):
                result[k].append(element)
    return result

In [5]:
with open("selecao_xml_br/rbort/v48n1/0102-3616-rbort-48-01-0041.xml") as f:
    doctree = etree.parse(f)

In [6]:
{k: len(v) for k, v in get_article_id_contrib_aff(doctree.getroot()).items()}

{'article-id': 2, 'contrib': 1, 'aff': 1}

### Creating a Levenshtein distance approximate `__getitem__`

In [7]:
min(["aidy", "di", "aid", "prid", "xp", "b"], key=partial(lev.distance, "id"))

'aid'

In [8]:
def get_lev(dict_or_node, key):
    return dict_or_node.get(min(dict_or_node.keys(),
                                key=partial(lev.distance, key)))

In [9]:
branches = get_article_id_contrib_aff(doctree.getroot())
first_aff = branches["aff"][0]
get_lev(first_aff, "id")

'aff1'

### Getting the `rid` value from approximate `<xref ref-type="aff" rid="...">`

In [10]:
def xml_attr_cleanup(name):
    return regex.sub("/@", "", unidecode(name))

In [11]:
def etree_line_gen(branch, path=""):
    path += "/" + branch.tag
    for k, v in sorted(branch.items()):
        path += f"@{k}={xml_attr_cleanup(v)}"
    if branch.attrib: # We don't need an empty node entry
        yield path, branch
    for node in branch.xpath("node()"):
        if not isinstance(node, str):
            yield from etree_line_gen(node, path)
        else:
            if node.strip():
                yield f"{path}#text", node

Is this node a text?

In [12]:
first_contrib = branches["contrib"][0]
for path, node in etree_line_gen(first_contrib):
    print(f"{path:64} # {isinstance(node, str)}")

/contrib@contrib-type=author                                     # False
/contrib@contrib-type=author/name/surname#text                   # True
/contrib@contrib-type=author/name/given-names#text               # True
/contrib@contrib-type=author/xref@ref-type=aff@rid=aff1          # False
/contrib@contrib-type=author/xref@ref-type=aff@rid=aff1#text     # True
/contrib@contrib-type=author/xref@ref-type=corresp@rid=cor1      # False
/contrib@contrib-type=author/xref@ref-type=corresp@rid=cor1#text # True


In [13]:
xref_aff_regex = regex.compile(r"(?b)/(?:xref){e<=1}[^/]*@(?:ref-type){e<=2}=(?:aff){e<=1}(?:$|@)")

In [14]:
first_contrib_pnpairs = list(etree_line_gen(first_contrib))
first_contrib_lines, first_contrib_nodes = zip(*first_contrib_pnpairs)
first_contrib_lines_str = "\n".join(first_contrib_lines)

In [15]:
first_contrib_lines_ends = np.cumsum([len(p) + 1 for p in first_contrib_lines])
first_contrib_lines_ends

array([ 29,  76, 127, 183, 244, 304, 369])

In [16]:
fcls_match = xref_aff_regex.search(first_contrib_lines_str)
fcls_line_idx = np.where(first_contrib_lines_ends > fcls_match.start())[0][0]
get_lev(first_contrib_nodes[fcls_line_idx].attrib, "rid")

'aff1'

### Split the node generator for text and attribute lookup

In [17]:
def etree_line_text_node_gen(branch, path=""):
    path += "/" + branch.tag
    for k, v in sorted(branch.items()):
        path += f"@{k}={xml_attr_cleanup(v)}"
    for node in branch.xpath("node()"):
        if not isinstance(node, str):
            yield from etree_line_text_node_gen(node, path)
        elif node.strip():
            yield path, node

In [18]:
for path, node in etree_line_text_node_gen(first_contrib):
    print(f"{path:60} # {isinstance(node, str)}")

/contrib@contrib-type=author/name/surname                    # True
/contrib@contrib-type=author/name/given-names                # True
/contrib@contrib-type=author/xref@ref-type=aff@rid=aff1      # True
/contrib@contrib-type=author/xref@ref-type=corresp@rid=cor1  # True


In [19]:
def etree_line_attr_node_gen(branch, path=""):
    path += "/" + branch.tag
    for k, v in sorted(branch.items()):
        path += f"@{k}={xml_attr_cleanup(v)}"
    if branch.attrib: # We don't need an empty node entry
        yield path, branch
    for node in branch:
        if not isinstance(node, str):
            yield from etree_line_attr_node_gen(node, path)

In [20]:
for path, node in etree_line_attr_node_gen(first_contrib):
    print(f"{path:60} # {isinstance(node, str)}")

/contrib@contrib-type=author                                 # False
/contrib@contrib-type=author/xref@ref-type=aff@rid=aff1      # False
/contrib@contrib-type=author/xref@ref-type=corresp@rid=cor1  # False


### Identify all `<aff>` and `<contrib>` elements

In [21]:
def xref_aff_gen(contrib):
    for path, node in etree_line_attr_node_gen(contrib):
        if xref_aff_regex.search(path):
            yield node

In [22]:
{contrib: [get_lev(xref_aff, "rid") for xref_aff in xref_aff_gen(contrib)]
 for contrib in branches["contrib"]}

{<Element contrib at 0x7fd35738c608>: ['aff1']}

In [23]:
{aff: get_lev(aff, "id") for aff in branches["aff"]}

{<Element aff at 0x7fd35738c5c8>: 'aff1'}

### Extracting `<article-id>` data

Suppose our extractors are defined by a string table like:

```python
#    result key,   XML element attribute,          line matching regex
[
    ("key_name_1", "first-attr-in-my-xml-element", "/(?:some)?regex"),
    ("key_name_2", "second-attr-in-that-element",  "/another.regex"),
    ("key_name_3", "another-attr-in-the-element",  "/(?:approx){e<=1}regex"),
    ("key_name_4", "",                             "/some@node"), # Matches text
]
```

Where the attribute name is an empty string when the regex should match a text node. The extractor input is a single XML element.

The first (yet incomplete) approach was a  text-only extractor (i.e., one that ignores the attribute column and gets the text node) would be:

```python
def etree_texts_extract(regexes_table, element)
    paths, nodes = zip(*etree_line_text_node_gen(element))
    paths_str = "\n".join(paths)
    ends = np.cumsum([len(p) + 1 for p in paths]) # Add \n
    keys, attrs, regexes = zip(*regexes_table) # Ignore attrs
    matches_gen = (r.search(paths_str) for r in regexes)
    nodes_gen = (match and nodes[np.where(ends > match.start())[0][0]]
                 for match in matches_gen)
    return {key: node for key, node in zip(keys, nodes_gen) if node}
```

The same, somewhat complicated, but for both text and attribute nodes:

```python
def etree_extract(regexes_table, element):
    text_paths, text_nodes = zip(*etree_line_text_node_gen(element))
    text_paths_str = unidecode("\n".join(text_paths)).lower()
    text_ends = np.cumsum([len(p) + 1 for p in text_paths]) # Add \n

    attr_paths, attr_nodes = zip(*etree_line_attr_node_gen(element))
    attr_paths_str = unidecode("\n".join(attr_paths)).lower()
    attr_ends = np.cumsum([len(p) + 1 for p in attr_paths])

    keys, attrs, regexes = zip(*regexes_table)
    at_paths, at_nodes, at_paths_str, at_ends = zip(*[
        (attr_paths, attr_nodes, attr_paths_str, attr_ends)
        if attr else
        (text_paths, text_nodes, text_paths_str, text_ends)
        for attr in attrs
    ])

    matches_gen = (r.search(p) for r, p in zip(regexes, at_paths_str))
    nodes_gen = (match and nodes[np.where(ends > match.start())[0][0]]
                 for match, nodes, ends in zip(matches_gen, at_nodes, at_ends))
    return {key: (get_lev(node, attr) if attr else node.strip())
                                      if node is not None else ""
            for key, attr, node in zip(keys, attrs, nodes_gen)}
```

The general (and simpler) approach found was to only walk through element nodes, not text nodes:

In [24]:
def etree_path_gen(branch, path=""):
    path += "/" + branch.tag
    for k, v in sorted(branch.items()):
        path += f"@{xml_attr_cleanup(k)}={xml_attr_cleanup(v)}"
    yield path, branch
    for node in branch:
        if not isinstance(node, str):
            yield from etree_path_gen(node, path)

In [25]:
def node_getattr(node, attr):
    if node is None:
        return ""
    if attr:
        return get_lev(node, attr)
    full_text = etree.tounicode(node, method="text", with_tail=False)
    return regex.sub("\s+", " ", full_text).strip()

In [26]:
def etree_extract(regexes_table, element):
    paths, nodes = zip(*etree_path_gen(element))
    paths_str = "\n".join(paths)
    ends = np.cumsum([len(p) + 1 for p in paths]) # Add \n
    keys, attrs, regexes = zip(*regexes_table)
    matches_gen = (r.search(paths_str) for r in regexes)
    nodes_gen = (match and nodes[np.where(ends > match.start())[0][0]]
                 for match in matches_gen)
    return {key: node_getattr(node, attr)
            for key, attr, node in zip(keys, attrs, nodes_gen)}

In [27]:
aids_regs = \
    [(key, attr, regex.compile(r, regex.B | regex.M)) for key, attr, r in [
        ("article_publisher_id", "", r"/(?:article-id){e<=2}[^/]*"
                                     r"@(?:pub-id-type){e<=4}"
                                     r"=(?:publisher-id){e<=4}(?:$|@)"),
        ("article_doi", "", r"/(?:article-id){e<=2}[^/]*"
                            r"@(?:pub-id-type){e<=4}"
                            r"=(?:doi){e<=1}(?:$|@)"),
    ]]
aids_extract = partial(etree_extract, aids_regs)

In [28]:
aids = etree.Element("aids")
aids.extend(branches["article-id"])

In [29]:
print(etree.tounicode(aids))
aids_extract(aids)

<aids><article-id pub-id-type="publisher-id">S0102-36162013000100008</article-id>
			<article-id pub-id-type="doi">10.1016/j.rboe.2011.12.001</article-id>
			</aids>


{'article_publisher_id': 'S0102-36162013000100008',
 'article_doi': '10.1016/j.rboe.2011.12.001'}

With some invalid `<article-id>` tags:

In [30]:
aids_extract(etree.XML("""<aids>
<articcle-id pub-id-dtype="puisherdi">S000... PID
</articcle-id>
<raticle-id pub-id-te="do">10.xxxx/y.zzzz.2000.10.002</raticle-id>
</aids>"""))

{'article_publisher_id': 'S000... PID',
 'article_doi': '10.xxxx/y.zzzz.2000.10.002'}

### Extracting `<contrib>` data

In [31]:
contrib_regs = \
    [(key, attr, regex.compile(r, regex.B | regex.M)) for key, attr, r in [
        ("contrib_type", "contrib-type", r"/[^/]*$"),
        ("contrib_surname", "", r"/(?:surname){e<=2}$"),
        ("contrib_given_names", "", r"/(?:given-names){e<=3}$"),
        ("contrib_prefix", "", r"/(?:prefix){e<=2}$"),
        ("contrib_suffix", "", r"/(?:suffix){e<=2}$"),
        ("xref_corresp", "rid", r"/(?:xref){e<=1}[^/]*"
                                r"@(?:ref-type){e<=2}"
                                r"=(?:corresp){e<=2}(?:$|@)"),
        ("xref_corresp_text", "", r"/(?:xref){e<=1}[^/]*"
                                  r"@(?:ref-type){e<=2}"
                                  r"=(?:corresp){e<=2}(?:$|@)"),
        ("xref_aff", "", r"/(?:xref){e<=1}[^/]*"
                         r"@(?:ref-type){e<=2}"
                         r"=(?:aff){e<=1}(?:$|@)"),
        ("xref_aff_text", "", r"/(?:xref){e<=1}[^/]*"
                              r"@(?:ref-type){e<=2}"
                              r"=(?:aff){e<=1}(?:$|@)"),
    ]]
contrib_extract = partial(etree_extract, contrib_regs)

In [32]:
print(etree.tounicode(first_contrib))
contrib_extract(first_contrib)

<contrib xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" contrib-type="author">
					<name>
						<surname>Carvalho</surname>
						<given-names>Daniel Augusto de</given-names>
					</name>
					<xref ref-type="aff" rid="aff1">1</xref>
					<xref ref-type="corresp" rid="cor1">*</xref>
				</contrib>
				


{'contrib_type': 'author',
 'contrib_surname': 'Carvalho',
 'contrib_given_names': 'Daniel Augusto de',
 'contrib_prefix': '',
 'contrib_suffix': '',
 'xref_corresp': 'cor1',
 'xref_corresp_text': '*',
 'xref_aff': '1',
 'xref_aff_text': '1'}

In [33]:
contrib_extract(etree.XML("""<contrib contriytpe="author">
<unname><suRrnamy>Sobrenome</suRrnamy>
<xrref reftÿpé="rresp" ri="aff2">ref. 2</xrref>
</unname>
<name><xrref reftype="afff" id="aff1">ref. 1</xrref></name>
<givename>Namae!</givename>
</contrib>"""))

{'contrib_type': 'author',
 'contrib_surname': 'Sobrenome ref. 2',
 'contrib_given_names': 'Namae!',
 'contrib_prefix': '',
 'contrib_suffix': '',
 'xref_corresp': 'aff2',
 'xref_corresp_text': 'ref. 2',
 'xref_aff': 'ref. 1',
 'xref_aff_text': 'ref. 1'}

### Extracting `<aff>` data

In [34]:
aff_regs = \
    [(key, attr, regex.compile(r, regex.B | regex.M)) for key, attr, r in [
        ("addr_country", "", r"/(?:country){e<=2}(?:$|@)"),
        ("addr_country_code", "country", r"/(?:country){e<=2}[^/]*@(?:country){e<=2}"),
        ("aff_id", "id", r"/[^/]*$"),
        ("aff_text", "", r"/[^/]*$"),
        ("email", "", r"/(?:email){e<=1}$"),
        ("institution_original", "", r"/(?:institution){e<=2}[^/]*"
                                     r"@(?:content-type){e<=4}"
                                     r"=(?:original){e<=2}(?:$|@)"),
        ("institution_orgdiv1", "", r"/(?:institution){e<=2}[^/]*"
                                    r"@(?:content-type){e<=4}"
                                    r"=(?:orgdiv){e<=2}1(?:$|@)"),
        ("institution_orgdiv2", "", r"/(?:institution){e<=2}[^/]*"
                                    r"@(?:content-type){e<=4}"
                                    r"=(?:orgdiv){e<=2}2(?:$|@)"),
        ("institution_orgname", "", r"/(?:institution){e<=2}[^/]*"
                                    r"@(?:content-type){e<=4}"
                                    r"=(?:orgname){e<=2}(?:$|@)"),
        ("institution_orgname_rewritten", "", r"/(?:institution){e<=2}[^/]*"
                                              r"@(?:content-type){e<=4}"
                                              r"=(?:normalized){e<=4}(?:$|@)"),
        ("label", "", r"/(?:label){e<=1}$"),
    ]]
aff_extract = partial(etree_extract, aff_regs)

In [35]:
print(etree.tounicode(first_aff))
aff_extract(first_aff)

<aff xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" id="aff1">
					<label>1</label>
				Orthopedist and Traumatologist at the
					<institution content-type="orgname">Paraná Club</institution>, 
					<institution content-type="orgdiv1">Ninho da Gralha Player Training Center</institution>, <addr-line>
						<named-content content-type="city">Quatro Barras</named-content>, <named-content content-type="state">Paraná</named-content>, </addr-line>
					<country>Brazil</country>
				</aff>
			


{'addr_country': 'Brazil',
 'addr_country_code': '',
 'aff_id': 'aff1',
 'aff_text': '1 Orthopedist and Traumatologist at the Paraná Club, Ninho da Gralha Player Training Center, Quatro Barras, Paraná, Brazil',
 'email': '',
 'institution_original': '',
 'institution_orgdiv1': 'Ninho da Gralha Player Training Center',
 'institution_orgdiv2': '',
 'institution_orgname': 'Paraná Club',
 'institution_orgname_rewritten': '',
 'label': '1'}

In [36]:
fname = "0104-6632-bjce-33-01-0001.xml"
with open(fname) as f:
    d = etree.parse(f)
print(etree.tounicode(d.xpath("//aff")[0]))
aff_extract(d.xpath("//aff")[0])

<aff xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" id="aff1">
					<institution content-type="orgname">University of Chemical Technology and Metallurgy</institution>
					<addr-line>
						<named-content content-type="city">Sofia</named-content>
					</addr-line>
					<country country="BG">Bulgaria</country>
					<email>rteodosieva@abv.bg</email>
					<institution content-type="original">University of Chemical Technology and Metallurgy, 8 Kliment Ohridski Blvd. 1756, Sofia, Bulgaria. Phone: + 359 28163247. E-mail: rteodosieva@abv.bg</institution>
				</aff>
			


{'addr_country': 'Bulgaria',
 'addr_country_code': 'BG',
 'aff_id': 'aff1',
 'aff_text': 'University of Chemical Technology and Metallurgy Sofia Bulgaria rteodosieva@abv.bg University of Chemical Technology and Metallurgy, 8 Kliment Ohridski Blvd. 1756, Sofia, Bulgaria. Phone: + 359 28163247. E-mail: rteodosieva@abv.bg',
 'email': 'rteodosieva@abv.bg',
 'institution_original': 'University of Chemical Technology and Metallurgy, 8 Kliment Ohridski Blvd. 1756, Sofia, Bulgaria. Phone: + 359 28163247. E-mail: rteodosieva@abv.bg',
 'institution_orgdiv1': '',
 'institution_orgdiv2': '',
 'institution_orgname': 'University of Chemical Technology and Metallurgy',
 'institution_orgname_rewritten': '',
 'label': ''}

To be continued...