## XML parsing with ElementTree (+ XPath)

In [1]:
from xml.etree.ElementTree import ElementTree

In [2]:
# Hand the path to ElementTree instead of a text-mode file handle: the library
# then opens the file in binary mode, so the parser decodes the bytes according
# to the XML declaration rather than the platform's default text encoding, and
# it closes the file itself once parsing is done.
doc_etree = ElementTree(file="0104-6632-bjce-33-01-0001.xml")

In [3]:
# Use the relative form ".//aff": on an ElementTree an absolute path such as
# "//aff" is rewritten to "./..." internally and emits a FutureWarning.
doc_etree.findall(".//aff")

  """Entry point for launching an IPython kernel.


[<Element 'aff' at 0x7fa9f02ec098>]

In [4]:
# ".//aff" selects the same <aff> descendants as "//aff" but without the
# FutureWarning that an absolute path triggers on an ElementTree.
# Printing an Element only shows its repr, not its XML content.
print(doc_etree.findall(".//aff")[0])

<Element 'aff' at 0x7fa9f02ec098>


  """Entry point for launching an IPython kernel.


## XML parsing with libxml2 (+ XPath)

In [5]:
import libxml2

In [6]:
# Parse the article XML file into a libxml2 document.
# NOTE(review): the libxml2 Python bindings appear to require an explicit
# doc.freeDoc() once the document is no longer needed -- confirm and add cleanup.
doc = libxml2.parseFile("0104-6632-bjce-33-01-0001.xml")

In [7]:
# Evaluate the XPath expression against the document; the bare name on the
# last line makes the notebook display the resulting list of nodes.
doc_affs = doc.xpathEval("//aff")
doc_affs

[<xmlNode (aff) object at 0x7fa9f017a710>]

In [8]:
# Take the first matched <aff> node; printing it shows the node's serialized XML.
doc_aff = doc_affs[0]
print(doc_aff)

<aff id="aff1">
					<institution content-type="orgname">University of Chemical Technology and Metallurgy</institution>
					<addr-line>
						<named-content content-type="city">Sofia</named-content>
					</addr-line>
					<country country="BG">Bulgaria</country>
					<email>rteodosieva@abv.bg</email>
					<institution content-type="original">University of Chemical Technology and Metallurgy, 8 Kliment Ohridski Blvd. 1756, Sofia, Bulgaria. Phone: + 359 28163247. E-mail: rteodosieva@abv.bg</institution>
				</aff>


## XML parsing with lxml's ElementTree (+ XPath)

In [9]:
from lxml import etree

In [10]:
# Give lxml the file name rather than a text-mode handle: the parser then reads
# the raw bytes itself and honours the encoding declared in the XML, instead of
# relying on the platform's default text encoding used by open().
data = etree.parse("0104-6632-bjce-33-01-0001.xml")

In [11]:
# The [normalize-space()] predicate keeps only text nodes that are not empty
# or whitespace-only.
data.xpath("//aff/country/text()[normalize-space()]")

['Bulgaria']

In [12]:
# Build a reusable, callable XPath object from the same expression; the bare
# name on the last line displays it.
country_xpath = etree.XPath("//aff/country/text()[normalize-space()]")
country_xpath

//aff/country/text()[normalize-space()]

In [13]:
# Calling the XPath object with a tree evaluates the expression on that tree.
country_xpath(data)

['Bulgaria']

## XSLT with lxml

In [14]:
# Minimal XSLT example with text output: for each matched <a> element it writes
# {"test": <text of its <b> child>}. The value is emitted without quotes.
transform = etree.XSLT(etree.XML("""
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output method="text" encoding="utf-8" />
    <xsl:template match="//a">
        <xsl:text>{"test": </xsl:text>
        <xsl:value-of select="b/text()" />
        <xsl:text>}</xsl:text>
    </xsl:template>
</xsl:stylesheet>
"""))

In [15]:
# Apply the stylesheet to a small in-memory document; print() shows the text result.
print(transform(etree.XML("<c><a><b>Tééeext</b></a></c>")))

{"test": Tééeext}


## XML to JSON with XSLT

XSLT from https://www.bjelic.net/2012/08/01/coding/convert-xml-to-json-using-xslt/

In [16]:
# Generic XML-to-JSON stylesheet (source: the bjelic.net article linked above).
# Limitation shown later in this notebook: json.loads() rejects the output
# produced for the full article XML, so the result is not always valid JSON.
# NOTE(review): no template appears to escape quotes or backslashes in text
# and attribute values -- a likely cause of the invalid output; confirm.
xml2json_xslt = etree.XSLT(etree.XML("""
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output method="text" encoding="utf-8"/>
 
    <xsl:template match="/*[node()]">
        <xsl:text>{</xsl:text>
        <xsl:apply-templates select="." mode="detect" />
        <xsl:text>}</xsl:text>
    </xsl:template>
 
    <xsl:template match="*" mode="detect">
        <xsl:choose>
            <xsl:when test="name(preceding-sibling::*[1]) = name(current()) and name(following-sibling::*[1]) != name(current())">
                    <xsl:apply-templates select="." mode="obj-content" />
                <xsl:text>]</xsl:text>
                <xsl:if test="count(following-sibling::*[name() != name(current())]) &gt; 0">, </xsl:if>
            </xsl:when>
            <xsl:when test="name(preceding-sibling::*[1]) = name(current())">
                    <xsl:apply-templates select="." mode="obj-content" />
                    <xsl:if test="name(following-sibling::*) = name(current())">, </xsl:if>
            </xsl:when>
            <xsl:when test="following-sibling::*[1][name() = name(current())]">
                <xsl:text>"</xsl:text><xsl:value-of select="name()"/><xsl:text>" : [</xsl:text>
                    <xsl:apply-templates select="." mode="obj-content" /><xsl:text>, </xsl:text>
            </xsl:when>
            <xsl:when test="count(./child::*) > 0 or count(@*) > 0">
                <xsl:text>"</xsl:text><xsl:value-of select="name()"/>" : <xsl:apply-templates select="." mode="obj-content" />
                <xsl:if test="count(following-sibling::*) &gt; 0">, </xsl:if>
            </xsl:when>
            <xsl:when test="count(./child::*) = 0">
                <xsl:text>"</xsl:text><xsl:value-of select="name()"/>" : "<xsl:apply-templates select="."/><xsl:text>"</xsl:text>
                <xsl:if test="count(following-sibling::*) &gt; 0">, </xsl:if>
            </xsl:when>
        </xsl:choose>
    </xsl:template>
 
    <xsl:template match="*" mode="obj-content">
        <xsl:text>{</xsl:text>
            <xsl:apply-templates select="@*" mode="attr" />
            <xsl:if test="count(@*) &gt; 0 and (count(child::*) &gt; 0 or text())">, </xsl:if>
            <xsl:apply-templates select="./*" mode="detect" />
            <xsl:if test="count(child::*) = 0 and text() and not(@*)">
                <xsl:text>"</xsl:text><xsl:value-of select="name()"/>" : "<xsl:value-of select="text()"/><xsl:text>"</xsl:text>
            </xsl:if>
            <xsl:if test="count(child::*) = 0 and text() and @*">
                <xsl:text>"text" : "</xsl:text><xsl:value-of select="text()"/><xsl:text>"</xsl:text>
            </xsl:if>
        <xsl:text>}</xsl:text>
        <xsl:if test="position() &lt; last()">, </xsl:if>
    </xsl:template>
 
    <xsl:template match="@*" mode="attr">
        <xsl:text>"</xsl:text><xsl:value-of select="name()"/>" : "<xsl:value-of select="."/><xsl:text>"</xsl:text>
        <xsl:if test="position() &lt; last()">,</xsl:if>
    </xsl:template>
 
    <xsl:template match="node/@TEXT | text()" name="removeBreaks">
        <xsl:param name="pText" select="normalize-space(.)"/>
        <xsl:choose>
            <xsl:when test="not(contains($pText, '&#xA;'))"><xsl:copy-of select="$pText"/></xsl:when>
            <xsl:otherwise>
                <xsl:value-of select="concat(substring-before($pText, '&#xD;&#xA;'), ' ')"/>
                <xsl:call-template name="removeBreaks">
                    <xsl:with-param name="pText" select="substring-after($pText, '&#xD;&#xA;')"/>
                </xsl:call-template>
            </xsl:otherwise>
        </xsl:choose>
    </xsl:template>
 
</xsl:stylesheet>
"""))

In [17]:
# Sanity check of the stylesheet on a tiny in-memory document.
print(xml2json_xslt(etree.XML("<c><a><b>Tééeext</b></a></c>")))

{"c" : {"a" : {"b" : "Tééeext"}}}


Does it work with an article XML?

In [18]:
# Run the same stylesheet over the parsed article tree (`data`).
print(xml2json_xslt(data))

{"article" : {"article-type" : "research-article","dtd-version" : "1.0","specific-use" : "sps-1.4","xml:lang" : "en", "front" : {"journal-meta" : {"journal-id" : {"journal-id-type" : "publisher-id", "text" : "bjce"}, "journal-title-group" : {"journal-title" : "Brazilian Journal of Chemical Engineering", "abbrev-journal-title" : {"abbrev-type" : "publisher", "text" : "Braz. J. Chem. Eng."}}, "issn" : [{"pub-type" : "ppub", "text" : "0104-6632"}, {"pub-type" : "epub", "text" : "1678-4383"}], "publisher" : {"publisher-name" : "Brazilian Society of Chemical Engineering"}}, "article-meta" : {"article-id" : {"pub-id-type" : "doi", "text" : "10.1590/0104-6632.20160331s00003267"}, "article-categories" : {"subj-group" : {"subj-group-type" : "heading", "subject" : "BIOPROCESS ENGINEERING"}}, "title-group" : {"article-title" : "BIODECOMPOSITION OF JORDAN PHOSPHORITE BY PHOSPHATE-SOLUBILIZING FUNGI"}, "contrib-group" : {"contrib" : [{"contrib-type" : "author", "name" : {"surname" : "Teodosieva", "

In [19]:
import json

It seems the generated JSON is broken: `json.loads` fails to parse it.

In [20]:
# Demonstrate that the stylesheet output is not valid JSON for this article.
# The decode error is the point of the cell, so catch it and display it
# instead of letting the exception abort a "Run All" of the notebook.
try:
    json.loads(str(xml2json_xslt(data)))
except json.JSONDecodeError as exc:
    print("JSONDecodeError:", exc)

JSONDecodeError: Expecting ',' delimiter: line 1 column 7815 (char 7814)

In [21]:
# with open("0104-6632-bjce-33-01-0001.json", "w") as f:
#     f.write(str(xml2json_xslt(data)))

## Left-joining aff and contrib with XSLT

In [22]:
# Serialize every <contrib> element back to XML text and print it.
contrib_nodes = data.xpath("//contrib")
for contrib in contrib_nodes:
    raw_xml = etree.tostring(contrib, encoding="utf-8")
    print(raw_xml.decode("utf-8"))

<contrib xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" contrib-type="author">
					<name>
						<surname>Teodosieva</surname>
						<given-names>R.</given-names>
					</name>
					<xref ref-type="aff" rid="aff1"/>
					<xref ref-type="fn" rid="fn1">*</xref>
				</contrib>
				
<contrib xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" contrib-type="author">
					<name>
						<surname>Bojinova</surname>
						<given-names>D.</given-names>
					</name>
					<xref ref-type="aff" rid="aff1"/>
				</contrib>
				


In [23]:
# Print only the <contrib> elements that have an affiliation xref pointing at
# the requested id; the XPath variable $affid is bound by the keyword argument.
aff1_contribs = data.xpath(
    '//contrib/xref[@ref-type = "aff" and @rid = $affid]/..',
    affid="aff1",
)
for contrib in aff1_contribs:
    contrib_bytes = etree.tostring(contrib, encoding="utf-8")
    print(contrib_bytes.decode("utf-8"))

<contrib xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" contrib-type="author">
					<name>
						<surname>Teodosieva</surname>
						<given-names>R.</given-names>
					</name>
					<xref ref-type="aff" rid="aff1"/>
					<xref ref-type="fn" rid="fn1">*</xref>
				</contrib>
				
<contrib xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" contrib-type="author">
					<name>
						<surname>Bojinova</surname>
						<given-names>D.</given-names>
					</name>
					<xref ref-type="aff" rid="aff1"/>
				</contrib>
				


In [24]:
# Stylesheet that emits, inside a <pairs> root, one <affcontrib> element per
# (affiliation, contributor) pair. Contributors are matched to an <aff> through
# their <xref ref-type="aff" rid="..."> children. An <aff> with no matching
# contributor still yields an <affcontrib> holding only the affiliation
# content, which is what makes this a "left join".
aff_contrib_xslt = etree.XSLT(etree.XML("""
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output method="xml" version="1.0" encoding="utf-8" indent="yes"/>
    <xsl:strip-space elements="*"/>

    <xsl:template match="article">
        <pairs>
        <xsl:for-each select="//aff">
            <xsl:variable name ="affid" select="@id" />
            <xsl:call-template name="left_join">
                <xsl:with-param name="aff" select="." />
                <xsl:with-param name="contrib" select="//contrib/xref[@ref-type = 'aff' and @rid = $affid]/.." />
            </xsl:call-template>
        </xsl:for-each>
        </pairs>
    </xsl:template>

    <xsl:template name="left_join">
        <xsl:param name="aff" />
        <xsl:param name="contrib" />

        <xsl:choose>
            <xsl:when test="$contrib">
                <xsl:for-each select="$contrib">
                    <xsl:call-template name="print_join">
                        <xsl:with-param name="aff" select="$aff" />
                        <xsl:with-param name="contrib" select="." />
                    </xsl:call-template>
                </xsl:for-each>
            </xsl:when>
            <xsl:otherwise>
                <xsl:call-template name="print_join">
                    <xsl:with-param name="aff" select="$aff" />
                </xsl:call-template>
            </xsl:otherwise>
        </xsl:choose>
    </xsl:template>

    <xsl:template match="node() | @*">
        <xsl:copy>
            <xsl:apply-templates select="node() | @*" />
        </xsl:copy>
    </xsl:template>

    <xsl:template name="print_join">
        <xsl:param name="aff" />
        <xsl:param name="contrib" />
        <xsl:element name="affcontrib">
            <xsl:for-each select="$aff">
                <xsl:apply-templates select="node()" />
            </xsl:for-each>
            <xsl:if test="$contrib">
                <xsl:apply-templates select="$contrib/name"/>
                <ctype><xsl:value-of select="$contrib/@contrib-type" /></ctype>
            </xsl:if>
        </xsl:element>
    </xsl:template>

</xsl:stylesheet>
"""))

In [25]:
# Apply the join stylesheet to the article; print() shows the serialized XML result.
print(aff_contrib_xslt(data))

<?xml version="1.0"?>
<pairs>
  <affcontrib>
    <institution content-type="orgname">University of Chemical Technology and Metallurgy</institution>
    <addr-line>
      <named-content content-type="city">Sofia</named-content>
    </addr-line>
    <country country="BG">Bulgaria</country>
    <email>rteodosieva@abv.bg</email>
    <institution content-type="original">University of Chemical Technology and Metallurgy, 8 Kliment Ohridski Blvd. 1756, Sofia, Bulgaria. Phone: + 359 28163247. E-mail: rteodosieva@abv.bg</institution>
    <name>
      <surname>Teodosieva</surname>
      <given-names>R.</given-names>
    </name>
    <ctype>author</ctype>
  </affcontrib>
  <affcontrib>
    <institution content-type="orgname">University of Chemical Technology and Metallurgy</institution>
    <addr-line>
      <named-content content-type="city">Sofia</named-content>
    </addr-line>
    <country country="BG">Bulgaria</country>
    <email>rteodosieva@abv.bg</email>
    <institution content-type="origi

## Where does the dataset have aff xrefs?

In [26]:
import pathlib, collections

In [27]:
# Map each parent tag of an <xref ref-type="aff"> element to the set of files
# in which it occurs. Files that cannot be read or parsed are collected under
# the "<errored>" key; fnames_total counts every XML file visited.
fnames = collections.defaultdict(set)
fnames_total = 0
for xml_path in pathlib.Path("selecao_xml_br").glob("**/*.xml"):
    fnames_total += 1
    try:
        # Binary mode: let the XML parser decode the bytes from the document's
        # own encoding declaration instead of the platform's default text
        # encoding, which could reject or mis-decode otherwise valid files.
        with open(xml_path, "rb") as xml_file:
            xml_etree = etree.parse(xml_file)
    except (OSError, etree.LxmlError):
        # Only I/O and lxml failures count as "errored"; a bare `except:` would
        # also swallow KeyboardInterrupt and hide programming mistakes.
        fnames["<errored>"].add(xml_path)
    else:
        for el in xml_etree.xpath("//xref[@ref-type = 'aff']/.."):
            fnames[el.tag].add(xml_path)

In [28]:
# Number of distinct files recorded under each key of `fnames`, plus the total
# number of files visited; the bare name on the last line displays the dict.
quantities = dict(zip(fnames.keys(), map(len, fnames.values())))
quantities.update({"<total>": fnames_total})
quantities

{'<errored>': 425,
 '<total>': 23815,
 'aff': 2,
 'contrib': 21991,
 'name': 1,
 'p': 5}

In [29]:
# Fraction of files that failed to load. "<errored>" only exists when at least
# one file failed, and "<total>" is 0 when no XML file was found, so guard
# against both the KeyError and the ZeroDivisionError.
(quantities.get("<errored>", 0) / quantities["<total>"]) if quantities["<total>"] else 0.0

0.01784589544404787

In [30]:
from IPython.lib.pretty import pprint
# Pretty-print the tag -> files mapping; as the inline note says, the complete
# dump is about 2MB, so it is not shown in full unless max_seq_length is raised.
pprint(fnames) # Use max_seq_length=float("inf") to see all (2MB data)

defaultdict(set,
            {'contrib': {PosixPath('selecao_xml_br/csc/v21n1/1413-8123-csc-21-01-0001.xml'),
              PosixPath('selecao_xml_br/ides/v68n1/2175-8026-ides-68-01-00075.xml'),
              PosixPath('selecao_xml_br/rlae/v23n1/0104-1169-rlae-23-01-00155.xml'),
              PosixPath('selecao_xml_br/pat/v47n1/1983-4063-pat-47-01-0022.xml'),
              PosixPath('selecao_xml_br/acb/v31s1/0102-8650-acb-31-s1-00008.xml'),
              PosixPath('selecao_xml_br/bjmbr/v46n1/bjb-46-01-001.xml'),
              PosixPath('selecao_xml_br/jbpneu/v40n1/1806-3713-jbpneu-40-01-00082.xml'),
              PosixPath('selecao_xml_br/inter/v18n1/1518-7012-inter-18-01-0165.xml'),
              PosixPath('selecao_xml_br/rgenf/xml/reme/v21/1415-2762-reme-20170050.xml'),
              PosixPath('selecao_xml_br/abcd/v30n1/0102-6720-abcd-30-01-00069.xml'),
              PosixPath('selecao_xml_br/rca/v49n1/1806-6690-rca-49-01-0062.xml'),
              PosixPath('selecao_xml_br/bjpp/v48n0

Files that have an xref to an affiliation in a `<p>` element:
```
bjpp/v48n01/0102-3616-rbort-48-01-0057.xml
ld/v17n1/1518-7632-ld-17-01-00011.xml
pci/v21n1/1413-9936-pci-21-01-00134.xml
rbcdh/v17n1/1980-0037-rbcdh-17-01-00073.xml
ress/v25n1/2237-9622-ress-25-01-00095.xml
```

Files that have an xref to an affiliation in a `<name>` element:
```
aabc/v86n1/0001-3765-aabc-86-01-429.xml
```

That's the reason why Hugo Tourinho doesn't have an affiliation link in http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0001-37652014000100429&lng=en&nrm=iso&tlng=en

Files that have an xref to an affiliation in an `<aff>` element:
```
ean/v18n1/1414-8145-ean-18-01-0075.xml
rgenf/xml/ean/v18n1/1414-8145-ean-18-01-0075.xml
```

There are duplicated files! These last two are actually the same!

In [31]:
# cmp -l lists every differing byte; no output means the two files are byte-identical.
!cmp -l selecao_xml_br/ean/v18n1/1414-8145-ean-18-01-0075.xml selecao_xml_br/rgenf/xml/ean/v18n1/1414-8145-ean-18-01-0075.xml