# Creating a table with all data regarding `<aff>` in the front matter

In [1]:
import json
from io import StringIO
import pathlib, csv

In [2]:
import pandas as pd
from lxml import etree

## Getting data from element with XPath

*scenes from the last episode...*

In [3]:
with open("selecao_xml_br/rbepid/v20n1/1980-5497-rbepid-20-01-00115.xml") as f:
    data = etree.parse(f)

In [4]:
for el in data.xpath("//country"):
    print(etree.tostring(el, encoding="utf-8").decode("utf-8"))

<country xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" country="BR">Brazil</country>
			
<country xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" country="BR">Brazil</country>
			


In [5]:
data.xpath("//country/@country")

['BR', 'BR']

Files opened while testing:

```
0104-6632-bjce-33-01-0001.xml
1020-4989-RPSP-41-e78.xml
selecao_xml_br/rbepid/v20n1/1980-5497-rbepid-20-01-00115.xml
selecao_xml_br/rbccv/v31n1/0102-7638-rbccv-31-01-0007.xml -> Has subarticle w/ <aff> in <front-stub> (not used)
selecao_xml_br/rimtsp/v55n1/0036-4665-rimtsp-55-01-07.xml -> Has both <article-id>, but no country code
selecao_xml_br/bbr/v15n1/1808-2386-bbr-15-01-0088.xml -> Hybrid <named-content> and <state>/<city>
selecao_xml_br/abb/v32n1/0102-3306-abb-0102-33062017abb0306.xml -> Has <postal-code>
selecao_xml_br/rbgo/v36n11/0100-7203-rbgo-36-11-0529.xml -> Don't have <aff>
```

## Getting aff-contrib pairs in a JSON with XSLT

In [6]:
aff_contrib_xslt = etree.XSLT(etree.XML("""
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output method="xml" version="1.0" encoding="utf-8" indent="yes"/>
    <xsl:strip-space elements="*"/>

    <xsl:template match="article">
        <xsl:element name="pairs">
            <xsl:for-each select="//front//article-id">
                <xsl:apply-templates select="." />
            </xsl:for-each>
            <xsl:for-each select="//front//aff">
                <xsl:variable name ="affid" select="@id" />
                <xsl:call-template name="left_join">
                    <xsl:with-param name="aff" select="." />
                    <xsl:with-param name="contrib" select="//front//contrib/xref[@ref-type = 'aff' and @rid = $affid]/.." />
                </xsl:call-template>
            </xsl:for-each>
        </xsl:element>
    </xsl:template>

    <xsl:template name="left_join">
        <xsl:param name="aff" />
        <xsl:param name="contrib" />

        <xsl:choose>
            <xsl:when test="$contrib">
                <xsl:for-each select="$contrib">
                    <xsl:call-template name="print_join">
                        <xsl:with-param name="aff" select="$aff" />
                        <xsl:with-param name="contrib" select="." />
                    </xsl:call-template>
                </xsl:for-each>
            </xsl:when>
            <xsl:otherwise>
                <xsl:call-template name="print_join">
                    <xsl:with-param name="aff" select="$aff" />
                </xsl:call-template>
            </xsl:otherwise>
        </xsl:choose>
    </xsl:template>

    <xsl:template match="node() | @*">
        <xsl:copy>
            <xsl:apply-templates select="node() | @*" />
        </xsl:copy>
    </xsl:template>

    <xsl:template name="print_join">
        <xsl:param name="aff" />
        <xsl:param name="contrib" />
        <xsl:element name="affcontrib">
            <xsl:for-each select="$aff">
                <xsl:apply-templates select="node()" />
                <xsl:element name="aff-id">
                    <xsl:value-of select="$aff/@id" />
                </xsl:element>
            </xsl:for-each>
            <xsl:if test="$contrib">
                <xsl:apply-templates select="node()"/>
                <xsl:element name="ctype">
                    <xsl:value-of select="$contrib/@contrib-type" />
                </xsl:element>
            </xsl:if>
        </xsl:element>
    </xsl:template>

</xsl:stylesheet>
"""))

In [7]:
print(aff_contrib_xslt(data))

<?xml version="1.0"?>
<pairs>
  <article-id pub-id-type="doi">10.1590/1980-5497201700010010</article-id>
  <affcontrib>
    <label>I</label>
    <institution content-type="original">Universidade de Fortaleza - Fortaleza (CE), Brasil.</institution>
    <institution content-type="normalized">Universidade de Fortaleza</institution>
    <institution content-type="orgname">Universidade de Fortaleza</institution>
    <addr-line>
      <named-content content-type="city">Fortaleza</named-content>
      <named-content content-type="state">CE</named-content>
    </addr-line>
    <country country="BR">Brazil</country>
    <aff-id>aff1</aff-id>
    <name>
      <surname>Cavalcante</surname>
      <given-names>Jessica Brito</given-names>
    </name>
    <xref ref-type="aff" rid="aff1">
      <sup>I</sup>
    </xref>
    <ctype>author</ctype>
  </affcontrib>
  <affcontrib>
    <label>I</label>
    <institution content-type="original">Universidade de Fortaleza - Fortaleza (CE), Brasil.</institution>


In [8]:
aff_contrib_json_xslt = etree.XSLT(etree.XML("""
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output method="text" encoding="utf-8" />

    <xsl:template match="pairs">
        <xsl:text>[</xsl:text>
            <xsl:for-each select="//affcontrib">
                <xsl:apply-templates select="." />
            </xsl:for-each>
        <xsl:text>]</xsl:text>
    </xsl:template>

    <xsl:template match="affcontrib">
        <xsl:text>{</xsl:text>
            <xsl:text>"article_publisher_id":</xsl:text>
                <xsl:text>"</xsl:text>
                <xsl:value-of select="normalize-space(string(//article-id[@pub-id-type = 'publisher-id']/text()))" />
                <xsl:text>"</xsl:text>
        <xsl:text>,</xsl:text>
            <xsl:text>"article_doi":</xsl:text>
                <xsl:text>"</xsl:text>
                <xsl:value-of select="normalize-space(string(//article-id[@pub-id-type = 'doi']/text()))" />
                <xsl:text>"</xsl:text>
        <xsl:text>,</xsl:text>
            <xsl:text>"institution_orgname":</xsl:text>
                <xsl:text>"</xsl:text>
                <xsl:value-of select="normalize-space(string(./institution[@content-type = 'orgname']/text()))" />
                <xsl:text>"</xsl:text>
        <xsl:text>,</xsl:text>
            <xsl:text>"institution_orgname_rewritten":</xsl:text>
                <xsl:text>"</xsl:text>
                <xsl:value-of select="normalize-space(string(./institution[@content-type = 'normalized']/text()))" />
                <xsl:text>"</xsl:text>
        <xsl:text>,</xsl:text>
            <xsl:text>"institution_orgdiv1":</xsl:text>
                <xsl:text>"</xsl:text>
                <xsl:value-of select="normalize-space(string(./institution[@content-type = 'orgdiv1']/text()))" />
                <xsl:text>"</xsl:text>
        <xsl:text>,</xsl:text>
            <xsl:text>"institution_orgdiv2":</xsl:text>
                <xsl:text>"</xsl:text>
                <xsl:value-of select="normalize-space(string(./institution[@content-type = 'orgdiv2']/text()))" />
                <xsl:text>"</xsl:text>
        <xsl:text>,</xsl:text>
            <xsl:text>"institution_original":</xsl:text>
                <xsl:text>"</xsl:text>
                <xsl:value-of select="normalize-space(string(./institution[@content-type = 'original']/text()))" />
                <xsl:text>"</xsl:text>
        <xsl:text>,</xsl:text>
            <xsl:text>"label":</xsl:text>
                <xsl:text>"</xsl:text>
                <xsl:value-of select="normalize-space(string(./label/text()))" />
                <xsl:text>"</xsl:text>
        <xsl:text>,</xsl:text>
            <xsl:text>"addr_country_code":</xsl:text>
                <xsl:text>"</xsl:text>
                <xsl:value-of select="normalize-space(string(./country/@country))" />
                <xsl:text>"</xsl:text>
        <xsl:text>,</xsl:text>
            <xsl:text>"addr_country":</xsl:text>
                <xsl:text>"</xsl:text>
                <xsl:value-of select="normalize-space(string(./country/text()))" />
                <xsl:text>"</xsl:text>
        <xsl:text>,</xsl:text>
            <xsl:text>"addr_state":</xsl:text>
                <xsl:text>"</xsl:text>
                <xsl:value-of select="normalize-space(string(./addr-line/named-content[@content-type = 'state']/text()|./addr-line/state))" />
                <xsl:text>"</xsl:text>
        <xsl:text>,</xsl:text>
            <xsl:text>"addr_city":</xsl:text>
                <xsl:text>"</xsl:text>
                <xsl:value-of select="normalize-space(string(./addr-line/named-content[@content-type = 'city']/text()|./addr-line/city))" />
                <xsl:text>"</xsl:text>
        <xsl:text>,</xsl:text>
            <xsl:text>"addr_postal_code":</xsl:text>
                <xsl:text>"</xsl:text>
                <xsl:value-of select="normalize-space(string(./addr-line/postal-code))" />
                <xsl:text>"</xsl:text>
        <xsl:text>,</xsl:text>
            <xsl:text>"email":</xsl:text>
                <xsl:text>"</xsl:text>
                <xsl:value-of select="normalize-space(string(./email/text()))" />
                <xsl:text>"</xsl:text>
        <xsl:text>,</xsl:text>
            <xsl:text>"phone":</xsl:text>
                <xsl:text>"</xsl:text>
                <xsl:value-of select="normalize-space(string(./phone/text()))" />
                <xsl:text>"</xsl:text>
        <xsl:text>,</xsl:text>
            <xsl:text>"contrib_surname":</xsl:text>
                <xsl:text>"</xsl:text>
                <xsl:value-of select="normalize-space(string(./name/surname/text()))" />
                <xsl:text>"</xsl:text>
        <xsl:text>,</xsl:text>
            <xsl:text>"contrib_given_names":</xsl:text>
                <xsl:text>"</xsl:text>
                <xsl:value-of select="normalize-space(string(./name/given-names/text()))" />
                <xsl:text>"</xsl:text>
        <xsl:text>,</xsl:text>
            <xsl:text>"contrib_prefix":</xsl:text>
                <xsl:text>"</xsl:text>
                <xsl:value-of select="normalize-space(string(./name/prefix/text()))" />
                <xsl:text>"</xsl:text>
        <xsl:text>,</xsl:text>
            <xsl:text>"contrib_suffix":</xsl:text>
                <xsl:text>"</xsl:text>
                <xsl:value-of select="normalize-space(string(./name/suffix/text()))" />
                <xsl:text>"</xsl:text>
        <xsl:text>,</xsl:text>
            <xsl:text>"degrees":</xsl:text>
                <xsl:text>"</xsl:text>
                <xsl:value-of select="normalize-space(string(./degrees/text()))" />
                <xsl:text>"</xsl:text>
        <xsl:text>,</xsl:text>
            <xsl:text>"contrib_type":</xsl:text>
                <xsl:text>"</xsl:text>
                <xsl:value-of select="normalize-space(string(./ctype/text()))" />
                <xsl:text>"</xsl:text>
        <xsl:text>}</xsl:text>
        <xsl:if test="count(following-sibling::*) &gt; 0">
            <xsl:text>,</xsl:text>
        </xsl:if>
    </xsl:template>

</xsl:stylesheet>
"""))

In [9]:
(str(aff_contrib_json_xslt(aff_contrib_xslt(data))))

'[{"article_publisher_id":"","article_doi":"10.1590/1980-5497201700010010","institution_orgname":"Universidade de Fortaleza","institution_orgname_rewritten":"Universidade de Fortaleza","institution_orgdiv1":"","institution_orgdiv2":"","institution_original":"Universidade de Fortaleza - Fortaleza (CE), Brasil.","label":"I","addr_country_code":"BR","addr_country":"Brazil","addr_state":"CE","addr_city":"Fortaleza","addr_postal_code":"","email":"","phone":"","contrib_surname":"Cavalcante","contrib_given_names":"Jessica Brito","contrib_prefix":"","contrib_suffix":"","degrees":"","contrib_type":"author"},{"article_publisher_id":"","article_doi":"10.1590/1980-5497201700010010","institution_orgname":"Universidade de Fortaleza","institution_orgname_rewritten":"Universidade de Fortaleza","institution_orgdiv1":"","institution_orgdiv2":"","institution_original":"Universidade de Fortaleza - Fortaleza (CE), Brasil.","label":"I","addr_country_code":"BR","addr_country":"Brazil","addr_state":"CE","addr

In [10]:
pd.read_json(StringIO(str(aff_contrib_json_xslt(aff_contrib_xslt(data))))).T

Unnamed: 0,0,1,2,3,4
addr_city,Fortaleza,Fortaleza,Fortaleza,Fortaleza,Fortaleza
addr_country,Brazil,Brazil,Brazil,Brazil,Brazil
addr_country_code,BR,BR,BR,BR,BR
addr_postal_code,,,,,
addr_state,CE,CE,CE,CE,CE
article_doi,10.1590/1980-5497201700010010,10.1590/1980-5497201700010010,10.1590/1980-5497201700010010,10.1590/1980-5497201700010010,10.1590/1980-5497201700010010
article_publisher_id,,,,,
contrib_given_names,Jessica Brito,Tyciane Maria Vieira,Caroline da Costa,Carolinne Reinaldo,Ilana Nogueira
contrib_prefix,,,,,
contrib_suffix,,,,,


## Querying all `<aff>` and `<contrib>` \[sub\]-fields and their attributes in a single XML file

In [11]:
from functools import reduce
from itertools import chain

In [12]:
set(el.tag for el in data.xpath("//aff//*"))

{'addr-line', 'country', 'institution', 'label', 'named-content'}

In [13]:
reduce(set.union, (el.attrib.keys() for el in data.xpath("//aff//*")), set())

{'content-type', 'country'}

In [14]:
set("@".join([el.tag, attr]) if attr else el.tag
    for el in data.xpath("//aff//*|//contrib//*")
    for attr in chain([""], el.attrib))

{'addr-line',
 'country',
 'country@country',
 'given-names',
 'institution',
 'institution@content-type',
 'label',
 'name',
 'named-content',
 'named-content@content-type',
 'sup',
 'surname',
 'xref',
 'xref@ref-type',
 'xref@rid'}

## Querying all `<aff>` and `<contrib>` \[sub\]-fields and their attributes in every XML file

In [15]:
import pathlib, collections

In [16]:
def tag_attr_set(node):
    return set("@".join([el.tag, attr]) if attr else el.tag
               for el in node.xpath("//aff//*|//contrib//*")
               for attr in chain([""], el.attrib))

In [17]:
fnames = collections.defaultdict(set)
fnames_total = 0
for xml_path in pathlib.Path("selecao_xml_br").glob("**/*.xml"):
    fnames_total += 1
    try:
        with open(xml_path) as xml_file:
            xml_etree = etree.parse(xml_file)
    except:
        fnames["<errored>"].add(xml_path)
    else:
        for ta in tag_attr_set(xml_etree):
            fnames[ta].add(xml_path)

In [18]:
quantities = {k: len(v) for k, v in fnames.items()}
quantities["<total>"] = fnames_total
quantities

{'<errored>': 425,
 '<total>': 23815,
 'addr-line': 19929,
 'aff': 1,
 'aff@id': 1,
 'anonymous': 1,
 'author-comment': 6,
 'bio': 29,
 'bio@id': 15,
 'bold': 416,
 'city': 861,
 'collab': 52,
 'contrib': 2,
 'contrib-id': 258,
 'contrib-id@content-type': 1,
 'contrib-id@contrib-id-type': 257,
 'contrib@contrib-type': 1,
 'country': 21864,
 'country@content-type': 3,
 'country@country': 17097,
 'degrees': 70,
 'emai': 1,
 'email': 5381,
 'ext-link': 7,
 'ext-link@ext-link-type': 7,
 'ext-link@{http://www.w3.org/1999/xlink}href': 7,
 'given-names': 23112,
 'insitution': 2,
 'insitution@content-type': 2,
 'institution': 22299,
 'institution@content-type': 22299,
 'italic': 155,
 'label': 18894,
 'name': 23113,
 'name@name-style': 10,
 'named-content': 19085,
 'named-content@content-type': 19085,
 'on-behalf-of': 13,
 'p': 35,
 'phone': 4,
 'postal-code': 30,
 'preffix': 1,
 'prefix': 21,
 'role': 132,
 'sc': 2,
 'siffix': 2,
 'state': 680,
 'state@content-type': 1,
 'sub': 2,
 'suffix': 

In [19]:
fnames["state"].intersection(fnames["named-content"])

{PosixPath('selecao_xml_br/bbr/v15n1/1808-2386-bbr-15-01-0088.xml'),
 PosixPath('selecao_xml_br/rbent/v62n1/0085-5626-rbent-62-01-0029.xml')}

In [20]:
fnames["prefix"].intersection(fnames["suffix"])

{PosixPath('selecao_xml_br/rbgo/v36n11/0100-7203-rbgo-36-11-0529.xml')}

To see all the data, run either:

```
from IPython.lib.pretty import pprint
pprint(fnames, max_seq_length=float(\"inf\"))
```

or something like:

```
import os
for path in sorted(reduce(set.union, fnames.values())):
    print(os.path.join(*path.parts[1:]), end=": ")
    print(", ".join(fname for fname in sorted(fnames) if path in fnames[fname]))
```

## Exporting the JSON results as a CSV

Trying with 25 files:

In [21]:
count = 25
header = None
with open("affs_table_25.csv", "w") as output_file:
    cw = csv.writer(output_file)
    for xml_path in pathlib.Path("selecao_xml_br").glob("**/*.xml"):
        with open(xml_path) as xml_file:
            xml_etree = etree.parse(xml_file)
        rows = json.loads(str(aff_contrib_json_xslt(aff_contrib_xslt(xml_etree))))
        if header is None:
            header = sorted(rows[0].keys())
            cw.writerow(header)
        for row in rows:
            cw.writerow(row[col_name] for col_name in header)
        output_file.flush()
        count -= 1
        if count <= 0:
            break

In [22]:
pd.read_csv("affs_table_25.csv", keep_default_na=False)

Unnamed: 0,addr_city,addr_country,addr_country_code,addr_postal_code,addr_state,article_doi,article_publisher_id,contrib_given_names,contrib_prefix,contrib_suffix,...,contrib_type,degrees,email,institution_orgdiv1,institution_orgdiv2,institution_orgname,institution_orgname_rewritten,institution_original,label,phone
0,,Brazil,BR,,,10.1590/0103-335220162102,,Lígia Mori,,,...,author,,ligiamorimadeira@gmail.com,,,Universidade Federal do Rio Grande do Sul (UFRGS),Universidade Federal do Rio Grande do Sul,É doutora em Sociologia pela Universidade Fede...,*,
1,,Brazil,BR,,,10.1590/0103-335220162106,,Marcelo Kunrath,,,...,author,,mksilva@ufrgs.br,,,Universidade Federal do Rio Grande do Sul (UFRGS),Universidade Federal do Rio Grande do Sul,É professor do Departamento de Sociologia da U...,*,
2,,Brazil,BR,,,10.1590/0103-335220162106,,Bianca de Oliveira,,,...,author,,bianca.or@gmail.com,,,Instituto Federal Sul-rio-grandense (IFSUL),Instituto Federal Sul-rio-grandense,É professora do Instituto Federal Sul-rio-gran...,**,
3,,Brazil,BR,,,10.1590/0103-335220162101,,Rodrigo Nuñez,,,...,author,,mrviegas@gmail.com,,,Universidade Federal do Rio de Janeiro (UFRJ),Universidade Federal do Rio de Janeiro,"É sociólogo, mestre em Sociologia e Antropolog...",*,
4,,Brazil,BR,,,10.1590/0103-335220162108,,Rafael Cardoso,,,...,author,,cardososampaio@gmail.com,,,Universidade Federal do Paraná (UFPR),Universidade Federal do Paraná,É professor de Ciência Política na Universidad...,*,
5,,Brazil,BR,,,10.1590/0103-335220162108,,Rachel Callai,,,...,author,,rachelbragatto@gmail.com,,,Universidade Federal do Paraná (UFPR),Universidade Federal do Paraná,É doutora em Sociologia pela Universidade Fede...,**,
6,,Brazil,BR,,,10.1590/0103-335220162108,,Maria Alejandra,,,...,author,,alejandranicolas@gmail.com,,,Universidade Federal da Integração Latino-Amer...,Universidade Federal da Integração Latino-Amer...,É professora de Administração Pública e Políti...,***,
7,,Brazil,BR,,,10.1590/0103-335220162107,,Nildo,,,...,author,,nildoavelino@gmail.com,,,Pontifícia Universidade Católica de São Paulo ...,Pontifícia Universidade Católica de São Paulo,É doutor em Ciência Política pela Pontifícia U...,*,
8,,Portugal,PT,,,10.1590/0103-335220162104,,Teresa,,,...,author,,tcierco@letras.up.pt,,,Universidade do Porto,Universidade do Porto,doutora em Ciência Política e Relações Interna...,*,
9,,Portugal,PT,,,10.1590/0103-335220162104,,António,,,...,author,,agomesbelo@gmail.com,,,Universidade da Beira Interior,Universidade da Beira Interior,É oficial de polícia com a categoria de intend...,**,


Full:

In [23]:
header = None
with open("affs_table.csv", "w") as output_file:
    cw = csv.writer(output_file)
    for xml_path in pathlib.Path("selecao_xml_br").glob("**/*.xml"):
        try:
            with open(xml_path) as xml_file:
                xml_etree = etree.parse(xml_file)
        except:
            continue
        rows = json.loads(str(aff_contrib_json_xslt(aff_contrib_xslt(xml_etree))))
        if header is None:
            header = sorted(rows[0].keys())
            cw.writerow(header)
        for row in rows:
            cw.writerow(row.values())
        output_file.flush()

JSONDecodeError: Expecting ',' delimiter: line 1 column 132 (char 131)

It broke because of a `"` symbol in some field. Actually, that's the `institution_orgname` that has a `Universidade Estadual Paulista "Júlio de Mesquita Filho"` value.