In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Augment basic MMDA document with Grobid Annotations

#### Generate a basic PDFPlumber-parsed MMDA doc


In [None]:

from mmda.parsers import PDFPlumberParser
from mmda.types import Document

# Fixture PDF that the parsing cells in this notebook operate on.
PDF_PATH = '../../tests/fixtures/grobid_augment_existing_document_parser/e5910c027af0ee9c1901c57f6579d903aedee7f4.pdf'

# Parse the PDF with PDFPlumber to get the starting Document.
pdf_plumber = PDFPlumberParser()
doc: Document = pdf_plumber.parse(input_pdf_path=PDF_PATH)

# Display the fields available on the freshly parsed document.
doc.fields

#### Pass the doc as well as the PDF to the Grobid parser

First, run a Grobid server (instructions from https://grobid.readthedocs.io/en/latest/Grobid-docker/):

```bash
docker pull lfoppiano/grobid:0.7.2
docker run -t --rm -p 8070:8070 lfoppiano/grobid:0.7.2
```

In [None]:

from mmda.parsers.grobid_augment_existing_document_parser import (
    GrobidAugmentExistingDocumentParser,
)

# Build the parser from the repository's Grobid config file.
# NOTE(review): check_server=True presumably verifies that the Grobid server
# is reachable at construction time -- confirm against the parser class.
parser = GrobidAugmentExistingDocumentParser(
    config_path='../../src/mmda/parsers/grobid.config',
    check_server=True,
)

In [None]:
# Run the Grobid parser on the same PDF, handing it the existing document, and
# rebind `doc` to whatever the parser returns.
# NOTE(review): the third argument "." looks like an output/working directory
# for the Grobid XML -- confirm against the parser's `parse` signature.
doc = parser.parse(PDF_PATH, doc, ".")

In [None]:
# Display the document's fields after the Grobid pass (compare with the list
# shown right after the PDFPlumber parse).
doc.fields

## XML Playground

### Parse XML to Markdown

In [None]:
import xml.etree.ElementTree as et
from collections import defaultdict

XML_PATH = '../../tests/fixtures/grobid_augment_existing_document_parser/e5910c027af0ee9c1901c57f6579d903aedee7f4.xml'

# Read the XML fixture inside a context manager so the file handle is closed
# deterministically (a bare open(...).read() leaves it open until the object
# happens to be garbage collected).
with open(XML_PATH, encoding='utf-8') as xml_file:
    xml = xml_file.read()

xml_root = et.fromstring(xml)

# Dump the text of every element in the tree to output.md, one element per line.
with open('output.md', 'w', encoding='utf-8') as f:
    for elem in xml_root.iter():
        # Elements whose text is None or empty are skipped.
        if elem.text:
            f.write(elem.text + '\n')

In [None]:
import xml.etree.ElementTree as et
from collections import defaultdict

XML_PATH = '../../tests/fixtures/grobid_augment_existing_document_parser/e5910c027af0ee9c1901c57f6579d903aedee7f4.xml'

# Read the XML fixture inside a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open(XML_PATH, encoding='utf-8') as xml_file:
    xml = xml_file.read()

xml_root = et.fromstring(xml)

# Have the parser process the XML root with its page-size caching helper, then
# display the resulting `page_sizes` attribute as the cell output.
parser._cache_page_sizes(xml_root)

parser.page_sizes


In [None]:
# Prefix-to-URI namespace map passed to the ElementTree find/findall queries.
NS = {"tei": "http://www.tei-c.org/ns/1.0"}

# Print the tag of each direct child of the XML root.
for top_level_elem in xml_root:
    print(top_level_elem.tag)

In [None]:

# Locate the <sourceDesc> element that holds the author name structures.
author_list_root = xml_root.find(".//tei:sourceDesc", NS)
print('author_list_root', author_list_root)

author_names = []
author_boxes = []
author_name_structs = author_list_root.findall(".//tei:persName", NS)

for pers_name in author_name_structs:
    # Convert the element's "coords" attribute into boxes with the parser helper.
    coords_string = pers_name.attrib["coords"]
    boxes = parser._xml_coords_to_boxes(coords_string)
    author_boxes.append(boxes)

    # Previously `author_names` was never populated, so the cell always
    # displayed an empty list. Collect the name text from the element and its
    # children, joining the non-empty text fragments with single spaces.
    name_parts = (text.strip() for text in pers_name.itertext())
    author_names.append(" ".join(part for part in name_parts if part))

author_names


In [None]:

# Gather every <ref> element found under the document <body>.
body_root = xml_root.find(".//tei:body", NS)
all_refs = body_root.findall(".//tei:ref", NS)

# Keep only the refs whose "type" attribute equals "bibr".
bib_refs = [ref for ref in all_refs if ref.attrib["type"] == "bibr"]

# Report the total and filtered counts, then the type of every ref.
print(len(all_refs))
print(len(bib_refs))
for ref in all_refs:
    print(ref.attrib["type"])

### Check out the Bibliography Entries

In [None]:
# Print each of the first three bibliography entries' id next to its box group's id.
for entry in doc.bib_entries[:3]:
    print(entry.id, entry.box_group.id)

In [None]:
# Print the spans and text of the first three bibliography entries.
for entry in doc.bib_entries[:3]:
    print('\n\n', entry.spans, '\n', entry.text)

### Check out the Author names

In [None]:
# Print the spans and text of every author annotation on the document.
for author in doc.authors:
    print('\n\n', author.spans, '\n', author.text)

### Check out the Citation Mentions

In [None]:
# Print the spans and text of every citation mention on the document.
for citation_mention in doc.citation_mentions:
    print('\n\n', citation_mention.spans, '\n', citation_mention.text)