In [8]:
from pathlib import Path
import pandas as pd
from grobid_client.grobid_client import GrobidClient
import os
import shutil

In [1]:
# Input PDF to process; a relative path, so it is resolved against the
# notebook's current working directory.
pdf_pth = 'annual_reviews_chapter-2022-08-16.pdf'

### PDF to XML

In [3]:
# The GROBID client reads its settings from a config file kept under the
# current user's home directory.
grobid_config_pth = f'{Path.home()}/Library/grobid_client_python/config.json'
client = GrobidClient(config_path=grobid_config_pth)

GROBID server is up and running


In [6]:
# Root folder for the intermediate files (pdf/, xml/, figure/, json/)
# created by the cells below.
temp_dir = './temp'

In [16]:
# Working sub-folders: staged input PDFs and the XML output directory.
temp_pdf_dir = os.path.join(temp_dir, 'pdf')
temp_xml_dir = os.path.join(temp_dir, 'xml')

# `exist_ok=True` makes the cell safe to re-run and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(temp_dir, exist_ok=True)
os.makedirs(temp_pdf_dir, exist_ok=True)
os.makedirs(temp_xml_dir, exist_ok=True)

In [17]:
# Stage the PDF in the folder that is handed to the extraction tools.
# The call's return value (path of the copy) is shown as the cell output.
shutil.copy(src=pdf_pth, dst=temp_pdf_dir)

'./temp/pdf/annual_reviews_chapter-2022-08-16.pdf'

In [18]:
# Run GROBID's 'processFulltextDocument' service on the staged PDF folder
# and direct its output to the XML folder.
grobid_options = {
    'tei_coordinates': True,
    'force': True,
    'verbose': True,
    'output': temp_xml_dir,
}
client.process('processFulltextDocument', temp_pdf_dir, **grobid_options)

annual_reviews_chapter-2022-08-16.pdf
1 files to process in current batch


In [15]:
# Derive the output names from the PDF path. basename/splitext strip the
# directory and the extension properly instead of assuming a '/' separator
# and a four-character suffix such as '.pdf'.
file_name = os.path.basename(pdf_pth)
name = os.path.splitext(file_name)[0]
# File name looked up in the XML output folder by a later cell.
xml_name = name + '.tei.xml'

In [30]:
# Expected location of the TEI XML for this PDF; the cell output shows
# whether that path exists.
xml_pth = os.path.join(temp_xml_dir, xml_name)
os.path.exists(xml_pth)

True

### PDF to figures

In [22]:
from subprocess import call

In [26]:
# Directory of the pdffigures2 project under the user's home; the sbt
# command in a later cell is run from here.
pdffigures2_home = os.path.abspath(str(Path.home()) + '/Library/pdffigures2')

# Output locations, relative to `pdffigures2_home`.
fig_dir_profix = 'figure'
img_dir_profix = 'figure/image'
json_dir_profix = 'figure/json'

tmp_fig_dir = os.path.join(pdffigures2_home, fig_dir_profix)
tmp_img_dir = os.path.join(pdffigures2_home, img_dir_profix)
tmp_json_dir = os.path.join(pdffigures2_home, json_dir_profix)

# `exist_ok=True` keeps the cell re-runnable and avoids the race between a
# separate os.path.exists() check and the directory creation.
os.makedirs(tmp_fig_dir, exist_ok=True)
os.makedirs(tmp_img_dir, exist_ok=True)
os.makedirs(tmp_json_dir, exist_ok=True)

In [27]:
# The whole `runMain ...` command is passed to sbt as ONE argument.
# Output paths are relative because the process runs with
# cwd=pdffigures2_home.
run_main_cmd = (
    'runMain org.allenai.pdffigures2.FigureExtractorBatchCli -e -q '
    f'{os.path.abspath(temp_pdf_dir)}/'
    f' -m ./{img_dir_profix}/'
    f' -d ./{json_dir_profix}/'
    f' -s ./{fig_dir_profix}/stat.json'
)
args = ['sbt', '-J-Xmx4G', run_main_cmd]
exit_code = call(args, cwd=pdffigures2_home)

# Fail loudly here rather than letting later cells operate on missing or
# partial figure output.
if exit_code != 0:
    raise RuntimeError(f'pdffigures2 (sbt) exited with status {exit_code}')

[info] Loading settings for project pdffigures2-build from plugins.sbt ...
[info] Loading project definition from /home/quanta/Library/pdffigures2/project
[info] Loading settings for project root from build.sbt ...
[info] Set current project to pdffigures2 (in build file:/home/quanta/Library/pdffigures2/)
[warn] Multiple main classes detected.  Run 'show discoveredMainClasses' to see the list
[info] running org.allenai.pdffigures2.FigureExtractorBatchCli -e -q /home/quanta/Datasets/sciduet-dataset/temp/pdf/ -m ./figure/image/ -d ./figure/json/ -s ./figure/stat.json
01:18:35.674 [run-main-0] INFO  o.a.p.FigureExtractorBatchCli$ - Processing file annual_reviews_chapter-2022-08-16.pdf (1 of 1)
01:18:37.409 [run-main-0] INFO  o.a.p.FigureExtractorBatchCli$ - Finished annual_reviews_chapter-2022-08-16.pdf in 1.733 seconds
01:18:37.410 [run-main-0] INFO  o.a.p.FigureExtractorBatchCli$ - Finished processing 1 files
01:18:37.410 [run-main-0] INFO  o.a.p.FigureExtractorBatchCli$ - Took 1.736 se

In [28]:
# Bring the pdffigures2 output into the working directory.
# shutil.move raises if the destination already exists, so remove the figure
# folder left by a previous run first; this keeps the cell re-runnable.
moved_fig_dir = os.path.join(temp_dir, os.path.basename(tmp_fig_dir))
if os.path.isdir(moved_fig_dir):
    shutil.rmtree(moved_fig_dir)
shutil.move(tmp_fig_dir, temp_dir)

'./temp/figure'

In [31]:
# Expected location of the figure JSON for this PDF inside the moved
# figure folder; the cell output shows whether that path exists.
figure_json_pth = os.path.join(temp_dir, 'figure/json', name + '.json')
os.path.exists(figure_json_pth)

True

### Merge to JSON

In [32]:
from core.merge2json import single_entry
# Combine the TEI XML and the figure JSON into the fields of one entry.
# The first returned element is not used below. It is bound to a named
# throwaway rather than `_`, because assigning to `_` in IPython shadows the
# last-output variable and stops it from being updated.
_unused, title, abstract, text, headers, figures = single_entry(
    '',
    xml_pth=xml_pth,
    fig_json_pth=figure_json_pth,
)

In [36]:
# Folder for the merged JSON. `exist_ok=True` makes the cell re-runnable
# without a separate, race-prone os.path.exists() check.
temp_json_dir = os.path.join(temp_dir, 'json')
os.makedirs(temp_json_dir, exist_ok=True)

In [37]:
# Collect the extracted fields into the record that gets serialized below.
json_data = dict(
    title=title,
    abstract=abstract,
    text=text,
    headers=headers,
    figures=figures,
)

In [38]:
import json
# Write the merged record to "<name>.json" in the JSON folder,
# pretty-printed with a four-space indent.
out_json_pth = os.path.join(temp_json_dir, name + '.json')
with open(out_json_pth, 'w') as out_file:
    json.dump(json_data, out_file, indent=4)