fix: PDF Ingestion bug when Grobid is unable to parse the reference PDF (#103)

fixes #79
gjreda committed Jun 7, 2023
1 parent fccdb87 commit f909337
Showing 6 changed files with 76 additions and 27 deletions.
2 changes: 1 addition & 1 deletion package.json
@@ -17,7 +17,7 @@
"lint:fix": "eslint --fix --ext .js,.ts,.tsx ./src",
"ts:check": "tsc --noEmit",
"preview": "vite preview",
"python": "bash scripts/remove_binary.sh && poetry run pyinstaller --noconfirm --hidden-import torch --collect-data torch --recursive-copy-metadata torch --hidden-import pyarrow --collect-data pyarrow --recursive-copy-metadata pyarrow --collect-binaries pyarrow --collect-submodules pyarrow --recursive-copy-metadata tqdm --recursive-copy-metadata sentence-transformers --distpath src-tauri/bin/python python/main.py && bash scripts/move_binary.sh",
"python": "bash scripts/remove_binary.sh && poetry run pyinstaller --noconfirm --distpath src-tauri/bin/python python/main.py && bash scripts/move_binary.sh",
"tauri": "tauri",
"tauri:dev": "tauri dev",
"tauri:dev:debug": "DEV_TOOLS=1 tauri dev"
49 changes: 41 additions & 8 deletions python/sidecar/ingest.py
@@ -84,12 +84,18 @@ def call_grobid_server(self) -> None:
# thus, HiddenPrint (https://stackoverflow.com/a/45669280)
client = GrobidClient(GROBID_SERVER_URL, timeout=GROBID_TIMEOUT)

-        client.process(
-            "processFulltextDocument",
-            input_path=self.input_dir,
-            output=self.grobid_output_dir,
-            force=True
-        )
+        # If an error occurs during processing, the Grobid Server will
+        # print out error messages to stdout, rather than using HTTP status codes
+        # or raising an exception. So this line also needs to be wrapped
+        # in HiddenPrints.
+        # Grobid will still create the output file, even if an error occurs,
+        # however it will be a txt file with a name like {filename}_{errorcode}.txt
+        client.process(
+            "processFulltextDocument",
+            input_path=self.input_dir,
+            output=self.grobid_output_dir,
+            force=True
+        )
logger.info("Finished calling Grobid server")

def convert_grobid_xml_to_json(self) -> None:
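For readers unfamiliar with the HiddenPrints helper referenced in the comments above: it is the stdout-suppressing context manager pattern from the linked StackOverflow answer. A minimal sketch of that pattern (the sidecar's actual implementation may differ):

    import os
    import sys

    class HiddenPrints:
        # Temporarily redirect stdout to os.devnull so that messages Grobid
        # prints do not pollute the JSON the sidecar writes to stdout.
        def __enter__(self):
            self._original_stdout = sys.stdout
            sys.stdout = open(os.devnull, "w")
            return self

        def __exit__(self, exc_type, exc_value, traceback):
            sys.stdout.close()
            sys.stdout = self._original_stdout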
@@ -107,7 +113,7 @@ def convert_grobid_xml_to_json(self) -> None:
with open(json_filepath, "w") as fout:
doc = grobid_tei_xml.parse_document_xml(xml)
json.dump(doc.to_dict(), fout)

def _parse_header(self, document: dict) -> dict:
"""
Parses the header of a document and returns a dictionary of the header fields
@@ -135,6 +141,31 @@ def _create_author(self, author_dict: dict) -> Author:
surname=author_dict.get("surname"),
email=author_dict.get("email"),
)

+    def _create_references_for_grobid_failures(self) -> List[Reference]:
+        """
+        Creates Reference objects for PDFs that Grobid was unable to parse.
+        We want the output of PDF Ingestion to contain _all_ PDF References, even
+        if Grobid was unable to parse them. This allows the frontend to inform the
+        user which PDFs we were unable to parse.
+        In cases where Grobid was unable to parse the PDF, a TXT file is created
+        in the same output directory where the XML file would have been created.
+        The TXT file is named as {pdf_filename}_{error_code}.txt.
+        """
+        txt_files = list(self.grobid_output_dir.glob("*.txt"))
+        logger.info(f"Found {len(txt_files)} txt files from Grobid parsing errors")
+
+        references = []
+        for file in txt_files:
+            source_pdf = f"{file.stem.rpartition('_')[0]}.pdf"
+            references.append(
+                Reference(
+                    source_filename=source_pdf,
+                    filename_md5=get_filename_md5(source_pdf),
+                )
+            )
+        return references
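As a quick illustration of the {pdf_filename}_{error_code}.txt convention handled above, this is how the source PDF name is recovered from a failure file (hypothetical path, mirroring the rpartition logic in the new method):

    from pathlib import Path

    error_file = Path(".grobid/grobid-fails_500.txt")
    source_pdf = f"{error_file.stem.rpartition('_')[0]}.pdf"
    assert source_pdf == "grobid-fails.pdf"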

def create_references(self) -> List[Reference]:
"""
@@ -162,7 +193,9 @@ def create_references(self) -> List[Reference]:
chunks=chunk_text(doc.get("body"))
)
references.append(ref)
-        return references
+
+        failures = self._create_references_for_grobid_failures()
+        return references + failures

def create_response_from_references(self, references: List[Reference]) -> dict:
"""
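Putting the pieces together, the response that create_response_from_references produces (and the sidecar prints to stdout) now covers both parsed and failed PDFs. An illustrative sketch of its shape, inferred from the fields asserted in test_ingest.py below (values and any omitted fields are assumptions):

    response = {
        "project_name": "project-directory-name",
        "references": [
            {
                "source_filename": "grobid-fails.pdf",  # Grobid failure: only identifying fields set
                "title": None,
                "authors": [],
            },
            {
                "source_filename": "test.pdf",
                "title": "A Few Useful Things to Know about Machine Learning",
                "authors": [{"full_name": "Pedro Domingos"}],
            },
        ],
    }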
8 changes: 4 additions & 4 deletions python/sidecar/typing.py
@@ -1,14 +1,14 @@
from dataclasses import dataclass, field
-from typing import List, Dict, Any
+from typing import Any, Dict, List, Optional


@dataclass
class Reference:
source_filename: str
filename_md5: str
-    title: str
-    abstract: str
-    contents: str
+    title: Optional[str] = None
+    abstract: Optional[str] = None
+    contents: Optional[str] = None
authors: List["Author"] = field(default_factory=list)
chunks: List["Chunk"] = field(default_factory=list)
metadata: Dict[str, Any] = field(default_factory=dict)
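With title, abstract, and contents now optional, a Reference for an unparseable PDF can be built from its identifying fields alone. A small sketch (placeholder md5 value; the real value comes from get_filename_md5):

    failed = Reference(
        source_filename="grobid-fails.pdf",
        filename_md5="0" * 32,  # placeholder for illustration only
    )
    assert failed.title is None
    assert failed.authors == []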
Binary file added python/tests/fixtures/pdf/grobid-fails.pdf
1 change: 1 addition & 0 deletions python/tests/fixtures/xml/grobid-fails_500.txt
@@ -0,0 +1 @@
+[GENERAL] An exception occurred while running Grobid.
43 changes: 29 additions & 14 deletions python/tests/test_ingest.py
@@ -26,19 +26,20 @@ def test_main(monkeypatch, tmp_path, capsys):
grobid_output_dir = tmp_path.joinpath(".grobid")
json_storage_dir = tmp_path.joinpath(".storage")

-    # copy test pdf to temp dir
-    test_pdf = FIXTURES_DIR.joinpath("pdf", "test.pdf")
-    write_path = tmp_path.joinpath("uploads", "test.pdf")
-    _copy_fixture_to_temp_dir(test_pdf, write_path)
+    # copy test PDFs to temp dir
+    for pdf in FIXTURES_DIR.joinpath("pdf").glob("*.pdf"):
+        write_path = tmp_path.joinpath("uploads", pdf.name)
+        _copy_fixture_to_temp_dir(pdf, write_path)

# grobid server takes an input directory of PDFs
-    # and writes {pdfname.tei.xml} files to an output directory
+    # if grobid successfully parses the file, it creates a {pdfname}.tei.xml file
+    # if grobid fails to parse the file, it creates a {pdfname}_{errorcode}.txt file
# mock this by copying the test xml to the output directory
def mock_grobid_client_process(*args, **kwargs):
-        test_xml = FIXTURES_DIR.joinpath("xml", "test.tei.xml")
-        write_path = grobid_output_dir.joinpath("test.tei.xml")
-        _copy_fixture_to_temp_dir(test_xml, write_path)

+        for file_ in FIXTURES_DIR.joinpath("xml").glob("*"):
+            write_path = grobid_output_dir.joinpath(file_.name)
+            _copy_fixture_to_temp_dir(file_, write_path)
monkeypatch.setattr(ingest.GrobidClient, "process", mock_grobid_client_process)

pdf_directory = tmp_path.joinpath("uploads")
@@ -47,15 +48,29 @@ def mock_grobid_client_process(*args, **kwargs):
# check that the expected output was printed to stdout
captured = capsys.readouterr()
output = json.loads(captured.out)
-    assert len(output['references']) == 1
-    assert output['references'][0]['title'] == "A Few Useful Things to Know about Machine Learning"
-    assert len(output['references'][0]['authors']) == 1
-    assert output['references'][0]['authors'][0]['full_name'] == "Pedro Domingos"

+    # project name is the name of the parent directory of the input directory
+    assert output['project_name'] == tmp_path.name
+
+    # check that the expected number of references were parsed
+    assert len(output['references']) == 2
+
+    # sort references by source_filename so list order is consistent
+    references = sorted(output['references'], key=lambda x: x['source_filename'])
+
+    # check that grobid-fails.pdf is contained in the reference output
+    assert references[0]['source_filename'] == "grobid-fails.pdf"
+
+    # check that test.pdf was parsed correctly
+    assert references[1]['title'] == "A Few Useful Things to Know about Machine Learning"
+    assert len(references[1]['authors']) == 1
+    assert references[1]['authors'][0]['full_name'] == "Pedro Domingos"
+
# check that the expected directories and files were created
# grobid output
assert grobid_output_dir.exists()
assert grobid_output_dir.joinpath("test.tei.xml").exists()
-    # json creation and storage
+    assert grobid_output_dir.joinpath("grobid-fails_500.txt").exists()
+    # json creation and storage - successfully parsed references are stored as json
assert json_storage_dir.exists()
assert json_storage_dir.joinpath("test.json").exists()
