fix: PDF Ingestion bug when Grobid is unable to parse the reference PDF (#103)

fixes #79
gjreda committed Jun 7, 2023
1 parent fccdb87 commit f909337
Showing 6 changed files with 76 additions and 27 deletions.
2 changes: 1 addition & 1 deletion package.json
@@ -17,7 +17,7 @@
"lint:fix": "eslint --fix --ext .js,.ts,.tsx ./src",
"ts:check": "tsc --noEmit",
"preview": "vite preview",
"python": "bash scripts/remove_binary.sh && poetry run pyinstaller --noconfirm --hidden-import torch --collect-data torch --recursive-copy-metadata torch --hidden-import pyarrow --collect-data pyarrow --recursive-copy-metadata pyarrow --collect-binaries pyarrow --collect-submodules pyarrow --recursive-copy-metadata tqdm --recursive-copy-metadata sentence-transformers --distpath src-tauri/bin/python python/main.py && bash scripts/move_binary.sh",
"python": "bash scripts/remove_binary.sh && poetry run pyinstaller --noconfirm --distpath src-tauri/bin/python python/main.py && bash scripts/move_binary.sh",
"tauri": "tauri",
"tauri:dev": "tauri dev",
"tauri:dev:debug": "DEV_TOOLS=1 tauri dev"
49 changes: 41 additions & 8 deletions python/sidecar/ingest.py
@@ -84,12 +84,18 @@ def call_grobid_server(self) -> None:
# thus, HiddenPrint (https://stackoverflow.com/a/45669280)
client = GrobidClient(GROBID_SERVER_URL, timeout=GROBID_TIMEOUT)

-        client.process(
-            "processFulltextDocument",
-            input_path=self.input_dir,
-            output=self.grobid_output_dir,
-            force=True
-        )
+        # If an error occurs during processing, the Grobid Server will
+        # print out error messages to stdout, rather than using HTTP status codes
+        # or raising an exception. So this line also needs to be wrapped
+        # in HiddenPrints.
+        # Grobid will still create the output file, even if an error occurs,
+        # however it will be a txt file with a name like {filename}_{errorcode}.txt
+        client.process(
+            "processFulltextDocument",
+            input_path=self.input_dir,
+            output=self.grobid_output_dir,
+            force=True
+        )
logger.info("Finished calling Grobid server")

def convert_grobid_xml_to_json(self) -> None:
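For readers unfamiliar with the HiddenPrints helper referenced in the comments above: it is the stdout-suppressing context manager pattern from the linked StackOverflow answer. A minimal sketch of that pattern (the sidecar's actual implementation may differ):

    import os
    import sys

    class HiddenPrints:
        # Temporarily redirect stdout to os.devnull so that messages Grobid
        # prints do not pollute the JSON the sidecar writes to stdout.
        def __enter__(self):
            self._original_stdout = sys.stdout
            sys.stdout = open(os.devnull, "w")
            return self

        def __exit__(self, exc_type, exc_value, traceback):
            sys.stdout.close()
            sys.stdout = self._original_stdout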
@@ -107,7 +113,7 @@ def convert_grobid_xml_to_json(self) -> None:
with open(json_filepath, "w") as fout:
doc = grobid_tei_xml.parse_document_xml(xml)
json.dump(doc.to_dict(), fout)

def _parse_header(self, document: dict) -> dict:
"""
Parses the header of a document and returns a dictionary of the header fields
@@ -135,6 +141,31 @@ def _create_author(self, author_dict: dict) -> Author:
surname=author_dict.get("surname"),
email=author_dict.get("email"),
)

+    def _create_references_for_grobid_failures(self) -> List[Reference]:
+        """
+        Creates Reference objects for PDFs that Grobid was unable to parse.
+        We want the output of PDF Ingestion to contain _all_ PDF References, even
+        if Grobid was unable to parse them. This allows the frontend to inform the
+        user which PDFs we were unable to parse.
+        In cases where Grobid was unable to parse the PDF, a TXT file is created
+        in the same output directory where the XML file would have been created.
+        The TXT file is named as {pdf_filename}_{error_code}.txt.
+        """
+        txt_files = list(self.grobid_output_dir.glob("*.txt"))
+        logger.info(f"Found {len(txt_files)} txt files from Grobid parsing errors")
+
+        references = []
+        for file in txt_files:
+            source_pdf = f"{file.stem.rpartition('_')[0]}.pdf"
+            references.append(
+                Reference(
+                    source_filename=source_pdf,
+                    filename_md5=get_filename_md5(source_pdf),
+                )
+            )
+        return references
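As a quick illustration of the {pdf_filename}_{error_code}.txt convention handled above, this is how the source PDF name is recovered from a failure file (hypothetical path, mirroring the rpartition logic in the new method):

    from pathlib import Path

    error_file = Path(".grobid/grobid-fails_500.txt")
    source_pdf = f"{error_file.stem.rpartition('_')[0]}.pdf"
    assert source_pdf == "grobid-fails.pdf"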

def create_references(self) -> List[Reference]:
"""
@@ -162,7 +193,9 @@ def create_references(self) -> List[Reference]:
chunks=chunk_text(doc.get("body"))
)
references.append(ref)
-        return references
+
+        failures = self._create_references_for_grobid_failures()
+        return references + failures

def create_response_from_references(self, references: List[Reference]) -> dict:
"""
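Putting the pieces together, the response that create_response_from_references produces (and the sidecar prints to stdout) now covers both parsed and failed PDFs. An illustrative sketch of its shape, inferred from the fields asserted in test_ingest.py below (values and any omitted fields are assumptions):

    response = {
        "project_name": "project-directory-name",
        "references": [
            {
                "source_filename": "grobid-fails.pdf",  # Grobid failure: only identifying fields set
                "title": None,
                "authors": [],
            },
            {
                "source_filename": "test.pdf",
                "title": "A Few Useful Things to Know about Machine Learning",
                "authors": [{"full_name": "Pedro Domingos"}],
            },
        ],
    }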
8 changes: 4 additions & 4 deletions python/sidecar/typing.py
@@ -1,14 +1,14 @@
from dataclasses import dataclass, field
-from typing import List, Dict, Any
+from typing import Any, Dict, List, Optional


@dataclass
class Reference:
source_filename: str
filename_md5: str
-    title: str
-    abstract: str
-    contents: str
+    title: Optional[str] = None
+    abstract: Optional[str] = None
+    contents: Optional[str] = None
authors: List["Author"] = field(default_factory=list)
chunks: List["Chunk"] = field(default_factory=list)
metadata: Dict[str, Any] = field(default_factory=dict)
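With title, abstract, and contents now optional, a Reference for an unparseable PDF can be built from its identifying fields alone. A small sketch (placeholder md5 value; the real value comes from get_filename_md5):

    failed = Reference(
        source_filename="grobid-fails.pdf",
        filename_md5="0" * 32,  # placeholder for illustration only
    )
    assert failed.title is None
    assert failed.authors == []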
Binary file added python/tests/fixtures/pdf/grobid-fails.pdf
1 change: 1 addition & 0 deletions python/tests/fixtures/xml/grobid-fails_500.txt
@@ -0,0 +1 @@
+[GENERAL] An exception occurred while running Grobid.
43 changes: 29 additions & 14 deletions python/tests/test_ingest.py
@@ -26,19 +26,20 @@ def test_main(monkeypatch, tmp_path, capsys):
grobid_output_dir = tmp_path.joinpath(".grobid")
json_storage_dir = tmp_path.joinpath(".storage")

-    # copy test pdf to temp dir
-    test_pdf = FIXTURES_DIR.joinpath("pdf", "test.pdf")
-    write_path = tmp_path.joinpath("uploads", "test.pdf")
-    _copy_fixture_to_temp_dir(test_pdf, write_path)
+    # copy test PDFs to temp dir
+    for pdf in FIXTURES_DIR.joinpath("pdf").glob("*.pdf"):
+        write_path = tmp_path.joinpath("uploads", pdf.name)
+        _copy_fixture_to_temp_dir(pdf, write_path)

# grobid server takes an input directory of PDFs
-    # and writes {pdfname.tei.xml} files to an output directory
+    # if grobid successfully parses the file, it creates a {pdfname}.tei.xml file
+    # if grobid fails to parse the file, it creates a {pdfname}_{errorcode}.txt file
# mock this by copying the test xml to the output directory
def mock_grobid_client_process(*args, **kwargs):
-        test_xml = FIXTURES_DIR.joinpath("xml", "test.tei.xml")
-        write_path = grobid_output_dir.joinpath("test.tei.xml")
-        _copy_fixture_to_temp_dir(test_xml, write_path)

+        for file_ in FIXTURES_DIR.joinpath("xml").glob("*"):
+            write_path = grobid_output_dir.joinpath(file_.name)
+            _copy_fixture_to_temp_dir(file_, write_path)
monkeypatch.setattr(ingest.GrobidClient, "process", mock_grobid_client_process)

pdf_directory = tmp_path.joinpath("uploads")
@@ -47,15 +48,29 @@ def mock_grobid_client_process(*args, **kwargs):
# check that the expected output was printed to stdout
captured = capsys.readouterr()
output = json.loads(captured.out)
-    assert len(output['references']) == 1
-    assert output['references'][0]['title'] == "A Few Useful Things to Know about Machine Learning"
-    assert len(output['references'][0]['authors']) == 1
-    assert output['references'][0]['authors'][0]['full_name'] == "Pedro Domingos"

+    # project name is the name of the parent directory of the input directory
+    assert output['project_name'] == tmp_path.name
+
+    # check that the expected number of references were parsed
+    assert len(output['references']) == 2
+
+    # sort references by source_filename so list order is consistent
+    references = sorted(output['references'], key=lambda x: x['source_filename'])
+
+    # check that grobid-fails.pdf is contained in the reference output
+    assert references[0]['source_filename'] == "grobid-fails.pdf"
+
+    # check that test.pdf was parsed correctly
+    assert references[1]['title'] == "A Few Useful Things to Know about Machine Learning"
+    assert len(references[1]['authors']) == 1
+    assert references[1]['authors'][0]['full_name'] == "Pedro Domingos"
+
# check that the expected directories and files were created
# grobid output
assert grobid_output_dir.exists()
assert grobid_output_dir.joinpath("test.tei.xml").exists()
-    # json creation and storage
+    assert grobid_output_dir.joinpath("grobid-fails_500.txt").exists()
+    # json creation and storage - successfully parsed references are stored as json
assert json_storage_dir.exists()
assert json_storage_dir.joinpath("test.json").exists()
