From 4e599ed0836c743f592eaa1f95a475caa2418413 Mon Sep 17 00:00:00 2001 From: Valentin Date: Mon, 7 Apr 2025 16:52:51 +0200 Subject: [PATCH] pass load_kwargs to '_process_doc_page' and 'to_markdown' to enable write_images ... --- pymupdf4llm/pymupdf4llm/llama/pdf_markdown_reader.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pymupdf4llm/pymupdf4llm/llama/pdf_markdown_reader.py b/pymupdf4llm/pymupdf4llm/llama/pdf_markdown_reader.py index 48c40894..b178d996 100644 --- a/pymupdf4llm/pymupdf4llm/llama/pdf_markdown_reader.py +++ b/pymupdf4llm/pymupdf4llm/llama/pdf_markdown_reader.py @@ -61,7 +61,7 @@ def load_data( for page in doc: docs.append( self._process_doc_page( - doc, extra_info, file_path, page.number, hdr_info + doc, extra_info, file_path, page.number, hdr_info, **load_kwargs ) ) return docs @@ -76,6 +76,7 @@ def _process_doc_page( file_path: str, page_number: int, hdr_info: IdentifyHeaders, + **load_kwargs: Any, ): """Processes a single page of a PDF document.""" extra_info = self._process_doc_meta( @@ -86,7 +87,9 @@ def _process_doc_page( extra_info = self.meta_filter(extra_info) text = to_markdown( - doc, pages=[page_number], hdr_info=hdr_info, write_images=False + doc, pages=[page_number], + hdr_info=hdr_info, + **load_kwargs, ) return LlamaIndexDocument(text=text, extra_info=extra_info)