In [1]:
from marker.models import create_model_dict
from typing import Any

# Reuse the models if this kernel already loaded them. Unconditionally resetting
# `model_dict` to None (as this cell used to) defeats the guard below and forces
# a full reload of every model each time the cell is re-run.
model_dict: dict[str, Any] | None = globals().get("model_dict")

if model_dict is None:
    print("Loading Models")
    model_dict = create_model_dict()
    # Show which model entries are available to the later cells.
    print(list(model_dict.keys()))
 

  from .autonotebook import tqdm as notebook_tqdm


Loading Models
Loaded layout model datalab-to/surya_layout on device cuda with dtype torch.float16
Loaded texify model datalab-to/texify on device cuda with dtype torch.float16
Loaded recognition model vikp/surya_rec2 on device cuda with dtype torch.float16
Loaded table recognition model datalab-to/surya_tablerec on device cuda with dtype torch.float16
Loaded detection model vikp/surya_det3 on device cuda with dtype torch.float16
['layout_model', 'texify_model', 'recognition_model', 'table_rec_model', 'detection_model', 'ocr_error_model']


In [2]:
# utilities 
import json
from pathlib import Path
from typing import Any

def save_md_file(file_name:str, content:str) -> None:
    """Write ``content`` to ``./output/<file_name>.md``.

    Args:
        file_name: Base name of the target file, without the ``.md`` extension.
        content: Markdown text to write.

    The output directory is created when missing, and the file is always
    written as UTF-8 so non-ASCII text does not depend on the platform's
    default encoding.
    """
    target = Path("./output") / f"{file_name}.md"
    # A fresh checkout has no ./output directory; create it instead of failing.
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(content, encoding="utf-8")
 
def save_str_file(file_name:str, content:str) -> None:
    """Write ``content`` to ``./output/<file_name>``.

    Args:
        file_name: Full name of the target file, including any extension;
            nothing is appended to it.
        content: Text to write.

    The output directory is created when missing, and the file is always
    written as UTF-8 so non-ASCII text does not depend on the platform's
    default encoding.
    """
    target = Path("./output") / f"{file_name}"
    # A fresh checkout has no ./output directory; create it instead of failing.
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(content, encoding="utf-8")

def save_json_file(file_name:str, content:Any) -> None:
    """Serialize ``content`` as indented JSON into ``./output/<file_name>.json``.

    Args:
        file_name: Base name of the target file, without the ``.json`` extension.
        content: Any object accepted by ``json.dumps``.

    Raises:
        TypeError: If ``content`` is not JSON-serializable. The target file is
            left untouched in that case.
    """
    # Serialize before opening the file: opening in "w" mode truncates it, so a
    # serialization failure would otherwise wipe a previously saved result.
    serialized = json.dumps(content, indent=4)
    target = Path("./output") / f"{file_name}.json"
    # A fresh checkout has no ./output directory; create it instead of failing.
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(serialized, encoding="utf-8")


def save_html_file(file_name:str, content:Any) -> None:
    """Write ``content`` to ``./output/<file_name>.html``.

    Args:
        file_name: Base name of the target file, without the ``.html`` extension.
        content: HTML markup as a string, which is written verbatim. Any other
            object is written as indented JSON, as this function did before.

    The output directory is created when missing and the file is written as UTF-8.
    """
    # HTML text must not go through json.dumps: that would wrap it in quotes
    # and escape newlines and quotes, producing a file that is not HTML.
    if isinstance(content, str):
        text = content
    else:
        text = json.dumps(content, indent=4)
    target = Path("./output") / f"{file_name}.html"
    # A fresh checkout has no ./output directory; create it instead of failing.
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(text, encoding="utf-8")



In [3]:
# Document object creation 

import os
import tempfile
import warnings
 
# New changes
from marker.models import create_model_dict
from marker.builders.document import DocumentBuilder
from marker.builders.layout import LayoutBuilder
from marker.builders.ocr import OcrBuilder
from marker.schema.document import Document

from marker.config.parser import ConfigParser

from marker.providers.pdf import PdfProvider
from marker.converters.pdf import PdfConverter

# Turn on PyTorch's MPS fallback switch before any conversion work starts.
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

# Silence UserWarning output for the rest of the kernel session.
warnings.filterwarnings("ignore", category=UserWarning)

# Reuse the models loaded by an earlier cell; only load them if none are cached.
if model_dict is None:
    print("Loading Models")
    model_dict = create_model_dict()

pdf_filename = "2024 Sales Presentation C6501-PPOs-1.pdf"

# Read the source PDF into memory so it can be copied to a temporary file.
with open(f"../resources/pdf/{pdf_filename}", mode="rb") as f:
    content = f.read()

# The temporary copy is deleted when this block exits, so the conversion has
# to complete inside it.
with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_pdf:
    temp_pdf.write(content)
    # The converter is handed the file *name*, so the buffered bytes must be
    # on disk before it runs.
    temp_pdf.flush()
    filename = temp_pdf.name

    config = {
        "paginate_output": True,
        "force_ocr": False,
    }
    config_parser = ConfigParser(config)

    # processor_list=None presumably selects the converter's default
    # processors (TODO confirm). The explicit list that used to be built here
    # was never passed on, so it has been removed.
    pdf_converter = PdfConverter(
        config=config_parser.generate_config_dict(),
        artifact_dict=model_dict,
        processor_list=None,
        renderer="llama_json_renderer.LLAMAJSONRenderer",
    )
    rendered = pdf_converter(filename)
    



Recognizing layout: 100%|██████████| 11/11 [00:12<00:00,  1.17s/it]
100%|██████████| 6/6 [00:00<00:00, 30.65it/s]
Detecting bboxes: 100%|██████████| 8/8 [00:13<00:00,  1.68s/it]
Recognizing Text: 100%|██████████| 6/6 [00:57<00:00,  9.57s/it]
Recognizing equations: 0it [00:00, ?it/s]
Detecting bboxes: 100%|██████████| 6/6 [00:10<00:00,  1.68s/it]
Recognizing Text: 100%|██████████| 21/21 [02:18<00:00,  6.60s/it]
Recognizing tables: 100%|██████████| 4/4 [00:14<00:00,  3.73s/it]


In [5]:
# Persist the conversion result: the rendered pages as JSON and the markdown text.
pages = rendered.pages
# Strip only a trailing ".pdf"; str.replace would also remove ".pdf" from the
# middle of a name such as "report.pdf.scan.pdf".
filename = pdf_filename.removesuffix(".pdf")
save_json_file(filename, pages)
save_md_file(filename, rendered.markdown)
 

In [None]:
# Creating a document
# Point the provider at the source PDF on disk. `filename` cannot be used here:
# after the conversion cell it names a temporary file that has already been
# deleted, and after the save cell it holds the extension-less output name.
pdf_provider = PdfProvider(filepath=f"../resources/pdf/{pdf_filename}", config=None)
layout_builder = LayoutBuilder(model_dict["layout_model"])
ocr_builder = OcrBuilder(
    recognition_model=model_dict["recognition_model"],
    detection_model=model_dict["detection_model"],
)
document_builder = DocumentBuilder()

# Combine the provider and both builders into a single Document.
document: Document = document_builder(pdf_provider, layout_builder, ocr_builder)

In [None]:
from marker.renderers.json import JSONRenderer

# Render the document built above with the JSON renderer and keep its `html`
# attribute.
# NOTE(review): the value is HTML but is saved through save_md_file, so it
# lands in ./output/test.md - confirm that this is intended.
renderer = JSONRenderer()
json_output = renderer(document)
md = json_output.html

save_md_file(file_name="test", content=md)

In [None]:
from html_renderer import LLAMAHTMLRenderer

# Render the document with the project's HTML renderer, paginated output on,
# and print the `paginated_images` attribute of the result.
renderer_options = {"paginate_output": True}
renderer = LLAMAHTMLRenderer(renderer_options)
html_output = renderer(document)
images = html_output.paginated_images
print(images)


{'page-0': {'_page_0_Picture_1.png': <PIL.Image.Image image mode=RGB size=208x87 at 0x764F3C1374A0>}, 'page-1': {'_page_1_Picture_1.png': <PIL.Image.Image image mode=RGB size=401x94 at 0x764F3C713C80>}, 'page-2': {'_page_2_Picture_1.png': <PIL.Image.Image image mode=RGB size=208x87 at 0x764F3C7121B0>}, 'page-3': {'_page_3_Picture_1.png': <PIL.Image.Image image mode=RGB size=401x90 at 0x764F3F406810>}, 'page-4': {'_page_4_Picture_1.png': <PIL.Image.Image image mode=RGB size=345x90 at 0x764F3F405160>}, 'page-5': {'_page_5_Picture_1.png': <PIL.Image.Image image mode=RGB size=398x93 at 0x764F3C7128D0>}, 'page-6': {'_page_6_Picture_1.png': <PIL.Image.Image image mode=RGB size=376x89 at 0x764F3C713440>}, 'page-7': {'_page_7_Picture_1.png': <PIL.Image.Image image mode=RGB size=340x91 at 0x764F3C42CC50>}, 'page-8': {'_page_8_Picture_1.png': <PIL.Image.Image image mode=RGB size=403x94 at 0x764F3C42F260>}, 'page-9': {'_page_9_Picture_1.png': <PIL.Image.Image image mode=RGB size=385x90 at 0x764F3

In [None]:
# PDF to LLAMA conversion 

import os
import tempfile
import warnings
from marker.converters.pdf import PdfConverter
from marker.renderers.markdown import MarkdownRenderer
# For some reason, transformers decided to use .isin for a simple op, which is
# not supported on MPS, so enable PyTorch's fallback path.
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

warnings.filterwarnings("ignore", category=UserWarning)  # Filter torch pytree user warnings

pdf_filename = "2024 Sales Presentation C6501-PPOs-1.pdf"

# Read the source PDF into memory so it can be copied to a temporary file.
with open(f"../resources/pdf/{pdf_filename}", mode="rb") as f:
    content = f.read()

# The temporary copy is deleted when this block exits, so the conversion has
# to complete inside it.
with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_pdf:
    temp_pdf.write(content)
    # seek() pushes the buffered bytes to disk before the converter is handed
    # the file name.
    temp_pdf.seek(0)
    filename = temp_pdf.name
    config = {
        "paginate_output": True,
    }
    processors = [
        "marker.processors.blockquote.BlockquoteProcessor",
        "marker.processors.code.CodeProcessor",
        "marker.processors.debug.DebugProcessor",
        "marker.processors.document_toc.DocumentTOCProcessor",
        "marker.processors.equation.EquationProcessor",
        "marker.processors.footnote.FootnoteProcessor",
        "marker.processors.ignoretext.IgnoreTextProcessor",
        "marker.processors.line_numbers.LineNumbersProcessor",
        "marker.processors.list.ListProcessor",
        "marker.processors.page_header.PageHeaderProcessor",
        "marker.processors.sectionheader.SectionHeaderProcessor",
        "marker.processors.table.TableProcessor",
        "marker.processors.text.TextProcessor",
    ]

    # Pass `config` to the converter: it used to be built and then dropped, so
    # "paginate_output" never took effect in this cell.
    pdf_converter = PdfConverter(
        config=config,
        artifact_dict=model_dict,
        processor_list=processors,
    )
    rendered = pdf_converter(filename)


Recognizing layout: 100%|██████████| 11/11 [00:17<00:00,  1.57s/it]
100%|██████████| 6/6 [00:00<00:00, 22.52it/s]
Detecting bboxes: 100%|██████████| 8/8 [00:16<00:00,  2.07s/it]
Recognizing Text: 100%|██████████| 6/6 [00:59<00:00,  9.98s/it]
Recognizing equations: 0it [00:00, ?it/s]
Detecting bboxes: 100%|██████████| 6/6 [00:12<00:00,  2.02s/it]
Recognizing Text: 100%|██████████| 21/21 [02:31<00:00,  7.21s/it]
Recognizing tables: 100%|██████████| 4/4 [00:17<00:00,  4.49s/it]


NameError: name 'Document' is not defined

In [6]:

# from marker.schema.document import Document

    
#     # Ensure rendered is a Document object
# if isinstance(rendered, Document):
#     renderer = MarkdownRenderer({"paginate_output": True})
#     md_result = renderer(rendered).markdown
    
#     markdown = md_result
    
#     save_md_file("urgent", markdown)
# else:
#     print("Error: The rendered object is not a Document.")

from marker.output import save_output

# Hand the latest conversion result to marker's own writer.
# Presumably this writes files named "this_one.*" into the "output"
# directory - verify against marker.output.save_output.
output_dir, base_name = "output", "this_one"
save_output(rendered, output_dir, base_name)



In [None]:
# Persist the conversion result: the rendered pages as JSON and the markdown text.
pages = rendered.pages

# Strip only a trailing ".pdf"; str.replace would also remove ".pdf" from the
# middle of a name such as "report.pdf.scan.pdf".
filename = pdf_filename.removesuffix(".pdf")
save_json_file(filename, pages)

save_md_file(filename, rendered.markdown)
 
    # print("images")
    # print(rendered.images)
    # text, ext, images = text_from_rendered(rendered)
    