# Markdown parsing

In [2]:
import json
from typing import Literal

from marker.models import create_model_dict
from marker.converters.pdf import PdfConverter
from marker.config.parser import ConfigParser
from marker.output import text_from_rendered

In [40]:
def convert(
    to: Literal['markdown', 'json', 'chunks'],
    pdf_path: str = "../../data/docs/2022 Q3 AAPL.pdf",
) -> str:
    """Convert a PDF with Marker and return the rendered text.

    Args:
        to: Value passed to Marker as the ``output_format`` config option.
        pdf_path: Path of the PDF to convert. Defaults to the document this
            notebook was originally written against, so existing
            ``convert('markdown')``-style calls behave as before.

    Returns:
        The first element returned by ``text_from_rendered``. The cells in
        this notebook write it to disk directly for ``'markdown'`` and pass
        it through ``json.loads`` for ``'json'`` and ``'chunks'``.
    """
    # Image extraction is switched off; only the text output is used here.
    config_parser = ConfigParser(
        {
            'output_format': to,
            'disable_image_extraction': True
        }
    )

    # NOTE(review): create_model_dict() runs on every call. It presumably
    # loads the model weights, so hoisting it out of this function may speed
    # up repeated conversions -- confirm before changing.
    converter = PdfConverter(
        config=config_parser.generate_config_dict(),
        artifact_dict=create_model_dict(),
        processor_list=config_parser.get_processors(),
        renderer=config_parser.get_renderer(),
        llm_service=config_parser.get_llm_service()
    )

    rendered = converter(pdf_path)
    # Only the text is needed; the remaining returned values are discarded.
    text, *_ = text_from_rendered(rendered)

    return text

In [34]:
%%time

# Convert first: opening the file in 'w' mode truncates it, so a failed
# conversion would otherwise wipe any previously written output.
markdown = convert('markdown')

# Explicit UTF-8 so the result does not depend on the platform's default
# encoding when the document text contains non-ASCII characters.
with open('outputs/marker.md', 'w', encoding='utf-8') as file:
    file.write(markdown)

Recognizing layout: 100%|██████████| 5/5 [00:06<00:00,  1.27s/it]
Running OCR Error Detection: 100%|██████████| 7/7 [00:00<00:00, 16.48it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing tables: 100%|██████████| 6/6 [00:11<00:00,  1.93s/it]


CPU times: user 10.8 s, sys: 7.34 s, total: 18.1 s
Wall time: 26.2 s


In [37]:
%%time

# Convert and parse before opening the file: 'w' mode truncates on open, so
# a failure in either step would otherwise leave an empty output file.
text = json.loads(convert('json'))

# Re-serialize with a 4-space indent.
with open('outputs/marker.json', 'w', encoding='utf-8') as file:
    json.dump(text, file, indent=4)

Recognizing layout: 100%|██████████| 5/5 [00:06<00:00,  1.32s/it]
Running OCR Error Detection: 100%|██████████| 7/7 [00:00<00:00, 17.11it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing tables: 100%|██████████| 6/6 [00:11<00:00,  1.88s/it]


CPU times: user 11.1 s, sys: 6.74 s, total: 17.9 s
Wall time: 24.7 s


In [38]:
%%time

# Convert and parse before opening the file: 'w' mode truncates on open, so
# a failure in either step would otherwise leave an empty output file.
text = json.loads(convert('chunks'))

# Re-serialize with a 4-space indent.
with open('outputs/marker-chunks.json', 'w', encoding='utf-8') as file:
    json.dump(text, file, indent=4)

Recognizing layout: 100%|██████████| 5/5 [00:06<00:00,  1.32s/it]
Running OCR Error Detection: 100%|██████████| 7/7 [00:00<00:00, 18.51it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing tables: 100%|██████████| 6/6 [00:11<00:00,  1.86s/it]


CPU times: user 10.9 s, sys: 7.17 s, total: 18.1 s
Wall time: 25 s
