#### Using [MinerU](https://github.com/opendatalab/MinerU) for data extraction

In [None]:
!pip install einops timm matplotlib pdf2image nltk qdrant_client pymilvus sentence_transformers unstructured

In [None]:
import os
from getpass import getpass

# Milvus/Zilliz connection settings are read from the environment and are only
# prompted for when missing, so no credential is stored in the notebook itself.
# SECURITY: the API token that used to be hardcoded in this cell has been
# exposed and should be revoked/rotated.
# Unlike `%env VAR="value"`, which keeps the surrounding quotes as part of the
# value and echoes it into the cell output, nothing is printed here.
if not os.environ.get("MILVUS_URI"):
    os.environ["MILVUS_URI"] = input("Milvus URI: ").strip()
if not os.environ.get("MILVUS_TOKEN"):
    os.environ["MILVUS_TOKEN"] = getpass("Milvus token: ").strip()

#### Extracting from the command line with MinerU (`magic-pdf`)

In [4]:
# Run the MinerU command-line tool on the sample PDF, writing results under output/.
# NOTE(review): flags read as -p input PDF, -o output directory, -m parse method
# ("auto" here) — confirm against `magic-pdf --help` for the installed version.
!magic-pdf -p docs/trimmed_punch_manual.pdf -o output/ -m auto

[32m2024-08-20 17:36:38.450[0m | [1mINFO    [0m | [36mmagic_pdf.libs.pdf_check[0m:[36mdetect_invalid_chars[0m:[36m57[0m - [1mcid_count: 0, text_len: 9594, cid_chars_radio: 0.0[0m
[32m2024-08-20 17:36:41.889[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__init__[0m:[36m111[0m - [1mDocAnalysis init, this may take some times. apply_layout: True, apply_formula: True, apply_ocr: False, apply_table: True[0m
[32m2024-08-20 17:36:41.889[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__init__[0m:[36m119[0m - [1musing device: cpu[0m
[32m2024-08-20 17:36:41.889[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__init__[0m:[36m121[0m - [1musing models_dir: /Users/shivamarora/projects-python/diagram-extract/PDF-Extract-Kit/models[0m
CustomVisionEncoderDecoderModel init
CustomMBartForCausalLM init
CustomMBartDecoder init
[32m[08/20 17:36:54 detectron2]: [0mRank of current process: 0. World size: 1


#### Helper functions for extraction with the MinerU Python API

In [None]:
import os
import json
import copy
from loguru import logger

from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
import magic_pdf.model as model_config

# Allow falling back to the built-in model: when no model JSON is supplied,
# pdf_parse_main only calls pipe.pipe_analyze() if this flag is truthy and
# otherwise logs "need model list input" and exits.
model_config.__use_inside_model__ = True

def json_md_dump(
        pipe,
        md_writer,
        pdf_name,
        content_list,
        md_content,
):
    """
    Write the parse artifacts of one PDF through ``md_writer``.

    Four files are written, in this order, each named after ``pdf_name``:
    ``<pdf_name>_model.json`` (a deep copy of ``pipe.model_list``),
    ``<pdf_name>_middle.json`` (``pipe.pdf_mid_data``),
    ``<pdf_name>_content_list.json`` (``content_list``) and
    ``<pdf_name>.md`` (``md_content``, written as-is).

    :param pipe: object exposing ``model_list`` and ``pdf_mid_data``
    :param md_writer: object exposing ``write(content=..., path=...)``
    :param pdf_name: base name used for every output file
    :param content_list: JSON-serialisable content list
    :param md_content: markdown text
    """
    def _pretty(payload):
        # Keep non-ASCII characters readable and indent for human inspection.
        return json.dumps(payload, ensure_ascii=False, indent=4)

    model_snapshot = copy.deepcopy(pipe.model_list)

    # Payloads are fetched lazily so each one is read and serialised right
    # before its own write, exactly one file at a time.
    json_artifacts = (
        ("model", lambda: model_snapshot),
        ("middle", lambda: pipe.pdf_mid_data),
        ("content_list", lambda: content_list),
    )
    for suffix, fetch in json_artifacts:
        md_writer.write(
            content=_pretty(fetch()),
            path=f"{pdf_name}_{suffix}.json"
        )

    md_writer.write(content=md_content, path=f"{pdf_name}.md")

def pdf_parse_main(
        pdf_path: str,
        parse_method: str = 'auto',
        model_json_path: str = None,
        is_json_md_dump: bool = True,
        output_dir: str = None
):
    """
    Parse one PDF with a MinerU pipe and optionally dump the results to disk.

    Output goes to ``<output_dir>/<pdf name>`` (or ``<pdf directory>/<pdf name>``
    when ``output_dir`` is empty), with images under its ``images`` sub-directory.
    Exceptions raised while parsing are logged with ``logger.exception`` and
    swallowed; the function returns ``None`` in every case.

    :param pdf_path: .pdf file path
    :param parse_method: accepted methods: ``auto``, ``ocr``, ``txt``
    :param model_json_path: existing model data file. If empty then built-in model will be used
    :param is_json_md_dump: whether to write json and md file, default is True
    :param output_dir: directory under which the ``<pdf name>`` output folder is
        created; defaults to the directory containing the PDF
    :raises SystemExit: with status 1 when ``parse_method`` is unknown, or when
        no model data is supplied and the built-in model is disabled
    """
    try:
        # splitext removes only the final extension, so "report.v2.pdf" keeps
        # its full stem ("report.v2") instead of being cut at the first dot.
        pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
        pdf_path_parent = os.path.dirname(pdf_path)

        output_path = os.path.join(output_dir or pdf_path_parent, pdf_name)
        output_image_path = os.path.join(output_path, 'images')

        # Only the last path component ("images") is handed to the pipe when
        # building the content list and markdown.
        # NOTE(review): presumably used as the relative image folder in the
        # generated output — confirm against the MinerU pipe API.
        image_path_parent = os.path.basename(output_image_path)

        # Context managers close the files even if reading or decoding fails.
        with open(pdf_path, "rb") as pdf_file:
            pdf_bytes = pdf_file.read()

        if model_json_path:
            with open(model_json_path, "r", encoding="utf-8") as model_file:
                model_json = json.load(model_file)
        else:
            model_json = []

        image_writer, md_writer = DiskReaderWriter(output_image_path), DiskReaderWriter(output_path)

        if parse_method == "auto":
            jso_useful_key = {"_pdf_type": "", "model_list": model_json}
            pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
        elif parse_method == "txt":
            pipe = TXTPipe(pdf_bytes, model_json, image_writer)
        elif parse_method == "ocr":
            pipe = OCRPipe(pdf_bytes, model_json, image_writer)
        else:
            logger.error("unknown parse method, only auto, ocr, txt allowed")
            # `exit` is the interactive-shell helper (and is replaced by
            # IPython); raising SystemExit directly behaves the same in
            # scripts and notebooks and is not caught by `except Exception`.
            raise SystemExit(1)

        pipe.pipe_classify()

        # Without pre-computed model data the built-in model must run first.
        if not model_json:
            if model_config.__use_inside_model__:
                pipe.pipe_analyze()
            else:
                logger.error("need model list input")
                raise SystemExit(1)

        pipe.pipe_parse()

        content_list = pipe.pipe_mk_uni_format(image_path_parent, drop_mode="none")
        md_content = pipe.pipe_mk_markdown(image_path_parent, drop_mode="none")

        if is_json_md_dump:
            json_md_dump(pipe, md_writer, pdf_name, content_list, md_content)

    except Exception as e:
        # Best effort: report the failure with its traceback but do not
        # propagate it to the caller.
        logger.exception(e)

In [2]:
if __name__ == "__main__":
    # Parse the sample PDF with the default ("auto") method; the output
    # folder is derived from output_dir and the PDF's base name.
    pdf_path = "docs/Raspberry_short_vers.pdf"
    pdf_parse_main(pdf_path=pdf_path, output_dir="output")


[32m2024-08-20 17:26:16.428[0m | [1mINFO    [0m | [36mmagic_pdf.libs.pdf_check[0m:[36mdetect_invalid_chars[0m:[36m57[0m - [1mcid_count: 0, text_len: 26859, cid_chars_radio: 0.0[0m
[32m2024-08-20 17:26:19.828[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__init__[0m:[36m111[0m - [1mDocAnalysis init, this may take some times. apply_layout: True, apply_formula: True, apply_ocr: False, apply_table: True[0m
[32m2024-08-20 17:26:19.828[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__init__[0m:[36m119[0m - [1musing device: cpu[0m
[32m2024-08-20 17:26:19.829[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__init__[0m:[36m121[0m - [1musing models_dir: /Users/shivamarora/projects-python/diagram-extract/PDF-Extract-Kit/models[0m


CustomVisionEncoderDecoderModel init
CustomMBartForCausalLM init
CustomMBartDecoder init
[32m[08/20 17:26:31 detectron2]: [0mRank of current process: 0. World size: 1
[32m[08/20 17:26:31 detectron2]: [0mEnvironment info:
-------------------------------  --------------------------------------------------------------------------------------------------------------
sys.platform                     darwin
Python                           3.10.14 (main, May  6 2024, 14:42:37) [Clang 14.0.6 ]
numpy                            1.26.4
detectron2                       0.6 @/opt/homebrew/Caskroom/miniconda/base/envs/document-extractor/lib/python3.10/site-packages/detectron2
Compiler                         clang 15.0.0
CUDA compiler                    not available
DETECTRON2_ENV_MODULE            <not set>
PyTorch                          2.3.1 @/opt/homebrew/Caskroom/miniconda/base/envs/document-extractor/lib/python3.10/site-packages/torch
PyTorch debug build              False
torch._C._GL

[32m2024-08-20 17:26:36.017[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__init__[0m:[36m148[0m - [1mDocAnalysis init done![0m
[32m2024-08-20 17:26:36.018[0m | [1mINFO    [0m | [36mmagic_pdf.model.doc_analyze_by_custom_model[0m:[36mcustom_model_init[0m:[36m98[0m - [1mmodel init cost: 19.589518070220947[0m
[32m2024-08-20 17:26:49.511[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m159[0m - [1mlayout detection cost: 13.27[0m



0: 1888x1344 4 embeddings, 1732.8ms
Speed: 14.5ms preprocess, 1732.8ms inference, 0.9ms postprocess per image at shape (1, 3, 1888, 1344)


[32m2024-08-20 17:26:52.905[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m189[0m - [1mformula nums: 4, mfr time: 1.31[0m
[32m2024-08-20 17:26:52.909[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m294[0m - [1mtable cost: 0.0[0m
[32m2024-08-20 17:26:58.283[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m159[0m - [1mlayout detection cost: 5.37[0m



0: 1888x1344 4 embeddings, 1709.9ms
Speed: 10.1ms preprocess, 1709.9ms inference, 0.4ms postprocess per image at shape (1, 3, 1888, 1344)


[32m2024-08-20 17:27:01.223[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m189[0m - [1mformula nums: 4, mfr time: 1.2[0m
[32m2024-08-20 17:27:01.226[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m294[0m - [1mtable cost: 0.0[0m
[32m2024-08-20 17:27:06.385[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m159[0m - [1mlayout detection cost: 5.16[0m



0: 1888x1344 (no detections), 1673.3ms
Speed: 9.6ms preprocess, 1673.3ms inference, 0.3ms postprocess per image at shape (1, 3, 1888, 1344)


[32m2024-08-20 17:27:08.071[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m189[0m - [1mformula nums: 0, mfr time: 0.0[0m
[32m2024-08-20 17:27:08.074[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m294[0m - [1mtable cost: 0.0[0m
[32m2024-08-20 17:27:13.428[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m159[0m - [1mlayout detection cost: 5.35[0m



0: 1888x1344 (no detections), 1737.6ms
Speed: 10.1ms preprocess, 1737.6ms inference, 0.3ms postprocess per image at shape (1, 3, 1888, 1344)


[32m2024-08-20 17:27:15.181[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m189[0m - [1mformula nums: 0, mfr time: 0.0[0m
[32m2024-08-20 17:27:15.185[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m294[0m - [1mtable cost: 0.0[0m
[32m2024-08-20 17:27:20.323[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m159[0m - [1mlayout detection cost: 5.14[0m



0: 1888x1344 (no detections), 1691.8ms
Speed: 9.5ms preprocess, 1691.8ms inference, 0.6ms postprocess per image at shape (1, 3, 1888, 1344)


[32m2024-08-20 17:27:22.028[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m189[0m - [1mformula nums: 0, mfr time: 0.0[0m
[32m2024-08-20 17:27:22.031[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m294[0m - [1mtable cost: 0.0[0m
[32m2024-08-20 17:27:27.211[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m159[0m - [1mlayout detection cost: 5.18[0m



0: 1888x1344 (no detections), 1625.7ms
Speed: 10.6ms preprocess, 1625.7ms inference, 0.8ms postprocess per image at shape (1, 3, 1888, 1344)


[32m2024-08-20 17:27:28.851[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m189[0m - [1mformula nums: 0, mfr time: 0.0[0m
[32m2024-08-20 17:27:28.854[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m294[0m - [1mtable cost: 0.0[0m
[32m2024-08-20 17:27:34.016[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m159[0m - [1mlayout detection cost: 5.16[0m



0: 1888x1344 (no detections), 1695.4ms
Speed: 10.4ms preprocess, 1695.4ms inference, 0.7ms postprocess per image at shape (1, 3, 1888, 1344)


[32m2024-08-20 17:27:35.727[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m189[0m - [1mformula nums: 0, mfr time: 0.0[0m
[32m2024-08-20 17:27:35.731[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m294[0m - [1mtable cost: 0.0[0m
[32m2024-08-20 17:27:40.991[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m159[0m - [1mlayout detection cost: 5.26[0m



0: 1888x1344 5 embeddings, 1662.7ms
Speed: 10.7ms preprocess, 1662.7ms inference, 1.5ms postprocess per image at shape (1, 3, 1888, 1344)


[32m2024-08-20 17:27:44.202[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m189[0m - [1mformula nums: 5, mfr time: 1.52[0m
[32m2024-08-20 17:27:44.206[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m294[0m - [1mtable cost: 0.0[0m
[32m2024-08-20 17:27:49.553[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m159[0m - [1mlayout detection cost: 5.35[0m



0: 1888x1344 (no detections), 1706.0ms
Speed: 10.5ms preprocess, 1706.0ms inference, 0.6ms postprocess per image at shape (1, 3, 1888, 1344)


[32m2024-08-20 17:27:51.273[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m189[0m - [1mformula nums: 0, mfr time: 0.0[0m
[32m2024-08-20 17:27:51.277[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m294[0m - [1mtable cost: 0.0[0m
[32m2024-08-20 17:27:56.537[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m159[0m - [1mlayout detection cost: 5.26[0m



0: 1888x1344 (no detections), 1725.9ms
Speed: 10.9ms preprocess, 1725.9ms inference, 0.3ms postprocess per image at shape (1, 3, 1888, 1344)


[32m2024-08-20 17:27:58.277[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m189[0m - [1mformula nums: 0, mfr time: 0.0[0m
[32m2024-08-20 17:27:58.280[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m294[0m - [1mtable cost: 0.0[0m
[32m2024-08-20 17:27:58.281[0m | [1mINFO    [0m | [36mmagic_pdf.model.doc_analyze_by_custom_model[0m:[36mdoc_analyze[0m:[36m124[0m - [1mdoc analyze cost: 82.034991979599[0m
[32m2024-08-20 17:27:58.943[0m | [1mINFO    [0m | [36mmagic_pdf.pipe.UNIPipe[0m:[36mpipe_mk_uni_format[0m:[36m43[0m - [1muni_pipe mk content list finished[0m
[32m2024-08-20 17:27:58.971[0m | [1mINFO    [0m | [36mmagic_pdf.pipe.UNIPipe[0m:[36mpipe_mk_markdown[0m:[36m48[0m - [1muni_pipe mk mm_markdown finished[0m
