In [1]:
import json
from typing import Union

import click
import torch
import time
import asyncio
import layoutparser as lp
import numpy as np
from layoutparser.elements.layout_elements import TextBlock
from pathlib import Path
import os

In [9]:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/home/stefan/.keys/solid-groove-215812-f6c753f1ed74.json'

In [12]:
!pip install nest-asyncio



In [2]:
import nest_asyncio
nest_asyncio.apply()


In [16]:
async def perform_ocr(
    ocr_agent: Union[lp.TesseractAgent, lp.GCVAgent],
    image: np.array,
    block: TextBlock,
    left_pad: int = 15,
    right_pad: int = 5,
    top_pad: int = 5,
    bottom_pad: int = 5,
) -> None:
    """
    Perform OCR on a block of text.

    Args:
        ocr_agent: The OCR agent to use.
        image: The crop to perform OCR on.
        block: The block to set the text of.
        left_pad: The number of pixels to pad the left side of the block.
        right_pad: The number of pixels to pad the right side of the block.
        top_pad: The number of pixels to pad the top of the block.
        bottom_pad: The number of pixels to pad the bottom of the block.
    """
    # Pad to improve OCR accuracy as it's fairly tight.
    segment_image = block.pad(
        left=left_pad, right=right_pad, top=top_pad, bottom=bottom_pad
    ).crop_image(image)

    # Perform OCR and await the result.
    text = await ocr_agent.detect(segment_image, return_only_text=True)

    # Save OCR result
    block.set(text=text, inplace=True)

In [4]:
def run_cli(
    input_dir: Path,
    output_dir: Path,
    ocr_agent: str,
    model: str,
    detectron_threshold: float = 0.5,
) -> None:
    """
    Run cli to extract semi-structured JSON from document-AI + OCR.

    Args:
        input_dir: The directory containing the PDFs to parse.
        output_dir: The directory to write the parsed PDFs to.
        ocr_agent: The OCR agent to use.
        model: The document AI model to use.
        detectron_threshold: The threshold to use for Detectron2.
    """
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)
    model = lp.Detectron2LayoutModel(
        config_path=f"lp://PubLayNet/{model}",  # In model catalog,
        label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
        extra_config=[
            "MODEL.ROI_HEADS.SCORE_THRESH_TEST",
            detectron_threshold,
        ],  # Optional
    )

    if ocr_agent == "tesseract":
        ocr_agent = lp.TesseractAgent(languages="eng")
    elif ocr_agent == "gcv":
        ocr_agent = lp.GCVAgent(languages="eng")
    input_dir = Path(input_dir)
    for file in input_dir.iterdir():
        file_name = file.name
        if not file_name.endswith(".pdf"):
            continue
        _, pdf_images = lp.load_pdf(file, load_images=True)
        block_pages = (
            []
        )  # list of pages of blocks (not captured by layoutparser, will put into a proper data
        # structure later).
        for ix, image in enumerate(pdf_images):
            image_array = np.array(image)
            detect_start = time.time()
            layout = model.detect(image_array)  # perform computer vision
            detect_end = time.time()
            detect_time = detect_end - detect_start
            # perform ocr on extracted blocks.
            text_blocks = lp.Layout([b for b in layout if b.type == "Text"])
            # convert to CustomTextBlock to add page_num attribute.
            ocr_start = time.time()
            for block in text_blocks:
                perform_ocr(
                    ocr_agent, image_array, block
                )  # modify text blocks in-place
            ocr_end = time.time()
            ocr_time = ocr_end - ocr_start
            block_pages.append([ix + 1] * len(text_blocks))

        # flatten block_pages to a single list of blocks.
        blocks = [block for page in block_pages for block in page]

        # save extracted layout as json
        text_block_dict = text_blocks.to_dict()
        for ix, dic in enumerate(text_block_dict["blocks"]):
            dic["page_num"] = blocks[ix]
        file_name_without_ext = file_name.split(".")[0]
        with open(output_dir / f"{file_name_without_ext}.json", "w") as f:
            json.dump(text_block_dict, f)

In [5]:
run_cli(
    input_dir="../downloads",
    output_dir="../data/ocr",
    ocr_agent="gcv",
    model="mask_rcnn_X_101_32x8d_FPN_3x",
    detectron_threshold=0.5
)

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [5]:
# Let's convert the above using asyncio.

In [17]:
async def perform_ocr_async(ocr_agent, image, blocks):
    tasks = []
    for block in blocks:
        tasks.append(asyncio.create_task(perform_ocr(ocr_agent, image, block)))
    await asyncio.gather(*tasks)

In [18]:
async def run_cli_async(
    input_dir,
    output_dir,
    ocr_agent,
    model,
    detectron_threshold,
):
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)
    model = lp.Detectron2LayoutModel(
        config_path=f"lp://PubLayNet/{model}",  # In model catalog,
        label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
        extra_config=[
            "MODEL.ROI_HEADS.SCORE_THRESH_TEST",
            detectron_threshold,
        ],  # Optional
    )
    if ocr_agent == "tesseract":
        ocr_agent = lp.TesseractAgent(languages="eng")
    elif ocr_agent == "gcv":
        ocr_agent = lp.GCVAgent(languages="eng")
    for file in input_dir.iterdir():
        file_name = file.name
        if not file_name.endswith(".pdf"):
            continue
        _, pdf_images = lp.load_pdf(file, load_images=True)
        block_pages = (
            []
        )  # list of pages of blocks (not captured by layoutparser, will put into a proper data
        # structure later).
        for ix, image in enumerate(pdf_images):
            print(image)
            image_array = np.array(image)
            detect_start = time.time()
            layout = model.detect(image_array)  # perform computer vision
            # clear cuda cache
            torch.cuda.empty_cache()
            detect_end = time.time()
            detect_time = detect_end - detect_start
            # perform ocr on extracted blocks.
            text_blocks = lp.Layout([b for b in layout if b.type == "Text"])
            # convert to CustomTextBlock to add page_num attribute.
            ocr_start = time.time()
            await perform_ocr_async(ocr_agent, image_array, text_blocks)  # modify text blocks in-place
            ocr_end = time.time()
            ocr_time = ocr_end - ocr_start
            print(ocr_time)
            block_pages.append([ix + 1] * len(text_blocks))

        # flatten block_pages to a single list of blocks.
        blocks = [block for page in block_pages for block in page]

        # save extracted layout as json
        text_block_dict = text_blocks.to_dict()
        for ix, dic in enumerate(text_block_dict["blocks"]):
            dic["page_num"] = blocks[ix]
        file_name_without_ext = file_name.split(".")[0]
        with open(output_dir / f"{file_name_without_ext}.json", "w") as f:
            json.dump(text_block_dict, f)

In [21]:
async def main():
    input_dir = "../downloads"
    output_dir = "../data/ocr"
    ocr_agent = "gcv"
    model = "mask_rcnn_X_101_32x8d_FPN_3x"
    detectron_threshold = 0.5
    await run_cli_async(
        input_dir,
        output_dir,
        ocr_agent,
        model,
        detectron_threshold,
    )

In [22]:
asyncio.run(main())

The checkpoint state_dict contains keys that are not used by the model:
  [35mproposal_generator.anchor_generator.cell_anchors.{0, 1, 2, 3, 4}[0m


<PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=596x783 at 0x7F7E23260F10>


  max_size = (max_size + (stride - 1)) // stride * stride


TypeError: object str can't be used in 'await' expression

In [14]:
type(main())

  type(main())


coroutine