In [2]:
import os
from dotenv import load_dotenv
# Load settings from a local .env file into the process environment so the
# os.getenv() lookups for PROJECT_ID and API_LOCATION can see them.
load_dotenv()

True

In [3]:
import os
from typing import Iterator, MutableSequence, Optional, Sequence, Tuple

import google.cloud.documentai_v1 as docai
from tabulate import tabulate

# Project and region are read from the environment; an empty string marks
# "not configured" and is rejected by the checks below.
PROJECT_ID = os.getenv("PROJECT_ID", "")
API_LOCATION = os.getenv("API_LOCATION", "")

# Fall back to the local service-account key file only when the environment
# does not already name one. A plain assignment here would silently override
# credentials supplied through .env or the shell.
if not os.environ.get("GOOGLE_APPLICATION_CREDENTIALS"):
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = (
        "./docextraction-440803-2193a2f4556f.json"
    )

# Fail fast on missing or unsupported configuration before any client is built.
assert PROJECT_ID, "PROJECT_ID is undefined"
assert API_LOCATION in ("us", "eu"), "API_LOCATION is incorrect"


In [4]:

# Display names of the two test processors this notebook creates and later
# looks up by name.
document_ocr_display_name = "document-ocr"
form_parser_display_name = "form-parser"


In [5]:

# (display name, processor type) pairs; each pair is passed to
# create_processor() by the creation loop.
test_processor_display_names_and_types = (
    (document_ocr_display_name, "OCR_PROCESSOR"),
    (form_parser_display_name, "FORM_PARSER_PROCESSOR"),
)


In [6]:

def get_client() -> docai.DocumentProcessorServiceClient:
    """Build a Document AI client that targets the API_LOCATION endpoint.

    Returns:
        A DocumentProcessorServiceClient whose ``api_endpoint`` option is
        ``<API_LOCATION>-documentai.googleapis.com``.
    """
    regional_endpoint = f"{API_LOCATION}-documentai.googleapis.com"
    return docai.DocumentProcessorServiceClient(
        client_options={"api_endpoint": regional_endpoint}
    )


In [7]:

def get_parent(client: docai.DocumentProcessorServiceClient) -> str:
    """Return the location path for PROJECT_ID and API_LOCATION.

    Args:
        client: Client whose ``common_location_path`` helper builds the path.

    Returns:
        The path string produced by ``client.common_location_path``.
    """
    location_path = client.common_location_path(PROJECT_ID, API_LOCATION)
    return location_path


In [8]:
def fetch_processor_types() -> MutableSequence[docai.ProcessorType]:
    """Fetch the processor types reported for the configured parent location.

    Returns:
        The ``processor_types`` field of the fetch_processor_types response.
    """
    client, parent = get_client_and_parent()
    return client.fetch_processor_types(parent=parent).processor_types

def print_processor_types(processor_types: Sequence[docai.ProcessorType]):
    """Print a table of processor types followed by a count line.

    Types that allow creation are listed first; within each group rows are
    ordered by category and then by type name.

    Args:
        processor_types: Processor types to display.
    """
    ordered = sorted(
        processor_types,
        key=lambda pt: (not pt.allow_creation, pt.category, pt.type_),
    )
    rows = processor_type_tabular_data(ordered)
    # The generator's first two items are the header row and the column
    # alignments; whatever remains is the table body.
    headers = next(rows)
    colalign = next(rows)
    table = tabulate(rows, headers, tablefmt="pretty", colalign=colalign)

    print(table)
    print(f"→ Processor types: {len(ordered)}")

def processor_type_tabular_data(
    processor_types: Sequence[docai.ProcessorType],
) -> Iterator[Tuple[str, str, str, str]]:
    """Yield the rows needed to tabulate processor types.

    The first item is the header row, the second is the per-column alignment,
    and every following item is a data row. An empty input produces a single
    placeholder row of dashes.

    Args:
        processor_types: Processor types to render, already in display order.

    Yields:
        4-tuples of strings: type, category, allow_creation, locations.
    """
    yield ("type", "category", "allow_creation", "locations")
    yield ("left", "left", "left", "left")

    if processor_types:
        for entry in processor_types:
            # Location ids are sorted so the cell text is stable across calls.
            location_ids = sorted(
                loc.location_id for loc in entry.available_locations
            )
            yield (
                entry.type_,
                entry.category,
                str(entry.allow_creation),
                ", ".join(location_ids),
            )
    else:
        yield ("-", "-", "-", "-")
        
def get_client_and_parent() -> Tuple[docai.DocumentProcessorServiceClient, str]:
    """Return a new client together with the parent path derived from it.

    Returns:
        A ``(client, parent)`` pair, where ``parent`` comes from
        ``get_parent(client)``.
    """
    client = get_client()
    return client, get_parent(client)
    

In [9]:
# Fetch the processor types for the configured location and show them as a table.
processor_types = fetch_processor_types()
print_processor_types(processor_types)

+--------------------------------------+-------------+----------------+-----------------------------------------------------------------------------------------------------------------+
| type                                 | category    | allow_creation | locations                                                                                                       |
+--------------------------------------+-------------+----------------+-----------------------------------------------------------------------------------------------------------------+
| CUSTOM_CLASSIFICATION_PROCESSOR      | CUSTOM      | True           | asia-south1, asia-southeast1, australia-southeast1, eu, europe-west2, europe-west3, northamerica-northeast1, us |
| CUSTOM_EXTRACTION_PROCESSOR          | CUSTOM      | True           | asia-south1, asia-southeast1, australia-southeast1, eu, europe-west2, europe-west3, northamerica-northeast1, us |
| CUSTOM_SPLITTING_PROCESSOR           | CUSTOM      | True           

In [10]:
def create_processor(display_name: str, type: str) -> docai.Processor:
    """Create a processor under the configured project and location.

    Args:
        display_name: Display name to give the new processor.
        type: Processor type string, e.g. "OCR_PROCESSOR". The name shadows
            the builtin inside this function; it is kept because it is part
            of the signature callers see.

    Returns:
        The result of the client's ``create_processor`` call.
    """
    client, parent = get_client_and_parent()
    return client.create_processor(
        parent=parent,
        processor=docai.Processor(display_name=display_name, type_=type),
    )
    

In [11]:
# Create each test processor in turn, printing a banner line around every attempt.
separator = "=" * 80
# The loop variable is `processor_type`, not `type`: loop variables leak into
# the notebook's global namespace, and rebinding `type` there would break any
# later `type(...)` call made in this kernel.
for display_name, processor_type in test_processor_display_names_and_types:
    print(separator)
    print(f"Creating {display_name} ({processor_type})...")
    try:
        create_processor(display_name, processor_type)
    except Exception as err:
        # Best effort: report the failure and carry on with the next processor.
        print(err)
print(separator)
print("Done")

Creating document-ocr (OCR_PROCESSOR)...
Creating form-parser (FORM_PARSER_PROCESSOR)...
Done


In [12]:
def list_processors() -> MutableSequence[docai.Processor]:
    """Return every processor under the configured project and location.

    Returns:
        A list of all processors across all result pages.
    """
    client, parent = get_client_and_parent()
    pager = client.list_processors(parent=parent)
    # Iterate the pager itself. Its `processors` attribute only exposes the
    # page that is currently loaded, so reading it would silently drop
    # everything beyond the first page; iteration follows the page tokens.
    return list(pager)

def print_processors(processors: Optional[Sequence[docai.Processor]] = None):
    """Print a table of processors, ordered by display name, plus a count line.

    Args:
        processors: Processors to display. When None, they are fetched with
            ``list_processors()``.
    """
    if processors is None:
        processors = list_processors()

    ordered = sorted(processors, key=lambda item: item.display_name)
    rows = processor_tabular_data(ordered)
    # The generator's first two items are the header row and the column
    # alignments; whatever remains is the table body.
    headers = next(rows)
    colalign = next(rows)
    table = tabulate(rows, headers, tablefmt="pretty", colalign=colalign)

    print(table)
    print(f"→ Processors: {len(ordered)}")

def processor_tabular_data(
    processors: Sequence[docai.Processor],
) -> Iterator[Tuple[str, str, str]]:
    """Yield the rows needed to tabulate processors.

    The first item is the header row, the second is the per-column alignment,
    and every following item is a data row. An empty input produces a single
    placeholder row of dashes.

    Args:
        processors: Processors to render, already in display order.

    Yields:
        3-tuples of strings: display_name, type, state name.
    """
    yield ("display_name", "type", "state")
    yield ("left", "left", "left")

    if processors:
        for entry in processors:
            yield (entry.display_name, entry.type_, entry.state.name)
    else:
        yield ("-", "-", "-")
        

In [13]:
# Fetch the processors once and keep the list so later cells can reuse it.
processors = list_processors()
print_processors(processors)

+--------------+-----------------------+---------+
| display_name | type                  | state   |
+--------------+-----------------------+---------+
| doc OCR      | OCR_PROCESSOR         | ENABLED |
| document-ocr | OCR_PROCESSOR         | ENABLED |
| form-parser  | FORM_PARSER_PROCESSOR | ENABLED |
+--------------+-----------------------+---------+
→ Processors: 3


In [14]:
def get_processor(
    display_name: str,
    processors: Optional[Sequence[docai.Processor]] = None,
) -> Optional[docai.Processor]:
    """Find the first processor whose display name matches exactly.

    Args:
        display_name: Display name to look for.
        processors: Processors to search. When None, they are fetched with
            ``list_processors()``.

    Returns:
        The first matching processor, or None when there is no match.
    """
    candidates = list_processors() if processors is None else processors
    matches = (
        candidate
        for candidate in candidates
        if candidate.display_name == display_name
    )
    return next(matches, None)
    

In [15]:
# Look up the OCR processor in the already-fetched `processors` list, which
# avoids another list call.
processor = get_processor(document_ocr_display_name, processors)

# Stop here if no processor with that display name exists.
assert processor is not None
print(processor)

name: "projects/712357922075/locations/us/processors/5d98407fb8df954c"
type_: "OCR_PROCESSOR"
display_name: "document-ocr"
state: ENABLED
process_endpoint: "https://us-documentai.googleapis.com/v1/projects/712357922075/locations/us/processors/5d98407fb8df954c:process"
create_time {
  seconds: 1730782625
  nanos: 516346000
}
default_processor_version: "projects/712357922075/locations/us/processors/5d98407fb8df954c/processorVersions/pretrained-ocr-v2.0-2023-06-02"
processor_version_aliases {
  alias: "projects/712357922075/locations/us/processors/5d98407fb8df954c/processorVersions/pretrained"
  processor_version: "projects/712357922075/locations/us/processors/5d98407fb8df954c/processorVersions/pretrained-ocr-v1.0-2020-09-23"
}
processor_version_aliases {
  alias: "projects/712357922075/locations/us/processors/5d98407fb8df954c/processorVersions/pretrained-next"
  processor_version: "projects/712357922075/locations/us/processors/5d98407fb8df954c/processorVersions/pretrained-ocr-v1.1-2022-0

'gsutil' is not recognized as an internal or external command,
operable program or batch file.


In [23]:
def process_file(
    processor: docai.Processor,
    file_path: str,
    mime_type: str,
) -> docai.Document:
    """Send a local file to a processor and return the processed document.

    Args:
        processor: Processor whose ``name`` identifies the request target.
        file_path: Path of the file to read; it is opened in binary mode.
        mime_type: MIME type declared for the file content.

    Returns:
        The ``document`` field of the process_document response.
    """
    client = get_client()

    with open(file_path, "rb") as source:
        payload = source.read()

    request = docai.ProcessRequest(
        raw_document=docai.RawDocument(content=payload, mime_type=mime_type),
        name=processor.name,
    )
    return client.process_document(request).document
    

In [27]:
# No processor list is passed here, so get_processor() fetches a fresh one.
processor = get_processor(form_parser_display_name)
# Stop here if no processor with that display name exists.
assert processor is not None

# Local PDF to send to the form parser, and the MIME type declared for it.
file_path = "./document-001-115484.out.000.pdf"
mime_type = "application/pdf"

document = process_file(processor, file_path, mime_type)

In [30]:
# Display the document returned by process_file() as the cell output.
document

uri: ""
mime_type: "application/pdf"
text: "SHARP\nGL1F20\nGL1F20\nInfrared Communication\n(IrDA1.0 Compatible)\nInfraredEmitting Diode\nFeatures\nOutline Dimensions\n(Unit: mm)\n1. IrDA1.0 compatible infrared emitting diode\n(Transmission rate : 2.4 to 115.2kbps)\n5.6\n2. Built-in infrared emitting diode circuit\n3. Recommended use in combination with detector (IS1U20)\nR 1.4\nTransparent resin\nMAX. 0.6\nDetector center\nApplications\n1. Personal computers\n1.0\n2. Portable information terminal equipment\n3. Printers\n17.9\n3-0.45\n4. Word processors\n1.27\n1.27\n=\nIrDA Abbreviation of the Infrared Data Association established\nfor standardization of infrared communication specifications\nO\n②\nⒸ\nMIN 0.3\nAbsolute Maximum Ratings\n(Ta=25°C)\nBase\nParameter\nSymbol\nRating\nUnit\nForward current\nIF\n50\nmA\nTerminal configuration\n*1 Peak forward current\nIFM\n400\nmA\n①① Base\n② Emitter\nOperating temperature\nTopr\n10 to + 70\n°C\n③ Anode\nStorage temperature\nI ste\n- 20 to +85