In [31]:
from google.cloud import documentai_v1beta3 as documentai
from google.cloud import storage
from datetime import datetime

import re
import os
import pandas as pd
import simplejson as json

In [20]:
PROJECT_ID = "ir-demo-agent-rajatgupta-diqxp"
LOCATION = "us"  # Format is 'us' or 'eu'
PROCESSOR_ID = "d0c6db9d3ac5d9ff"  

GCS_INPUT_BUCKET = 'cloud-samples-data'
GCS_INPUT_PREFIX = 'documentai/async_invoices/'
GCS_OUTPUT_URI = 'gs://united_post/output'
GCS_OUTPUT_URI_PREFIX = 'up'
file1='gs://united_post/input/Invoice # 1.pdf'
file2='gs://united_post/input/Invoice # 2.pdf'
TIMEOUT = 300

In [42]:
import re

from google.cloud import documentai_v1beta3 as documentai, bigquery
from google.cloud import storage
import simplejson as json
import proto
from datetime import datetime

def batch_process_documents(
    project_id,
    location,
    processor_id,
    GCS_INPUT_BUCKET,
    GCS_INPUT_PREFIX,
    GCS_OUTPUT_URI,
    GCS_OUTPUT_URI_PREFIX,
):

    file1='gs://united_post/input/Invoice # 1.pdf'
    file2='gs://united_post/input/Invoice # 2.pdf'

    print("Starting processing:{}".format(datetime.now()))
    client = documentai.DocumentProcessorServiceClient()
    storage_client = storage.Client()
    
    # Sample invoices are stored in gs://cloud-samples-data/documentai/async_invoices/
    blobs = storage_client.list_blobs(GCS_INPUT_BUCKET, prefix=GCS_INPUT_PREFIX)
    input_configs = []
    print("Input Files:")
    #for blob in blobs:
    #    if ".pdf" in blob.name:
    #        source = "gs://{bucket}/{name}".format(bucket = GCS_INPUT_BUCKET, name = blob.name)
    source1 = file1
    print(source1)
    input_config_1 = documentai.types.document_processor_service.BatchProcessRequest.BatchInputConfig(
                gcs_source=source1, mime_type="application/pdf")
    input_config_2 = documentai.types.document_processor_service.BatchProcessRequest.BatchInputConfig(
                gcs_source=file2, mime_type="application/pdf")
    input_configs.append(input_config_1)
    input_configs.append(input_config_2)

    destination_uri = f"{GCS_OUTPUT_URI}/{GCS_OUTPUT_URI_PREFIX}/"

    # Where to write results
    output_config = documentai.types.document_processor_service.BatchProcessRequest.BatchOutputConfig(
        gcs_destination=destination_uri
    )

    name = f"projects/{PROJECT_ID}/locations/{LOCATION}/processors/{PROCESSOR_ID}"
    request = documentai.types.document_processor_service.BatchProcessRequest(
        name=name,
        input_configs=input_configs,
        output_config=output_config,
    )

    operation = client.batch_process_documents(request)

    # Wait for the operation to finish
    operation.result(timeout=TIMEOUT)
    print("Ending processing:{}".format(datetime.now()))
    
    # Results are written to GCS. Use a regex to find output files
    match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
    output_bucket = match.group(1)
    prefix = match.group(2)

    bucket = storage_client.get_bucket(output_bucket)
    blob_list = list(bucket.list_blobs(prefix=prefix))

    for i, blob in enumerate(blob_list):
        # If JSON file, download the contents of this blob as a bytes object.
        if ".json" in blob.name:
            blob_as_bytes = blob.download_as_string()
            print("downloaded")

            document = documentai.types.Document.from_json(blob_as_bytes)
            print(f"Fetched file {i + 1}")

            # Read the entities output from the processor
            types = []
            values = []
            confidence = []
            
            for entity in document.entities:
                types.append(entity.type_)
                values.append(entity.mention_text)
                confidence.append(round(entity.confidence,4))
        
            # Create a Pandas Dataframe to print the values in tabular format. 
            df = pd.DataFrame({'Type': types, 'Value': values, 'Confidence': confidence}) 
            display(df)
                
        else:
            print(f"Skipping non-supported file type {blob.name}")


In [43]:
batch_process_documents(project_id='ir-demo-agent-rajatgupta-diqxp', location='us', processor_id='40a5ffd12acd0b44', GCS_INPUT_BUCKET='united_post',GCS_INPUT_PREFIX='input/',GCS_OUTPUT_URI='gs://united_post/output',GCS_OUTPUT_URI_PREFIX='up')

Starting processing:2021-04-20 03:12:59.859262
Input Files:
gs://united_post/input/Invoice # 1.pdf
Ending processing:2021-04-20 03:14:57.378464
downloaded
Fetched file 1


Unnamed: 0,Type,Value,Confidence
0,invoice_date,"Apr 5, 2021",0.9917
1,due_date,"Apr 30, 2021",0.9847
2,total_amount,2562.00,0.9789
3,receiver_name,General Services,0.9699
4,supplier_name,ABC Corporation,0.9647
5,payment_terms,30 days,0.9617
6,supplier_address,"123 Fox Street,\nSilver Spring, MD, 20398",0.9285
7,ship_to_name,ABC Corporation,0.9099
8,receiver_address,"1800 F Street, NW\nWashington, DC\n20405",0.732
9,currency,$,0.5982


downloaded
Fetched file 2


Unnamed: 0,Type,Value,Confidence
0,invoice_date,"Apr 13, 2020",0.9919
1,due_date,"Apr 30, 2020",0.9852
2,total_amount,4788.00,0.9751
3,payment_terms,30,0.97
4,total_tax_amount,$228.00,0.9369
5,net_amount,4560.00,0.9194
6,receiver_name,FDA,0.9193
7,supplier_address,"123 Spring Street, MA, 02101",0.9132
8,supplier_name,ABC Supplier,0.8331
9,ship_to_name,FDA,0.7834
