# Document AI Asynchronous API
This notebook shows you how use Python to make asynchronous calls to the Document AI API

You must replace the `processor_id` variable value in the the second cell with the appropriate value for the Processor ID of the Document AI processor that you want to use. The processor may not support all of the Document AI output properties. Entity data is only returned by processors that use specialized parsers for example. 

In [157]:
# Import Libraries
from google.cloud import documentai_v1beta3 as documentai
from google.cloud import storage
from prettytable import PrettyTable

import re
import os
import pandas as pd


In [158]:
# Set your Processor ID
processor_id = "ec4393b8709d61f4"  # TODO: Replace with a valid Processor ID 

In [159]:
# Set your variables
project_id = %system gcloud config get-value core/project
project_id = project_id[0]
location = 'us'           # Replace with 'eu' if processor does not use 'us' location
gcs_input_bucket  = project_id+"_trial"   # Bucket name only, no gs:// prefix
gcs_input_prefix  = "input/"                     # Input bucket folder e.g. input/
gcs_output_bucket = project_id+"_trial"   # Bucket name only, no gs:// prefix
gcs_output_prefix = "output/"                    # Input bucket folder e.g. output/
timeout = 300

In [160]:
# Define Google Cloud client objects
client_options = {"api_endpoint": "{}-documentai.googleapis.com".format(location)}
client = documentai.DocumentProcessorServiceClient(client_options=client_options)
storage_client = storage.Client()

In [416]:
# Create input configuration
blobs = storage_client.list_blobs(gcs_input_bucket, prefix=gcs_input_prefix)
input_configs = []
print("Input Files:")
print(blobs)
print(type(blobs))

for blob in blobs:
    if ".pdf" in blob.name:
        source = "gs://{bucket}/{name}".format(bucket = gcs_input_bucket, name = blob.name)
        #blob.download_to_file(source)
        #print(type(source))
        #print(blob.name)
        
        #self_extract(blob.name)
        
        
        input_config = documentai.types.document_processor_service.BatchProcessRequest.BatchInputConfig(
            gcs_source=source, mime_type="application/pdf"
        )
        input_configs.append(input_config)


Input Files:
<google.api_core.page_iterator.HTTPIterator object at 0x7f1f6aba5b10>
<class 'google.api_core.page_iterator.HTTPIterator'>


In [54]:
# Create output configuration
destination_uri = f"gs://{gcs_output_bucket}/{gcs_output_prefix}"
output_config = documentai.types.document_processor_service.BatchProcessRequest.BatchOutputConfig(
    gcs_destination=destination_uri
)

In [162]:
# Create the Document AI API request
name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
request = documentai.types.document_processor_service.BatchProcessRequest(
    name=name,
    input_configs=input_configs,
    output_config=output_config,
)


In [163]:
# Start the batch (asynchronous) API operation 
operation = client.batch_process_documents(request)
# Wait for the operation to finish
operation.result(timeout=timeout)
print ("Batch process  completed.")

Batch process  completed.


In [164]:
# Fetch list of output files
match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
output_bucket = match.group(1)
prefix = match.group(2)
bucket = storage_client.get_bucket(output_bucket)
blob_list = list(bucket.list_blobs(prefix=prefix))

In [383]:
# Display detected text from asynchronous output JSON files
jsno=[]
for i, blob in enumerate(blob_list):
    # If JSON file, download the contents of this blob as a bytes object.
    if ".json" in blob.name:
        blob_as_bytes = blob.download_as_bytes()
        document = documentai.types.Document.from_json(blob_as_bytes)
        print(f"Fetched file {i + 1}:{blob.name}")
        #print the text data output from the processor
        print(f"Text Data:\n {document.text}")
        #jsno.append(extract(document.text,blob.name))
        #print(f"Type of document text: {type(document.text)}")
    else:
        print(f"Skipping non-supported file type {blob.name}")

Fetched file 1:output/12614758114985928433/0/2022_06_06_Kroepil Joann-0.json
Text Data:
 WORKERS' COMPENSATION APPEALS BOARD
STATE OF CALIFORNIA
JOANN KROEPIL, Applicant
VS.
VONS, Administered by ALBERTSONS HOLDINGS, Defendants
Adjudication Number: ADJ6977398
Santa Ana District Office
OPINION AND ORDER
DISMISSING PETITION FOR
RECONSIDERATION
Applicant, who is representing herself. Seeks reconsideration of a March 15, 2022 Minute
Order continuing a mandatory settlement conference (MSC). We have considered the allegations
of the Petition for Reconsideration and the contents of the report of the workers' compensation
administrative law judge (WCJ) with respect thereto. Based on our review of the record, the
petition seeks reconsideration of a non-final order and will be dismissed.
An MSC is a pre-trial hearing. If the March 15, 2022 MSC had been completed, the parties
would have filled out a pre-trial conference statement and listed the exhibits they intended to
submit at trial. Exhibits 

In [48]:
# Display entity data from asynchronous output JSON files
for i, blob in enumerate(blob_list):
    # If JSON file, download the contents of this blob as a bytes object.
    if ".json" in blob.name:
        blob_as_bytes = blob.download_as_bytes()
        document = documentai.types.Document.from_json(blob_as_bytes)
        print(f"Fetched file {i + 1}:{blob.name}")
        # print the entity data output from the processor
        if 'entities' in dir(document):
            entities=document.entities
            table = PrettyTable(['Type', 'Value', 'Confidence'])
            entities_found = 0
            for entity in entities:
               entity_type = entity.type_
               value = entity.mention_text
               confidence = round(entity.confidence,4)
               table.add_row([entity_type, value, confidence])
            print(table)   
        else:
            print('No entity data returned by the Document AI processor for file'+blob.name)
    else:
        print(f"Skipping non-supported file type {blob.name}")

Fetched file 1:output/16435294507375990355/0/2022_06_06_Kroepil Joann-0.json
+------+-------+------------+
| Type | Value | Confidence |
+------+-------+------------+
+------+-------+------------+
Fetched file 2:output/16435294507375990355/1/2022_06_06_Oliver Davis-0.json
+------+-------+------------+
| Type | Value | Confidence |
+------+-------+------------+
+------+-------+------------+
Fetched file 3:output/16435294507375990355/10/2022_06_15_Jackson Cristina-0.json
+------+-------+------------+
| Type | Value | Confidence |
+------+-------+------------+
+------+-------+------------+
Fetched file 4:output/16435294507375990355/11/2022_06_17_Maghakyan Ruzanna-0.json
+------+-------+------------+
| Type | Value | Confidence |
+------+-------+------------+
+------+-------+------------+
Fetched file 5:output/16435294507375990355/12/2022_06_17_Melendez Edil David-0.json
+------+-------+------------+
| Type | Value | Confidence |
+------+-------+------------+
+------+-------+------------+


In [379]:
jsonify(jsno)

In [56]:
content='''
 WORKERS' COMPENSATION APPEALS BOARD
STATE OF CALIFORNIA
DAVID PEREZ, Applicant
VS.
TRI-STATE EMPLOYMENT SERVICES, INC.; CALIFORNIA INSURANCE
GUARANTEE ASSOCIATION by its servicing facility SEDGWICK CMS for
LUMBERMEN'S UNDERWRITING ALLIANCE; CONAGRA FOODS, INC.,
permissibly self-insured, Defendants
Adjudication Number: ADJ9554288
Pomona District Office
OPINION AND DECISION
AFTER RECONSIDERATION
We granted reconsideration to further study the factual and legal issues in this case. This
is our Opinion and Decision After Reconsideration.
As an initial matter, Labor Code section 5909 provides that a petition for reconsideration
is deemed denied unless the Appeals Board acts on the petition within 60 days of filing. (Lab.
Code, §5909.) However, “it is a fundamental principle of due process that a party may not be
deprived of a substantial right without notice...." (Shipley v. Workers' Comp. Appeals Bd. (1992)
7 Cal. App.4th 1104, 1108 [57 Cal.Comp.Cases 493].) In Shipley, the Appeals Board denied
applicant's petition for reconsideration because the Appeals Board had not acted on the petition
within the statutory time limits of section 5909. The Appeals Board did not act on defendant's
petition because it had misplaced the file, through no fault of the parties. The Court of Appeal
reversed the Appeals Board's decision holding that the time to act on defendant's petition was tolled
during the period that the file was misplaced. (Id.) Considering that defendant timely sought
reconsideration and its petition did not come to the attention of the Appeals Board until after the
time to act on the petition had passed, we find that our time to act is tolled.
We have reviewed the record in this matter. The arbitrator filed a Report and
Recommendation on Petition for Reconsideration (Report) recommending that the petition be
denied. For the reasons discussed by the arbitrator in his Report, which we adopt and incorporate
by reference and for the reasons discussed below, we will affirm the November 24, 2020 Findings
and Award.
CIGA's liability is specifically defined in Insurance Code section 1063.1. (Ins. Code, §
1063.1.) "[C]overed claims" under section 1063.1 “are not coextensive with an insolvent insurer's
obligations under its policies." (Industrial Indemnity Co. v. Workers' Comp. Appeals Bd. (Garcia)
(1997) 60 Cal.App.4th 548, 557 [62 Cal.Comp.Cases 1661].) Insurance Code section
1063.1(c)(5)(A) states: "Covered claims' does not include an obligation to insurers, insurance
pools, or underwriting associations, nor their claims for contribution, indemnity, or subrogation,
equitable or otherwise, except as otherwise provided in this chapter.” (Ins. Code, § 1063.1
(c)(5)(A).) As one Court of Appeal explained, this subsection "excludes obligations to insurers
from the category of 'covered claims." (California Ins. Guarantee Assn. v. Argonaut Ins. Co.
(1991) 227 Cal.App.3d. 624, 636 [56 Cal.Comp.Cases 104].) Citing Burnsed v State Board of
Control (1987) 189 Cal.App.3d 213, 217 [“From the earliest days of statehood the courts have
interpreted ‘any' to be broad, general, and all embracing"], another Court of Appeal noted:
"[S]ubdivision (c)(5) excludes not only claims for contribution, indemnity, or subrogation but also
'any obligations to insurers. [citation] (California Ins. Guarantee Assn. v. Workers' Comp. Appeals
Bd. (Hooten) (2005) 128 Cal. App.4th 569, 573 [70 Cal.Comp.Cases 551].)
Pursuant to Division four of the Labor Code, the term "insurer" specifically denotes
permissibly self-insured employers. Labor Code, section 3211, provides:
'Insurer' includes the State Compensation Insurance Fund and any private
company, corporation, mutual association, reciprocal or interinsurance
exchange authorized under the laws of this State to insure employers against
liability for compensation and any employer to whom a certificate of consent to
self-insure has been issued.
It is well established that a jointly and severally liable self-insured employer is “other
insurance" for purposes of relieving CIGA of liability. (Denny's Inc. v. Workers' Comp. Appeals
Bd. (Bachman) (2003) 104 Cal.App.4th 1433 [68 Cal.Comp.Cases 1].) Therefore, we will affirm
the Findings and Award.
2
IT IS ORDERED, as the Decision After Reconsideration of the Workers' Compensation
Appeals Board, that the November 24, 2020 Findings and Award is AFFIRMED.
I CONCUR,
WORKERS' COMPENSATION APPEALS BOARD
/s/ MARGUERITE SWEENEY, COMMISSIONER
/s/ KATHERINE A. ZALEWSKI, CHAIR
/s/ JOSÉ H. RAZO. COMMISSIONER
DATED AND FILED AT SAN FRANCISCO, CALIFORNIA
June 14, 2022
MWH/00
DAVID PEREZ
LAW OFFICES OF RICHARD WOOLEY
ALTMAN & BLITSTEIN
PATRICO HERMANSON & GUZMAN
LAW OFFICES OF MARK POLAN
3
COMPENSATION
WORKERS
✰
SUREKA
ALIFORNI
SEAL
SERVICE MADE ON THE ABOVE DATE ON THE PERSONS LISTED BELOW AT
THEIR ADDRESSES SHOWN ON THE CURRENT OFFICIAL ADDRESS RECORD.
NAPPEALS
I certify that I affixed the official
seal of the Workers' Compensation
Appeals Board to this original
decision on this date. 0.0
BOARD
REPORT AND RECOMMENDATION ON PETITION FOR RECONSIDERATION
INTRODUCTION
There is one case involved in this proceeding. It is a specific injury of January 20, 2014.
Defendant Conagra Foods filed a timely and verified Petition for Reconsideration on December
21, 2020.
Defendant California Insurance Guarantee Association (CIGA) filed an Answer to Petition
for Reconsideration timely and verified on January 5, 2021.
Defendant Conagra Foods seeks reconsideration on the grounds that the decision of the
arbitrator does not justify the Findings of Fact and the Findings of Fact do not support the Findings
and Award. They also allege the arbitrator acted without or in excess of his powers.
At the arbitration, the parties agreed that Conagra Foods was the special employer for the
applicant. The applicant was performing duties under the supervision, control, and direction of the
special employer Conagra Foods.
Conagra, a self-insured employer contends they are not considered other insurance and that
is the reason why the arbitration decision should be overturned.
CIGA in their Answer to Petition for Reconsideration stated Conagra is other insurance
within the meaning of insurance Code Section 1063.lc (9) and that CIGA has no liability in this
matter.
STATEMENT OF FACTS
Defendant Conagra foods has filed a timely and verified Petition for Reconsideration on
December 21, 2020.
Defendant CIGA filed a timely and verified Answer to Petition for Reconsideration on
January 5, 2021.
This case involves a specific injury on January 20, 2014.
The sole issue is coverage.
Pursuant to agreement at the arbitration Conagra Foods was the special employer for the
applicant on the date of injury.
Conagra contends there is no mechanism for a self-insured employer to obtain an
endorsement from the insurance commissioner excluding employees working at Conagra as a
special employer.
4
CIGA contends there is joint and several liability between the special employer Conagra
and the general employer CIGA. CIGA contends that self-insurance is other insurance within the
meaning of Insurance Code Section 1063.1©(9). Therefore, they allege they have no liability for
this claim as CIGA is not in fact an insurance company and if there is other insurance the claims
administration and coverage liability would be placed upon Conagra the special employer.
DISCUSSION
Conagra in their Petition for Reconsideration states they have fully complied with the laws
related to self-insurance which limit coverage to its own employees and excludes coverage for
employees of other companies. They admit they were not able to obtain an exclusionary
endorsement which would have denied coverage where they were found to be the special
employer.
Conagra admits there is no mechanism for a self-insured employer to obtain an
endorsement from the insurance commissioner excluding employees working at Conagra as a
special employer.
For matters of coverage there is no distinction between a private insurance company and a
self-insured employer. Both are state mandated policies to cover all California workers with
workers compensation insurance.
Conagra Foods did not have an exclusionary policy form filed with the state which would
exclude any temporary staffing employees who worked at Conagra.
Due to the fact there is admitted special employment CIGA was relieved of the
responsibility for coverage as there is other insurance under Insurance Code Section 1063.10 (9).
RECOMMENDATION
For the reasons stated above, it is recommended the Petition for Reconsideration dated
December 21, 2020 be denied.
Dated: January 13, 2021
5
Mark S. Polan
Workers Compensation Arbitrator
'''


In [376]:
#Extraction of values/important values

def extract(content,filename):
    l=[]
    import re
    defendant=''
    regex=r"VS."
    regex2=r"Defendants"
    match = re.search(regex, content)
    match2=re.search(regex2,content)
    app_regex=r"^ Applicant"
    pdf_extract=r"/[0-9].*"
    matches = re.search(pdf_extract, filename)
    pdf=matches.group(0)
    identifier='input'+pdf
    
    #print(match.group(0))
    
    match_app = re.search(app_regex, content)
    if match_app is None:
        applicant=''
    else:
        applicant=match_app.group(0).split(',')[0]
    if match is not None:
        if match2 is not None:
            defendant=content[match.end():match2.start()]
    else:
        defendant=''
        
        
         
    
    
    code_reg=r"section [0-9]+"
    code = re.findall(code_reg, content)
    
    class Output:
        def __init__(self, defendant, applicant, code,filename):
            self.defendant = defendant
            self.applicant = applicant
            self.code = code
            self.filename=filename
    out=Output(defendant,applicant,code,identifier)
    import json
    output_str=json.dumps(out.__dict__)
    #convert string to  object
    data = json.loads(output_str)
    return data
    #print(data)
    
    
    
    

In [317]:
type(extract(content))

NoneType

In [378]:
def jsonify(l):
    import json
    with open('data-final-pranjal-improvd.json', 'a', encoding='utf-8') as f:
        json.dump(l, f, ensure_ascii=False, indent=4)
    

In [222]:
testing='''

 WORKERS' COMPENSATION APPEALS BOARD
STATE OF CALIFORNIA
VS.
TRI-STATE EMPLOYMENT SERVICES, INC.; CALIFORNIA INSURANCE
GUARANTEE ASSOCIATION by its servicing facility SEDGWICK CMS for
LUMBERMEN'S UNDERWRITING ALLIANCE; CONAGRA FOODS, INC.,
permissibly self-insured, Defendants
Adjudication Number: ADJ9554288
Pomona District Office
OPINION AND DECISION
AFTER RECONSIDERATION
We granted reconsideration to further study the factual and legal issues in this case. This
is our Opinion and Decision After Reconsideration.
As an initial matter, Labor Code section 5909 provides that a petition for reconsideration
is deemed denied un
'''

In [389]:
pdf=[]
blobs = storage_client.list_blobs(gcs_input_bucket, prefix=gcs_input_prefix)
input_configs = []
print("Input Files:")
print(blobs)

for blob in blobs:
    
    f=open("tes.v1.txt",'wb+')
    f.write(blob.download_as_bytes())
    f.close()
    text_file=open("tes.v1.txt","rb")
    data = text_file.read()
    #print(data)
    text_file.close()
    class Output1:
        def __init__(self,data,identifier):
            self.data = data
            self.identifier=identifier
    out=Output1(str(data),blob.name)
    import json
    output_str=json.dumps(out.__dict__)
    #convert string to  object
    data = json.loads(output_str)
    pdf.append(data)
    break
    
    
    

Input Files:
<google.api_core.page_iterator.HTTPIterator object at 0x7f1f6ac67d90>


'\nfor blob in blobs:\n    \n    f=open("tes.v1.txt",\'wb+\')\n    f.write(blob.download_as_bytes())\n    f.close()\n    text_file=open("tes.v1.txt","rb")\n    data = text_file.read()\n    #print(data)\n    text_file.close()\n    class Output1:\n        def __init__(self,data,identifier):\n            self.data = data\n            self.identifier=identifier\n    out=Output1(str(data),blob.name)\n    import json\n    output_str=json.dumps(out.__dict__)\n    #convert string to  object\n    data = json.loads(output_str)\n    pdf.append(data)\n    break\n    \n    '

In [353]:
def jsonify_pdf(l):
    import json
    with open('data-pdf.json', 'a', encoding='utf-8') as f:
        json.dump(l, f, ensure_ascii=False, indent=4)

In [355]:
jsonify_pdf(pdf)

In [369]:
test='''gs://daas-try_trial/input/2022_06_28_Solano Efren Lara.pdf'''
pdf_extract=r"/[0-9].*"
match = re.search(pdf_extract, test)
applicant=match.group(0)
print('input'+applicant)


input/2022_06_28_Solano Efren Lara.pdf


Pdf Extraction self

In [406]:
import PyPDF2
def self_extract(file):
    
    f=open(file,'rb')
#pdfFileReader() reads the text  form the pdf
    pdf_reader = PyPDF2.PdfFileReader(f) 
#the following lines of code will output the number of pages of the pdf
    pdf_reader.numPages
#getPage()reads the text of a specific page. Here the parameter 0 indicates the first page of the pdf
    page_one = pdf_reader.getPage(0)
    page_one_text = page_one.extractText()
    return page_one_text

In [391]:
pdf=[]
blobs = storage_client.list_blobs(gcs_input_bucket, prefix=gcs_input_prefix)
input_configs = []
#destination_file_name='test
print("Input Files:")
print(blobs)
for blob in blobs:
    print(blob)
    break

Input Files:
<google.api_core.page_iterator.HTTPIterator object at 0x7f1f6a779ed0>
<Blob: daas-try_trial, input/2022_06_06_Kroepil Joann.pdf, 1661239113236498>


Keras Processor