<a href="https://colab.research.google.com/github/sammyamajumdar/GoogleCloudPlatform-tests/blob/main/docExtractor_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
# Register the path to the json key file under GOOGLE_APPLICATION_CREDENTIALS.
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS='/content/aerial-passage-363110-d2e4c5a5de14.json')

In [None]:
# Install client libraries.
# One %pip call targets the running kernel's environment and installs each
# package once (google-cloud-storage was previously installed twice).
%pip install --upgrade google-cloud-storage google-cloud-documentai

In [None]:
# imports
import os

import pandas as pd
from google.api_core.client_options import ClientOptions
from google.api_core.exceptions import Conflict, FailedPrecondition
from google.cloud import documentai_v1 as documentai
from google.cloud import storage

In [None]:
# constants
PROJECT_ID = "aerial-passage-363110"  # project passed to the helper calls
LOCATION = "eu"                       # location passed to the helper calls
MIME_TYPE = "application/pdf"         # MIME type of the document to process

## GOOGLE CLOUD STORAGE HELPER FUNCTIONS

In [None]:
## Create Google Cloud Storage Bucket
# Module-level storage client used by the storage helper functions;
# it is constructed with no explicit arguments.
storage_client = storage.Client()
def create_bucket(BUCKET_NAME):
    """Create a STANDARD-class bucket in the EU location, or reuse an existing one.

    Args:
        BUCKET_NAME: Name of the bucket to create.

    Returns:
        The bucket object handed back by the storage client: the newly
        created bucket, or the already-existing one when the create call
        reports a conflict.
    """
    bucket = storage_client.bucket(BUCKET_NAME)
    # standard frequency access class
    bucket.storage_class = 'STANDARD'
    try:
        # Pass the location to the create call itself; assigning
        # ``bucket.location`` beforehand is deprecated in google-cloud-storage.
        bucket = storage_client.create_bucket(bucket, location='EU')
    except Conflict:
        # Re-running the notebook must not fail just because an earlier run
        # already created the bucket.
        print(f"Bucket '{BUCKET_NAME}' already exists; reusing it.")
        bucket = storage_client.get_bucket(BUCKET_NAME)
    # print details of the bucket as a dict
    print(vars(bucket))
    return bucket


## Upload files to GCS
def upload_file_to_bucket(blob_name, file_path, BUCKET_NAME):
    """Upload a local file into an existing bucket as a blob.

    Args:
        blob_name: Name the blob gets inside the bucket.
        file_path: Path of the local file to upload.
        BUCKET_NAME: Name of the existing bucket to upload into.

    Returns:
        True when the upload call completes. False when any exception is
        raised on the way; the exception is printed, not re-raised.
    """
    try:
        # look up the existing bucket, then upload the file as a blob
        target_bucket = storage_client.get_bucket(BUCKET_NAME)
        target_bucket.blob(blob_name).upload_from_filename(file_path)
    except Exception as err:
        print(err)
        return False
    return True

In [None]:
# Create the demo bucket, then upload the sample PDF into it.
# The upload call stays last so its True/False result is shown as cell output.
demo_bucket = 'tester-docai-bucket01'
create_bucket(demo_bucket)
upload_file_to_bucket(
    blob_name='passport_pdf_majumdar',
    file_path='/content/my_PASSPORT.pdf',
    BUCKET_NAME=demo_bucket,
)

  


{'name': 'tester-docai-bucket01', '_properties': {'kind': 'storage#bucket', 'selfLink': 'https://www.googleapis.com/storage/v1/b/tester-docai-bucket01', 'id': 'tester-docai-bucket01', 'name': 'tester-docai-bucket01', 'projectNumber': '621384936610', 'metageneration': '1', 'location': 'EU', 'storageClass': 'STANDARD', 'etag': 'CAE=', 'timeCreated': '2022-09-24T14:45:35.367Z', 'updated': '2022-09-24T14:45:35.367Z', 'iamConfiguration': {'bucketPolicyOnly': {'enabled': False}, 'uniformBucketLevelAccess': {'enabled': False}, 'publicAccessPrevention': 'inherited'}, 'locationType': 'multi-region', 'rpo': 'DEFAULT'}, '_changes': set(), '_client': <google.cloud.storage.client.Client object at 0x7fef9b6be950>, '_acl': <google.cloud.storage.acl.BucketACL object at 0x7fef9b6beb50>, '_default_object_acl': <google.cloud.storage.acl.DefaultObjectACL object at 0x7fef9b6bea90>, '_label_removals': set(), '_user_project': None}


True

In [None]:
# gs:// URI made of the bucket name and blob name used for the upload.
GCS_INPUT_URI = "gs://tester-docai-bucket01/passport_pdf_majumdar"

## DOCUMENT AI HELPER FUNCTIONS

In [None]:
## Create specialised processor in docAI
def create_processor(project_id, location, processor_name, processor_type):
    """Create a Document AI processor and print its identifying fields.

    Args:
        project_id: Project the processor is created under.
        location: Location of the processor; also selects the API endpoint
            (``{location}-documentai.googleapis.com``).
        processor_name: Display name given to the new processor.
        processor_type: Processor type to create.

    Returns:
        None. The processor's name, display name and type are printed.
    """
    docai = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    )

    # the processor is created under the project/location resource path
    location_path = docai.common_location_path(project_id, location)
    wanted = documentai.Processor(display_name=processor_name, type_=processor_type)
    created = docai.create_processor(parent=location_path, processor=wanted)

    print(f"Processor Name: {created.name}")
    print(f"Processor Display Name: {created.display_name}")
    print(f"Processor Type: {created.type_}")

## Enable processor
def enable_processor(project_id, location, processor_id):
    """Request that an existing Document AI processor be enabled.

    Prints the returned operation's name and then calls ``result()`` on it.
    A ``FailedPrecondition`` error (raised, for example, when the processor
    is already enabled) is caught and its message printed.

    Args:
        project_id: Project that owns the processor.
        location: Location of the processor; also selects the API endpoint
            (``{location}-documentai.googleapis.com``).
        processor_id: ID of the processor to enable.
    """
    docai = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    )
    enable_request = documentai.EnableProcessorRequest(
        name=docai.processor_path(project_id, location, processor_id)
    )

    try:
        # counterpart calls on the client: disable_processor / delete_processor
        pending = docai.enable_processor(request=enable_request)
        print(pending.operation.name)
        pending.result()
    except FailedPrecondition as err:
        print(err.message)

## Print list of available processors
def fetch_processor_types_sample(project_id, location):
    """Print the processor types that are allowed to be created.

    Args:
        project_id: Project to query.
        location: Location to query; also selects the API endpoint
            (``{location}-documentai.googleapis.com``).
    """
    docai = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    )
    listing = docai.fetch_processor_types(
        parent=docai.common_location_path(project_id, location)
    )

    print("Processor types:")
    # only types flagged allow_creation are listed
    creatable = (entry.type_ for entry in listing.processor_types if entry.allow_creation)
    for type_name in creatable:
        print(type_name)


## Send the document for processing
def send_processing_req(project_id, location, processor_id, file_path, mime_type, GCS_INPUT_URI):
    """Send a local document to a Document AI processor and return its text.

    Args:
        project_id: Project that owns the processor.
        location: Location of the processor; also selects the API endpoint
            (``{location}-documentai.googleapis.com``).
        processor_id: ID of the processor to run.
        file_path: Path of the local file whose bytes are sent for processing.
        mime_type: MIME type of the file at ``file_path``.
        GCS_INPUT_URI: Unused. Kept so existing callers keep working; the
            document is sent inline from ``file_path``.

    Returns:
        The ``text`` attribute of the processed document.
    """
    docai_client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(api_endpoint=f'{location}-documentai.googleapis.com')
    )

    resource_name = docai_client.processor_path(project_id, location, processor_id)

    # load file into memory
    with open(file_path, 'rb') as document_file:
        document_bytes = document_file.read()

    # The raw document must carry the file's bytes (previously the gs:// URI
    # string was sent as the content), and the caller's mime_type rather
    # than the module-level MIME_TYPE constant.
    raw_doc = documentai.RawDocument(content=document_bytes, mime_type=mime_type)
    request = documentai.ProcessRequest(name=resource_name, raw_document=raw_doc)

    result = docai_client.process_document(request=request)

    document_text = result.document.text
    print('Document processing complete')
    print(document_text)

    return document_text

In [None]:
# List the processor types that can be created for this project and location.
fetch_processor_types_sample(project_id=PROJECT_ID, location=LOCATION)

Processor types:
INVOICE_PROCESSOR
FORM_PARSER_PROCESSOR
OCR_PROCESSOR
FORM_W9_PROCESSOR
EXPENSE_PROCESSOR
US_DRIVER_LICENSE_PROCESSOR
US_PASSPORT_PROCESSOR


In [None]:
# Each run of this cell issues a new create-processor request.
create_processor(
    project_id=PROJECT_ID,
    location=LOCATION,
    processor_name='passportExtractor',
    processor_type='US_PASSPORT_PROCESSOR',
)

Processor Name: projects/621384936610/locations/eu/processors/7f978d52f8bf030
Processor Display Name: passportExtractor
Processor Type: US_PASSPORT_PROCESSOR


In [None]:
# The processor ID is the last path segment of the "Processor Name"
# printed when the processor was created.
enable_processor(project_id=PROJECT_ID, location=LOCATION, processor_id='7f978d52f8bf030')

Processor state cannot be changed to 'ENABLING' since it is 'ENABLED'.


In [None]:
FILE_PATH = "/content/my_PASSPORT.pdf"  # local copy of the document to process
PROCESSOR_ID = "7f978d52f8bf030"        # ID of the processor to send it to

In [None]:
# Run the document through the processor; `ans` holds the returned text.
ans = send_processing_req(
    project_id=PROJECT_ID,
    location=LOCATION,
    processor_id=PROCESSOR_ID,
    file_path=FILE_PATH,
    mime_type=MIME_TYPE,
    GCS_INPUT_URI=GCS_INPUT_URI,
)

In [None]:
# Split the returned text into a list of lines. The guard makes the cell
# safe to re-run: once `ans` is already a list, a second run is a no-op
# instead of raising AttributeError on `list.split`.
if isinstance(ans, str):
    ans = ans.split("\n")

In [None]:
# `res` is a second name for the same object as `ans`, not a copy.
res = ans

In [None]:
# Print every line next to its position in `res`. enumerate() yields the
# true position even when the same text appears more than once, whereas
# res.index(item) always reported the first occurrence.
for position, item in enumerate(res):
    print(item, position)

In [None]:
def format_documentai_output(doc_object):
    """Label selected positions of ``doc_object`` with passport field names.

    Args:
        doc_object: Indexable object; the positions listed below are read
            from it (the highest is 29).

    Returns:
        A dict mapping each field label to ``doc_object[position]``, in the
        order the labels are listed below.

    Raises:
        Whatever ``doc_object[position]`` raises for a missing position
        (``IndexError`` for a list that is too short).
    """
    # (label, position) pairs; the positions are fixed
    field_positions = (
        ('CountryOfOrigin', 0),
        ('Type', 2),
        ('Surname', 4),
        ('GivenName', 10),
        ('FullName', 29),
        ('CountryCode', 6),
        ('Passport No', 8),
        ('Sex', 15),
        ('DataOfBirth', 16),
        ('PlaceOfBirth', 18),
        ('PlaceOfIssue', 21),
        ('DateOfIssue', 24),
        ('DateOfExpiry', 25),
    )
    return {label: doc_object[position] for label, position in field_positions}

In [None]:
# Label the extracted lines with their field names.
letssee = format_documentai_output(doc_object=ans)

In [None]:
# Show the labelled fields as the cell output.
letssee