DocExtractor GCP:

1. Upload file to GCS (ideally through a front-end dashboard).
2. Fetch the file through the API and send it to a Document AI processor (output format: JSON).
3. Upload JSON output to BigQuery table.

Front-End: 
1. Create dashboard (possibly with React bootstrap)


In [None]:
# imports
import os
from google.cloud import storage
from google.api_core.client_options import ClientOptions
from google.cloud import documentai_v1 as documentai
from google.api_core.exceptions import FailedPrecondition

In [None]:
# Path to the service-account JSON key file on the local machine.
# Create service-account credentials, download the JSON key and point this
# constant at it ('/' is presumably just a placeholder -- TODO replace).
PATH_TO_KEY_JSON = '/'
# Expose the key file path through the credentials environment variable.
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS=PATH_TO_KEY_JSON)

In [None]:
from google.api_core import client_options
# helper functions

## Create Google Cloud Storage Bucket
# Module-level Cloud Storage client; create_bucket and upload_file_to_bucket
# both go through this single instance.
storage_client = storage.Client()
def create_bucket(BUCKET_NAME, location='EU', storage_class='STANDARD'):
    """Create a Cloud Storage bucket and print its details.

    Args:
        BUCKET_NAME: Name of the bucket to create.
        location: Location the bucket is created in. Defaults to ``'EU'``,
            the value this function previously hard-coded.
        storage_class: Storage class for the bucket. Defaults to
            ``'STANDARD'`` (standard frequency access).

    Returns:
        The bucket object returned by ``storage_client.create_bucket``.
    """
    bucket = storage_client.bucket(BUCKET_NAME)
    # Use the documented upper-case storage class constant.
    bucket.storage_class = storage_class
    # A bucket's location can only be chosen at creation time; pass it to
    # create_bucket explicitly rather than assigning the deprecated
    # Bucket.location attribute, which create_bucket may not forward.
    bucket = storage_client.create_bucket(bucket, location=location)
    # print details of the bucket as a dict
    print(vars(bucket))
    return bucket

## Upload files to GCS
def upload_file_to_bucket(blob_name, file_path, BUCKET_NAME):
    """Upload a local file to an existing bucket as a blob.

    Args:
        blob_name: Name the object gets inside the bucket.
        file_path: Path of the local file to upload.
        BUCKET_NAME: Name of the existing bucket to upload into.

    Returns:
        True when the upload completed; False when any exception was
        raised along the way (the exception is printed, not re-raised).
    """
    try:
        # Look up the existing bucket, then push the file as a blob.
        target = storage_client.get_bucket(BUCKET_NAME).blob(blob_name)
        target.upload_from_filename(file_path)
    except Exception as err:
        # Best-effort: report the failure and signal it via the return value.
        print(err)
        return False
    else:
        return True

## Create specialised processor in docAI
def create_processor(project_id, location, processor_name, processor_type):
    """Create a Document AI processor and print its identifying fields.

    Args:
        project_id: Project the processor is created in.
        location: Processor location; also selects the regional API endpoint.
        processor_name: Display name given to the new processor.
        processor_type: Processor type passed as ``type_``.
    """
    # The API endpoint is derived from the requested location.
    endpoint = f"{location}-documentai.googleapis.com"
    docai = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(api_endpoint=endpoint)
    )

    # The parent resource ties the processor to the project and location.
    parent_path = docai.common_location_path(project_id, location)
    spec = documentai.Processor(display_name=processor_name, type_=processor_type)
    created = docai.create_processor(parent=parent_path, processor=spec)

    print(f"Processor Name: {created.name}")
    print(f"Processor Display Name: {created.display_name}")
    print(f"Processor Type: {created.type_}")

## Enable processor
def enable_processor(project_id, location, processor_id):
    """Enable an existing Document AI processor and wait for completion.

    Args:
        project_id: Project that owns the processor.
        location: Processor location; also selects the regional API endpoint.
        processor_id: ID of the processor to enable.

    A ``FailedPrecondition`` error (e.g. the processor is already enabled)
    is caught and its message printed instead of being raised.
    """
    # The API endpoint is derived from the requested location.
    endpoint = f"{location}-documentai.googleapis.com"
    docai = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(api_endpoint=endpoint)
    )
    enable_request = documentai.EnableProcessorRequest(
        name=docai.processor_path(project_id, location, processor_id)
    )

    try:
        # The client also offers disable_processor / delete_processor.
        pending = docai.enable_processor(request=enable_request)
        print(pending.operation.name)
        # Block until the long-running operation has finished.
        pending.result()
    except FailedPrecondition as precondition_error:
        print(precondition_error.message)

## Send processing request
# this function is specific to Optical Character Recognition (OCR) processor
# additional functions and/or logic required to provision specialised processors based on document type eg passports, driving licences, forms or contracts 

def send_proc_req(project_id, location, processor_id, file_path, mime_type):
    """Send a local file to a Document AI processor and print the text.

    Args:
        project_id: Project that owns the processor.
        location: Processor location; also selects the regional API endpoint.
        processor_id: ID of the processor that should handle the document.
        file_path: Path of the local file to process.
        mime_type: MIME type of the file, forwarded to the processor.

    Returns:
        The ``document`` object from the processing response, so callers can
        post-process it further.
    """
    # The API endpoint is derived from the requested location.
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # Fully-qualified resource name of the target processor.
    resource_name = client.processor_path(project_id, location, processor_id)

    # Read the file into memory as raw bytes.
    with open(file_path, 'rb') as image:
        image_content = image.read()

    # Payload message of raw document content (bytes).
    raw_doc = documentai.RawDocument(content=image_content, mime_type=mime_type)
    request = documentai.ProcessRequest(name=resource_name, raw_document=raw_doc)

    result = client.process_document(request=request)

    document_object = result.document
    print('Document proc complete')
    print(f"Text: {document_object.text}")
    return document_object
    

Additional functions required: 

1. functions for provisioning different processor types
2. function to dynamically determine the type of document (could be user-provided)
3. Disable processor after completion of processing
4. function to create a BigQuery table
5. function to upload JSON output to BigQuery table
6. further analytics