## Importing necessary libraries

In [2]:
import os
from google.cloud import documentai_v1beta2 as documentai
import matplotlib.pyplot as plt
import cv2
import argparse
import io
import json
import numpy
import six
import re
from google.cloud import storage
import pandas as pd

## Setting path to json key

In [3]:
# Set the environment variable GOOGLE_APPLICATION_CREDENTIALS to the path of
# the downloaded JSON key file.
# NOTE(review): this is a hard-coded absolute local path, presumably specific
# to the original author's machine -- TODO confirm and make it configurable.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="/home/affine/GCP/downloaded_key.json"

# 1) Parsing documents containing forms

- Document AI can detect and parse text from PDF, TIFF, GIF files stored in Cloud Storage, including text that contains unstructured data in the form documents.


- Use the process method to request processing of a smaller file (<=5 pages), and the batchProcess method for larger files (files with a large number of pages). The status of batch (asynchronous) requests can be checked using the operations resources.

## a) Small file online processing

- Synchronous ("online") requests target a document with a small number of pages and size (<=5 pages, < 20MB) stored in Cloud Storage. 

- Synchronous requests immediately return a response inline.

### Parsing pdf form

To parse forms:


- Instantiate the document understanding service client.

- Specify the Google Cloud Storage location where the input file will be read from.

- Specify the desired input location and metadata.

- We can improve form parsing results by providing key-value pair hints.

- Set the parameters to control form extraction behavior.

- Request to process one document.

- Use process_document method to send request to API.

For Parsing pdf forms: mime type used is application/pdf

In [1]:
def parse_form_pdf(project_id,input_uri):
    """Extract form fields from a PDF in Cloud Storage and print them.

    Sends one synchronous ``process_document`` request with form
    extraction enabled, prints the returned document, then prints the
    name and value (each with its confidence) of every form field found
    on every page.

    Args:
        project_id: Project id inserted into the request parent path
            ``projects/<project_id>/locations/us``.
        input_uri: URI handed to ``GcsSource`` as the input location.

    Returns:
        None. Everything is written to standard output.
    """
    doc_types = documentai.types
    service = documentai.DocumentUnderstandingServiceClient()

    # The input is declared as a PDF here; the accepted mime types are
    # application/pdf, image/tiff, image/gif and application/json.
    source_config = doc_types.InputConfig(
        gcs_source=doc_types.GcsSource(uri=input_uri),
        mime_type='application/pdf')

    # Key-value pair hints improve form parsing. Each key is text likely
    # to appear in the document as a form field name (i.e. "DOB").
    # Value types are optional and may be one or more of:
    # ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
    # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
    hints = [
        doc_types.KeyValuePairHint(
            key='Emergency Contact', value_types=['NAME']),
        doc_types.KeyValuePairHint(key='Referred By'),
    ]

    # enabled=True switches form extraction on.
    extraction = doc_types.FormExtractionParams(
        enabled=True, key_value_pair_hints=hints)

    # The location segment of the parent can be 'us' or 'eu'.
    process_request = doc_types.ProcessDocumentRequest(
        parent='projects/{}/locations/us'.format(project_id),
        input_config=source_config,
        form_extraction_params=extraction)

    document = service.process_document(request=process_request)
    print('document: \n',document)

    def _anchor_text(layout):
        """Return the document text covered by ``layout``'s text anchor.

        Form fields are located by offsets into ``document.text``; text
        that spans several lines is stored as several segments, so the
        slices of all segments are concatenated in order.
        """
        return ''.join(
            document.text[part.start_index:part.end_index]
            for part in layout.text_anchor.text_segments)

    name_line = 'Field Name: {}\tConfidence: {}'
    value_line = 'Field Value: {}\tConfidence: {}'
    for page in document.pages:
        print('Page number: {}'.format(page.page_number))
        for field in page.form_fields:
            label = field.field_name
            print(name_line.format(_anchor_text(label), label.confidence))
            content = field.field_value
            print(value_line.format(_anchor_text(content), content.confidence))

In [129]:
# Run online form parsing on the sample PDF. parse_form_pdf prints the
# returned document and then each form field name/value with its confidence.

parse_form_pdf("project-001-285307","gs://bucket0406/form.pdf")

document: 
 mime_type: "application/pdf"
text: "FakeDoc M.D.\nHEALTH INTAKE FORM\nPlease fill out the questionnaire carefully. The information you provide will be used to complete\nyour health profile and will be kept confidential.\nDate:\n9/14/19\nName: Sally Walker\nDOB: 09/04/1986\nAddress: 24 Barney Lane City: Tonaco State: NJ Zip: 07082\nEmail: Sally, waller@cmail.com_Phone #:_(906) 917-3486\nGender: _F\nSingle Occupation: Software Engineer\nReferred By: None\nEmergency Contact: Eva Walker Emergency Contact Phone: (906)334-89766\nMarital Status:\nDescribe your medical concerns (symptoms, diagnoses, etc):\nRanny nose, mucas in thoat, weakness,\naches, chills, tired\nAre you currently taking any medication? (If yes, please describe):\nVyvanse (25mg) daily for attention\n"
pages {
  page_number: 1
  dimension {
    width: 418.0
    height: 492.0
    unit: "points"
  }
  layout {
    text_anchor {
      text_segments {
        end_index: 716
      }
    }
    bounding_poly {
      ver

**Output**

Response contains:

- document text

and vertices of :


- pages in document

    - blocks in each page

        - paragraphs in each block

            - lines in each paragraph

                - tokens in each line

    - formfields in each page
    
        - it contains:
        
            - start and end index of fieldnames
            
            - start and end index of fieldvalues
            
Doc AI identifies form fields by their offsets in document text. We convert these offsets to text snippets.

### Parsing gif images

For Parsing gif images: mime type used is image/gif

In [132]:
def parse_form_gif(project_id,
               input_uri):
    """Extract form fields from a GIF image in Cloud Storage and print them.

    Sends one synchronous ``process_document`` request with form
    extraction enabled (no key-value hints), prints the returned
    document, then prints every form field name and value together
    with its confidence, page by page.

    Args:
        project_id: Project id inserted into the request parent path
            ``projects/<project_id>/locations/us``.
        input_uri: URI handed to ``GcsSource`` as the input location.

    Returns:
        None. Everything is written to standard output.
    """
    api = documentai.types
    client = documentai.DocumentUnderstandingServiceClient()

    # Accepted mime types are application/pdf, image/tiff, image/gif
    # and application/json; this function always declares image/gif.
    gif_input = api.InputConfig(
        gcs_source=api.GcsSource(uri=input_uri),
        mime_type='image/gif')

    # enabled=True switches form extraction on.
    form_params = api.FormExtractionParams(enabled=True)

    # The location segment of the parent can be 'us' or 'eu'.
    request = api.ProcessDocumentRequest(
        parent='projects/{}/locations/us'.format(project_id),
        input_config=gif_input,
        form_extraction_params=form_params)

    document = client.process_document(request=request)
    print('document: \n',document)

    def _resolve(element):
        """Turn the offsets in ``element``'s text anchor into a string.

        A field whose text spans several lines is stored as several
        segments; their slices of ``document.text`` are glued together.
        """
        pieces = []
        for span in element.text_anchor.text_segments:
            pieces.append(document.text[span.start_index:span.end_index])
        return ''.join(pieces)

    for page in document.pages:
        print('Page number: {}'.format(page.page_number))
        for form_field in page.form_fields:
            reports = (
                ('Field Name: {}\tConfidence: {}', form_field.field_name),
                ('Field Value: {}\tConfidence: {}', form_field.field_value),
            )
            for template, part in reports:
                print(template.format(_resolve(part), part.confidence))

In [133]:
# Run online form parsing on the sample GIF loan form. parse_form_gif prints
# the returned document and then each form field with its confidence.

parse_form_gif("project-001-285307","gs://bucket0406/loan_form.gif")

document: 
 mime_type: "image/gif"
text: "Loan Agreement Form\nAgreement Number:\n0123456789\nAgreement date:\n01/01/2020\nThis loan agreement is commenced between the parties:\nMortgage company contact details:\nName:\nMortgage company A\nAddress:\n100 Franklin Street, Mountain View, CA, 94035\nPhone number: 1-800-843-8623\n(hereinafter referred to as the lender)\nIndividual details:\nName:\nArjun Patel\nMarital status:\nSingle ,\nMarried O\nOther D\nAddress:\n500 Castro Street, Mountain View, CA 94035\nPhone number: 650-987-0934\n(hereinafter referred to as the borrower)\n[Fill in all details as per instructions]\nThe lender is ready to sanction $\n2000\nas the loan amount at\n6.0\n%.\n[Total loan amount along with the agreed percentage rate).\nThis loan agreement is valid from 01/01/2020 and is ending on 12/31/2020.\nTerms & agreements:\n38.67\nper month for\n5\nyears.\nThe borrower will pay an installment of $\n[Amount & tenure of loan]\nAny late installment will be accepted with $

## b) Large file offline processing

- Asynchronous ("offline") requests target longer documents and allow you to set the number of pages in the output files. Such a request starts a long-running operation. When this operation finishes, it stores the output as a JSON file in a specified Cloud Storage bucket.


- Document AI asynchronous processing accepts PDF, TIFF, GIF files up to 2000 pages. Attempting to process larger files returns an error.


- Use batch_process_documents method to send request to API.

In [40]:
def batch_parse_form(
        project_id,
        input_uri,
        destination_uri):
    """Run batch (asynchronous) form extraction on a PDF and list the output files.

    Submits a single-document batch request with form extraction
    enabled, waits for the long-running operation to finish, then lists
    and prints the names of the blobs in the destination bucket whose
    names start with the destination prefix.

    Args:
        project_id: Project id inserted into the request parent path
            ``projects/<project_id>/locations/us``.
        input_uri: URI handed to ``GcsSource`` as the input location.
        destination_uri: Where results are written, in the form
            ``gs://<bucket>/<prefix>``.

    Returns:
        None. The output blob names are written to standard output.

    Raises:
        ValueError: If ``destination_uri`` is not of the form
            ``gs://<bucket>/<prefix>``. This is checked before the batch
            job is submitted, so a malformed destination fails fast
            instead of after the operation has already run.
    """
    # Split the destination into bucket and object prefix up front; both
    # are needed to list the results once the operation has finished.
    match = re.match(r'gs://([^/]+)/(.+)', destination_uri)
    if match is None:
        raise ValueError(
            'destination_uri must look like gs://<bucket>/<prefix>, '
            'got {!r}'.format(destination_uri))
    output_bucket = match.group(1)
    prefix = match.group(2)

    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type='application/pdf')

    # where to write results
    output_config = documentai.types.OutputConfig(
        gcs_destination=documentai.types.GcsDestination(
            uri=destination_uri),
        pages_per_shard=1  # Map one doc page to one output page
    )

    # Improve form parsing results by providing key-value pair hints.
    # For each key hint, key is text that is likely to appear in the
    # document as a form field name (i.e. "DOB").
    # Value types are optional, but can be one or more of:
    # ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
    # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
    key_value_pair_hints = [
        documentai.types.KeyValuePairHint(
            key='Emergency Contact',
            value_types=['NAME']),
        documentai.types.KeyValuePairHint(
            key='Referred By')
    ]

    # Setting enabled=True enables form extraction
    form_extraction_params = documentai.types.FormExtractionParams(
        enabled=True, key_value_pair_hints=key_value_pair_hints)

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        input_config=input_config,
        output_config=output_config,
        form_extraction_params=form_extraction_params)

    # The batch request carries a list of ProcessDocumentRequest
    # objects; here it holds just the one built above.
    batch_request = documentai.types.BatchProcessDocumentsRequest(
        parent=parent, requests=[request]
    )

    operation = client.batch_process_documents(batch_request)

    # Wait for the operation to finish
    operation.result()

    # Results are written to GCS; list everything under the prefix.
    storage_client = storage.client.Client()
    bucket = storage_client.get_bucket(output_bucket)
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print('Output files:')
    for blob in blob_list:
        print(blob.name)


In [41]:
# Run batch form parsing on the sample PDF. destination_uri is used as a name
# prefix for the results: the output listed below is
# "form_output.pdf-output-page-1-to-1.json".

batch_parse_form(project_id="project-001-285307",input_uri="gs://bucket0406/form.pdf",destination_uri="gs://bucket0406/form_output.pdf")

Output files:
form_output.pdf-output-page-1-to-1.json


It stores output as a JSON file in a specified Cloud Storage bucket.

# 2) Parsing documents containing tables

- Document AI can detect and parse text from PDF, TIFF, GIF files stored in Cloud Storage, including text that contains unstructured data in the form of tables.

- Use the process method to request table detection for a smaller file (<=5 pages), and the batchProcess method for larger files (files with a large number of pages). The status of batch (asynchronous) requests can be checked using the operations resources. Output from a batch request is written to a JSON file created in the specified Cloud Storage bucket.

## a) Small file online processing

### Parsing pdf tables

- For parsing tables in pdf files: mime type used is application/pdf


- We can improve table parsing results by providing bounding boxes specifying where the table appears in the document (optional)


- Created a Dataframe object with the exact structure as the table in form and saved it as csv file.

In [5]:
def parse_table_pdf(project_id,
                input_uri):
    """Parse a form"""

    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type='application/pdf')

    # Improve table parsing results by providing bounding boxes
    # specifying where the box appears in the document (optional)
    table_bound_hints = [
        documentai.types.TableBoundHint(
            page_number=1,
            bounding_box=documentai.types.BoundingPoly(
                # Define a polygon around tables to detect
                # Each vertice coordinate must be a number between 0 and 1
                normalized_vertices=[
                    # Top left
                    documentai.types.geometry.NormalizedVertex(
                        x=0,
                        y=0
                    ),
                    # Top right
                    documentai.types.geometry.NormalizedVertex(
                        x=1,
                        y=0
                    ),
                    # Bottom right
                    documentai.types.geometry.NormalizedVertex(
                        x=1,
                        y=1
                    ),
                    # Bottom left
                    documentai.types.geometry.NormalizedVertex(
                        x=0,
                        y=1
                    )
                ]
            )
        )
    ]

    # Setting enabled=True enables form extraction
    table_extraction_params = documentai.types.TableExtractionParams(
        enabled=True, table_bound_hints=table_bound_hints)

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        parent=parent,
        input_config=input_config,
        table_extraction_params=table_extraction_params)

    document = client.process_document(request=request)
    print('document: \n',document)

    def _get_text(el):
        """Convert text offset indexes into text snippets.
        """
        response = ''
        # If a text segment spans several lines, it will
        # be stored in different text segments.
        for segment in el.text_anchor.text_segments:
            start_index = segment.start_index
            end_index = segment.end_index
            response += document.text[start_index:end_index]
        return response
    

    for page in document.pages:
        #print('Page number: {}'.format(page.page_number))
        for table_num, table in enumerate(page.tables):
            print('Table {}: '.format(table_num))
            for row_num, row in enumerate(table.header_rows):
                cells = '\t'.join(
                    [_get_text(cell.layout) for cell in row.cells])
                print('Header Row {}: {}'.format(row_num, cells))
                columns=[y for y in cells.split('\t')]
                columns=list(map(lambda x:x.strip(),columns))
                df3=pd.DataFrame()
            for row_num, row in enumerate(table.body_rows):
                cells = '\t'.join(
                    [_get_text(cell.layout) for cell in row.cells])
                print('Row {}: {}'.format(row_num, cells))
                l1=(cells.split('\t'))
                l1=list(map(lambda x:x.strip(),l1))
                df3=df3.append([l1],ignore_index=True)
            df3.columns=columns
            print('***********************************************************************')
            print('Dataframe with the exact structure as table in the form: \n\n', df3)
            df3.to_csv('invoice_table.csv')
                        


In [6]:
# Run online table parsing on the sample invoice PDF. parse_table_pdf prints
# the returned document and each detected table, and writes the table to
# invoice_table.csv.
parse_table_pdf("project-001-285307","gs://bucket0406/invoice.pdf")

document: 
 mime_type: "application/pdf"
text: "TERMS: 6 month contract\nDUE: 01/01/2025\nNOTES:\nFROM: Company ABC\nuser@companyabc.com\nADDRESS: 111 Main Street\nAnytown, USA\nItem Description Quantity Price Amount\nTool A 500 $1.00 $500.00\nService B 1 $900.00 $900.00\nResource C 50 $12.00 $600.00\nSupplies used for Project Q.\nTO: John Doe\njohndoe@email.com\nADDRESS: 222 Main Street\nAnytown, USA\nSubtotal $2000.00\nTax $140.00\nBALANCE DUE $2140.00\nDATE: 01/01/1970\nINVOICE: NO. 001\nInvoice\n"
pages {
  page_number: 1
  dimension {
    width: 612.0
    height: 792.0
    unit: "points"
  }
  layout {
    text_anchor {
      text_segments {
        end_index: 435
      }
    }
    confidence: 1.0
    bounding_poly {
      vertices {
      }
      vertices {
        x: 612
      }
      vertices {
        x: 612
        y: 792
      }
      vertices {
        y: 792
      }
      normalized_vertices {
      }
      normalized_vertices {
        x: 1.0
      }
      normalized_vert

### Parsing tiff images

- For parsing tiff images: mime type used is image/tiff


- Created a Dataframe object with the exact structure as the table in form and saved it as csv file.

In [159]:
def parse_table_tiff(project_id,
                input_uri,
                output_csv='invoice1.csv'):
    """Extract tables from a TIFF image in Cloud Storage, print them and save them as CSV.

    Sends one synchronous ``process_document`` request with table
    extraction enabled and then, for every table on every page, prints
    its header and body rows, builds a DataFrame with the same
    structure and writes it to ``output_csv``.

    Args:
        project_id: Project id inserted into the request parent path
            ``projects/<project_id>/locations/us``.
        input_uri: URI handed to ``GcsSource`` as the input location.
        output_csv: Path of the CSV file to write. Every table is
            written to this same path, so when several tables are
            detected only the last one written remains on disk.

    Returns:
        None. Results go to standard output and to ``output_csv``.
    """
    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type='image/tiff')

    # Improve table parsing results by providing bounding boxes
    # specifying where the table appears in the document (optional).
    # Each vertex coordinate must be a number between 0 and 1; the
    # polygon below spans all of page 1.
    table_bound_hints = [
        documentai.types.TableBoundHint(
            page_number=1,
            bounding_box=documentai.types.BoundingPoly(
                normalized_vertices=[
                    documentai.types.geometry.NormalizedVertex(x=x, y=y)
                    # Top left, top right, bottom right, bottom left
                    for x, y in ((0, 0), (1, 0), (1, 1), (0, 1))
                ]
            )
        )
    ]

    # Setting enabled=True enables table extraction
    table_extraction_params = documentai.types.TableExtractionParams(
        enabled=True, table_bound_hints=table_bound_hints)

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        parent=parent,
        input_config=input_config,
        table_extraction_params=table_extraction_params)

    document = client.process_document(request=request)

    def _get_text(el):
        """Convert text offset indexes into text snippets.
        """
        # If a text segment spans several lines, it will
        # be stored in different text segments.
        return ''.join(
            document.text[segment.start_index:segment.end_index]
            for segment in el.text_anchor.text_segments)

    def _row_texts(row):
        """Return the raw text of every cell of a table row, in order."""
        return [_get_text(cell.layout) for cell in row.cells]

    for page in document.pages:
        for table_num, table in enumerate(page.tables):
            print('Table {}: '.format(table_num))

            # Reset per table so a table without header rows neither
            # fails nor inherits the header of a previous table. When
            # there are several header rows the last one names the columns.
            columns = None
            for row_num, row in enumerate(table.header_rows):
                texts = _row_texts(row)
                print('Header Row {}: {}'.format(row_num, '\t'.join(texts)))
                columns = [text.strip() for text in texts]

            # Collect the rows first and build the frame once; appending
            # to a DataFrame row by row is quadratic and DataFrame.append
            # no longer exists in pandas 2.x.
            records = []
            for row_num, row in enumerate(table.body_rows):
                texts = _row_texts(row)
                print('Row {}: {}'.format(row_num, '\t'.join(texts)))
                records.append([text.strip() for text in texts])

            table_df = pd.DataFrame(records, columns=columns)
            print('***********************************************************************')
            print('Dataframe with the exact structure as table in the form: \n\n', table_df)
            table_df.to_csv(output_csv)
                        


In [160]:
# Run online table parsing on the sample TIFF invoice. parse_table_tiff prints
# each detected table and the resulting DataFrame, and writes it to invoice1.csv.
parse_table_tiff("project-001-285307","gs://bucket0406/invoice1.tiff")

Table 0: 
Header Row 0: DESCRIPTION
	UNIT COST
	QTY/HR RATE
	AMOUNT

Row 0: Your item name
	$0
	1
	$0

Row 1: Your item name
	$0
	1
	$0

Row 2: Your item name
	$0
	1
	$0

Row 3: Your item name
	$0
	1
	$0

Row 4: Your item name
	$0
	1
	$0

Row 5: Your item name
	$0
	1
	$0

Row 6: Your item name
	$0
	1
	$0

***********************************************************************
Dataframe with the exact structure as table in the form: 

       DESCRIPTION UNIT COST QTY/HR RATE AMOUNT
0  Your item name        $0           1     $0
1  Your item name        $0           1     $0
2  Your item name        $0           1     $0
3  Your item name        $0           1     $0
4  Your item name        $0           1     $0
5  Your item name        $0           1     $0
6  Your item name        $0           1     $0


## b) Large file offline processing

In [52]:
def batch_parse_table(
        project_id,
        input_uri,
        destination_uri):
    """Run batch (asynchronous) table extraction on a PDF and list the output files.

    Submits a single-document batch request with table extraction
    enabled, waits for the long-running operation to finish, then lists
    and prints the names of the blobs in the destination bucket whose
    names start with the destination prefix.

    Args:
        project_id: Project id inserted into the request parent path
            ``projects/<project_id>/locations/us``.
        input_uri: URI handed to ``GcsSource`` as the input location.
        destination_uri: Where results are written, in the form
            ``gs://<bucket>/<prefix>``.

    Returns:
        None. The output blob names are written to standard output.

    Raises:
        ValueError: If ``destination_uri`` is not of the form
            ``gs://<bucket>/<prefix>``. This is checked before the batch
            job is submitted, so a malformed destination fails fast
            instead of after the operation has already run.
    """
    # Split the destination into bucket and object prefix up front; both
    # are needed to list the results once the operation has finished.
    match = re.match(r'gs://([^/]+)/(.+)', destination_uri)
    if match is None:
        raise ValueError(
            'destination_uri must look like gs://<bucket>/<prefix>, '
            'got {!r}'.format(destination_uri))
    output_bucket = match.group(1)
    prefix = match.group(2)

    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type='application/pdf')

    # where to write results
    output_config = documentai.types.OutputConfig(
        gcs_destination=documentai.types.GcsDestination(
            uri=destination_uri),
        pages_per_shard=1  # Map one doc page to one output page
    )

    # Improve table parsing results by providing bounding boxes
    # specifying where the table appears in the document (optional).
    # Each vertex coordinate must be a number between 0 and 1; the
    # polygon below spans all of page 1.
    table_bound_hints = [
        documentai.types.TableBoundHint(
            page_number=1,
            bounding_box=documentai.types.BoundingPoly(
                normalized_vertices=[
                    documentai.types.geometry.NormalizedVertex(x=x, y=y)
                    # Top left, top right, bottom right, bottom left
                    for x, y in ((0, 0), (1, 0), (1, 1), (0, 1))
                ]
            )
        )
    ]

    # Setting enabled=True enables table extraction
    table_extraction_params = documentai.types.TableExtractionParams(
        enabled=True, table_bound_hints=table_bound_hints)

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        input_config=input_config,
        output_config=output_config,
        table_extraction_params=table_extraction_params)

    # The batch request carries a list of ProcessDocumentRequest
    # objects; here it holds just the one built above.
    batch_request = documentai.types.BatchProcessDocumentsRequest(
        parent=parent, requests=[request]
    )

    operation = client.batch_process_documents(batch_request)

    # Wait for the operation to finish
    operation.result()

    # Results are written to GCS; list everything under the prefix.
    storage_client = storage.client.Client()
    bucket = storage_client.get_bucket(output_bucket)
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

In [53]:
# Run batch table parsing on the sample invoice PDF. destination_uri is used as
# a name prefix for the results: the output listed below is
# "invoice_output.pdf-output-page-1-to-1.json".
batch_parse_table(project_id="project-001-285307",input_uri="gs://bucket0406/invoice.pdf",destination_uri="gs://bucket0406/invoice_output.pdf")

Output files:
invoice_output.pdf-output-page-1-to-1.json


It stores output as a JSON file in a specified Cloud Storage bucket.