In [23]:
import pandas as pd
import boto3
import io
from PIL import Image, ImageDraw
import json

# to print out all the outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [24]:
def draw_bounding_box(key, val, width, height, draw):
    # If a key is Geometry, draw the bounding box info in it
    if "Geometry" in key:
        # Draw bounding box information
        box = val["BoundingBox"]
        left = width * box['Left']
        top = height * box['Top']
        draw.rectangle([left, top, left + (width * box['Width']), top + (height * box['Height'])],
                       outline='black')
                       
# Takes a field as an argument and prints out the detected labels and values
def print_labels_and_values(field):
    # Only if labels are detected and returned
    if "LabelDetection" in field:
        print("Summary Label Detection - Confidence: {}".format(
            str(field.get("LabelDetection")["Confidence"])) + ", "
              + "Summary Values: {}".format(str(field.get("LabelDetection")["Text"])))
#         print(field.get("LabelDetection")["Geometry"])
    else:
        print("Label Detection - No labels returned.")
    if "ValueDetection" in field:
        print("Summary Value Detection - Confidence: {}".format(
            str(field.get("ValueDetection")["Confidence"])) + ", "
              + "Summary Values: {}".format(str(field.get("ValueDetection")["Text"])))
#         print(field.get("ValueDetection")["Geometry"])
    else:
        print("Value Detection - No values returned")

def process_text_detection(bucket, document):
    # Get the document from S3
    s3_connection = boto3.resource('s3')
    s3_object = s3_connection.Object(bucket, document)
    s3_response = s3_object.get()

    # opening binary stream using an in-memory bytes buffer
    stream = io.BytesIO(s3_response['Body'].read())

    # loading stream into image
    image = Image.open(stream)

    # Detect text in the document
    client = boto3.client('textract', region_name="us-east-1")

    # process using S3 object
    response = client.analyze_expense(
        Document={'S3Object': {'Bucket': bucket, 'Name': document}})

    # Set width and height to display image and draw bounding boxes
    # Create drawing object
    width, height = image.size
    draw = ImageDraw.Draw(image)

    for expense_doc in response["ExpenseDocuments"]:
        for line_item_group in expense_doc["LineItemGroups"]:
            for line_items in line_item_group["LineItems"]:
                for expense_fields in line_items["LineItemExpenseFields"]:
                    print_labels_and_values(expense_fields)
                    print()

        print("Summary:")
        for summary_field in expense_doc["SummaryFields"]:
            print_labels_and_values(summary_field)
            print()

        #For draw bounding boxes
        for line_item_group in expense_doc["LineItemGroups"]:
            for line_items in line_item_group["LineItems"]:
                for expense_fields in line_items["LineItemExpenseFields"]:
                    for key, val in expense_fields["ValueDetection"].items():
                        if "Geometry" in key:
                            draw_bounding_box(key, val, width, height, draw)

        for label in expense_doc["SummaryFields"]:
            if "LabelDetection" in label:
                for key, val in label["LabelDetection"].items():
                    draw_bounding_box(key, val, width, height, draw)

    # Display the image
    image.save("invoice_img.jpg" + document)
    
    return response


In [25]:
bucket = 'aws-textract-text-invoice'
document = 'invoice_img.jpg'

In [26]:
bucket

'aws-textract-text-invoice'

In [28]:
stuff = process_text_detection(bucket, document)

Label Detection - No labels returned.
Summary Value Detection - Confidence: 99.9510498046875, Summary Values: 0.00

Label Detection - No labels returned.
Summary Value Detection - Confidence: 99.92350006103516, Summary Values: 0.00

Label Detection - No labels returned.
Summary Value Detection - Confidence: 99.94314575195312, Summary Values: 0.00

Label Detection - No labels returned.
Summary Value Detection - Confidence: 98.36853790283203, Summary Values: 0.00

Label Detection - No labels returned.
Summary Value Detection - Confidence: 99.93843078613281, Summary Values: 0.00

Label Detection - No labels returned.
Summary Value Detection - Confidence: 99.2732925415039, Summary Values: 0.00

Label Detection - No labels returned.
Summary Value Detection - Confidence: 99.9324951171875, Summary Values: 0.00

Label Detection - No labels returned.
Summary Value Detection - Confidence: 99.81150817871094, Summary Values: 0.00

Label Detection - No labels returned.
Summary Value Detection - Con

In [34]:
for expense_doc in stuff["ExpenseDocuments"]:
    for summary_field in expense_doc["SummaryFields"]:
        summary_field
        confidence = summary_field.get("LabelDetection")["Confidence"]
        if confidence < 90:
            if "LabelDetection" in summary_field:
                print("{}:".format(str(summary_field.get("LabelDetection")["Text"])))
            else:
                print("Label Detection - No labels returned.")
            if "ValueDetection" in summary_field:
                print("{}".format(str(summary_field.get("ValueDetection")["Text"])) + " - {}".format(str(summary_field.get("LabelDetection")["Confidence"])))
            else:
                print("Value Detection - No values returned")
            print()

{'Type': {'Text': 'ADDRESS', 'Confidence': 94.5084457397461},
 'LabelDetection': {'Text': 'BILL TO:',
  'Geometry': {'BoundingBox': {'Width': 0.08982668817043304,
    'Height': 0.013093486428260803,
    'Left': 0.021910514682531357,
    'Top': 0.21478794515132904},
   'Polygon': [{'X': 0.021910514682531357, 'Y': 0.21478794515132904},
    {'X': 0.1117371991276741, 'Y': 0.21478794515132904},
    {'X': 0.1117371991276741, 'Y': 0.22788143157958984},
    {'X': 0.021910514682531357, 'Y': 0.22788143157958984}]},
  'Confidence': 94.3837661743164},
 'ValueDetection': {'Text': 'ATTN: Name / Dept\nCompany Name\n123 Main Street\nUSA\nContact No.\nEmail Address',
  'Geometry': {'BoundingBox': {'Width': 0.17385926842689514,
    'Height': 0.11326257884502411,
    'Left': 0.02040134370326996,
    'Top': 0.23601984977722168},
   'Polygon': [{'X': 0.02040134370326996, 'Y': 0.23601984977722168},
    {'X': 0.1942606121301651, 'Y': 0.23601984977722168},
    {'X': 0.1942606121301651, 'Y': 0.349282443523407}

{'Type': {'Text': 'STREET', 'Confidence': 94.5084457397461},
 'ValueDetection': {'Text': '123 Main Street',
  'Geometry': {'BoundingBox': {'Width': 0.13773296773433685,
    'Height': 0.013055354356765747,
    'Left': 0.021726027131080627,
    'Top': 0.2759999930858612},
   'Polygon': [{'X': 0.021726027131080627, 'Y': 0.2759999930858612},
    {'X': 0.15945899486541748, 'Y': 0.2759999930858612},
    {'X': 0.15945899486541748, 'Y': 0.28905534744262695},
    {'X': 0.021726027131080627, 'Y': 0.28905534744262695}]},
  'Confidence': 99.94744873046875},
 'PageNumber': 1,
 'GroupProperties': [{'Types': ['RECEIVER_BILL_TO'],
   'Id': 'efe7d3f3-4937-4346-9f4e-c886710950ea'}]}

TypeError: 'NoneType' object is not subscriptable