# Extract information from an invoice using the AWS Textract service


In [None]:
from PIL import Image

# Load the sample invoice from a path relative to the current working
# directory. `image` is reused by the cells that follow.
invoice_file = "./documents/sample-invoice-1.jpg"
image = Image.open(invoice_file)

# Alternative when not running in a notebook: open the image in the system
# viewer instead of rendering it inline.
#image.show()

# Render the invoice inline (display() is presumably the notebook builtin).
display(image)

In [None]:
from textractor import Textractor

# Textractor client bound to the us-east-1 region.
extractor = Textractor(region_name="us-east-1")

# Run expense analysis on the in-memory invoice image. The result is kept in
# `document`, whose `.response` is read by the cells that follow.
document = extractor.analyze_expense(
    file_source=image,
    s3_output_path=None,
    save_image=True,
)

In [None]:
from PIL import ImageDraw

def draw_bounding_box(key, val, width, height, draw, is_label=False):
    """Outline one detected region on the image.

    Nothing is drawn unless ``key`` contains ``"Geometry"``. Otherwise
    ``val["BoundingBox"]`` supplies ``Left``/``Top``/``Width``/``Height``
    ratios, which are scaled by ``width`` and ``height`` to pixel
    coordinates and drawn as a 1-pixel rectangle.

    Args:
        key: Name of the detection entry being visited.
        val: Value stored under ``key``; must hold a ``"BoundingBox"``
            mapping when ``key`` is a geometry entry.
        width: Image width in pixels.
        height: Image height in pixels.
        draw: Drawing object exposing ``rectangle(xy, outline=, width=)``.
        is_label: Outline in red when truthy, blue otherwise.
    """
    # Only geometry entries carry a bounding box.
    if "Geometry" not in key:
        return

    outline_color = 'red' if is_label else 'blue'

    bbox = val["BoundingBox"]
    x0 = width * bbox['Left']
    y0 = height * bbox['Top']
    x1 = x0 + (width * bbox['Width'])
    y1 = y0 + (height * bbox['Height'])

    draw.rectangle([x0, y0, x1, y1], outline=outline_color, width=1)
        
# Convert the invoice to RGBA and create a drawing object for it. The pixel
# width and height are passed along with every box so its coordinates can be
# scaled to the image.
image = image.convert('RGBA')
width, height = image.size
draw = ImageDraw.Draw(image)

for expense_doc in document.response["ExpenseDocuments"]:
    # Line-item values use the default outline color. draw_bounding_box
    # ignores every entry that is not a geometry entry, so no pre-filtering
    # is needed here.
    for line_item_group in expense_doc["LineItemGroups"]:
        for line_items in line_item_group["LineItems"]:
            for expense_fields in line_items["LineItemExpenseFields"]:
                for key, val in expense_fields["ValueDetection"].items():
                    draw_bounding_box(key, val, width, height, draw)

    # Summary fields belong to the expense document, not to a line-item
    # group, so they are walked once per document. Nesting this loop inside
    # the group loop would skip them for documents without line-item groups.
    for label in expense_doc["SummaryFields"]:
        # Labels are flagged so they get the label outline color.
        for key, val in label.get("LabelDetection", {}).items():
            draw_bounding_box(key, val, width, height, draw, True)

        for key, val in label.get("ValueDetection", {}).items():
            draw_bounding_box(key, val, width, height, draw)

# Display the annotated image once, after every document has been processed.
display(image)
#image.show()

In [None]:
from textractprettyprinter import Pretty_Print_Table_Format
from textractprettyprinter.t_pretty_print_expense import get_string, Textract_Expense_Pretty_Print

# Format the summary fields and line-item groups of the Textract response
# using the fancy_grid table format, then print the result.
pretty_string = get_string(
    textract_json=document.response,
    output_type=[
        Textract_Expense_Pretty_Print.SUMMARY,
        Textract_Expense_Pretty_Print.LINEITEMGROUPS,
    ],
    table_format=Pretty_Print_Table_Format.fancy_grid,
)

print(f"pretty_string=\n{pretty_string}")


In [None]:
# Format the same summary fields and line-item groups again, this time using
# the csv table format, then print the result.
pretty_string = get_string(
    textract_json=document.response,
    output_type=[
        Textract_Expense_Pretty_Print.SUMMARY,
        Textract_Expense_Pretty_Print.LINEITEMGROUPS,
    ],
    table_format=Pretty_Print_Table_Format.csv,
)

print(f"pretty_string=\n{pretty_string}")

In [None]:
from trp.trp2_expense import TAnalyzeExpenseDocumentSchema

def convert_summary_to_dict(exp_doc):
    """Flatten the summary fields of one expense document into a dict.

    The key is the field's type text. When the field's first group property
    lists at least one type, that type is prepended with a dash (for example
    ``"VENDOR-NAME"``). The value is the text of the field's value detection,
    or ``""`` when the field has none. Fields without a type text are
    skipped. If several fields produce the same key, the last one wins.

    Each stored pair is also printed as a progress trace.

    Args:
        exp_doc: Object exposing ``summaryfields``, an iterable of fields
            with ``ftype``, ``group_properties`` and ``valuedetection``
            attributes. ``summaryfields`` may be ``None``.

    Returns:
        dict mapping field keys to their detected text.
    """
    summary_dict = {}

    # Treat a missing field list like an empty one.
    for field in exp_doc.summaryfields or []:
        # Guard ftype itself, not only its text, so an untyped field is
        # skipped instead of raising AttributeError (the line-item converter
        # applies the same check).
        if not (field.ftype and field.ftype.text):
            continue

        t_key = field.ftype.text
        groups = field.group_properties
        if groups and groups[0].types:
            # Prefix with the group type to tell apart fields that share a
            # type text.
            t_key = groups[0].types[0] + "-" + t_key

        t_value = field.valuedetection.text if field.valuedetection else ""

        print(f"adding to summary: {t_key}={t_value}")
        summary_dict[t_key] = t_value

    return summary_dict


def convert_lineitems_to_dict(exp_doc, *, skip_expense_row=True):
    """Flatten the line items of one expense document into a list of dicts.

    Every line item of every line-item group becomes one dict that maps each
    cell's type text to the text of its value detection (``""`` when the cell
    has none). Cells without a type text are ignored. A row is appended even
    when it ends up with no cells. If a row repeats a type, the last cell
    wins.

    Each stored cell and each finished row is also printed as a progress
    trace.

    Args:
        exp_doc: Object exposing ``lineitemgroups``; each group exposes
            ``lineitems`` and each line item ``lineitem_expensefields``.
            Any of these may be ``None``.
        skip_expense_row: When true (the default), cells typed
            ``EXPENSE_ROW`` are left out of the row dicts.

    Returns:
        list of dicts, one per line item, in document order.
    """
    rows_list = []

    # Treat missing groups, rows or cells like empty lists.
    for exp_table in exp_doc.lineitemgroups or []:
        for row in exp_table.lineitems or []:
            li_dict = {}
            for cell in row.lineitem_expensefields or []:
                t_key = cell.ftype.text if cell.ftype else ""
                if not t_key:
                    continue
                if skip_expense_row and t_key == 'EXPENSE_ROW':
                    continue

                t_value = cell.valuedetection.text if cell.valuedetection else ""

                print(f"adding line item cell: {t_key}={t_value}")
                li_dict[t_key] = t_value

            print(f"adding line item: {li_dict}")
            rows_list.append(li_dict)

    return rows_list


# Parse only when the response holds at least one expense document. The
# length check targets the "ExpenseDocuments" list; the length of the
# response itself is already covered by the truthiness test.
if (
    document.response
    and "ExpenseDocuments" in document.response
    and len(document.response["ExpenseDocuments"]) > 0
):
    t_doc = TAnalyzeExpenseDocumentSchema().load(document.response)

    # Both result names are rebound on every iteration, so after the loop
    # they hold the values of the last expense document.
    for exp_doc in t_doc.expenses_documents:
        summary_fields_dict = convert_summary_to_dict(exp_doc)
        print(summary_fields_dict)

        li_fields_dict = convert_lineitems_to_dict(exp_doc)
        print(li_fields_dict)

In [None]:
# The summary dict only has keys for fields that were present in the
# response, so look each one up with a fallback instead of raising KeyError
# on the first missing field.
print(f"Vendor Name: {summary_fields_dict.get('VENDOR-NAME', 'N/A')}")
print(f"Vendor Address: {summary_fields_dict.get('VENDOR-ADDRESS_BLOCK', 'N/A')}")
print(f"Sub-Total: {summary_fields_dict.get('SUBTOTAL', 'N/A')}")
print(f"TAX: {summary_fields_dict.get('TAX', 'N/A')}")
print(f"Total: {summary_fields_dict.get('TOTAL', 'N/A')}")