# A playground for Amazon Textract

Make sure your default AWS credentials are properly defined. On Linux, the credentials file lives in `~/.aws/credentials`.

More information is available here: `https://docs.aws.amazon.com/textract/latest/dg/setup-awscli-sdk.html`.
Make sure your user has the `AmazonTextractFullAccess` permission.

In [1]:
# Name of the S3 bucket that holds both the input images and the results.
s3_bucket = 'datarock-textract-test'
# Key prefix ("directory") inside the bucket that the input images are read from.
input_dir = 'data'
# Key prefix ("directory") inside the bucket that annotated images are written to.
results_dir = 'results'

In [2]:
import boto3
import cv2
import numpy as np

In [3]:
# Get the list of documents stored under s3://<s3_bucket>/<input_dir>/
import boto3

s3 = boto3.resource('s3')
my_bucket = s3.Bucket(s3_bucket)
key_prefix = input_dir + '/'
img_names = []
for object_summary in my_bucket.objects.filter(Prefix=key_prefix):
    # The listing is filtered on key_prefix, so every key starts with it.
    # Slice the prefix off rather than using str.replace(), which would also
    # delete later occurrences of it (e.g. 'data/metadata/x.jpg' -> 'metax.jpg').
    file_name = object_summary.key[len(key_prefix):]
    # The placeholder object for the "directory" itself has an empty name; skip it.
    if file_name:
        img_names.append(file_name)

print(f'{len(img_names)} files found in s3://{s3_bucket}/{input_dir} directory')

7 files found in s3://datarock-textract-test/data directory


Methods in the Textract client that return synchronous results:

analyze_document(): Analyzes an input document for relationships between detected items. It returns key value sets or tables, and can be used for human in the loop.

detect_document_text(): Returns geometry (polygon and bbox), text and confidence

The rest of the methods allow asynchronous calls to Textract.

List of blocktypes:
- PAGE: Contains a list of the LINE Block objects that are detected on a document page.

- WORD: A word detected on a document page. A word is one or more ISO basic Latin script characters that aren't separated by spaces.

- LINE: A string of tab-delimited, contiguous words that are detected on a document page. 


In text analysis operations, the following types are returned:

- PAGE: Contains a list of child Block objects that are detected on a document page.

- KEY_VALUE_SET: Stores the KEY and VALUE Block objects for linked text that's detected on a document page. Use the EntityType field to determine if a KEY_VALUE_SET object is a KEY Block object or a VALUE Block object.

- WORD: A word that's detected on a document page. A word is one or more ISO basic Latin script characters that aren't separated by spaces.

- LINE: A string of tab-delimited, contiguous words that are detected on a document page.

- TABLE: A table that's detected on a document page. A table is grid-based information with two or more rows or columns, with a cell span of one row and one column each.

- CELL: A cell within a detected table. The cell is the parent of the block that contains the text in the cell.

- SELECTION_ELEMENT: A selection element such as an option button (radio button) or a check box that's detected on a document page. Use the value of SelectionStatus to determine the status of the selection element.

In [4]:
# Run synchronous text detection on every image and keep the returned blocks,
# keyed by image name.
textract = boto3.client('textract')
responses = {}
for img_name in img_names:
    # Textract reads the image straight from S3; no local download is needed here.
    s3_location = {'Bucket': s3_bucket, 'Name': input_dir + '/' + img_name}
    response = textract.detect_document_text(Document={'S3Object': s3_location})
    responses[img_name] = response['Blocks']


In [5]:
# Let's have a look at one of the results: the list of Textract blocks
# stored for the first image.
responses[img_names[0]]

[{'BlockType': 'PAGE',
  'Geometry': {'BoundingBox': {'Width': 0.9691512584686279,
    'Height': 0.9673818945884705,
    'Left': 0.0046663773246109486,
    'Top': 0.010241251438856125},
   'Polygon': [{'X': 0.0057899258099496365, 'Y': 0.010241251438856125},
    {'X': 0.958276093006134, 'Y': 0.028770117089152336},
    {'X': 0.9738176465034485, 'Y': 0.9776231646537781},
    {'X': 0.0046663773246109486, 'Y': 0.9748016595840454}]},
  'Id': '1023f678-e300-4f3c-8f27-6e481d74bee9',
  'Relationships': [{'Type': 'CHILD',
    'Ids': ['cc807979-dace-4aea-91a4-217e81079f53',
     '157271f0-8a9f-491b-a2bb-0c3b7ad54791']}]},
 {'BlockType': 'LINE',
  'Confidence': 96.54811096191406,
  'Text': 'TRAY 142',
  'Geometry': {'BoundingBox': {'Width': 0.09914443641901016,
    'Height': 0.020945116877555847,
    'Left': 0.16762809455394745,
    'Top': 0.07973362505435944},
   'Polygon': [{'X': 0.16762809455394745, 'Y': 0.07973362505435944},
    {'X': 0.26670292019844055, 'Y': 0.08154790848493576},
    {'X': 0

In [6]:
# #analyze document
# response=textract.analyze_document(Document={
#      'S3Object': {
#             'Bucket': s3_bucket,
#             'Name': input_dir+'/'+img_names[0]
#         }},
#     FeatureTypes=['TABLES','FORMS']
# )['Blocks']
# response
## The results are the same as detect_document_text for rock images 

In [7]:
# Let's draw the detected text on each image and save the annotated image to the results directory

In [8]:
# Resource-style handle to S3.
s3 = boto3.resource('s3')
# Client-style handle to S3; this is what gets passed to the download/upload helpers.
s3_client = boto3.client('s3')
def download_into_memory(s3_client,bucket_name,file_url):
    """Fetch an S3 object and return its full content without touching disk.

    Args:
        s3_client: Object exposing ``get_object(Bucket=..., Key=...)``,
            e.g. a boto3 S3 client.
        bucket_name: Name of the bucket that holds the object.
        file_url: Key of the object inside the bucket.

    Returns:
        Whatever ``read()`` on the response ``'Body'`` yields, i.e. the raw
        content of the object.
    """
    body_stream = s3_client.get_object(Bucket=bucket_name, Key=file_url)['Body']
    return body_stream.read()
def convert_to_img(img_content):
    """Decode raw encoded image bytes into an OpenCV color image.

    Args:
        img_content: Encoded image file content (e.g. the bytes of a JPEG).

    Returns:
        The result of ``cv2.imdecode`` with ``cv2.IMREAD_COLOR``.
    """
    # imdecode expects the encoded file as a flat uint8 buffer.
    raw_buffer = np.asarray(bytearray(img_content), dtype="uint8")
    return cv2.imdecode(raw_buffer, cv2.IMREAD_COLOR)

def save_img_to_s3_bucket(img,s3_client,bucket_name,file_url):
    """Encode an image as JPEG and upload it to S3.

    Args:
        img: Image array accepted by ``cv2.imencode``.
        s3_client: Object exposing ``put_object(...)``, e.g. a boto3 S3 client.
        bucket_name: Name of the destination bucket.
        file_url: Key to store the encoded image under.

    Raises:
        ValueError: If OpenCV reports that the image could not be encoded.
    """
    encoded_ok, jpeg_buffer = cv2.imencode('.jpg', img)
    if not encoded_ok:
        # Do not upload an invalid payload when encoding fails.
        raise ValueError(f'could not encode image as JPEG for key {file_url!r}')
    # ndarray.tostring() is a deprecated alias of tobytes(); use tobytes() directly.
    img_bytes = jpeg_buffer.tobytes()
    s3_client.put_object(Bucket=bucket_name, Key=file_url, Body=img_bytes, ContentType='image/jpeg')

In [9]:
def get_color_codes():
    """Return the drawing color assigned to each Textract block type.

    Returns:
        A new dict mapping block type name to a color tuple in BGR order.
    """
    return {
        'PAGE': (255, 0, 0),  # blue
        'LINE': (0, 255, 0),  # green
        'WORD': (0, 0, 255),  # red
    }

In [14]:
# Font used when writing the detected text onto the images.
font = cv2.FONT_HERSHEY_SIMPLEX

def convert_bbox_to_coords(img_size,bbox):
    """Turn a relative bounding box into integer pixel corner coordinates.

    Args:
        img_size: Sequence whose first item is the image height and whose
            second item is the image width (e.g. ``img.shape``).
        bbox: Mapping with ``'Left'``, ``'Top'``, ``'Width'`` and ``'Height'``
            given as fractions of the image size.

    Returns:
        ``((x_start, y_start), (x_end, y_end))`` as integer pixel positions.
    """
    img_height, img_width = img_size[0], img_size[1]

    # Horizontal extent; the far edge is measured from the truncated near edge.
    left = int(img_width * bbox['Left'])
    right = int(left + img_width * bbox['Width'])

    # Vertical extent, computed the same way.
    top = int(img_height * bbox['Top'])
    bottom = int(top + img_height * bbox['Height'])

    top_left = (left, top)
    bottom_right = (right, bottom)
    return top_left, bottom_right

def draw_on_top_of_img(img,textract_results,color_codes):
    """Outline every text-bearing Textract block on the image and label it.

    Args:
        img: OpenCV image to draw on.
        textract_results: Iterable of Textract block dicts.
        color_codes: Mapping of block type name to drawing color.

    Returns:
        The image with a rectangle and the detected text drawn for each block
        that carries a ``'Text'`` entry.
    """
    for block in textract_results:
        # Blocks without text (such as the PAGE block) are not drawn.
        if 'Text' not in block:
            continue
        block_color = color_codes[block['BlockType']]
        top_left, bottom_right = convert_bbox_to_coords(img.shape, block['Geometry']['BoundingBox'])
        img = cv2.rectangle(img, top_left, bottom_right, block_color, thickness=2)
        # Offset the label from the top-left corner of the rectangle.
        label_origin = (top_left[0] + 10, top_left[1] + 30)
        cv2.putText(img, block['Text'], label_origin, font, 1, block_color, 2, cv2.LINE_AA)
    return img

In [16]:
# Annotate every image with its Textract results and upload the annotated copy
# to <results_dir>/ under the same file name as the input image.
color_codes = get_color_codes()
for img_name in img_names:
    print(img_name)
    img_content = download_into_memory(s3_client, s3_bucket, input_dir + '/' + img_name)
    img = convert_to_img(img_content)
    resp = responses[img_name]
    img = draw_on_top_of_img(img, resp, color_codes)
    # Key the upload on the current image name. Using img_names[0] here made
    # every iteration overwrite the same result object.
    save_img_to_s3_bucket(img, s3_client, s3_bucket, results_dir + '/' + img_name)

CE002_142+143_469.20_476.20.jpg
UE108_005+006_13.90_20.95.JPG
UE108_013+014_42.15_48.85.JPG
UE108_021+022_69.25_76.05.JPG
UE108_029+030_97.15_104.15.JPG
UE108_063+064_217.25_224.00.JPG
UE108_089+090_308.00_314.95.JPG
