## Teste Análise - Formulário 01

In [2]:
# Libs
import boto3
import pandas as pd
import os

In [3]:
# Parameters
# Credentials are read from the environment instead of being written into the
# notebook, so they never end up in version control or in shared copies.
# os.getenv returns None for an unset variable; per the boto3 documentation,
# passing None for both keys makes boto3.client fall back to its default
# credential lookup (shared config files, instance roles, ...).
ACCESS_KEY = os.getenv('AWS_ACCESS_KEY_ID')
SECRET_KEY = os.getenv('AWS_SECRET_ACCESS_KEY')


In [4]:
# Sanity check: report only WHETHER the access key ID is configured.
# Displaying the value itself would store credential material in the
# notebook's saved output.
bool(os.getenv('AWS_ACCESS_KEY_ID'))

In [5]:
# AWS Boto3 client declaration - Textract
# Connection settings are gathered in one mapping and unpacked into the call.
textract_options = {
    'region_name': 'us-west-2',
    'aws_access_key_id': ACCESS_KEY,
    'aws_secret_access_key': SECRET_KEY,
}
client = boto3.client('textract', **textract_options)

In [7]:
# Read the image
FORM_IMG = '../data/images/form-sample-01/form-01.png'

# The `with` statement closes the file when the block exits, so no explicit
# close() call is needed.
with open(FORM_IMG, 'rb') as document:
    img = bytearray(document.read())

In [8]:
# >> Call Amazon Textract
# Request both TABLES and FORMS: the table cells below read TABLE/CELL blocks,
# and the key-value cells further down read KEY_VALUE_SET blocks, which
# Textract only returns when the FORMS feature is requested. Asking for
# TABLES alone leaves the key-value section empty on a fresh top-to-bottom run.
response = client.analyze_document(
    Document={'Bytes': img},
    FeatureTypes=["TABLES", "FORMS"]
)

In [None]:
# Print the text of every LINE block, wrapped in ANSI escape codes
# (\033[94m ... \033[0m) so it is coloured in the cell output.
line_blocks = (blk for blk in response["Blocks"] if blk["BlockType"] == "LINE")
for line_block in line_blocks:
    print('\033[94m' + line_block["Text"] + '\033[0m')

In [None]:
# List the Id and type of every TABLE block found in the response.
for table_block in response["Blocks"]:
    if table_block["BlockType"] != "TABLE":
        continue
    print(table_block['Id'], table_block['BlockType'])
    # A full dump of the block via DisplayBlockInformation is disabled here.

## https://github.com/awsdocs/amazon-textract-developer-guide/blob/master/doc_source/analyzing-document-text.md

In [22]:
def ShowBoundingBox(draw,box,width,height,boxColor):
    """Outline a bounding box on a drawing surface.

    `box` holds 'Left', 'Top', 'Width' and 'Height' values that are multiplied
    by `width` / `height` to obtain drawing coordinates. `draw` must expose a
    `rectangle(xy, outline=...)` method, which receives the corner coordinates
    as a four-element list and `boxColor` as the outline.
    """
    x0 = width * box['Left']
    y0 = height * box['Top']
    x1 = x0 + (width * box['Width'])
    y1 = y0 + (height * box['Height'])
    corners = [x0, y0, x1, y1]
    draw.rectangle(corners, outline=boxColor)

def ShowSelectedElement(draw,box,width,height,boxColor):
    """Draw a filled rectangle over a bounding box.

    `box` holds 'Left', 'Top', 'Width' and 'Height' values that are multiplied
    by `width` / `height` to obtain drawing coordinates. `draw` must expose a
    `rectangle(xy, fill=...)` method, which receives the corner coordinates as
    a four-element list and `boxColor` as the fill.
    """
    x0 = width * box['Left']
    y0 = height * box['Top']
    x1 = x0 + (width * box['Width'])
    y1 = y0 + (height * box['Height'])
    corners = [x0, y0, x1, y1]
    draw.rectangle(corners, fill=boxColor)

# Displays information about a block returned by text detection and text analysis
def DisplayBlockInformation(block):
    """Print a human-readable summary of one block dictionary.

    Always printed: 'Id', 'BlockType' and the 'Geometry' bounding box and
    polygon (these keys must be present). Printed when present: 'Text',
    'Confidence', 'Relationships' and 'Page'. Extra details are printed for
    CELL, KEY_VALUE_SET and SELECTION_ELEMENT blocks.

    Args:
        block: a dict describing a single block of the analysis response.

    Raises:
        KeyError: if a required key for the block's type is missing.
    """
    print('Id: {}'.format(block['Id']))
    if 'Text' in block:
        print('    Detected: ' + block['Text'])
    print('    Type: ' + block['BlockType'])

    if 'Confidence' in block:
        print('    Confidence: ' + "{:.2f}".format(block['Confidence']) + "%")

    if block['BlockType'] == 'CELL':
        print("    Cell information")
        print("        Column:" + str(block['ColumnIndex']))
        print("        Row:" + str(block['RowIndex']))
        print("        Column Span:" + str(block['ColumnSpan']))
        # Report the row span itself (this line used to repeat ColumnSpan).
        print("        RowSpan:" + str(block['RowSpan']))

    if 'Relationships' in block:
        print('    Relationships: {}'.format(block['Relationships']))
    print('    Geometry: ')
    print('        Bounding Box: {}'.format(block['Geometry']['BoundingBox']))
    print('        Polygon: {}'.format(block['Geometry']['Polygon']))

    if block['BlockType'] == "KEY_VALUE_SET":
        print ('    Entity Type: ' + block['EntityTypes'][0])

    if block['BlockType'] == 'SELECTION_ELEMENT':
        print('    Selection element detected: ', end='')

        if block['SelectionStatus'] =='SELECTED':
            print('Selected')
        else:
            print('Not selected')

    if 'Page' in block:
        # The page number is an integer, so convert it before concatenating;
        # plain `str + int` raises TypeError.
        print('Page: ' + str(block['Page']))
    print()

## https://github.com/awsdocs/amazon-textract-developer-guide/blob/master/doc_source/analyzing-document-text.md

In [11]:
def map_blocks(blocks, block_type):
    """Index the blocks of one type by their Id.

    Args:
        blocks: iterable of block dicts, each with 'Id' and 'BlockType' keys.
        block_type: the 'BlockType' value to keep.

    Returns:
        A dict mapping block Id to block, containing only blocks whose
        'BlockType' equals `block_type`, in input order.
    """
    by_id = {}
    for entry in blocks:
        if entry['BlockType'] == block_type:
            by_id[entry['Id']] = entry
    return by_id

def get_children_ids(block):
    """Yield the Ids listed in every CHILD relationship of `block`.

    Relationships of any other type are skipped; a block with no
    'Relationships' key yields nothing.
    """
    for link in block.get('Relationships', []):
        if link['Type'] != 'CHILD':
            continue
        for child_id in link['Ids']:
            yield child_id


In [12]:

# Build one Id -> block lookup per block type used by the table extraction.
blocks = response['Blocks']
tables, cells, words, selections = (
    map_blocks(blocks, wanted_type)
    for wanted_type in ('TABLE', 'CELL', 'WORD', 'SELECTION_ELEMENT')
)

In [101]:

# Convert every detected table into a pandas DataFrame.
dataframes = []

for table in tables.values():

    # Determine all the cells that belong to this table
    table_cells = [cells[cell_id] for cell_id in get_children_ids(table)]

    # A table without cells has no grid to size (max() raises ValueError on an
    # empty sequence). Append an empty frame so that positions in `dataframes`
    # still line up with the iteration order of `tables`.
    if not table_cells:
        dataframes.append(pd.DataFrame())
        continue

    # Determine the table's number of rows and columns
    n_rows = max(cell['RowIndex'] for cell in table_cells)
    n_cols = max(cell['ColumnIndex'] for cell in table_cells)
    content = [[None for _ in range(n_cols)] for _ in range(n_rows)]

    # Fill in each cell: words contribute their text, any other child is
    # looked up as a selection element and contributes its selection status
    for cell in table_cells:
        cell_contents = [
            words[child_id]['Text']
            if child_id in words
            else selections[child_id]['SelectionStatus']
            for child_id in get_children_ids(cell)
        ]
        # RowIndex / ColumnIndex are used as 1-based positions in the grid
        i = cell['RowIndex'] - 1
        j = cell['ColumnIndex'] - 1
        content[i][j] = ' '.join(cell_contents)

    # We assume that the first row corresponds to the column names
    dataframe = pd.DataFrame(content[1:], columns=content[0])
    dataframes.append(dataframe)

In [105]:
# Display the DataFrame built from the second table (index 1) in the
# iteration order of `tables`.
dataframes[1]

Unnamed: 0,N.° Talhão,Área Trab. (ha),N.° Veículo/Equip.,N.° Implemento,H. máq/km,Matrícula Operador
0,25,18.0,TRR0070,-,91.0,1800011.0
1,,,,,,
2,,,,,,
3,,,,,,
4,,,,,,
5,,,,,,
6,Total,180.0,,Total,91.0,


## Key-values

In [88]:
def get_kv_map(response):
    """Split the blocks of a response into key, value and all-block lookups.

    Args:
        response: dict with a 'Blocks' list; every block has 'Id' and
            'BlockType', and KEY_VALUE_SET blocks also have 'EntityTypes'.

    Returns:
        A tuple `(key_map, value_map, block_map)` of dicts keyed by block Id:
        `block_map` holds every block, `key_map` the KEY_VALUE_SET blocks whose
        'EntityTypes' contains 'KEY', and `value_map` the remaining
        KEY_VALUE_SET blocks.
    """
    key_map, value_map, block_map = {}, {}, {}

    for blk in response['Blocks']:
        blk_id = blk['Id']
        block_map[blk_id] = blk

        if blk['BlockType'] != "KEY_VALUE_SET":
            continue

        # Route the block to the key side or the value side.
        target = key_map if 'KEY' in blk['EntityTypes'] else value_map
        target[blk_id] = blk

    return key_map, value_map, block_map


def get_kv_relationship(key_map, value_map, block_map):
    """Build a dict mapping each key's text to the text of its value.

    For every block in `key_map`, the value block is looked up with
    `find_value_block` and both blocks are turned into text with `get_text`.
    Keys that produce the same text overwrite one another; the last one wins.
    """
    pairs = {}
    for key_blk in key_map.values():
        val_blk = find_value_block(key_blk, value_map)
        label = get_text(key_blk, block_map)
        pairs[label] = get_text(val_blk, block_map)
    return pairs


def find_value_block(key_block, value_map):
    """Return the value block referenced by a key block, or None.

    Args:
        key_block: block dict; its VALUE relationships list the Ids of the
            associated value blocks.
        value_map: dict of value blocks keyed by Id.

    Returns:
        The block from `value_map` for the last Id listed in the key's VALUE
        relationships, or None when the key has no 'Relationships' entry or no
        VALUE relationship (previously this raised KeyError /
        UnboundLocalError).

    Raises:
        KeyError: if a referenced Id is not present in `value_map`.
    """
    value_block = None
    for relationship in key_block.get('Relationships', []):
        if relationship['Type'] == 'VALUE':
            for value_id in relationship['Ids']:
                value_block = value_map[value_id]
    return value_block


def get_text(result, blocks_map):
    """Concatenate the text of a block's CHILD blocks.

    Each WORD child contributes its 'Text' followed by a space; each
    SELECTION_ELEMENT child whose 'SelectionStatus' is 'SELECTED' contributes
    'X'. Other children contribute nothing.

    Args:
        result: block dict, or None (e.g. a key without a value block).
        blocks_map: dict of all blocks keyed by Id.

    Returns:
        The concatenated text; '' when `result` is None/empty or has no
        CHILD relationships.
    """
    # find_value_block returns None for a key that has no value.
    if not result:
        return ''

    text = ''
    for relationship in result.get('Relationships', []):
        if relationship['Type'] != 'CHILD':
            continue
        for child_id in relationship['Ids']:
            word = blocks_map[child_id]
            if word['BlockType'] == 'WORD':
                text += word['Text'] + ' '
            elif word['BlockType'] == 'SELECTION_ELEMENT':
                if word['SelectionStatus'] == 'SELECTED':
                    text += 'X'

    return text

In [89]:
# Index the response blocks: key blocks, value blocks and all blocks by Id.
# NOTE(review): KEY_VALUE_SET blocks are presumably only present when the
# analyze_document request included the FORMS feature - confirm.
key_map, value_map, block_map = get_kv_map(response)

In [90]:
# Pair the text of each key block with the text of its value block.
kvs = get_kv_relationship(key_map, value_map, block_map)

In [91]:
# Display the extracted key -> value pairs.
kvs

{'Implantação ': '',
 'Manutenção ': '',
 'Reforma ': 'X',
 'Empreiteira: ': 'minos L. 605 ',
 '': '20,03.21 ',
 'Projeto: ': '110 ',
 'Feitor: ': 'Jose-upby ',
 'Manual ': '',
 'Diária ': '',
 'Mat.: ': '',
 'Mecânica ': 'X',
 'Empreitada ': 'X',
 'Outros Serviços ': '',
 'Engenheiro: ': '',
 'Fazenda: ': 'centerario ',
 'Total ': '',
 'N.° ': '',
 'N.° de Diárias: ': '1 ',
 'APONTAMENTO DIÁRIO DA OPERAÇÃO DE ': 'Remocto inconicods De Copers-156arn ',
 'N.° de pessoas: ': '1 ',
 'Enc. Fazenda: ': '',
 'Horário Trabalhado: de ': '7 às 11 ',
 'Chapa de Funcionários: ': '1800011 ',
 'Observações: ': '',
 '1³Via-Assessoria-BH - 2 Via - Fazenda ': ''}