In [None]:
pip install amazon-textract-textractor

In [None]:
pip install amazon-textract-textractor[pdf]

# AnalyzeExpense API

In [None]:
from textractor import Textractor
from PIL import Image

# Create a Textractor client that uses the "default" AWS credentials profile.
extractor = Textractor(profile_name="default")

# Run expense analysis on the sample image; the returned object is inspected in the cells below.
# NOTE(review): save_image=True presumably keeps the page image on the result — confirm.
document = extractor.analyze_expense(
    file_source="CTR_4878.png",
    save_image=True,
)

In [None]:
# Display the same image file that was passed to analyze_expense.
Image.open("CTR_4878.png")

In [None]:
# Display the object returned by analyze_expense.
document

In [None]:
from textractor.data.constants import AnalyzeExpenseFields, AnalyzeExpenseFieldsGroup, AnalyzeExpenseLineItemFields

In [None]:
# Work with the first expense document in the response and display it.
expense_doc = document.expense_documents[0]
expense_doc

In [None]:
# Display the summary fields of this expense document.
expense_doc.summary_fields

In [None]:
# Display the summary fields grouped into semantic groups.
expense_doc.summary_groups

In [None]:
# Display the line-item groups of this expense document.
expense_doc.line_items_groups

In [None]:
# Optional: uncomment to convert the first line-item group to a pandas DataFrame.
#expense_doc.line_items_groups[0].to_pandas()

# Layout Analysis

In [None]:
import os
from PIL import Image
from textractor import Textractor
from textractor.visualizers.entitylist import EntityList
from textractor.data.constants import TextractFeatures

In [None]:
# Load the sample page, convert it to RGB, and display it.
image = Image.open("CTR_8809178.png").convert("RGB")
image

In [None]:
# Create a Textractor client pinned to the us-east-1 region.
extractor = Textractor(region_name="us-east-1")

# Analyze the in-memory image with only the LAYOUT feature enabled.
# NOTE(review): save_image=True presumably is what makes `.visualize()` and
# `pages[0].image` usable in the cells below — confirm.
document = extractor.analyze_document(
    file_source=image,
    features=[TextractFeatures.LAYOUT],
    save_image=True
)

In [None]:
# Print the layout analysis of the first page, then render it as an RGB image.
print(document.pages[0].layouts)
document.pages[0].layouts.visualize().convert("RGB")

In [None]:
# Return a subset of the analysis: the text of the first title on the first page.
document.pages[0].page_layout.titles[0].text

In [None]:
# Render the second layout table (index 1) of the first page as an RGB image.
document.pages[0].page_layout.tables[1].visualize().convert("RGB")

In [None]:
# Crop the first page's image down to the second layout table (index 1).
# The bbox values are scaled by the image size below, i.e. they are treated as
# fractions of the page width/height rather than pixel coordinates.
bbox = document.pages[0].page_layout.tables[1].bbox
width, height = document.pages[0].image.size

# The crop box is given as (left, upper, right, lower).
document.pages[0].image.crop((
    bbox.x * width,  # left
    bbox.y * height,  # upper
    (bbox.x + bbox.width) * width,  # right
    (bbox.y + bbox.height) * height  # lower
))

# Layout Analysis for Text Linearization

In [None]:
import os
from PIL import Image
from textractor import Textractor
from textractor.visualizers.entitylist import EntityList
from textractor.data.constants import TextractFeatures

In [None]:
# Load and display the page used for the text-linearization example.
# NOTE(review): unlike the earlier image cell, this one is not converted to RGB — confirm that is intended.
image = Image.open("ENBRDGE_Pg1.png")
image

In [None]:
# Reuses the `extractor` created in the "Layout Analysis" section above; that cell must have run first.
document = extractor.analyze_document(
    file_source=image,
    features=[TextractFeatures.LAYOUT],
    save_image=True
)

In [None]:
# Print the document's text without any custom linearization config.
print(document.text)

In [None]:
from textractor.data.text_linearization_config import TextLinearizationConfig

# Customize the linearized text: hide figure layouts and prefix titles with "# "
# and section headers with "## " (Markdown-style headings).
config = TextLinearizationConfig(
    hide_figure_layout=True,
    title_prefix="# ",
    section_header_prefix="## "
)
# Print the text again, this time rendered with the custom config.
print(document.get_text(config=config))

# Document Linearization to Markdown

In [None]:
# Create a Textractor client pinned to the us-east-1 region.
extractor = Textractor(region_name="us-east-1")

# Re-analyze the current `image` (loaded in the text-linearization section) with
# the LAYOUT, TABLES, FORMS and SIGNATURES features enabled.
document = extractor.analyze_document(
    file_source=image,
    features=[TextractFeatures.LAYOUT, TextractFeatures.TABLES, TextractFeatures.FORMS, TextractFeatures.SIGNATURES],
    save_image=True
)

In [None]:
# Print the second table (index 1) as plain text.
print(document.tables[1].get_text())

In [None]:
# Print the same table (index 1) rendered as Markdown.
print(document.tables[1].to_markdown())

In [None]:
# Print the text of the document's key-value pairs.
print(document.key_values.get_text())

# Queries

In [None]:
from PIL import Image
# Preview the image that the queries below are run against.
Image.open("CTR_8809178.png")

In [None]:
# Natural-language questions; the list is passed as `queries=` to analyze_document below.
# Each string is sent as written, so wording changes may change the answers returned.
queries = [
    "What is the trade date?",
    "What is the Confirmation Number?",
    "Who is the Seller?",
    "Who is the Buyer?",
    "What is the deal type?",
    "What is the Delivery Period?",
    "What is the daily volume elections?",
    # NOTE(review): this query lacks the trailing "?" the others have — confirm whether that is intentional.
    "What is the contract price",
    "What is the All-in-Price?"
]

In [None]:
import os
from textractor import Textractor
from textractor.data.constants import TextractFeatures

# Create a Textractor client that uses the "default" AWS credentials profile.
extractor = Textractor(profile_name="default")
# Analyze the image with the QUERIES feature, passing the `queries` list defined above.
document = extractor.analyze_document(
    file_source=Image.open("CTR_8809178.png"),
    features=[TextractFeatures.QUERIES],
    queries=queries
)

In [None]:
# Display the queries attached to the analyzed document.
document.queries

In [None]:
# Report every query: its answer and confidence when a result exists,
# otherwise an explicit "No Answer" line.
for query in document.queries:
    if not query.result:
        # Nothing was returned for this question.
        print(f"{query.query}\n\tNo Answer\n")
        continue
    print(f"{query.query}\n\tAnswer: {query.result.answer}\n\tConfidence: {query.result.confidence}\n")

# Large Language Models

In [None]:
import os
import boto3
import json

from PIL import Image
from textractor import Textractor
from textractor.visualizers.entitylist import EntityList
from textractor.data.constants import TextractFeatures

def get_response_from_claude(context, prompt_data, client=None,
                             model_id="anthropic.claude-v2", max_tokens=2000):
    """Ask a Claude text-completion model on Amazon Bedrock a question about a document.

    Args:
        context: Document text placed in the prompt as the material to answer from.
        prompt_data: The question(s) or instructions to answer about ``context``.
        client: Optional object exposing ``invoke_model(body=..., modelId=...,
            accept=..., contentType=...)``. When omitted, the module-level
            ``bedrock`` client is used, so that client must be created before the
            first call.
        model_id: Bedrock model identifier. The request body built here uses the
            text-completion fields (``prompt`` / ``max_tokens_to_sample``), so only
            models accepting that body shape will work.
        max_tokens: Value sent as ``max_tokens_to_sample``.

    Returns:
        The ``completion`` field of the JSON response, or ``None`` if it is absent.
    """
    # The Text Completions format expects the prompt to open with "\n\nHuman:" and
    # close with "\n\nAssistant:". The prompt is assembled explicitly so that source
    # indentation never leaks into the text sent to the model.
    prompt = (
        "\n\nHuman: Given the following document:\n"
        f"{context}\n"
        f"Answer the following:\n {prompt_data}"
        "\n\nAssistant:"
    )
    body = json.dumps({
        "prompt": prompt,
        "max_tokens_to_sample": max_tokens,
        "top_k": 1,
    })

    # Resolve the client lazily so an injected client never requires the global one.
    runtime = bedrock if client is None else client
    response = runtime.invoke_model(
        body=body,
        modelId=model_id,
        accept="*/*",
        contentType="application/json",
    )
    # The response body is a stream; read it fully and decode the JSON payload.
    response_body = json.loads(response.get("body").read())
    return response_body.get("completion")

# Record the region and Bedrock runtime endpoint in the process environment,
# then build the module-level client from those same two values.
os.environ.update({
    "AWS_DEFAULT_REGION": "us-east-1",
    "BEDROCK_ENDPOINT_URL": "https://bedrock-runtime.us-east-1.amazonaws.com",
})

bedrock = boto3.client(
    service_name="bedrock-runtime",
    region_name=os.environ["AWS_DEFAULT_REGION"],
    endpoint_url=os.environ["BEDROCK_ENDPOINT_URL"],
)

In [None]:
# Load the image used for the LLM example, convert it to RGB, and display it.
image = Image.open("CTR_4878.png").convert("RGB")
image

In [None]:
from textractor import Textractor
from textractor.data.text_linearization_config import TextLinearizationConfig

# Create a Textractor client pinned to the us-east-1 region.
extractor = Textractor(region_name="us-east-1")
# Analyze the image with the LAYOUT and FORMS features enabled.
document = extractor.analyze_document(
    file_source=image,
    features=[TextractFeatures.LAYOUT,TextractFeatures.FORMS],
    save_image=True
)
# Print the document text; the next cell passes the same text to the model as context.
print(document.get_text())

In [None]:
# Ask the model a question, passing the document text as the context argument,
# and print the completion it returns.
print(get_response_from_claude(
    document.get_text(),
    """
    - What is the contract price?
    """
))