# In [None]:
import boto3
import io
import base64
import json
from pdf2image import convert_from_bytes
from PIL import Image

# Initialize AWS clients.
# Both clients are created without explicit region or credential arguments.
s3 = boto3.client('s3')                      # used to download the source document
bedrock = boto3.client('bedrock-runtime')    # used to invoke the model

# Define constants
# Model identifier passed as `modelId` to bedrock.invoke_model.
MODEL_ID = 'anthropic.claude-3-sonnet-20240229-v1:0'
# Passed as `accept` and `contentType` to bedrock.invoke_model respectively.
ACCEPT = 'application/json'
CONTENT_TYPE = 'application/json'

# S3 bucket and file details
BUCKET_NAME = 'app2container-pr-mar-30'
# NOTE(review): this key has a .png extension although the pipeline in this
# cell is PDF-oriented - confirm the object's actual format.
FILE_KEY = 'job-99dcf59c-c0df-429c-ae02-5e335816702c/Direct DepositShanelMoseleysigned.png'

def _load_page_images(data):
    """
    Decode raw document bytes into a list of PIL images, one per page.

    PDFs are rasterized with pdf2image. Anything that does not look like a
    PDF is first offered to Pillow as a single raster image (PNG, JPEG, ...);
    if Pillow cannot identify it, the bytes are handed to pdf2image anyway so
    the failure mode for unknown input is the same as before.
    """
    # Some PDFs carry a few junk bytes before the header, so look in the
    # first 1 KiB rather than only at offset 0.
    if b'%PDF-' not in data[:1024]:
        try:
            image = Image.open(io.BytesIO(data))
            image.load()  # decode now so a corrupt file fails here, not later
            return [image]
        except OSError:
            # Not a raster format Pillow recognizes; fall back to pdf2image.
            pass
    return convert_from_bytes(data, fmt='png')


def process_pdf_from_s3(
    bucket_name,
    file_key,
    prompt="Can you extract key-value pairs from this image and show in human readable format on new lines?",
    max_tokens=4096,
):
    """
    Download a document from S3 and run each slice of it through the model.

    The object is fetched from S3, turned into page images, each page image
    is cut into horizontal sections by ``split_image_into_pages``, and every
    section is sent to the Bedrock model together with ``prompt``. The model
    output is printed as it arrives and also collected.

    Args:
        bucket_name: Name of the S3 bucket holding the document.
        file_key: Key of the document inside the bucket. PDFs and raster
            images that Pillow can open are both accepted.
        prompt: Instruction text sent alongside each image section.
        max_tokens: Upper bound on generated tokens per request. Claude 3
            Sonnet rejects values above 4096, hence the default.

    Returns:
        A list with one string per processed section (in processing order)
        containing the text returned by the model. If an error occurs, the
        error is printed and the sections collected so far are returned.
    """
    extracted = []
    try:
        # Download the document from S3
        s3_object = s3.get_object(Bucket=bucket_name, Key=file_key)
        document_bytes = s3_object['Body'].read()

        # One PIL image per PDF page (or a single image for raster input)
        images = _load_page_images(document_bytes)

        for page_index, image in enumerate(images, start=1):
            sections = split_image_into_pages(image)

            for section_index, section in enumerate(sections, start=1):
                # Serialize the section as PNG and base64-encode it
                buffer = io.BytesIO()
                section.save(buffer, format='PNG')
                png_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')

                # Request body for the Anthropic Messages API on Bedrock
                body = {
                    "anthropic_version": "bedrock-2023-05-31",
                    "max_tokens": max_tokens,
                    "messages": [
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "image",
                                    "source": {
                                        "type": "base64",
                                        "media_type": "image/png",
                                        "data": png_base64,
                                    },
                                },
                                {
                                    "type": "text",
                                    "text": prompt,
                                },
                            ],
                        }
                    ],
                }

                # Invoke the model
                response = bedrock.invoke_model(
                    modelId=MODEL_ID,
                    body=json.dumps(body).encode('utf-8'),
                    accept=ACCEPT,
                    contentType=CONTENT_TYPE,
                )

                # Keep only content blocks that actually carry text
                result = json.loads(response.get("body").read())
                texts = [
                    block["text"]
                    for block in result.get("content", [])
                    if "text" in block
                ]

                # Label with both indices: several sections can belong to
                # the same source page.
                print(f"Page {page_index}, section {section_index}:")
                for text in texts:
                    print(text)
                extracted.append("\n".join(texts))

    except Exception as e:
        # Best-effort notebook behaviour: report and return partial results.
        print(f"Error processing PDF: {e}")

    return extracted

def split_image_into_pages(image, max_height=1024):
    """
    Slice an image into horizontal strips at most ``max_height`` pixels tall.

    Strips are taken top to bottom. Every strip has the full width of the
    source image; the last one is shorter when the image height is not a
    multiple of ``max_height``. Each strip is a new RGB image.

    Args:
        image: PIL image to slice (anything exposing ``size`` that
            ``Image.paste`` accepts).
        max_height: Maximum strip height in pixels; must be positive.

    Returns:
        A list of RGB PIL images, empty if the source image has zero height.

    Raises:
        ValueError: If ``max_height`` is not positive.
    """
    # A non-positive step would either crash range() with an unrelated
    # message (0) or silently yield no strips at all (negative).
    if max_height <= 0:
        raise ValueError(f"max_height must be a positive integer, got {max_height!r}")

    width, height = image.size
    page_images = []

    for top in range(0, height, max_height):
        strip_height = min(max_height, height - top)
        strip = Image.new('RGB', (width, strip_height), (255, 255, 255))
        # Pasting at a negative y offset shifts the source up so that row
        # `top` lands on row 0 of the strip; rows outside the strip are clipped.
        strip.paste(image, (0, -top))
        page_images.append(strip)

    return page_images

# Example usage: run the pipeline on the object configured above.
# The model output is printed while the call runs.
result = process_pdf_from_s3(BUCKET_NAME, FILE_KEY)

# In [None]:
import boto3
import io
import base64
import json
from pdf2image import convert_from_bytes
from PIL import Image

# Initialize AWS clients.
# Both clients are created without explicit region or credential arguments.
s3 = boto3.client('s3')                      # used to download the source document
bedrock = boto3.client('bedrock-runtime')    # used to invoke the model

# Define constants
# Model identifier passed as `modelId` to bedrock.invoke_model.
MODEL_ID = 'anthropic.claude-3-sonnet-20240229-v1:0'
# Passed as `accept` and `contentType` to bedrock.invoke_model respectively.
ACCEPT = 'application/json'
CONTENT_TYPE = 'application/json'

# S3 bucket and file details (this cell targets the PDF version of the document)
BUCKET_NAME = 'app2container-pr-mar-30'
FILE_KEY = 'job-99dcf59c-c0df-429c-ae02-5e335816702c/Direct DepositShanelMoseleysigned-2.pdf'

def _load_page_images(data):
    """
    Decode raw document bytes into a list of PIL images, one per page.

    PDFs are rasterized with pdf2image. Anything that does not look like a
    PDF is first offered to Pillow as a single raster image (PNG, JPEG, ...);
    if Pillow cannot identify it, the bytes are handed to pdf2image anyway so
    the failure mode for unknown input is the same as before.
    """
    # Some PDFs carry a few junk bytes before the header, so look in the
    # first 1 KiB rather than only at offset 0.
    if b'%PDF-' not in data[:1024]:
        try:
            image = Image.open(io.BytesIO(data))
            image.load()  # decode now so a corrupt file fails here, not later
            return [image]
        except OSError:
            # Not a raster format Pillow recognizes; fall back to pdf2image.
            pass
    return convert_from_bytes(data, fmt='png')


def process_pdf_from_s3(
    bucket_name,
    file_key,
    prompt="Can you extract text from this image and show in human readable format on new lines?",
    max_tokens=4096,
):
    """
    Download a document from S3 and run each slice of it through the model.

    The object is fetched from S3, turned into page images, each page image
    is cut into horizontal sections by ``split_image_into_pages``, and every
    section is sent to the Bedrock model together with ``prompt``. The model
    output is printed as it arrives and also collected.

    Args:
        bucket_name: Name of the S3 bucket holding the document.
        file_key: Key of the document inside the bucket. PDFs and raster
            images that Pillow can open are both accepted.
        prompt: Instruction text sent alongside each image section.
        max_tokens: Upper bound on generated tokens per request. Claude 3
            Sonnet rejects values above 4096, hence the default.

    Returns:
        A list with one string per processed section (in processing order)
        containing the text returned by the model. If an error occurs, the
        error is printed and the sections collected so far are returned.
    """
    extracted = []
    try:
        # Download the document from S3
        s3_object = s3.get_object(Bucket=bucket_name, Key=file_key)
        document_bytes = s3_object['Body'].read()

        # One PIL image per PDF page (or a single image for raster input)
        images = _load_page_images(document_bytes)

        for page_index, image in enumerate(images, start=1):
            sections = split_image_into_pages(image)

            for section_index, section in enumerate(sections, start=1):
                # Serialize the section as PNG and base64-encode it
                buffer = io.BytesIO()
                section.save(buffer, format='PNG')
                png_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')

                # Request body for the Anthropic Messages API on Bedrock
                body = {
                    "anthropic_version": "bedrock-2023-05-31",
                    "max_tokens": max_tokens,
                    "messages": [
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "image",
                                    "source": {
                                        "type": "base64",
                                        "media_type": "image/png",
                                        "data": png_base64,
                                    },
                                },
                                {
                                    "type": "text",
                                    "text": prompt,
                                },
                            ],
                        }
                    ],
                }

                # Invoke the model
                response = bedrock.invoke_model(
                    modelId=MODEL_ID,
                    body=json.dumps(body).encode('utf-8'),
                    accept=ACCEPT,
                    contentType=CONTENT_TYPE,
                )

                # Keep only content blocks that actually carry text
                result = json.loads(response.get("body").read())
                texts = [
                    block["text"]
                    for block in result.get("content", [])
                    if "text" in block
                ]

                # Label with both indices: several sections can belong to
                # the same source page.
                print(f"Page {page_index}, section {section_index}:")
                for text in texts:
                    print(text)
                extracted.append("\n".join(texts))

    except Exception as e:
        # Best-effort notebook behaviour: report and return partial results.
        print(f"Error processing PDF: {e}")

    return extracted

def split_image_into_pages(image, max_height=1024):
    """
    Slice an image into horizontal strips at most ``max_height`` pixels tall.

    Strips are taken top to bottom. Every strip has the full width of the
    source image; the last one is shorter when the image height is not a
    multiple of ``max_height``. Each strip is a new RGB image.

    Args:
        image: PIL image to slice (anything exposing ``size`` that
            ``Image.paste`` accepts).
        max_height: Maximum strip height in pixels; must be positive.

    Returns:
        A list of RGB PIL images, empty if the source image has zero height.

    Raises:
        ValueError: If ``max_height`` is not positive.
    """
    # A non-positive step would either crash range() with an unrelated
    # message (0) or silently yield no strips at all (negative).
    if max_height <= 0:
        raise ValueError(f"max_height must be a positive integer, got {max_height!r}")

    width, height = image.size
    page_images = []

    for top in range(0, height, max_height):
        strip_height = min(max_height, height - top)
        strip = Image.new('RGB', (width, strip_height), (255, 255, 255))
        # Pasting at a negative y offset shifts the source up so that row
        # `top` lands on row 0 of the strip; rows outside the strip are clipped.
        strip.paste(image, (0, -top))
        page_images.append(strip)

    return page_images

# Example usage: run the pipeline on the object configured above.
# The model output is printed while the call runs.
result = process_pdf_from_s3(BUCKET_NAME, FILE_KEY)