# Ingest PDFs with LlamaParse into S3 for a Bedrock Knowledge Base

In [7]:
import nest_asyncio

# Patch asyncio so an already-running event loop can be re-entered.
# Presumably needed because the notebook kernel runs its own loop while
# LlamaParse issues async calls — TODO confirm.
nest_asyncio.apply()

In [3]:
# NOTE(review): unpinned install — consider pinning (llama-parse==<version>) so reruns are reproducible.
%pip install llama-parse


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [21]:
import os

from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader

# Read the LlamaCloud API key from the environment instead of embedding it in
# the notebook: a literal key is exposed to everyone who can read the file or
# its version history.
llama_cloud_api_key = os.environ.get("LLAMA_CLOUD_API_KEY")
if not llama_cloud_api_key:
    raise RuntimeError(
        "LLAMA_CLOUD_API_KEY is not set; export it before running this cell."
    )

parser = LlamaParse(
    api_key=llama_cloud_api_key,
    result_type="markdown",  # "markdown" and "text" are available
    num_workers=4,  # if multiple files passed, split in `num_workers` API calls
    verbose=True,
    language="en",  # Optionally you can define a language, default=en
)

# Send every .pdf input through the LlamaParse parser configured above.
file_extractor = {".pdf": parser}
reader = SimpleDirectoryReader(
    input_files=[
        'data/apple_2019.pdf',
        'data/tesla_2019.pdf',
    ],
    file_extractor=file_extractor
)

In [22]:
# Load the input files; .pdf files go through the extractor registered on `reader`.
documents = reader.load_data()

Started parsing the file under job_id ca350a45-3fa3-45ea-807c-511638626a49
Started parsing the file under job_id a9228edd-0a66-44ce-8779-71bcf846b7a2


In [57]:
# Inspect the first document's metadata dict (file path, name, type, size, dates).
documents[0].metadata

{'file_path': 'data/apple_2019.pdf',
 'file_name': 'apple_2019.pdf',
 'file_type': 'application/pdf',
 'file_size': 855181,
 'creation_date': '2024-04-17',
 'last_modified_date': '2024-04-17'}

In [54]:
import boto3
import botocore

# Destination bucket for the parsed documents
bucket_name = 'bedrock-kb-10ks'

# S3 client shared by the helper functions defined later in the notebook
s3 = boto3.client('s3')

In [58]:
import json

def create_bucket(bucket_name):
    """Ensure the S3 bucket ``bucket_name`` exists, creating it when missing.

    Uses the module-level ``s3`` client. Existence is probed with
    ``head_bucket``; only a "not found" answer triggers creation.

    Args:
        bucket_name: Name of the bucket to check for or create.

    Raises:
        botocore.exceptions.ClientError: If the existence check fails for any
            reason other than "not found" (for example access denied), or if
            the creation request itself fails.
    """
    try:
        s3.head_bucket(Bucket=bucket_name)
        print(f"Bucket '{bucket_name}' already exists.")
        return
    except botocore.exceptions.ClientError as e:
        # Anything other than "not found" means we could not determine the
        # bucket's state, so surface it instead of attempting a create.
        if e.response['Error']['Code'] not in ('404', 'NoSuchBucket'):
            print(f"Error checking bucket: {str(e)}")
            raise

    # Outside us-east-1 S3 rejects CreateBucket unless the target region is
    # given explicitly; us-east-1 must NOT be passed as a LocationConstraint.
    region = s3.meta.region_name
    if region and region not in ('us-east-1', 'aws-global'):
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': region},
        )
    else:
        s3.create_bucket(Bucket=bucket_name)
    print(f"Bucket '{bucket_name}' created successfully.")

def upload_document(document, bucket_name):
    """Upload a parsed document and its metadata sidecar to S3.

    Two objects are written with the module-level ``s3`` client:

    * ``<file_path>.md`` — ``document.text`` encoded as UTF-8.
    * ``<file_path>.md.metadata.json`` — ``document.metadata`` wrapped under
      the ``metadataAttributes`` key.

    Both payloads are built before anything is sent, so metadata that cannot
    be serialised to JSON does not leave an orphaned markdown object behind.
    If the second upload fails, the markdown object from the first upload
    remains in the bucket.

    Args:
        document: Object exposing ``text`` (str) and ``metadata`` (a dict
            containing a ``'file_path'`` entry).
        bucket_name: Name of the destination bucket.

    Returns:
        True when both objects were uploaded, False when any step failed.
        Failures are printed rather than raised so a loop over many documents
        keeps going.
    """
    try:
        object_key = document.metadata['file_path']
        text_key = f"{object_key}.md"
        metadata_key = f"{text_key}.metadata.json"

        # Build both payloads up front; json.dumps may raise on values that
        # are not JSON-serialisable.
        text_body = document.text.encode('utf-8')
        formatted_metadata = {
            "metadataAttributes": document.metadata
        }
        metadata_body = json.dumps(formatted_metadata, indent=4).encode('utf-8')

        # Upload the text as a markdown file
        s3.put_object(
            Body=text_body,
            Bucket=bucket_name,
            Key=text_key,
            ContentType='text/markdown'
        )
        print(f"Text uploaded to S3 as '{text_key}'")

        # Upload the metadata as a JSON file next to the markdown object
        s3.put_object(
            Body=metadata_body,
            Bucket=bucket_name,
            Key=metadata_key,
            ContentType='application/json'
        )
        print(f"Metadata uploaded to S3 as '{metadata_key}'")
        return True

    except Exception as e:
        print(f"Error uploading document: {str(e)}")
        return False

In [59]:
# Upload every loaded document (markdown text plus metadata JSON) to the bucket.
# NOTE(review): create_bucket() is defined but never called in the cells shown;
# the bucket is presumably expected to exist already — confirm.
for doc in documents:
    upload_document(doc, bucket_name)  

Text uploaded to S3 as 'data/apple_2019.pdf.md'
Metadata uploaded to S3 as 'data/apple_2019.pdf.md.metadata.json'
Text uploaded to S3 as 'data/tesla_2019.pdf.md'
Metadata uploaded to S3 as 'data/tesla_2019.pdf.md.metadata.json'
