# LlamaIndex <> Bedrock

### Setup environment

In [10]:
import nest_asyncio
nest_asyncio.apply()

In [5]:
%pip install llama-index-llms-bedrock
%pip install llama-index-retrievers-bedrock
%pip install llama-index-vector-stores-opensearch
%pip install requests-aws4auth


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Collecting llama-index-vector-stores-opensearch
  Using cached llama_index_vector_stores_opensearch-0.1.8-py3-none-any.whl (6.2 kB)
Collecting opensearch-py[async]<3.0.0,>=2.4.2
  Downloading opensearch_py-2.5.0-py2.py3-none-any.whl (266 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.1/266.1 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00

### Configure OpenSearch and create collection

1. Go to Amazon OpenSearch Service
2. Select serverless, get started
3. create collection (using "rag-bedrock" as collection name)  
wait for collection to create, takes ~5 mins


Or alternatively, do this programmatically as below.

In [1]:
import boto3
import botocore
from requests_aws4auth import AWS4Auth

client = boto3.client('opensearchserverless')

service = 'aoss'
region = 'us-east-2'
credentials = boto3.Session().get_credentials()
awsauth = AWS4Auth(credentials.access_key, credentials.secret_key,
                   region, service, session_token=credentials.token)

In [104]:
import time 

def createEncryptionPolicy(client):
    """Creates an encryption policy that matches all collections beginning with tv-"""
    try:
        response = client.create_security_policy(
            description='Encryption policy for TV collections',
            name='tv-policy',
            policy="""
                {
                    \"Rules\":[
                        {
                            \"ResourceType\":\"collection\",
                            \"Resource\":[
                                \"collection\/tv-*\"
                            ]
                        }
                    ],
                    \"AWSOwnedKey\":true
                }
                """,
            type='encryption'
        )
        print('\nEncryption policy created:')
        print(response)
    except botocore.exceptions.ClientError as error:
        if error.response['Error']['Code'] == 'ConflictException':
            print(
                '[ConflictException] The policy name or rules conflict with an existing policy.')
        else:
            raise error


def createNetworkPolicy(client):
    """Creates a network policy that matches all collections beginning with tv-"""
    try:
        response = client.create_security_policy(
            description='Network policy for TV collections',
            name='tv-policy',
            policy="""
                [{
                    \"Description\":\"Public access for TV collection\",
                    \"Rules\":[
                        {
                            \"ResourceType\":\"dashboard\",
                            \"Resource\":[\"collection\/tv-*\"]
                        },
                        {
                            \"ResourceType\":\"collection\",
                            \"Resource\":[\"collection\/tv-*\"]
                        }
                    ],
                    \"AllowFromPublic\":true
                }]
                """,
            type='network'
        )
        print('\nNetwork policy created:')
        print(response)
    except botocore.exceptions.ClientError as error:
        if error.response['Error']['Code'] == 'ConflictException':
            print(
                '[ConflictException] A network policy with this name already exists.')
        else:
            raise error

def createAccessPolicy(client):
    """Creates a data access policy that matches all collections beginning with tv-"""
    try:
        response = client.create_access_policy(
            description='Data access policy for TV collections',
            name='tv-policy',
            policy="""
                [{
                    \"Rules\":[
                        {
                            \"Resource\":[
                                \"index\/tv-*\/*\"
                            ],
                            \"Permission\":[
                                \"aoss:CreateIndex\",
                                \"aoss:DeleteIndex\",
                                \"aoss:UpdateIndex\",
                                \"aoss:DescribeIndex\",
                                \"aoss:ReadDocument\",
                                \"aoss:WriteDocument\"
                            ],
                            \"ResourceType\": \"index\"
                        },
                        {
                            \"Resource\":[
                                \"collection\/tv-*\"
                            ],
                            \"Permission\":[
                                \"aoss:CreateCollectionItems\"
                            ],
                            \"ResourceType\": \"collection\"
                        }
                    ],
                    \"Principal\":[
                        \"arn:aws:iam::357961106903:role\/Admin\"
                    ]
                }]
                """,
            type='data'
        )
        print('\nAccess policy created:')
        print(response)
    except botocore.exceptions.ClientError as error:
        if error.response['Error']['Code'] == 'ConflictException':
            print(
                '[ConflictException] An access policy with this name already exists.')
        else:
            raise error

In [112]:
def createCollection(client):
    """Creates a collection"""
    try:
        response = client.create_collection(
            name='tv-sitcoms',
            type='SEARCH'
        )
        return(response)
    except botocore.exceptions.ClientError as error:
        if error.response['Error']['Code'] == 'ConflictException':
            print(
                '[ConflictException] A collection with this name already exists. Try another name.')
        else:
            raise error

def waitForCollectionCreation(client):
    """Waits for the collection to become active"""
    response = client.batch_get_collection(
        names=['tv-sitcoms'])
    # Periodically check collection status
    while (response['collectionDetails'][0]['status']) == 'CREATING':
        print('Creating collection...')
        time.sleep(30)
        response = client.batch_get_collection(
            names=['tv-sitcoms'])
    print('\nCollection successfully created:')
    print(response["collectionDetails"])
    # Extract the collection endpoint from the response
    host = (response['collectionDetails'][0]['collectionEndpoint'])
    final_host = host.replace("https://", "")
    return final_host

In [107]:
createEncryptionPolicy(client)
createNetworkPolicy(client)
createAccessPolicy(client)
createCollection(client)

[ConflictException] The policy name or rules conflict with an existing policy.
[ConflictException] A network policy with this name already exists.
[ConflictException] An access policy with this name already exists.
[ConflictException] A collection with this name already exists. Try another name.


In [113]:
final_host = waitForCollectionCreation(client)


Collection successfully created:
[{'arn': 'arn:aws:aoss:us-east-1:357961106903:collection/9qjgnx68im5mcri7rtp9', 'collectionEndpoint': 'https://9qjgnx68im5mcri7rtp9.us-east-1.aoss.amazonaws.com', 'createdDate': 1713394114452, 'dashboardEndpoint': 'https://9qjgnx68im5mcri7rtp9.us-east-1.aoss.amazonaws.com/_dashboards', 'id': '9qjgnx68im5mcri7rtp9', 'kmsKeyArn': 'auto', 'lastModifiedDate': 1713394625015, 'name': 'tv-sitcoms', 'standbyReplicas': 'ENABLED', 'status': 'ACTIVE', 'type': 'SEARCH'}]


In [114]:
final_host

'9qjgnx68im5mcri7rtp9.us-east-1.aoss.amazonaws.com'

### Connect to OpenSearch for indexing

In [96]:
final_host = 'i9fsvb9a26e57d7ly111.us-east-2.aoss.amazonaws.com'
full_final_host = 'https://i9fsvb9a26e57d7ly111.us-east-2.aoss.amazonaws.com'
index_name = 'python-test-index-new'

In [70]:
from opensearchpy import OpenSearch, AsyncOpenSearch, AsyncHttpConnection, AWSV4SignerAsyncAuth

In [47]:
# os_client = OpenSearch(
#     hosts=[{'host': final_host, 'port': 443}],
#     http_auth=awsauth,
#     use_ssl=True,
#     verify_certs=True,
#     connection_class=RequestsHttpConnection,
#     timeout=300
# )

In [51]:
# os_client.indices.get(index='python-test-index')

{'python-test-index': {'aliases': {},
  'mappings': {},
  'settings': {'index': {'creation_date': '1713395151083',
    'number_of_shards': '2',
    'number_of_replicas': '0',
    'uuid': 'FipP7o4B79BuqcGTMCl1',
    'version': {'created': '135217827'},
    'provided_name': 'python-test-index'}}}}

In [71]:
credentials = boto3.Session().get_credentials()
auth = AWSV4SignerAsyncAuth(credentials, region, service)

In [77]:
# async_os_client = AsyncOpenSearch(
#     full_final_host,
#     http_auth=auth,
#     use_ssl=True,
#     verify_certs=True,
#     connection_class=AsyncHttpConnection,
#     timeout=300
# )

In [78]:
from os import getenv
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.core import SimpleDirectoryReader

In [98]:
from llama_index.vector_stores.opensearch import (
    OpensearchVectorStore,
    OpensearchVectorClient,
)

# OpensearchVectorClient stores text in this field by default
text_field = "content"
# OpensearchVectorClient stores embeddings in this field by default
embedding_field = "embedding"
# OpensearchVectorClient encapsulates logic for a
# single opensearch index with vector search enabled
client = OpensearchVectorClient(
    full_final_host,
    index_name, 
    1536, 
    embedding_field=embedding_field, 
    text_field=text_field, 
    use_ssl=True,
    verify_certs=True,
    http_auth=auth, 
    connection_class=AsyncHttpConnection,
)

In [99]:
# initialize vector store
vector_store = OpensearchVectorStore(client)
index = VectorStoreIndex.from_vector_store(vector_store)

### Parsing & Ingestion

Todo: create OpenSearch cluster, is this the same as the open search serverless?
* looks like it's part of the open search service.

Todo: use LlamaParse and ingest data into OpenSearch  
Todo: figure out how to ingest into OpenSearch

Reference: https://docs.llamaindex.ai/en/stable/examples/vector_stores/OpensearchDemo/?h=opensearch

In [86]:
!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'

--2024-04-17 17:30:15--  https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8002::154, 2606:50c0:8001::154, 2606:50c0:8000::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8002::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 75042 (73K) [text/plain]
Saving to: ‘data/paul_graham/paul_graham_essay.txt’


2024-04-17 17:30:15 (6.74 MB/s) - ‘data/paul_graham/paul_graham_essay.txt’ saved [75042/75042]



In [100]:
docs = SimpleDirectoryReader("./data/paul_graham/").load_data()

In [105]:
for doc in docs:
    index.insert(doc)

In [104]:
index.as_retriever().retrieve('who is paul graham')

[NodeWithScore(node=TextNode(id_='c42abc11-91c9-4987-9ae9-a0a5d106a234', embedding=None, metadata={'file_path': '/Users/suo/dev/rag-bedrock/data/paul_graham/paul_graham_essay.txt', 'file_name': 'paul_graham_essay.txt', 'file_type': 'text/plain', 'file_size': 75042, 'creation_date': '2024-04-17', 'last_modified_date': '2024-04-17'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='1886bf3f-de3d-46e7-8518-69565ee87ef5', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': '/Users/suo/dev/rag-bedrock/data/paul_graham/paul_graham_essay.txt', 'file_name': 'paul_graham_essay.txt', 'file_type': 'text/plain', 'file_size': 75042, 'creation_date': '2024-04-17', 'last_modified_date': '2024-04-17'}, hash='c74850

### Retrieval

Reference: https://github.com/run-llama/llama_index/blob/main/docs/docs/examples/retrievers/bedrock_retriever.ipynb

In [2]:
from llama_index.retrievers.bedrock import AmazonKnowledgeBasesRetriever

In [3]:
retriever = AmazonKnowledgeBasesRetriever(
    knowledge_base_id="<knowledge-base-id>",
    retrieval_config={
        "vectorSearchConfiguration": {
            "numberOfResults": 4,
            "overrideSearchType": "HYBRID",
            "filter": {"equals": {"key": "tag", "value": "space"}},
        }
    },
)

NameError: name 'AmazonKnowledgeBasesRetriever' is not defined