# Demo with Amazon Bedrock

## 1 - Bedrock Setup

In [None]:
# Install the AWS SDK stack at the minimum versions this notebook expects.
# --force-reinstall makes pip reinstall the packages even when some version is
# already present in the kernel's environment.
# NOTE(review): presumably these minimums are the first releases that ship the
# Bedrock clients used below — confirm before relaxing them.
%pip install --no-build-isolation --force-reinstall \
    "boto3>=1.28.57" \
    "awscli>=1.29.57" \
    "botocore>=1.31.57"

LangChain is an open-source library that simplifies integrating applications with large language models (LLMs).

In [None]:
# Pinned to an exact release: the `langchain.*` import paths used by the cells
# below are written against this version.
# NOTE(review): presumably later LangChain releases moved these modules — confirm before bumping.
%pip install langchain==0.0.309


In [None]:
# PDF parsing library (unpinned).
# NOTE(review): presumably the backend of the PyPDFDirectoryLoader used later in this notebook — confirm.
%pip install pypdf


In [None]:
# faiss-cpu backs the FAISS vector store built later in this notebook.
# NOTE(review): sqlalchemy is not imported by any visible cell; presumably a
# LangChain runtime requirement — confirm.
%pip install "faiss-cpu>=1.7,<2" sqlalchemy --quiet


In [None]:
import boto3
import numpy as np

# Default boto3 session: no region, profile or keys are passed explicitly, so
# everything comes from the environment/configuration.
boto_session = boto3.Session()
# NOTE(review): neither `boto_session` nor `credentials` is referenced by the
# other cells visible in this notebook (the clients are created with
# `boto3.client(...)` directly); presumably kept as a credentials sanity check.
credentials = boto_session.get_credentials()

In [None]:
# Control-plane client, used here only to discover which foundation models are
# available (model invocation goes through the "bedrock-runtime" client).
bedrock_models = boto3.client("bedrock")

# Call the API once and reuse the response. The previous version of this cell
# issued the same request three times and discarded the first result.
foundation_models = bedrock_models.list_foundation_models()

# Compact listing: one "modelId modelName" line per model.
for item in foundation_models["modelSummaries"]:
    print(item["modelId"], item["modelName"])

# Last expression of the cell: display the full raw response.
foundation_models

Create a Bedrock runtime client, which lets us invoke the foundation models directly through the API.

In [None]:
# Runtime client: this is the `client` handed to the LangChain wrappers below
# and the object whose `invoke_model` is called by the raw-API embedding helper.
bedrock = boto3.client("bedrock-runtime")

In [None]:
from langchain.embeddings import BedrockEmbeddings
from langchain.llms.bedrock import Bedrock

# Model identifiers and generation settings, named once so they are easy to tune.
EMBEDDING_MODEL_ID = "amazon.titan-embed-text-v1"
TEXT_MODEL_ID = "anthropic.claude-v2"
MAX_TOKENS_TO_SAMPLE = 300  # value sent as the model's `max_tokens_to_sample`

# Embedding model wrapper (text -> vector), bound to the runtime client.
bedrock_embeddings = BedrockEmbeddings(
    model_id=EMBEDDING_MODEL_ID,
    client=bedrock,
)

# Text LLM (Large Language Model) wrapper, bound to the same runtime client.
llm_model_kwargs = {"max_tokens_to_sample": MAX_TOKENS_TO_SAMPLE}
llm = Bedrock(
    client=bedrock,
    model_id=TEXT_MODEL_ID,
    model_kwargs=llm_model_kwargs,
)

### 2 - Reading a PDF file with the LGPD (Brazil's General Data Protection Law) and understanding embeddings

In [None]:
import glob

# Folder that holds the source PDFs, and the glob pattern that selects them.
data_path = "./data/"
data_path_files = f"{data_path}*.pdf"

# Paths of every file currently matching the pattern (empty list when none match).
pdf_files = glob.glob(data_path_files)

Break the PDF into smaller chunks of text.
This way, a search matches the specific chunk that contains the relevant passage rather than the whole document.

In [None]:
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFDirectoryLoader

# Chunking parameters handed to the splitter.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 100

# Load the PDF documents found under `data_path`.
loader = PyPDFDirectoryLoader(data_path)
documents = loader.load()

# Split the loaded documents into smaller, overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)
docs = text_splitter.split_documents(documents)

In [None]:
def avg_doc_length(documents):
    """Return the average `page_content` length of `documents`, in characters.

    Args:
        documents: A sized collection of objects exposing a ``page_content``
            string (e.g. the loaded documents or the split chunks).

    Returns:
        The floor of the mean character count, or 0 when `documents` is empty
        (the previous lambda raised ZeroDivisionError in that case).
    """
    if not documents:
        return 0
    return sum(len(doc.page_content) for doc in documents) // len(documents)
# Average content length before and after splitting.
avg_char_count_pre = avg_doc_length(documents)
avg_char_count_post = avg_doc_length(docs)

print(
    f"Average length among {len(documents)} documents loaded is {avg_char_count_pre} characters."
)
# Reworded: the previous text ("<N> documents more than the original <M>") read
# as if N were the increase rather than the new total.
print(
    f"After the split we have {len(docs)} documents, compared with the original {len(documents)}."
)
print(
    f"Average length among {len(docs)} documents (after split) is {avg_char_count_post} characters."
)

### Calling the Bedrock API directly to get an embedding

In [None]:
import json


def create_embedding_bedrock(text, bedrock_client, model_id="amazon.titan-embed-text-v1"):
    """Embed `text` by calling the Bedrock runtime `invoke_model` API directly.

    Args:
        text: Content to embed; it is converted with ``str()`` before sending.
        bedrock_client: Object exposing ``invoke_model(body=..., modelId=...,
            accept=..., contentType=...)``, e.g. a ``bedrock-runtime`` client.
        model_id: Embedding model to invoke. Defaults to the same Titan model
            id this notebook configures for ``BedrockEmbeddings``, so the
            sample vector comes from the model that also builds the index
            (the function previously hard-coded a different id).

    Returns:
        The value of the ``"embedding"`` field of the JSON response, or
        ``None`` when the response carries no such field.
    """
    body = json.dumps({"inputText": str(text)})

    response = bedrock_client.invoke_model(
        body=body,
        modelId=model_id,
        accept="application/json",
        contentType="application/json",
    )
    # The response body is a stream; read it fully before decoding the JSON.
    response_body = json.loads(response["body"].read())

    return response_body.get("embedding")

In [None]:
# Embed the second chunk through the raw API helper.
sample_embedding = create_embedding_bedrock(docs[1].page_content, bedrock)

# Show the vector size plus its first and last three components.
embedding_preview = sample_embedding[:3] + ["..."] + sample_embedding[-3:]
print(f"The embedding vector has {len(sample_embedding)} values\n{embedding_preview}")

In [None]:
# Display the chunk whose `page_content` was embedded in the previous cell,
# so the text can be compared with its vector.
docs[1]

### 3 - Building an in-memory vector store from the document chunks

In [None]:
from langchain.vectorstores import FAISS
from langchain.indexes import VectorstoreIndexCreator
from langchain.indexes.vectorstore import VectorStoreIndexWrapper

# Build the FAISS store from the split chunks, using the Bedrock embedding model.
vectorstore_faiss = FAISS.from_documents(documents=docs, embedding=bedrock_embeddings)

# Wrap the store in the index wrapper whose `query()` is used to ask questions.
wrapper_store_faiss = VectorStoreIndexWrapper(vectorstore=vectorstore_faiss)

### Asking questions and getting answers from the LLM (Large Language Model)

In [None]:
# Question, in Portuguese: "Who is the holder (data subject) of a piece of data?
# Show me the reference in the context."
question = "Quem é o titular de um dado? Mostre-me a referência no contexto"
# Ask through the FAISS index wrapper, passing the Bedrock LLM to use.
# NOTE(review): presumably query() retrieves the most similar chunks and gives
# them to the LLM as context — confirm against the LangChain version in use.
answer = wrapper_store_faiss.query(question=question, llm=llm)
print(answer)