# RAG with Llama 2 and Pgvector
#### using Sagemaker Jumpstart Foundation models and RDS w/ Pgvector
---

In this lab we'll use the previously created and populated vector store in RDS Postgres as well as the previously deployed Llama 2 endpoint to demonstrate Retrieval Augmented Generation for Q&A on the SEC document data embedded in the previous lab. 

---
##### Lab Agenda:
1. Setup dependencies
2. Connect to the vector store
3. Connect to the Llama 2 endpoint and test model interaction
4. Set up the Text Embedding model to embed the questions
5. Bring it all together

---
### 1. Setup dependencies
Check the Python version and import environment settings


In [None]:
!python -V #should be 3.10.x

In [None]:

!pip install --upgrade pip --quiet
!pip install --upgrade psycopg2-binary # python 3.8
!pip install --upgrade pgvector --quiet
!pip install --upgrade tiktoken --quiet
!pip install --upgrade langchain --quiet
!pip install --upgrade sagemaker --quiet
!pip install --upgrade beautifulsoup4 --quiet

imports and settings

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys, json
sys.path.append("libs")

from sagemaker.predictor import Predictor
from sagemaker.session import Session
from sagemaker.jumpstart.notebook_utils import list_jumpstart_models
from bs4 import BeautifulSoup

from sagemaker_embeddings_model import SagemakerEmbeddingsModel
# from sagemaker_textgen_model import SagemakerLlama2TextGenModel
from pgclient import PgClient
# from doc_to_vex import DocToVex
import sagemaker_utils


# Notebook-wide settings shared by the cells below.
# TIKTOKEN_ENCODING = 'p50k_base'
# Placeholder endpoint names: replace them with the names of your deployed
# SageMaker endpoints before running the cells that use them.
TEXT_EMBEDDING_ENDPOINT = "### SET TEXT EMBEDDING ENDPOINT ###"
TEXT_GENERATION_ENDPOINT = "### SET TEXT GENERATION ENDPOINT ###"
# EMBEDDING_VECTOR_SIZE = 384 # this depends on the model used to do the embeddings
# RDS_CRED_SECRET_ID = "genai/rds/pgvector-pub1"
# NOTE(review): REGION is not referenced by any cell visible in this notebook - confirm it is still needed.
REGION = "us-east-1"
# JSON file whose parsed contents are handed to PgClient when connecting to the vector store.
DB_SETTINGS_FILE = "dbsettings.json"
# Name of the vector store table used by the record-count check.
TABLE_NAME = "embeddings"

---
### 2. Connect to the vector store


In [None]:

# Read the database settings from the JSON file named by DB_SETTINGS_FILE.
with open( DB_SETTINGS_FILE, 'r', encoding='utf-8') as f:
    content = f.read()
    dbsettings = json.loads(content)

# Set up the database client instance and connect.
# `db` is reused by later cells to run queries against the vector store.
db = PgClient(dbsettings)
db.connect() # hardcoded to 'postgres' database for this demo

Verify the connection and check for data

In [None]:
# Fetch the id of every row in TABLE_NAME and count the rows client-side;
# a non-zero count indicates the table already holds data.
rez = db.query(f"select id from {TABLE_NAME}")
print("records: ", len(rez))

---
### 3. Connect to the Llama 2 endpoint and test model interaction

We've already deployed the model endpoints. Let's verify by listing the currently deployed SageMaker endpoints and getting the full name of the Llama 2 endpoint.

In [None]:
# List the deployed endpoints so their full names can be copied into the endpoint variables.
# NOTE(review): list_endpoints is a project helper; its exact output format is not visible here.
sagemaker_utils.list_endpoints()

---
Set our endpoint vars and set up the Llama 2 predictor class

In [None]:
# Replace both placeholders with the full endpoint names before running the cells below:
# TEXT_GENERATION_ENDPOINT is used to build the Llama 2 predictor and
# TEXT_EMBEDDING_ENDPOINT is used to build the embedder.
TEXT_EMBEDDING_ENDPOINT = "### SET TEXT EMBEDDING ENDPOINT ###"
TEXT_GENERATION_ENDPOINT = "### SET TEXT GENERATION ENDPOINT ###"

In [None]:
# Set up the SageMaker session; it is shared by the predictor below and by the embedder cell.
session = Session()

# Instantiate the predictor bound to the text generation endpoint.
# `llama2` is read as a global by query_endpoint.
llama2 = Predictor(endpoint_name = TEXT_GENERATION_ENDPOINT, sagemaker_session = session)

# define the query endpoint
def query_endpoint(instruction, context_prompt, context, question, max_new_tokens=512):
    """Send a chat-style prompt to the ``llama2`` predictor and return the reply text.

    Each non-None argument becomes one chat message, in this fixed order:
    ``instruction`` (system), ``context_prompt`` (user), ``context``
    (assistant), ``question`` (user). Arguments that are None are left out.

    Args:
        instruction: System message content, or None to omit it.
        context_prompt: First user message content, or None to omit it.
        context: Assistant message content, or None to omit it.
        question: Final user message content, or None to omit it.
        max_new_tokens: Value sent as the ``max_new_tokens`` generation parameter.

    Returns:
        The ``['generation']['content']`` field of the first element of the
        JSON-decoded endpoint response.
    """
    # (role, content) pairs in the order they are sent; None contents are dropped.
    candidate_turns = (
        ("system", instruction),
        ("user", context_prompt),
        ("assistant", context),
        ("user", question),
    )
    dialog = [
        {"role": role, "content": text}
        for role, text in candidate_turns
        if text is not None
    ]

    # The endpoint receives a list of dialogs; this function sends exactly one.
    request_body = json.dumps(
        {
            "inputs": [dialog],
            "parameters": {
                "max_new_tokens": max_new_tokens,
                "top_p": 0.9,
                "temperature": 0.6,
            },
        }
    ).encode("utf-8")

    invoke_args = {
        "ContentType": "application/json",
        "Accept": "application/json",
        "CustomAttributes": "accept_eula=true",  # for Meta models
    }
    raw_response = llama2.predict(request_body, invoke_args)

    first_dialog_reply = json.loads(raw_response)[0]
    return first_dialog_reply["generation"]["content"]


Test the model with some basic prompts

In [None]:
%%time
 
# System message steering the style of the answer; swap in one of the alternatives below to experiment.
instruction = "Answer in the form of a Haiku" 
# instruction = "Answer in the form of a poem" 
# instruction = "You are a travel advisor assistant" 
# instruction = "You are a high school science teacher"

# No retrieved context yet: with both values None, query_endpoint sends only
# the system message and the user question.
context_prompt = None

context = None

question = "What is the best pizza in New York City"
# question = "What is the difference between nuclear fusion and nuclear fission"

result = query_endpoint(instruction, context_prompt, context, question)
# query_endpoint already decodes the JSON response and returns the generated text, so no json.loads is needed here.

print("generated text:\n", result, "\n")


---
### 4. Connect to the Text Embedding model to do vector queries


In [None]:
# Instantiate the embedder with the previously deployed text embedding endpoint,
# reusing the SageMaker session created for the Llama 2 predictor.
embedder = SagemakerEmbeddingsModel(TEXT_EMBEDDING_ENDPOINT, session)

# Smoke test: embed a short string and print the length of the result
# (presumably the embedding dimension - confirm against the deployed model).
vec = embedder.query_endpoint("Did it work?")
print(len(vec))

---
### 5. Bring it all together 

Now that we have an embedder to vectorize our questions and the Llama 2 endpoint to generate nice responses, we'll add our similarity query responses from the vector store to the LLM prompt to answer questions about the document 


In [None]:
# define some helper functions

def similarity_search(question, vec_results=3):
    """Return the stored rows whose embeddings are closest to *question*.

    The question is embedded with the module-level ``embedder`` and the
    module-level ``db`` client is asked for the nearest rows of
    ``TABLE_NAME``, ordered by ascending L2 distance.

    Args:
        question: Text to embed and search for.
        vec_results: Maximum number of rows to return (SQL ``LIMIT``).
            Defaults to 3.

    Returns:
        Whatever ``db.query`` returns for the statement; the selected columns
        are, in order, ``id``, ``source``, ``content`` and ``distance``.
    """
    print("embed the question")
    question_embedding = embedder.query_endpoint(question)

    # Coerce to int so the LIMIT clause can only ever receive a number.
    row_limit = int(vec_results)

    print("query the vector store")
    # pgvector distance operators:
    #   <-> l2 distance
    #   <=> cosine distance
    #   <#> inner product
    # Note: <#> returns the negative inner product since Postgres only supports ASC order index scans on operators
    #
    # NOTE(review): the statement is assembled with str.format because the
    # parameter-binding support of PgClient.query is not visible here. The
    # interpolated values are the model-produced embedding, the TABLE_NAME
    # constant and an int, none of which is free-form user text.
    query = """SELECT id, source, content, 
                descriptions_embeddings <-> '{vec}' as distance
                FROM {table} 
                ORDER BY descriptions_embeddings <-> '{vec}' limit {limit};""".format(
        vec=question_embedding,
        table=TABLE_NAME,
        limit=row_limit,
    )

    return db.query(query)


def build_context_string(query_result):
    """Join the third column of every row into one newline-terminated string.

    Args:
        query_result: Iterable of rows (indexable sequences) whose element at
            index 2 is a string, e.g. the ``content`` column selected by
            ``similarity_search``.

    Returns:
        Each row's text followed by ``"\\n"``, concatenated in row order;
        an empty string when there are no rows.
    """
    print("parse the results and prompt the LLM")
    # str.join builds the result in one pass instead of repeated `+=` copies.
    return "".join(row[2] + "\n" for row in query_result)


def rag_query(question, context):
    """Ask the LLM *question* with *context* supplied as retrieved excerpts.

    Wraps ``query_endpoint`` with a fixed system instruction and a fixed
    lead-in user message, so that *context* is presented as the reply to
    that lead-in before the actual question is asked.

    Args:
        question: The user's question.
        context: Text passed to ``query_endpoint`` as its ``context`` argument.

    Returns:
        The value returned by ``query_endpoint``.
    """
    system_instruction = "You are a helpful assistant that is good at giving succint answers to questions. If you don't know an answer, say Don't Know"
    excerpt_request = "What are some excerpts from a Company's annual meeting?"
    return query_endpoint(
        system_instruction,
        excerpt_request,
        context,
        question,
    )
    
    

End to end RAG querying

In [None]:
# Pick the question to ask about the embedded document; uncomment an alternative to try it.
question = "Who is the Chief Executive Officer?"
# question = "Who are the Board of Directors?"
# question = "How many directors does the company have?"
# question = "How much do the board members make?"
# question = "What is the max number of directors?"


# 1) Embed the question and fetch the nearest rows from the vector store.
search_results = similarity_search(question)

# 2) Concatenate the text of the retrieved rows into a single context string.
context = build_context_string(search_results)

# 3) Ask the LLM the question, supplying the retrieved context.
result = rag_query(question, context)

# Show the question, the generated answer, and the context it was grounded on.
print("\nquestion: \n", question, "\n")
print("\ngenerated text: \n", result, "\n")
print("\nsource: \n", context)