# RAG Sandbox

Example code showing how to use LangChain for retrieval-augmented generation (RAG) with pgvector.

* https://python.langchain.com/docs/integrations/vectorstores/pgvector/
* https://github.com/langchain-ai/langchain-postgres/blob/main/examples/pg_vectorstore.ipynb

In [1]:
from dotenv import load_dotenv
import os

# Read settings from the .env.local file two directories up; the path is
# relative, so it is resolved against the kernel's working directory.
load_dotenv("../../.env.local")

# Traces are grouped by project name -- change this to your own name so
# your runs stay separate from everyone else's.
os.environ["LANGCHAIN_PROJECT"] = "steve-fap-sandbox"

# RAG Example

Demonstration of Postgres pgvector for document embedding and retrieval.

In [2]:
from langchain_openai import OpenAIEmbeddings
from langchain_postgres import PGVector

def _with_psycopg_driver(uri: str) -> str:
    """Return ``uri`` with its scheme set to ``postgresql+psycopg``.

    Only the scheme (the text before ``://``) is rewritten, so a user name,
    password, host or database name that happens to contain "postgresql" is
    left alone. ``postgres://``, ``postgresql://`` and any
    ``postgresql+<driver>://`` form are all normalised to the psycopg driver.
    A string without ``://`` or with a non-Postgres scheme is returned unchanged.
    """
    scheme, sep, rest = uri.partition("://")
    # Compare on the dialect alone so an existing "+driver" suffix is replaced
    # rather than appended to.
    if sep and scheme.split("+", 1)[0] in {"postgres", "postgresql"}:
        return f"postgresql+psycopg{sep}{rest}"
    return uri


# Connection string comes from the environment (loaded from .env.local above).
DB_URI = os.environ["DB_DEV_CONNECTION"]
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Vector store backed by Postgres; documents live in the "my_docs" collection.
vector_store = PGVector(
    embeddings=embeddings,
    collection_name="my_docs",
    connection=_with_psycopg_driver(DB_URI),
)

In [3]:
from langchain_core.documents import Document

# Ten one-sentence sample documents. Each row below is (text, location, topic);
# the integer "id" stored in the metadata is the row's 1-based position.
docs = [
    Document(
        page_content=text,
        metadata={"id": doc_id, "location": location, "topic": topic},
    )
    for doc_id, (text, location, topic) in enumerate(
        [
            ("there are cats in the pond", "pond", "animals"),
            ("ducks are also found in the pond", "pond", "animals"),
            ("fresh apples are available at the market", "market", "food"),
            ("the market also sells fresh oranges", "market", "food"),
            ("the new art exhibit is fascinating", "museum", "art"),
            ("a sculpture exhibit is also at the museum", "museum", "art"),
            ("a new coffee shop opened on Main Street", "Main Street", "food"),
            ("the book club meets at the library", "library", "reading"),
            ("the library hosts a weekly story time for kids", "library", "reading"),
            (
                "a cooking class for beginners is offered at the community center",
                "community center",
                "classes",
            ),
        ],
        start=1,
    )
]

# Store the documents, passing each one's metadata "id" as its entry in `ids`.
# This is the cell's last expression, so the call's return value is displayed.
vector_store.add_documents(
    docs,
    ids=[document.metadata["id"] for document in docs],
)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [None]:
# Filtered similarity search: the "$in" operator keeps only documents whose
# metadata "id" is one of the listed values; k caps the number of hits returned.
results = vector_store.similarity_search(
    query="kitty",
    k=10,
    filter={"id": {"$in": [1, 5, 2, 9]}},
)
for doc in results:
    print(f"* {doc.page_content} [{doc.metadata}]")

* there are cats in the pond [{'id': 1, 'topic': 'animals', 'location': 'pond'}]
* the library hosts a weekly story time for kids [{'id': 9, 'topic': 'reading', 'location': 'library'}]
* ducks are also found in the pond [{'id': 2, 'topic': 'animals', 'location': 'pond'}]
* the new art exhibit is fascinating [{'id': 5, 'topic': 'art', 'location': 'museum'}]


In [None]:
# Two conditions in one filter dict are AND'ed together: a document must match
# both the id list and the location list to be returned.
vector_store.similarity_search(
    query="ducks",
    k=10,
    filter={
        "id": {"$in": [1, 5, 2, 9]},
        "location": {"$in": ["pond", "market"]},
    },
)

[Document(id='2', metadata={'id': 2, 'topic': 'animals', 'location': 'pond'}, page_content='ducks are also found in the pond'),
 Document(id='1', metadata={'id': 1, 'topic': 'animals', 'location': 'pond'}, page_content='there are cats in the pond')]

In [None]:
# Similarity search that also returns the score computed for each hit.
# NOTE(review): for PGVector this value may be a distance (lower = closer)
# rather than a similarity -- confirm before relying on the "SIM" label.
results = vector_store.similarity_search_with_score(query="cats", k=1)
for doc, score in results:
    # ".3f" formats the score with three decimal places. A bare "3f" would only
    # set a minimum field width of 3 and keep the default six decimals.
    print(f"* [SIM={score:.3f}] {doc.page_content} [{doc.metadata}]")

* [SIM=0.554739] there are cats in the pond [{'id': 1, 'topic': 'animals', 'location': 'pond'}]


In [15]:
# LangChain components usually consume a retriever rather than a vector store
# directly; as_retriever() wraps the store behind that interface.
# search_type="mmr" selects maximal-marginal-relevance search; k=1 asks for one hit.
retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 1},
)
retriever.invoke("kitty")

[Document(id='1', metadata={'id': 1, 'topic': 'animals', 'location': 'pond'}, page_content='there are cats in the pond')]

# RAG

Now use the vector store within an agent to answer questions.

In [25]:
from langgraph.graph import StateGraph, START, END
from langgraph.graph.message import add_messages
from typing import Annotated, TypedDict
from langchain_openai import ChatOpenAI


# Retriever used by the agent: MMR search with k=3.
# This rebinds the `retriever` name used by the k=1 demo earlier in the notebook.
retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 3},
)


class State(TypedDict):
    """State passed between graph nodes: the list of conversation messages."""

    # `add_messages` is attached as the reducer for this key, so what a node
    # returns under "messages" is combined with the existing list by that function.
    messages: Annotated[list, add_messages]


# Chat model the agent calls to produce its reply.
llm = ChatOpenAI(model="gpt-4o-mini")

def agent(state: State):
    """Retrieve documents for the latest message and have the LLM summarize them.

    Takes the content of the last entry in ``state["messages"]`` as the
    retriever query, joins the retrieved documents' text with newlines, and
    asks the model to summarize it.

    Returns:
        A dict with a one-element ``"messages"`` list holding the model's reply.

    NOTE(review): the question is used only for retrieval; the prompt sent to
    the model contains the retrieved text but not the question itself.
    """
    question = state["messages"][-1].content
    retrieved = retriever.invoke(question)
    corpus = "\n".join(item.page_content for item in retrieved)
    prompt = "Summarize the following documents: " + corpus
    reply = llm.invoke(prompt)
    return {"messages": [reply]}

# Build a one-node graph over `State`: START -> "agent" -> END.
graph = StateGraph(State)
graph.add_node("agent", agent)
graph.add_edge(START, "agent")
graph.add_edge("agent", END)
# Compile the graph into a runnable app and invoke it once with a single
# input message; as the cell's last expression, the result is displayed.
app = graph.compile()
app.invoke({"messages": ["testing"]})


{'messages': [HumanMessage(content='testing', additional_kwargs={}, response_metadata={}, id='3e892564-99a8-46e7-a8bc-8c06e07c5233'),
  AIMessage(content='1. The new art exhibit is captivating and engaging for visitors.\n2. A cooking class designed for beginners is available at the community center.\n3. Ducks inhabit the pond, adding to its natural appeal.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 42, 'prompt_tokens': 41, 'total_tokens': 83, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_dbaca60df0', 'id': 'chatcmpl-BTGeK04aHnaMmkCVel4WbDYIvXkn0', 'finish_reason': 'stop', 'logprobs': None}, id='run-3fb63f42-f103-4591-8d66-eb2038f3b8f4-0', usage_metadata={'input_tokens': 41, 'output_tokens': 42, 'total_tokens': 83