# 3.1 Advanced RAG intro - query analysis and meta-data


## Setup

### Install dependencies

In [None]:
%pip install python-dotenv~=1.0 docarray~=0.40.0 pypdf~=5.1 --upgrade --quiet
%pip install chromadb~=0.5.18 sentence-transformers~=3.3 --upgrade --quiet 
%pip install langchain~=0.3.7 langchain_openai~=0.2.6 langchain_community~=0.3.5 langchain-chroma~=0.1.4 langchainhub~=0.1.21 --upgrade --quiet

# If running locally, you can do this instead:
#%pip install -r ../requirements.txt

### Load environment variables

In [None]:
import os
from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load its entries into the process environment.
# The result is assigned to `_` so the cell does not echo the return value.
_ = load_dotenv(find_dotenv())

# If running in Google Colab, you can use this code instead:
# from google.colab import userdata
# os.environ["AZURE_OPENAI_API_KEY"] = userdata.get("AZURE_OPENAI_API_KEY")
# os.environ["AZURE_OPENAI_ENDPOINT"] = userdata.get("AZURE_OPENAI_ENDPOINT")

### Setup models

In [None]:
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings

# One Azure OpenAI API version shared by the chat model and the embedding model.
api_version = "2024-10-01-preview"

# Chat model used for query analysis and for the self-query retriever.
llm = AzureChatOpenAI(
    deployment_name="gpt-4o",
    temperature=0.0,
    openai_api_version=api_version,
)

# Embedding model used when building and querying the vector DB.
embedding_model = AzureOpenAIEmbeddings(
    model="text-embedding-3-large",
    openai_api_version=api_version,
)

### Setup LangSmith tracing for this notebook

In [None]:
import os

# API key etc is in the .env file
# my_name = "Totoro"
# os.environ["LANGCHAIN_TRACING_V2"] = "true"
# os.environ["LANGCHAIN_PROJECT"] = f"tokyo24-test-{my_name}"

### Setup path to data 

In [None]:
# Folder holding the lecture PDFs, relative to this notebook.
# The loader paths below are built from this value; the metadata filter used
# later matches on the same "<data_path>/<file>.pdf" form in the `source` field.
data_path = "../data"

### Let's set up our vector DB as before
Load the ML sample docs and set up the vector DB

In [None]:
# Load PDFs
# Import from the split-out packages installed at the top of the notebook; the
# `langchain.document_loaders`, `langchain.text_splitter` and
# `langchain.vectorstores` paths are deprecated re-export shims.
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma

# NOTE(review): Lecture01 is listed twice and Lecture02 is not loaded at all.
# Left unchanged in case the duplicate is deliberate "messy data" - confirm.
loaders = [
    PyPDFLoader(f"{data_path}/MachineLearning-Lecture01.pdf"),
    PyPDFLoader(f"{data_path}/MachineLearning-Lecture01.pdf"),
    PyPDFLoader(f"{data_path}/MachineLearning-Lecture03.pdf")
]
# Flatten the documents returned by every loader into one list.
docs = [page for loader in loaders for page in loader.load()]

# Split into overlapping chunks so each embedded piece stays reasonably small.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)
splits = text_splitter.split_documents(docs)

# Setup vector DB
persist_directory = './db/chroma-ML-docs/'
vectordb = Chroma.from_documents(
    collection_name="ml_docs",
    documents=splits,
    embedding=embedding_model,
    #persist_directory=persist_directory # Optionally persist the database
)

# Sanity check: number of chunks stored in the collection.
print(vectordb._collection.count())

## Query analysis - understanding what the user is asking for

### Let's start by setting up a simple model
The model just holds a simple flag, indicating whether a question is related to the topics of the documents in the database.

In [None]:
from pydantic import BaseModel, Field

# Schema for the structured output of the query-analysis step.
# NOTE(review): the class docstring and the Field description are presumably
# forwarded to the LLM as part of the output schema when this class is passed to
# `with_structured_output`, so treat them as prompt text - verify before rewording.
class QueryAnalysis(BaseModel):
    """Binary score for relevance of the user's question to knowledge base topics."""

    # Single boolean flag: is the question related to the knowledge-base topics?
    question_relevant_to_topics: bool = Field(description="User question is related to the topics in the knowledge base, 'true' or 'false'")

## We need a matching prompt

In [None]:
from langchain_core.prompts import ChatPromptTemplate

# System instructions; the {document_topics} placeholder is filled in at invoke time.
system = """You are an expert determining if a user question is related to topics of data stored in a knowledge base.
    If the question contains keyword(s) or semantic meaning related to the topics, mark it as relevant to the topics. \n
    Give a binary value of 'true' or 'false' to indicate whether the question is relevant. \n\n 
    The topics of the data stored in the knowledge base are: \n{document_topics}"""

# Two-message template: the system instructions followed by the user's {question}.
analysis_prompt = ChatPromptTemplate.from_messages(
    [("system", system), ("human", "{question}")]
)

### Now we can construct a chain that uses structured output

In [None]:
# Your task: create the chain for query analysis (solution filled in below).
# The prompt is piped into an LLM wrapped to return a QueryAnalysis object.
structured_llm = llm.with_structured_output(QueryAnalysis)
query_analysis_chain = analysis_prompt | structured_llm

In [None]:
# Topics handed to the system prompt as {document_topics}.
topics = "Machine Learning, Math, Computer Science, CS229, Andrew Ng"

# Question to classify; the commented-out line is an alternative to try.
question = "what did they say about matlab?"
#question = "what must I see in Tokyo?"

chain_inputs = {"document_topics": topics, "question": question}
response = query_analysis_chain.invoke(chain_inputs)
print(type(response))

In [None]:
# Annotate the chain result with its schema type, then report the relevance flag.
analysis_response: QueryAnalysis = response
is_relevant = analysis_response.question_relevant_to_topics
print(f"Relevant: {is_relevant}")

### Addressing Specificity: working with metadata

In the last lecture, we showed that a question about the third lecture can include results from other lectures as well.

To address this, many vectorstores support operations on `metadata`.

`metadata` provides context for each embedded chunk.

In [None]:
# Question that names a specific lecture; used for the metadata-filtered search below.
question = "what did they say about regression in the third lecture?"

In [None]:
# Restrict the search to chunks from the third lecture via a metadata filter.
# The `source` value is built from data_path so it stays in step with the paths
# given to the PDF loaders; a hard-coded path would silently match nothing if
# data_path were changed.
docs = vectordb.similarity_search(
    question,
    k=3,
    filter={"source": f"{data_path}/MachineLearning-Lecture03.pdf"},
)

In [None]:
# Show the metadata of every hit returned by the filtered search.
for hit in docs:
    print(hit.metadata)

### Addressing Specificity: working with metadata using self-query retriever

But we have an interesting challenge: we often want to infer the metadata from the query itself.

To address this, we can use `SelfQueryRetriever`, which uses an LLM to extract:
 
1. The `query` string to use for vector search
2. A metadata filter to pass in as well

Most vector databases support metadata filters, so this doesn't require any new databases or indexes.

In [None]:
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

In [None]:
from langchain.storage import InMemoryStore
# NOTE(review): `store` is not referenced by any other cell visible in this
# notebook - confirm whether it is still needed.
store = InMemoryStore()

# Description of the metadata fields the self-query retriever may filter on.
# The allowed `source` values are built from data_path so they stay in step with
# the paths given to the PDF loaders.
# NOTE(review): Lecture02 is advertised here but the loader cell does not load it.
metadata_field_info = [
    AttributeInfo(
        name="source",
        description=(
            "The lecture the chunk is from, should be one of "
            f"`{data_path}/MachineLearning-Lecture01.pdf`, "
            f"`{data_path}/MachineLearning-Lecture02.pdf`, or "
            f"`{data_path}/MachineLearning-Lecture03.pdf`"
        ),
        type="string",
    ),
    AttributeInfo(
        name="page",
        description="The page from the lecture",
        type="integer",
    ),
]

In [None]:
# Short description of what the stored chunks contain.
document_content_description = "Lecture notes"

# Build the self-query retriever from the LLM, the vector DB and the metadata
# field descriptions defined in the previous cell.
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vectordb,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
    verbose=True,
)

In [None]:
# Same lecture-specific question as before, this time without a hand-written filter.
question = "what did they say about regression in the third lecture?"
docs = retriever.invoke(question)

# To see what's happening under the hood, you can use a ConsoleCallbackHandler:
#from langchain.callbacks.tracers import ConsoleCallbackHandler
#docs = retriever.with_config({'callbacks': [ConsoleCallbackHandler()]}).invoke(question)

# Number of documents the retriever returned.
num_results = len(docs)
print(num_results)

In [None]:
# Show the metadata of every document the self-query retriever returned.
for hit in docs:
    print(hit.metadata)

<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>



#### Ok, some cheats

##### Query analysis chain:

In [None]:
# Solution for the query-analysis exercise: the same expression that is assigned
# to `query_analysis_chain` earlier in the notebook.
analysis_prompt | llm.with_structured_output(QueryAnalysis)