In [1]:
import sys
from pathlib import Path

# 1. Define the directory *containing* the all_rag_techniques package
# Get the directory of the current notebook/script (__file__ might not work in some notebooks)
# Assuming the notebook is inside all_rag_techniques/
current_dir = Path.cwd() 

# The directory containing 'all_rag_techniques' is the parent directory
project_root = current_dir.parent 

# 2. Add this root to the system path
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))
    print(f"Added project root to path: {project_root}")
else:
    print("Project root already in path.")

# 3. Now the import should work
try:
    from all_rag_techniques import setup_environment, check_keys
    print("✅ Package imported successfully!")
    setup_environment()
    check_keys()
except Exception as e:
    print(f"❌ Final import failed: {e}")

Added project root to path: /Users/ruhwang/Desktop/AI/my_projects/context-engineering/advanced-rag
✅ Package imported successfully!
LANGCHAIN_API_KEY not set (empty in .env file)
Environment setup complete!
=== API Keys from config.py ===
  GROQ_API_KEY: Loaded
  COHERE_API_KEY: Loaded
  OPENAI_API_KEY: Loaded
  LANGCHAIN_API_KEY: Missing

=== Environment Variables ===
  os.environ['GROQ_API_KEY']: Set
  os.environ['COHERE_API_KEY']: Set

All essential keys loaded!


In [24]:
from langchain.retrievers import BM25Retriever, SVMRetriever, TFIDFRetriever, EmbedchainRetriever, MultiQueryRetriever
from pydantic import BaseModel, Field
from langchain.schema import Document

from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain_core.retrievers import BaseRetriever

from langchain.vectorstores import Chroma
from langchain.vectorstores import FAISS

from langchain.embeddings.openai import OpenAIEmbeddings

In [6]:
from typing import List, Dict, Any, Tuple
from helper_functions import *
from evaluation.evalute_rag import *

In [7]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-small", dimensions=512)
path = "/Users/ruhwang/Desktop/AI/my_projects/context-engineering/advanced-rag/data/Understanding_Climate_Change.pdf"
vectorstore = encode_pdf(path)

In [8]:
class RatingScore(BaseModel):
    relevance_score: float = Field(..., description="The relevance score of a document to a query.")

def rerank_documents(query: str, docs: List[Document], top_n: int = 3) -> List[Document]:
    prompt_template = PromptTemplate(
        input_variables=["query", "doc"],
        template="""On a scale of 1-10, rate the relevance of the following document to the query. 
        Consider the specific context and intent of the query, not just keyword matches.
        Query: {query}
        Document: {doc}
        Relevance Score:"""
    )
    
    llm = ChatOpenAI(temperature=0.25, model_name="gpt-4o-mini", max_tokens=4000)
    llm_chain = prompt_template | llm.with_structured_output(RatingScore)
    
    scored_docs = []
    for doc in docs:
        input_data = {"query": query, "doc": doc.page_content}
        score = llm_chain.invoke(input_data).relevance_score
        try:
            score = float(score)
        except ValueError:
            score = 0  # Default score if parsing fails
        scored_docs.append((doc, score))
    
    reranked_docs = sorted(scored_docs, key=lambda x: x[1], reverse=True)
    return [doc for doc, _ in reranked_docs[:top_n]]

BM25Retriever must be initialized with the documents it needs to search over. Unlike a vector store that you might pass in, a BM25Retriever is typically built from a collection of documents.

In [15]:
chunk_size = 250
chunk_overlap = 20

# Load PDF and create documents
loader = PyPDFLoader(path)
documents = loader.load()

# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len
)
texts = text_splitter.split_documents(documents)
cleaned_texts = replace_t_with_space(texts)

# Create embeddings and vector store
embeddings = OpenAIEmbeddings(model="text-embedding-3-small", dimensions=512)
vectorstore = FAISS.from_documents(cleaned_texts, embeddings)

In [19]:
# 2. Instantiate the BM25Retriever using the class method 'from_documents'
bm25_retriever = BM25Retriever.from_documents(documents=documents)
svm_retriever = SVMRetriever.from_documents(
    documents=documents,
    embeddings=embeddings 
)

In [22]:
tfidf_retriever = TFIDFRetriever.from_documents(
    documents=documents
)

In [26]:
# Use the same LLM you're using for your QA chain (or a faster one)
llm_for_query_gen = ChatOpenAI(temperature=0.2, model_name="gpt-4o-mini") 

# Use vector store retriever as the base
base_retriever = vectorstore.as_retriever()

# 2. Create the Multi-Query Retriever
multiquery_retriever = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=llm_for_query_gen
)

In [27]:
# Create a custom retriever class
class CustomEnsembleRetriever(BaseRetriever, BaseModel):
    
    vectorstore: Any = Field(description="Vector store for initial retrieval")
    retrievers: List[BaseRetriever] = Field(description="List of retrievers to ensemble")
    weights: List[float] = Field(description="Weights for each retriever in the ensemble")

    def get_relevant_documents(self, query: str, num_docs=2) -> List[Document]:
        initial_docs = self.vectorstore.similarity_search(query, k=30)
        return rerank_documents(query, initial_docs, top_n=num_docs)

# Create the custom retriever
ensemble_retriever = CustomEnsembleRetriever(vectorstore=vectorstore,
                                             retrievers=[bm25_retriever, svm_retriever, tfidf_retriever, multiquery_retriever],
                                             weights=[0.25, 0.25, 0.25, 0.25])

# Create an LLM for answering questions
llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")

# Create the RetrievalQA chain with the custom retriever
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=ensemble_retriever,
    return_source_documents=True
)

  class CustomEnsembleRetriever(BaseRetriever, BaseModel):


In [28]:
query = "How does climate change affect marine ecosystems?"
result = qa_chain({"query": query})

print(f"\nQuestion: {query}")
print(f"Answer: {result['result']}")
print("\nRelevant source documents:")
for i, doc in enumerate(result["source_documents"]):
    print(f"\nDocument {i+1}:")
    print(doc.page_content[:200] + "...")  # Print first 200 characters of each document

  result = qa_chain({"query": query})



Question: How does climate change affect marine ecosystems?
Answer: Climate change affects marine ecosystems in several ways, including rising sea temperatures, ocean acidification, and changing currents. These changes can impact marine biodiversity, affecting everything from coral reefs to deep-sea fisheries. For example, ocean acidification disrupts the health and survival of various marine species, which in turn can disrupt food webs. Protecting and restoring coral reefs is essential for marine conservation in the face of these challenges.

Relevant source documents:

Document 1:
Marine Ecosystems 
Marine ecosystems are highly vulnerable to climate change. Rising sea temperatures, ocean 
acidification, and changing currents affect marine biodiversity, from coral reefs to deep-...

Document 2:
fisheries. Protecting and restoring coral reefs is essential for marine conservation. 
Marine Ecosystems 
Acidification affects the health and survival of various marine species, disrupting fo