<a href="https://colab.research.google.com/github/rodiwaa/learnings-pocs/blob/main/notebooks/yt_rag_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# YT RAG System
Learning RAG components from the excellent CampusX Playlist on YT.

Scope
- RAG
  - Doc Loaders
  - Text Splitters
  - Vector Stores
  - Retrievers
- Advanced RAG (Future)
  - UI (streamlit, chainlit, gradio, react)
  - Evaluations
    - Ragas (4 metrics)
    - Langsmith (traces, tags)
  - Indexing
  - Retrieval
    - Pre-retrieval
      - LLM Q Rewrite
      - Multi query
      - Domain Aware Routing
    - During retrieval (search strategy)
      - MMR
      - Hybrid (Semantic, BM25, Keyword)
      - Reranking (algo, LLMs)
    - Post-retrieval
      - contextual compression
  - Augmentation
    - Prompt templating
    - Grounding (use context only, else say IDK)
    - Context window optimisation
  - Generation
    - Citations
    - Guardrails
  - System Design
    - Multimodal
    - Agentic (web search, routers)
    - Memory-based (context from previous conversations)



In [None]:
!pip install langchain langgraph langsmith langchain.community wikipedia langchain_openai chromadb python-dotenv

## Ingestion Module

In [None]:
from google.colab import drive
from dotenv import load_dotenv
import os

MOUNT_PATH = "/content/drive"
ENV_PATH = f"{MOUNT_PATH}/MyDrive/Projects/.env/.env"

# Mount Google Drive so the external .env file (creds, API keys) is reachable from the Colab VM.
# drive.mount(MOUNT_PATH, force_remount=True) # when .env is updated
drive.mount(MOUNT_PATH)

print(ENV_PATH)

# Fail fast with a clear message: without this check a missing .env is silently
# ignored and the problem only shows up later when a client looks for its key.
if not os.path.isfile(ENV_PATH):
    raise FileNotFoundError(f".env file not found at {ENV_PATH}")

# Export the variables from the Drive-hosted .env into os.environ.
load_dotenv(dotenv_path=ENV_PATH)


In [None]:
from langchain_core.documents import Document

# Hand-written sample corpus, keyed by the id stored in each document's "source" metadata.
_SAMPLE_TEXTS = {
    "H1": "Regular walking boosts heart health and can reduce symptoms of depression.",
    "H2": "Consuming leafy greens and fruits helps detox the body and improve longevity.",
    "H3": "Deep sleep is crucial for cellular repair and emotional regulation.",
    "H4": "Mindfulness and controlled breathing lower cortisol and improve mental clarity.",
    "H5": "Drinking sufficient water throughout the day helps maintain metabolism and energy.",
    "I1": "The solar energy system in modern homes helps balance electricity demand.",
    "I2": "Python balances readability with power, making it a popular system design language.",
    "I3": "Photosynthesis enables plants to produce energy by converting sunlight.",
    "I4": "The 2022 FIFA World Cup was held in Qatar and drew global energy and excitement.",
    "I5": "Black holes bend spacetime and store immense gravitational energy.",
}

# One Document per entry, in insertion order.
all_docs = [
    Document(page_content=text, metadata={"source": source_id})
    for source_id, text in _SAMPLE_TEXTS.items()
]
print(f"{len(all_docs)} docs added")

## Create Vector Store with Documents

In [None]:
# from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Embedding model used to vectorise both the documents and, later, the queries.
embedding_model = OpenAIEmbeddings()

# Stable ids (taken from each document's "source" metadata) keep this cell
# idempotent: re-running it in the same kernel overwrites the existing entries
# in "temp-documents" instead of appending a second copy of every document.
vectorstore = Chroma.from_documents(
    documents=all_docs,
    embedding=embedding_model,
    ids=[doc.metadata["source"] for doc in all_docs],
    collection_name="temp-documents",
)
print("vs created")

## Retrievers

In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain_openai import ChatOpenAI

# Number of documents each retriever should return. It must be passed through
# `search_kwargs`; a bare `k=` keyword on `as_retriever` does not reach the
# underlying search, so the default result count would be used instead.
TOP_K = 2

# Baseline: plain similarity search.
retriever_basic = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": TOP_K},
)
print("retriever_basic created")

# Maximal Marginal Relevance: trades off relevance against diversity of results.
retriever_mmr = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k": TOP_K},
)
print("retriever_mmr created")


llm = ChatOpenAI(model="gpt-3.5-turbo")

# Retriever that feeds the contextual-compression step below.
base_retriever = vectorstore.as_retriever(
    search_kwargs={"k": TOP_K},
)

# LLM-backed compressor applied to the documents returned by `base_retriever`.
base_compressor = LLMChainExtractor.from_llm(llm=llm)

retriever_compress = ContextualCompressionRetriever(
    base_retriever=base_retriever,
    base_compressor=base_compressor,
)



### Execute retrievers and get docs

In [None]:
# Test vector store retrieval: run one query through every retriever and
# print the page content each of them returns.

query = "how to boost heart health"
docs_basic = retriever_basic.invoke(query)
docs_mmr = retriever_mmr.invoke(query)
docs_compress = retriever_compress.invoke(query)

# One loop instead of three copy-pasted print sections; the output order
# (basic, compress, mmr) is unchanged.
for label, docs in (
    ("docs_basic", docs_basic),
    ("docs_compress", docs_compress),
    ("docs_mmr", docs_mmr),
):
    print(f"\n{label}")
    print("*" * 20)
    for doc in docs:
        print(doc.page_content)

# FIXME: better search needed, this is printing all docs