In [7]:
!pip install llama-index llama-index-llms-gemini llama-index-embeddings-huggingface transformers torch




In [8]:
import asyncio
import os
import json
from pathlib import Path
from typing import List, Any
import requests

# Libraries for RAG components
from llama_index.core import Document, VectorStoreIndex
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.retrievers import BaseRetriever
from llama_index.core.postprocessor import SentenceTransformerRerank

# --- 0. Mock Data Setup (Simulating Kaggle/Public Data) ---

# We simulate a corpus of financial reports/news articles, a great use case for RAG.
# This makes the notebook immediately runnable without requiring file downloads.
MOCK_DATA = [
    {"doc_id": "1", "text": "Google's Q3 2024 earnings report showed strong growth in Cloud services, which offset a slight dip in traditional Search ad revenue. AI integration was cited as a major driver across all segments."},
    {"doc_id": "2", "text": "The latest stock market volatility is primarily driven by interest rate decisions from the Federal Reserve. Analysts predict a continued focus on inflation control through Q1 2025."},
    {"doc_id": "3", "text": "Deep learning models, especially large language models (LLMs), require billions of parameters and vast datasets. The shift towards multimodal AI is the next major challenge in the field."},
    {"doc_id": "4", "text": "A key innovation in RAG is the use of hybrid search (dense + sparse) combined with cross-encoder reranking to improve the precision of context retrieved for the LLM."},
    {"doc_id": "5", "text": "Project Gemini, Google's next-generation AI model, is designed to be natively multimodal, capable of understanding and generating content across text, images, audio, and video."},
    {"doc_id": "6", "text": "The history of deep learning dates back to the perceptron models of the 1950s, but the modern revolution started with advancements in GPU processing and the Transformer architecture."},
]

# --- 1. Custom Retriever Classes (To match the user's snippet structure) ---

class MockRetrievedDocument:
    """A lightweight class to simulate a document object returned from a retriever."""
    def __init__(self, text, score=0.0):
        self.text = text
        self.score = score

    def __eq__(self, other):
        return self.text == other.text

    def __hash__(self):
        return hash(self.text)

    def __repr__(self):
        return f"MockDoc(Score: {self.score:.2f}, Text: '{self.text[:50]}...')"


class DenseRetriever(BaseRetriever):
    """
    A semantic retriever using vector embeddings.
    Simulates searching a vector store.
    """
    def __init__(self, index: VectorStoreIndex):
        self._index = index

    async def _aretrieve(self, query: str, **kwargs) -> List[MockRetrievedDocument]:
        # Use llama-index's internal retriever for semantic search
        retriever = self._index.as_retriever(similarity_top_k=kwargs.get('k', 5))
        nodes = retriever.retrieve(query)
        # Convert to MockRetrievedDocument for consistent output
        return [MockRetrievedDocument(n.text, n.score) for n in nodes]

    def _retrieve(self, query: str, **kwargs) -> List[MockRetrievedDocument]:
        return asyncio.run(self._aretrieve(query, **kwargs))


class SparseRetriever:
    """
    A keyword-based retriever (e.g., BM25).
    Simulates searching a keyword/lexical index.
    """
    def __init__(self, documents: List[str]):
        # Use a simple split/filter for demonstration (actual BM25 would be used in production)
        self.documents = documents

    async def search(self, query: str, k: int) -> List[MockRetrievedDocument]:
        # Simple token match simulation for speed in the demo
        query_tokens = set(query.lower().split())
        scored_docs = []
        for doc_text in self.documents:
            doc_tokens = set(doc_text.lower().split())
            # Score by common words (simulates keyword relevance)
            score = len(query_tokens.intersection(doc_tokens))
            if score > 0:
                scored_docs.append(MockRetrievedDocument(doc_text, score))

        # Sort by score (descending) and return top k
        scored_docs.sort(key=lambda x: x.score, reverse=True)
        return scored_docs[:k]


# --- 2. RAG Component Initialization ---

# Initialize LLM (Gemini) - Assumes API Key is set in Colab secrets or environment
llm = Gemini(model="gemini-2.5-flash", api_key=os.getenv("GEMINI_API_KEY"))

# Initialize Embedding Model (Free/Local) for Dense Retrieval
# Uses a commonly used, small-footprint embedding model
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Load and Index Data
print("--- Initializing Data and Indexes ---")
documents = [Document(text=d["text"], id_=d["doc_id"]) for d in MOCK_DATA]
vector_index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)
sparse_corpus = [d["text"] for d in MOCK_DATA]

# Initialize Retrievers
dense_retriever = DenseRetriever(vector_index)
sparse_retriever = SparseRetriever(sparse_corpus)

# Initialize Reranker (Cross-Encoder)
# This model is specifically trained for cross-document relevance scoring.
# We will use the built-in LlamaIndex implementation.
reranker = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-6-v2",
    top_n=3,
)

print("--- RAG Components Ready ---")

# --- 3. The Core Hybrid RAG Pipeline (Adapted from User's Snippet) ---

async def hybrid_rag_pipeline(query: str):
    """
    Executes the advanced hybrid RAG pipeline:
    1. Parallel Retrieval (Dense + Sparse)
    2. Merge results (Hybrid Search)
    3. Rerank (Cross-Encoder)
    4. LLM Generation (Streaming)
    """
    print(f"\n--- Running Hybrid RAG Pipeline for Query: '{query}' ---")

    # 1. Parallel Retrieval (Key Demonstrator of Latency Reduction)
    print("1. Executing Dense (Semantic) and Sparse (Keyword) Retrieval in Parallel...")
    dense, sparse = await asyncio.gather(
        dense_retriever._aretrieve(query, k=5),
        sparse_retriever.search(query, k=5)
    )

    print(f"   - Dense Results Found: {len(dense)}")
    print(f"   - Sparse Results Found: {len(sparse)}")

    # 2. Merge (Hybrid Search for Maximizing Recall)
    # Using 'set' to automatically deduplicate documents based on their text
    combined = list(set(dense + sparse))
    print(f"2. Merged Results (Deduplicated): {len(combined)} documents.")

    # 3. Rerank (Cross-Encoder for Maximizing Precision)
    # The reranker object expects llama_index Node/Document objects, so we mock this
    # by using the text and passing it directly to the reranker's rank method.
    # Note: In a real LlamaIndex setup, reranker.postprocess_nodes would be used.
    print("3. Reranking combined documents using Cross-Encoder model...")

    # For the simplified demo, we use the internal `reranker.postprocess_nodes`
    # which takes the initial retrieval list.
    from llama_index.core.schema import NodeWithScore, TextNode

    # Map the Mock Docs back to Llama Index Nodes for the reranker
    nodes_to_rerank = [NodeWithScore(node=TextNode(text=doc.text), score=doc.score) for doc in combined]

    # Perform reranking
    reranked_nodes = reranker.postprocess_nodes(nodes_to_rerank, query_str=query)

    # Extract the top context texts
    top_docs = [node.text for node in reranked_nodes]
    print(f"   - Top 3 Contexts Selected after Reranking:")
    for i, doc_text in enumerate(top_docs):
        print(f"     [{i+1}] Score: {reranked_nodes[i].score:.4f} | Text: {doc_text[:60]}...")

    # 4. LLM with Streaming (Key Demonstrator of UX Optimization)
    final_context = "\n\n".join(top_docs)
    prompt = f"Based ONLY on the following Context, answer the question accurately and concisely.\n\nContext:\n{final_context}\n\nQuestion: {query}\n\nAnswer:"

    print("\n4. Generating Final Answer via Streaming LLM:\n")
    print("--- LLM Output ---")

    full_response = ""
    # Use the streaming API for reduced perceived latency
    response_stream = llm.stream_complete(prompt=prompt)
    for chunk in response_stream:
        print(chunk.text, end="", flush=True)
        full_response += chunk.text

    print("\n--- End of Stream ---")
    return full_response


# --- 4. Execution Block ---

async def main():
    """Main execution function to run the RAG pipeline."""
    # Ensure GEMINI_API_KEY is available in the environment (e.g., Colab Secrets)
    if not os.getenv("GEMINI_API_KEY"):
        print("\nERROR: Please set the GEMINI_API_KEY environment variable (e.g., in Colab Secrets) to run the LLM step.")
        return

    # Example questions that benefit from Hybrid Search
    queries = [
        "What are the major drivers of Google's recent Q3 earnings, and what new AI model is they developing?",
        "Explain why hybrid search is essential in modern RAG systems."
    ]

    for q in queries:
        await hybrid_rag_pipeline(q)

if __name__ == "__main__":
    # In a Colab environment, use asyncio.run(main())
    # but for a standard script, this works fine.
    # If running in a Jupyter/Colab cell, use await main() instead of the block below.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\nProgram interrupted.")
    except Exception as e:
        print(f"\nAn error occurred during execution: {e}")

# Note: In a real environment, you would use a requirements.txt file:
# llama-index
# llama-index-llms-gemini
# llama-index-embeddings-huggingface
# torch (for HuggingFace)
# transformers (for HuggingFace)

  llm = Gemini(model="gemini-2.5-flash", api_key=os.getenv("GEMINI_API_KEY"))


--- Initializing Data and Indexes ---
--- RAG Components Ready ---

An error occurred during execution: asyncio.run() cannot be called from a running event loop


  print(f"\nAn error occurred during execution: {e}")


In [9]:
import os
from google.colab import userdata

# Get the API key from Colab secrets and set it as an environment variable
# This ensures the 'llm = Gemini(...)' call can find it.
os.environ["GEMINI_API_KEY"] = userdata.get("GEMINI_API_KEY")
print("GEMINI_API_KEY has been set from Colab Secrets.")

GEMINI_API_KEY has been set from Colab Secrets.


In [10]:
import asyncio
import os
import json
from pathlib import Path
from typing import List, Any
import requests

# Libraries for RAG components
from llama_index.core import Document, VectorStoreIndex
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.retrievers import BaseRetriever
from llama_index.core.postprocessor import SentenceTransformerRerank

# --- 0. Mock Data Setup (Simulating Kaggle/Public Data) ---

# MOCK_DATA is already defined in a previous cell, ensuring it's available.

# --- 1. Custom Retriever Classes ---

class MockRetrievedDocument:
    """A lightweight class to simulate a document object returned from a retriever."""
    def __init__(self, text, score=0.0):
        self.text = text
        self.score = score

    def __eq__(self, other):
        return self.text == other.text

    def __hash__(self):
        return hash(self.text)

    def __repr__(self):
        return f"MockDoc(Score: {self.score:.2f}, Text: '{self.text[:50]}...')"


class DenseRetriever(BaseRetriever):
    """
    A semantic retriever using vector embeddings.
    Simulates searching a vector store.
    """
    def __init__(self, index: VectorStoreIndex):
        self._index = index

    async def _aretrieve(self, query: str, **kwargs) -> List[MockRetrievedDocument]:
        # Use llama-index's internal retriever for semantic search
        retriever = self._index.as_retriever(similarity_top_k=kwargs.get('k', 5))
        nodes = retriever.retrieve(query)
        # Convert to MockRetrievedDocument for consistent output
        return [MockRetrievedDocument(n.text, n.score) for n in nodes]

    def _retrieve(self, query: str, **kwargs) -> List[MockRetrievedDocument]:
        return asyncio.run(self._aretrieve(query, **kwargs))


class SparseRetriever:
    """
    A keyword-based retriever (e.g., BM25).
    Simulates searching a keyword/lexical index.
    """
    def __init__(self, documents: List[str]):
        # Use a simple split/filter for demonstration (actual BM25 would be used in production)
        self.documents = documents

    async def search(self, query: str, k: int) -> List[MockRetrievedDocument]:
        # Simple token match simulation for speed in the demo
        query_tokens = set(query.lower().split())
        scored_docs = []
        for doc_text in self.documents:
            doc_tokens = set(doc_text.lower().split())
            # Score by common words (simulates keyword relevance)
            score = len(query_tokens.intersection(doc_tokens))
            if score > 0:
                scored_docs.append(MockRetrievedDocument(doc_text, score))

        # Sort by score (descending) and return top k
        scored_docs.sort(key=lambda x: x.score, reverse=True)
        return scored_docs[:k]


# --- 3. The Core Hybrid RAG Pipeline (Adapted from User's Snippet) ---

async def hybrid_rag_pipeline(query: str):
    """
    Executes the advanced hybrid RAG pipeline:
    1. Parallel Retrieval (Dense + Sparse)
    2. Merge results (Hybrid Search)
    3. Rerank (Cross-Encoder)
    4. LLM Generation (Streaming)
    """
    print(f"\n--- Running Hybrid RAG Pipeline for Query: '{query}' ---")

    # 1. Parallel Retrieval (Key Demonstrator of Latency Reduction)
    print("1. Executing Dense (Semantic) and Sparse (Keyword) Retrieval in Parallel...")
    dense, sparse = await asyncio.gather(
        dense_retriever._aretrieve(query, k=5),
        sparse_retriever.search(query, k=5)
    )

    print(f"   - Dense Results Found: {len(dense)}")
    print(f"   - Sparse Results Found: {len(sparse)}")

    # 2. Merge (Hybrid Search for Maximizing Recall)
    # Using 'set' to automatically deduplicate documents based on their text
    combined = list(set(dense + sparse))
    print(f"2. Merged Results (Deduplicated): {len(combined)} documents.")

    # 3. Rerank (Cross-Encoder for Maximizing Precision)
    # The reranker object expects llama_index Node/Document objects, so we mock this
    # by using the text and passing it directly to the reranker's rank method.
    # Note: In a real LlamaIndex setup, reranker.postprocess_nodes would be used.
    print("3. Reranking combined documents using Cross-Encoder model...")

    from llama_index.core.schema import NodeWithScore, TextNode

    # Map the Mock Docs back to Llama Index Nodes for the reranker
    nodes_to_rerank = [NodeWithScore(node=TextNode(text=doc.text), score=doc.score) for doc in combined]

    # Perform reranking
    reranked_nodes = reranker.postprocess_nodes(nodes_to_rerank, query_str=query)

    # Extract the top context texts
    top_docs = [node.text for node in reranked_nodes]
    print(f"   - Top 3 Contexts Selected after Reranking:")
    for i, doc_text in enumerate(top_docs):
        print(f"     [{i+1}] Score: {reranked_nodes[i].score:.4f} | Text: {doc_text[:60]}...")

    # 4. LLM with Streaming (Key Demonstrator of UX Optimization)
    final_context = "\n\n".join(top_docs)
    prompt = f"Based ONLY on the following Context, answer the question accurately and concisely.\n\nContext:\n{final_context}\n\nQuestion: {query}\n\nAnswer:"

    print("\n4. Generating Final Answer via Streaming LLM:\n")
    print("--- LLM Output ---")

    full_response = ""
    # Use the streaming API for reduced perceived latency
    response_stream = llm.stream_complete(prompt=prompt)
    for chunk in response_stream:
        print(chunk.text, end="", flush=True)
        full_response += chunk.text

    print("\n--- End of Stream ---")
    return full_response


# --- 2. RAG Component Initialization (Now after definitions) ---

# Re-initialize LLM (Gemini)
# This code is copied from cell b7pNBmYaOYo3, but will now succeed after the API key is set.
llm = Gemini(model="gemini-2.5-flash", api_key=os.getenv("GEMINI_API_KEY"))

# Initialize Embedding Model (Free/Local) for Dense Retrieval
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Load and Index Data (MOCK_DATA is already defined in a previous cell)
documents = [Document(text=d["text"], id_=d["doc_id"]) for d in MOCK_DATA]
vector_index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)
sparse_corpus = [d["text"] for d in MOCK_DATA]

# Initialize Retrievers (DenseRetriever and SparseRetriever are now defined)
dense_retriever = DenseRetriever(vector_index)
sparse_retriever = SparseRetriever(sparse_corpus)

# Initialize Reranker (Cross-Encoder)
reranker = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-6-v2",
    top_n=3,
)

print("--- RAG Components Re-Initialized Successfully ---")

# Now, run the pipeline with your query:
my_query = "What were the key financial results for Google in Q3 2024?"
await hybrid_rag_pipeline(my_query)

  llm = Gemini(model="gemini-2.5-flash", api_key=os.getenv("GEMINI_API_KEY"))


--- RAG Components Re-Initialized Successfully ---

--- Running Hybrid RAG Pipeline for Query: 'What were the key financial results for Google in Q3 2024?' ---
1. Executing Dense (Semantic) and Sparse (Keyword) Retrieval in Parallel...
   - Dense Results Found: 5
   - Sparse Results Found: 5
2. Merged Results (Deduplicated): 6 documents.
3. Reranking combined documents using Cross-Encoder model...
   - Top 3 Contexts Selected after Reranking:
     [1] Score: 5.3227 | Text: Google's Q3 2024 earnings report showed strong growth in Clo...
     [2] Score: -7.2088 | Text: The latest stock market volatility is primarily driven by in...
     [3] Score: -11.2541 | Text: Project Gemini, Google's next-generation AI model, is design...

4. Generating Final Answer via Streaming LLM:

--- LLM Output ---
Google's Q3 2024 earnings report showed strong growth in Cloud services, which offset a slight dip in traditional Search ad revenue.
--- End of Stream ---


"Google's Q3 2024 earnings report showed strong growth in Cloud services, which offset a slight dip in traditional Search ad revenue."