# **Installation and environment variable**


In [1]:
pip install qdrant-client google-generativeai rank-bm25 sentence-transformers

Collecting qdrant-client
  Downloading qdrant_client-1.14.2-py3-none-any.whl.metadata (10 kB)
Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting portalocker<3.0.0,>=2.7.0 (from qdrant-client)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none

# **ENV**
  [Gemini](https://aistudio.google.com/apikey)
  [Qdrant](https://qdrant.tech)

In [None]:
ENV_QDRANT_HOST = ""
ENV_QDRANT_API_KEY = ""
ENV_GEMINI_API_KEY = ""

# **Dense Retrieval RAG**

In [4]:
import os
import google.generativeai as genai
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance

# ENV variables
QDRANT_HOST = ENV_QDRANT_HOST
QDRANT_API_KEY = ENV_QDRANT_API_KEY
GEMINI_API_KEY = ENV_GEMINI_API_KEY

# Initialize Gemini
genai.configure(api_key=GEMINI_API_KEY)

# Step 1: Initialize Qdrant client
qdrant = QdrantClient(
    url=QDRANT_HOST,
    api_key=QDRANT_API_KEY,
)

COLLECTION_NAME = "Dense-Retrieval"

# Step 2: Create collection (if not exists)
try:
    qdrant.get_collection(collection_name=COLLECTION_NAME)
    print(f"✅ Collection '{COLLECTION_NAME}' already exists.")
except Exception:
    qdrant.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=768, distance=Distance.COSINE),
    )
    print(f"✅ Collection '{COLLECTION_NAME}' created.")

# Step 3: Sample documents
docs = [
    {"id": "1", "text": "We are experiencing delays in shipping due to weather conditions."},
    {"id": "2", "text": "Shipping may take 5-7 business days during holiday seasons."},
    {"id": "3", "text": "Refunds are processed within 3-5 business days."},
]

# Step 4: Embed documents & upload to Qdrant
points = []
for doc in docs:
    response = genai.embed_content(model="models/embedding-001", content=doc["text"])
    embedding = response["embedding"]
    points.append(PointStruct(id=int(doc["id"]), vector=embedding, payload={"text": doc["text"]}))

qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
print("✅ Documents upserted to Qdrant.")

# Step 5: Query
query = "Why is my order late?"

# Step 6: Embed query
query_response = genai.embed_content(model="models/embedding-001", content=query)
query_vector = query_response["embedding"]

# Step 7: Search Qdrant
hits = qdrant.search(
    collection_name=COLLECTION_NAME,
    query_vector=query_vector,
    limit=3,
)

# Step 8: Display Results
print("\n🔍 Top results for query:", query)
for hit in hits:
    print(f"- {hit.payload['text']} (Score: {hit.score:.4f})")

✅ Collection 'Dense-Retrieval' already exists.
✅ Documents upserted to Qdrant.

🔍 Top results for query: Why is my order late?
- We are experiencing delays in shipping due to weather conditions. (Score: 0.7849)
- Shipping may take 5-7 business days during holiday seasons. (Score: 0.7236)
- Refunds are processed within 3-5 business days. (Score: 0.6823)


  hits = qdrant.search(


# **Sparse Retrieval RAG**

In [5]:
import os
import google.generativeai as genai
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance
from rank_bm25 import BM25Okapi

# ENV variables
QDRANT_HOST = ENV_QDRANT_HOST
QDRANT_API_KEY = ENV_QDRANT_API_KEY
GEMINI_API_KEY = ENV_GEMINI_API_KEY

# Initialize Gemini
genai.configure(api_key=GEMINI_API_KEY)

# Initialize Qdrant client
qdrant = QdrantClient(
    url=QDRANT_HOST,
    api_key=QDRANT_API_KEY,
)

COLLECTION_NAME = "Sparse-Retrieval"

# Step 1: Create collection (if not exists)
try:
    qdrant.get_collection(collection_name=COLLECTION_NAME)
    print(f"✅ Collection '{COLLECTION_NAME}' already exists.")
except Exception:
    qdrant.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=768, distance=Distance.COSINE),
    )
    print(f"✅ Collection '{COLLECTION_NAME}' created.")

# Step 2: Sample documents (expanded with a tech-related example)
docs = [
    {"id": "1", "text": "We are experiencing delays in shipping due to weather conditions."},
    {"id": "2", "text": "Shipping may take 5-7 business days during holiday seasons."},
    {"id": "3", "text": "Refunds are processed within 3-5 business days."},
    {"id": "4", "text": "ERR_CONN_REFUSED: Check your network settings or firewall."},
]

# Step 3: Embed documents & upload to Qdrant (Dense Retrieval)
points = []
for doc in docs:
    response = genai.embed_content(model="models/embedding-001", content=doc["text"])
    embedding = response["embedding"]
    points.append(PointStruct(id=int(doc["id"]), vector=embedding, payload={"text": doc["text"]}))

qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
print("✅ Documents upserted to Qdrant for dense retrieval.")

# Step 4: BM25 Setup for Sparse Retrieval
# Tokenize documents for BM25 (split text into words)
tokenized_docs = [doc["text"].lower().split() for doc in docs]
bm25 = BM25Okapi(tokenized_docs)

# Step 5: Query
query = "ERR_CONN_REFUSED"

# Step 6: BM25 Sparse Retrieval
tokenized_query = query.lower().split()
bm25_scores = bm25.get_scores(tokenized_query)

# Get top documents with scores
bm25_results = [
    {"text": docs[i]["text"], "score": bm25_scores[i]}
    for i in range(len(docs))
    if bm25_scores[i] > 0
]
bm25_results = sorted(bm25_results, key=lambda x: x["score"], reverse=True)[:3]

# Step 7: Dense Retrieval with Qdrant
query_response = genai.embed_content(model="models/embedding-001", content=query)
query_vector = query_response["embedding"]
dense_hits = qdrant.search(
    collection_name=COLLECTION_NAME,
    query_vector=query_vector,
    limit=3,
)

# Step 8: Display Results
print("\n🔍 BM25 Sparse Retrieval Results for query:", query)
for result in bm25_results:
    print(f"- {result['text']} (Score: {result['score']:.4f})")

print("\n🔍 Dense Retrieval Results for query:", query)
for hit in dense_hits:
    print(f"- {hit.payload['text']} (Score: {hit.score:.4f})")

✅ Collection 'Sparse-Retrieval' already exists.
✅ Documents upserted to Qdrant for dense retrieval.

🔍 BM25 Sparse Retrieval Results for query: ERR_CONN_REFUSED

🔍 Dense Retrieval Results for query: ERR_CONN_REFUSED
- ERR_CONN_REFUSED: Check your network settings or firewall. (Score: 0.9169)
- We are experiencing delays in shipping due to weather conditions. (Score: 0.6275)
- Refunds are processed within 3-5 business days. (Score: 0.5791)


  dense_hits = qdrant.search(


# **Hybrid Retrieval RAG**

In [6]:
import os
import google.generativeai as genai
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder
import numpy as np

# ENV variables
QDRANT_HOST = ENV_QDRANT_HOST
QDRANT_API_KEY = ENV_QDRANT_API_KEY
GEMINI_API_KEY = ENV_GEMINI_API_KEY

# Initialize Gemini
genai.configure(api_key=GEMINI_API_KEY)

# Initialize Qdrant client
qdrant = QdrantClient(
    url=QDRANT_HOST,
    api_key=QDRANT_API_KEY,
)

# Initialize cross-encoder for re-ranking
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

COLLECTION_NAME = "Hybrid-Retrieval"

# Step 1: Create collection (if not exists)
try:
    qdrant.get_collection(collection_name=COLLECTION_NAME)
    print(f"✅ Collection '{COLLECTION_NAME}' already exists.")
except Exception:
    qdrant.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=768, distance=Distance.COSINE),
    )
    print(f"✅ Collection '{COLLECTION_NAME}' created.")

# Step 2: Sample documents (tailored to query)
docs = [
    {"id": "1", "text": "The MYC gene regulates cell growth and is influenced by environmental stressors."},
    {"id": "2", "text": "Climate adaptation in plants involves genetic changes, including MYC gene expression."},
    {"id": "3", "text": "Shipping delays may occur due to extreme weather conditions."},
    {"id": "4", "text": "MYC gene mutations are linked to cancer, not climate adaptation."},
]

# Step 3: Embed documents & upload to Qdrant (Dense Retrieval)
points = []
for doc in docs:
    response = genai.embed_content(model="models/embedding-001", content=doc["text"])
    embedding = response["embedding"]
    points.append(PointStruct(id=int(doc["id"]), vector=embedding, payload={"text": doc["text"]}))

qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
print("✅ Documents upserted to Qdrant for dense retrieval.")

# Step 4: BM25 Setup for Sparse Retrieval
tokenized_docs = [doc["text"].lower().split() for doc in docs]
bm25 = BM25Okapi(tokenized_docs)

# Step 5: Query
query = "MYC gene in climate adaptation"

# Step 6: Dense Retrieval with Qdrant
query_response = genai.embed_content(model="models/embedding-001", content=query)
query_vector = query_response["embedding"]
dense_hits = qdrant.search(
    collection_name=COLLECTION_NAME,
    query_vector=query_vector,
    limit=4,  # Retrieve more to allow re-ranking
)

# Step 7: BM25 Sparse Retrieval
tokenized_query = query.lower().split()
bm25_scores = bm25.get_scores(tokenized_query)
bm25_results = [
    {"text": docs[i]["text"], "score": bm25_scores[i]}
    for i in range(len(docs))
    if bm25_scores[i] > 0
]
bm25_results = sorted(bm25_results, key=lambda x: x["score"], reverse=True)[:4]

# Step 8: Hybrid Retrieval
# Normalize scores
dense_scores = {hit.payload["text"]: hit.score for hit in dense_hits}
bm25_scores = {result["text"]: result["score"] for result in bm25_results}
all_texts = set(dense_scores.keys()).union(bm25_scores.keys())

max_dense = max(dense_scores.values(), default=1.0)
max_bm25 = max(bm25_scores.values(), default=1.0)

# Combine scores (weighted: 60% dense, 40% sparse)
hybrid_results = {}
for text in all_texts:
    dense_score = dense_scores.get(text, 0) / max_dense
    bm25_score = bm25_scores.get(text, 0) / max_bm25
    hybrid_score = 0.6 * dense_score + 0.4 * bm25_score
    hybrid_results[text] = hybrid_score

# Step 9: Re-ranking with Cross-Encoder
rerank_inputs = [[query, text] for text in hybrid_results.keys()]
rerank_scores = cross_encoder.predict(rerank_inputs)
reranked_results = [
    {"text": text, "score": rerank_scores[i]}
    for i, text in enumerate(hybrid_results.keys())
]
reranked_results = sorted(reranked_results, key=lambda x: x["score"], reverse=True)[:3]

# Step 10: Display Results
print("\n🔍 Dense Retrieval Results for query:", query)
for hit in dense_hits:
    print(f"- {hit.payload['text']} (Score: {hit.score:.4f})")

print("\n🔍 BM25 Sparse Retrieval Results for query:", query)
for result in bm25_results:
    print(f"- {result['text']} (Score: {result['score']:.4f})")

print("\n🔍 Hybrid Retrieval Results for query:", query)
for text, score in sorted(hybrid_results.items(), key=lambda x: x[1], reverse=True)[:3]:
    print(f"- {text} (Hybrid Score: {score:.4f})")

print("\n🔍 Re-ranked Hybrid Results for query:", query)
for result in reranked_results:
    print(f"- {result['text']} (Re-ranked Score: {result['score']:.4f})")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

✅ Collection 'Hybrid-Retrieval' already exists.
✅ Documents upserted to Qdrant for dense retrieval.


  dense_hits = qdrant.search(



🔍 Dense Retrieval Results for query: MYC gene in climate adaptation
- MYC gene mutations are linked to cancer, not climate adaptation. (Score: 0.9194)
- Climate adaptation in plants involves genetic changes, including MYC gene expression. (Score: 0.9148)
- The MYC gene regulates cell growth and is influenced by environmental stressors. (Score: 0.8316)
- Shipping delays may occur due to extreme weather conditions. (Score: 0.5368)

🔍 BM25 Sparse Retrieval Results for query: MYC gene in climate adaptation
- Climate adaptation in plants involves genetic changes, including MYC gene expression. (Score: 2.0047)
- MYC gene mutations are linked to cancer, not climate adaptation. (Score: 0.3608)
- The MYC gene regulates cell growth and is influenced by environmental stressors. (Score: 0.3317)

🔍 Hybrid Retrieval Results for query: MYC gene in climate adaptation
- Climate adaptation in plants involves genetic changes, including MYC gene expression. (Hybrid Score: 0.9970)
- MYC gene mutations are

# **Pre-Retrieval RAG**

In [None]:
import os
import google.generativeai as genai
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder
import numpy as np

# ENV variables
QDRANT_HOST = ENV_QDRANT_HOST
QDRANT_API_KEY = ENV_QDRANT_API_KEY
GEMINI_API_KEY = ENV_GEMINI_API_KEY

# Initialize Gemini
genai.configure(api_key=GEMINI_API_KEY)

# Initialize Qdrant client
qdrant = QdrantClient(
    url=QDRANT_HOST,
    api_key=QDRANT_API_KEY,
)

# Initialize cross-encoder for re-ranking
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

COLLECTION_NAME = "Pre-Retrieval"

# Step 1: Create collection (if not exists)
try:
    qdrant.get_collection(collection_name=COLLECTION_NAME)
    print(f"✅ Collection '{COLLECTION_NAME}' already exists.")
except Exception:
    qdrant.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=768, distance=Distance.COSINE),
    )
    print(f"✅ Collection '{COLLECTION_NAME}' created.")

# Step 2: Sample documents (medical context for Medication X)
docs = [
    {"id": "1", "text": "Medication X may cause nausea, dizziness, and fatigue as common side effects."},
    {"id": "2", "text": "Rare side effects of Medication X include allergic reactions and liver issues."},
    {"id": "3", "text": "Medication X is used to treat hypertension but may cause headaches in some patients."},
    {"id": "4", "text": "Always consult a doctor before stopping Medication X due to side effects."},
]

# Step 3: Embed documents & upload to Qdrant (Dense Retrieval)
points = []
for doc in docs:
    response = genai.embed_content(model="models/embedding-001", content=doc["text"])
    embedding = response["embedding"]
    points.append(PointStruct(id=int(doc["id"]), vector=embedding, payload={"text": doc["text"]}))

qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
print("✅ Documents upserted to Qdrant for dense retrieval.")

# Step 4: BM25 Setup for Sparse Retrieval
tokenized_docs = [doc["text"].lower().split() for doc in docs]
bm25 = BM25Okapi(tokenized_docs)

# Step 5: Query
query = "Medication X side effects"

# Step 6: Dense Retrieval with Qdrant (Updated to query_points)
query_response = genai.embed_content(model="models/embedding-001", content=query)
query_vector = query_response["embedding"]
dense_hits = qdrant.query_points(
    collection_name=COLLECTION_NAME,
    query=query_vector,
    limit=4,
    with_payload=True
).points

# Step 7: BM25 Sparse Retrieval
tokenized_query = query.lower().split()
bm25_scores = bm25.get_scores(tokenized_query)
bm25_results = [
    {"text": docs[i]["text"], "score": bm25_scores[i]}
    for i in range(len(docs))
    if bm25_scores[i] > 0
]
bm25_results = sorted(bm25_results, key=lambda x: x["score"], reverse=True)[:4]

# Step 8: Hybrid Retrieval
dense_scores = {hit.payload["text"]: hit.score for hit in dense_hits}
bm25_scores = {result["text"]: result["score"] for result in bm25_results}
all_texts = set(dense_scores.keys()).union(bm25_scores.keys())

max_dense = max(dense_scores.values(), default=1.0)
max_bm25 = max(bm25_scores.values(), default=1.0)

hybrid_results = {}
for text in all_texts:
    dense_score = dense_scores.get(text, 0) / max_dense
    bm25_score = bm25_scores.get(text, 0) / max_bm25
    hybrid_score = 0.6 * dense_score + 0.4 * bm25_score
    hybrid_results[text] = hybrid_score

# Step 9: Re-ranking with Cross-Encoder
rerank_inputs = [[query, text] for text in hybrid_results.keys()]
rerank_scores = cross_encoder.predict(rerank_inputs)
reranked_results = [
    {"text": text, "score": rerank_scores[i]}
    for i, text in enumerate(hybrid_results.keys())
]
reranked_results = sorted(reranked_results, key=lambda x: x["score"], reverse=True)[:3]

# Step 10: Generate Answer with Gemini (RAG)
context = "\n".join([result["text"] for result in reranked_results])
prompt = f"Based on the following context, provide a concise answer to the query: {query}\n\nContext:\n{context}\n\nAnswer:"

# Configure Gemini model for generation
model = genai.GenerativeModel("gemini-1.5-pro")
response = model.generate_content(prompt)

# Step 11: Display Results
print("\n🔍 Dense Retrieval Results for query:", query)
for hit in dense_hits:
    print(f"- {hit.payload['text']} (Score: {hit.score:.4f})")

print("\n🔍 BM25 Sparse Retrieval Results for query:", query)
for result in bm25_results:
    print(f"- {result['text']} (Score: {result['score']:.4f})")

print("\n🔍 Hybrid Retrieval Results for query:", query)
for text, score in sorted(hybrid_results.items(), key=lambda x: x[1], reverse=True)[:3]:
    print(f"- {text} (Hybrid Score: {score:.4f})")

print("\n🔍 Re-ranked Hybrid Results for query:", query)
for result in reranked_results:
    print(f"- {result['text']} (Re-ranked Score: {result['score']:.4f})")

print("\n📝 Generated Answer:")
print(response.text)

# **Post-Retrieval RAG**

In [None]:
import os
import google.generativeai as genai
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder
import numpy as np

# ENV variables
QDRANT_HOST = ENV_QDRANT_HOST
QDRANT_API_KEY = ENV_QDRANT_API_KEY
GEMINI_API_KEY = ENV_GEMINI_API_KEY

# Initialize Gemini
genai.configure(api_key=GEMINI_API_KEY)

# Initialize Qdrant client
qdrant = QdrantClient(
    url=QDRANT_HOST,
    api_key=QDRANT_API_KEY,
)

# Initialize cross-encoder for re-ranking
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

COLLECTION_NAME = "Post-Retrieval"

# Step 1: Create collection (if not exists)
try:
    qdrant.get_collection(collection_name=COLLECTION_NAME)
    print(f"✅ Collection '{COLLECTION_NAME}' already exists.")
except Exception:
    qdrant.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=768, distance=Distance.COSINE),
    )
    print(f"✅ Collection '{COLLECTION_NAME}' created.")

# Step 2: Sample documents (Mars habitability context)
docs = [
    {"id": "1", "text": "Mars habitability is limited by its thin atmosphere and lack of liquid water."},
    {"id": "2", "text": "Evidence of ancient water flows on Mars suggests past habitability."},
    {"id": "3", "text": "Current Mars missions search for microbial life in subsurface ice."},
    {"id": "4", "text": "Terraforming Mars could make it habitable, but technology is decades away."},
]

# Step 3: Embed documents & upload to Qdrant (Dense Retrieval)
points = []
for doc in docs:
    response = genai.embed_content(model="models/embedding-001", content=doc["text"])
    embedding = response["embedding"]
    points.append(PointStruct(id=int(doc["id"]), vector=embedding, payload={"text": doc["text"]}))

qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
print("✅ Documents upserted to Qdrant for dense retrieval.")

# Step 4: BM25 Setup for Sparse Retrieval
tokenized_docs = [doc["text"].lower().split() for doc in docs]
bm25 = BM25Okapi(tokenized_docs)

# Step 5: Query
query = "Mars habitability"

# Step 6: Initial Answer Generation
initial_prompt = f"Provide a brief answer to the query: {query}"
model = genai.GenerativeModel("gemini-1.5-pro")
initial_response = model.generate_content(initial_prompt)
initial_answer = initial_response.text

# Step 7: Dense Retrieval with Qdrant
query_response = genai.embed_content(model="models/embedding-001", content=query)
query_vector = query_response["embedding"]
dense_hits = qdrant.query_points(
    collection_name=COLLECTION_NAME,
    query=query_vector,
    limit=4,
    with_payload=True
).points

# Step 8: BM25 Sparse Retrieval
tokenized_query = query.lower().split()
bm25_scores = bm25.get_scores(tokenized_query)
bm25_results = [
    {"text": docs[i]["text"], "score": bm25_scores[i]}
    for i in range(len(docs))
    if bm25_scores[i] > 0
]
bm25_results = sorted(bm25_results, key=lambda x: x["score"], reverse=True)[:4]

# Step 9: Hybrid Retrieval
dense_scores = {hit.payload["text"]: hit.score for hit in dense_hits}
bm25_scores = {result["text"]: result["score"] for result in bm25_results}
all_texts = set(dense_scores.keys()).union(bm25_scores.keys())

max_dense = max(dense_scores.values(), default=1.0)
max_bm25 = max(bm25_scores.values(), default=1.0)

hybrid_results = {}
for text in all_texts:
    dense_score = dense_scores.get(text, 0) / max_dense
    bm25_score = bm25_scores.get(text, 0) / max_bm25
    hybrid_score = 0.6 * dense_score + 0.4 * bm25_score
    hybrid_results[text] = hybrid_score

# Step 10: Re-ranking with Cross-Encoder
rerank_inputs = [[query, text] for text in hybrid_results.keys()]
rerank_scores = cross_encoder.predict(rerank_inputs)
reranked_results = [
    {"text": text, "score": rerank_scores[i]}
    for i, text in enumerate(hybrid_results.keys())
]
reranked_results = sorted(reranked_results, key=lambda x: x["score"], reverse=True)[:3]

# Step 11: Refine Answer with Retrieved Evidence
context = "\n".join([result["text"] for result in reranked_results])
refine_prompt = f"""
Query: {query}
Initial Answer: {initial_answer}
Context: {context}

Refine the initial answer based on the provided context to ensure accuracy and include relevant details. If the initial answer contains inaccuracies, correct them. Provide a concise, factual response.
Answer:
"""
refined_response = model.generate_content(refine_prompt)
refined_answer = refined_response.text

# Step 12: Display Results
print("\n📝 Initial Answer:")
print(initial_answer)

print("\n🔍 Dense Retrieval Results for query:", query)
for hit in dense_hits:
    print(f"- {hit.payload['text']} (Score: {hit.score:.4f})")

print("\n🔍 BM25 Sparse Retrieval Results for query:", query)
for result in bm25_results:
    print(f"- {result['text']} (Score: {result['score']:.4f})")

print("\n🔍 Hybrid Retrieval Results for query:", query)
for text, score in sorted(hybrid_results.items(), key=lambda x: x[1], reverse=True)[:3]:
    print(f"- {text} (Hybrid Score: {score:.4f})")

print("\n🔍 Re-ranked Hybrid Results for query:", query)
for result in reranked_results:
    print(f"- {result['text']} (Re-ranked Score: {result['score']:.4f})")

print("\n📝 Refined Answer:")
print(refined_answer)

✅ Collection 'Post-Retrieval' created.
✅ Documents upserted to Qdrant for dense retrieval.

📝 Initial Answer:
Mars's past habitability is highly probable, with evidence of liquid water.  Current habitability is unlikely without significant terraforming due to thin atmosphere, high radiation, and lack of readily available liquid water.


🔍 Dense Retrieval Results for query: Mars habitability
- Mars habitability is limited by its thin atmosphere and lack of liquid water. (Score: 0.8981)
- Evidence of ancient water flows on Mars suggests past habitability. (Score: 0.7998)
- Terraforming Mars could make it habitable, but technology is decades away. (Score: 0.7916)
- Current Mars missions search for microbial life in subsurface ice. (Score: 0.7196)

🔍 BM25 Sparse Retrieval Results for query: Mars habitability
- Mars habitability is limited by its thin atmosphere and lack of liquid water. (Score: 0.9509)
- Evidence of ancient water flows on Mars suggests past habitability. (Score: 0.1892)
- 

# **Iterative Retrieval RAG**


In [None]:
import os
import google.generativeai as genai
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder
import numpy as np

# ENV variables
QDRANT_HOST = ENV_QDRANT_HOST
QDRANT_API_KEY = ENV_QDRANT_API_KEY
GEMINI_API_KEY = ENV_GEMINI_API_KEY

# Initialize Gemini
genai.configure(api_key=GEMINI_API_KEY)

# Initialize Qdrant client
qdrant = QdrantClient(
    url=QDRANT_HOST,
    api_key=QDRANT_API_KEY,
)

# Initialize cross-encoder for re-ranking
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

COLLECTION_NAME = "Iterative-Retrival"

# Step 1: Create collection (if not exists)
try:
    qdrant.get_collection(collection_name=COLLECTION_NAME)
    print(f"✅ Collection '{COLLECTION_NAME}' already exists.")
except Exception:
    qdrant.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=768, distance=Distance.COSINE),
    )
    print(f"✅ Collection '{COLLECTION_NAME}' created.")

# Step 2: Sample documents (renewable energy in Germany and California)
docs = [
    {"id": "1", "text": "Germany’s renewable energy mix includes 46% wind and solar in 2023, driven by Energiewende policies."},
    {"id": "2", "text": "California aims for 60% renewable energy by 2030, with heavy investment in solar farms."},
    {"id": "3", "text": "Germany’s feed-in tariffs have boosted solar and wind adoption since the 2000s."},
    {"id": "4", "text": "California’s renewable energy faces grid reliability challenges due to solar intermittency."},
    {"id": "5", "text": "Germany leads in offshore wind, while California focuses on rooftop solar."},
]

# Step 3: Embed documents & upload to Qdrant (Dense Retrieval)
points = []
for doc in docs:
    response = genai.embed_content(model="models/embedding-001", content=doc["text"])
    embedding = response["embedding"]
    points.append(PointStruct(id=int(doc["id"]), vector=embedding, payload={"text": doc["text"]}))

qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
print("✅ Documents upserted to Qdrant for dense retrieval.")

# Step 4: BM25 Setup for Sparse Retrieval
tokenized_docs = [doc["text"].lower().split() for doc in docs]
bm25 = BM25Okapi(tokenized_docs)

# Step 5: Query and Sub-Queries
main_query = "comparing renewable energy in Germany vs. California"
sub_queries = [
    "renewable energy in Germany",
    "renewable energy in California",
    "comparison of renewable energy in Germany and California"
]

# Step 6: Iterative Retrieval-Generation
model = genai.GenerativeModel("gemini-1.5-pro")
partial_answers = []

for cycle, sub_query in enumerate(sub_queries, 1):
    print(f"\n🔄 Cycle {cycle}: Processing sub-query: {sub_query}")

    # Step 6.1: Dense Retrieval with Qdrant
    query_response = genai.embed_content(model="models/embedding-001", content=sub_query)
    query_vector = query_response["embedding"]
    dense_hits = qdrant.query_points(
        collection_name=COLLECTION_NAME,
        query=query_vector,
        limit=4,
        with_payload=True
    ).points

    # Step 6.2: BM25 Sparse Retrieval
    tokenized_query = sub_query.lower().split()
    bm25_scores = bm25.get_scores(tokenized_query)
    bm25_results = [
        {"text": docs[i]["text"], "score": bm25_scores[i]}
        for i in range(len(docs))
        if bm25_scores[i] > 0
    ]
    bm25_results = sorted(bm25_results, key=lambda x: x["score"], reverse=True)[:4]

    # Step 6.3: Hybrid Retrieval
    dense_scores = {hit.payload["text"]: hit.score for hit in dense_hits}
    bm25_scores = {result["text"]: result["score"] for result in bm25_results}
    all_texts = set(dense_scores.keys()).union(bm25_scores.keys())

    max_dense = max(dense_scores.values(), default=1.0)
    max_bm25 = max(bm25_scores.values(), default=1.0)

    hybrid_results = {}
    for text in all_texts:
        dense_score = dense_scores.get(text, 0) / max_dense
        bm25_score = bm25_scores.get(text, 0) / max_bm25
        hybrid_score = 0.6 * dense_score + 0.4 * bm25_score
        hybrid_results[text] = hybrid_score

    # Step 6.4: Re-ranking with Cross-Encoder
    rerank_inputs = [[sub_query, text] for text in hybrid_results.keys()]
    rerank_scores = cross_encoder.predict(rerank_inputs)
    reranked_results = [
        {"text": text, "score": rerank_scores[i]}
        for i, text in enumerate(hybrid_results.keys())
    ]
    reranked_results = sorted(reranked_results, key=lambda x: x["score"], reverse=True)[:3]

    # Step 6.5: Generate Partial Answer
    context = "\n".join([result["text"] for result in reranked_results])
    prompt = f"""
    Based on the following context, provide a concise answer to the query: {sub_query}
    Context: {context}
    Answer:
    """
    response = model.generate_content(prompt)
    partial_answer = response.text
    partial_answers.append(partial_answer)

    # Display Cycle Results
    print(f"\n🔍 Re-ranked Hybrid Results for sub-query: {sub_query}")
    for result in reranked_results:
        print(f"- {result['text']} (Re-ranked Score: {result['score']:.4f})")
    print(f"\n📝 Partial Answer: {partial_answer}")

# Step 7: Final Synthesis
synthesis_prompt = f"""
Query: {main_query}
Partial Answers:
1. {partial_answers[0]}
2. {partial_answers[1]}
3. {partial_answers[2]}

Synthesize the partial answers into a comprehensive, concise response comparing renewable energy in Germany and California. Highlight key similarities, differences, and notable policies or challenges.
Answer:
"""
final_response = model.generate_content(synthesis_prompt)
final_answer = final_response.text

# Step 8: Display Final Answer
print("\n📝 Final Synthesized Answer:")
print(final_answer)

✅ Collection 'Iterative-Retrival' created.
✅ Documents upserted to Qdrant for dense retrieval.

🔄 Cycle 1: Processing sub-query: renewable energy in Germany

🔍 Re-ranked Hybrid Results for sub-query: renewable energy in Germany
- Germany’s renewable energy mix includes 46% wind and solar in 2023, driven by Energiewende policies. (Re-ranked Score: 8.7021)
- Germany’s feed-in tariffs have boosted solar and wind adoption since the 2000s. (Re-ranked Score: 0.9584)
- Germany leads in offshore wind, while California focuses on rooftop solar. (Re-ranked Score: -0.6066)

📝 Partial Answer: Germany's renewable energy relies heavily on wind and solar, comprising 46% of its mix in 2023, driven by the Energiewende policy promoting renewables.


🔄 Cycle 2: Processing sub-query: renewable energy in California

🔍 Re-ranked Hybrid Results for sub-query: renewable energy in California
- California aims for 60% renewable energy by 2030, with heavy investment in solar farms. (Re-ranked Score: 8.3120)
- Ca

# **Extractive RAG**

In [None]:
import os
import google.generativeai as genai
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder
import numpy as np
import re

# ENV variables
QDRANT_HOST = ENV_QDRANT_HOST
QDRANT_API_KEY = ENV_QDRANT_API_KEY
GEMINI_API_KEY = ENV_GEMINI_API_KEY

# Initialize Gemini
genai.configure(api_key=GEMINI_API_KEY)

# Initialize Qdrant client
qdrant = QdrantClient(
    url=QDRANT_HOST,
    api_key=QDRANT_API_KEY,
)

# Initialize cross-encoder for re-ranking
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

COLLECTION_NAME = "Extractive"

# Step 1: Create collection (if not exists)
try:
    qdrant.get_collection(collection_name=COLLECTION_NAME)
    print(f"✅ Collection '{COLLECTION_NAME}' already exists.")
except Exception:
    qdrant.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=768, distance=Distance.COSINE),
    )
    print(f"✅ Collection '{COLLECTION_NAME}' created.")

# Step 2: Sample documents (osmosis context)
docs = [
    {"id": "1", "text": "Osmosis is the diffusion of water molecules across a selectively permeable membrane from an area of higher water concentration to an area of lower water concentration."},
    {"id": "2", "text": "In biology, osmosis plays a critical role in maintaining cell hydration and nutrient transport."},
    {"id": "3", "text": "Osmosis differs from active transport, which requires energy to move substances against a concentration gradient."},
    {"id": "4", "text": "The process of osmosis is essential for plant roots to absorb water from the soil."},
]

# Step 3: Embed documents & upload to Qdrant (Dense Retrieval)
points = []
for doc in docs:
    response = genai.embed_content(model="models/embedding-001", content=doc["text"])
    embedding = response["embedding"]
    points.append(PointStruct(id=int(doc["id"]), vector=embedding, payload={"text": doc["text"]}))

qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
print("✅ Documents upserted to Qdrant for dense retrieval.")

# Step 4: BM25 Setup for Sparse Retrieval
tokenized_docs = [doc["text"].lower().split() for doc in docs]
bm25 = BM25Okapi(tokenized_docs)

# Step 5: Query
query = "osmosis definition"

# Step 6: Dense Retrieval with Qdrant
query_response = genai.embed_content(model="models/embedding-001", content=query)
query_vector = query_response["embedding"]
dense_hits = qdrant.query_points(
    collection_name=COLLECTION_NAME,
    query=query_vector,
    limit=4,
    with_payload=True
).points

# Step 7: BM25 Sparse Retrieval
tokenized_query = query.lower().split()
bm25_scores = bm25.get_scores(tokenized_query)
bm25_results = [
    {"text": docs[i]["text"], "score": bm25_scores[i]}
    for i in range(len(docs))
    if bm25_scores[i] > 0
]
bm25_results = sorted(bm25_results, key=lambda x: x["score"], reverse=True)[:4]

# Step 8: Hybrid Retrieval
dense_scores = {hit.payload["text"]: hit.score for hit in dense_hits}
bm25_scores = {result["text"]: result["score"] for result in bm25_results}
all_texts = set(dense_scores.keys()).union(bm25_scores.keys())

max_dense = max(dense_scores.values(), default=1.0)
max_bm25 = max(bm25_scores.values(), default=1.0)

hybrid_results = {}
for text in all_texts:
    dense_score = dense_scores.get(text, 0) / max_dense
    bm25_score = bm25_scores.get(text, 0) / max_bm25
    hybrid_score = 0.6 * dense_score + 0.4 * bm25_score
    hybrid_results[text] = hybrid_score

# Step 9: Re-ranking with Cross-Encoder
rerank_inputs = [[query, text] for text in hybrid_results.keys()]
rerank_scores = cross_encoder.predict(rerank_inputs)
reranked_results = [
    {"text": text, "score": rerank_scores[i]}
    for i, text in enumerate(hybrid_results.keys())
]
reranked_results = sorted(reranked_results, key=lambda x: x["score"], reverse=True)[:3]

# Step 10: Snippet Extraction
# Extract sentences containing "osmosis" and rank by relevance to "definition"
snippets = []
for result in reranked_results:
    text = result["text"]
    # Split text into sentences
    sentences = re.split(r'(?<=[.!?])\s+', text)
    for sentence in sentences:
        if "osmosis" in sentence.lower() and any(word in sentence.lower() for word in ["is", "defined", "definition"]):
            snippets.append({"text": sentence.strip(), "score": result["score"]})

# Sort snippets by score and select the top one (or more if needed)
snippets = sorted(snippets, key=lambda x: x["score"], reverse=True)[:1]

# Step 11: Optional Validation with Gemini
# Use Gemini to format or confirm the snippet (minimal generation)
model = genai.GenerativeModel("gemini-1.5-pro")
if snippets:
    snippet_text = snippets[0]["text"]
    prompt = f"""
    Query: {query}
    Extracted Snippet: {snippet_text}

    Format the snippet as a quoted definition for the query. Ensure the text remains verbatim and add minimal context if needed.
    Answer:
    """
    response = model.generate_content(prompt)
    final_answer = response.text
else:
    final_answer = "No exact definition of osmosis found in the provided documents."

# Step 12: Display Results
print("\n🔍 Dense Retrieval Results for query:", query)
for hit in dense_hits:
    print(f"- {hit.payload['text']} (Score: {hit.score:.4f})")

print("\n🔍 BM25 Sparse Retrieval Results for query:", query)
for result in bm25_results:
    print(f"- {result['text']} (Score: {result['score']:.4f})")

print("\n🔍 Hybrid Retrieval Results for query:", query)
for text, score in sorted(hybrid_results.items(), key=lambda x: x[1], reverse=True)[:3]:
    print(f"- {text} (Hybrid Score: {score:.4f})")

print("\n🔍 Re-ranked Hybrid Results for query:", query)
for result in reranked_results:
    print(f"- {result['text']} (Re-ranked Score: {result['score']:.4f})")

print("\n📝 Extracted Answer:")
print(final_answer)

✅ Collection 'Extractive' created.
✅ Documents upserted to Qdrant for dense retrieval.

🔍 Dense Retrieval Results for query: osmosis definition
- Osmosis is the diffusion of water molecules across a selectively permeable membrane from an area of higher water concentration to an area of lower water concentration. (Score: 0.8940)
- The process of osmosis is essential for plant roots to absorb water from the soil. (Score: 0.8441)
- In biology, osmosis plays a critical role in maintaining cell hydration and nutrient transport. (Score: 0.8260)
- Osmosis differs from active transport, which requires energy to move substances against a concentration gradient. (Score: 0.8155)

🔍 BM25 Sparse Retrieval Results for query: osmosis definition
- In biology, osmosis plays a critical role in maintaining cell hydration and nutrient transport. (Score: 0.1620)
- Osmosis differs from active transport, which requires energy to move substances against a concentration gradient. (Score: 0.1576)
- The process 

# **Abstractive RAG**

In [None]:
import os
import google.generativeai as genai
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder
import numpy as np

# ENV variables
QDRANT_HOST = ENV_QDRANT_HOST
QDRANT_API_KEY = ENV_QDRANT_API_KEY
GEMINI_API_KEY = ENV_GEMINI_API_KEY

# Initialize Gemini
genai.configure(api_key=GEMINI_API_KEY)

# Initialize Qdrant client
qdrant = QdrantClient(
    url=QDRANT_HOST,
    api_key=QDRANT_API_KEY,
)

# Initialize cross-encoder for re-ranking
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

COLLECTION_NAME = "Abstractive"

# Step 1: Create collection (if not exists)
try:
    qdrant.get_collection(collection_name=COLLECTION_NAME)
    print(f"✅ Collection '{COLLECTION_NAME}' already exists.")
except Exception:
    qdrant.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=768, distance=Distance.COSINE),
    )
    print(f"✅ Collection '{COLLECTION_NAME}' created.")

# Step 2: Sample documents (news articles on AI advancements in 2025)
docs = [
    {"id": "1", "text": "In 2025, AI models achieved breakthroughs in multimodal processing, integrating text, images, and audio for applications in healthcare and autonomous vehicles."},
    {"id": "2", "text": "Major tech companies in 2025 invested heavily in quantum AI, promising faster computation for complex problems like climate modeling."},
    {"id": "3", "text": "Ethical AI frameworks gained traction in 2025, with new regulations in Europe to ensure transparency in AI decision-making."},
    {"id": "4", "text": "AI-driven personalized education platforms expanded in 2025, tailoring curricula to individual student needs."},
]

# Step 3: Embed documents & upload to Qdrant (Dense Retrieval)
points = []
for doc in docs:
    response = genai.embed_content(model="models/embedding-001", content=doc["text"])
    embedding = response["embedding"]
    points.append(PointStruct(id=int(doc["id"]), vector=embedding, payload={"text": doc["text"]}))

qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
print("✅ Documents upserted to Qdrant for dense retrieval.")

# Step 4: BM25 Setup for Sparse Retrieval
tokenized_docs = [doc["text"].lower().split() for doc in docs]
bm25 = BM25Okapi(tokenized_docs)

# Step 5: Query
query = "summarize AI advancements in 2025"

# Step 6: Dense Retrieval with Qdrant
query_response = genai.embed_content(model="models/embedding-001", content=query)
query_vector = query_response["embedding"]
dense_hits = qdrant.query_points(
    collection_name=COLLECTION_NAME,
    query=query_vector,
    limit=4,
    with_payload=True
).points

# Step 7: BM25 Sparse Retrieval
tokenized_query = query.lower().split()
bm25_scores = bm25.get_scores(tokenized_query)
bm25_results = [
    {"text": docs[i]["text"], "score": bm25_scores[i]}
    for i in range(len(docs))
    if bm25_scores[i] > 0
]
bm25_results = sorted(bm25_results, key=lambda x: x["score"], reverse=True)[:4]

# Step 8: Hybrid Retrieval
dense_scores = {hit.payload["text"]: hit.score for hit in dense_hits}
bm25_scores = {result["text"]: result["score"] for result in bm25_results}
all_texts = set(dense_scores.keys()).union(bm25_scores.keys())

max_dense = max(dense_scores.values(), default=1.0)
max_bm25 = max(bm25_scores.values(), default=1.0)

hybrid_results = {}
for text in all_texts:
    dense_score = dense_scores.get(text, 0) / max_dense
    bm25_score = bm25_scores.get(text, 0) / max_bm25
    hybrid_score = 0.6 * dense_score + 0.4 * bm25_score
    hybrid_results[text] = hybrid_score

# Step 9: Re-ranking with Cross-Encoder
rerank_inputs = [[query, text] for text in hybrid_results.keys()]
rerank_scores = cross_encoder.predict(rerank_inputs)
reranked_results = [
    {"text": text, "score": rerank_scores[i]}
    for i, text in enumerate(hybrid_results.keys())
]
reranked_results = sorted(reranked_results, key=lambda x: x["score"], reverse=True)[:3]

# Step 10: Abstractive Generation with Gemini
context = "\n".join([result["text"] for result in reranked_results])
prompt = f"""
Query: {query}
Context: {context}

Summarize the key AI advancements in 2025 based on the provided context. Provide a concise, coherent response that captures the main points without quoting verbatim.
Answer:
"""
model = genai.GenerativeModel("gemini-1.5-pro")
response = model.generate_content(prompt)
summary = response.text

# Step 11: Display Results
print("\n🔍 Dense Retrieval Results for query:", query)
for hit in dense_hits:
    print(f"- {hit.payload['text']} (Score: {hit.score:.4f})")

print("\n🔍 BM25 Sparse Retrieval Results for query:", query)
for result in bm25_results:
    print(f"- {result['text']} (Score: {result['score']:.4f})")

print("\n🔍 Hybrid Retrieval Results for query:", query)
for text, score in sorted(hybrid_results.items(), key=lambda x: x[1], reverse=True)[:3]:
    print(f"- {text} (Hybrid Score: {score:.4f})")

print("\n🔍 Re-ranked Hybrid Results for query:", query)
for result in reranked_results:
    print(f"- {result['text']} (Re-ranked Score: {result['score']:.4f})")

print("\n📝 Summarized Answer:")
print(summary)

✅ Collection 'Abstractive' created.
✅ Documents upserted to Qdrant for dense retrieval.

🔍 Dense Retrieval Results for query: summarize AI advancements in 2025
- In 2025, AI models achieved breakthroughs in multimodal processing, integrating text, images, and audio for applications in healthcare and autonomous vehicles. (Score: 0.8118)
- Ethical AI frameworks gained traction in 2025, with new regulations in Europe to ensure transparency in AI decision-making. (Score: 0.7499)
- Major tech companies in 2025 invested heavily in quantum AI, promising faster computation for complex problems like climate modeling. (Score: 0.7480)
- AI-driven personalized education platforms expanded in 2025, tailoring curricula to individual student needs. (Score: 0.7437)

🔍 BM25 Sparse Retrieval Results for query: summarize AI advancements in 2025
- Major tech companies in 2025 invested heavily in quantum AI, promising faster computation for complex problems like climate modeling. (Score: 1.0718)
- Ethical 

# **Mixed RAG**

In [None]:
import os
import google.generativeai as genai
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder
import numpy as np
import re

# ENV variables
QDRANT_HOST = ENV_QDRANT_HOST
QDRANT_API_KEY = ENV_QDRANT_API_KEY
GEMINI_API_KEY = ENV_GEMINI_API_KEY

# Initialize Gemini
genai.configure(api_key=GEMINI_API_KEY)

# Initialize Qdrant client
qdrant = QdrantClient(
    url=QDRANT_HOST,
    api_key=QDRANT_API_KEY,
)

# Initialize cross-encoder for re-ranking
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

COLLECTION_NAME = "mixed"

# Step 1: Create collection (if not exists)
try:
    qdrant.get_collection(collection_name=COLLECTION_NAME)
    print(f"✅ Collection '{COLLECTION_NAME}' already exists.")
except Exception:
    qdrant.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=768, distance=Distance.COSINE),
    )
    # print(f"✅ Collection '{COLLECTION_NAME}' created.)

# Step 2: Sample documents (AI risks context)
docs = [
    {"id": "1", "text": "AI systems pose ethical risks, including bias in decision-making. 'Algorithms can perpetuate existing inequalities if not carefully designed,' warns a 2025 ethics report."},
    {"id": "2", "text": "Technical risks of AI include system failures and vulnerabilities to hacking. 'A single flaw in AI could lead to catastrophic consequences,' notes a cybersecurity expert."},
    {"id": "3", "text": "Societal risks from AI involve job displacement and privacy erosion. 'Automation may disrupt 30% of jobs by 2030,' predicts an economic study."},
    {"id": "4", "text": "Regulatory gaps in AI governance increase risks of misuse. 'Without global standards, AI could be weaponized,' states a policy brief."},
]

# Step 3: Embed documents & upload to Qdrant (Dense Retrieval)
points = []
for doc in docs:
    response = genai.embed_content(model="models/embedding-001", content=doc["text"])
    embedding = response["embedding"]
    points.append(PointStruct(id=int(doc["id"]), vector=embedding, payload={"text": doc["text"], "doc_id": doc["id"]}))

qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
print("✅ Documents upserted to Qdrant for dense retrieval.")

# Step 4: BM25 Setup for Sparse Retrieval
tokenized_docs = [doc["text"].lower().split() for doc in docs]
bm25 = BM25Okapi(tokenized_docs)

# Step 5: Query
query = "summarize AI risks with quoted examples"

# Step 6: Dense Retrieval with Qdrant
query_response = genai.embed_content(model="models/embedding-001", content=query)
query_vector = query_response["embedding"]
dense_hits = qdrant.query_points(
    collection_name=COLLECTION_NAME,
    query=query_vector,
    limit=4,
    with_payload=True
).points

# Step 7: BM25 Sparse Retrieval
tokenized_query = query.lower().split()
bm25_scores = bm25.get_scores(tokenized_query)
bm25_results = [
    {"text": docs[i]["text"], "score": bm25_scores[i], "doc_id": docs[i]["id"]}
    for i in range(len(docs))
    if bm25_scores[i] > 0
]
bm25_results = sorted(bm25_results, key=lambda x: x["score"], reverse=True)[:4]

# Step 8: Hybrid Retrieval
dense_scores = {hit.payload["text"]: hit.score for hit in dense_hits}
bm25_scores = {result["text"]: result["score"] for result in bm25_results}
all_texts = set(dense_scores.keys()).union(bm25_scores.keys())

max_dense = max(dense_scores.values(), default=1.0)
max_bm25 = max(bm25_scores.values(), default=1.0)

hybrid_results = {}
doc_ids = {hit.payload["text"]: hit.payload["doc_id"] for hit in dense_hits}  # Map text to doc_id
for text in all_texts:
    dense_score = dense_scores.get(text, 0) / max_dense
    bm25_score = bm25_scores.get(text, 0) / max_bm25
    hybrid_score = 0.6 * dense_score + 0.4 * bm25_score
    hybrid_results[text] = {"score": hybrid_score, "doc_id": doc_ids.get(text, "unknown")}

# Step 9: Re-ranking with Cross-Encoder
rerank_inputs = [[query, text] for text in hybrid_results.keys()]
rerank_scores = cross_encoder.predict(rerank_inputs)
reranked_results = [
    {"text": text, "score": rerank_scores[i], "doc_id": hybrid_results[text]["doc_id"]}
    for i, text in enumerate(hybrid_results.keys())
]
reranked_results = sorted(reranked_results, key=lambda x: x["score"], reverse=True)[:3]

# Step 10: Snippet Extraction for Quotes
quotes = []
for result in reranked_results:
    text = result["text"]
    doc_id = result["doc_id"]
    # Extract quoted text within single quotes
    quoted_matches = re.findall(r"'(.*?)'", text)
    for quote in quoted_matches:
        if any(keyword in quote.lower() for keyword in ["risk", "ai", "bias", "failure", "job", "privacy", "misuse"]):
            quotes.append({"text": quote, "score": result["score"], "doc_id": doc_id})

# Sort quotes by score and select top 2
quotes = sorted(quotes, key=lambda x: x["score"], reverse=True)[:2]

# Step 11: Abstractive Summary with Gemini
context = "\n".join([result["text"] for result in reranked_results])
prompt = f"""
Query: {query}
Context: {context}

Provide a concise summary of AI risks based on the context, integrating the following quoted examples with citations:
{chr(10).join([f"- '{q['text']}' (Document {q['doc_id']})" for q in quotes])}

The response should paraphrase the main points, include the quoted examples, and cite the document IDs in parentheses.
Answer:
"""
model = genai.GenerativeModel("gemini-1.5-pro")
response = model.generate_content(prompt)
summary = response.text

# Step 12: Display Results
print("\n🔍 Dense Retrieval Results for query:", query)
for hit in dense_hits:
    print(f"- {hit.payload['text']} (Score: {hit.score:.4f})")

print("\n🔍 BM25 Sparse Retrieval Results for query:", query)
for result in bm25_results:
    print(f"- {result['text']} (Score: {result['score']:.4f})")

print("\n🔍 Hybrid Retrieval Results for query:", query)
for text, info in sorted(hybrid_results.items(), key=lambda x: x[1]["score"], reverse=True)[:3]:
    print(f"- {text} (Hybrid Score: {info['score']:.4f})")

print("\n🔍 Re-ranked Hybrid Results for query:", query)
for result in reranked_results:
    print(f"- {result['text']} (Re-ranked Score: {result['score']:.4f})")

print("\n📝 Mixed RAG Answer:")
print(summary)

✅ Documents upserted to Qdrant for dense retrieval.

🔍 Dense Retrieval Results for query: summarize AI risks with quoted examples
- Technical risks of AI include system failures and vulnerabilities to hacking. 'A single flaw in AI could lead to catastrophic consequences,' notes a cybersecurity expert. (Score: 0.7800)
- Societal risks from AI involve job displacement and privacy erosion. 'Automation may disrupt 30% of jobs by 2030,' predicts an economic study. (Score: 0.7466)
- Regulatory gaps in AI governance increase risks of misuse. 'Without global standards, AI could be weaponized,' states a policy brief. (Score: 0.7223)
- AI systems pose ethical risks, including bias in decision-making. 'Algorithms can perpetuate existing inequalities if not carefully designed,' warns a 2025 ethics report. (Score: 0.7053)

🔍 BM25 Sparse Retrieval Results for query: summarize AI risks with quoted examples
- Regulatory gaps in AI governance increase risks of misuse. 'Without global standards, AI coul

# **Agent-Based RAG**

In [None]:
import os
import google.generativeai as genai
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder
import numpy as np
from typing import List, Dict
import json
import re

# ENV variables
QDRANT_HOST = ENV_QDRANT_HOST
QDRANT_API_KEY = ENV_QDRANT_API_KEY
GEMINI_API_KEY = ENV_GEMINI_API_KEY

# Initialize Gemini
genai.configure(api_key=GEMINI_API_KEY)

# Initialize Qdrant client
qdrant = QdrantClient(
    url=QDRANT_HOST,
    api_key=QDRANT_API_KEY,
)

# Initialize cross-encoder for re-ranking
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

COLLECTION_NAME = "Agent-Based"

# Step 1: Create collection (if not exists)
try:
    qdrant.get_collection(collection_name=COLLECTION_NAME)
    print(f"✅ Collection '{COLLECTION_NAME}' already exists.")
except Exception:
    qdrant.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=768, distance=Distance.COSINE),
    )
    print(f"✅ Collection '{COLLECTION_NAME}' created.")

# Step 2: Sample local documents (vertical farming developments)
docs = [
    {"id": "1", "text": "In 2024, AeroFarms expanded its vertical farming operations with a new facility in Saudi Arabia, leveraging AI-driven analytics for crop optimization."},
    {"id": "2", "text": "LED lighting advancements in 2023 reduced energy costs for vertical farms by 20%, enabling wider adoption of hydroponics."},
    {"id": "3", "text": "Urban Crop Solutions launched a financing program with Siemens in 2022 to support scalable vertical farming infrastructure."},
    {"id": "4", "text": "The global vertical farming market grew to USD 7.51 billion in 2024, driven by demand for organic produce and sustainable practices."},
]

# Step 3: Embed documents & upload to Qdrant (Dense Retrieval)
points = []
for doc in docs:
    response = genai.embed_content(model="models/embedding-001", content=doc["text"])
    embedding = response["embedding"]
    points.append(PointStruct(id=int(doc["id"]), vector=embedding, payload={"text": doc["text"], "doc_id": doc["id"]}))

qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
print("✅ Documents upserted to Qdrant for dense retrieval.")

# Step 4: BM25 Setup for Sparse Retrieval
tokenized_docs = [doc["text"].lower().split() for doc in docs]
bm25 = BM25Okapi(tokenized_docs)

# Step 5: Simulated Web Search Tool (using provided web results)
def web_search_tool(query: str) -> List[Dict]:
    web_results = [
        {
            "source": "grandviewresearch.com",
            "text": "The global vertical farming market size was valued at USD 6.92 billion in 2023 and is expected to grow at a CAGR of 20.1% from 2023 to 2030. Vertical farms are becoming technologically advanced, with the use of LED lights and automated control systems.",
            "url": "https://www.grandviewresearch.com"
        },
        {
            "source": "marketsandmarkets.com",
            "text": "The global vertical farming market size was estimated at USD 5.6 billion in 2024 and is poised to reach USD 13.7 billion by 2029, growing at a CAGR of 19.7%. Developments in IoT, AI, and hydroponics increase efficiency.",
            "url": "https://www.marketsandmarkets.com"
        },
        {
            "source": "straitsresearch.com",
            "text": "In urban settings, vertical farms develop a farm-to-table system, reducing food packaging and waste. LED technology advancements drive market growth.",
            "url": "https://straitsresearch.com"
        }
    ]
    return web_results

# Step 6: Gemini-Powered Agent
class GeminiAgent:
    def __init__(self, model_name: str = "gemini-1.5-pro"):
        self.model = genai.GenerativeModel(model_name)

    def plan_retrieval(self, query: str) -> Dict:
        prompt = f"""
        Query: {query}
        You are an agent planning data retrieval for a market report. Decide which tools to use:
        - Web search for real-time market data and trends
        - Local document search for internal data
        Provide a plan as a JSON object with 'tools' (list) and 'rationale' (string). Ensure the response is valid JSON without markdown or code blocks.
        Example:
        {{"tools": ["web_search", "local_search"], "rationale": "Web search for real-time data and local search for internal insights."}}
        """
        try:
            response = self.model.generate_content(prompt)
            # Clean response: remove markdown code blocks or other formatting
            cleaned_text = re.sub(r'```(?:json)?\n|\n```', '', response.text).strip()
            # Parse JSON safely
            plan = json.loads(cleaned_text)
            # Validate expected structure
            if not isinstance(plan, dict) or "tools" not in plan or "rationale" not in plan:
                raise ValueError("Invalid plan structure")
            return plan
        except Exception as e:
            print(f"⚠️ Error parsing plan: {e}")
            print(f"Raw response: {response.text}")
            # Fallback plan
            return {
                "tools": ["web_search", "local_search"],
                "rationale": "Fallback: Use web search for real-time data and local search for internal insights due to parsing error."
            }

    def execute_retrieval(self, plan: Dict, query: str) -> List[Dict]:
        results = []
        for tool in plan["tools"]:
            if tool == "web_search":
                web_results = web_search_tool(query)
                results.extend([
                    {"text": r["text"], "source": r["source"], "url": r["url"], "type": "web"}
                    for r in web_results
                ])
            elif tool == "local_search":
                # Dense Retrieval
                query_response = genai.embed_content(model="models/embedding-001", content=query)
                query_vector = query_response["embedding"]
                dense_hits = qdrant.query_points(
                    collection_name=COLLECTION_NAME,
                    query=query_vector,
                    limit=4,
                    with_payload=True
                ).points
                results.extend([
                    {"text": hit.payload["text"], "source": f"Local Doc {hit.payload['doc_id']}", "type": "local"}
                    for hit in dense_hits
                ])
                # Sparse Retrieval (BM25)
                tokenized_query = query.lower().split()
                bm25_scores = bm25.get_scores(tokenized_query)
                bm25_results = [
                    {"text": docs[i]["text"], "score": bm25_scores[i], "doc_id": docs[i]["id"]}
                    for i in range(len(docs))
                    if bm25_scores[i] > 0
                ]
                bm25_results = sorted(bm25_results, key=lambda x: x["score"], reverse=True)[:4]
                results.extend([
                    {"text": r["text"], "source": f"Local Doc {r['doc_id']}", "type": "local"}
                    for r in bm25_results
                ])
        return results

    def generate_report(self, query: str, retrieved_data: List[Dict]) -> str:
        context = "\n".join([f"Source: {d['source']}\n{d['text']}" for d in retrieved_data])
        prompt = f"""
        Query: {query}
        Context: {context}

        Generate a concise market report on recent developments in vertical farming. Include:
        - A summary of market size and growth trends.
        - Key technological advancements.
        - Notable industry developments (e.g., partnerships, expansions).
        - Citations for sources in parentheses (e.g., grandviewresearch.com).
        The response should be coherent, paraphrased, and professional, avoiding verbatim quotes unless necessary.
        Answer:
        """
        response = self.model.generate_content(prompt)
        return response.text

# Step 7: Query
query = "market report on vertical farming recent developments"

# Step 8: Agent Execution
agent = GeminiAgent()
plan = agent.plan_retrieval(query)
print("\n📋 Retrieval Plan:", plan)

retrieved_data = agent.execute_retrieval(plan, query)

# Step 9: Re-ranking with Cross-Encoder
rerank_inputs = [[query, data["text"]] for data in retrieved_data]
rerank_scores = cross_encoder.predict(rerank_inputs)
reranked_data = [
    {"text": data["text"], "source": data["source"], "score": rerank_scores[i], "type": data["type"]}
    for i, data in enumerate(retrieved_data)
]
reranked_data = sorted(reranked_data, key=lambda x: x["score"], reverse=True)[:5]

# Step 10: Generate Market Report
report = agent.generate_report(query, reranked_data)

# Step 11: Display Results
print("\n🔍 Retrieved Data:")
for data in retrieved_data:
    print(f"- Source: {data['source']}\n  {data['text']}")

print("\n🔍 Re-ranked Data:")
for data in reranked_data:
    print(f"- Source: {data['source']} (Score: {data['score']:.4f})\n  {data['text']}")

print("\n📝 Market Report:")
print(report)

✅ Collection 'Agent-Based' created.
✅ Documents upserted to Qdrant for dense retrieval.

📋 Retrieval Plan: {'tools': ['web_search', 'local_document_search'], 'rationale': 'Use web search to gather recent developments, market trends, competitor analysis, and publicly available market data on vertical farming. Utilize local document search to find internal reports, presentations, and data that may offer proprietary insights and complement publicly available information for a comprehensive market report.'}

🔍 Retrieved Data:
- Source: grandviewresearch.com
  The global vertical farming market size was valued at USD 6.92 billion in 2023 and is expected to grow at a CAGR of 20.1% from 2023 to 2030. Vertical farms are becoming technologically advanced, with the use of LED lights and automated control systems.
- Source: marketsandmarkets.com
  The global vertical farming market size was estimated at USD 5.6 billion in 2024 and is poised to reach USD 13.7 billion by 2029, growing at a CAGR of 

# **Multi-Modal RAG**

In [None]:
import os
import google.generativeai as genai
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder
import numpy as np
import re
import json

# ENV variables
QDRANT_HOST = ENV_QDRANT_HOST
QDRANT_API_KEY = ENV_QDRANT_API_KEY
GEMINI_API_KEY = ENV_GEMINI_API_KEY

# Initialize Gemini
genai.configure(api_key=GEMINI_API_KEY)

# Initialize Qdrant client
qdrant = QdrantClient(
    url=QDRANT_HOST,
    api_key=QDRANT_API_KEY,
)

# Initialize cross-encoder for re-ranking
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

COLLECTION_NAME = "Multi-Modal"

# Step 1: Create collection (if not exists)
try:
    qdrant.get_collection(collection_name=COLLECTION_NAME)
    print(f"✅ Collection '{COLLECTION_NAME}' already exists.")
except Exception:
    qdrant.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=768, distance=Distance.COSINE),
    )
    print(f"✅ Collection '{COLLECTION_NAME}' created.")

# Step 2: Sample documents (bird descriptions)
docs = [
    {"id": "1", "text": "The Northern Cardinal is a medium-sized songbird with a bright red crest, red body, and black face mask. It is commonly found in North America."},
    {"id": "2", "text": "The American Robin is a migratory bird with a reddish-orange breast, dark wings, and a white eye ring. It is widespread across the United States."},
    {"id": "3", "text": "The Blue Jay is known for its striking blue and white plumage, with a distinctive crest and black collar. It inhabits woodlands and suburban areas."},
    {"id": "4", "text": "The Black-capped Chickadee is a small bird with a black cap, white cheeks, and gray wings. It is known for its cheerful 'chick-a-dee' call."},
]

# Step 3: Embed documents & upload to Qdrant (Dense Retrieval)
points = []
for doc in docs:
    response = genai.embed_content(model="models/embedding-001", content=doc["text"])
    embedding = response["embedding"]
    points.append(PointStruct(id=int(doc["id"]), vector=embedding, payload={"text": doc["text"], "doc_id": doc["id"]}))

qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
print("✅ Documents upserted to Qdrant for dense retrieval.")

# Step 4: BM25 Setup for Sparse Retrieval
tokenized_docs = [doc["text"].lower().split() for doc in docs]
bm25 = BM25Okapi(tokenized_docs)

# Step 5: Simulated Image Input and Query
# In practice, replace image_description with an actual image file (e.g., Image.open("bird.jpg"))
image_description = "A small bird with a red crest and black wings."
text_query = "What bird is this?"
combined_query = f"{text_query} Description: {image_description}"

# Step 6: Simulated Visual Database (for image comparison)
# In practice, store image embeddings in Qdrant using a vision model (e.g., CLIP)
visual_db = [
    {"id": "img1", "description": "Bright red crest, black face, red body", "species": "Northern Cardinal"},
    {"id": "img2", "description": "Reddish-orange breast, dark wings", "species": "American Robin"},
    {"id": "img3", "description": "Blue and white plumage, black collar", "species": "Blue Jay"},
    {"id": "img4", "description": "Black cap, white cheeks, gray wings", "species": "Black-capped Chickadee"},
]

def match_image(image_description: str) -> list[dict]:
    # Simulate image matching by comparing descriptions
    matches = []
    for img in visual_db:
        if "red crest" in image_description.lower() and "red crest" in img["description"].lower():
            matches.append({"species": img["species"], "description": img["description"], "score": 0.9})
        elif "black wings" in image_description.lower() and "wings" in img["description"].lower():
            matches.append({"species": img["species"], "description": img["description"], "score": 0.7})
    return sorted(matches, key=lambda x: x["score"], reverse=True)[:1]

# Step 7: Dense Retrieval with Qdrant
query_response = genai.embed_content(model="models/embedding-001", content=combined_query)
query_vector = query_response["embedding"]
dense_hits = qdrant.query_points(
    collection_name=COLLECTION_NAME,
    query=query_vector,
    limit=4,
    with_payload=True
).points

# Step 8: BM25 Sparse Retrieval
tokenized_query = combined_query.lower().split()
bm25_scores = bm25.get_scores(tokenized_query)
bm25_results = [
    {"text": docs[i]["text"], "score": bm25_scores[i], "doc_id": docs[i]["id"]}
    for i in range(len(docs))
    if bm25_scores[i] > 0
]
bm25_results = sorted(bm25_results, key=lambda x: x["score"], reverse=True)[:4]

# Step 9: Hybrid Retrieval
dense_scores = {hit.payload["text"]: hit.score for hit in dense_hits}
bm25_scores = {result["text"]: result["score"] for result in bm25_results}
all_texts = set(dense_scores.keys()).union(bm25_scores.keys())

max_dense = max(dense_scores.values(), default=1.0)
max_bm25 = max(bm25_scores.values(), default=1.0)

hybrid_results = {}
doc_ids = {hit.payload["text"]: hit.payload["doc_id"] for hit in dense_hits}
for text in all_texts:
    dense_score = dense_scores.get(text, 0) / max_dense
    bm25_score = bm25_scores.get(text, 0) / max_bm25
    hybrid_score = 0.6 * dense_score + 0.4 * bm25_score
    hybrid_results[text] = {"score": hybrid_score, "doc_id": doc_ids.get(text, "unknown")}

# Step 10: Re-ranking with Cross-Encoder
rerank_inputs = [[combined_query, text] for text in hybrid_results.keys()]
rerank_scores = cross_encoder.predict(rerank_inputs)
reranked_results = [
    {"text": text, "score": rerank_scores[i], "doc_id": hybrid_results[text]["doc_id"]}
    for i, text in enumerate(hybrid_results.keys())
]
reranked_results = sorted(reranked_results, key=lambda x: x["score"], reverse=True)[:3]

# Step 11: Image Matching
image_matches = match_image(image_description)
image_context = "\n".join([f"Species: {m['species']}, Description: {m['description']}" for m in image_matches])

# Step 12: Multi-Modal Generation with Gemini
context = "\n".join([result["text"] for result in reranked_results])
prompt = f"""
Query: {text_query}
Image Description: {image_description}
Textual Context: {context}
Image Context: {image_context}

Identify the bird based on the image description and provided context. Provide a concise response, including:
- The bird species.
- Key identifying features.
- A brief description from the context to support the identification.
Use the textual and image context to ensure accuracy. Return the response as plain text without markdown or code blocks.
"""
model = genai.GenerativeModel("gemini-1.5-pro")
# In practice, pass an actual image: response = model.generate_content([prompt, Image.open("bird.jpg")])
response = model.generate_content(prompt)
# Clean response to remove any markdown
cleaned_response = re.sub(r'```(?:text)?\n|\n```', '', response.text).strip()
identification = cleaned_response

# Step 13: Display Results
print("\n🔍 Dense Retrieval Results for query:", combined_query)
for hit in dense_hits:
    print(f"- {hit.payload['text']} (Score: {hit.score:.4f})")

print("\n🔍 BM25 Sparse Retrieval Results for query:", combined_query)
for result in bm25_results:
    print(f"- {result['text']} (Score: {result['score']:.4f})")

print("\n🔍 Hybrid Retrieval Results for query:", combined_query)
for text, info in sorted(hybrid_results.items(), key=lambda x: x[1]["score"], reverse=True)[:3]:
    print(f"- {text} (Hybrid Score: {info['score']:.4f})")

print("\n🔍 Re-ranked Hybrid Results for query:", combined_query)
for result in reranked_results:
    print(f"- {result['text']} (Re-ranked Score: {result['score']:.4f})")

print("\n🔍 Image Matching Results:")
for match in image_matches:
    print(f"- Species: {match['species']} (Score: {match['score']:.4f})")

print("\n📝 Bird Identification:")
print(identification)

✅ Collection 'Multi-Modal' created.
✅ Documents upserted to Qdrant for dense retrieval.

🔍 Dense Retrieval Results for query: What bird is this? Description: A small bird with a red crest and black wings.
- The Northern Cardinal is a medium-sized songbird with a bright red crest, red body, and black face mask. It is commonly found in North America. (Score: 0.7643)
- The Black-capped Chickadee is a small bird with a black cap, white cheeks, and gray wings. It is known for its cheerful 'chick-a-dee' call. (Score: 0.7496)
- The American Robin is a migratory bird with a reddish-orange breast, dark wings, and a white eye ring. It is widespread across the United States. (Score: 0.7245)
- The Blue Jay is known for its striking blue and white plumage, with a distinctive crest and black collar. It inhabits woodlands and suburban areas. (Score: 0.7099)

🔍 BM25 Sparse Retrieval Results for query: What bird is this? Description: A small bird with a red crest and black wings.
- The Black-capped Chi

# **Memory-Augmented RAG**

In [None]:
import os
import google.generativeai as genai
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder
import numpy as np
import re
from typing import List, Dict

# ENV variables
QDRANT_HOST = ENV_QDRANT_HOST
QDRANT_API_KEY = ENV_QDRANT_API_KEY
GEMINI_API_KEY = ENV_GEMINI_API_KEY

# Initialize Gemini
genai.configure(api_key=GEMINI_API_KEY)

# Initialize Qdrant client
qdrant = QdrantClient(
    url=QDRANT_HOST,
    api_key=QDRANT_API_KEY,
)

# Initialize cross-encoder for re-ranking
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

COLLECTION_NAME = "Memory-Augmented"

# Step 1: Create collection (if not exists)
try:
    qdrant.get_collection(collection_name=COLLECTION_NAME)
    print(f"Collection '{COLLECTION_NAME}' already exists.")
except Exception:
    qdrant.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=768, distance=Distance.COSINE),
    )
    print(f"Collection '{COLLECTION_NAME}' created.")

# Step 2: Sample documents (router troubleshooting)
docs = [
    {"id": "1", "text": "If the router is not connecting to the internet after a reboot, check the Ethernet cable connection to the modem and ensure the modem is powered on."},
    {"id": "2", "text": "A common issue post-reboot is incorrect Wi-Fi settings. Verify the SSID and password in the router’s admin panel."},
    {"id": "3", "text": "If the router’s lights are blinking amber after reboot, perform a factory reset by holding the reset button for 10 seconds."},
    {"id": "4", "text": "Slow internet after rebooting the router may indicate interference. Change the Wi-Fi channel to 1, 6, or 11 in the router settings."},
]

# Step 3: Embed documents & upload to Qdrant (Dense Retrieval)
points = []
for doc in docs:
    response = genai.embed_content(model="models/embedding-001", content=doc["text"])
    embedding = response["embedding"]
    points.append(PointStruct(id=int(doc["id"]), vector=embedding, payload={"text": doc["text"], "doc_id": doc["id"]}))

qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
print("Documents upserted to Qdrant for dense retrieval.")

# Step 4: BM25 Setup for Sparse Retrieval
tokenized_docs = [doc["text"].lower().split() for doc in docs]
bm25 = BM25Okapi(tokenized_docs)

# Step 5: Memory-Augmented Chatbot Class
class TroubleshootingChatbot:
    def __init__(self, model_name: str = "gemini-1.5-pro", memory_size: int = 5):
        self.model = genai.GenerativeModel(model_name)
        self.memory: List[Dict] = []  # Store conversation history
        self.memory_size = memory_size  # Limit memory to last N exchanges

    def add_to_memory(self, user_query: str, system_response: str):
        """Add a user query and system response to memory."""
        self.memory.append({"user": user_query, "system": system_response})
        # Keep only the last memory_size exchanges
        self.memory = self.memory[-self.memory_size:]

    def get_context(self) -> str:
        """Generate context string from memory."""
        context = ""
        for exchange in self.memory:
            context += f"User: {exchange['user']}\nSystem: {exchange['system']}\n"
        return context.strip()

    def retrieve_documents(self, query: str) -> List[Dict]:
        """Retrieve documents using hybrid retrieval, augmented by conversation context."""
        # Combine current query with memory context
        context = self.get_context()
        augmented_query = f"{context}\nCurrent Query: {query}" if context else query

        # Dense Retrieval
        query_response = genai.embed_content(model="models/embedding-001", content=augmented_query)
        query_vector = query_response["embedding"]
        dense_hits = qdrant.query_points(
            collection_name=COLLECTION_NAME,
            query=query_vector,
            limit=4,
            with_payload=True
        ).points

        # Sparse Retrieval (BM25)
        tokenized_query = augmented_query.lower().split()
        bm25_scores = bm25.get_scores(tokenized_query)
        bm25_results = [
            {"text": docs[i]["text"], "score": bm25_scores[i], "doc_id": docs[i]["id"]}
            for i in range(len(docs))
            if bm25_scores[i] > 0
        ]
        bm25_results = sorted(bm25_results, key=lambda x: x["score"], reverse=True)[:4]

        # Hybrid Retrieval
        dense_scores = {hit.payload["text"]: hit.score for hit in dense_hits}
        bm25_scores = {result["text"]: result["score"] for result in bm25_results}
        all_texts = set(dense_scores.keys()).union(bm25_scores.keys())

        max_dense = max(dense_scores.values(), default=1.0)
        max_bm25 = max(bm25_scores.values(), default=1.0)

        hybrid_results = {}
        doc_ids = {hit.payload["text"]: hit.payload["doc_id"] for hit in dense_hits}
        for text in all_texts:
            dense_score = dense_scores.get(text, 0) / max_dense
            bm25_score = bm25_scores.get(text, 0) / max_bm25
            hybrid_score = 0.6 * dense_score + 0.4 * bm25_score
            hybrid_results[text] = {"score": hybrid_score, "doc_id": doc_ids.get(text, "unknown")}

        # Re-ranking with Cross-Encoder
        rerank_inputs = [[augmented_query, text] for text in hybrid_results.keys()]
        rerank_scores = cross_encoder.predict(rerank_inputs)
        reranked_results = [
            {"text": text, "score": rerank_scores[i], "doc_id": hybrid_results[text]["doc_id"]}
            for i, text in enumerate(hybrid_results.keys())
        ]
        return sorted(reranked_results, key=lambda x: x["score"], reverse=True)[:3]

    def generate_response(self, query: str, retrieved_docs: List[Dict]) -> str:
        """Generate a troubleshooting response using conversation history and retrieved documents."""
        context = self.get_context()
        doc_context = "\n".join([f"Doc {d['doc_id']}: {d['text']}" for d in retrieved_docs])
        prompt = f"""
        You are a troubleshooting chatbot helping with router issues. Use the conversation history and retrieved documents to provide a concise, relevant response to the current query. Avoid markdown or code blocks in the response.

        Conversation History:
        {context}

        Retrieved Documents:
        {doc_context}

        Current Query: {query}

        Provide a clear troubleshooting step or answer, referencing prior conversation details (e.g., "Since you rebooted the router") if relevant. Keep the response natural and concise.
        """
        try:
            response = self.model.generate_content(prompt)
            # Clean response to remove any markdown
            cleaned_response = re.sub(r'```(?:text)?\n|\n```', '', response.text).strip()
            return cleaned_response
        except Exception as e:
            print(f"Error generating response: {e}")
            return "Sorry, I encountered an issue. Please try again or provide more details."

# Step 6: Simulate Troubleshooting Interaction
chatbot = TroubleshootingChatbot()

# Simulated conversation
queries = [
    "My router isn’t connecting to the internet. I just rebooted it.",
    "The lights on the router are blinking amber now. What should I do?"
]

for query in queries:
    # Retrieve documents
    retrieved_docs = chatbot.retrieve_documents(query)

    # Generate response
    response = chatbot.generate_response(query, retrieved_docs)

    # Add to memory
    chatbot.add_to_memory(query, response)

    # Display results
    print(f"\nUser Query: {query}")
    print("\nRetrieved Documents:")
    for doc in retrieved_docs:
        print(f"- Doc {doc['doc_id']}: {doc['text']} (Score: {doc['score']:.4f})")
    print("\nSystem Response:")
    print(response)

# Step 7: Display Conversation History
print("\nConversation History:")
print(chatbot.get_context())

✅ Collection 'Memory-Augmented' created.
✅ Documents upserted to Qdrant for dense retrieval.

🔍 User Query: My router isn’t connecting to the internet. I just rebooted it.

🔍 Retrieved Documents:
- Doc 1: If the router is not connecting to the internet after a reboot, check the Ethernet cable connection to the modem and ensure the modem is powered on. (Score: 7.3244)
- Doc 4: Slow internet after rebooting the router may indicate interference. Change the Wi-Fi channel to 1, 6, or 11 in the router settings. (Score: 1.9415)
- Doc 2: A common issue post-reboot is incorrect Wi-Fi settings. Verify the SSID and password in the router’s admin panel. (Score: 0.1264)

📝 System Response:
Since you rebooted the router and it's not connecting to the internet, first check that the Ethernet cable is securely connected to both the router and your modem.  Also, make sure your modem is powered on.

🔍 User Query: The lights on the router are blinking amber now. What should I do?

🔍 Retrieved Documents:
-

# **Structured Data RAG**

In [None]:
import os
import google.generativeai as genai
import sqlite3
import re
import json
from datetime import datetime, timedelta
from typing import Dict, Any

# ENV variables
GEMINI_API_KEY = ENV_GEMINI_API_KEY

# Initialize Gemini
genai.configure(api_key=GEMINI_API_KEY)

# Step 1: Set up SQLite database
def setup_database():
    """Create and populate a sample sales database."""
    conn = sqlite3.connect(":memory:")  # In-memory database for demo
    cursor = conn.cursor()

    # Create sales table
    cursor.execute("""
        CREATE TABLE sales (
            sale_id INTEGER PRIMARY KEY,
            sale_date DATE,
            amount FLOAT,
            product_category TEXT
        )
    """)

    # Insert sample data (sales from Q4 2024 and earlier)
    sample_sales = [
        ("2024-10-15", 1500.50, "Electronics"),
        ("2024-11-01", 800.25, "Clothing"),
        ("2024-12-10", 1200.75, "Electronics"),
        ("2024-07-05", 600.00, "Books"),
        ("2024-06-30", 900.00, "Clothing"),
    ]
    cursor.executemany("INSERT INTO sales (sale_date, amount, product_category) VALUES (?, ?, ?)", sample_sales)
    conn.commit()
    return conn

# Step 2: Structured Data RAG Class
class StructuredDataRAG:
    def __init__(self, model_name: str = "gemini-1.5-pro", db_conn: sqlite3.Connection = None):
        self.model = genai.GenerativeModel(model_name)
        self.conn = db_conn
        self.schema = """
        Table: sales
        Columns:
        - sale_id (INTEGER, PRIMARY KEY)
        - sale_date (DATE, e.g., '2024-07-15')
        - amount (FLOAT, sale amount in USD)
        - product_category (TEXT, e.g., 'Electronics', 'Clothing')
        """

    def generate_sql_query(self, query: str) -> str:
        """Generate an SQL query based on the natural language query."""
        prompt = f"""
        You are an expert SQL query generator. Given a natural language query and a database schema, generate a valid SQL query to retrieve the requested data. Return only the SQL query as plain text, without markdown, code blocks, or explanations.

        Database Schema:
        {self.schema}

        Query: {query}

        Example:
        For "total sales in 2024", return: SELECT SUM(amount) FROM sales WHERE strftime('%Y', sale_date) = '2024'

        Notes:
        - For "last quarter," assume the current date is {datetime.now().strftime('%Y-%m-%d')} and target the previous quarter (e.g., Q4 2024 for April 2025).
        - Use strftime for date comparisons.

        Generate the SQL query:
        """
        try:
            response = self.model.generate_content(prompt)
            cleaned_response = re.sub(r'```(?:sql)?\n|\n```', '', response.text).strip()
            return cleaned_response
        except Exception as e:
            print(f"⚠️ Error generating SQL query: {e}")
            return "SELECT 0 AS error"  # Fallback query

    def execute_query(self, sql_query: str) -> List[Dict]:
        """Execute the SQL query and return results as a list of dictionaries."""
        try:
            cursor = self.conn.cursor()
            cursor.execute(sql_query)
            columns = [desc[0] for desc in cursor.description]
            results = [dict(zip(columns, row)) for row in cursor.fetchall()]
            # Handle null or empty results
            if not results:
                return [{"total_sales": 0.0}]
            return results
        except Exception as e:
            print(f"⚠️ Error executing SQL query: {e}")
            return [{"error": "Failed to execute query"}]

    def generate_response(self, query: str, data: List[Dict]) -> str:
        """Generate a natural language response based on retrieved data."""
        # Convert data to JSON string without f-string to avoid format specifier issues
        data_str = json.dumps(data, indent=2)
        # Build prompt as a regular string concatenation to avoid f-string issues
        prompt = (
            "You are a data analyst. Given a natural language query and retrieved data from a database, "
            "generate a concise, natural language response summarizing the results. "
            "Return the response as plain text without markdown or code blocks.\n\n"
            "Query: " + query + "\n\n"
            "Retrieved Data: " + data_str + "\n\n"
            "Example:\n"
            "Query: total sales in 2024\n"
            "Data: [{\"sum\": 5000.0}]\n"
            "Response: The total sales in 2024 were $5,000.\n\n"
            "Generate the response:"
        )
        try:
            response = self.model.generate_content(prompt)
            cleaned_response = re.sub(r'```(?:text)?\n|\n```', '', response.text).strip()
            # Handle null or zero results in the response
            if any("total_sales" in d and d["total_sales"] == 0.0 for d in data):
                return "No sales were recorded for the last quarter."
            return cleaned_response
        except Exception as e:
            print(f"⚠️ Error generating response: {e}")
            return "Sorry, I couldn’t process the data. Please try again."

# Step 3: Simulate Query
query = "total sales last quarter"

# Step 4: Initialize Database and RAG
conn = setup_database()
rag = StructuredDataRAG(db_conn=conn)

# Step 5: Generate and Execute SQL Query
sql_query = rag.generate_sql_query(query)
print("\n🔍 Generated SQL Query:")
print(sql_query)

data = rag.execute_query(sql_query)
print("\n🔍 Retrieved Data:")
print(json.dumps(data, indent=2))

# Step 6: Generate Response
response = rag.generate_response(query, data)
print("\n📝 Response:")
print(response)

# Step 7: Clean up
conn.close()


🔍 Generated SQL Query:
SELECT SUM(amount) FROM sales WHERE strftime('%Y', sale_date) = '2024' AND strftime('%m', sale_date) BETWEEN '10' AND '12'

🔍 Retrieved Data:
[
  {
    "SUM(amount)": 3501.5
  }
]

📝 Response:
Total sales last quarter were $3,501.50.


# **Graph-Based RAG**

In [None]:
import os
import google.generativeai as genai
import re
import json
from typing import List, Dict, Any

# ENV variables
GEMINI_API_KEY = ENV_GEMINI_API_KEY

# Initialize Gemini
genai.configure(api_key=GEMINI_API_KEY)

# Step 1: Mock Neo4j Knowledge Graph
class MockNeo4j:
    """Simulate a Neo4j knowledge graph with movie data."""
    def __init__(self):
        # Sample graph data: nodes (Actor, Movie, Director) and relationships
        self.graph_data = [
            {"actor": "Christian Bale", "movie": "The Dark Knight", "director": "Christopher Nolan", "year": 2008},
            {"actor": "Heath Ledger", "movie": "The Dark Knight", "director": "Christopher Nolan", "year": 2008},
            {"actor": "Leonardo DiCaprio", "movie": "Inception", "director": "Christopher Nolan", "year": 2010},
            {"actor": "Joseph Gordon-Levitt", "movie": "Inception", "director": "Christopher Nolan", "year": 2010},
            {"actor": "Cillian Murphy", "movie": "Oppenheimer", "director": "Christopher Nolan", "year": 2023},
            {"actor": "Robert Downey Jr.", "movie": "Oppenheimer", "director": "Christopher Nolan", "year": 2023},
            {"actor": "Keanu Reeves", "movie": "The Matrix", "director": "Wachowskis", "year": 1999},
        ]

    def run_query(self, cypher_query: str) -> List[Dict]:
        """Simulate executing a Cypher query against the graph."""
        try:
            results = []
            # Parse common Cypher patterns
            if "MATCH (a:Actor)-[:ACTED_IN]->(m:Movie)<-[:DIRECTED]-(d:Director {name: 'Christopher Nolan'})" in cypher_query:
                for entry in self.graph_data:
                    if entry["director"] == "Christopher Nolan":
                        result = {"actor": entry["actor"], "movie": entry["movie"]}
                        if "m.year" in cypher_query:
                            result["year"] = entry["year"]
                        results.append(result)
            elif "MATCH (a:Actor)-[:ACTED_IN]->(m:Movie)" in cypher_query and "Christopher Nolan" in cypher_query:
                for entry in self.graph_data:
                    if entry["director"] == "Christopher Nolan":
                        result = {"actor": entry["actor"], "movie": entry["movie"]}
                        if "m.year" in cypher_query:
                            result["year"] = entry["year"]
                        results.append(result)
            else:
                results.append({"error": "Unsupported query"})
            return results if results else [{"error": "No results found"}]
        except Exception as e:
            print(f"⚠️ Error executing Cypher query: {e}")
            return [{"error": "Failed to execute query"}]

# Step 2: Graph-Based RAG Class
class GraphBasedRAG:
    def __init__(self, model_name: str = "gemini-1.5-pro", graph_db: Any = None):
        self.model = genai.GenerativeModel(model_name)
        self.graph_db = graph_db
        self.schema = """
        Knowledge Graph Schema:
        Nodes:
        - Actor (properties: name)
        - Movie (properties: title, year)
        - Director (properties: name)
        Relationships:
        - (:Actor)-[:ACTED_IN]->(:Movie)
        - (:Director)-[:DIRECTED]->(:Movie)
        Example:
        (a:Actor {name: 'Christian Bale'})-[:ACTED_IN]->(m:Movie {title: 'The Dark Knight', year: 2008})<-[:DIRECTED]-(d:Director {name: 'Christopher Nolan'})
        """

    def is_safe_cypher(self, query: str) -> bool:
        """Validate Cypher query for safety."""
        dangerous_keywords = ["CREATE", "DELETE", "REMOVE", "SET", "MERGE"]
        return not any(keyword in query.upper() for keyword in dangerous_keywords)

    def generate_cypher_query(self, query: str) -> str:
        """Generate a Cypher query based on the natural language query."""
        prompt = f"""
        You are an expert Cypher query generator for a Neo4j knowledge graph. Given a natural language query and a graph schema, generate a valid Cypher query to retrieve the requested data. Return only the Cypher query as plain text, without markdown, code blocks, or explanations.

        Graph Schema:
        {self.schema}

        Query: {query}

        Example:
        For "actors in Nolan’s movies", return:
        MATCH (a:Actor)-[:ACTED_IN]->(m:Movie)<-[:DIRECTED]-(d:Director {{name: 'Christopher Nolan'}})
        RETURN a.name AS actor, m.title AS movie

        Generate the Cypher query:
        """
        try:
            response = self.model.generate_content(prompt)
            cleaned_response = re.sub(r'```(?:cypher)?\n|\n```', '', response.text).strip()
            if not self.is_safe_cypher(cleaned_response):
                print("⚠️ Unsafe Cypher query detected")
                return "MATCH () RETURN 'error' AS error"
            return cleaned_response
        except Exception as e:
            print(f"⚠️ Error generating Cypher query: {e}")
            return "MATCH () RETURN 'error' AS error"

    def execute_query(self, cypher_query: str) -> List[Dict]:
        """Execute the Cypher query against the graph database."""
        return self.graph_db.run_query(cypher_query)

    def generate_response(self, query: str, data: List[Dict]) -> str:
        """Generate a natural language response based on retrieved graph data."""
        data_str = json.dumps(data, indent=2)
        prompt = (
            "You are a data analyst. Given a natural language query and retrieved data from a knowledge graph, "
            "generate a concise, natural language response summarizing the results. "
            "Return the response as plain text without markdown or code blocks.\n\n"
            "Query: " + query + "\n\n"
            "Retrieved Data: " + data_str + "\n\n"
            "Example:\n"
            "Query: actors in Nolan’s movies\n"
            "Data: [{\"actor\": \"Christian Bale\", \"movie\": \"The Dark Knight\"}, {\"actor\": \"Leonardo DiCaprio\", \"movie\": \"Inception\"}]\n"
            "Response: Actors in Christopher Nolan’s movies include Christian Bale (The Dark Knight) and Leonardo DiCaprio (Inception).\n\n"
            "Generate the response:"
        )
        try:
            response = self.model.generate_content(prompt)
            cleaned_response = re.sub(r'```(?:text)?\n|\n```', '', response.text).strip()
            if any("error" in d for d in data):
                return "No relevant data found for the query."
            return cleaned_response
        except Exception as e:
            print(f"⚠️ Error generating response: {e}")
            return "Sorry, I couldn’t process the data. Please try again."

    def verify_response(self, query: str, cypher_query: str, data: List[Dict], response: str) -> str:
        """Verify the accuracy of the generated response."""
        data_str = json.dumps(data, indent=2)
        prompt = (
            "Verify if the response '" + response + "' accurately reflects the query '" + query + "', "
            "Cypher query '" + cypher_query + "', and data " + data_str + ".\n"
            "Return a plain text verdict without markdown or code blocks."
        )
        try:
            verification = self.model.generate_content(prompt)
            return re.sub(r'```(?:text)?\n|\n```', '', verification.text).strip()
        except Exception as e:
            print(f"⚠️ Error verifying response: {e}")
            return "Verification failed."

# Step 3: Simulate Query
query = "actors in Nolan’s movies"

# Step 4: Initialize Graph and RAG
graph_db = MockNeo4j()
rag = GraphBasedRAG(graph_db=graph_db)

# Step 5: Generate and Execute Cypher Query
cypher_query = rag.generate_cypher_query(query)
print("\n🔍 Generated Cypher Query:")
print(cypher_query)

data = rag.execute_query(cypher_query)
print("\n🔍 Retrieved Data:")
print(json.dumps(data, indent=2))

# Step 6: Generate Response
response = rag.generate_response(query, data)
print("\n📝 Response:")
print(response)

# Step 7: Verify Response
verification = rag.verify_response(query, cypher_query, data, response)
print("\n🔍 Verification:")
print(verification)


🔍 Generated Cypher Query:
MATCH (a:Actor)-[:ACTED_IN]->(m:Movie)<-[:DIRECTED]-(d:Director {name: 'Christopher Nolan'}) RETURN a.name

🔍 Retrieved Data:
[
  {
    "actor": "Christian Bale",
    "movie": "The Dark Knight"
  },
  {
    "actor": "Heath Ledger",
    "movie": "The Dark Knight"
  },
  {
    "actor": "Leonardo DiCaprio",
    "movie": "Inception"
  },
  {
    "actor": "Joseph Gordon-Levitt",
    "movie": "Inception"
  },
  {
    "actor": "Cillian Murphy",
    "movie": "Oppenheimer"
  },
  {
    "actor": "Robert Downey Jr.",
    "movie": "Oppenheimer"
  }
]

📝 Response:
Actors in Christopher Nolan's movies include Christian Bale (The Dark Knight), Heath Ledger (The Dark Knight), Leonardo DiCaprio (Inception), Joseph Gordon-Levitt (Inception), Cillian Murphy (Oppenheimer), and Robert Downey Jr. (Oppenheimer).

🔍 Verification:
Yes, the response accurately reflects the query and the data.
