# hybrid search and reranking in rag (demo)
## rutgers ieee ml/ai workshop by mehek ☃️

week 2 of natural language processing track

february 18, 2026

## install packages

In [None]:
!pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib
!pip install sentence-transformers


In [None]:
!pip install rank-bm25

In [None]:
# Colab-only step (imports from google.colab): authenticate the current user.
# Presumably this is what lets the Drive client built in the next cell work
# without explicit credentials — confirm if running outside Colab.
from google.colab import auth
auth.authenticate_user()

In [None]:
from googleapiclient.discovery import build

# Drive API v3 client used by the listing and export cells below.
# No credentials are passed here; this presumably relies on the Colab
# authentication performed in the previous cell (confirm).
drive_service = build('drive', 'v3')


## loading all google docs that you own

In [None]:
def list_my_docs(service=None):
    """Return metadata for every non-trashed Google Doc owned by the current user.

    Pages through the Drive ``files.list`` endpoint, following
    ``nextPageToken`` until the API stops returning one.

    Args:
        service: Drive API client to query. When omitted, the notebook-level
            ``drive_service`` is used.

    Returns:
        list[dict]: One dict per owned document with the requested ``id``,
        ``name`` and ``owners`` fields, in the order the API returned them.
    """
    if service is None:
        service = drive_service

    docs = []
    page_token = None

    while True:
        response = service.files().list(
            q="mimeType='application/vnd.google-apps.document' and trashed=false",
            spaces='drive',
            fields='nextPageToken, files(id, name, owners)',
            pageToken=page_token
        ).execute()

        for file in response.get('files', []):
            # Drive does not always populate ``owners`` (e.g. shared-drive
            # items), so never index into it blindly. A doc counts as "mine"
            # if any listed owner is flagged ``me``.
            owners = file.get('owners') or []
            if any(owner.get('me') for owner in owners):
                docs.append(file)

        page_token = response.get('nextPageToken')
        if page_token is None:
            break

    return docs

# Fetch the metadata once; the export cell below reads ``all_docs``.
all_docs = list_my_docs()
print("Total Docs You Own:", len(all_docs))

In [None]:
MAX_DOCS = 200  # upper bound on documents exported; increase cautiously
MIN_DOC_CHARS = 100  # exports with this many characters or fewer are skipped

# Despite the name, this holds the plain-text bodies of exported Google Docs.
# The name is kept because the chunking cell reads ``emails``.
emails = []
failed_exports = []  # (document name or id, exception) for each failed export

for file in all_docs[:MAX_DOCS]:
    try:
        request = drive_service.files().export_media(
            fileId=file['id'],
            mimeType='text/plain'
        )
        content = request.execute()
        # 'utf-8-sig' decodes like UTF-8 but also drops a leading byte-order
        # mark if the export has one; .strip() alone would leave it in place.
        text = content.decode('utf-8-sig').strip()
    except Exception as exc:
        # Best-effort load: one bad document should not stop the demo, but
        # record the failure so it is reported instead of vanishing silently.
        failed_exports.append((file.get('name') or file.get('id'), exc))
        continue

    if len(text) > MIN_DOC_CHARS:
        emails.append(text)

print("Loaded documents:", len(emails))
if failed_exports:
    print("Documents that failed to export:", len(failed_exports))
    for doc_name, error in failed_exports:
        print(f"  - {doc_name}: {error!r}")


## chunking

In [None]:
def chunk_text(text, chunk_size=1000, overlap=150):
    """Split ``text`` into fixed-width character windows that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    Chunking stops as soon as a chunk reaches the end of the text, which
    avoids emitting a final chunk that only repeats the previous chunk's tail.

    Args:
        text: String to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Characters shared between neighbouring chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        list[str]: The chunks in document order (empty for empty ``text``).

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. An
            overlap >= chunk_size would otherwise never advance the window.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= len(text):
            # This chunk already covers the rest of the text.
            break
        start += step
    return chunks

# Flatten every document's chunks into one list. Positions in this list are
# the chunk ids used by the search functions below.
chunked_docs = [piece for document in emails for piece in chunk_text(document)]

print("Total chunks:", len(chunked_docs))

## setting up dense search

creating embeddings for all the chunks takes the longest amount of time in our code!

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Sentence-embedding model; dense_search() reuses it to embed queries.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# Embeds every chunk up front. dense_search() indexes chunked_docs with
# positions taken from similarities over this array, so the two must stay
# aligned — recompute this cell whenever chunked_docs changes.
embeddings = embedding_model.encode(chunked_docs, convert_to_numpy=True)

In [None]:
def dense_search(query, top_k=5):
    """Rank chunks by cosine similarity between their embedding and the query's.

    Args:
        query: Query string to embed with ``embedding_model``.
        top_k: Maximum number of hits to return.

    Returns:
        list[tuple]: ``(chunk_index, similarity, chunk_text)`` triples, most
        similar first.
    """
    query_vector = embedding_model.encode([query])
    # cosine_similarity returns a (1, n_chunks) matrix; keep its only row.
    similarities = cosine_similarity(query_vector, embeddings)[0]
    best_first = np.argsort(similarities)[::-1]

    hits = []
    for chunk_id in best_first[:top_k]:
        hits.append((chunk_id, similarities[chunk_id], chunked_docs[chunk_id]))
    return hits

## setting up sparse search

In [None]:
from rank_bm25 import BM25Okapi

# Tokenisation is a plain whitespace split: case is preserved and punctuation
# stays attached to words (e.g. "doing?" is a different token from "doing").
# Queries must be tokenised the same way for terms to match.
tokenized_chunks = [chunk.split() for chunk in chunked_docs]
# BM25 index over the tokenised chunks, in chunked_docs order.
bm25 = BM25Okapi(tokenized_chunks)

def bm25_search(query, top_k=5):
    """Rank chunks by BM25 score against the whitespace-split query.

    Args:
        query: Query string; split on whitespace to match how the chunks
            were tokenised for the ``bm25`` index.
        top_k: Maximum number of hits to return.

    Returns:
        list[tuple]: ``(chunk_index, bm25_score, chunk_text)`` triples,
        highest score first.
    """
    query_terms = query.split()
    bm25_scores = bm25.get_scores(query_terms)
    ranked_ids = np.argsort(bm25_scores)[::-1][:top_k]
    return [
        (chunk_id, bm25_scores[chunk_id], chunked_docs[chunk_id])
        for chunk_id in ranked_ids
    ]


## setting up hybrid search!

In [None]:
def hybrid_search(query, top_k=5, k=60, candidates=20):
    """Fuse dense and BM25 rankings with Reciprocal Rank Fusion (RRF).

    Each retriever adds ``1 / (k + rank)`` to the score of every chunk it
    returns, where ``rank`` is the chunk's 1-based position in that
    retriever's list. Chunks found by both retrievers accumulate both terms.

    Args:
        query: Query string passed to both retrievers.
        top_k: Number of fused results to return.
        k: RRF smoothing constant; larger values flatten the difference
            between high and low ranks.
        candidates: How many hits to request from each retriever before
            fusing. Raised to ``top_k`` if smaller.

    Returns:
        list[tuple]: ``(chunk_index, rrf_score, chunk_text)`` triples, best
        fused score first.
    """
    pool_size = max(candidates, top_k)
    dense_results = dense_search(query, top_k=pool_size)
    sparse_results = bm25_search(query, top_k=pool_size)

    rrf_scores = {}

    # Ranks start at 1, as in the standard RRF formula; this also keeps the
    # denominator non-zero when k == 0.
    for rank, (idx, _, _) in enumerate(dense_results, start=1):
        rrf_scores[idx] = rrf_scores.get(idx, 0) + 1 / (k + rank)

    for rank, (idx, bm25_score, _) in enumerate(sparse_results, start=1):
        # A BM25 score of exactly 0 means no query term occurs in the chunk.
        # bm25_search still returns such chunks to fill top_k, but their
        # order is arbitrary, so they should not earn fusion credit.
        if bm25_score == 0:
            continue
        rrf_scores[idx] = rrf_scores.get(idx, 0) + 1 / (k + rank)

    ranked = sorted(rrf_scores.items(), key=lambda item: item[1], reverse=True)[:top_k]

    return [(idx, score, chunked_docs[idx]) for idx, score in ranked]


In [None]:
from sentence_transformers import CrossEncoder

# Cross-encoder used by rerank() below to score (query, chunk) pairs.
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

In [None]:
def rerank(query, candidates):
    """Re-order retrieved chunks by cross-encoder relevance to ``query``.

    Args:
        query: Query string paired with each candidate chunk.
        candidates: Iterable of ``(chunk_index, score, chunk_text)`` triples,
            e.g. the output of ``hybrid_search``. Only the index is used; the
            text is looked up again in ``chunked_docs``.

    Returns:
        list[tuple]: ``(chunk_index, reranker_score, chunk_text)`` triples,
        highest reranker score first. Empty when there are no candidates.
    """
    # Materialise once: the candidates are walked twice below, which would
    # silently yield nothing on the second pass for a one-shot iterator.
    candidates = list(candidates)
    if not candidates:
        # Nothing to score; avoid calling the model on an empty batch.
        return []

    pairs = [(query, chunked_docs[idx]) for idx, _, _ in candidates]
    scores = reranker.predict(pairs)

    reranked = sorted(zip(candidates, scores), key=lambda pair: pair[1], reverse=True)

    return [(idx, score, chunked_docs[idx]) for ((idx, _, _), score) in reranked]


## query query query query query query

In [None]:
query = "What research is Mehek doing?"

# Run the same query through each stage of the pipeline and print a
# 200-character preview of every returned chunk.
print("=== Dense ===")
for _idx, _score, passage in dense_search(query):
    print(passage[:200], "\n")

print("=== Hybrid ===")
hybrid_results = hybrid_search(query)
for _idx, _score, passage in hybrid_results:
    print(passage[:200], "\n")

print("=== Hybrid + Rerank ===")
reranked = rerank(query, hybrid_results)
for _idx, _score, passage in reranked:
    print(passage[:200], "\n")


## visualizing results

In [None]:
from sklearn.decomposition import PCA
import numpy as np

# 2-D PCA projection of the chunk embeddings on their own.
# NOTE(review): points_2d is not read by any later cell visible here; the
# visualize_* functions fit their own PCA with the query vector included.
pca = PCA(n_components=2)
points_2d = pca.fit_transform(embeddings)

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import numpy as np

def visualize_dense_retrieval(query, k=3):
    """Plot the dense top-``k`` hits for ``query`` on a 2-D PCA map and print them.

    The query embedding is stacked with the chunk embeddings and a fresh PCA
    is fitted on the combined matrix, so the query lands in the same 2-D
    space as the chunks. The projection is lossy: highlighted chunks are the
    nearest in the full embedding space and need not look nearest in the plot.

    Args:
        query: Query string to search for and plot.
        k: Number of top dense results to highlight and print.

    Returns:
        None. Shows a matplotlib figure and prints the first 500 characters
        of each retrieved chunk.
    """
    # Indices of the top dense hits, best first.
    neighbors = [chunk_id for chunk_id, _, _ in dense_search(query, top_k=k)]

    # Project chunks and query together; the query is the last row.
    q_vec = embedding_model.encode([query], convert_to_numpy=True)
    all_vecs = np.vstack([embeddings, q_vec])
    all_2d = PCA(n_components=2).fit_transform(all_vecs)
    doc_2d, query_2d = all_2d[:-1], all_2d[-1]

    fig, ax = plt.subplots(figsize=(7, 7))

    # Label every series so the figure can be read without the code.
    ax.scatter(doc_2d[:, 0], doc_2d[:, 1], alpha=0.3, label="all chunks")
    ax.scatter(doc_2d[neighbors, 0], doc_2d[neighbors, 1],
               s=150, edgecolor='black',
               label=f"top {len(neighbors)} retrieved")
    ax.scatter(query_2d[0], query_2d[1],
               s=250, marker="*", label="query")

    ax.set_title("Dense Retrieval (Cosine Similarity)")
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    ax.grid(True)
    ax.legend()
    plt.show()

    # Print chunks
    print("\nTop Retrieved Chunks (Dense):\n")
    for rank, idx in enumerate(neighbors):
        print(f"Rank {rank+1}:")
        print(chunked_docs[idx][:500])
        print("-" * 60)


In [None]:
def visualize_sparse_retrieval(query, k=3):
    """Plot the BM25 top-``k`` hits for ``query`` on a 2-D PCA map and print them.

    Retrieval is purely lexical (via ``bm25_search``); embeddings are used
    only to place the chunks and the query on the plot. The highlighted
    chunks therefore need not sit near the query star.

    Args:
        query: Query string to search for and plot.
        k: Number of top BM25 results to highlight and print.

    Returns:
        None. Shows a matplotlib figure and prints the first 500 characters
        of each retrieved chunk.
    """
    # Reuse bm25_search so tokenisation and ranking live in one place.
    neighbors = [chunk_id for chunk_id, _, _ in bm25_search(query, top_k=k)]

    # Embed query for visualization only; the query is the last row.
    q_vec = embedding_model.encode([query], convert_to_numpy=True)
    all_vecs = np.vstack([embeddings, q_vec])
    all_2d = PCA(n_components=2).fit_transform(all_vecs)
    doc_2d, query_2d = all_2d[:-1], all_2d[-1]

    fig, ax = plt.subplots(figsize=(7, 7))

    # Label every series so the figure can be read without the code.
    ax.scatter(doc_2d[:, 0], doc_2d[:, 1], alpha=0.3, label="all chunks")
    ax.scatter(doc_2d[neighbors, 0], doc_2d[neighbors, 1],
               s=150, edgecolor='black',
               label=f"top {len(neighbors)} retrieved")
    ax.scatter(query_2d[0], query_2d[1],
               s=250, marker="*", label="query")

    ax.set_title("Sparse Retrieval (BM25)")
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    ax.grid(True)
    ax.legend()
    plt.show()

    # Print chunks
    print("\nTop Retrieved Chunks (Sparse / BM25):\n")
    for rank, idx in enumerate(neighbors):
        print(f"Rank {rank+1}:")
        print(chunked_docs[idx][:500])
        print("-" * 60)


Query: "What do I like to do?"

In [None]:
# Query consumed by the two visualization cells that follow.
query = "What do I like to do?"


In [None]:
# Dense view of whatever `query` was last assigned.
visualize_dense_retrieval(query)

In [None]:
# BM25 view of the same `query`, for comparison with the dense plot.
visualize_sparse_retrieval(query)

Query: "What is due on Feb 19?"

In [None]:
# Rebinds `query`; the two visualization cells below use this value.
query = "What is due on Feb 19?"

In [None]:
# Dense view of whatever `query` was last assigned.
visualize_dense_retrieval(query)

In [None]:
# BM25 view of the same `query`, for comparison with the dense plot.
visualize_sparse_retrieval(query)

##