In [1]:
!pip install \
  numpy \
  rank-bm25 \
  sentence-transformers \
  chromadb \
  torch \
  transformers \
  accelerate



In [2]:
documents = [
    "Transformers are neural networks based on attention",
    "BM25 is a sparse retrieval algorithm",
    "Vector databases store embeddings",
    "Hybrid search combines dense and sparse retrieval",
    "Rerankers improve precision in retrieval systems"
]

In [3]:
from rank_bm25 import BM25Okapi

tokenized_docs = [doc.lower().split() for doc in documents]
bm25 = BM25Okapi(tokenized_docs)

query = "how does hybrid retrieval work"
bm25_scores = bm25.get_scores(query.lower().split())

bm25_ranking = sorted(
    enumerate(bm25_scores),
    key=lambda x: x[1],
    reverse=True
)

print("BM25 ranking:")
for idx, score in bm25_ranking:
    print(score, documents[idx])

BM25 ranking:
1.258530873323502 Hybrid search combines dense and sparse retrieval
0.2543084001546551 BM25 is a sparse retrieval algorithm
0.2543084001546551 Rerankers improve precision in retrieval systems
0.0 Transformers are neural networks based on attention
0.0 Vector databases store embeddings


In [8]:
from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer("all-MiniLM-L6-v2")

doc_embeddings = model.encode(documents, normalize_embeddings=True)
query_embedding = model.encode(query, normalize_embeddings=True)

dense_scores = doc_embeddings @ query_embedding

dense_ranking = sorted(
    enumerate(dense_scores),
    key=lambda x: x[1],
    reverse=True
)

print("\nDense ranking:")
for idx, score in dense_ranking:
    print(score, documents[idx])

2026-01-29 09:57:54.605339: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1769680674.790666     100 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1769680674.845142     100 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1769680675.273985     100 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769680675.274026     100 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769680675.274028     100 computation_placer.cc:177] computation placer alr

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Dense ranking:
0.66249144 Hybrid search combines dense and sparse retrieval
0.45180082 Rerankers improve precision in retrieval systems
0.45042294 BM25 is a sparse retrieval algorithm
0.23870549 Vector databases store embeddings
0.14750189 Transformers are neural networks based on attention


In [9]:
def rrf(rankings, k=60):
    scores = {}
    for ranking in rankings:
        for rank, doc_id in enumerate(ranking):
            scores.setdefault(doc_id, 0)
            scores[doc_id] += 1 / (k + rank + 1)
    return sorted(scores.items(), key=lambda x: x[1], reverse=True)

bm25_order = [idx for idx, _ in bm25_ranking]
dense_order = [idx for idx, _ in dense_ranking]

rrf_result = rrf([bm25_order, dense_order])

print("\nRRF Hybrid ranking:")
for idx, score in rrf_result:
    print(score, documents[idx])


RRF Hybrid ranking:
0.03278688524590164 Hybrid search combines dense and sparse retrieval
0.03200204813108039 BM25 is a sparse retrieval algorithm
0.03200204813108039 Rerankers improve precision in retrieval systems
0.031009615384615385 Transformers are neural networks based on attention
0.031009615384615385 Vector databases store embeddings


In [10]:
queries = [
    query,
    "explain hybrid search",
    "combining bm25 and embeddings"
]

all_rankings = []

for q in queries:
    qe = model.encode(q, normalize_embeddings=True)
    scores = doc_embeddings @ qe
    ranking = [i for i, _ in sorted(enumerate(scores), key=lambda x: x[1], reverse=True)]
    all_rankings.append(ranking)

multi_query_rrf = rrf(all_rankings)

print("\nMulti-query RRF:")
for idx, score in multi_query_rrf:
    print(score, documents[idx])


Multi-query RRF:
0.04865990111891751 Hybrid search combines dense and sparse retrieval
0.04839549075403121 BM25 is a sparse retrieval algorithm
0.04762704813108039 Rerankers improve precision in retrieval systems
0.0471386476426799 Vector databases store embeddings
0.04639423076923077 Transformers are neural networks based on attention


In [11]:
from transformers import pipeline

generator = pipeline("text-generation", model="gpt2")

hyde_prompt = f"Answer this question in detail:\n{query}"
hyde_doc = generator(hyde_prompt, max_new_tokens=100)[0]["generated_text"]

hyde_embedding = model.encode(hyde_doc, normalize_embeddings=True)
hyde_scores = doc_embeddings @ hyde_embedding

hyde_ranking = sorted(
    enumerate(hyde_scores),
    key=lambda x: x[1],
    reverse=True
)

print("\nHyDE ranking:")
for idx, score in hyde_ranking:
    print(score, documents[idx])

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



HyDE ranking:
0.572436 Hybrid search combines dense and sparse retrieval
0.35178772 BM25 is a sparse retrieval algorithm
0.35107332 Rerankers improve precision in retrieval systems
0.19534172 Vector databases store embeddings
0.088492095 Transformers are neural networks based on attention


In [12]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

reranker_name = "BAAI/bge-reranker-large"
tokenizer = AutoTokenizer.from_pretrained(reranker_name)
reranker = AutoModelForSequenceClassification.from_pretrained(reranker_name)

def rerank(query, docs):
    inputs = tokenizer(
        [(query, d) for d in docs],
        padding=True,
        truncation=True,
        return_tensors="pt"
    )
    with torch.no_grad():
        scores = reranker(**inputs).logits.squeeze()
    return scores.tolist()

top_k_docs = [documents[i] for i, _ in rrf_result[:3]]
rerank_scores = rerank(query, top_k_docs)

print("\nBGE Reranker:")
for score, doc in sorted(zip(rerank_scores, top_k_docs), reverse=True):
    print(score, doc)


tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]


BGE Reranker:
0.8847523331642151 Hybrid search combines dense and sparse retrieval
-4.678332805633545 Rerankers improve precision in retrieval systems
-5.8633623123168945 BM25 is a sparse retrieval algorithm


In [4]:
from sentence_transformers import SentenceTransformer

token_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def colbert_score(query, doc):
    q_tokens = query.split()
    d_tokens = doc.split()

    q_vecs = token_model.encode(q_tokens, normalize_embeddings=True)
    d_vecs = token_model.encode(d_tokens, normalize_embeddings=True)

    score = 0
    for qv in q_vecs:
        score += max(qv @ dv for dv in d_vecs)
    return score

scores = [(doc, colbert_score(query, doc)) for doc in documents]
scores.sort(key=lambda x: x[1], reverse=True)

print("\nColBERT-style ranking:")
for doc, score in scores:
    print(score, doc)

2026-01-29 11:32:24.817881: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1769686344.840250     230 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1769686344.847032     230 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1769686344.867089     230 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769686344.867112     230 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769686344.867115     230 computation_placer.cc:177] computation placer alr


ColBERT-style ranking:
3.0616643 Hybrid search combines dense and sparse retrieval
2.6402342 BM25 is a sparse retrieval algorithm
2.3578563 Rerankers improve precision in retrieval systems
1.8432432 Transformers are neural networks based on attention
1.4194515 Vector databases store embeddings
