In [1]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

In [2]:
bm25_scores = np.array([12.0, 5.5, 2.3, 8.7])
dense_scores = np.array([0.82, 0.45, 0.71, 0.30])

In [3]:
scaler = MinMaxScaler()
bm25_norm = scaler.fit_transform(bm25_scores.reshape(-1, 1)).flatten()
dense_norm = scaler.fit_transform(dense_scores.reshape(-1, 1)).flatten()

# Weighted fusion
alpha = 0.5 
final_scores = alpha * dense_norm + (1 - alpha) * bm25_norm

In [4]:
ranked_indices = np.argsort(final_scores)[::-1]

print("Final fused scores:", final_scores)
print("Ranking order:", ranked_indices)

Final fused scores: [1.         0.30917922 0.39423077 0.32989691]
Ranking order: [0 2 3 1]


In [6]:
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from sklearn.preprocessing import MinMaxScaler

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
documents = [
    "The Eiffel Tower is located in Paris.",
    "The capital of France is Paris.",
    "Machine learning enables computers to learn from data.",
    "Deep learning is a subset of machine learning.",
    "The Colosseum is a famous landmark in Rome."
]

query = "Where is the Eiffel Tower?"

In [10]:
tokenized_corpus = [doc.split(" ") for doc in documents]
bm25 = BM25Okapi(tokenized_corpus)

bm25_scores = bm25.get_scores(query.split(" "))

In [11]:
model = SentenceTransformer("all-MiniLM-L6-v2")

In [12]:
doc_embeddings = model.encode(documents, convert_to_numpy=True, normalize_embeddings=True)
query_embedding = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)


In [13]:
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)  
index.add(doc_embeddings)

In [14]:
k = len(documents)
scores_dense, indices_dense = index.search(query_embedding, k)
dense_scores = scores_dense.flatten()

In [16]:
scaler = MinMaxScaler()

bm25_norm = scaler.fit_transform(bm25_scores.reshape(-1, 1)).flatten()
dense_norm = scaler.fit_transform(dense_scores.reshape(-1, 1)).flatten()

In [18]:
alpha = 0.5  # weight for semantic similarity
final_scores = alpha * dense_norm + (1 - alpha) * bm25_norm

ranked_indices = np.argsort(final_scores)[::-1]

In [19]:
print("\n=== Hybrid Retrieval Results ===")
for idx in ranked_indices:
    print(f"Score: {final_scores[idx]:.4f} | Doc: {documents[idx]}")


=== Hybrid Retrieval Results ===
Score: 1.0000 | Doc: The Eiffel Tower is located in Paris.
Score: 0.2688 | Doc: The capital of France is Paris.
Score: 0.1480 | Doc: Machine learning enables computers to learn from data.
Score: 0.0818 | Doc: Deep learning is a subset of machine learning.
Score: 0.0742 | Doc: The Colosseum is a famous landmark in Rome.
