## Keyword Similarity

In [68]:
!pip install rank_bm25



In [69]:
from rank_bm25 import BM25Okapi

corpus = [

    "The cat, commonly referred to as the domestic cat or house cat, is a small domesticated carnivorous mammal.",
    "The dog is a domesticated descendant of the wolf.",
    "Humans are the most common and widespread species of primate, and the last surviving species of the genus Homo.",
    "The scientific name Felis catus was proposed by Carl Linnaeus in 1758"

]

tokenized_corpus = [doc.split(" ") for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)

query = "What is the scientifc name for cats?"
tokenized_query = query.split(" ")
doc_scores = bm25.get_scores(tokenized_query)
print(doc_scores)

[0.15633023 0.20898199 0.26805423 0.9185659 ]


In [5]:
tokenized_query

['The', 'Cat']

In [70]:
query = "Feline"
tokenized_query = query.split(" ")
doc_scores = bm25.get_scores(tokenized_query)
print(doc_scores)

[0. 0. 0. 0.]


# Semantic Simalrity

In [71]:
!pip install sentence-transformers



In [72]:
from sentence_transformers import SentenceTransformer

# 1. Load a pretrained Sentence Transformer model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
#  produce 384-dimensional dense embeddings

In [74]:
# The documents to encode
corpus = [
    "The cat, commonly referred to as the domestic cat or house cat, is a small domesticated carnivorous mammal.",
    "The dog is a domesticated descendant of the wolf.",
    "Humans are the most common and widespread species of primate, and the last surviving species of the genus Homo.",
    "The scientific name Felis catus was proposed by Carl Linnaeus in 1758"
]

# Calculate embeddings by calling model.encode()
document_embeddings = model.encode(corpus)

# Sanity check
print(document_embeddings.shape)

(4, 384)


In [75]:
query = "What is the scientifc name for cats?"
query_embedding = model.encode(query)

In [85]:
from sentence_transformers.util import cos_sim

# Compute cosine_similarity between documents and query
cos_scores = cos_sim(document_embeddings, query_embedding)

print(cos_scores)

tensor([[0.4007],
        [0.3837],
        [0.0966],
        [0.3804]])


In [78]:
query_embedding = model.encode("feline")

cos_scores = cos_sim(document_embeddings, query_embedding)

print(cos_scores)

tensor([[0.4007],
        [0.3837],
        [0.0966],
        [0.3804]])


## Hybrid Similarity

In [97]:
import numpy as np

def scores_to_ranking(scores: list[float]) -> list[int]:
    """Convert float scores into int rankings (rank 1 is the best)"""
    return np.argsort(scores)[::-1] + 1


def rrf(keyword_rank: int, semantic_rank: int) -> float:
    """Combine keyword rank and semantic rank into a hybrid score."""
    k = 60
    rrf_score = 1 / (k + keyword_rank) + 1 / (k + semantic_rank)
    return rrf_score

In [80]:
bm25_ranking = scores_to_ranking(doc_scores)
print(bm25_ranking)

[4 3 2 1]


In [86]:
cos_scores = cos_scores.flatten().tolist()

In [87]:
cos_sim_ranking = scores_to_ranking(cos_scores)
print(cos_sim_ranking)

[1 2 4 3]


In [88]:
hybrid_scores = []
for i, doc in enumerate(corpus):
    hybrid_scores.append(rrf(bm25_ranking[i], cos_sim_ranking[i]))
hybrid_scores

[0.032018442622950824,
 0.03200204813108039,
 0.031754032258064516,
 0.032266458495966696]

In [89]:
hybrid_ranking = scores_to_ranking(hybrid_scores)
print(hybrid_ranking)

[4 1 2 3]


In [99]:
corpus = [
    "The cat, commonly referred to as the domestic cat or house cat, is a small domesticated carnivorous mammal.",
    "The dog is a domesticated descendant of the wolf.",
    "Humans are the most common and widespread species of primate, and the last surviving species of the genus Homo.",
    "The scientific name Felis catus was proposed by Carl Linnaeus in 1758"
]

In [100]:
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def hybrid_search(
    query: str, corpus: list[str], encoder_model: SentenceTransformer
) -> list[int]:
    # bm25
    tokenized_corpus = [doc.split(" ") for doc in corpus]
    tokenized_query = query.split(" ")
    bm25 = BM25Okapi(tokenized_corpus)
    bm25_scores = bm25.get_scores(tokenized_query)
    bm25_ranking = scores_to_ranking(bm25_scores)
    print("Keyword Ranking", bm25_ranking)

    # embeddings
    document_embeddings = model.encode(corpus)
    query_embedding = model.encode(query)
    cos_sim_scores = cos_sim(document_embeddings, query_embedding).flatten().tolist()
    cos_sim_ranking = scores_to_ranking(cos_sim_scores)
    print("Semantic Ranking", cos_sim_ranking)

    # combine rankings into RRF scores
    hybrid_scores = []
    for i, doc in enumerate(corpus):
        document_ranking = rrf(bm25_ranking[i], cos_sim_ranking[i])
        print(f"Document {i} has the rrf score {document_ranking}")
        hybrid_scores.append(document_ranking)

    # convert RRF scores into final rankings
    hybrid_ranking = scores_to_ranking(hybrid_scores)
    return hybrid_ranking

In [101]:
hybrid_ranking = hybrid_search(
    query="Is human are animal?", corpus=corpus, encoder_model=model
)
print(hybrid_ranking)

Keyword Ranking [3 4 2 1]
Semantic Ranking [3 2 1 4]
Document 0 has the rrf score 0.031746031746031744
Document 1 has the rrf score 0.031754032258064516
Document 2 has the rrf score 0.03252247488101534
Document 3 has the rrf score 0.032018442622950824
[3 4 2 1]


In [None]:
corpus = [
    "The cat, commonly referred to as the domestic cat or house cat, is a small domesticated carnivorous mammal.",
    "The dog is a domesticated descendant of the wolf.",
    "Humans are the most common and widespread species of primate, and the last surviving species of the genus Homo.",
    "The scientific name Felis catus was proposed by Carl Linnaeus in 1758"
]