Building a Semantic Search System


In [1]:
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import numpy as np

In [7]:
from sentence_transformers import SentenceTransformer, CrossEncoder, util

def create_search_system(retriever_name='ibm-granite/granite-embedding-english-r2',
                         reranker_name='ibm-granite/granite-embedding-reranker-english-r2'):
    """
    Create a search system with specified retriever and reranker models.

    When CUDA is available, models whose name contains "granite" are loaded
    in bfloat16, with FlashAttention-2 requested only if the ``flash_attn``
    package is importable. Every other model is loaded with the library's
    default settings.

    Args:
        retriever_name (str): Name of the retriever model
        reranker_name (str): Name of the reranker model

    Returns:
        tuple: (retriever, reranker) model instances
    """
    import importlib.util

    import torch

    def get_kwargs(model_name: str):
        """Return the ``model_kwargs`` dict used to load ``model_name``."""
        if not (torch.cuda.is_available() and "granite" in model_name):
            return {}

        kwargs = {'dtype': torch.bfloat16}
        # Asking for flash_attention_2 without the flash_attn package makes
        # model loading fail, so only request it when it can be imported;
        # otherwise the library's default attention implementation is used.
        if importlib.util.find_spec("flash_attn") is not None:
            kwargs["attn_implementation"] = "flash_attention_2"
        return kwargs

    retriever = SentenceTransformer(retriever_name, model_kwargs=get_kwargs(retriever_name))
    # The reranker is loaded with trust_remote_code=True; the retriever is not.
    reranker = CrossEncoder(reranker_name, model_kwargs=get_kwargs(reranker_name), trust_remote_code=True)

    return retriever, reranker


def run_experiment(corpus, question, name,
                   retriever_name='ibm-granite/granite-embedding-english-r2',
                   reranker_name='ibm-granite/granite-embedding-reranker-english-r2',
                   top_k=20, rerank_top_k=5):
    """
    Run a search experiment with specified models and corpus.

    The corpus and the question are embedded with the retriever, the
    ``top_k`` nearest documents are re-scored by the reranker, and the
    ``rerank_top_k`` best documents are printed with their reranker score.
    Nothing is returned; results are only printed.

    Args:
        corpus (list): List of documents to search in
        question (str): Query to search for
        name (str): Name of the experiment
        retriever_name (str): Name of the retriever model
        reranker_name (str): Name of the reranker model
        top_k (int): Number of candidates fetched by the retriever
        rerank_top_k (int): Number of results printed after reranking
    """
    print(f"Running experiment {name}, question: '{question}'")

    # Nothing to rank: bail out before paying for model loading.
    if not corpus:
        print("\tNo documents to search.")
        return

    # Create retriever and reranker
    retriever, reranker = create_search_system(retriever_name, reranker_name)

    # Step 1: Embed the corpus and the query with the retriever
    corpus_embeddings = retriever.encode_document(corpus, convert_to_tensor=True)
    query_embedding = retriever.encode_query(question, convert_to_tensor=True)

    # Step 2: Retrieve top-k candidates (one query, so take the first hit list)
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)[0]

    # Step 3: Rerank the candidates with the cross-encoder
    pairs = [(question, corpus[hit['corpus_id']]) for hit in hits]
    for hit, score in zip(hits, reranker.predict(pairs)):
        hit['rerank_score'] = score

    ranked = sorted(hits, key=lambda hit: hit['rerank_score'], reverse=True)

    for hit in ranked[:rerank_top_k]:
        print(f"\tScore: {hit['rerank_score']:.6f} | {corpus[hit['corpus_id']]}")

# Your document corpus
# Small demo corpora, each a plain list of one-sentence documents that is
# passed as the `corpus` argument of run_experiment.

# Documents about programming, machine learning and data science.
# NOTE(review): the spelling "engineeering" below is part of the document text
# (it also shows up in the printed results), so it is left untouched here.
# NOTE(review): "Ana are mere." looks like a non-English sentence, presumably
# an off-topic distractor -- confirm intent.
corpus = [
    "Python is a high-level programming language",
    "Machine learning models require training data",
    "Machine learning is an engineeering discipline that studies best coding practices.",
    "Natural language processing enables text understanding",
    "Deep learning uses neural networks with multiple layers",
    "Data science combines statistics and programming",
    "Ana are mere."
    # ... your documents here
]
# Planet facts; several sentences mention "red" or "Red Planet", but only the
# Mars sentence states which planet carries that name.
corpus2 = [
    "Venus is often called Earth's twin because of its similar size and proximity.",
    "Mars, known for its reddish appearance, is often referred to as the Red Planet.",
    "Jupiter, the largest planet in our solar system, has a prominent red spot.",
    "Saturn, famous for its rings, is sometimes mistaken for the Red Planet."
]
# Mixed topics (Shakespeare, climate, water chemistry); two sentences mention H2O.
corpus3 = [
    "Romeo and Juliet is a play by William Shakespeare.",
    "Climate change refers to long-term shifts in temperatures.",
    "Shakespeare also wrote Hamlet and Macbeth.",
    "Water is an inorganic compound with the chemical formula H2O.",
    "In liquid form, H2O is also called 'water' at standard temperature and pressure."
]

In [8]:
# Rerank with a checkpoint stored on local disk. To try the hosted model
# instead, set reranker = 'ibm-granite/granite-embedding-reranker-english-r2'.
reranker = '../models/149m_reranker_updated'

# Same three (corpus, question, experiment name) runs, executed in order.
for _docs, _query, _label in (
    (corpus, "What is machine learning?", "ML"),
    (corpus2, "What planet is known as the Red Planet?", "RedPlanet"),
    (corpus3, "what is the chemical formula of water?", "Water"),
):
    run_experiment(_docs, _query, name=_label, reranker_name=reranker)

Running experiment ML, question: 'What is machine learning?'
	Score: 1.000000 | Machine learning is an engineeering discipline that studies best coding practices.
	Score: 1.000000 | Machine learning models require training data
	Score: 1.000000 | Data science combines statistics and programming
	Score: 1.000000 | Deep learning uses neural networks with multiple layers
	Score: 1.000000 | Natural language processing enables text understanding
Running experiment RedPlanet, question: 'What planet is known as the Red Planet?'
	Score: 1.000000 | Mars, known for its reddish appearance, is often referred to as the Red Planet.
	Score: 1.000000 | Saturn, famous for its rings, is sometimes mistaken for the Red Planet.
	Score: 1.000000 | Jupiter, the largest planet in our solar system, has a prominent red spot.
	Score: 0.999975 | Venus is often called Earth's twin because of its similar size and proximity.
Running experiment Water, question: 'what is the chemical formula of water?'
	Score: 1.00000

In [9]:
# Rerank with the Alibaba GTE multilingual reranker. To compare against the
# granite model, set reranker = 'ibm-granite/granite-embedding-reranker-english-r2'.
reranker = 'Alibaba-NLP/gte-multilingual-reranker-base'

# Same three (corpus, question, experiment name) runs, executed in order.
for _docs, _query, _label in (
    (corpus, "What is machine learning?", "ML"),
    (corpus2, "What planet is known as the Red Planet?", "RedPlanet"),
    (corpus3, "what is the chemical formula of water?", "Water"),
):
    run_experiment(_docs, _query, name=_label, reranker_name=reranker)

Running experiment ML, question: 'What is machine learning?'
	Score: 0.628242 | Machine learning is an engineeering discipline that studies best coding practices.
	Score: 0.487659 | Data science combines statistics and programming
	Score: 0.415116 | Deep learning uses neural networks with multiple layers
	Score: 0.393726 | Machine learning models require training data
	Score: 0.358712 | Natural language processing enables text understanding
Running experiment RedPlanet, question: 'What planet is known as the Red Planet?'
	Score: 0.832634 | Mars, known for its reddish appearance, is often referred to as the Red Planet.
	Score: 0.693576 | Jupiter, the largest planet in our solar system, has a prominent red spot.
	Score: 0.627483 | Saturn, famous for its rings, is sometimes mistaken for the Red Planet.
	Score: 0.370192 | Venus is often called Earth's twin because of its similar size and proximity.
Running experiment Water, question: 'what is the chemical formula of water?'
	Score: 0.70635