In [1]:
from numpy import ndarray
# Load necessary packages
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import numpy as np
from sentence_transformers import util
from torch import Tensor

In [7]:
# Load retriever and reranker
import torch

if torch.cuda.is_available():
    model_kwargs = {"dtype": torch.bfloat16, 'attn_implementation': 'flash_attention_2'}
else:
    model_kwargs = {}

granite_retriever = SentenceTransformer('ibm-granite/granite-embedding-english-r2',
                                        model_kwargs=model_kwargs)
granite_reranker = CrossEncoder('ibm-granite/granite-embedding-reranker-english-r2',
                                model_kwargs=model_kwargs)
gemma_retriever = SentenceTransformer("google/embeddinggemma-300m",
                                      model_kwargs=model_kwargs)

In [None]:
# Encode some text
documents = [
    "Granite models are designed for enterprise applications",
    "Information retrieval systems need fast and accurate embeddings",
    "Machine learning models can process natural language"
]

In [None]:
# Encode query and documents
model = granite_retriever
query = "What's the purpose of the granite models?"
query_embedding = model.encode(query)
doc_embeddings = model.encode(documents)
# Compute cosine similarity
similarities = util.cos_sim(query_embedding, doc_embeddings)
print(f"Similarities: {similarities}")
# Get most relevant document
best_idx = np.argmax(similarities)
print(f"Most relevant: {documents[best_idx]}")

Building a Semantic Search System


In [None]:
from sentence_transformers import SentenceTransformer, CrossEncoder
import numpy as np

In [None]:
# Your document corpus
corpus = [
    "Python is a high-level programming language",
    "Machine learning models require training data",
    "Natural language processing enables text understanding",
    "Deep learning uses neural networks with multiple layers",
    "Data science combines statistics and programming",
    # ... your documents here
]

In [None]:
# Step 1: Encode corpus once (can be cached)
corpus_embeddings = granite_retriever.encode(corpus, convert_to_tensor=True)

In [None]:
corpus_embeddings[0]

In [12]:
# Step 2: Retrieve top-k candidates
def search(corpus_embeddings, query, top_k=20,
           retriever: SentenceTransformer = granite_retriever,
           reranker: CrossEncoder|None = granite_reranker):
    query_embedding = retriever.encode_query(query, convert_to_tensor=True)

    # Find top-k with retriever
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)[0]

    # Step 3: Rerank with cross-encoder
    if reranker is not None:
        cross_inp = [[query, corpus[hit['corpus_id']]] for hit in hits]
        cross_scores = reranker.predict(cross_inp)

        # Sort by reranker scores
        for idx, score in enumerate(cross_scores):
            hits[idx]['rerank_score'] = score

        hits = sorted(hits, key=lambda x: x['rerank_score'], reverse=True)

    return hits[:5]  # Return top 5 after reranking

In [None]:
# Use it
results = search(corpus_embeddings, "What is machine learning?")
for hit in results:
    print(f"Score: {hit['rerank_score']:.4f} | {corpus[hit['corpus_id']]}")

In [None]:
# The granite embedding r2 blog is the input
long_document="""
IBM Research introduces next-generation embedding models that don’t compromise between speed and accuracy

When it comes to enterprise information retrieval, organizations face a persistent challenge: existing embedding models force you to choose between accuracy and speed, between long-context support and commercial licensing, between general-purpose performance and domain-specific excellence.

On August 15th , we’ve introduced the Granite Embedding R2 models — a comprehensive family of retrieval models designed to eliminate these tradeoffs.

What’s New in R2?
The Granite Embedding R2 release includes three models, all available under Apache 2.0 license:

granite-embedding-english-r2 (149M parameters): Our flagship model with 768-dimensional embeddings
granite-embedding-small-english-r2 (47M parameters): A first-of-its-kind efficient model with 384-dimensional embeddings
granite-embedding-reranker-english-r2 (149M parameters): A cross-encoder for precision ranking
These models deliver three critical improvements over our first-generation release:

16x expanded context length from 512 to 8,192 tokens — meeting modern document processing requirements
19–44% faster inference than comparable models, without sacrificing accuracy
State-of-the-art performance across text, code, long-documents, conversational queries, and tabular data
Getting Started: Basic Usage
Using Granite Embedding models is straightforward with the Sentence-Transformers library:

from sentence_transformers import SentenceTransformer
# Load the model
model = SentenceTransformer('ibm-granite/granite-embedding-english-r2')
# Encode some text
documents = [
    "Granite models are designed for enterprise applications",
    "Information retrieval systems need fast and accurate embeddings",
    "Machine learning models can process natural language"
]
# Generate embeddings
embeddings = model.encode(documents)
print(f"Embedding shape: {embeddings.shape}")  # (3, 768)
For semantic search, you can compute similarity scores:

import numpy as np
from sentence_transformers import util
# Encode query and documents
query = "What are enterprise AI models?"
query_embedding = model.encode(query)
doc_embeddings = model.encode(documents)
# Compute cosine similarity
similarities = util.cos_sim(query_embedding, doc_embeddings)
print(f"Similarities: {similarities}")
# Get most relevant document
best_idx = np.argmax(similarities)
print(f"Most relevant: {documents[best_idx]}")
Building a Semantic Search System
Here’s a complete example building a retrieval system with the reranker:

from sentence_transformers import SentenceTransformer, CrossEncoder
import numpy as np
# Load retriever and reranker
retriever = SentenceTransformer('ibm-granite/granite-embedding-english-r2')
reranker = CrossEncoder('ibm-granite/granite-embedding-reranker-english-r2')
# Your document corpus
corpus = [
    "Python is a high-level programming language",
    "Machine learning models require training data",
    "Natural language processing enables text understanding",
    "Deep learning uses neural networks with multiple layers",
    "Data science combines statistics and programming",
    # ... your documents here
]
# Step 1: Encode corpus once (can be cached)
corpus_embeddings = retriever.encode(corpus, convert_to_tensor=True)
# Step 2: Retrieve top-k candidates
def search(query, top_k=20):
    query_embedding = retriever.encode(query, convert_to_tensor=True)

    # Find top-k with retriever
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)[0]

    # Step 3: Rerank with cross-encoder
    cross_inp = [[query, corpus[hit['corpus_id']]] for hit in hits]
    cross_scores = reranker.predict(cross_inp)

    # Sort by reranker scores
    for idx, score in enumerate(cross_scores):
        hits[idx]['rerank_score'] = score

    hits = sorted(hits, key=lambda x: x['rerank_score'], reverse=True)

    return hits[:5]  # Return top 5 after reranking
# Use it
results = search("What is machine learning?")
for hit in results:
    print(f"Score: {hit['rerank_score']:.4f} | {corpus[hit['corpus_id']]}")
Long Context Documents
Granite R2 handles up to 8,192 tokens, perfect for processing full documents:

# Load model with long context support
model = SentenceTransformer('ibm-granite/granite-embedding-english-r2')
# Process a long document (e.g., research paper, technical documentation)
long_document =
[Your 5000+ word document here]
This could be an entire research paper, technical manual,
or any long-form content...

# Encode the full document (no chunking needed for <8192 tokens)
doc_embedding = model.encode(long_document, show_progress_bar=True)
# Compare with shorter query
query = "What are the main findings of this research?"
query_embedding = model.encode(query)
similarity = util.cos_sim(query_embedding, doc_embedding)
print(f"Relevance score: {similarity.item():.4f}")
Code Search Example
Granite R2 excels at code retrieval:

# Code snippets corpus
code_snippets = [

    def binary_search(arr, target):
        left, right = 0, len(arr) - 1
        while left <= right:
            mid = (left + right) // 2
            if arr[mid] == target:
                return mid
            elif arr[mid] < target:
                left = mid + 1
            else:
                right = mid - 1
        return -1

    def quicksort(arr):
        if len(arr) <= 1:
            return arr
        pivot = arr[len(arr) // 2]
        left = [x for x in arr if x < pivot]
        middle = [x for x in arr if x == pivot]
        right = [x for x in arr if x > pivot]
        return quicksort(left) + middle + quicksort(right)
    class LinkedList:
        def __init__(self):
            self.head = None

        def append(self, data):
            if not self.head:
                self.head = Node(data)
                return
            current = self.head
            while current.next:
                current = current.next
            current.next = Node(data)
]
# Encode code
code_embeddings = model.encode(code_snippets)
# Natural language query
query = "How do I implement a binary search algorithm?"
query_embedding = model.encode(query)
# Find most relevant code
similarities = util.cos_sim(query_embedding, code_embeddings)[0]
best_match = np.argmax(similarities)
print(f"Most relevant code snippet:\n{code_snippets[best_match]}")
Table Retrieval
Handle structured data with ease:

# Tables in markdown format
tables = [
    | Product | Q1 Revenue | Q2 Revenue |
    |---------|-----------|-----------|
    | Product A | $500K | $650K |
    | Product B | $300K | $420K |
    | Employee | Department | Salary |
    |----------|-----------|--------|
    | John Doe | Engineering | $120K |
    | Jane Smith | Marketing | $95K |
    | Country | Population | GDP |
    |---------|-----------|-----|
    | USA | 331M | $21T |
    | China | 1.4B | $14T |

]
# Encode tables
table_embeddings = model.encode(tables)
# Query for specific information
query = "What was the revenue growth for our products?"
query_embedding = model.encode(query)
similarities = util.cos_sim(query_embedding, table_embeddings)[0]
best_table = np.argmax(similarities)
print(f"Most relevant table:\n{tables[best_table]}")
Batch Processing for Production
For production deployments processing large volumes:

from sentence_transformers import SentenceTransformer
import torch
# Load model with GPU support
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('ibm-granite/granite-embedding-english-r2', device=device)
# Large batch of documents
documents = [...] # Your thousands of documents
# Efficient batch encoding
batch_size = 128
all_embeddings = model.encode(
    documents,
    batch_size=batch_size,
    show_progress_bar=True,
    convert_to_tensor=True,
    normalize_embeddings=True  # For cosine similarity
)
# Save embeddings for later use
torch.save(all_embeddings, 'document_embeddings.pt')
# Load and search later
embeddings = torch.load('document_embeddings.pt')
query_emb = model.encode(query, convert_to_tensor=True)
hits = util.semantic_search(query_emb, embeddings, top_k=10)
Multi-Turn Conversational Search
Handle conversational context:

conversation_history = [
    "What are the system requirements for the new software?",
    "Does it work on Mac?",
    "What about Linux distributions?"
]
# Concatenate conversation context
context = " ".join(conversation_history)
current_query = conversation_history[-1]
# Encode with full context
context_embedding = model.encode(context)
# Search in your knowledge base
results = search_with_context(context_embedding, knowledge_base)
Built on Modern Foundations
The R2 models leverage the ModernBERT architecture, incorporating recent advances in encoder design:

Alternating attention mechanisms for efficiency
Rotary positional embeddings enabling flexible context lengths
Flash Attention support for optimized inference
We trained these models on 2 trillion tokens from high-quality sources including GneissWeb, Wikipedia, and Granite Code data. Every dataset underwent comprehensive governance review, with screening for personal information and profanity — because enterprise deployments demand transparency and responsible AI practices.

A Novel Training Pipeline
What sets Granite R2 apart is our five-stage training methodology:

1. Retrieval-Oriented Pretraining: Using RetroMAE to train rich [CLS] representations without explicit contrastive objectives

2. Tabular Pretraining: A breakthrough approach for handling structured data. Traditional embedding models struggle with tables containing numerical data and limited context. Our solution? We generated synthetic summaries for 8 million tables using Mistral-7B, then modified the RetroMAE objective to predict masked tokens over summaries rather than table content itself. This forces the encoder to align table structure with natural language descriptions.

3. Contrastive Finetuning: Training on large-scale semi-supervised pairs with improved contrastive loss

4. Contrastive Distillation: Rather than simply finetuning on hard negatives, we distill knowledge from a Mistral-7B teacher model trained on high-quality triples. This approach yields larger performance gains than traditional hard-negative training.

5. Domain Adaptation: Specialized training for multi-turn conversational retrieval

This pipeline enables a single model family to excel across remarkably diverse tasks.

Performance That Speaks for Itself
We evaluated Granite R2 across comprehensive benchmarks:

General Retrieval (MTEB-v2)

granite-english-r2: 56.4 average score
granite-small-r2: 53.9 average score
Code Retrieval (COIR)

54.8 / 53.4 across text-to-code, code-to-text, and hybrid tasks
Zero-shot evaluation (no COIR training data used)
Long-Context (MLDR, LongEmbed)

granite-english-r2: 41.6 MLDR, 67.8 LongEmbed
granite-small-r2: 40.1 MLDR, 61.9 LongEmbed
State-of-the-art on LongEmbed benchmark
Table Retrieval

78.5 / 75.5 across OpenWikiTables, NQTables, OTT-QA, MultiHierTT, and AIT-QA
Multi-Turn Conversational (MT-RAG)

granite-english-r2: 57.6 Recall@5
Substantial improvement over first-generation models
Speed Without Compromise
Performance benchmarks often overlook a critical real-world constraint: encoding speed. When you’re ingesting millions of documents with frequent updates, speed directly impacts operational costs and user experience.

We benchmarked encoding speed using 23,000 IBM technical documents (averaging 6,393 characters, ranging from 10 to 475,001 characters):

granite-english-r2: 144 documents/second
granite-small-r2: 199 documents/second
These speeds represent 19–44% improvements over leading competitors, despite the R2 models having slightly more parameters than R1. The ModernBERT architecture’s optimizations — particularly Flash Attention — enable this efficiency gain.

Complete Retrieval Ecosystem
The reranker model completes the retrieval pipeline. Built on granite-embedding-english-r2, it uses a PListMLE loss objective for position-aware ranking:

BEIR: 55.4 (vs. 53.1 for retriever alone)
MLDR: 44.4 (vs. 41.6 for retriever alone)
This retrieve-and-rerank framework maximizes both recall and precision without severe computational overhead.

Enterprise-Ready from Day One
Every Granite model prioritizes enterprise requirements:

Data Governance: Comprehensive clearance process capturing content description, intended use, data classification, licensing information, usage restrictions, and personal information assessment

Licensing: Apache 2.0 — no restrictions on commercial use, no proprietary training data limitations

Transparency: Fully documented training data sources, architectural decisions, and evaluation methodology

Why This Matters
Information retrieval isn’t just about finding documents — it’s about enabling AI systems to access relevant knowledge efficiently. Whether you’re building RAG applications, semantic search engines, or recommendation systems, embedding quality and speed determine what’s possible.

Granite R2 models don’t force you to choose between accuracy and speed, between long-context support and efficiency, between general-purpose capability and domain-specific performance. They deliver all of it.

In an era where milliseconds matter and accuracy cannot be compromised, Granite R2 doesn’t just meet the standard — it sets it.

Get Started
All Granite Embedding R2 models are available now on Hugging Face under Apache 2.0 license:

granite-embedding-english-r2
granite-embedding-small-english-r2
granite-embedding-reranker-english-r2
For technical details, architecture description, and comprehensive benchmark results, see our research paper.

The Granite Embedding R2 models represent collaborative work across IBM Research teams in multiple geographies. For questions or feedback, visit our GitHub repository.
"""
# Encode the full document (no chunking needed for <8192 tokens)
doc_embedding = model.encode(long_document, show_progress_bar=True)
# Compare with shorter query
query = "What are the good blogs on granite R2 embeddings?"
query_embedding = model.encode(query)
similarity = util.cos_sim(query_embedding, doc_embedding)
print(f"Relevance score for '{query}: {similarity.item():.4f}")
query = "How do you lose weight fast?"
query_embedding = model.encode(query)
similarity = util.cos_sim(query_embedding, doc_embedding)
print(f"Relevance score for '{query}': {similarity.item():.4f}")

Code Search Example

In [None]:
# Code snippets corpus
code_snippets = [
    """
    def binary_search(arr, target):
        left, right = 0, len(arr) - 1
        while left <= right:
            mid = (left + right) // 2
            if arr[mid] == target:
                return mid
            elif arr[mid] < target:
                left = mid + 1
            else:
                right = mid - 1
        return -1
    """,
    """
    def quicksort(arr):
        if len(arr) <= 1:
            return arr
        pivot = arr[len(arr) // 2]
        left = [x for x in arr if x < pivot]
        middle = [x for x in arr if x == pivot]
        right = [x for x in arr if x > pivot]
        return quicksort(left) + middle + quicksort(right)
    """,
    """
    class LinkedList:
        def __init__(self):
            self.head = None

        def append(self, data):
            if not self.head:
                self.head = Node(data)
                return
            current = self.head
            while current.next:
                current = current.next
            current.next = Node(data)
    """
]

In [None]:
# Encode code
code_embeddings = model.encode(code_snippets)
# Natural language query
query = "How do I implement a binary search algorithm?"
query_embedding = model.encode(query)
# Find most relevant code
similarities = util.cos_sim(query_embedding, code_embeddings)[0]
best_match = np.argmax(similarities)
print(f"Most relevant code snippet:\n{code_snippets[best_match]}")


Table Retrieval

In [None]:
# Tables in markdown format
tables = [
    """
    | Product | Q1 Revenue | Q2 Revenue |
    |---------|-----------|-----------|
    | Product A | $500K | $650K |
    | Product B | $300K | $420K |
    """,
    """
    | Employee | Department | Salary |
    |----------|-----------|--------|
    | John Doe | Engineering | $120K |
    | Jane Smith | Marketing | $95K |
    """,
    """
    | Country | Population | GDP |
    |---------|-----------|-----|
    | USA | 331M | $21T |
    | China | 1.4B | $14T |
    """
]

In [None]:
# Encode tables
table_embeddings = model.encode(tables)
# Query for specific information
query = "What was the revenue growth for our products?"
query_embedding = model.encode(query)
similarities = util.cos_sim(query_embedding, table_embeddings)[0]
best_table = np.argmax(similarities)
print(f"Most relevant table:\n{tables[best_table]}")

Batch Processing for Production

In [None]:
from sentence_transformers import SentenceTransformer
import torch

In [None]:
# Load model with GPU support
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('ibm-granite/granite-embedding-english-r2', device=device,
                            model_kwargs=model_kwargs)
# Large batch of documents
documents = corpus * 20
# documents = generate_text(1024, num_docs=128)

# Efficient batch encoding
batch_size = 64
all_embeddings = model.encode_document(
    documents,
    batch_size=batch_size,
    show_progress_bar=True,
    convert_to_tensor=True,
    normalize_embeddings=True  # For cosine similarity
)
# Save embeddings for later use
torch.save(all_embeddings, 'document_embeddings.pt')
# Load and search later
embeddings = torch.load('document_embeddings.pt')
query_emb = model.encode(query, convert_to_tensor=True)
hits = util.semantic_search(query_emb, embeddings, top_k=10)
print(hits)

In [2]:
conversation_corpus = [
    {'_id': '3',
     'text': "I'm not saying I don't like the idea of on-the-job training too, but you can't expect the company to do that. Training workers is not their job - they're building software. Perhaps educational systems in the U.S. (or their students) should worry a little about getting marketable skills in exchange for their massive investment in education, rather than getting out with thousands in student debt and then complaining that they aren't qualified to do anything.",
     },
    {'_id': '31',
    'text': "So nothing preventing false ratings besides additional scrutiny from the market/investors, but there are some newer controls in place to prevent institutions from using them. Under the DFA banks can no longer solely rely on credit ratings as due diligence to buy a financial instrument, so that's a plus. The intent being that if financial institutions do their own leg work then *maybe* they'll figure out that a certain CDO is garbage or not.  Edit: lead in",
     },
    {'_id': '56',
    'text': "You can never use a health FSA for individual health insurance premiums.  Moreover, FSA plan sponsors can limit what they are will to reimburse.  While you can't use a health FSA for premiums, you could previously use a 125 cafeteria plan to pay premiums, but it had to be a separate election from the health FSA. However, under N. 2013-54, even using a cafeteria plan to pay for indivdiual premiums is effectively prohibited.",
     },
    {'_id': '59',
    'text': 'Samsung created the LCD and other flat screen technology like OLED. a few years ago every flat screen came from Samsung factories and were reshelled. I think the 21 Hanns screen I am looking at now is Samsung and it is only a couple of years old. Samsung seem to be a good company.',
     },
    {'_id': '63',
    'text': 'Here are the SEC requirements: The federal securities laws define the term accredited investor in   Rule 501 of Regulation D as: a bank, insurance company, registered investment company, business development company, or small business investment company; an employee benefit plan, within the meaning of the Employee Retirement Income Security Act, if a bank, insurance company, or   registered investment adviser makes the investment decisions, or if   the plan has total assets in excess of $5 million; a charitable organization, corporation, or partnership with assets exceeding $5 million; a director, executive officer, or general partner of the company selling the securities; a business in which all the equity owners are accredited investors; a natural person who has individual net worth, or joint net worth with the person’s spouse, that exceeds $1 million at the time of the   purchase, excluding the value of the primary residence of such person; a natural person with income exceeding $200,000 in each of the two most recent years or joint income with a spouse exceeding $300,000 for   those years and a reasonable expectation of the same income level in   the current year; or a trust with assets in excess of $5 million, not formed to acquire the securities offered, whose purchases a sophisticated person makes. No citizenship/residency requirements.',
     },
    {'_id': '100',
    'text': '"Only relevant to those with fantasy economy teams. Seriously, Rand\'s fictional works never translate well into reality because, no matter how hard people try, that ""fiction"" element just can\'t be ignored.  Test it yourself: Strip John Galt and his followers of everything they have which was created by or within the ""society"" they so revile, drop them in the desert -- and they\'ll all be dead of exposure and starvation in less than two weeks because they will be naked, without tools and without food.  The only reason the libertarians get away with pushing their tripe as a rational philosophy is because no one will point out what it is wrong with their thinking. Why? Well, for most of my lifetime, their ""philosophy"" was considered nuttery in line with the John Birchers and so why bother. It\'s only with the ascendency of these billionaire-funded politicians that this crap thinking has become acceptable, and even then, only to them."',
     },
    {'_id': '108',
    'text': "Futures contracts are a member of a larger class of financial assets called derivatives. Derivatives are called such because their payoffs depend on the price of other assets (financial or real). Other kinds of derivatives are call options, put options. Fixed income assets that mimic the behavior of derivatives are callable bonds, puttable bonds etc.  A futures contract is a contract that specifies the following: Just like with any other contract, there are two parties involved. One party commits to delivering the underlying asset to the other party on expiration date in exchange for the futures price. The other party commits to paying the futures price in exchange for the asset. There is no price that any of the two parties pay upfront to engage in the contract. The language used is so that the agent committing to receiving the delivery of the underlying asset is said to have bought the contract. The agent that commits to make the delivery is said to have sold the contract.  So answer your question, buying on June 1 a futures contract at the futures price of $100, with a maturity date on August 1 means you commit to paying $100 for the underlying asset on August 1. You don't have to pay anything upfront. Futures price is simply what the contract prescribes the underlying asset will exchange hands for.",
     },
    {'_id': '125',
    'text': 'This month when you join Scentsy you get a free defuser with your kit!   This has never been done before.  You also get spring / summer and Fall / Winter testers plus all your kit items!    Be your own boss!  You choose what hours you work, when and where you work them.   Join my Scentsy family today!  [Amanda C. Robar Scentsy Business ](http://www.amandacrobar.scentsy.ca)',
     },
    {'_id': '132',
    'text': 'Whenever you pay or withdraw some fund from your account, paypal takes approx 3% of the current currency value along with the fees. i.e. If you are paying/withdraw 100 unit of US Dollars to British pounds and if the current convertion rate is 1$=0.82GBP, then consider reducing 3% of the actual currency rate. So, the approximate magnitude will be 0.82*97% (100-3=97) = 0.7954. So, 1$=0.7954GBP. This formula will not give you 100% accurate value but will help of course. Captain',
     },
    {'_id': '138',
    'text': 'So you asked him in 2010 how he was gong to compete with DVD rental distributors like Netflix (which is what Netflix primarily was at the time) and Lovefilm and you were surprised that he was he said they were going to continue to compete as a DVD rental distributor just like the mentioned competitors?',
     }
    ]

In [27]:
# retriever = granite_retriever
retriever = gemma_retriever

In [28]:
# Concatenate conversation context
conv_embeddings = retriever.encode_document([s['text'] for s in conversation_corpus], convert_to_tensor=True)

In [20]:
def run_query(query, corpus_embeddings=conv_embeddings, retriever=retriever, reranker=None):
    results = search(corpus_embeddings=corpus_embeddings,
                     query=query, retriever=retriever, reranker=reranker)
    print(f"Results for the query: '{query}'")
    for hit in results:
        print(f"Score: {hit['score']:.4f} | {conversation_corpus[hit['corpus_id']]['text'][:80]} [...]")
    print()

In [29]:
# Search in your knowledge base
query = "Who invented the OLED screen?"
query1 = "How real is Rand's writing?"
query2 = "What's a futures contract?"
run_query(query)
run_query(query1)
run_query(query2)

Results for the query: 'Who invented the OLED screen?'
Score: 0.8672 | Samsung created the LCD and other flat screen technology like OLED. a few years  [...]
Score: 0.6875 | So you asked him in 2010 how he was gong to compete with DVD rental distributors [...]
Score: 0.6719 | This month when you join Scentsy you get a free defuser with your kit!   This ha [...]
Score: 0.6523 | I'm not saying I don't like the idea of on-the-job training too, but you can't e [...]
Score: 0.6523 | "Only relevant to those with fantasy economy teams. Seriously, Rand's fictional  [...]

Results for the query: 'How real is Rand's writing?'
Score: 0.8633 | "Only relevant to those with fantasy economy teams. Seriously, Rand's fictional  [...]
Score: 0.6914 | So nothing preventing false ratings besides additional scrutiny from the market/ [...]
Score: 0.6875 | Futures contracts are a member of a larger class of financial assets called deri [...]
Score: 0.6836 | So you asked him in 2010 how he was gong to compete

### Text Generation Section
Use this section to be able to quickly generate random pieces of text to use in the notebook above, if you don't have your own large corpus.

This is how you can easily set up a similarity computation with one of the granite embedding model. Feel free to experiment with other models, of course :).

In [30]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm.auto import tqdm

default_llm = "ibm-granite/granite-4.0-h-micro"

In [31]:
def create_causal_lm_model(model_name, device="cuda"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # drop device_map if running on CPU
    granite_model = AutoModelForCausalLM.from_pretrained(model_name,
    device_map=device)
    return granite_model, tokenizer


granite_model, tokenizer = create_causal_lm_model(default_llm)

The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacity of 23.52 GiB of which 11.31 MiB is free. Process 3618285 has 386.00 MiB memory in use. Process 3618617 has 386.00 MiB memory in use. Process 3758166 has 3.74 GiB memory in use. Process 3776779 has 2.84 GiB memory in use. Process 1290329 has 9.03 GiB memory in use. Process 1721000 has 2.12 GiB memory in use. Including non-PyTorch memory, this process has 4.98 GiB memory in use. Of the allocated memory 4.49 GiB is allocated by PyTorch, and 36.37 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
def generate_text(size=1024, num_docs = 1000, device="cuda"):
    model.eval()
    # change input text as desired
    chat = [
        { "role": "user", "content": "Please generate creative text." },
    ]
    chat = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    # tokenize the text
    input_tokens = tokenizer(chat, return_tensors="pt").to(device)
    # generate output tokens
    output = []
    for _ in tqdm(range(num_docs), desc="Generating text"):
        o = granite_model.generate(**input_tokens,
                                   max_new_tokens=size)
        # decode output tokens into text
        response_text = tokenizer.batch_decode(o, skip_special_tokens=False)[0]
        assistant_turn_marker = "<|start_of_role|>assistant"
        if assistant_turn_marker in response_text:
            # Get the text after the last assistant turn marker
            new_assistant_turn = response_text.rsplit(assistant_turn_marker, 1)[-1].strip()
            # Clean up any remaining tokens or unwanted text
            final_response = new_assistant_turn.replace("<|end_of_role|>", "").strip()
        else:
            final_response = response_text.strip()
        output.append(final_response)
    # print output
    # print(output[0])
    return output[0] if len(output) == 1 else output

In [3]:
docs = generate_text(size=128, num_docs=10)

### Generate text using LMStudio
.. or any other OpenAI-based generator

In [None]:
!pip install openai

In [None]:
from openai import OpenAI
from tqdm.auto import tqdm

In [None]:
# Configuration constants
LM_STUDIO_BASE_URL = "http://localhost:1234/v1"
LM_STUDIO_API_KEY = "not-needed"
MODEL_NAME = "local-model"
SYSTEM_MESSAGE = "You are a creative assistant."
DEFAULT_TEMPERATURE = 0.7

# Point to your local LM Studio server
client = OpenAI(base_url=LM_STUDIO_BASE_URL, api_key=LM_STUDIO_API_KEY)


def generate_text_lmstudio(size=1024, num_docs=1000):
    # generate output tokens
    output = []
    for _ in tqdm(range(num_docs), desc="Generating text"):
        response = client.chat.completions.create(
            model=MODEL_NAME,  # LM Studio ignores this, uses loaded model
            messages=[
                {"role": "system", "content": SYSTEM_MESSAGE},
                {"role": "user", "content": f"Generate an interesting story of at most {size} tokens."}
            ],
            temperature=DEFAULT_TEMPERATURE,
            max_tokens=size
        )
        output.append(response.choices[0].message.content)
    return output
    # print(output[0])#%%
docs = generate_text_lmstudio(size=512, num_docs=10)

### Putting it all together in one class
This class makes all the previous code snippets easier to run



In [4]:
from typing import List, Dict, Optional, Union
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import torch
import numpy as np

DEFAULT_RETRIEVER_MODEL = 'ibm-granite/granite-embedding-english-r2'
DEFAULT_RERANKER_MODEL = 'ibm-granite/granite-embedding-reranker-english-r2'
DEFAULT_GEMMA_MODEL = "google/embeddinggemma-300m"
DEFAULT_EMBEDDING_DIM = 768
DEFAULT_TOP_K = 20
DEFAULT_TOP_RESULTS = 5


class SemanticSearchManager:
    """
    A comprehensive semantic search system that combines retrieval and reranking
    for enhanced search accuracy and performance.
    """

    def __init__(self,
                 retriever_model: str = DEFAULT_RETRIEVER_MODEL,
                 reranker_model: str = DEFAULT_RERANKER_MODEL,
                 use_gpu: bool = True):
        """
        Initialize the semantic search manager with retriever and reranker models.

        Args:
            retriever_model: Name of the SentenceTransformer model for retrieval
            reranker_model: Name of the CrossEncoder model for reranking
            use_gpu: Whether to use GPU acceleration if available
        """
        self.retriever_model_name = retriever_model
        self.reranker_model_name = reranker_model
        self.corpus_embeddings = None
        self.corpus = None

        # Configure model kwargs based on hardware availability
        self.model_kwargs = self._configure_model_kwargs(use_gpu)

        # Initialize models
        self.retriever = self._load_retriever()
        self.reranker = self._load_reranker()

    def _configure_model_kwargs(self, use_gpu: bool) -> Dict:
        """Configure model parameters based on hardware availability."""
        if use_gpu and torch.cuda.is_available():
            return {
                "dtype": torch.bfloat16,
                'attn_implementation': 'flash_attention_2'
            }
        return {}

    def _load_retriever(self) -> SentenceTransformer:
        """Load and configure the retriever model."""
        return SentenceTransformer(self.retriever_model_name,
                                   model_kwargs=self.model_kwargs,
                                   device=SemanticSearchManager._get_device())

    def _load_reranker(self) -> Optional[CrossEncoder]:
        """Load and configure the reranker model."""
        try:
            return CrossEncoder(self.reranker_model_name,
                                model_kwargs=self.model_kwargs,
                                device=SemanticSearchManager._get_device())
        except Exception as e:
            print(f"Warning: Could not load reranker model: {e}")
            return None

    @staticmethod
    def _get_device():
        device = ('cuda' if torch.cuda.is_available() else
                  "mps" if torch.backends.mps.is_available() else
                  'cpu'
                  )
        return device

    def encode_corpus(self, corpus: List[str], show_progress: bool = True):
        """
        Encode the document corpus for semantic search.

        Args:
            corpus: List of documents to encode
            show_progress: Whether to show encoding progress
        """
        self.corpus = [self.get_text(s) for s in corpus]
        self.corpus_embeddings = self.retriever.encode_document(
            self.corpus,
            convert_to_tensor=True,
            show_progress_bar=show_progress
        )
        return self.corpus_embeddings

    def encode_query(self, query: str) -> torch.Tensor:
        """Encode a single query for semantic search."""
        return self.retriever.encode_query(query, convert_to_tensor=True)

    def search(self,
               query: str,
               top_k: int = DEFAULT_TOP_K,
               top_results: int = DEFAULT_TOP_RESULTS,
               use_reranker: bool = True,
               print_results: bool=False,
               max_size: int=-1) -> List[Dict]:
        """
        Perform semantic search with optional reranking.

        Args:
            query: Search query string
            top_k: Number of candidates to retrieve initially
            top_results: Final number of results to return
            use_reranker: Whether to use reranker for final ranking

        Returns:
            List of search results with scores and content
        """
        if self.corpus_embeddings is None:
            raise ValueError("Corpus must be encoded first using encode_corpus()")

        # Step 1: Retrieve top-k candidates
        query_embedding = self.encode_query(query)
        hits = util.semantic_search(query_embedding, self.corpus_embeddings, top_k=top_k)[0]

        # Step 2: Optionally rerank with cross-encoder
        if use_reranker and self.reranker is not None:
            hits = self._rerank_results(query, hits)

        if print_results:
            self.print_results(query, hits[:top_results], max_size=max_size)

        return hits[:top_results]

    def _rerank_results(self, query: str, hits: List[Dict]) -> List[Dict]:
        """Apply reranking to search results."""
        cross_input = [[query, self.corpus[hit['corpus_id']]] for hit in hits]
        cross_scores = self.reranker.predict(cross_input)

        # Update hits with reranking scores
        for idx, score in enumerate(cross_scores):
            hits[idx]['rerank_score'] = score

        # Sort by reranker scores
        return sorted(hits, key=lambda x: x['rerank_score'], reverse=True)

    def get_text(self, item):
        if isinstance(item, str):
            return item
        elif isinstance(item, dict) and 'text' in item:
            return item['text']

    def print_results(self, query: str, results, top_results: int = DEFAULT_TOP_RESULTS,
                      max_size=-1):

        print(f"Results for the query: '{query}'")
        for hit in results:
            txt = self.get_text(self.corpus[hit['corpus_id']])
            if max_size > 0 and len(txt) > max_size:
                txt = f"{txt[:max_size]} [...]"
            print(f"\tScore: {hit['score']:.4f} | {txt}")
        print()

    def simple_similarity_search(self, query: str, documents: List[str]) -> Dict:
        """
        Perform simple cosine similarity search without corpus pre-encoding.
        Useful for one-time searches on small document sets.
        """
        query_embedding = self.encode_query(query)
        _ = self.encode_corpus(documents)

        similarities = util.cos_sim(query_embedding, self.corpus_embeddings)
        best_idx = torch.argmax(similarities)

        return {
            'similarities': similarities,
            'best_match_index': int(best_idx),
            'best_match': documents[best_idx],
            'best_score': float(similarities[0][best_idx])
        }


In [5]:
# Example: Simple similarity search
def run_experiment(manager):
    documents = [
        "Granite models are designed for enterprise applications",
        "Information retrieval systems need fast and accurate embeddings",
        "Machine learning models can process natural language"
    ]

    query = "What's the purpose of the granite models?"
    print("=" * 80, "\n", f"Running with {manager.retriever_model_name}", "=" * 80)
    manager.encode_corpus(documents)
    manager.search(query, top_k=5, use_reranker=False, print_results=True, max_size=-1)

    manager.encode_corpus(conversation_corpus)
    query = "Who invented the OLED screen?"
    query1 = "How real is Rand's writing?"
    manager.search(query, top_k=5, print_results=True, max_size=80, use_reranker=False)
    manager.search(query1, top_k=5, print_results=True, max_size=80, use_reranker=False)

search_manager = SemanticSearchManager()
run_experiment(search_manager)

gemma_manager = SemanticSearchManager(retriever_model=DEFAULT_GEMMA_MODEL)
run_experiment(gemma_manager)




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Results for the query: 'What's the purpose of the granite models?'
	Score: 0.8750 | Granite models are designed for enterprise applications
	Score: 0.7344 | Machine learning models can process natural language
	Score: 0.7188 | Information retrieval systems need fast and accurate embeddings



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Results for the query: 'Who invented the OLED screen?'
	Score: 0.8672 | Samsung created the LCD and other flat screen technology like OLED. a few years  [...]
	Score: 0.6875 | So you asked him in 2010 how he was gong to compete with DVD rental distributors [...]
	Score: 0.6719 | This month when you join Scentsy you get a free defuser with your kit!   This ha [...]
	Score: 0.6523 | I'm not saying I don't like the idea of on-the-job training too, but you can't e [...]
	Score: 0.6523 | "Only relevant to those with fantasy economy teams. Seriously, Rand's fictional  [...]

Results for the query: 'How real is Rand's writing?'
	Score: 0.8633 | "Only relevant to those with fantasy economy teams. Seriously, Rand's fictional  [...]
	Score: 0.6914 | So nothing preventing false ratings besides additional scrutiny from the market/ [...]
	Score: 0.6875 | Futures contracts are a member of a larger class of financial assets called deri [...]
	Score: 0.6836 | So you asked him in 2010 how he was gong t

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Results for the query: 'What's the purpose of the granite models?'
	Score: 0.6719 | Granite models are designed for enterprise applications
	Score: 0.2334 | Machine learning models can process natural language
	Score: 0.1699 | Information retrieval systems need fast and accurate embeddings



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Results for the query: 'Who invented the OLED screen?'
	Score: 0.4961 | Samsung created the LCD and other flat screen technology like OLED. a few years  [...]
	Score: 0.1465 | I'm not saying I don't like the idea of on-the-job training too, but you can't e [...]
	Score: 0.1025 | Here are the SEC requirements: The federal securities laws define the term accre [...]
	Score: 0.0996 | So you asked him in 2010 how he was gong to compete with DVD rental distributors [...]
	Score: 0.0830 | This month when you join Scentsy you get a free defuser with your kit!   This ha [...]

Results for the query: 'How real is Rand's writing?'
	Score: 0.5703 | "Only relevant to those with fantasy economy teams. Seriously, Rand's fictional  [...]
	Score: 0.1709 | So nothing preventing false ratings besides additional scrutiny from the market/ [...]
	Score: 0.1396 | Samsung created the LCD and other flat screen technology like OLED. a few years  [...]
	Score: 0.1309 | Here are the SEC requirements: The federal

In [6]:
result

{'similarities': tensor([[0.8750, 0.7188, 0.7344]], device='cuda:0', dtype=torch.bfloat16),
 'best_match_index': 0,
 'best_match': 'Granite models are designed for enterprise applications',
 'best_score': 0.875}