In [2]:
# Load the model
from sentence_transformers import SentenceTransformer, util

In [2]:
import torch
# Options handed to SentenceTransformer via `model_kwargs`:
# FlashAttention-2 attention implementation and bfloat16 dtype.
model_kwargs = dict(
    attn_implementation="flash_attention_2",
    dtype=torch.bfloat16,
)
model = SentenceTransformer(
    'ibm-granite/granite-embedding-english-r2',
    model_kwargs=model_kwargs,
)

# Three sample sentences; `documents` is reused by the similarity cell further down.
documents = [
    "Granite models are designed for enterprise applications",
    "Information retrieval systems need fast and accurate embeddings",
    "Machine learning models can process natural language",
]

# Embed the sentences and report the shape of the result.
embeddings = model.encode(documents)
print(f"Embedding shape: {embeddings.shape}")  # (3, 768)

Embedding shape: (3, 768)


### Generate Text

This section provides the tools to generate text samples using the Granite language model. Feel free to customize the prompts and parameters to generate your own text content.


In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [4]:
default_llm = "ibm-granite/granite-4.0-h-micro"


def create_granite4_model(llm_to_use, device=None):
    """Load a causal language model and its tokenizer.

    Args:
        llm_to_use (str): Model identifier or path passed to ``from_pretrained``.
        device (str | None): Value forwarded as ``device_map``. When ``None``
            (the default), ``"cuda"`` is used if ``torch.cuda.is_available()``
            and ``"cpu"`` otherwise, so the notebook no longer requires a GPU
            just to load the model.

    Returns:
        tuple: ``(granite_model, tokenizer)``.
    """
    if device is None:
        # Imported lazily: an explicitly supplied `device` never needs torch here.
        import torch

        device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = AutoTokenizer.from_pretrained(llm_to_use)
    granite_model = AutoModelForCausalLM.from_pretrained(llm_to_use,
                                                         device_map=device)
    return granite_model, tokenizer

In [5]:
# Load the default Granite LLM and its tokenizer; `generate_text` reads both as globals.
granite_model, tokenizer = create_granite4_model(default_llm)

The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [44]:
from tqdm.auto import tqdm
def generate_text(size=1024, num_docs=1000, device="cuda", **generate_kwargs):
    """Generate text samples with the global Granite LLM.

    The same fixed chat prompt is sent ``num_docs`` times; each decoded
    response is reduced to the text following the last assistant-role marker.

    Args:
        size (int): Passed to ``generate`` as ``max_new_tokens``.
        num_docs (int): Number of generations to run.
        device (str): Device the tokenized prompt is moved to.
        **generate_kwargs: Extra keyword arguments forwarded unchanged to
            ``granite_model.generate``.
            NOTE(review): if the model's default decoding is deterministic,
            every sample will be identical; pass e.g. ``do_sample=True`` to
            diversify -- confirm against the model's generation config.

    Returns:
        str | list[str]: The single response when exactly one was generated,
        otherwise the list of responses (empty when ``num_docs`` is 0).
    """
    # Switch the *generator* to eval mode. The previous code called
    # `model.eval()`, which addresses the global embedding model instead.
    granite_model.eval()

    # change input text as desired
    chat = [
        {"role": "user", "content": "Please generate creative text."},
    ]
    prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    # The prompt never changes, so it is tokenized once outside the loop.
    input_tokens = tokenizer(prompt, return_tensors="pt").to(device)

    assistant_turn_marker = "<|start_of_role|>assistant"
    output = []
    for _ in tqdm(range(num_docs), desc="Generating text"):
        o = granite_model.generate(**input_tokens,
                                   max_new_tokens=size,
                                   **generate_kwargs)
        # Special tokens are kept so the assistant-role marker can be located.
        response_text = tokenizer.batch_decode(o, skip_special_tokens=False)[0]
        if assistant_turn_marker in response_text:
            # Keep only the text after the last assistant turn marker ...
            new_assistant_turn = response_text.rsplit(assistant_turn_marker, 1)[-1].strip()
            # ... and drop the role-closing token that follows the marker.
            final_response = new_assistant_turn.replace("<|end_of_role|>", "").strip()
        else:
            final_response = response_text.strip()
        output.append(final_response)

    return output[0] if len(output) == 1 else output

In [47]:
# Quick run: 10 generations, each limited to 128 new tokens.
docs = generate_text(size=128, num_docs=10)

Generating text:   0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
# Check whether the first two generated samples are identical.
docs[1]==docs[0]

### Generate Text Using LM Studio

In [None]:
!pip install openai

In [5]:
from openai import OpenAI
from tqdm.auto import tqdm

# Point to your local LM Studio server (served on localhost port 1234).
# The api_key value is a placeholder string.
client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")

In [22]:
def generate_text_lmstudio(size=1024, num_docs=1000):
    """Collect `num_docs` stories from the chat endpoint behind the global `client`.

    Args:
        size (int): Used both in the prompt text ("at most {size} tokens")
            and as the request's ``max_tokens``.
        num_docs (int): Number of chat-completion requests to issue.

    Returns:
        list: The message content of the first choice of every response,
        in request order.
    """
    def _request_story():
        # One independent chat completion per call.
        reply = client.chat.completions.create(
            model="local-model",  # LM Studio ignores this, uses loaded model
            messages=[
                {"role": "system", "content": "You are a creative assistant."},
                {"role": "user", "content": f"Generate an interesting story of at most {size} tokens."}
            ],
            temperature=0.7,
            max_tokens=size
        )
        return reply.choices[0].message.content

    return [_request_story() for _ in tqdm(range(num_docs), desc="Generating text")]

In [21]:
# Rebinds `docs` to 10 LM Studio stories (max_tokens=512 per request).
docs = generate_text_lmstudio(size=512, num_docs=10)

Generating text:   0%|          | 0/10 [00:00<?, ?it/s]

### Semantic Search Example
This is how you can compute the similarity between a query and a set of documents with the Granite embedding models.

In [24]:
import numpy as np
from sentence_transformers import util

In [28]:
# Encode query and documents
# (`model` and `documents` are globals defined in earlier cells.)
query = "What's the purpose of the granite models?"
query_embedding = model.encode(query)
doc_embeddings = model.encode(documents)
# Compute cosine similarity
# The printed result is a 1 x len(documents) tensor: one query row scored against each document.
similarities = util.cos_sim(query_embedding, doc_embeddings)
print(f"Similarities: {similarities}")
# Get most relevant document
# With a single query row, the flat argmax is the document index.
best_idx = np.argmax(similarities)
print(f"Most relevant: {documents[best_idx]}")

Similarities: tensor([[0.8735, 0.7191, 0.7353]])
Most relevant: Granite models are designed for enterprise applications


### Building a Semantic Search System


In [3]:
from sentence_transformers import SentenceTransformer, CrossEncoder
import numpy as np

In [8]:
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import numpy as np

def _flash_attention_available():
    """Return True when CUDA is visible and the `flash_attn` package can be located."""
    import importlib.util

    import torch

    return torch.cuda.is_available() and importlib.util.find_spec("flash_attn") is not None


def create_search_system(retriever_name='ibm-granite/granite-embedding-english-r2',
                         reranker_name='ibm-granite/granite-embedding-reranker-english-r2'):
    """
    Create a search system with specified retriever and reranker models.

    Both models are loaded in bfloat16. FlashAttention-2 is requested only
    when it can actually be used (CUDA present and `flash_attn` installed);
    otherwise the library's default attention implementation is left in
    place instead of failing at load time.

    Args:
        retriever_name (str): Name of the retriever model
        reranker_name (str): Name of the reranker model

    Returns:
        tuple: (retriever, reranker) model instances
    """
    import torch

    model_kwargs = {}
    if _flash_attention_available():
        model_kwargs["attn_implementation"] = "flash_attention_2"
    model_kwargs['dtype'] = torch.bfloat16

    retriever = SentenceTransformer(retriever_name, model_kwargs=model_kwargs)
    # trust_remote_code lets rerankers that ship custom modelling code load.
    reranker = CrossEncoder(reranker_name, model_kwargs=model_kwargs, trust_remote_code=True)

    return retriever, reranker


def run_experiment(corpus, question, name,
                   retriever_name='ibm-granite/granite-embedding-english-r2',
                   reranker_name='ibm-granite/granite-embedding-reranker-english-r2'):
    """
    Retrieve, rerank and print the best matches for one question.

    Args:
        corpus (list): Documents to search in
        question (str): Query to search for
        name (str): Label printed at the start of the experiment
        retriever_name (str): Name of the retriever model
        reranker_name (str): Name of the reranker model

    Prints up to five hits, ordered by descending reranker score. Returns nothing.
    """
    print(f"Running experiment {name}")

    # A fresh retriever/reranker pair is built on every call.
    retriever, reranker = create_search_system(retriever_name, reranker_name)

    # Stage 1: embed the corpus and the question.
    corpus_embeddings = retriever.encode_document(corpus, convert_to_tensor=True)
    question_embedding = retriever.encode_query(question, convert_to_tensor=True)

    # Stage 2: candidates from the retriever (at most 20).
    candidates = util.semantic_search(question_embedding, corpus_embeddings, top_k=20)[0]

    # Stage 3: score every (question, document) pair with the cross-encoder
    # and attach the score to its candidate.
    pairs = [(question, corpus[candidate['corpus_id']]) for candidate in candidates]
    pair_scores = reranker.predict(pairs)
    for position, pair_score in enumerate(pair_scores):
        candidates[position]['rerank_score'] = pair_score

    # Best reranker score first; only the top five are reported.
    ranked = sorted(candidates, key=lambda candidate: candidate['rerank_score'], reverse=True)
    for hit in ranked[:5]:
        print(f"\tScore: {hit['rerank_score']:.4f} | {corpus[hit['corpus_id']]}")

# Your document corpus
# Small ML/programming-themed corpus, searched by the "ML" experiment.
corpus = [
    "Python is a high-level programming language",
    "Machine learning models require training data",
    "Machine learning is an engineeering discipline that studies best coding practices.",
    "Natural language processing enables text understanding",
    "Deep learning uses neural networks with multiple layers",
    "Data science combines statistics and programming",
    # presumably an off-topic distractor entry -- TODO confirm intent
    "Ana are mere."
    # ... your documents here
]
# Planet facts, searched by the "RedPlanet" experiment.
corpus2 = [
    "Venus is often called Earth's twin because of its similar size and proximity.",
    "Mars, known for its reddish appearance, is often referred to as the Red Planet.",
    "Jupiter, the largest planet in our solar system, has a prominent red spot.",
    "Saturn, famous for its rings, is sometimes mistaken for the Red Planet."
]
# Mixed Shakespeare / climate / water sentences, searched by the "Water" experiment.
corpus3 = [
    "Romeo and Juliet is a play by William Shakespeare.",
    "Climate change refers to long-term shifts in temperatures.",
    "Shakespeare also wrote Hamlet and Macbeth.",
    "Water is an inorganic compound with the chemical formula H2O.",
    "In liquid form, H2O is also called 'water' at standard temperature and pressure."
]

In [7]:
# reranker = 'ibm-granite/granite-embedding-reranker-english-r2'
# Use a reranker checkpoint from a relative local path instead of the Hub id above.
reranker = '../models/149m_reranker_updated'
# Each call builds its own retriever/reranker pair inside run_experiment.
# NOTE(review): the saved output shows every rerank score as 1.0000 at four decimals;
# worth checking whether the scores are saturating -- confirm.
run_experiment(corpus, "What is machine learning?", name="ML", reranker_name=reranker)
run_experiment(corpus2, "What planet is known as the Red Planet?", name="RedPlanet", reranker_name=reranker)
run_experiment(corpus3, "what is the chemical formula of water?", name="Water", reranker_name=reranker)

Running experiment ML
	Score: 1.0000 | Machine learning is an engineeering discipline that studies best coding practices.
	Score: 1.0000 | Machine learning models require training data
	Score: 1.0000 | Data science combines statistics and programming
	Score: 1.0000 | Deep learning uses neural networks with multiple layers
	Score: 1.0000 | Natural language processing enables text understanding
Running experiment RedPlanet
	Score: 1.0000 | Mars, known for its reddish appearance, is often referred to as the Red Planet.
	Score: 1.0000 | Saturn, famous for its rings, is sometimes mistaken for the Red Planet.
	Score: 1.0000 | Jupiter, the largest planet in our solar system, has a prominent red spot.
	Score: 1.0000 | Venus is often called Earth's twin because of its similar size and proximity.
Running experiment Water
	Score: 1.0000 | Water is an inorganic compound with the chemical formula H2O.
	Score: 1.0000 | In liquid form, H2O is also called 'water' at standard temperature and pressure.


In [56]:
# reranker = 'ibm-granite/granite-embedding-reranker-english-r2'
# Repeat two of the experiments with a different reranker model.
# NOTE(review): the saved output of this cell is a ValueError asking for
# `trust_remote_code=True`, yet create_search_system already passes that flag to
# CrossEncoder -- the output may be stale; re-run to confirm.
reranker = 'Alibaba-NLP/gte-multilingual-reranker-base'
run_experiment(corpus, "What is machine learning?", name="ML", reranker_name=reranker)
run_experiment(corpus2, "What planet is known as the Red Planet?", name="RedPlanet", reranker_name=reranker)

Running experiment ML


ValueError: Alibaba-NLP/new-impl You can inspect the repository content at https://hf.co/Alibaba-NLP/gte-multilingual-reranker-base.
Please pass the argument `trust_remote_code=True` to allow custom code to be run.

In [None]:
# Load model with long context support
# Rebinds the global `model`; unlike the first cell, no `model_kwargs` are passed here.
model = SentenceTransformer('ibm-granite/granite-embedding-english-r2')

In [27]:
long_document="""
IBM Research introduces next-generation embedding models that don’t compromise between speed and accuracy

When it comes to enterprise information retrieval, organizations face a persistent challenge: existing embedding models force you to choose between accuracy and speed, between long-context support and commercial licensing, between general-purpose performance and domain-specific excellence.

On August 15th , we’ve introduced the Granite Embedding R2 models — a comprehensive family of retrieval models designed to eliminate these tradeoffs.

What’s New in R2?
The Granite Embedding R2 release includes three models, all available under Apache 2.0 license:

granite-embedding-english-r2 (149M parameters): Our flagship model with 768-dimensional embeddings
granite-embedding-small-english-r2 (47M parameters): A first-of-its-kind efficient model with 384-dimensional embeddings
granite-embedding-reranker-english-r2 (149M parameters): A cross-encoder for precision ranking
These models deliver three critical improvements over our first-generation release:

16x expanded context length from 512 to 8,192 tokens — meeting modern document processing requirements
19–44% faster inference than comparable models, without sacrificing accuracy
State-of-the-art performance across text, code, long-documents, conversational queries, and tabular data
Getting Started: Basic Usage
Using Granite Embedding models is straightforward with the Sentence-Transformers library:

from sentence_transformers import SentenceTransformer
# Load the model
model = SentenceTransformer('ibm-granite/granite-embedding-english-r2')
# Encode some text
documents = [
    "Granite models are designed for enterprise applications",
    "Information retrieval systems need fast and accurate embeddings",
    "Machine learning models can process natural language"
]
# Generate embeddings
embeddings = model.encode(documents)
print(f"Embedding shape: {embeddings.shape}")  # (3, 768)
For semantic search, you can compute similarity scores:

import numpy as np
from sentence_transformers import util
# Encode query and documents
query = "What are enterprise AI models?"
query_embedding = model.encode(query)
doc_embeddings = model.encode(documents)
# Compute cosine similarity
similarities = util.cos_sim(query_embedding, doc_embeddings)
print(f"Similarities: {similarities}")
# Get most relevant document
best_idx = np.argmax(similarities)
print(f"Most relevant: {documents[best_idx]}")
Building a Semantic Search System
Here’s a complete example building a retrieval system with the reranker:

from sentence_transformers import SentenceTransformer, CrossEncoder
import numpy as np
# Load retriever and reranker
retriever = SentenceTransformer('ibm-granite/granite-embedding-english-r2')
reranker = CrossEncoder('ibm-granite/granite-embedding-reranker-english-r2')
# Your document corpus
corpus = [
    "Python is a high-level programming language",
    "Machine learning models require training data",
    "Natural language processing enables text understanding",
    "Deep learning uses neural networks with multiple layers",
    "Data science combines statistics and programming",
    # ... your documents here
]
# Step 1: Encode corpus once (can be cached)
corpus_embeddings = retriever.encode(corpus, convert_to_tensor=True)
# Step 2: Retrieve top-k candidates
def search(query, top_k=20):
    query_embedding = retriever.encode(query, convert_to_tensor=True)

    # Find top-k with retriever
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)[0]

    # Step 3: Rerank with cross-encoder
    cross_inp = [[query, corpus[hit['corpus_id']]] for hit in hits]
    cross_scores = reranker.predict(cross_inp)

    # Sort by reranker scores
    for idx, score in enumerate(cross_scores):
        hits[idx]['rerank_score'] = score

    hits = sorted(hits, key=lambda x: x['rerank_score'], reverse=True)

    return hits[:5]  # Return top 5 after reranking
# Use it
results = search("What is machine learning?")
for hit in results:
    print(f"Score: {hit['rerank_score']:.4f} | {corpus[hit['corpus_id']]}")
Long Context Documents
Granite R2 handles up to 8,192 tokens, perfect for processing full documents:

# Load model with long context support
model = SentenceTransformer('ibm-granite/granite-embedding-english-r2')
# Process a long document (e.g., research paper, technical documentation)
long_document =
[Your 5000+ word document here]
This could be an entire research paper, technical manual,
or any long-form content...

# Encode the full document (no chunking needed for <8192 tokens)
doc_embedding = model.encode(long_document, show_progress_bar=True)
# Compare with shorter query
query = "What are the main findings of this research?"
query_embedding = model.encode(query)
similarity = util.cos_sim(query_embedding, doc_embedding)
print(f"Relevance score: {similarity.item():.4f}")
Code Search Example
Granite R2 excels at code retrieval:

# Code snippets corpus
code_snippets = [

    def binary_search(arr, target):
        left, right = 0, len(arr) - 1
        while left <= right:
            mid = (left + right) // 2
            if arr[mid] == target:
                return mid
            elif arr[mid] < target:
                left = mid + 1
            else:
                right = mid - 1
        return -1

    def quicksort(arr):
        if len(arr) <= 1:
            return arr
        pivot = arr[len(arr) // 2]
        left = [x for x in arr if x < pivot]
        middle = [x for x in arr if x == pivot]
        right = [x for x in arr if x > pivot]
        return quicksort(left) + middle + quicksort(right)
    class LinkedList:
        def __init__(self):
            self.head = None

        def append(self, data):
            if not self.head:
                self.head = Node(data)
                return
            current = self.head
            while current.next:
                current = current.next
            current.next = Node(data)
]
# Encode code
code_embeddings = model.encode(code_snippets)
# Natural language query
query = "How do I implement a binary search algorithm?"
query_embedding = model.encode(query)
# Find most relevant code
similarities = util.cos_sim(query_embedding, code_embeddings)[0]
best_match = np.argmax(similarities)
print(f"Most relevant code snippet:\n{code_snippets[best_match]}")
Table Retrieval
Handle structured data with ease:

# Tables in markdown format
tables = [
    | Product | Q1 Revenue | Q2 Revenue |
    |---------|-----------|-----------|
    | Product A | $500K | $650K |
    | Product B | $300K | $420K |
    | Employee | Department | Salary |
    |----------|-----------|--------|
    | John Doe | Engineering | $120K |
    | Jane Smith | Marketing | $95K |
    | Country | Population | GDP |
    |---------|-----------|-----|
    | USA | 331M | $21T |
    | China | 1.4B | $14T |

]
# Encode tables
table_embeddings = model.encode(tables)
# Query for specific information
query = "What was the revenue growth for our products?"
query_embedding = model.encode(query)
similarities = util.cos_sim(query_embedding, table_embeddings)[0]
best_table = np.argmax(similarities)
print(f"Most relevant table:\n{tables[best_table]}")
Batch Processing for Production
For production deployments processing large volumes:

from sentence_transformers import SentenceTransformer
import torch
# Load model with GPU support
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('ibm-granite/granite-embedding-english-r2', device=device)
# Large batch of documents
documents = [...] # Your thousands of documents
# Efficient batch encoding
batch_size = 128
all_embeddings = model.encode(
    documents,
    batch_size=batch_size,
    show_progress_bar=True,
    convert_to_tensor=True,
    normalize_embeddings=True  # For cosine similarity
)
# Save embeddings for later use
torch.save(all_embeddings, 'document_embeddings.pt')
# Load and search later
embeddings = torch.load('document_embeddings.pt')
query_emb = model.encode(query, convert_to_tensor=True)
hits = util.semantic_search(query_emb, embeddings, top_k=10)
Multi-Turn Conversational Search
Handle conversational context:

conversation_history = [
    "What are the system requirements for the new software?",
    "Does it work on Mac?",
    "What about Linux distributions?"
]
# Concatenate conversation context
context = " ".join(conversation_history)
current_query = conversation_history[-1]
# Encode with full context
context_embedding = model.encode(context)
# Search in your knowledge base
results = search_with_context(context_embedding, knowledge_base)
Built on Modern Foundations
The R2 models leverage the ModernBERT architecture, incorporating recent advances in encoder design:

Alternating attention mechanisms for efficiency
Rotary positional embeddings enabling flexible context lengths
Flash Attention support for optimized inference
We trained these models on 2 trillion tokens from high-quality sources including GneissWeb, Wikipedia, and Granite Code data. Every dataset underwent comprehensive governance review, with screening for personal information and profanity — because enterprise deployments demand transparency and responsible AI practices.

A Novel Training Pipeline
What sets Granite R2 apart is our five-stage training methodology:

1. Retrieval-Oriented Pretraining: Using RetroMAE to train rich [CLS] representations without explicit contrastive objectives

2. Tabular Pretraining: A breakthrough approach for handling structured data. Traditional embedding models struggle with tables containing numerical data and limited context. Our solution? We generated synthetic summaries for 8 million tables using Mistral-7B, then modified the RetroMAE objective to predict masked tokens over summaries rather than table content itself. This forces the encoder to align table structure with natural language descriptions.

3. Contrastive Finetuning: Training on large-scale semi-supervised pairs with improved contrastive loss

4. Contrastive Distillation: Rather than simply finetuning on hard negatives, we distill knowledge from a Mistral-7B teacher model trained on high-quality triples. This approach yields larger performance gains than traditional hard-negative training.

5. Domain Adaptation: Specialized training for multi-turn conversational retrieval

This pipeline enables a single model family to excel across remarkably diverse tasks.

Performance That Speaks for Itself
We evaluated Granite R2 across comprehensive benchmarks:

General Retrieval (MTEB-v2)

granite-english-r2: 56.4 average score
granite-small-r2: 53.9 average score
Code Retrieval (COIR)

54.8 / 53.4 across text-to-code, code-to-text, and hybrid tasks
Zero-shot evaluation (no COIR training data used)
Long-Context (MLDR, LongEmbed)

granite-english-r2: 41.6 MLDR, 67.8 LongEmbed
granite-small-r2: 40.1 MLDR, 61.9 LongEmbed
State-of-the-art on LongEmbed benchmark
Table Retrieval

78.5 / 75.5 across OpenWikiTables, NQTables, OTT-QA, MultiHierTT, and AIT-QA
Multi-Turn Conversational (MT-RAG)

granite-english-r2: 57.6 Recall@5
Substantial improvement over first-generation models
Speed Without Compromise
Performance benchmarks often overlook a critical real-world constraint: encoding speed. When you’re ingesting millions of documents with frequent updates, speed directly impacts operational costs and user experience.

We benchmarked encoding speed using 23,000 IBM technical documents (averaging 6,393 characters, ranging from 10 to 475,001 characters):

granite-english-r2: 144 documents/second
granite-small-r2: 199 documents/second
These speeds represent 19–44% improvements over leading competitors, despite the R2 models having slightly more parameters than R1. The ModernBERT architecture’s optimizations — particularly Flash Attention — enable this efficiency gain.

Complete Retrieval Ecosystem
The reranker model completes the retrieval pipeline. Built on granite-embedding-english-r2, it uses a PListMLE loss objective for position-aware ranking:

BEIR: 55.4 (vs. 53.1 for retriever alone)
MLDR: 44.4 (vs. 41.6 for retriever alone)
This retrieve-and-rerank framework maximizes both recall and precision without severe computational overhead.

Enterprise-Ready from Day One
Every Granite model prioritizes enterprise requirements:

Data Governance: Comprehensive clearance process capturing content description, intended use, data classification, licensing information, usage restrictions, and personal information assessment

Licensing: Apache 2.0 — no restrictions on commercial use, no proprietary training data limitations

Transparency: Fully documented training data sources, architectural decisions, and evaluation methodology

Why This Matters
Information retrieval isn’t just about finding documents — it’s about enabling AI systems to access relevant knowledge efficiently. Whether you’re building RAG applications, semantic search engines, or recommendation systems, embedding quality and speed determine what’s possible.

Granite R2 models don’t force you to choose between accuracy and speed, between long-context support and efficiency, between general-purpose capability and domain-specific performance. They deliver all of it.

In an era where milliseconds matter and accuracy cannot be compromised, Granite R2 doesn’t just meet the standard — it sets it.

Get Started
All Granite Embedding R2 models are available now on Hugging Face under Apache 2.0 license:

granite-embedding-english-r2
granite-embedding-small-english-r2
granite-embedding-reranker-english-r2
For technical details, architecture description, and comprehensive benchmark results, see our research paper.

The Granite Embedding R2 models represent collaborative work across IBM Research teams in multiple geographies. For questions or feedback, visit our GitHub repository.
"""
# Encode the full document (no chunking needed for <8192 tokens)
# NOTE(review): the 8192-token figure comes from the article text stored in
# `long_document`; confirm the loaded model's maximum sequence length before
# relying on it for longer inputs.
doc_embedding = model.encode(long_document, show_progress_bar=True)
# Compare with shorter query
query = "What are the main findings of this research?"
query_embedding = model.encode(query)
# One query against one document gives a single similarity value, read via .item().
similarity = util.cos_sim(query_embedding, doc_embedding)
print(f"Relevance score: {similarity.item():.4f}")

Batches: 100%|██████████| 1/1 [00:00<00:00, 22.11it/s]

Relevance score: 0.7477





### Code Search Example

In [28]:
# Code snippets corpus
# Three triple-quoted strings, each holding one Python definition (binary search,
# quicksort, linked list). The leading indentation and surrounding newlines are
# part of the text that gets embedded.
code_snippets = [
    """
    def binary_search(arr, target):
        left, right = 0, len(arr) - 1
        while left <= right:
            mid = (left + right) // 2
            if arr[mid] == target:
                return mid
            elif arr[mid] < target:
                left = mid + 1
            else:
                right = mid - 1
        return -1
    """,
    """
    def quicksort(arr):
        if len(arr) <= 1:
            return arr
        pivot = arr[len(arr) // 2]
        left = [x for x in arr if x < pivot]
        middle = [x for x in arr if x == pivot]
        right = [x for x in arr if x > pivot]
        return quicksort(left) + middle + quicksort(right)
    """,
    """
    class LinkedList:
        def __init__(self):
            self.head = None

        def append(self, data):
            if not self.head:
                self.head = Node(data)
                return
            current = self.head
            while current.next:
                current = current.next
            current.next = Node(data)
    """
]

In [37]:
# Encode code
code_embeddings = model.encode(code_snippets)
# Natural language query
query = "How do I implement a binary search algorithm?"
query_embedding = model.encode(query)
# Find most relevant code
# [0] selects the single query's row of scores, one per snippet.
similarities = util.cos_sim(query_embedding, code_embeddings)[0]
best_match = np.argmax(similarities)
# NOTE(review): the saved output shows the LinkedList snippet winning for a
# binary-search query -- verify the model/settings in use when re-running.
print(f"Most relevant code snippet:\n{code_snippets[best_match]}")


Most relevant code snippet:

    class LinkedList:
        def __init__(self):
            self.head = None

        def append(self, data):
            if not self.head:
                self.head = Node(data)
                return
            current = self.head
            while current.next:
                current = current.next
            current.next = Node(data)
    


### Table Retrieval

In [39]:
# Tables in markdown format
# Three small tables (product revenue, employee salaries, country statistics),
# each stored as one triple-quoted string; indentation is part of the embedded text.
tables = [
    """
    | Product | Q1 Revenue | Q2 Revenue |
    |---------|-----------|-----------|
    | Product A | $500K | $650K |
    | Product B | $300K | $420K |
    """,
    """
    | Employee | Department | Salary |
    |----------|-----------|--------|
    | John Doe | Engineering | $120K |
    | Jane Smith | Marketing | $95K |
    """,
    """
    | Country | Population | GDP |
    |---------|-----------|-----|
    | USA | 331M | $21T |
    | China | 1.4B | $14T |
    """
]

In [41]:
# Encode tables
table_embeddings = model.encode(tables)
# Query for specific information
query = "What was the revenue growth for our products?"
query_embedding = model.encode(query)
# [0] selects the single query's row of scores, one per table.
similarities = util.cos_sim(query_embedding, table_embeddings)[0]
# Index of the highest-scoring table.
best_table = np.argmax(similarities)
print(f"Most relevant table:\n{tables[best_table]}")

Most relevant table:

    | Product | Q1 Revenue | Q2 Revenue |
    |---------|-----------|-----------|
    | Product A | $500K | $650K |
    | Product B | $300K | $420K |
    


### Batch Processing for Production

In [43]:
from sentence_transformers import SentenceTransformer
import torch

In [44]:
# Load model with GPU support
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# `model_kwargs` is the global from the first cell (flash_attention_2 + bfloat16).
# NOTE(review): those kwargs are applied even when `device` falls back to 'cpu' --
# confirm that combination loads.
model = SentenceTransformer('ibm-granite/granite-embedding-english-r2', device=device,
                            model_kwargs=model_kwargs)
# Large batch of documents
# documents = [...] # Your thousands of documents
# 128 generations of up to 1024 new tokens each; needs the Granite LLM cells to have run.
# NOTE(review): the saved output ("'ellipsis' object is not subscriptable") looks
# like it came from the placeholder line above rather than this call -- re-run to confirm.
documents = generate_text(1024, num_docs=128)
# Efficient batch encoding
batch_size = 64
all_embeddings = model.encode_document(
    documents,
    batch_size=batch_size,
    show_progress_bar=True,
    convert_to_tensor=True,
    normalize_embeddings=True  # For cosine similarity
)
# Save embeddings for later use
torch.save(all_embeddings, 'document_embeddings.pt')
# Load and search later
embeddings = torch.load('document_embeddings.pt')
# NOTE(review): `query` is not set in this cell; it is whatever an earlier cell
# left bound -- define the intended query here before relying on `hits`.
query_emb = model.encode(query, convert_to_tensor=True)
# `hits` is computed but not displayed in this cell.
hits = util.semantic_search(query_emb, embeddings, top_k=10)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


TypeError: 'ellipsis' object is not subscriptable