In [3]:
import torch
from torch import nn
from transformers import BertTokenizer, BertForSequenceClassification, RobertaTokenizer, RobertaForSequenceClassification
from sentence_transformers import SentenceTransformer
from time import time
import numpy as np
import psutil

# Example search queries
test_queries = [
    "What are the inflation rates in India?",
    "CPI changes over the years?",
    "How does the CPI affect rural and urban areas?",
    "What is the inflation rate for different states?",
    "How is the consumer price index measured?"
]

# 1. Load the models (We will try BERT, RoBERTa, and Sentence-BERT)
class SemanticSearchModel(nn.Module):
    def __init__(self, model_name='bert-base-uncased'):
        super(SemanticSearchModel, self).__init__()
        self.model_name = model_name
        if model_name == 'bert-base-uncased':
            self.model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        elif model_name == 'roberta-base':
            self.model = RobertaForSequenceClassification.from_pretrained('roberta-base')
            self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        elif model_name == 'sentence-transformers/all-MiniLM-L6-v2':
            self.model = SentenceTransformer('all-MiniLM-L6-v2')
            self.tokenizer = None  # Sentence-BERT doesn't need separate tokenizer

    def forward(self, input_texts=None):
        if self.tokenizer:
            # Tokenize input text for BERT and RoBERTa models
            encoding = self.tokenizer(input_texts, return_tensors='pt', padding=True, truncation=True)
            input_ids = encoding['input_ids']
            attention_mask = encoding['attention_mask']
            output = self.model(input_ids, attention_mask=attention_mask)[0]
        else:
            # For Sentence-BERT, directly encode the texts to get embeddings
            output = self.model.encode(input_texts)
        return output

# 2. Define a function to measure inference time and model size
def get_model_size(model):
    params = sum(p.numel() for p in model.parameters())
    size = params * 4 / (1024**2)  # Convert params to MB
    return size

def measure_inference_time(model, texts):
    start_time = time()
    with torch.no_grad():
        model(texts)
    inference_time = time() - start_time
    return inference_time

# 3. Test the models and get performance stats
models = ['bert-base-uncased', 'roberta-base', 'sentence-transformers/all-MiniLM-L6-v2']
results = {}

for model_name in models:
    model = SemanticSearchModel(model_name)
    
    # For Sentence-BERT, we pass the texts directly
    emb = model(test_queries)
    
    model_size = get_model_size(model)
    inference_time = measure_inference_time(model, test_queries)
    
    results[model_name] = {
        'Model Size (MB)': model_size,
        'Inference Time (s)': inference_time,
        'Test Results': emb  # Embeddings for test queries
    }

# 4. Output the results
for model_name, stats in results.items():
    print(f"Model: {model_name}")
    print(f"Model Size (MB): {stats['Model Size (MB)']}")
    print(f"Inference Time (s): {stats['Inference Time (s)']}")
    print("Test Results (Embeddings):")
    for i, query in enumerate(test_queries):
        print(f"Query: {query}")
        print(f"Embedding: {stats['Test Results'][i]}")
        print('-' * 50)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: bert-base-uncased
Model Size (MB): 417.64746856689453
Inference Time (s): 0.07618069648742676
Test Results (Embeddings):
Query: What are the inflation rates in India?
Embedding: tensor([-0.1021, -0.0769], grad_fn=<SelectBackward0>)
--------------------------------------------------
Query: CPI changes over the years?
Embedding: tensor([-0.0436, -0.1450], grad_fn=<SelectBackward0>)
--------------------------------------------------
Query: How does the CPI affect rural and urban areas?
Embedding: tensor([-0.0610, -0.0918], grad_fn=<SelectBackward0>)
--------------------------------------------------
Query: What is the inflation rate for different states?
Embedding: tensor([-0.0251, -0.0303], grad_fn=<SelectBackward0>)
--------------------------------------------------
Query: How is the consumer price index measured?
Embedding: tensor([-0.0953, -0.0899], grad_fn=<SelectBackward0>)
--------------------------------------------------
Model: roberta-base
Model Size (MB): 475.49121856689

In [4]:
import torch
from torch import nn
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel
from sentence_transformers import SentenceTransformer, util
from time import time
import numpy as np
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity

# Dummy corpus and test queries
search_texts = [
    "all india year on year inflation rates % for g",
    "all india inflation rate based on cpi base 201",
    "year on year inflation rates % of major states",
    "general cpi for states for rural urban and com",
    "all india general group and sub group level cp"
]

test_queries = [
    "inflation in India based on CPI",
    "rural and urban consumer price index",
    "general price index for states",
    "inflation rates for all India",
    "state-wise inflation trend"
]

# Define SemanticSearchModel wrapper
class SemanticSearchModel(nn.Module):
    def __init__(self, model_name='bert-base-uncased'):
        super(SemanticSearchModel, self).__init__()
        self.model_name = model_name
        self.sentence_bert = False

        if model_name == 'bert-base-uncased':
            self.model = BertModel.from_pretrained(model_name)
            self.tokenizer = BertTokenizer.from_pretrained(model_name)
        elif model_name == 'roberta-base':
            self.model = RobertaModel.from_pretrained(model_name)
            self.tokenizer = RobertaTokenizer.from_pretrained(model_name)
        elif model_name.startswith("sentence-transformers"):
            self.model = SentenceTransformer(model_name)
            self.tokenizer = None
            self.sentence_bert = True

    def get_embeddings(self, texts):
        if self.sentence_bert:
            return self.model.encode(texts, convert_to_tensor=True)
        else:
            tokens = self.tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
            with torch.no_grad():
                outputs = self.model(**tokens)
            embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling
            return embeddings

# Get model size
def get_model_size(model):
    if isinstance(model, SentenceTransformer):
        return "N/A (pre-compiled)"
    params = sum(p.numel() for p in model.parameters())
    return round(params * 4 / (1024 ** 2), 2)  # in MB

# Inference time
def measure_inference_time(model, texts):
    start = time()
    model.get_embeddings(texts)
    return round(time() - start, 4)

# Compare a single test query with corpus and return best match
def get_best_matches(model, query, corpus_embeddings, corpus_texts, top_k=1):
    query_embedding = model.get_embeddings([query])
    scores = cosine_similarity(query_embedding.cpu(), corpus_embeddings.cpu())[0]
    top_idx = np.argsort(scores)[::-1][:top_k]
    return [(corpus_texts[i], scores[i]) for i in top_idx]

# Models to evaluate
model_names = [
    'bert-base-uncased',
    'roberta-base',
    'sentence-transformers/all-MiniLM-L6-v2',
    'sentence-transformers/paraphrase-MiniLM-L6-v2',
    'sentence-transformers/all-mpnet-base-v2',
    'sentence-transformers/multi-qa-MiniLM-L6-cos-v1',
    'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
]

# Run benchmark
for model_name in model_names:
    print(f"\n🔍 Evaluating model: {model_name}")
    model = SemanticSearchModel(model_name)

    model_size = get_model_size(model.model)
    inference_time = measure_inference_time(model, search_texts)
    corpus_embeddings = model.get_embeddings(search_texts)

    print(f"📦 Model Size: {model_size} MB")
    print(f"⚡ Inference Time (on corpus): {inference_time} s")
    print("📈 Test Results:")

    for query in test_queries:
        match_text, score = get_best_matches(model, query, corpus_embeddings, search_texts)[0]
        print(f"\nQuery: {query}")
        print(f"Top Match: {match_text}")
        print(f"Similarity: {round(score, 4)}")



🔍 Evaluating model: bert-base-uncased
📦 Model Size: 417.64 MB
⚡ Inference Time (on corpus): 1.2504 s
📈 Test Results:

Query: inflation in India based on CPI
Top Match: all india inflation rate based on cpi base 201
Similarity: 0.8324000239372253

Query: rural and urban consumer price index
Top Match: all india inflation rate based on cpi base 201
Similarity: 0.7429999709129333

Query: general price index for states
Top Match: year on year inflation rates % of major states
Similarity: 0.7874000072479248

Query: inflation rates for all India
Top Match: all india year on year inflation rates % for g
Similarity: 0.842199981212616

Query: state-wise inflation trend
Top Match: year on year inflation rates % of major states
Similarity: 0.7551000118255615

🔍 Evaluating model: roberta-base


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


📦 Model Size: 475.49 MB
⚡ Inference Time (on corpus): 1.2978 s
📈 Test Results:

Query: inflation in India based on CPI
Top Match: all india inflation rate based on cpi base 201
Similarity: 0.9854000210762024

Query: rural and urban consumer price index
Top Match: year on year inflation rates % of major states
Similarity: 0.9745000004768372

Query: general price index for states
Top Match: year on year inflation rates % of major states
Similarity: 0.9761000275611877

Query: inflation rates for all India
Top Match: all india inflation rate based on cpi base 201
Similarity: 0.984000027179718

Query: state-wise inflation trend
Top Match: all india inflation rate based on cpi base 201
Similarity: 0.9747999906539917

🔍 Evaluating model: sentence-transformers/all-MiniLM-L6-v2
📦 Model Size: N/A (pre-compiled) MB
⚡ Inference Time (on corpus): 0.3056 s
📈 Test Results:

Query: inflation in India based on CPI
Top Match: all india inflation rate based on cpi base 201
Similarity: 0.8023999929428101


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


📦 Model Size: N/A (pre-compiled) MB
⚡ Inference Time (on corpus): 0.0452 s
📈 Test Results:

Query: inflation in India based on CPI
Top Match: all india inflation rate based on cpi base 201
Similarity: 0.8091999888420105

Query: rural and urban consumer price index
Top Match: general cpi for states for rural urban and com
Similarity: 0.6218000054359436

Query: general price index for states
Top Match: year on year inflation rates % of major states
Similarity: 0.5242999792098999

Query: inflation rates for all India
Top Match: all india year on year inflation rates % for g
Similarity: 0.847599983215332

Query: state-wise inflation trend
Top Match: year on year inflation rates % of major states
Similarity: 0.7756999731063843

🔍 Evaluating model: sentence-transformers/all-mpnet-base-v2


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


📦 Model Size: N/A (pre-compiled) MB
⚡ Inference Time (on corpus): 0.2689 s
📈 Test Results:

Query: inflation in India based on CPI
Top Match: all india inflation rate based on cpi base 201
Similarity: 0.7483000159263611

Query: rural and urban consumer price index
Top Match: general cpi for states for rural urban and com
Similarity: 0.807699978351593

Query: general price index for states
Top Match: general cpi for states for rural urban and com
Similarity: 0.7250000238418579

Query: inflation rates for all India
Top Match: all india year on year inflation rates % for g
Similarity: 0.8540999889373779

Query: state-wise inflation trend
Top Match: year on year inflation rates % of major states
Similarity: 0.7803999781608582

🔍 Evaluating model: sentence-transformers/multi-qa-MiniLM-L6-cos-v1


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


📦 Model Size: N/A (pre-compiled) MB
⚡ Inference Time (on corpus): 0.0416 s
📈 Test Results:

Query: inflation in India based on CPI
Top Match: all india inflation rate based on cpi base 201
Similarity: 0.8515999913215637

Query: rural and urban consumer price index
Top Match: general cpi for states for rural urban and com
Similarity: 0.6211000084877014

Query: general price index for states
Top Match: general cpi for states for rural urban and com
Similarity: 0.5223000049591064

Query: inflation rates for all India
Top Match: all india inflation rate based on cpi base 201
Similarity: 0.8173999786376953

Query: state-wise inflation trend
Top Match: year on year inflation rates % of major states
Similarity: 0.7167999744415283

🔍 Evaluating model: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


📦 Model Size: N/A (pre-compiled) MB
⚡ Inference Time (on corpus): 0.0855 s
📈 Test Results:

Query: inflation in India based on CPI
Top Match: all india inflation rate based on cpi base 201
Similarity: 0.8590999841690063

Query: rural and urban consumer price index
Top Match: general cpi for states for rural urban and com
Similarity: 0.48080000281333923

Query: general price index for states
Top Match: year on year inflation rates % of major states
Similarity: 0.6478999853134155

Query: inflation rates for all India
Top Match: all india year on year inflation rates % for g
Similarity: 0.8781999945640564

Query: state-wise inflation trend
Top Match: year on year inflation rates % of major states
Similarity: 0.8495000004768372
