In [None]:
!pip install transformers sentence-transformers

import torch
from sentence_transformers import SentenceTransformer, util
from transformers import (
    AutoModel,
    AutoTokenizer,
    BertModel,
    PreTrainedTokenizerFast,
)

# Load the pre-trained ModernBERT encoder and its tokenizer.
#
# The Auto* classes read the checkpoint's config and instantiate the matching
# architecture. Loading this checkpoint through BertModel does not work:
# ModernBERT's parameter names do not match BERT's, so every BertModel weight
# is reported as "newly initialized" (i.e. random) and the resulting
# embeddings carry no meaning.
# NOTE(review): ModernBERT needs a recent transformers release (believed to be
# >= 4.48) -- confirm the installed version if AutoModel rejects the config.
model_name = 'answerdotai/ModernBERT-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Alternative: sentence-transformers can wrap the same checkpoint and do the
# tokenization and pooling itself. It is not a drop-in replacement for `model`
# above -- embeddings would then come from `.encode(...)` rather than from
# get_bert_embedding.
# sentence_model = SentenceTransformer(model_name)


def get_bert_embedding(text):
    """Return a mean-pooled embedding for ``text``.

    The text is tokenized with the module-level ``tokenizer`` and run through
    the module-level ``model`` without gradient tracking. The final-layer
    token vectors are then averaged over the sequence dimension, counting only
    real tokens.

    Args:
        text: A string, or a list of strings, accepted by the tokenizer.

    Returns:
        A tensor with one pooled vector per input text (a single row when
        ``text`` is one string).
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    token_embeddings = outputs.last_hidden_state
    # padding=True pads shorter texts in a batch; weight by the attention mask
    # so padded positions do not dilute the average. For a single string the
    # mask is all ones and this reduces to a plain mean over tokens.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    # clamp guards against division by zero should a row contain no tokens.
    token_counts = mask.sum(dim=1).clamp(min=1)
    return summed / token_counts


def calculate_similarity(text1, text2):
    """Return the cosine similarity between the embeddings of two texts.

    Each text is embedded with ``get_bert_embedding`` (``text1`` first, then
    ``text2``) and the two embeddings are compared with ``util.cos_sim``.

    Args:
        text1: First text to compare.
        text2: Second text to compare.

    Returns:
        The similarity as a plain Python float, obtained via ``.item()``.
    """
    first_vec, second_vec = (get_bert_embedding(t) for t in (text1, text2))
    return util.cos_sim(first_vec, second_vec).item()


# Example usage: score one reference sentence (text1) against two others.
text1, text2, text3 = (
    "This is a positive sentence.",
    "This is another positive sentence.",
    "This is a negative sentence.",
)

# Scores are computed in order (text2 first, then text3) before any printing.
similarity_1_2, similarity_1_3 = (
    calculate_similarity(text1, candidate) for candidate in (text2, text3)
)

print(f"Similarity between '{text1}' and '{text2}': {similarity_1_2}")
print(f"Similarity between '{text1}' and '{text3}': {similarity_1_3}")



You are using a model of type modernbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encod

Similarity between 'This is a positive sentence.' and 'This is another positive sentence.': 0.98695307970047
Similarity between 'This is a positive sentence.' and 'This is a negative sentence.': 0.9858779907226562
