<a href="https://colab.research.google.com/github/prashgs/MachineLearning/blob/main/sentence_transformers_hf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#pip install -U sentence-transformers

# Sentence Transformers

In [7]:
from sentence_transformers import SentenceTransformer, SimilarityFunction
import warnings

# Silence all warnings for the remainder of the session so library
# notices do not clutter the cell outputs.
warnings.filterwarnings(action='ignore')

# 1. Load a pretrained Sentence Transformer model by its model name
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")




In [8]:
# Two pieces of text to compare. Each one is a single string (not a list);
# the leading and trailing newlines are part of the text that gets encoded.
sentences1 = (
    "\n"
    "Come back to experience Google's most capable AI models and get priority access to new features for $19.99 $9.99/month for 2 months.\n"
    "\n"
)

sentences2 = (
    "\n"
    "Get Google's best AI and early features for $9.99/month (2 months).\n"
)

# Encode each text separately into its own embedding
embeddings1 = model.encode(sentences1)
embeddings2 = model.encode(sentences2)

# Compute the similarity between the two embeddings
similarities = model.similarity(embeddings1, embeddings2)

# Bare last expression so the notebook displays the result
similarities


tensor([[0.8632]])

# Hugging Face Sentence Transformers

In [9]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity


In [10]:
# Mean pooling - average token embeddings, weighting by the attention mask
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, skipping masked positions.

    Args:
        model_output: Indexable model output whose first element holds the
            token embeddings.
        attention_mask: Mask tensor; positions with value 0 contribute nothing
            to the average.

    Returns:
        The mask-weighted sum of the token embeddings along dimension 1,
        divided by the per-sequence mask total (clamped to at least 1e-9 so an
        all-zero mask does not divide by zero).
    """
    hidden_states = model_output[0]  # first element = all token embeddings
    # Broadcast the mask to the embedding shape so it can weight every feature.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = torch.sum(hidden_states * mask, 1)
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    return weighted_sum / token_counts


In [11]:
# Sentences we want sentence embeddings for
sentences = ["The quick brown fox jumps over the lazy dog. This is a test sentence.", "A swift brown fox leaps over the indolent canine. Here's another test."]

# Load tokenizer and model from the Hugging Face Hub.
# The raw transformer is bound to `hf_model` rather than `model`, so the
# SentenceTransformer instance created earlier in the notebook is not
# overwritten and the cells that call `model.encode(...)` can still be re-run.
HF_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_NAME)
hf_model = AutoModel.from_pretrained(HF_MODEL_NAME)

# Tokenize sentences (padded to a common length, returned as PyTorch tensors)
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings without tracking gradients
with torch.no_grad():
    model_output = hf_model(**encoded_input)

# Perform pooling, using the attention mask so padding is excluded
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# Normalize embeddings to unit L2 norm along the feature dimension
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)


In [12]:
# Compare the two rows of sentence_embeddings.

# Each slice keeps a leading dimension of size 1, giving the 2-D inputs that
# the pairwise cosine similarity call expects.
similarities_sklearn = cosine_similarity(sentence_embeddings[0:1], sentence_embeddings[1:2])
print(f"Cosine Similarity using sklearn: {similarities_sklearn[0][0]}")
# NOTE: `similarities` was computed in the earlier Sentence Transformers cell
# from `sentences1` / `sentences2`, i.e. different text than `sentences`, so
# the two printed numbers are not expected to match.
print(f"Cosine Similarity using Sentence Transformers library: {similarities}")


Cosine Similarity using sklearn: 0.7729694247245789
Cosine Similarity using Sentence Transformers library: tensor([[0.8632]])


# Hugging Face



In [13]:
from transformers import AutoTokenizer, AutoModel
import torch
from torch.nn.functional import cosine_similarity

def get_paragraph_embedding(paragraph, model, tokenizer):
    """Generates a mean-pooled embedding for a paragraph (or batch of paragraphs).

    Args:
        paragraph: Text passed straight to ``tokenizer``; a single string or a
            list of strings.
        model: Callable accepting the tokenizer output as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.
        tokenizer: Callable that tokenizes ``paragraph`` and returns a mapping
            of tensors (called with ``return_tensors="pt"``, padding and
            truncation to 512 tokens).

    Returns:
        One embedding per input sequence: the average of its token embeddings.
        When the tokenizer output contains an ``attention_mask``, positions
        with mask value 0 (padding) are excluded from the average.
    """
    inputs = tokenizer(paragraph, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    token_embeddings = outputs.last_hidden_state

    # padding=True can add pad tokens when several paragraphs are batched;
    # a plain mean over all positions would let those pads skew the result.
    attention_mask = inputs["attention_mask"] if "attention_mask" in inputs else None
    if attention_mask is None:
        # No mask available: fall back to the unweighted mean.
        return token_embeddings.mean(dim=1)

    mask = attention_mask.unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    # Clamp so a fully masked sequence does not cause a division by zero.
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts

# Loaded (tokenizer, model) pairs keyed by model name, so repeated comparisons
# with the same model do not reload the weights on every call.
_LOADED_MODELS = {}


def _load_tokenizer_and_model(model_name):
    """Returns the (tokenizer, model) pair for model_name, loading it on first use."""
    if model_name not in _LOADED_MODELS:
        _LOADED_MODELS[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModel.from_pretrained(model_name),
        )
    return _LOADED_MODELS[model_name]


def compare_paragraphs(paragraph1, paragraph2, model_name="sentence-transformers/all-mpnet-base-v2"):
    """Compares two paragraphs using a Hugging Face model and cosine similarity.

    Args:
        paragraph1: First text to embed.
        paragraph2: Second text to embed.
        model_name: Hugging Face Hub identifier of the model and tokenizer to
            load. Each distinct name is loaded once and then reused.

    Returns:
        The cosine similarity between the two paragraph embeddings as a
        Python float.
    """
    tokenizer, model = _load_tokenizer_and_model(model_name)

    embedding1 = get_paragraph_embedding(paragraph1, model, tokenizer)
    embedding2 = get_paragraph_embedding(paragraph2, model, tokenizer)

    # Call the torch implementation by its full path: the bare name
    # `cosine_similarity` is imported from both sklearn and torch in this
    # notebook, so which one it refers to depends on cell execution order.
    similarity_score = torch.nn.functional.cosine_similarity(embedding1, embedding2).item()

    return similarity_score




In [14]:

paragraph1 = "The quick brown fox jumps over the lazy dog. This is a test sentence."
paragraph2 = "A swift brown fox leaps over the indolent canine. Here's another test."
paragraph3 = "The weather is nice today. Let's go for a walk."

# Baseline: compare_paragraphs' default model.
similarity_1_2 = compare_paragraphs(paragraph1, paragraph2)
similarity_1_3 = compare_paragraphs(paragraph1, paragraph3)

print(f"Similarity between paragraph 1 and 2: {similarity_1_2}")
print(f"Similarity between paragraph 1 and 3: {similarity_1_3}")

# Repeat the same two comparisons with other models. Scores are stored per
# model label so one model's results are not overwritten by the next.
OTHER_MODELS = {
    "BERT": "google-bert/bert-base-uncased",
    "All MiniLM": "sentence-transformers/all-MiniLM-L6-v2",
}

similarity_by_model = {}
for model_label, hub_name in OTHER_MODELS.items():
    score_1_2 = compare_paragraphs(paragraph1, paragraph2, model_name=hub_name)
    print(f"Similarity between paragraph 1 and 2 ({model_label}): {score_1_2}")

    score_1_3 = compare_paragraphs(paragraph1, paragraph3, model_name=hub_name)
    print(f"Similarity between paragraph 1 and 3 ({model_label}): {score_1_3}")

    similarity_by_model[model_label] = {"1_2": score_1_2, "1_3": score_1_3}

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Similarity between paragraph 1 and 2: 0.7782570719718933
Similarity between paragraph 1 and 3: 0.12652546167373657


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Similarity between paragraph 1 and 2 (BERT): 0.8255898952484131
Similarity between paragraph 1 and 3 (BERT): 0.48839324712753296
Similarity between paragraph 1 and 2 (All MiniLM): 0.7729693651199341
Similarity between paragraph 1 and 3 (All MiniLM): 0.09579998254776001
