# Develop better semantic similarity evaluation



In [1]:
# Imports
from sentence_transformers import SentenceTransformer
import time
from bert_score import BERTScorer



  from tqdm.autonotebook import tqdm, trange


## Now: cosine similarity of SentenceTransformer embeddings

In [9]:
def semantic_similarity(model, sentence1, sentence2):
    """Score how similar two sentences are with an embedding model.

    Each sentence is embedded separately via ``model.encode`` (as a
    one-element list) and the two embeddings are then compared with
    ``model.similarity``.

    Model notes carried over from development:
      * ``multi-qa-MiniLM-L6-cos-v1`` - cheap model for dev
      * ``all-mpnet-base-v2`` - more performant model, but slower

    Args:
        model: Object exposing ``encode`` and ``similarity``; this notebook
            passes a ``SentenceTransformer``.
        sentence1: First text to compare.
        sentence2: Second text to compare.

    Returns:
        The value returned by ``model.similarity`` for the two embeddings,
        unmodified.
    """
    # Embed in argument order, one forward call per sentence.
    embeddings = [model.encode([text]) for text in (sentence1, sentence2)]

    # ``similarity`` uses the model's default metric (cosine).
    return model.similarity(*embeddings)


# Benchmark: time repeated similarity calls on one sentence/reference pair.
sentence = "Yes, Lincoln was eventually chosen as the Republican candidate for the 1860 election."
reference = "Yes"
# Model is constructed before the timer starts, so load time is not measured.
model = SentenceTransformer("all-mpnet-base-v2")

N_RUNS = 100  # number of timed calls; the printed time is the total for all of them

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() is wall-clock time and can jump if the system
# clock is adjusted.
start = time.perf_counter()
for i in range(N_RUNS):
    score = semantic_similarity(model, sentence, reference)
end = time.perf_counter()
print(f"Time taken = {end-start}")
# Bare expression so the notebook displays the last computed score.
score



Time taken = 7.904077768325806


tensor([[0.1648]])

## Improved: BERTScore (precision / recall / F1)

In [4]:
def semantic_similarity(scorer, sentence, reference):
    """Compute BERTScore precision, recall and F1 for one sentence pair.

    Args:
        scorer: Object exposing ``score(candidates, references)``; this
            notebook passes a ``BERTScorer``.
        sentence: Text passed as the single candidate.
        reference: Text passed as the single reference.

    Returns:
        Tuple ``(P, R, F1)`` holding the three values unpacked from
        ``scorer.score``.
    """
    # The scorer is called with one-element lists on both sides.
    candidates, references = [sentence], [reference]
    precision, recall, f1_score = scorer.score(candidates, references)
    return precision, recall, f1_score



# Sanity check: score two opposite one-word answers against each other.
sentence = "no"
reference = "yes"
# NOTE(review): the scorer is built without rescale_with_baseline, so raw
# BERTScore values are reported. Consider rescale_with_baseline=True if the
# scores for clearly different texts look too close to 1 - confirm against the
# bert_score documentation before changing the experiment.
scorer = BERTScorer(model_type='roberta-large', lang='en')

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; the scorer is constructed above, outside the timed region.
start = time.perf_counter()

# Score both argument orders (swapped first, then candidate/reference as
# named) so the two printed lines can be compared for symmetry.
for candidate, target in ((reference, sentence), (sentence, reference)):
    p, r, f1 = semantic_similarity(scorer, candidate, target)
    print(f"BERTScore Precision: {p.mean():.4f}, Recall: {r.mean():.4f}, F1: {f1.mean():.4f}")
end = time.perf_counter()
print(f"Time taken = {end-start}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore Precision: 0.9955, Recall: 0.9955, F1: 0.9955
BERTScore Precision: 0.9955, Recall: 0.9955, F1: 0.9955
Time taken = 0.2005908489227295
