## Sentence Embeddings and semantic text similarity

* Read chapter 5 of the course book



In [55]:
## >>pip install torch
## >>pip install transformers
## >>pip install protobuf
## !pip install sentence-transformers
## >>pip install datasets
## !pip install flair

In [56]:

import pandas as pd
import torch
import torch
import numpy as np
## import SentenceTransformer
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings
from flair.embeddings import DocumentRNNEmbeddings
from flair.embeddings import TransformerDocumentEmbeddings
from flair.embeddings import SentenceTransformerDocumentEmbeddings
from transformers import AutoModelForSequenceClassification, AutoTokenizer


In [57]:

# Sentence pairs that say (almost) the same thing in different words.
# Every string is free of leading/trailing whitespace so the printed table
# and the text handed to the embedders are clean.
similar = [
    ("A black dog walking beside a pool.",
     "A black dog is walking along the side of a pool."),
    ("A blonde woman looks for medical supplies for work in a suitcase.",
     "The blond woman is searching for medical supplies in a suitcase."),
    ("A doubly decker red bus driving down the road.",
     "A red double decker bus driving down a street."),
    ("There is a black dog jumping into a swimming pool.",
     "A black dog is leaping into a swimming pool."),
    ("The man used a sword to slice a plastic bottle.",
     "A man sliced a plastic bottle with a sword."),
]



In [58]:

# Show the similar pairs side by side as a two-column text table.
print(pd.DataFrame(similar, columns=["sen1", "sen2"]))


                                                sen1  \
0                 A black dog walking beside a pool.   
1  A blonde woman looks for medical supplies for ...   
2     A doubly decker red bus driving down the road.   
3  There is a black dog jumping into a swimming p...   
4    The man used a sword to slice a plastic bottle.   

                                                sen2  
0   A black dog is walking along the side of a pool.  
1   The blond woman is searching for medical supp...  
2     A red double decker bus driving down a street.  
3       A black dog is leaping into a swimming pool.  
4        A man sliced a plastic bottle with a sword.  


In [59]:

# Sentence pairs that describe unrelated scenes.
# Every string is free of leading/trailing whitespace so the printed table
# and the text handed to the embedders are clean.
dissimilar = [
    ("A little girl and boy are reading books.",
     "An older child is playing with a doll while gazing out the window."),
    ("Two horses standing in a field with trees in the background.",
     "A black and white bird on a body of water with grass in the background."),
    ("Two people are walking by the ocean.",
     "Two men in fleeces and hats looking at the camera."),
    ("A cat is pouncing on a trampoline.",
     "A man is slicing a tomato."),
    ("A woman is riding on a horse.",
     "A man is turning over tables in anger."),
]


In [60]:

# Show the dissimilar pairs side by side as a two-column text table.
print(pd.DataFrame(dissimilar, columns=["sen1", "sen2"]))


                                                sen1  \
0          A little girl and boy are reading books.    
1  Two horses standing in a field with trees in t...   
2               Two people are walking by the ocean.   
3                 A cat is pouncing on a trampoline.   
4                      A woman is riding on a horse.   

                                                sen2  
0  An older child is playing with a doll while ga...  
1  A black and white bird on a body of water with...  
2  Two men in fleeces and hats looking at the cam...  
3                         A man is slicing a tomato.  
4             A man is turning over tables in anger.  


In [61]:

def sim(s1,s2):
  """Return the cosine similarity of two embedded sentences, rounded to 2 decimals.

  Both arguments must expose an ``embedding`` tensor, i.e. they have already
  been passed through an embedder. Cosine similarity lies in [-1, 1], not
  [0, 1]: vectors pointing in opposite directions give a negative score.
  """
  # torch.cosine_similarity compares along dim=1 by default, so give each
  # embedding a leading axis of size 1 before comparing.
  vec_a = s1.embedding.unsqueeze(0)
  vec_b = s2.embedding.unsqueeze(0)
  cosine = torch.cosine_similarity(vec_a, vec_b).item()
  return np.round(cosine, 2)


In [62]:


def evaluate(embeddings, myPairList):
  """Score every sentence pair with the given embedder.

  Each text in a pair is wrapped in a ``Sentence``, embedded with
  ``embeddings.embed`` and the two results are compared with ``sim``.

  Returns a tuple ``(scores, mean)``: the list of per-pair similarities and
  their mean rounded to 2 decimals.
  """
  pair_scores = []
  for first_text, second_text in myPairList:
    first, second = Sentence(first_text), Sentence(second_text)
    # Embed in the same order as the pair: first sentence, then second.
    for sentence in (first, second):
      embeddings.embed(sentence)
    pair_scores.append(sim(first, second))
  mean_score = np.round(np.mean(pair_scores), 2)
  return pair_scores, mean_score


## Average word embeddings with GloVe


In [63]:

# Word-level GloVe vectors, reused by the GRU embedder further down.
glove_embedding = WordEmbeddings('glove')
# Document embedding built by pooling (averaging) the GloVe word vectors of a sentence.
glove_pool_embeddings = DocumentPoolEmbeddings([glove_embedding])


In [64]:


# Baseline: pooled GloVe vectors on the similar pairs, then the dissimilar pairs.
# Each evaluate() line prints (per-pair scores, mean score).
print("Glove just average embeddings")
print(evaluate(glove_pool_embeddings, similar))
print(evaluate(glove_pool_embeddings, dissimilar))


Glove just average embeddings
([0.97, 0.99, 0.97, 0.99, 0.98], 0.98)
([0.94, 0.97, 0.94, 0.92, 0.93], 0.94)



## Sequence-aware embeddings: an RNN (GRU) over the GloVe vectors


In [65]:

# Document embedding produced by an RNN that reads the GloVe word vectors in order.
# The RNN is constructed here and never trained anywhere in this notebook, so its
# weights are whatever the constructor initialises them to. Seed torch first so
# that Restart & Run All gives the same scores every time.
# NOTE(review): flair's docs say RNN document embeddings need tuning on a downstream
# task to be meaningful — treat the scores below as an untrained baseline.
torch.manual_seed(42)
gru_embeddings = DocumentRNNEmbeddings([glove_embedding])


In [66]:


# GRU-based document embeddings on the similar pairs, then the dissimilar pairs.
# Each evaluate() line prints (per-pair scores, mean score).
print("GRU RNN embeddings")

print(evaluate(gru_embeddings, similar))
print(evaluate(gru_embeddings, dissimilar))


GRU RNN embeddings
([0.99, 1.0, 0.95, 1.0, 0.89], 0.97)
([0.89, 1.0, 0.92, 0.81, 0.87], 0.9)



## The following cell instantiates a "bert-base-uncased" model that pools the final layer — a non-specialized BERT


* BERT non-specialized embeddings
* notice it is not better than GloVe
* actually a bit worse


In [67]:

# Generic pre-trained BERT used as a document embedder; it has not been
# specialised for sentence similarity.
bert_embeddings = TransformerDocumentEmbeddings('bert-base-uncased')


In [68]:

# Generic BERT on the similar pairs, then the dissimilar pairs.
# Each line prints (per-pair scores, mean score).
print(evaluate(bert_embeddings, similar))
print(evaluate(bert_embeddings, dissimilar))


([0.85, 0.9, 0.96, 0.91, 0.89], 0.9)
([0.93, 0.94, 0.86, 0.93, 0.92], 0.92)



## Sentence BERT (Now a specialized BERT for this task)

In [69]:

# Sentence-BERT checkpoint, loaded by name through flair's sentence-transformers wrapper.
sbert_embeddings = SentenceTransformerDocumentEmbeddings('bert-base-nli-mean-tokens')


In [70]:

# Sentence-BERT on the similar pairs, then the dissimilar pairs.
# Each evaluate() line prints (per-pair scores, mean score).
print("--------------------------------")
print("Notice, this one actually does what we want")
print(evaluate(sbert_embeddings, similar))
print(evaluate(sbert_embeddings, dissimilar))


--------------------------------
Notice, this one actually does what we want
([0.98, 0.95, 0.96, 0.99, 0.98], 0.97)
([0.48, 0.41, 0.19, -0.05, 0.0], 0.21)


## now we will do a harder similarity test

* tricky sentences
* contradicting sentences

In [71]:


# Each pair uses exactly the same words in a different order, so the two
# sentences state opposite things even though their vocabulary is identical.
tricky_pairs=[
("An elephant is bigger than a lion", "A lion is bigger than an elephant"),
("the cat sat on the mat", "the mat sat on the cat")
]


In [72]:

# Run all four embedders on the word-order pairs. A low score is the desired
# outcome here, because the sentences in each pair contradict each other.
# Output order: GloVe pooling, GRU, generic BERT, Sentence-BERT.
# NOTE(review): gru_embeddings is never trained in this notebook, so its lower
# scores may come from untrained weights rather than word-order sensitivity — verify.
print("GRU here does better, because sequence matters in RNNs?")

print(evaluate(glove_pool_embeddings, tricky_pairs))

print(evaluate(gru_embeddings, tricky_pairs))

print(evaluate(bert_embeddings, tricky_pairs))

print(evaluate(sbert_embeddings, tricky_pairs))



GRU here does better, because sequence matters in RNNs?
([1.0, 1.0], 1.0)
([0.79, 0.68], 0.74)
([1.0, 0.98], 0.99)
([0.93, 0.97], 0.95)



## We need a BERT specialized for this problem of contradicting sentences

* there is a model from XNLI for this task
* it classifies the relationship between a pair of sentences into 3 classes: neutral, contradiction, entailment
* we use a fine-tuned XLM-RoBERTa model trained on XNLI


In [73]:

## requires login and token
## see huggingface 'joeddav/xlm-roberta-large-xnli'
## The loading code below is wrapped in a string literal so it does not run;
## remove the surrounding triple quotes to actually load the model and tokenizer.

'''
nli_model = AutoModelForSequenceClassification.from_pretrained('joeddav/xlm-roberta-large-xnli')
tokenizer = AutoTokenizer.from_pretrained('joeddav/xlm-roberta-large-xnli')
'''


"\nnli_model = AutoModelForSequenceClassification.from_pretrained('joeddav/xlm-roberta-large-xnli')\ntokenizer = AutoTokenizer.from_pretrained('joeddav/xlm-roberta-large-xnli')\n"

In [74]:

# Disabled cell: the code is wrapped in a string literal so it does not run.
# It needs `nli_model` and `tokenizer`, which are only created by the (also
# disabled) loading cell. When enabled, for every pair in `tricky_pairs` it
# encodes premise + hypothesis together, prints the arg-max class label and then
# the softmax score of each of the 3 classes.
# NOTE(review): `truncation_strategy=` is the older tokenizer keyword; newer
# transformers releases use `truncation=` — confirm against the installed version
# before enabling.
'''

for premise, hypothesis in tricky_pairs:
    x = tokenizer.encode(premise, hypothesis, return_tensors='pt', truncation_strategy='only_first')
    
    logits = nli_model(x)[0]
    print(f"Premise: {premise}")
    print(f"Hypothesis: {hypothesis}")
    print("Top Class")
    ids = np.argmax(   logits[0].detach().numpy()   )
    print(   nli_model.config.id2label[ids]   )
    print("full softmax scores: ")
    for i in range(3):
        print(nli_model.config.id2label[i],        
              logits.softmax(dim=1)[0][i].detach().numpy()
        )
    print("="*20)

'''



'\n\nfor premise, hypothesis in tricky_pairs:\n    x = tokenizer.encode(premise, hypothesis, return_tensors=\'pt\', truncation_strategy=\'only_first\')\n    \n    logits = nli_model(x)[0]\n    print(f"Premise: {premise}")\n    print(f"Hypothesis: {hypothesis}")\n    print("Top Class")\n    ids = np.argmax(   logits[0].detach().numpy()   )\n    print(   nli_model.config.id2label[ids]   )\n    print("full softmax scores: ")\n    for i in range(3):\n        print(nli_model.config.id2label[i],        \n              logits.softmax(dim=1)[0][i].detach().numpy()\n        )\n    print("="*20)\n\n'