<a href="https://colab.research.google.com/github/Mwadz/Sematic-Text-Similarity/blob/main/Copy_of_STS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
from datasets import load_dataset
from sklearn.pipeline import Pipeline
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23, so importing it fails on current installs. Prefer the standalone
# `joblib` package and only fall back to the vendored copy on old environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Load the English STSB dataset
stsb_dataset = load_dataset('stsb_multi_mt', 'en')
stsb_train = pd.DataFrame(stsb_dataset['train'])
stsb_test = pd.DataFrame(stsb_dataset['test'])

# Check loaded data: print both shapes, then display the first test rows
# as the cell's rich output.
print(stsb_train.shape, stsb_test.shape)
stsb_test.head()



  0%|          | 0/3 [00:00<?, ?it/s]

(5749, 3) (1379, 3)


Unnamed: 0,sentence1,sentence2,similarity_score
0,A girl is styling her hair.,A girl is brushing her hair.,2.5
1,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.,3.6
2,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.,5.0
3,A man is cutting up a cucumber.,A man is slicing a cucumber.,4.2
4,A man is playing a harp.,A man is playing a keyboard.,1.5


# Creating helper functions
- The first function pre-processes a sentence by lemmatizing and lowercasing its words and dropping stop words and non-alphabetic tokens (such as numbers).
- The second function takes in two columns of text embeddings and returns the row-wise cosine similarity between the two columns.


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import spacy
# Load the spaCy "en_core_web_sm" pipeline once at module level;
# text_processing() below calls this shared `nlp` object for every sentence.
nlp = spacy.load("en_core_web_sm")

def text_processing(sentence):
    """
    Reduce a raw sentence to a list of normalized content words.

    The sentence is run through the module-level spaCy pipeline ``nlp``.
    Tokens that are stop words or are not purely alphabetic are discarded;
    every remaining token is replaced by its lowercased lemma.

    Args:
      sentence: The sentence we want to process.

    Returns:
      A list of processed words, in their original order.
    """
    kept_lemmas = []
    for token in nlp(sentence):
        # Guard clause: skip stop words and anything that is not alphabetic.
        if token.is_stop or not token.is_alpha:
            continue
        kept_lemmas.append(token.lemma_.lower())

    return kept_lemmas


def cos_sim(sentence1_emb, sentence2_emb):
    """
    Cosine similarity between two columns of sentence embeddings

    Only the matching rows are compared, so the work and memory are
    proportional to the number of rows rather than to its square (no full
    pairwise similarity matrix is built).

    Args:
      sentence1_emb: sentence1 embedding column, array-like of shape (n, d)
      sentence2_emb: sentence2 embedding column, array-like of shape (n, d)

    Returns:
      A 1-D float array of length n with the row-wise cosine similarity
      between the two columns.
      For instance, if sentence1_emb=[a,b,c] and sentence2_emb=[x,y,z]
      then the result is [cosine_similarity(a,x), cosine_similarity(b,y), cosine_similarity(c,z)]
      A row in which either embedding is the zero vector gets similarity 0.

    Raises:
      ValueError: if the inputs are not 2-D or their shapes differ, since
        rows could not be paired up one-to-one.
    """
    emb1 = np.asarray(sentence1_emb, dtype=float)
    emb2 = np.asarray(sentence2_emb, dtype=float)

    if emb1.ndim != 2 or emb2.ndim != 2:
        raise ValueError(
            "cos_sim expects 2-D inputs of shape (n, d); got shapes "
            f"{emb1.shape} and {emb2.shape}"
        )
    if emb1.shape != emb2.shape:
        raise ValueError(
            "cos_sim needs both embedding columns to have the same shape; got "
            f"{emb1.shape} and {emb2.shape}"
        )

    # Row-wise dot products and the product of the row norms.
    dots = (emb1 * emb2).sum(axis=1)
    norms = np.linalg.norm(emb1, axis=1) * np.linalg.norm(emb2, axis=1)

    # Divide only where the norm product is non-zero; rows containing a zero
    # vector keep the pre-filled 0.0 instead of producing NaN.
    return np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

In [None]:
from sentence_transformers import CrossEncoder

# Load the pre-trained model (the 'cross-encoder/stsb-roberta-base' checkpoint)
model = CrossEncoder('cross-encoder/stsb-roberta-base')

# Build one [sentence1, sentence2] pair per test row, in row order.
# NOTE: this loop runs at module level, so `sentence1` and `sentence2` stay
# bound to the last test pair after it finishes.
sentence_pairs = []
for sentence1, sentence2 in zip(stsb_test['sentence1'], stsb_test['sentence2']):
    sentence_pairs.append([sentence1, sentence2])
    
# Score the pairs and store the result as a new column on stsb_test
# (presumably one score per pair, since it is assigned row-for-row).
stsb_test['SBERT CrossEncoder_score'] = model.predict(sentence_pairs, show_progress_bar=True)

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

In [None]:
# Score every description in the CSV against a single user-supplied prompt.
data = pd.read_csv("/content/SBERT data.csv")
prompt = input("Enter prompt: ")

# Put the same prompt on every row, then name the columns the way the pairs
# are fed to the model: sentence1 = description, sentence2 = prompt.
data['prompt'] = prompt
data = data.rename(columns={'description': 'sentence1', 'prompt': 'sentence2'})

from sentence_transformers import CrossEncoder
XpathFinder = CrossEncoder("cross-encoder/stsb-roberta-base")

# Build the pairs from THIS frame's own rows. Each pair must hold the values
# unpacked from the zip; appending any other names would score the same
# unrelated pair on every row and give every row an identical score.
sentence_pairs = [
    [description, query]
    for description, query in zip(data['sentence1'], data['sentence2'])
]

data['SBERT CrossEncoder_Score'] = XpathFinder.predict(sentence_pairs, show_progress_bar = True)

Enter prompt: The date of the additional charge being reported to the Investor.


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

In [None]:
# Rank the rows from highest to lowest cross-encoder score and preview the
# top five matches.
sim_score = data.sort_values(by='SBERT CrossEncoder_Score', ascending=False)
sim_score.head(5)

Unnamed: 0,xpath,sentence1,sentence2,SBERT CrossEncoder_Score
0,//DEAL/LOANS/LOAN[1]/QUALIFIED_MORTGAGE/EXEMPT...,When true indicates the purpose of the extensi...,The date of the additional charge being report...,0.99395
304,//DEAL/SERVICES/SERVICE[1]/TITLE/TITLE_RESPONS...,The abbreviated name of the depository institu...,The date of the additional charge being report...,0.99395
332,//DEAL/SERVICES/SERVICE[1]/AUTOMATED_UNDERWRIT...,The subaccount name for the receiver of the wire.,The date of the additional charge being report...,0.99395
331,//DEAL/LOANS/LOAN[1]/ACH/ACHReceiverSubaccount...,The subaccount name for the receiver of the wire.,The date of the additional charge being report...,0.99395
330,//MESSAGE/DEAL_SETS/DEAL_SET_SERVICES/DEAL_SET...,Identifies the date on which automated draftin...,The date of the additional charge being report...,0.99395
