In [1]:
# Imports
from datasets import load_dataset
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

# Load the English STSB dataset
stsb_dataset = load_dataset('stsb_multi_mt', 'en')
stsb_train = pd.DataFrame(stsb_dataset['train'])
#stsb_test=pd.DataFrame(stsb_dataset['test'])
stsb_test = pd.read_csv("MFDS_testdataset.csv")

# Check loaded data
print(stsb_train.shape, stsb_test.shape)  #gives the size of the data
stsb_train.head()

Reusing dataset stsb_multi_mt (C:\Users\hp\.cache\huggingface\datasets\stsb_multi_mt\en\1.0.0\a5d260e4b7aa82d1ab7379523a005a366d9b124c76a5a5cf0c4c5365458b0ba9)


  0%|          | 0/3 [00:00<?, ?it/s]

(5749, 3) (10, 2)


Unnamed: 0,sentence1,sentence2,similarity_score
0,A plane is taking off.,An air plane is taking off.,5.0
1,A man is playing a large flute.,A man is playing a flute.,3.8
2,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...,3.8
3,Three men are playing chess.,Two men are playing chess.,2.6
4,A man is playing the cello.,A man seated is playing the cello.,4.25


In [2]:
from sklearn.metrics.pairwise import cosine_similarity
import spacy
nlp = spacy.load("en_core_web_sm")

def text_processing(sentence):
    """
    Lemmatize, lowercase, remove numbers and stop words
    """
    sentence = [token.lemma_.lower()          
                for token in nlp(sentence) 
                    if token.is_alpha and not token.is_stop]
    
    return sentence

def cos_sim(sentence1_emb, sentence2_emb):
    """
    Cosine similarity between two columns of sentence embeddings
    Returns:
      The row-wise cosine similarity between the two columns.
      For instance is sentence1_emb=[a,b,c] and sentence2_emb=[x,y,z]
      Then the result is [cosine_similarity(a,x), cosine_similarity(b,y), cosine_similarity(c,z)]
    """
    cos_sim = cosine_similarity(sentence1_emb, sentence2_emb)
    return np.diag(cos_sim)*100


In [3]:
import textdistance

def jaccard_sim(row):
    # Text Processing
    sentence1 = text_processing(row['sentence1'])
    sentence2 = text_processing(row['sentence2'])
    print(sentence1)
    
    # Jaccard similarity
    return textdistance.jaccard.normalized_similarity(sentence1, sentence2)*100


# Jaccard Similarity
stsb_test['Jaccard_score'] = stsb_test.progress_apply(jaccard_sim, axis=1)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 73.94it/s]

['primary', 'sludge', 'pebble', 'soil', 'settle', 'primary', 'treatment', 'sewage', 'activate', 'sludge', 'sediment', 'bacterial', 'floc', 'settle', 'tank', 'biological', 'treatment']
['primary', 'sludge', 'solid', 'like', 'soil', 'small', 'pebble', 'etc', 'activated', 'sludge', 'sediment', 'bacterial', 'floc']
['soil', 'small', 'pebble', 'settle', 'settle', 'tank', 'primary', 'treatment', 'sewage', 'constitute', 'primary', 'sludge', 'activate', 'sludge', 'consist', 'bacterial', 'floc', 'settle', 'tank', 'biological', 'treatment']
['sludge', 'sediment', 'form', 'settling', 'tank', 'treatment', 'sewage']
['activated', 'sludge', 'form', 'bacterial', 'floc', 'biological', 'treatment', 'primary', 'sludge', 'form', 'solid', 'like', 'soil', 'small', 'pebble', 'etc']
['primary', 'sludge', 'activate', 'sludge', 'different', 'kind', 'sediment', 'get', 'treatment', 'sewage', 'get', 'primary', 'treatment', 'sewage', 'form', 'biological', 'treatment']
['primary', 'sludge', 'form', 'prmary', 'treat




In [4]:
stsb_test.head(3)

Unnamed: 0,sentence1,sentence2,Jaccard_score
0,Primary sludge is pebbles and soil that settle...,Primary sludge is all solids like soil small p...,62.5
1,Primary sludge is all solids like soil small p...,Primary sludge is all solids like soil small p...,45.833333
2,Soil and small pebbles that settle down in set...,Primary sludge is all solids like soil small p...,65.384615


In [5]:
import tensorflow as tf
import tensorflow_hub as hub

# Load the pre-trained model
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    # Control GPU memory usage
    tf.config.experimental.set_memory_growth(gpu, True)

module_url = 'https://tfhub.dev/google/universal-sentence-encoder/4'  #getting the embeddings library from the google resources
model = hub.load(module_url)

# Generate Embeddings
sentence1_emb = model(stsb_test['sentence1']).numpy()
sentence2_emb = model(stsb_test['sentence2']).numpy()

# Cosine Similarity
stsb_test['USE_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)

In [6]:
from sentence_transformers import CrossEncoder

# Load the pre-trained model
model = CrossEncoder('cross-encoder/stsb-roberta-base')

sentence_pairs = []
for sentence1, sentence2 in zip(stsb_test['sentence1'], stsb_test['sentence2']):
    sentence_pairs.append([sentence1, sentence2])
    
stsb_test['SBERT CrossEncoder_score'] = model.predict(sentence_pairs, show_progress_bar=True)*100

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
stsb_test.head(11)

Unnamed: 0,sentence1,sentence2,Jaccard_score,USE_cosine_score,SBERT CrossEncoder_score
0,Primary sludge is pebbles and soil that settle...,Primary sludge is all solids like soil small p...,62.5,91.256187,91.715393
1,Primary sludge is all solids like soil small p...,Primary sludge is all solids like soil small p...,45.833333,75.014984,83.927162
2,Soil and small pebbles that settle down in set...,Primary sludge is all solids like soil small p...,65.384615,90.482994,87.938728
3,Sludge is the sediment formed in the settling ...,Primary sludge is all solids like soil small p...,20.833333,74.22747,73.264816
4,Activated sludge is formed by the bacterial fl...,Primary sludge is all solids like soil small p...,46.153846,74.176338,87.251129
5,Primary sludge and activated sludge are two di...,Primary sludge is all solids like soil small p...,25.806452,67.97551,63.016624
6,Primary sludge is formed during the prmary tre...,Primary sludge is all solids like soil small p...,25.925926,63.929581,67.615059
7,Solids like soil and small pebbles got during ...,Primary sludge is all solids like soil small p...,41.37931,78.719284,88.996712
8,Activated sludge is composed of bacterial sedi...,Primary sludge is all solids like soil small p...,50.0,74.536385,88.309601
9,Sediment formed during primary treatment of se...,Primary sludge is all solids like soil small p...,40.0,74.008919,87.102791
