In [1]:
import tensorflow as tf
import pandas as pd
import tensorflow_hub as hub
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Load the Universal Sentence Encoder (large, v5) from TensorFlow Hub.
# The returned object is called directly on a list of strings further down.
USE_MODEL_URL = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
embed = hub.load(USE_MODEL_URL)

In [3]:
# Load the text-pair dataset; later cells read its 'text1' and 'text2' columns.
DATASET_CSV = 'Precily_Text_Similarity.csv'
data = pd.read_csv(DATASET_CSV)
# Bare last expression so the notebook renders a preview of the frame.
data

Unnamed: 0,text1,text2
0,broadband challenges tv viewing the number of ...,gardener wins double in glasgow britain s jaso...
1,rap boss arrested over drug find rap mogul mar...,amnesty chief laments war failure the lack of ...
2,player burn-out worries robinson england coach...,hanks greeted at wintry premiere hollywood sta...
3,hearts of oak 3-2 cotonsport hearts of oak set...,redford s vision of sundance despite sporting ...
4,sir paul rocks super bowl crowds sir paul mcca...,mauresmo opens with victory in la amelie maure...
...,...,...
2995,uk directors guild nominees named martin scors...,steel firm to cut 45 000 jobs mittal steel ...
2996,u2 to play at grammy awards show irish rock ba...,israel looks to us for bank chief israel has a...
2997,pountney handed ban and fine northampton coach...,india and iran in gas export deal india has si...
2998,belle named best scottish band belle & sebas...,mido makes third apology ahmed mido hossam h...


In [4]:
# Preprocess the text data
def preprocess_text(text):
    """Return ``text`` converted to lowercase.

    Args:
        text: The string to normalise (must provide ``.lower()``).

    Returns:
        The lowercased copy of ``text``.
    """
    return text.lower()

In [5]:

# Function to calculate cosine similarity between two vectors
def cosine_similarity(u, v):
    """Compute the cosine similarity between two vectors.

    Args:
        u: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        v: Second vector, same length as ``u``.

    Returns:
        The cosine of the angle between ``u`` and ``v``, clipped to
        ``[-1, 1]``. If either vector has zero norm the similarity is
        undefined; ``0.0`` is returned in that case instead of NaN.
    """
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    # A zero-length vector has no direction: avoid the 0/0 division that
    # would otherwise yield NaN (and a RuntimeWarning).
    if norm_product == 0:
        return 0.0
    # Floating-point rounding can push the ratio marginally outside [-1, 1];
    # clip so downstream rescaling stays within its documented range.
    return np.clip(np.dot(u, v) / norm_product, -1.0, 1.0)

In [6]:
def semantic_similarity(text1, text2):
    """Score how semantically similar two texts are, on a 0-to-1 scale.

    Both texts are embedded in a single call to the module-level ``embed``
    model, the cosine similarity of the two embeddings is computed, and the
    result is rescaled from the cosine range [-1, 1] to [0, 1].

    Args:
        text1: First text (presumably a string -- verify against the data).
        text2: Second text.

    Returns:
        The rescaled similarity score.
    """
    vectors = embed([text1, text2])
    raw_score = cosine_similarity(vectors[0], vectors[1])
    # Affine map: -1 -> 0, 0 -> 0.5, 1 -> 1.
    return (raw_score + 1) / 2

In [7]:
# Score every (text1, text2) pair row by row and store the result in a new
# 'semantic_similarity' column on the same DataFrame.
def _pair_score(row):
    """Return the semantic similarity of one row's text1/text2 pair."""
    return semantic_similarity(row['text1'], row['text2'])

data['semantic_similarity'] = data.apply(_pair_score, axis=1)

In [8]:
# Show both texts next to their computed similarity score.
result_columns = ['text1', 'text2', 'semantic_similarity']
print(data[result_columns])

                                                  text1  \
0     broadband challenges tv viewing the number of ...   
1     rap boss arrested over drug find rap mogul mar...   
2     player burn-out worries robinson england coach...   
3     hearts of oak 3-2 cotonsport hearts of oak set...   
4     sir paul rocks super bowl crowds sir paul mcca...   
...                                                 ...   
2995  uk directors guild nominees named martin scors...   
2996  u2 to play at grammy awards show irish rock ba...   
2997  pountney handed ban and fine northampton coach...   
2998  belle named  best scottish band  belle & sebas...   
2999  criminal probe on citigroup deals traders at u...   

                                                  text2  semantic_similarity  
0     gardener wins double in glasgow britain s jaso...             0.674218  
1     amnesty chief laments war failure the lack of ...             0.657416  
2     hanks greeted at wintry premiere hollywood sta..

In [10]:
# Persist the scored DataFrame to an HDF5 file under the key 'data'.
# mode='w' starts a fresh file, replacing any existing one at this path.
OUTPUT_H5 = 'semantic_similarity_scores.h5'
data.to_hdf(OUTPUT_H5, key='data', mode='w')
