# SEMANTIC TEXT SIMILARITY

In this notebook, I explore how to perform Semantic Text Similarity using BERT models.  

# Importing libraries

In [45]:
import numpy as np
import pandas as pd

import re
from tqdm import tqdm                    #For status bar display

import collections
    
from nltk.stem import WordNetLemmatizer  # For lemmatization of words
from nltk.corpus import stopwords        # Loading list of stopwords
from nltk import word_tokenize           # Converting paragraph in tokens

import pickle
import sys


import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Reading the given dataset

In [46]:
# Reading given data-set using pandas
text_data = pd.read_csv("Text_Similarity_Dataset.csv")
print("Shape of text_data : ", text_data.shape)
text_data.head(5)

Shape of text_data :  (4023, 3)


Unnamed: 0,Unique_ID,text1,text2
0,0,savvy searchers fail to spot ads internet sear...,newcastle 2-1 bolton kieron dyer smashed home ...
1,1,millions to miss out on the net by 2025 40% o...,nasdaq planning $100m share sale the owner of ...
2,2,young debut cut short by ginepri fifteen-year-...,ruddock backs yapp s credentials wales coach m...
3,3,diageo to buy us wine firm diageo the world s...,mci shares climb on takeover bid shares in us ...
4,4,be careful how you code a new european directi...,media gadgets get moving pocket-sized devices ...


Checking if the dataset has any null values and dropping those rows

In [47]:
text_data.isnull().sum() # Check if text data have any null values


Unique_ID    0
text1        0
text2        0
dtype: int64

In [48]:
text_data.dropna()

Unnamed: 0,Unique_ID,text1,text2
0,0,savvy searchers fail to spot ads internet sear...,newcastle 2-1 bolton kieron dyer smashed home ...
1,1,millions to miss out on the net by 2025 40% o...,nasdaq planning $100m share sale the owner of ...
2,2,young debut cut short by ginepri fifteen-year-...,ruddock backs yapp s credentials wales coach m...
3,3,diageo to buy us wine firm diageo the world s...,mci shares climb on takeover bid shares in us ...
4,4,be careful how you code a new european directi...,media gadgets get moving pocket-sized devices ...
...,...,...,...
4018,4018,labour plans maternity pay rise maternity pay ...,no seasonal lift for house market a swathe of ...
4019,4019,high fuel costs hit us airlines two of the lar...,new media battle for bafta awards the bbc lead...
4020,4020,britons growing digitally obese gadget lover...,film star fox behind theatre bid leading actor...
4021,4021,holmes is hit by hamstring injury kelly holmes...,tsunami to hit sri lanka banks sri lanka s b...


# Preprocessing Dataset

Expanding shortened phrases, removing stopwords, removing special symbols and converting all characters into lower case

In [49]:
def expansion(phrase):    #expanding shortened phrases and removing stopwords using a function
    phrase = re.sub(r"won't", "will not", phrase)       #expanding 'wont' to 'will not'
    phrase = re.sub(r"can\'t", "can not", phrase)       #expanding 'cant' to 'can not'
    phrase = re.sub(r"n\'t", " not", phrase)            #expanding suffix n't to not
    phrase = re.sub(r"\'re", " are", phrase)            #expanding suffix 're to are 
    phrase = re.sub(r"\'s", " is", phrase)              #expanding suffix 's to is 
    phrase = re.sub(r"\'d", " would", phrase)           #expanding suffix 'd to would 
    phrase = re.sub(r"\'ll", " will", phrase)           #expanding suffix 'll to will
    phrase = re.sub(r"\'ve", " have", phrase)           #expanding suffix 've to have
    phrase = re.sub(r"\'m", " am", phrase)              #expanding suffix 'm to am
    #I might have missed some shortened phrases. If so, they can easily be added here
    return phrase

Prepocessing first column of dataset, i.e. "text1" column

In [50]:
preprocessed_text1 = []    #Defining an alternate preprocessed text list for text1

for sentence in tqdm(text_data['text1'].values):
    sent = expansion(sentence)
    sent = sent.replace('\\r', ' ')
    sent = sent.replace('\\"', ' ')
    sent = sent.replace('\\n', ' ')
    sent = re.sub('[^A-Za-z0-9]+', ' ', sent)

    sent = ' '.join(e for e in sent.split() if e not in stopwords.words('english'))
    preprocessed_text1.append(sent.lower().strip())

100%|██████████| 4023/4023 [02:30<00:00, 26.82it/s]


In [70]:
# Merging preprocessed_text1 in place of text1 in text_data
text_data['text1'] = preprocessed_text1
text_data.head(5)

Unnamed: 0,Unique_ID,text1,text2
0,0,savvy searchers fail spot ads internet search ...,newcastle 2 1 bolton kieron dyer smashed home ...
1,1,millions miss net 2025 40 uk population still ...,nasdaq planning 100m share sale owner technolo...
2,2,young debut cut short ginepri fifteen year old...,ruddock backs yapp credentials wales coach mik...
3,3,diageo buy us wine firm diageo world biggest s...,mci shares climb takeover bid shares us phone ...
4,4,careful code new european directive could put ...,media gadgets get moving pocket sized devices ...


Prepocessing second column of dataset, i.e. "text2" column

In [52]:
preprocessed_text2 = []    #Defining an alternate preprocessed text list for text2

for sentence in tqdm(text_data['text2'].values):
    sent = decontracted(sentence)
    sent = sent.replace('\\r', ' ')
    sent = sent.replace('\\"', ' ')
    sent = sent.replace('\\n', ' ')
    sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
   
    sent = ' '.join(e for e in sent.split() if e not in stopwords.words('english'))
    preprocessed_text2.append(sent.lower().strip())

100%|██████████| 4023/4023 [02:29<00:00, 26.85it/s]


In [71]:
# Merging preprocessed_text2 in place of text2 in text_data
text_data['text2'] = preprocessed_text2
text_data.head(5)

Unnamed: 0,Unique_ID,text1,text2
0,0,savvy searchers fail spot ads internet search ...,newcastle 2 1 bolton kieron dyer smashed home ...
1,1,millions miss net 2025 40 uk population still ...,nasdaq planning 100m share sale owner technolo...
2,2,young debut cut short ginepri fifteen year old...,ruddock backs yapp credentials wales coach mik...
3,3,diageo buy us wine firm diageo world biggest s...,mci shares climb takeover bid shares us phone ...
4,4,careful code new european directive could put ...,media gadgets get moving pocket sized devices ...


#Word tokenizer

We now build a function which tokenizes and lemmatizes the sentences into words. However this is not used in the actual model.

In [54]:
def word_tokenizer(text):
            #tokenizes, stems and lemmatizes the text. Though this will not be used in this code
            tokens = word_tokenize(text)
            lemmatizer = WordNetLemmatizer() 
            tokens = [lemmatizer.lemmatize(t) for t in tokens]
            return tokens

# Semantic Text Similarity Models

Pip-installing semantic text similarity

In [55]:
!pip install semantic-text-similarity



Importing Web BERT Similarity and Clinical BERT Similarity models

In [56]:
from semantic_text_similarity.models import WebBertSimilarity          #Directly importing Web Bert Similarity model
from semantic_text_similarity.models import ClinicalBertSimilarity     #Directly importing Clinical Bert Similarity model

web_model = WebBertSimilarity(device='cpu', batch_size=10)             #defaults to GPU prediction
clinical_model = ClinicalBertSimilarity(device='cuda', batch_size=10)  #defaults to GPU prediction


# Implementing the Web BERT Similarity Model

In [57]:
#WEB BERT SIMILARITY MODEL

nltk.download('punkt')
nltk.download('wordnet')

similarity = []      #Defining an empty list to store the final similarity values. 

for indices in text_data.index:
    
        s1 = text_data['text1'][indices]
        s2 = text_data['text2'][indices]
        s1words = word_tokenizer(s1)
        s2words = word_tokenizer(s2)

        if s1==s2:
           similarity.append(1.0)        # 1 means highly similar

        if len(s1 or s2)==0:            #If length of sentences in either of the datasets is zero, then ignore them. But we already dropped all null rows, so no need to worry.
            similarity.append(0.0)       # 0 means highly dissimilar

        else:   
            similarity.append(web_model.predict([(s1, s2)])) #1 corresponds to highly similar sentences and 0 corresponds to highly dissimilar sentences

                

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [60]:
# Getting Unique_ID and similarity
similarity.pop()              # List 'similarity' has 4024 values. One extra value added at the end due to the for loop
final_score1 = pd.DataFrame({'Unique_ID':text_data.Unique_ID, 'Similarity_score':similarity})   #Creating a new dataframe with Unique ID and Similarity score
final_score1.head(5)

Unnamed: 0,Unique_ID,Similarity_score
0,0,[0.24079262]
1,1,[0.78855926]
2,2,[0.5916782]
3,3,[0.99573034]
4,4,[0.90157884]


In [62]:
# Saving dataframe as CSV file 
final_score1.to_csv('Finalscore_WebBert.csv',index=False)

# Implementing the Clinical BERT Similarity Model

In [63]:
#CLINICAL BERT SIMILARITY MODEL

nltk.download('punkt')
nltk.download('wordnet')

similarity = []
for ind in text_data.index:
    
        s1 = text_data['text1'][ind]
        s2 = text_data['text2'][ind]
        
        if s1==s2:
           similarity.append(1.0) # 1 means highly similar

        if len(s1words and s2words)==0:
            similarity.append(0.0) 

        else:   

            s1words = word_tokenizer(s1)
            s2words = word_tokenizer(s2)
            similarity.append(clinical_model.predict([(s1, s2)])) # as it is given 1 means highly similar & 0 means highly dissimilar

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [65]:
# Getting Unique_ID and similarity for Clinical Bert Similarity Model
similarity.pop()              # List 'similarity' has 4024 values. One extra value added at the end due to the for loop
final_score2 = pd.DataFrame({'Unique_ID':text_data.Unique_ID, 'Similarity_score':similarity})       #Creating a new dataframe with Unique ID and Similarity score
final_score2.head(5)

Unnamed: 0,Unique_ID,Similarity_score
0,0,[0.6166725]
1,1,[0.8093727]
2,2,[1.2970107]
3,3,[1.4599866]
4,4,[0.762307]


In [69]:
# Saving dataframe as CSV file 
final_score2.to_csv('Finalscore_ClinicalBert.csv',index=False)

# Conclusion

Here I have implemented the Web BERT Similarity Model and Clinical BERT Similarity Model. Both give very different results. More explained in report. 