# **Semantic Textual Similarity**


In [1]:
# Colab-only: mount Google Drive at /content/gdrive so files stored there can be read.
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [2]:
# IMPORTS

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
import seaborn as sns
from nltk.stem import WordNetLemmatizer 
from wordcloud import WordCloud

import scipy.spatial
import tensorflow as tf
import tensorflow_hub as hub

In [3]:
# Load the text-pair dataset from the mounted Drive folder.
data = pd.read_csv('/content/gdrive/MyDrive/Text_Similarity_Dataset.csv')

In [4]:
# Preview the first rows of the raw dataset.
data.head()

Unnamed: 0,Unique_ID,text1,text2
0,0,savvy searchers fail to spot ads internet sear...,newcastle 2-1 bolton kieron dyer smashed home ...
1,1,millions to miss out on the net by 2025 40% o...,nasdaq planning $100m share sale the owner of ...
2,2,young debut cut short by ginepri fifteen-year-...,ruddock backs yapp s credentials wales coach m...
3,3,diageo to buy us wine firm diageo the world s...,mci shares climb on takeover bid shares in us ...
4,4,be careful how you code a new european directi...,media gadgets get moving pocket-sized devices ...


In [5]:
# Count missing values per column.
data.isnull().sum()

Unique_ID    0
text1        0
text2        0
dtype: int64

### Data Pre-processing

In [6]:
# (rows, columns) of the raw dataset.
data.shape

(4023, 3)

In [7]:
# Contraction -> expanded form, looked up per lower-cased, whitespace-separated token.
# Based on the list at
# http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
# "would've" and "you've" are included so the "...'ve" family is covered consistently
# with could've/should've/might've/must've and i've/we've/they've.
contractions = {
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"would've": "would have",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are",
"you've": "you have"
}

In [8]:
def clean_text(text, remove_stopwords):
    '''Normalise a raw document before it is embedded.

    Steps, in order: lower-case, expand contractions via the module-level
    ``contractions`` mapping, strip URLs / HTML remnants / punctuation,
    lemmatize every token, and optionally drop English stop words.

    Args:
        text: raw string to clean.
        remove_stopwords: when truthy, NLTK English stop words are removed
            after lemmatization.

    Returns:
        The cleaned text as a single space-separated string.

    Note:
        Relies on NLTK's tokenizer, WordNet and (when ``remove_stopwords`` is
        set) stop-word data being available.
    '''

    # Convert words to lower case
    text = text.lower()

    # Replace contractions with their longer forms. Lookup is per
    # whitespace-separated token; joining with single spaces also collapses
    # every newline/tab, so the text is one line from here on.
    text = " ".join(contractions.get(word, word) for word in text.split())

    # Remove URLs. Only the URL token itself is dropped: because the text is a
    # single line at this point, a greedy ".*" would delete everything that
    # follows the first URL.
    text = re.sub(r'https?://\S+', '', text)

    # Remove HTML remnants. "<br />" must be handled before the punctuation
    # pass below, which turns "/" into a space and would make it unmatchable.
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'&amp;', '', text)

    # Remove special characters
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # Lemmatization. lemmatize() is called without a POS tag, so plural-looking
    # tokens are trimmed regardless of their role (the notebook's sample output
    # shows e.g. "us" -> "u" and "wales" -> "wale").
    lemmatizer = WordNetLemmatizer()
    word_list = nltk.word_tokenize(text)
    text = ' '.join([lemmatizer.lemmatize(w) for w in word_list])

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = " ".join(w for w in text.split() if w not in stops)

    return text

In [13]:
# Fetch the NLTK resources used by clean_text: the stop-word list,
# the punkt tokenizer models and the WordNet corpus for the lemmatizer.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [14]:
# data cleaning

# Every cell is cast to str and cleaned; stop words are kept
# (remove_stopwords=False) because stop words might bear data.
Clean_text1 = [clean_text(str(raw), remove_stopwords=False) for raw in data.text1]
Clean_text2 = [clean_text(str(raw), remove_stopwords=False) for raw in data.text2]

In [15]:
# Overwrite the raw text columns with their cleaned versions.
# NOTE: this mutates `data` in place, so re-running the cleaning cell after this
# one would clean already-cleaned text rather than the original CSV contents.
data['text1']= Clean_text1
data['text2']= Clean_text2

In [16]:
# Keep the cleaned data in its own frame. A plain `sim = data` would only alias
# the same object, so columns added to `sim` later would also appear on `data`.
sim = data.copy()    # cleaned data in sim df
sim.head()

Unnamed: 0,Unique_ID,text1,text2
0,0,savvy searcher fail to spot ad internet search...,newcastle 2 1 bolton kieron dyer smashed home ...
1,1,million to miss out on the net by 2025 40 of t...,nasdaq planning 100m share sale the owner of t...
2,2,young debut cut short by ginepri fifteen year ...,ruddock back yapp s credential wale coach mike...
3,3,diageo to buy u wine firm diageo the world s b...,mci share climb on takeover bid share in u pho...
4,4,be careful how you code a new european directi...,medium gadget get moving pocket sized device t...


In [17]:
# duplicate check: print the row number of every pair whose two cleaned texts
# are identical, followed by the number of such pairs.
dup = 0
for i, (left, right) in enumerate(zip(sim['text1'], sim['text2'])):
    if left == right:
        print(i)
        dup += 1
print(dup)

3403
1


In [18]:
# Cleaned text1 of row 3403, the row reported by the duplicate check.
sim.text1[3403]

'holmes start 2005 with gb event kelly holmes will start 2005 with a series of race in britain holmes will make her first track appearance on home soil since winning double olympic gold in january s norwich union international in glasgow she will also run in the grand prix in birmingham in february and may defend her indoor aaa 800m title in sheffield earlier that month i am still competitive and still want to win she said i m an athlete and i can t wait to get back on the track she added these event are also a great opportunity to thank the british public for the enormous level of support they have given me from the moment i stepped off that plane from greece the glasgow meeting will see holmes compete over 1500m in a five way match against sweden france russia and italy'

In [19]:
# Cleaned text2 of row 3403, for comparison with text1 of the same row.
sim.text2[3403]

'holmes start 2005 with gb event kelly holmes will start 2005 with a series of race in britain holmes will make her first track appearance on home soil since winning double olympic gold in january s norwich union international in glasgow she will also run in the grand prix in birmingham in february and may defend her indoor aaa 800m title in sheffield earlier that month i am still competitive and still want to win she said i m an athlete and i can t wait to get back on the track she added these event are also a great opportunity to thank the british public for the enormous level of support they have given me from the moment i stepped off that plane from greece the glasgow meeting will see holmes compete over 1500m in a five way match against sweden france russia and italy'

### Using Google Universal Sentence Encoder 

In [20]:
# Load the Universal Sentence Encoder (large, version 5) from TensorFlow Hub.
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5" 
model = hub.load(module_url)

In [21]:
def embed(input):
  """Run the loaded encoder `model` on `input` and return whatever it produces."""
  embeddings = model(input)
  return embeddings

In [22]:
def similarity_measure(input):
  """Cosine similarity between the embeddings of the first two items of `input`.

  `input` is passed to `embed`; only elements 0 and 1 of the result are used.
  Returns a one-element array (first row of the 1x1 cdist result), so callers
  index it with [0] to obtain the scalar score.
  """
  vectors = embed(input)
  first, second = [vectors[0]], [vectors[1]]
  pairwise = scipy.spatial.distance.cdist(first, second, "cosine")
  # cosine similarity = 1 - cosine distance
  return 1 - pairwise[0]

In [23]:
# Shape check on a single sentence: a hand-written, stop-word-free variant of the
# duplicated article (row 3403). It is only used to inspect the embedding shape.
sentence = "holmes starts 2005 gb events kelly holmes start 2005 series races britain holmes make first track appearancehome soil since winning double olympic gold january norwich union international glasgow also run grand prix birmingham february may defend indoor aaa 800m title sheffield earlier month still competitive still want win said athlete wait get back track added events also great opportunity thank british public enormous levels support given moment stepped plane greece glasgow meeting see holmes compete 1500m fiveway match sweden france russia italy"
message_embeddings = embed([sentence])
message_embeddings.shape  # One input sentence yields one 512-dimensional vector.

TensorShape([1, 512])

In [24]:
# Score the known-duplicate pair (row 3403). The pair is built under its own
# name so the built-in `input` is not rebound in the notebook namespace.
text_pair = [sim.text1[3403], sim.text2[3403]]
s_em = similarity_measure(text_pair)[0]

In [25]:
# Similarity of the duplicated pair; identical texts should score (almost exactly) 1.
s_em

0.999999999999964

In [26]:
# For given data

# One similarity score per row, in row order. Iterating the two columns with
# zip avoids label lookups driven by a positional counter, and avoids rebinding
# the built-in `input` or leaving scratch globals behind.
sim_score = [
    similarity_measure([first_text, second_text])[0]
    for first_text, second_text in zip(sim.text1, sim.text2)
]

In [27]:
# Sanity check: there should be one score per dataset row.
len(sim_score)

4023

In [28]:
# Attach the scores to the frame as a new column.
sim['sim_score'] = sim_score

In [34]:
# Inspect the most similar pairs: rows whose score is at least 0.8.
sim.loc[sim['sim_score']>= 0.8]

Unnamed: 0,Unique_ID,text1,text2,sim_score
284,284,brown call for £5 5bn aid fund gordon brown ha...,brown to outline presidency goal next year wil...,0.855064
2284,2284,dvd copy protection strengthened dvd will be h...,dvd copy protection strengthened dvd will be h...,0.986489
2712,2712,learning to love broadband we are reaching the...,tv s future down the phone line internet tv ha...,0.811203
3013,3013,call for kenteris to be cleared kostas kenteri...,iaaf to rule on greek sprint pair greek sprint...,0.836589
3056,3056,howard dismisses tory tax fear michael howard ...,defection timed to hit tax pledge with impecca...,0.871753
3403,3403,holmes start 2005 with gb event kelly holmes w...,holmes start 2005 with gb event kelly holmes w...,1.0
3421,3421,blair dismisses quit claim report tony blair h...,fox attack blair s tory lie tony blair lied wh...,0.82375
3552,3552,hunt demo at labour meeting pro hunt supporter...,final hunt held a ban loom hunt in england and...,0.85293
3827,3827,newcastle 2 1 bolton kieron dyer smashed home ...,arsenal through on penalty arsenal win 4 2 on ...,0.806961
3859,3859,troubled marsh under sec scrutiny the u stock ...,marsh executive in guilty plea an executive at...,0.893727


In [30]:
# Keep only the identifier and the score for the submission file.
submission_columns = ["Unique_ID", "sim_score"]
submission = sim[submission_columns]
submission.head()

Unnamed: 0,Unique_ID,sim_score
0,0,0.416169
1,1,0.251416
2,2,0.513111
3,3,0.355525
4,4,0.638504


In [35]:
# Colab-only helper used to trigger a browser download of a file.
from google.colab import files

# Write the submission as UTF-8 CSV without the index column, then download it.
submission.to_csv("submission.csv", encoding='utf-8', index=False)
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>