# Text Similarity using Python

In [12]:
import numpy as np
import math
import re

Sample1 = The easiest way to earn points with Fetch Rewards is to just shop for the products you already love. If you haveany participating brands on your receipt, you'll get points based on the cost of the products. You don't need toclip any coupons or scan individual barcodes. Just scan each grocery receipt after you shop and we'll find thesavings for you.

In [2]:
sample1 = "The easiest way to earn points with Fetch Rewards is to just shop for the products you already love. If you have any participating brands on your receipt, you'll get points based on the cost of the products. You don't need to clip any coupons or scan individual barcodes. Just scan each grocery receipt after you shop and we'll find the savings for you."

Sample2 = The easiest way to earn points with Fetch Rewards is to just shop for the items you already buy. If you have anyeligible brands on your receipt, you will get points based on the total cost of the products. You do not need to cutout any coupons or scan individual UPCs. Just scan your receipt after you check out and we will find the savingsfor you.

In [3]:
sample2 = "The easiest way to earn points with Fetch Rewards is to just shop for the items you already buy. If you have any eligible brands on your receipt, you will get points based on the total cost of the products. You do not need to cut out any coupons or scan individual UPCs. Just scan your receipt after you check out and we will find the savings for you."

Sample3 = We are always looking for opportunities for you to earn more points, which is why we also give you a selectionof Special Offers. These Special Offers are opportunities to earn bonus points on top of the regular points youearn every time you purchase a participating brand. No need to pre-select these offers, we'll give you the pointswhether or not you knew about the offer. We just think it is easier that way.

In [4]:
sample3 = "We are always looking for opportunities for you to earn more points, which is why we also give you a selection of Special Offers. These Special Offers are opportunities to earn bonus points on top of the regular points you earn every time you purchase a participating brand. No need to pre-select these offers, we'll give you the points whether or not you knew about the offer. We just think it is easier that way."

## Data Preprocessing

### Tokenization

Tokenization in sents and words in lower case

In [5]:
def decontracted(text):
    text = re.sub(r"won\'t", "will not", text)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    return text

In [6]:
def tokenize_sents(text, stop_char="."):
    return [x.lower()+  ' ' + stop_char + ''  for x in text.split(stop_char) if x!=""]

In [7]:
def tokenize_words_from_sent(sents, stop_words=[], punct=[]):
    return [x.lower() for sent in sents for x in sent.split()  if (x != "" and x not in punct and x not in stop_words)]

In [8]:
stop_words = ['the', 'a', 'an', 'is', 'are', 'will', 'has', 'have', 'had', 'and', 'or', 'we', 'you', 'to', 'with', 'on', 'your', 'for', 'of', 'this', 'that', 'those', 'these', 'because', 'it']

In [9]:
punct = ['.',':',',','!',';','\'','\"','(',')']

In [13]:
sent1 = tokenize_sents(decontracted(sample1))
print(sent1)

['the easiest way to earn points with fetch rewards is to just shop for the products you already love .', ' if you have any participating brands on your receipt, you will get points based on the cost of the products .', ' you do not need to clip any coupons or scan individual barcodes .', ' just scan each grocery receipt after you shop and we will find the savings for you .']


In [14]:
words1 = tokenize_words_from_sent(sent1)
print(words1)

['the', 'easiest', 'way', 'to', 'earn', 'points', 'with', 'fetch', 'rewards', 'is', 'to', 'just', 'shop', 'for', 'the', 'products', 'you', 'already', 'love', '.', 'if', 'you', 'have', 'any', 'participating', 'brands', 'on', 'your', 'receipt,', 'you', 'will', 'get', 'points', 'based', 'on', 'the', 'cost', 'of', 'the', 'products', '.', 'you', 'do', 'not', 'need', 'to', 'clip', 'any', 'coupons', 'or', 'scan', 'individual', 'barcodes', '.', 'just', 'scan', 'each', 'grocery', 'receipt', 'after', 'you', 'shop', 'and', 'we', 'will', 'find', 'the', 'savings', 'for', 'you', '.']


In [15]:
sent2 = tokenize_sents(decontracted(sample2))
print(sent2)

['the easiest way to earn points with fetch rewards is to just shop for the items you already buy .', ' if you have any eligible brands on your receipt, you will get points based on the total cost of the products .', ' you do not need to cut out any coupons or scan individual upcs .', ' just scan your receipt after you check out and we will find the savings for you .']


In [16]:
words2 = tokenize_words_from_sent(sent2)
print(words2)

['the', 'easiest', 'way', 'to', 'earn', 'points', 'with', 'fetch', 'rewards', 'is', 'to', 'just', 'shop', 'for', 'the', 'items', 'you', 'already', 'buy', '.', 'if', 'you', 'have', 'any', 'eligible', 'brands', 'on', 'your', 'receipt,', 'you', 'will', 'get', 'points', 'based', 'on', 'the', 'total', 'cost', 'of', 'the', 'products', '.', 'you', 'do', 'not', 'need', 'to', 'cut', 'out', 'any', 'coupons', 'or', 'scan', 'individual', 'upcs', '.', 'just', 'scan', 'your', 'receipt', 'after', 'you', 'check', 'out', 'and', 'we', 'will', 'find', 'the', 'savings', 'for', 'you', '.']


In [17]:
sent3 = sample3.split(".")
print(sent3)

['We are always looking for opportunities for you to earn more points, which is why we also give you a selection of Special Offers', ' These Special Offers are opportunities to earn bonus points on top of the regular points you earn every time you purchase a participating brand', " No need to pre-select these offers, we'll give you the points whether or not you knew about the offer", ' We just think it is easier that way', '']


In [18]:
sent3 = tokenize_sents(decontracted(sample3))
print(sent3)

['we are always looking for opportunities for you to earn more points, which is why we also give you a selection of special offers .', ' these special offers are opportunities to earn bonus points on top of the regular points you earn every time you purchase a participating brand .', ' no need to pre-select these offers, we will give you the points whether or not you knew about the offer .', ' we just think it is easier that way .']


In [19]:
words3 = tokenize_words_from_sent(sent3)
print(words3)

['we', 'are', 'always', 'looking', 'for', 'opportunities', 'for', 'you', 'to', 'earn', 'more', 'points,', 'which', 'is', 'why', 'we', 'also', 'give', 'you', 'a', 'selection', 'of', 'special', 'offers', '.', 'these', 'special', 'offers', 'are', 'opportunities', 'to', 'earn', 'bonus', 'points', 'on', 'top', 'of', 'the', 'regular', 'points', 'you', 'earn', 'every', 'time', 'you', 'purchase', 'a', 'participating', 'brand', '.', 'no', 'need', 'to', 'pre-select', 'these', 'offers,', 'we', 'will', 'give', 'you', 'the', 'points', 'whether', 'or', 'not', 'you', 'knew', 'about', 'the', 'offer', '.', 'we', 'just', 'think', 'it', 'is', 'easier', 'that', 'way', '.']


In [20]:
samples = [sample1,sample2,sample3]

In [21]:
sample1 == sample2

False

## Preparing Vocabulary and Indexing

In [22]:
from collections import Counter
all_words = words1+words2

word_counts = Counter(all_words)
sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
int_to_vocab = {ii: word for ii, word in enumerate(sorted_vocab)}
vocab_to_int = {word: ii for ii, word in int_to_vocab.items()}

In [23]:
print(word_counts)
print("*********************")
print(sorted_vocab)
print("*********************")
print(int_to_vocab)
print("*********************")
print(vocab_to_int)
print("*********************")

Counter({'you': 12, 'the': 10, '.': 8, 'to': 6, 'points': 4, 'just': 4, 'for': 4, 'any': 4, 'on': 4, 'will': 4, 'scan': 4, 'shop': 3, 'products': 3, 'your': 3, 'easiest': 2, 'way': 2, 'earn': 2, 'with': 2, 'fetch': 2, 'rewards': 2, 'is': 2, 'already': 2, 'if': 2, 'have': 2, 'brands': 2, 'receipt,': 2, 'get': 2, 'based': 2, 'cost': 2, 'of': 2, 'do': 2, 'not': 2, 'need': 2, 'coupons': 2, 'or': 2, 'individual': 2, 'receipt': 2, 'after': 2, 'and': 2, 'we': 2, 'find': 2, 'savings': 2, 'out': 2, 'love': 1, 'participating': 1, 'clip': 1, 'barcodes': 1, 'each': 1, 'grocery': 1, 'items': 1, 'buy': 1, 'eligible': 1, 'total': 1, 'cut': 1, 'upcs': 1, 'check': 1})
*********************
['you', 'the', '.', 'to', 'points', 'just', 'for', 'any', 'on', 'will', 'scan', 'shop', 'products', 'your', 'easiest', 'way', 'earn', 'with', 'fetch', 'rewards', 'is', 'already', 'if', 'have', 'brands', 'receipt,', 'get', 'based', 'cost', 'of', 'do', 'not', 'need', 'coupons', 'or', 'individual', 'receipt', 'after', '

## Prepare Word Vectors using TF-IDF Embeddings

In [24]:
word_tf_1 = np.zeros(len(sorted_vocab),dtype=float)
word_tf_2 = np.zeros(len(sorted_vocab),dtype=float)

In [25]:
for word,count in Counter(words1).items():
    word_tf_1[vocab_to_int[word]] = count/len(words1)
print(word_tf_1)  

[0.08450704 0.07042254 0.05633803 0.04225352 0.02816901 0.02816901
 0.02816901 0.02816901 0.02816901 0.02816901 0.02816901 0.02816901
 0.02816901 0.01408451 0.01408451 0.01408451 0.01408451 0.01408451
 0.01408451 0.01408451 0.01408451 0.01408451 0.01408451 0.01408451
 0.01408451 0.01408451 0.01408451 0.01408451 0.01408451 0.01408451
 0.01408451 0.01408451 0.01408451 0.01408451 0.01408451 0.01408451
 0.01408451 0.01408451 0.01408451 0.01408451 0.01408451 0.01408451
 0.         0.01408451 0.01408451 0.01408451 0.01408451 0.01408451
 0.01408451 0.         0.         0.         0.         0.
 0.         0.        ]


In [26]:
for word,count in Counter(words2).items():
    word_tf_2[vocab_to_int[word]] = count/len(words2)
print(word_tf_2)  

[0.08219178 0.06849315 0.05479452 0.04109589 0.02739726 0.02739726
 0.02739726 0.02739726 0.02739726 0.02739726 0.02739726 0.01369863
 0.01369863 0.02739726 0.01369863 0.01369863 0.01369863 0.01369863
 0.01369863 0.01369863 0.01369863 0.01369863 0.01369863 0.01369863
 0.01369863 0.01369863 0.01369863 0.01369863 0.01369863 0.01369863
 0.01369863 0.01369863 0.01369863 0.01369863 0.01369863 0.01369863
 0.01369863 0.01369863 0.01369863 0.01369863 0.01369863 0.01369863
 0.02739726 0.         0.         0.         0.         0.
 0.         0.01369863 0.01369863 0.01369863 0.01369863 0.01369863
 0.01369863 0.01369863]


In [27]:
word_idf = np.zeros(len(sorted_vocab),dtype=float)

In [28]:
N = 2
for word in sorted_vocab:
    n_docs = 0
    for doc in [words1, words2]:
        if word in doc:
            n_docs += 1
    word_idf[vocab_to_int[word]] = N/n_docs
print(word_idf)

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 2. 2.]


In [29]:
tfidf_1 = [tf*idf for tf, idf in zip(word_tf_1, word_idf)]
tfidf_2 = [tf*idf for tf, idf in zip(word_tf_2, word_idf)]

In [30]:
print(tfidf_1, tfidf_2)

[0.08450704225352113, 0.07042253521126761, 0.056338028169014086, 0.04225352112676056, 0.028169014084507043, 0.028169014084507043, 0.028169014084507043, 0.028169014084507043, 0.028169014084507043, 0.028169014084507043, 0.028169014084507043, 0.028169014084507043, 0.028169014084507043, 0.014084507042253521, 0.014084507042253521, 0.014084507042253521, 0.014084507042253521, 0.014084507042253521, 0.014084507042253521, 0.014084507042253521, 0.014084507042253521, 0.014084507042253521, 0.014084507042253521, 0.014084507042253521, 0.014084507042253521, 0.014084507042253521, 0.014084507042253521, 0.014084507042253521, 0.014084507042253521, 0.014084507042253521, 0.014084507042253521, 0.014084507042253521, 0.014084507042253521, 0.014084507042253521, 0.014084507042253521, 0.014084507042253521, 0.014084507042253521, 0.014084507042253521, 0.014084507042253521, 0.014084507042253521, 0.014084507042253521, 0.014084507042253521, 0.0, 0.028169014084507043, 0.028169014084507043, 0.028169014084507043, 0.02816

## Jaccard Similarity

- IoU = (Intersection of two lists) / (Union of two lists)
- This can be used for two word lists where the IoU gives the similarity of all the words.


In [34]:
def jaccard_sim(words1, words2):
    l1 = len(words1)
    l2 = len(words2)
    inter = list(set(words1) & set(words2))

    print(l1,l2,inter, len(inter))

    iou = len(inter) / (l1+l2-len(inter))
    return iou

In [37]:
print(jaccard_sim(words1, words2))

71 73 ['if', 'you', 'after', 'individual', 'find', 'rewards', 'any', 'savings', 'of', 'receipt', 'the', 'and', 'for', 'based', 'do', 'receipt,', 'have', 'points', 'way', 'earn', 'just', 'to', 'products', 'already', 'brands', 'or', 'with', 'we', 'is', 'will', 'coupons', 'shop', 'cost', '.', 'your', 'easiest', 'scan', 'on', 'not', 'get', 'fetch', 'need'] 42
0.4117647058823529


## Cosine Similarity

- After converting words to vectors, we can compute the similarity between the vectors by getting the cosine of the angle between them.
- The dot product of two vectors gives us the cosine of the angle between them.
- The dot product is calculated as (sum of products of individual vector components) / (product of magnitudes of two vectors)

In [38]:
def cosine_similarity(x,y):
    numerator = sum(a*b for a,b in zip(x,y))
    denominator = square_rooted(x)*square_rooted(y)
    return round(numerator/float(denominator),3)

In [39]:
from math import *
def square_rooted(x):
    return round(sqrt(sum([a*a for a in x])),3)

In [40]:
print(cosine_similarity(tfidf_1,tfidf_2))

0.808


## Euclidian Distance

- It is nothng but the direct distance between the two points in space.
- As we have word vectors, we can compute the distance between each vector component using distance formula.
- euclidian distance = sq.root(sum over all components i (xi - yi)^2)
- This gives the direct distance but we need a metric in 0 to 1.
- This is achieved by introducing a threshold value.
- The similarity = (threshold - distance) / threshold

In [45]:
def euclidean_distance(x,y):
    dist = round(sqrt(sum(pow(a-b,2) for a, b in zip(x, y))),3)
    
    thresh = 10
    if dist > thresh:
        return 0
    else:
        return (thresh - dist)/thresh

In [46]:
print(euclidean_distance(tfidf_1,tfidf_2))

0.9883
