# Document Similarity-Embedding Methods

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial import distance

In [0]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import re
from nltk.corpus import  stopwords
import nltk

In [0]:
# Example sentence pair compared in the sections below.
str1 = 'The president greets the press in Chicago'
str2 = 'Obama speaks to the media in Illinois'

## Text Similarity using Count Vectorizer and Cosine Similarity

###  Count Vectorizer

In [0]:
# Drop English stop words ("the", "in", "to") so only content words are counted.
# Without this, both sentences share "the" and "in", which yields an 11-term
# vocabulary and a non-zero cosine similarity instead of the 8-term matrix and
# 0.0 similarity recorded in this notebook's outputs.
vectorizer = CountVectorizer(stop_words='english')

In [0]:
# Corpus handed to the vectorizer: one entry per sentence.
s=[str1,str2]

In [34]:
# Learn the vocabulary from both sentences and build their term-count matrix.
all_vector=vectorizer.fit_transform(s)
# Show the dense form: one row per sentence, one column per vocabulary term.
all_vector.toarray()

array([[1, 1, 0, 0, 0, 1, 1, 0],
       [0, 0, 1, 1, 1, 0, 0, 1]])

### Finding Cosine Similarity

In [0]:
# Densify once and unpack the first two rows (sentence 1, sentence 2).
v_a, v_b = all_vector.toarray()[:2]

In [36]:
# cosine_similarity works on 2-D inputs, so each 1-D vector is wrapped as a
# single-row batch; the result is a 1x1 matrix.
cosine_sim = cosine_similarity([v_a], [v_b])
print('Similarity of two sentences are equal to', cosine_sim[0, 0] * 100, '%')

Similarity of two sentences are equal to 0.0 %


In [0]:
# Count vectors of the two sentences, taken from a single dense conversion.
v1, v2 = all_vector.toarray()[:2]

### Finding Cosine Distance

#### Cosine_distance = 1 - cosine_similarity

In [70]:
# SciPy cosine distance between the two count vectors (1 - cosine similarity).
cosine_dist= distance.cosine(v1,v2)
cosine_dist

1.0

In [39]:
# Convert the distance back into a similarity percentage.
print('Similarity of two sentences are equal to',((1-cosine_dist)*100),'%')

Similarity of two sentences are equal to 0.0 %


## Text Similarity using GloVe Embeddings and Cosine Similarity

In [0]:
def clean_text(a):
  """Normalize a sentence into a list of lowercase alphabetic tokens.

  Every character outside a-z / A-Z is replaced by a space, the text is
  lowercased, and the result is split on whitespace.

  Args:
    a: Raw input string.

  Returns:
    List of lowercase word tokens in their original order; an empty list
    when the input contains no letters.
  """
  text=re.sub('[^a-zA-Z]',' ',a)
  text=text.lower()
  # split() with no argument collapses runs of spaces and ignores leading and
  # trailing ones. split(' ') would emit '' tokens for input such as
  # "garden, she" or a final ".", and '' is not a usable word for any
  # per-token dictionary lookup done on the result.
  return text.split()

In [14]:
# Note this is the 100 dimension version of GloVe from Stanford
# I unzipped and hosted it on my site to make this notebook easier
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/glove.6B.100d.txt \
    -O /tmp/glove.6B.100d.txt
embeddings_index = {};
with open('/tmp/glove.6B.100d.txt') as f:
    for line in f:
        values = line.split();
        word = values[0];
        coefs = np.asarray(values[1:], dtype='float32');
        embeddings_index[word] = coefs;

--2020-05-09 13:21:59--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/glove.6B.100d.txt
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.142.128, 2607:f8b0:400e:c07::80
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.142.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 347116733 (331M) [text/plain]
Saving to: ‘/tmp/glove.6B.100d.txt’


2020-05-09 13:22:00 (219 MB/s) - ‘/tmp/glove.6B.100d.txt’ saved [347116733/347116733]



In [0]:
# Tokenize both sentences into lowercase word lists for the embedding lookup.
strl1=clean_text(str1)
strl2=clean_text(str2)

In [84]:
strl1

['the', 'president', 'greets', 'the', 'press', 'in', 'chicago']

In [77]:
strl2

['obama', 'speaks', 'to', 'the', 'media', 'in', 'illinois']

### Creating Vectors from GloVe


In [0]:
# Represent each sentence by the element-wise average of its tokens' vectors.
# A token missing from embeddings_index raises KeyError.
vect1 = np.asarray([embeddings_index[token] for token in strl1]).mean(axis=0)
vect2 = np.asarray([embeddings_index[token] for token in strl2]).mean(axis=0)

### Cosine Similarity

In [0]:
# Each averaged vector is wrapped in a list so it is passed as a single-row 2-D input.
cosine_similar=cosine_similarity([vect1],[vect2])

In [87]:
cosine_similar

array([[0.92722815]], dtype=float32)

In [88]:
# cosine_similar is a 1x1 matrix; report its single entry as a percentage.
print('Similarity of two sentences are equal to',((cosine_similar[0][0])*100),'%')

Similarity of two sentences are equal to 92.72281527519226 %


### Cosine Distance

In [0]:
 # SciPy cosine distance between the two averaged vectors (1 - cosine similarity).
 # NOTE(review): this rebinds cosine_dist from the count-vector section, and the
 # cell carries a stray one-space indent that plain Python would reject.
 cosine_dist = distance.cosine(vect1, vect2)

In [90]:
cosine_dist

0.07277172803878784

In [89]:
# Convert the distance back into a similarity percentage.
print('Similarity of two sentences are equal to',((1-cosine_dist)*100),'%')

Similarity of two sentences are equal to 92.72282719612122 %


## GloVe Embeddings With Cosine Similarity Give More Accurate Results


## Building Functions to Implement it


In [0]:
# Passive/active paraphrase pair containing punctuation.
# NOTE(review): this rebinds str1/str2, replacing the sentences defined earlier.
str1 = "I was given a card by her in the garden."
str2 = "In the garden, she gave me a card."