# Document Similarity-Embedding Methods

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial import distance

In [0]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import re
from nltk.corpus import  stopwords
import nltk

In [0]:
from sklearn.metrics import classification_report

## Text Similarity using Count Vectorizer and Cosine Similarity

In [0]:
# The two example sentences compared throughout the first half of the notebook.
str1, str2 = (
    "The president greets the press in Chicago",
    "Obama speaks to the media in Illinois",
)

###  Count Vectorizer

In [0]:
# Word-count vectorizer created with scikit-learn's default settings.
vectorizer = CountVectorizer()

In [0]:
# Corpus passed to the vectorizer: the two sentences being compared.
s=[str1,str2]

In [7]:
# Fit the vectorizer on both sentences and count each vocabulary term per
# sentence: one row per sentence, one column per vocabulary term.
all_vector=vectorizer.fit_transform(s)
# toarray() gives a dense array so the counts can be displayed.
all_vector.toarray()

array([[1, 1, 0, 1, 0, 0, 1, 1, 0, 2, 0],
       [0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1]])

### Finding Cosine Similarity

In [0]:
# Convert the count matrix to a dense array once and take one row per sentence
# (row 0 is str1, row 1 is str2), instead of densifying it for every row.
dense_counts=all_vector.toarray()
v_a=dense_counts[0]
v_b=dense_counts[1]

In [9]:
# cosine_similarity compares rows of 2-D inputs, so each vector is wrapped in a
# list; the result is a 1x1 matrix whose single entry is read with [0][0].
cosine_sim=cosine_similarity([v_a],[v_b])
# Report the similarity as a percentage.
print('Similarity of two sentences are equal to',((cosine_sim[0][0])*100),'%')

Similarity of two sentences are equal to 37.79644730092272 %


In [0]:
# Count vectors of the two sentences for the cosine-distance computation.
# The matrix is converted to a dense array once and then indexed per row.
dense_counts = all_vector.toarray()
v1 = dense_counts[0]
v2 = dense_counts[1]

### Finding Cosine Distance

#### Cosine_distance = 1 - cosine_similarity

In [11]:
# scipy's distance.cosine takes the two 1-D vectors directly and returns the
# cosine distance, i.e. 1 - cosine similarity.
cosine_dist= distance.cosine(v1,v2)
cosine_dist

0.6220355269907728

In [12]:
# Turn the distance back into a similarity (1 - distance) and print it as a percentage.
print('Similarity of two sentences are equal to',((1-cosine_dist)*100),'%')

Similarity of two sentences are equal to 37.79644730092272 %


## Text Similarity using GloVe Embeddings and Cosine Similarity

In [0]:
def clean_text(a):
  """Split a string into lowercase tokens made only of ASCII letters.

  Every character outside ``a-z``/``A-Z`` (digits, punctuation, whitespace,
  non-ASCII letters) acts as a separator, so ``"people's"`` yields
  ``['people', 's']``.

  Parameters
  ----------
  a : str
      Raw text to tokenize.

  Returns
  -------
  list of str
      Lowercased tokens in their original order; an empty list when ``a``
      contains no ASCII letters.
  """
  # Extract runs of ASCII letters first and lowercase afterwards, so that only
  # ASCII letters are ever passed to lower().
  return [token.lower() for token in re.findall('[a-zA-Z]+', a)]

In [14]:
# Note this is the 100 dimension version of GloVe from Stanford
# I unzipped and hosted it on my site to make this notebook easier
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/glove.6B.100d.txt \
    -O /tmp/glove.6B.100d.txt
embeddings_index = {};
with open('/tmp/glove.6B.100d.txt') as f:
    for line in f:
        values = line.split();
        word = values[0];
        coefs = np.asarray(values[1:], dtype='float32');
        embeddings_index[word] = coefs;

--2020-05-09 15:04:25--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/glove.6B.100d.txt
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.195.128, 2607:f8b0:400e:c08::80
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.195.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 347116733 (331M) [text/plain]
Saving to: ‘/tmp/glove.6B.100d.txt’


2020-05-09 15:04:28 (131 MB/s) - ‘/tmp/glove.6B.100d.txt’ saved [347116733/347116733]



In [0]:
# Tokenize both sentences into lowercase word lists with clean_text.
strl1=clean_text(str1)
strl2=clean_text(str2)

In [16]:
# Display the tokens of the first sentence.
strl1

['the', 'president', 'greets', 'the', 'press', 'in', 'chicago']

In [17]:
# Display the tokens of the second sentence.
strl2

['obama', 'speaks', 'to', 'the', 'media', 'in', 'illinois']

### Creating Vectors from GloVe


In [0]:
# Represent each sentence by the element-wise mean (axis=0) of its tokens'
# embedding vectors. The dict lookup raises KeyError for any token that is
# not a key of embeddings_index.
vect1=np.mean([embeddings_index[word] for word in strl1],axis=0)
vect2=np.mean([embeddings_index[word] for word in strl2],axis=0)

### Cosine Similarity

In [0]:
# Each sentence vector is wrapped in a list because cosine_similarity compares
# rows of 2-D inputs; the result is a 1x1 matrix.
cosine_similar=cosine_similarity([vect1],[vect2])

In [20]:
# Display the 1x1 similarity matrix.
cosine_similar

array([[0.92722815]], dtype=float32)

In [21]:
# Read the single entry of the 1x1 matrix and print it as a percentage.
print('Similarity of two sentences are equal to',((cosine_similar[0][0])*100),'%')

Similarity of two sentences are equal to 92.72281527519226 %


### Cosine Distance

In [0]:
 # Cosine distance between the two mean embedding vectors; this overwrites the
 # cosine_dist value computed earlier from the count vectors.
 cosine_dist = distance.cosine(vect1, vect2)

In [23]:
# Display the cosine distance of the embedding vectors.
cosine_dist

0.07277172803878784

In [24]:
# Turn the distance back into a similarity (1 - distance) and print it as a percentage.
print('Similarity of two sentences are equal to',((1-cosine_dist)*100),'%')

Similarity of two sentences are equal to 92.72282719612122 %


## GloVe embeddings with cosine similarity give more accurate results than count vectors.


## Building Functions to Implement Cosine Similarity with GloVe Embeddings


In [0]:
# New sentence pair; str1 and str2 are rebound from here on.
str1, str2 = (
    "I was given a card by her in the garden.",
    "In the garden, she gave me a card.",
)

In [26]:
# Tokenize the new first sentence and display what clean_text produces for it.
strl1=clean_text(str1)
strl1

['i', 'was', 'given', 'a', 'card', 'by', 'her', 'in', 'the', 'garden']

In [0]:
def similarity_predict(str1,str2):
  """Return the cosine similarity of the mean word embeddings of two texts.

  Each text is tokenized with ``clean_text``. The vectors of its tokens are
  looked up in the notebook-level ``embeddings_index`` and averaged, and the
  two averages are compared with ``cosine_similarity``.

  Tokens that are not keys of ``embeddings_index`` are skipped, so a single
  unknown word does not abort the comparison with a ``KeyError``.

  Parameters
  ----------
  str1, str2 : str
      The two texts to compare.

  Returns
  -------
  scalar
      The single entry of the 1x1 matrix returned by ``cosine_similarity``.

  Raises
  ------
  ValueError
      If either text has no token with an entry in ``embeddings_index``
      (for example an empty string), since no vector can be built for it.
  """
  def _mean_embedding(text):
    # Average the embedding vectors of the tokens of ``text`` that have one.
    vectors=[embeddings_index[word] for word in clean_text(text)
             if word in embeddings_index]
    if not vectors:
      # np.mean of an empty list is NaN, which is not a usable sentence vector;
      # fail here with a message that names the offending text.
      raise ValueError('no token of %r has an embedding in embeddings_index' % (text,))
    return np.mean(vectors,axis=0)

  vect1=_mean_embedding(str1)
  vect2=_mean_embedding(str2)
  # cosine_similarity compares rows of 2-D inputs and returns a 1x1 matrix here.
  cosine_similar=cosine_similarity([vect1],[vect2])
  return cosine_similar[0][0]

In [28]:
# Similarity score for the current str1/str2 pair.
prediction=similarity_predict(str1,str2)
prediction

0.98466194

In [0]:
# Another sentence pair; str1 and str2 are rebound again.
str1, str2 = (
    "A cemetery is a place where dead people's bodies or their ashes are buried.",
    "A graveyard is an area of land ,sometimes near a church, where dead people are buried.",
)

In [30]:
# Similarity score for the cemetery/graveyard sentence pair.
prediction=similarity_predict(str1,str2)
prediction

0.96416223