## Document Similarity

#### The aim of this notebook is to measure the similarity between documents using different WordNet-based metrics

In [6]:
import numpy as np
import nltk
from nltk.corpus import wordnet as wn
import pandas as pd

In [12]:
import nltk
# Fetch the WordNet corpus that the `wn` reader imported above reads from.
# NOTE(review): nltk.word_tokenize and nltk.pos_tag, used further down, need
# their own tokenizer/tagger data — confirm it is installed or download it here.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/sameer/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

#### Function to convert the tag given by nltk.pos_tag to the tag used by wordnet.synsets

In [7]:
def convert_tag(tag):
    """Map a POS tag from ``nltk.pos_tag`` to the POS letter used by ``wn.synsets``.

    Parameters
    ----------
    tag : str
        POS tag such as ``'NN'`` or ``'VBZ'``; only its first character is used.

    Returns
    -------
    str or None
        ``'n'``, ``'a'``, ``'r'`` or ``'v'`` for tags starting with N, J, R or V;
        ``None`` for any other tag, including an empty string.
    """
    tag_dict = {'N': 'n', 'J': 'a', 'R': 'r', 'V': 'v'}
    # Slicing instead of indexing keeps an empty tag from raising IndexError.
    return tag_dict.get(tag[:1])

#### Function to return the list of synsets for the tokens of a document

In [8]:
def doc_to_synsets(doc):
    """Look up one WordNet entry per token of ``doc``.

    The text is tokenized and POS-tagged with nltk, each tag is mapped through
    ``convert_tag``, and ``wn.synsets`` is queried with the token and that POS.

    Returns
    -------
    list
        One item per token, in token order: the first synset found for the
        token, or the empty list returned by ``wn.synsets`` when the lookup
        found nothing (callers must skip those falsy placeholders).
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(doc))
    first_synsets = []
    for token, tag in tagged_tokens:
        candidates = wn.synsets(token, pos=convert_tag(tag))
        # Keep the first candidate; with no candidates the empty list itself
        # is stored as the placeholder for this token.
        first_synsets.append(candidates[0] if candidates else candidates)
    return first_synsets

#### Function to calculate the normalized similarity score (using path similarity or Wu-Palmer similarity) of s1 onto s2

In [54]:
def similarity_score(s1, s2, meth="path_similarity"):
    """Average, over the entries of ``s1``, of each entry's best similarity to ``s2``.

    For every truthy entry of ``s1`` the chosen WordNet measure is evaluated
    against every truthy entry of ``s2``; the largest truthy score is kept.
    Entries of ``s1`` with no truthy score against ``s2`` do not contribute.

    Parameters
    ----------
    s1, s2 : list
        Synset lists as produced by ``doc_to_synsets``; falsy items (the
        empty-list placeholders) are skipped.
    meth : str
        ``"path_similarity"`` or ``"wup_similarity"`` — the name of the ``wn``
        method used to score a pair.

    Returns
    -------
    float
        Mean of the per-entry best scores, or ``0.0`` when no pair produced a
        score (previously this case raised ``ZeroDivisionError``).

    Raises
    ------
    ValueError
        If ``meth`` is not one of the two supported names (previously this
        surfaced as an unbound or stale local inside the loop).
    """
    if meth not in ("path_similarity", "wup_similarity"):
        raise ValueError(
            "meth must be 'path_similarity' or 'wup_similarity', got %r" % (meth,)
        )
    # Resolve the scoring method once instead of re-testing `meth` per pair.
    score_pair = getattr(wn, meth)

    best_scores = []
    for a in s1:
        if not a:
            continue
        candidate_scores = (score_pair(a, b) for b in s2 if b)
        # The measures may return None (no connecting path); drop falsy scores.
        scores = [z for z in candidate_scores if z]
        if scores:
            best_scores.append(max(scores))

    if not best_scores:
        # Nothing comparable: report zero similarity rather than divide by zero.
        return 0.0
    return sum(best_scores) / len(best_scores)

#### Function to find the symmetrical similarity between doc1 and doc2
##### Because the score is not symmetric, we take the average of the (doc1, doc2) and (doc2, doc1) similarity scores

In [65]:
def document_path_similarity(doc1, doc2, meth = "path_similarity"):
    """Symmetric similarity between two documents.

    Both documents are converted with ``doc_to_synsets``; ``similarity_score``
    is then computed in each direction with the given ``meth`` and the two
    directional scores are averaged.
    """
    first_synsets = doc_to_synsets(doc1)
    second_synsets = doc_to_synsets(doc2)
    forward = similarity_score(first_synsets, second_synsets, meth)
    backward = similarity_score(second_synsets, first_synsets, meth)
    return (forward + backward) / 2

#### Function which tests the document path similarity function

In [56]:
def test_document_path_similarity():
    """Return ``document_path_similarity`` for two fixed example sentences."""
    first_doc = 'This is a function to test document_path_similarity.'
    # The run of spaces after "doc_to_synsets" is part of the fixture text.
    second_doc = ('Use this function to see if your code in doc_to_synsets '
                  '    and similarity_score is correct!')
    return document_path_similarity(first_doc, second_doc)

test_document_path_similarity()

0.554265873015873

#### Inputting dummy values to check the accuracy of the functions

In [39]:
# Short example sentences: doc1 and doc2 differ only in the final noun,
# doc3 is doc1 without the article "a".
doc1 = "That is a cat"
doc2 = "That is a dog"
doc3 = "That is cat"

In [62]:
# doc1 vs doc2, scored with path similarity.
document_path_similarity(doc1,doc2,'path_similarity')

0.7333333333333334

In [64]:
# doc1 vs doc2, scored with Wu-Palmer similarity.
document_path_similarity(doc1,doc2,'wup_similarity')

0.9523809523809524

##### We can observe that the Wu-Palmer score is higher than the path-similarity score. Wu-Palmer similarity returns a score denoting how similar two word senses are, based on the depth of the two senses in the taxonomy and that of their Least Common Subsumer (most specific common ancestor node), whereas path similarity returns a score based on the shortest path that connects the senses in the is-a (hypernym/hyponym) taxonomy.

In [60]:
# doc1 vs doc3, scored with path similarity.
document_path_similarity(doc1,doc3,'path_similarity')

1.0

In [61]:
# doc1 vs doc3, scored with Wu-Palmer similarity.
document_path_similarity(doc1,doc3,'wup_similarity')

1.0

##### We can observe that scores for doc1 and doc3 are 1.0, which is expected