In [1]:
import numpy as np
import nltk
from nltk.corpus import wordnet as wn
import pandas as pd

In [2]:
# Convert the tag given by nltk.pos_tag to the tag used by wordnet.synsets

def convert_tag(tag):
    """Translate a POS tag from ``nltk.pos_tag`` into a WordNet POS code.

    Only the first character of ``tag`` is inspected:
    ``'N'`` -> ``'n'``, ``'J'`` -> ``'a'``, ``'R'`` -> ``'r'``, ``'V'`` -> ``'v'``.

    Args:
        tag: A non-empty tag string such as ``'NN'`` or ``'VBD'``.

    Returns:
        The one-letter code accepted by ``wn.synsets``, or ``None`` when the
        first character has no mapping.
    """
    leading_letter = tag[0]
    wordnet_code_by_letter = {'N': 'n', 'J': 'a', 'R': 'r', 'V': 'v'}
    # dict.get yields None for unmapped letters, so no exception handling is needed.
    return wordnet_code_by_letter.get(leading_letter)

In [3]:
# Tokenizes and tags the words in the document doc

def doc_to_synsets(doc):
    """Convert a document string into a list of WordNet synsets.

    The text is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``. Each tag is translated by ``convert_tag`` and the token
    is looked up with ``wn.synsets``. For every token with at least one match
    the first synset returned is kept; tokens with no match are skipped.

    The NLTK tokenizer, tagger and WordNet data must already be installed
    (for example via ``nltk.download``).

    Args:
        doc: The document text.

    Returns:
        A list of synsets, in token order.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(doc))

    synsets = []
    for token, tag in tagged_tokens:
        # Query WordNet once per token and reuse the result, instead of
        # looking it up once to test for emptiness and again to index it.
        candidates = wn.synsets(token, convert_tag(tag))
        if candidates:
            synsets.append(candidates[0])

    return synsets

In [4]:
# the normalized similarity score of s1 onto s2

def similarity_score(s1, s2):
    """Return the normalized path-similarity score of ``s1`` onto ``s2``.

    For each synset in ``s1`` the largest ``path_similarity`` value against
    any synset in ``s2`` is taken; ``None`` results (no comparable path) are
    ignored. Synsets in ``s1`` with no usable score at all are left out of
    the average.

    Args:
        s1: Iterable of synsets to score.
        s2: Iterable of synsets to compare against (iterated once per
            element of ``s1``, so it must be re-iterable, e.g. a list).

    Returns:
        The mean of the per-synset best scores, or ``0.0`` when no synset in
        ``s1`` could be matched (including when ``s1`` or ``s2`` is empty).
    """
    best_scores = []
    for synset in s1:
        # Evaluate path_similarity once per pair; the inner generator feeds
        # the filter so the value is not recomputed after the None check.
        scores = [
            score
            for score in (synset.path_similarity(other) for other in s2)
            if score is not None
        ]
        if scores:
            best_scores.append(max(scores))

    if not best_scores:
        # Nothing was comparable: report zero similarity rather than
        # dividing by zero.
        return 0.0

    return sum(best_scores) / len(best_scores)

In [5]:
# Finds the symmetrical similarity between doc1 and doc2

def document_path_similarity(doc1, doc2):
    """Return the symmetric path similarity between two documents.

    Both documents are converted to synset lists with ``doc_to_synsets``;
    the result is the average of ``similarity_score`` taken in each
    direction (doc1 onto doc2, and doc2 onto doc1).

    Args:
        doc1: Text of the first document.
        doc2: Text of the second document.

    Returns:
        The mean of the two directional similarity scores.
    """
    first_synsets, second_synsets = doc_to_synsets(doc1), doc_to_synsets(doc2)

    forward = similarity_score(first_synsets, second_synsets)
    backward = similarity_score(second_synsets, first_synsets)

    return (forward + backward) / 2

In [6]:
# Smoke test: score two short sample sentences with document_path_similarity.
doc1 = 'This is a function to test document_path_similarity.'
doc2 = 'Use this function to see if your code in doc_to_synsets and similarity_score is correct!'

# Bare last expression so the notebook displays the resulting score.
document_path_similarity(doc1, doc2)

0.5426445578231293

# Testing on pairs of paraphrases

In [7]:
# Load the paraphrase pairs; the relative path is resolved against the
# kernel's working directory.
paraphrases = pd.read_csv('paraphrases.csv')
# Preview the first rows. Later cells read the 'Quality', 'D1' and 'D2' columns.
paraphrases.head()

Unnamed: 0,Quality,D1,D2
0,1,"Ms Stewart, the chief executive, was not expec...","Ms Stewart, 61, its chief executive officer an..."
1,1,After more than two years' detention under the...,After more than two years in detention by the ...
2,1,"""It still remains to be seen whether the reven...","""It remains to be seen whether the revenue rec..."
3,0,"And it's going to be a wild ride,"" said Allan ...","Now the rest is just mechanical,"" said Allan H..."
4,1,The cards are issued by Mexico's consulates to...,The card is issued by Mexico's consulates to i...


In [8]:
# The pair of documents in paraphrases which has the maximum similarity score

def most_similar_docs():
    """Find the pair of documents in ``paraphrases`` with the highest similarity.

    Scores every (``D1``, ``D2``) pair with ``document_path_similarity`` and
    stores the scores in a ``similarity_score`` column on the module-level
    ``paraphrases`` frame (a deliberate side effect: ``label_accuracy`` reads
    that column).

    Returns:
        A tuple ``(D1, D2, similarity_score)`` for the best-scoring row. When
        several rows tie for the maximum, the first one is returned.
    """
    # Iterate the two text columns directly instead of materializing a row
    # Series with ``iloc[i]`` twice per row.
    scores = [
        document_path_similarity(first_doc, second_doc)
        for first_doc, second_doc in zip(paraphrases['D1'], paraphrases['D2'])
    ]

    paraphrases['similarity_score'] = scores

    # Only the maximum is needed, so a single pass replaces the full sort.
    best_position = np.argmax(scores)
    best_row = paraphrases.iloc[best_position]

    return (best_row['D1'], best_row['D2'], best_row['similarity_score'])

# Run the search; this also adds the 'similarity_score' column to `paraphrases`.
most_similar_docs()

('"Indeed, Iran should be put on notice that efforts to try to remake Iraq in their image will be aggressively put down," he said.',
 '"Iran should be on notice that attempts to remake Iraq in Iran\'s image will be aggressively put down," he said.\n',
 0.9590643274853801)

In [9]:
# If the score is greater than 0.75, label is paraphrase (1), else label is not paraphrase (0)

def label_accuracy(threshold=0.75):
    """Label each pair as paraphrase / not paraphrase and score the labels.

    A pair is labelled 1 (paraphrase) when its ``similarity_score`` is
    strictly greater than ``threshold`` and 0 otherwise. The labels are
    stored in a ``label`` column on the module-level ``paraphrases`` frame
    and compared against the ``Quality`` column.

    Args:
        threshold: Similarity cut-off above which a pair counts as a
            paraphrase. Defaults to 0.75.

    Returns:
        The accuracy of the predicted labels against ``Quality``, as
        computed by ``sklearn.metrics.accuracy_score``.
    """
    from sklearn.metrics import accuracy_score

    if 'similarity_score' not in paraphrases.columns:
        # Do not depend on most_similar_docs() having been run earlier in the
        # session: compute the scores here when the column is missing.
        paraphrases['similarity_score'] = [
            document_path_similarity(first_doc, second_doc)
            for first_doc, second_doc in zip(paraphrases['D1'], paraphrases['D2'])
        ]

    # Vectorized thresholding; NaN scores compare False and so are labelled 0,
    # matching a plain ``x > threshold`` test.
    paraphrases['label'] = (paraphrases['similarity_score'] > threshold).astype(int)

    # accuracy_score takes (y_true, y_pred): ground truth first.
    return accuracy_score(paraphrases['Quality'], paraphrases['label'])

# Evaluate the thresholded labels against the 'Quality' column.
label_accuracy()

0.7