In [115]:
import nltk
import string
from nltk.tokenize import punkt
from nltk.metrics import *
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords

In [114]:
# Load the first three lines of training text
# Only three lines are consumed; the rest of the file is never read.
# NOTE(review): no encoding is passed to open(), so the platform default is
# used — confirm it matches the encoding of the training file.
with open('train/msr_paraphrase_train_input.txt', 'r') as f:
    # next(f) raises StopIteration if the file has fewer than three lines.
    line1, line2, line3 = next(f), next(f), next(f)

In [131]:
# UTILITIES
# Pretrained nltk detector
# Loaded once at module level and shared by lines_to_sentences.
# NOTE(review): presumably needs the punkt resource to be available on the
# local NLTK data path — confirm before running on a fresh environment.
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
# Lines to sentences
def lines_to_sentences(line):
    """Split one line of text into sentences.

    Leading and trailing whitespace is stripped from ``line`` before it is
    handed to the module-level ``sent_detector``.

    Args:
        line: The raw text line to split.

    Returns:
        Whatever ``sent_detector.tokenize`` returns for the stripped line.
    """
    stripped = line.strip()
    return sent_detector.tokenize(stripped)
# Wordnet lemmatizer
# One shared instance, used by token_to_lemmas and token_to_sense.
wnl = WordNetLemmatizer()
# Function to get wordnet pos code
def wordnet_pos_code(tag):
    """Map a POS tag to the matching WordNet part-of-speech constant.

    Args:
        tag: A POS tag string; it is matched by prefix ('NN', 'VB', 'JJ', 'RB').

    Returns:
        ``wordnet.NOUN``, ``wordnet.VERB``, ``wordnet.ADJ`` or ``wordnet.ADV``
        for the corresponding prefix, or ``None`` when no prefix matches.
    """
    # Guard clauses, one per tag family; the first matching prefix wins.
    if tag.startswith('NN'):
        return wordnet.NOUN
    if tag.startswith('VB'):
        return wordnet.VERB
    if tag.startswith('JJ'):
        return wordnet.ADJ
    if tag.startswith('RB'):
        return wordnet.ADV
    # Anything else (e.g. cardinal numbers, determiners) has no WordNet POS.
    return None
# Token to lemmas using wordnet lemmatizer
def token_to_lemmas(token):
    """Lemmatize a single (word, POS tag) token.

    Args:
        token: An indexable pair; item 0 is the word, item 1 is its POS tag.

    Returns:
        The lemma from the shared WordNet lemmatizer when the tag maps to a
        WordNet POS, otherwise the word unchanged.
    """
    word = token[0]
    wn_pos = wordnet_pos_code(token[1])
    # Without a usable WordNet POS the word is passed through untouched.
    if not wn_pos:
        return word
    return wnl.lemmatize(word, pos=wn_pos)
# Convert token to sense where sense is the first synset + POS
def token_to_sense(token):
    """Convert a (word, POS tag) token to a sense string.

    The sense is the string form of the first WordNet synset found for the
    token's lemma, with the WordNet POS code appended.

    Args:
        token: An indexable pair; item 0 is the word, item 1 is its POS tag.

    Returns:
        ``str(first_synset) + pos`` when the tag maps to a WordNet POS and at
        least one synset exists for the lemma; otherwise ``None``.
    """
    pos = wordnet_pos_code(token[1])
    if not pos:
        # No WordNet POS for this tag, so there is nothing to look up.
        return None
    # Lemmatize with the token's own POS. The lemmatizer defaults to the noun
    # POS, which can mangle non-nouns (e.g. 'was' -> 'wa') so that the
    # POS-restricted synset lookup below finds nothing.
    lemma = wnl.lemmatize(token[0], pos=pos)
    synsets = wordnet.synsets(lemma, pos=pos)
    if not synsets:
        return None
    return str(synsets[0]) + pos
# Remove the punctuation from a string (lower-casing is left to the caller)
def remove_punctuation(line):
    """Return ``line`` with every ``string.punctuation`` character deleted.

    Case is not changed; only the punctuation characters are removed.

    Args:
        line: The text to clean.

    Returns:
        A copy of ``line`` without punctuation characters.
    """
    # A translation table whose only job is to delete punctuation characters.
    delete_punctuation = str.maketrans('', '', string.punctuation)
    return line.translate(delete_punctuation)

def words_from_sent(sent):
    """Tokenize a sentence and drop English stopwords.

    The stopword comparison is exact (case-sensitive), so callers that want
    case-insensitive filtering should lower-case ``sent`` first.

    Args:
        sent: The sentence text to tokenize.

    Returns:
        A list of the tokens from ``nltk.word_tokenize(sent)`` that are not in
        NLTK's English stopword list, in their original order.
    """
    # Fetch the stopword list once and keep it in a set, rather than
    # reloading the list and scanning it linearly for every token.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in nltk.word_tokenize(sent) if word not in english_stopwords]

# Comparison function
def compare_sentences(sent_0, sent_1):
    """Print word-, lemma- and sense-level Jaccard distances for two sentences.

    Each sentence is lower-cased, stripped of punctuation, tokenized with
    stopwords removed, POS-tagged, lemmatized and mapped to senses. The
    intermediate lists and the three distances are printed.

    Args:
        sent_0: First sentence text.
        sent_1: Second sentence text.

    Returns:
        None. All results are written to standard output.
    """
    print('==COMPARING==\n', sent_0, '\n==WITH==\n', sent_1)
    # Remove the punctuation and make lower case
    sent_0, sent_1 = remove_punctuation(sent_0.lower()), remove_punctuation(sent_1.lower())
    # Get words from sentences
    words_0, words_1 = words_from_sent(sent_0), words_from_sent(sent_1)
    print('==WORDS==\n', words_0, '\n', words_1)
    # Jaccard distance between words
    print('word_jaccard_distance: ', _jaccard_distance_or_zero(words_0, words_1))
    # Split into tokens
    tokens_0, tokens_1 = nltk.pos_tag(words_0), nltk.pos_tag(words_1)
    print('==TOKENS==\n', tokens_0, '\n', tokens_1)
    # Split into lemmas
    lemmas_0 = [token_to_lemmas(token) for token in tokens_0]
    lemmas_1 = [token_to_lemmas(token) for token in tokens_1]
    print('==LEMMAS==\n', lemmas_0, '\n', lemmas_1)
    # Jaccard distance between lemmas
    print('lemma_jaccard_distance: ', _jaccard_distance_or_zero(lemmas_0, lemmas_1))
    # Split into senses; tokens without a sense (falsy result) are dropped
    senses_0 = [sense for sense in map(token_to_sense, tokens_0) if sense]
    senses_1 = [sense for sense in map(token_to_sense, tokens_1) if sense]
    print('==SENSES==\n', senses_0, '\n', senses_1)
    # Jaccard distance between senses
    print('sense_jaccard_distance: ', _jaccard_distance_or_zero(senses_0, senses_1))


def _jaccard_distance_or_zero(items_0, items_1):
    """Jaccard distance between two collections; two empty ones count as identical."""
    set_0, set_1 = set(items_0), set(items_1)
    if not set_0 and not set_1:
        # jaccard_distance divides by the size of the union, so two empty
        # sets would raise ZeroDivisionError; identical sets have distance 0.
        return 0.0
    return jaccard_distance(set_0, set_1)

In [132]:
# Compare line1 pair1
# Split the first loaded line into sentences and compare the first two.
# NOTE(review): assumes the detector yields at least two sentences for this
# line; sentences[1] raises IndexError otherwise.
sentences = lines_to_sentences(line1)
compare_sentences(sentences[0], sentences[1])

==COMPARING==
 Amrozi accused his brother, whom he called "the witness", of deliberately distorting his evidence. 
==WITH==
 Referring to him as only "the witness", Amrozi accused his brother of deliberately distorting his evidence.
==WORDS==
 ['amrozi', 'accused', 'brother', 'called', 'witness', 'deliberately', 'distorting', 'evidence'] 
 ['referring', 'witness', 'amrozi', 'accused', 'brother', 'deliberately', 'distorting', 'evidence']
word_jaccard_distance:  0.2222222222222222
==TOKENS==
 [('amrozi', 'NN'), ('accused', 'VBD'), ('brother', 'NN'), ('called', 'VBN'), ('witness', 'NN'), ('deliberately', 'RB'), ('distorting', 'VBG'), ('evidence', 'NN')] 
 [('referring', 'VBG'), ('witness', 'NN'), ('amrozi', 'NN'), ('accused', 'VBD'), ('brother', 'RBR'), ('deliberately', 'RB'), ('distorting', 'VBG'), ('evidence', 'NN')]
==LEMMAS==
 ['amrozi', 'accuse', 'brother', 'call', 'witness', 'deliberately', 'distort', 'evidence'] 
 ['refer', 'witness', 'amrozi', 'accuse', 'brother', 'deliberately', 

In [110]:
# Summary
# word_jaccard_distance:  0.2222222222222222
# lemma_jaccard_distance: 0.2222222222222222
# sense_jaccard_distance: 0.375
# 
# A lower Jaccard distance means greater similarity between the sentences.
# The results suggest that the two sentences are more similar in word composition than in meaning.
# This disagrees with our intuition, as we understand the two sentences to mean the same thing.
# This happens because, in sentence two, the lemma 'brother' has a different POS tag than in
# sentence one (wordnet.ADV instead of wordnet.NOUN), so no synsets can be found for it.

In [133]:
# Compare line2 pair2
# Split the second loaded line into sentences and compare the first two.
# NOTE(review): assumes the detector yields at least two sentences for this
# line; sentences[1] raises IndexError otherwise.
sentences = lines_to_sentences(line2)
compare_sentences(sentences[0], sentences[1])

==COMPARING==
 Yucaipa owned Dominick's before selling the chain to Safeway in 1998 for $2.5 billion. 
==WITH==
 Yucaipa bought Dominick's in 1995 for $693 million and sold it to Safeway for $1.8 billion in 1998.
==WORDS==
 ['yucaipa', 'owned', 'dominicks', 'selling', 'chain', 'safeway', '1998', '25', 'billion'] 
 ['yucaipa', 'bought', 'dominicks', '1995', '693', 'million', 'sold', 'safeway', '18', 'billion', '1998']
word_jaccard_distance:  0.6666666666666666
==TOKENS==
 [('yucaipa', 'NN'), ('owned', 'VBD'), ('dominicks', 'NNS'), ('selling', 'VBG'), ('chain', 'NN'), ('safeway', 'RB'), ('1998', 'CD'), ('25', 'CD'), ('billion', 'CD')] 
 [('yucaipa', 'RB'), ('bought', 'VBD'), ('dominicks', 'NNS'), ('1995', 'CD'), ('693', 'CD'), ('million', 'CD'), ('sold', 'VBN'), ('safeway', 'RB'), ('18', 'CD'), ('billion', 'CD'), ('1998', 'NNS')]
==LEMMAS==
 ['yucaipa', 'own', 'dominick', 'sell', 'chain', 'safeway', '1998', '25', 'billion'] 
 ['yucaipa', 'buy', 'dominick', '1995', '693', 'million', 'sell

In [109]:
# Summary
# word_jaccard_distance:  0.6666666666666666
# lemma_jaccard_distance: 0.5714285714285714
# sense_jaccard_distance: 0.6
#  
# The results suggest that the sentences have roughly equal similarity in words, lemmas and senses.
# They also suggest that the sentences are less similar than those in pair one, as they have a higher Jaccard distance.
# Intuition agrees with this, as in sentence two Dominick's is sold to Safeway for $1.8 billion instead of
# the $2.5 billion given in sentence one.

In [134]:
# Compare line3 pair3
# Split the third loaded line into sentences and compare the first two.
# NOTE(review): assumes the detector yields at least two sentences for this
# line; sentences[1] raises IndexError otherwise.
sentences = lines_to_sentences(line3)
compare_sentences(sentences[0], sentences[1])

==COMPARING==
 They had published an advertisement on the Internet on June 10, offering the cargo for sale, he added. 
==WITH==
 On June 10, the ship's owners had published an advertisement on the Internet, offering the explosives for sale.
==WORDS==
 ['published', 'advertisement', 'internet', 'june', '10', 'offering', 'cargo', 'sale', 'added'] 
 ['june', '10', 'ships', 'owners', 'published', 'advertisement', 'internet', 'offering', 'explosives', 'sale']
word_jaccard_distance:  0.4166666666666667
==TOKENS==
 [('published', 'VBN'), ('advertisement', 'JJ'), ('internet', 'NN'), ('june', 'NN'), ('10', 'CD'), ('offering', 'NN'), ('cargo', 'NN'), ('sale', 'NN'), ('added', 'VBD')] 
 [('june', 'NN'), ('10', 'CD'), ('ships', 'NNS'), ('owners', 'NNS'), ('published', 'VBN'), ('advertisement', 'JJ'), ('internet', 'NN'), ('offering', 'NN'), ('explosives', 'NNS'), ('sale', 'NN')]
==LEMMAS==
 ['publish', 'advertisement', 'internet', 'june', '10', 'offering', 'cargo', 'sale', 'add'] 
 ['june', '10', '

In [107]:
# Summary
# word_jaccard_distance:  0.4166666666666667
# lemma_jaccard_distance: 0.4166666666666667
# sense_jaccard_distance: 0.5
# 
# These results suggest that the word composition is slightly more similar than the sense of the
# two sentences. However, this doesn't agree with our human understanding: we read the two
# sentences as having the same meaning, with sentence two simply being more specific about the
# cargo being explosives.

In [89]:
# Which one of these approaches, if any, do you think could perform better
# for any pair of texts?

In [106]:
# Given the small size of the corpus we have used, it's difficult to say which one would perform better
# on a larger dataset. However, the results show that the sense_jaccard_distance does not agree
# with our human understanding in line3 pair3 even though it was a good measure of similarity
# in line1 pair1.
# The word and lemma similarity measures perform quite similarly, so it may be OK to use only one
# of them. The lemma similarity can identify similarities when the sentences have the same meaning
# but the words are placed differently, and may therefore be more useful.
# Overall, we think that combining all three measures would be the best approach as it compares
# the similarities of sentences across multiple dimensions, perhaps taking an average of the three
# or first seeing what distribution they have over larger bodies of text.