In [1]:
import nltk
nltk.download('sentiwordnet')

[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /home/santiago/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


True

In [2]:
from nltk import pos_tag
from nltk.wsd import lesk
from nltk.stem import WordNetLemmatizer
from nltk.metrics import jaccard_distance
from nltk.corpus import wordnet as wn

def lemmatize(p):
    if p[1][0] in {'N', 'V'}:
        return wnl.lemmatize(p[0].lower(), pos=p[1][0].lower())
    return p[0]

def penn2morphy(penntag, returnNone=False):
    morphy_tag = {'NN':wn.NOUN, 'JJ':wn.ADJ,
                  'VB':wn.VERB, 'RB':wn.ADV}
    try:
        return morphy_tag[penntag[:2]]
    except:
        return None if returnNone else ''

def getHypernym(s):
    if not s:
        return [s]
    if s.hypernyms() == []:
        return [None]
    return s.hypernyms()    

wnl = WordNetLemmatizer()

input_file = 'trial/STS.input.txt'
with open(input_file) as f:
    input_data = f.readlines()
    
document_distances = []
lesks_distances = []
lesks_hyper_distances = []
morphology_distances = []
for i in input_data:
    sentences = nltk.sent_tokenize(i[4:])
    words = [nltk.word_tokenize(sent) for sent in sentences]
    pairs = [pos_tag(w) for w in words]
    l_words = [[lemmatize(p) for p in pair] for pair in pairs]
    synsets = [[[lesk(w, p[0], pos=penn2morphy(p[1][0])) for p in pair] for pair in pairs] for w in words]
    hypernyms = [[getHypernym(s) for s in synsets[c][c]] for c in range(len(synsets))]
    hypernyms = [[hy[0] for hy in hyp] for hyp in hypernyms]
    lesks_distances.append(jaccard_distance(set(synsets[0][0]), set(synsets[1][1])))
    morphology_distances.append(jaccard_distance(set(l_words[0]),set(l_words[1])))
    document_distances.append(jaccard_distance(set(words[0]),set(words[1])))
    lesks_hyper_distances.append(jaccard_distance(set(hypernyms[0]),set(hypernyms[1])))
    
print("Lab2 distances: " + str(document_distances))
print("Lab3 distances: " + str(morphology_distances))
print("Lesk distance: " + str(lesks_distances))
print("Lesk variant with hypernyms: " + str(lesks_hyper_distances))


Lab2 distances: [0.6923076923076923, 0.7368421052631579, 0.6666666666666666, 0.5454545454545454, 0.7692307692307693, 0.8620689655172413]
Lab3 distances: [0.6923076923076923, 0.6666666666666666, 0.6666666666666666, 0.5454545454545454, 0.7692307692307693, 0.8620689655172413]
Lesk distance: [0.7, 0.7857142857142857, 0.5, 0.8888888888888888, 0.9, 0.92]
Lesk variant with hypernyms: [0.7, 0.8, 0.4444444444444444, 0.8333333333333334, 0.875, 0.9]


In [4]:
from scipy.stats import pearsonr

gold_file = 'trial/STS.gs.txt'
with open(gold_file) as f:
    gold_data = f.readlines()
gold = [int(g[4:5]) for g in gold_data]

doc_pearson = pearsonr(document_distances, gold)[0]
mor_pearson = pearsonr(morphology_distances, gold)[0]
lesk_pearson = pearsonr(lesks_distances, gold)[0]
hypernyms_pearson = pearsonr(lesks_hyper_distances, gold)[0]

print('Lab2 pearson correlation: ' + str(doc_pearson))
print('Lab3 pearson correlation: ' + str(mor_pearson))
print('Lesk pearson correlation: ' + str(lesk_pearson))
print('Lesk variant with hypernyms correlation: ' + str(hypernyms_pearson))

# The correlation greatly improves when using the Lesk algorithm
# as the WSD helps it detect the similarities in the sentences
# better.

Lab2 pearson correlation: 0.4143770872333895
Lab3 pearson correlation: 0.517276212426234
Lesk pearson correlation: 0.6056964784272112
Lesk variant with hypernyms correlation: 0.5101560894527944
