In [1]:
import nltk
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/santiago/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [8]:
# Mandatory exercise
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

brown_ic = wordnet_ic.ic('ic-brown.dat')

pairs = [('the','DT'), ('man','NN'), ('swim','VB'), ('with', 'PR'), ('a', 'DT'), ('girl','NN'), ('and', 'CC'), ('a', 'DT'), ('boy', 'NN'), ('whilst', 'PR'), ('the','DT'), ('woman', 'NN'), ('walk', 'VB')]
synsets = [wn.synset(p[0] + '.' + p[1][0].lower() + '.01') for p in pairs if p[1][0] in ['N', 'V']]

freqs = nltk.FreqDist(w.lower() for w in brown.words())
for i in range(len(synsets)):
    for j in range(i+1,len(synsets)):
        s1, s2 = synsets[i], synsets[j]
        
        if s1.name().split('.')[1] == s2.name().split('.')[1]:
            lch = [x.name().split('.')[0] for x in s1.lowest_common_hypernyms(s2)]
            ps = s1.path_similarity(s2) if s1.path_similarity(s2) is not None else 0
            wups = s1.wup_similarity(s2) if s1.wup_similarity(s2) is not None else 0

            print('** ' + str(s1.name().split('.')[0]) + ' and ' + str(s2.name().split('.')[0]) + ' **')
            print('Least Common Subsumers: ' + str(lch))
            print('Path Similarity: ' + str(ps))
            print('Leacock-Chodorow Similarity: ' + str(s1.lch_similarity(s2)))
            print('Wu-Palmer Similarity: ' + str(wups))
            print('Lin Similarity: ' + str(s1.lin_similarity(s2, brown_ic)))
            print()

print("All the other combinations have no relation whatsoever")


** man and girl **
Least Common Subsumers: ['adult']
Path Similarity: 0.25
Leacock-Chodorow Similarity: 2.2512917986064953
Wu-Palmer Similarity: 0.631578947368421
Lin Similarity: 0.7135111237276783

** man and male_child **
Least Common Subsumers: ['male']
Path Similarity: 0.3333333333333333
Leacock-Chodorow Similarity: 2.538973871058276
Wu-Palmer Similarity: 0.6666666666666666
Lin Similarity: 0.7294717876200584

** man and woman **
Least Common Subsumers: ['adult']
Path Similarity: 0.3333333333333333
Leacock-Chodorow Similarity: 2.538973871058276
Wu-Palmer Similarity: 0.6666666666666666
Lin Similarity: 0.7870841372982784

** swim and walk **
Least Common Subsumers: ['travel']
Path Similarity: 0.3333333333333333
Leacock-Chodorow Similarity: 2.159484249353372
Wu-Palmer Similarity: 0.3333333333333333
Lin Similarity: 0.4910052007916556

** girl and male_child **
Least Common Subsumers: ['person']
Path Similarity: 0.16666666666666666
Leacock-Chodorow Similarity: 1.845826690498331
Wu-Palmer

In [3]:
# In our opinion, the best similarity would be Lin Similarity.
# It uses the Information Content (IC) of the Least Common Subsumer
# to get its result and it seems to return the most accurate similarities.

In [27]:
# Optional Exercises
# Develop a function to search and show the shortest path
# between two noun synset.
# Apply it to show the shortest path between dog.n.01 and
# cat.n.01.

import math
from six import iteritems

# Based on nltk shortest_path_distance implementation
# Searches first for the shortest path Synset and 
# then prints the path from each synset
def searchAndShowPath(s1, s2):
    shp1 = s1._shortest_hypernym_paths(False)
    shp2 = s2._shortest_hypernym_paths(False)
    
    indexsyn = None
    inf = float('inf')
    path_distance = inf
    for synset, d1 in iteritems(shp1):
        d2 = shp2.get(synset, inf)
        dsum = d1 + d2
        
        if dsum < path_distance:
            indexsyn = synset
            path_distance = dsum
    
    if math.isinf(path_distance):
        print('No path distance')
        return None
    
    printPath(s1, shp1, indexsyn)
    printPath(s2, shp2, indexsyn)

def printPath(s, shp, insyn):
    count = 0
    space = ' '
    for synset, d in iteritems(shp):
        print(str(count*space) + str(d) + ' ' + str(synset))
        count +=1
        if(synset == insyn):
            break
    

In [28]:
dog = wn.synset('dog.n.01')
cat = wn.synset('cat.n.01')

# prints distance and synset
searchAndShowPath(dog, cat)

0 Synset('dog.n.01')
 1 Synset('canine.n.02')
  1 Synset('domestic_animal.n.01')
   2 Synset('carnivore.n.01')
0 Synset('cat.n.01')
 1 Synset('feline.n.01')
  2 Synset('carnivore.n.01')
