In [93]:
import nltk
import pandas
from nltk.corpus import wordnet as wn
from nltk.corpus import brown
from nltk.corpus import wordnet_ic

In [35]:
brown_ic =  wordnet_ic.ic('ic-brown.dat')
lem_cat_pairs = [["the","DT"], ["man","NN"], ["swim","VB"], ["with", "PR"], ["a", "DT"],
["girl","NN"], ["and", "CC"], ["a", "DT"], ["boy", "NN"], ["whilst", "PR"], ["the",
"DT"], ["woman", "NN"], ["walk", "VB"]]

# dict to ignore preposition determiners or conjunctions
# the false values correspond to part of speach that have no symsets in the wn database
cat_to_pos = {
    "NN": wn.NOUN,
    "VB": wn.VERB,
    "PR": False, 
    "DT": False, 
    "CC": False 
}

In [38]:
# A list to contain the top synset of each lemma of its category
top_synsets = []

for lem_cat in lem_cat_pairs:
    lem = lem_cat[0]
    cat = lem_cat[1]
    pos = cat_to_pos[cat]
    # ignore part of speaches that cannot be mapped to wordnet
    if (pos != False):
    
        synsets = wn.synsets(lem, pos)
        top_synsets.append(synsets[0])
        
        print(lem)
        print(cat)
        print(synsets[0])

        print("\n")


man
NN
Synset('man.n.01')


swim
VB
Synset('swim.v.01')


girl
NN
Synset('girl.n.01')


boy
NN
Synset('male_child.n.01')


woman
NN
Synset('woman.n.01')


walk
VB
Synset('walk.v.01')




In [102]:
# Top Symset similarities
# NOTE:  we normalise the lch similarity between 0-1 which is the range of the other similarities
# MAX_LCH_N is the lch similarity of 2 identical Noun synsets
# we find this to be 3.6375861597263857
# MAX_LCH_V is the lch similarity of 2 identical Verb synsets
# we find this to be 3.258096538021482

MAX_LCH_N = 3.6375861597263857
MAX_LCH_V = 3.258096538021482

similarities = {}

for synset_a in top_synsets:
    for synset_b in top_synsets:    
        # Here we compare only synsets with the same part of speach
        if synset_a.pos() == synset_b.pos():
            lcs = synset_a.lowest_common_hypernyms(synset_b)
            similarity = synset_a.path_similarity(synset_b)
                wup_similarity = synset_a.wup_similarity(synset_b)
            lin_similarity = synset_a.lin_similarity(synset_b, brown_ic)                        
            lch_similarity = synset_a.lch_similarity(synset_b)
            # Normalise lch similarity, as the other similarities range from 0-1 we can just nomalise lch
            if synset_a.pos() == 'v':
                normalised_lch = (1 / MAX_LCH_V) * lch_similarity
            else:
                normalised_lch = (1 / MAX_LCH_N) * lch_similarity
            # store results
            similarities[str([synset_a, synset_b])] = {
                'lcs': lcs,
                'similarity': similarity,
                'normalised_lch': normalised_lch,
                'wup_similarity': wup_similarity,
                'lin_similarity':  lin_similarity
            }
            # print results
            pp.pprint(similarities)

{"[Synset('man.n.01'), Synset('man.n.01')]": {'lcs': [Synset('man.n.01')],
                                              'lin_similarity': 1.0,
                                              'normalised_lch': 1.0,
                                              'similarity': 1.0,
                                              'wup_similarity': 1.0}}
{"[Synset('man.n.01'), Synset('girl.n.01')]": {'lcs': [Synset('adult.n.01')],
                                               'lin_similarity': 0.7135111237276783,
                                               'normalised_lch': 0.6188971751464533,
                                               'similarity': 0.25,
                                               'wup_similarity': 0.631578947368421},
 "[Synset('man.n.01'), Synset('man.n.01')]": {'lcs': [Synset('man.n.01')],
                                              'lin_similarity': 1.0,
                                              'normalised_lch': 1.0,
                                       

                                               'similarity': 0.25,
                                               'wup_similarity': 0.631578947368421},
 "[Synset('man.n.01'), Synset('male_child.n.01')]": {'lcs': [Synset('male.n.02')],
                                                     'lin_similarity': 0.7294717876200584,
                                                     'normalised_lch': 0.6979831568441129,
                                                     'similarity': 0.3333333333333333,
                                                     'wup_similarity': 0.6666666666666666},
 "[Synset('man.n.01'), Synset('man.n.01')]": {'lcs': [Synset('man.n.01')],
                                              'lin_similarity': 1.0,
                                              'normalised_lch': 1.0,
                                              'similarity': 1.0,
                                              'wup_similarity': 1.0},
 "[Synset('man.n.01'), Synset('woman.n.01')]": {'lcs': [

                                                'lin_similarity': 1.0,
                                                'normalised_lch': 1.0,
                                                'similarity': 1.0,
                                                'wup_similarity': 1.0},
 "[Synset('swim.v.01'), Synset('walk.v.01')]": {'lcs': [Synset('travel.v.01')],
                                                'lin_similarity': 0.4910052007916556,
                                                'normalised_lch': 0.6628054829415044,
                                                'similarity': 0.3333333333333333,
                                                'wup_similarity': 0.3333333333333333},
 "[Synset('woman.n.01'), Synset('girl.n.01')]": {'lcs': [Synset('woman.n.01')],
                                                 'lin_similarity': 0.9067798595489287,
                                                 'normalised_lch': 0.8094485875732267,
                                            

In [81]:
df = pandas.DataFrame(similarities)
df

Unnamed: 0,"[Synset('girl.n.01'), Synset('girl.n.01')]","[Synset('girl.n.01'), Synset('male_child.n.01')]","[Synset('girl.n.01'), Synset('man.n.01')]","[Synset('girl.n.01'), Synset('woman.n.01')]","[Synset('male_child.n.01'), Synset('girl.n.01')]","[Synset('male_child.n.01'), Synset('male_child.n.01')]","[Synset('male_child.n.01'), Synset('man.n.01')]","[Synset('male_child.n.01'), Synset('woman.n.01')]","[Synset('man.n.01'), Synset('girl.n.01')]","[Synset('man.n.01'), Synset('male_child.n.01')]","[Synset('man.n.01'), Synset('man.n.01')]","[Synset('man.n.01'), Synset('woman.n.01')]","[Synset('swim.v.01'), Synset('swim.v.01')]","[Synset('swim.v.01'), Synset('walk.v.01')]","[Synset('walk.v.01'), Synset('swim.v.01')]","[Synset('walk.v.01'), Synset('walk.v.01')]","[Synset('woman.n.01'), Synset('girl.n.01')]","[Synset('woman.n.01'), Synset('male_child.n.01')]","[Synset('woman.n.01'), Synset('man.n.01')]","[Synset('woman.n.01'), Synset('woman.n.01')]"
lcs,[Synset('girl.n.01')],[Synset('person.n.01')],[Synset('adult.n.01')],[Synset('woman.n.01')],[Synset('person.n.01')],[Synset('male_child.n.01')],[Synset('male.n.02')],[Synset('person.n.01')],[Synset('adult.n.01')],[Synset('male.n.02')],[Synset('man.n.01')],[Synset('adult.n.01')],[Synset('swim.v.01')],[Synset('travel.v.01')],[Synset('travel.v.01')],[Synset('walk.v.01')],[Synset('woman.n.01')],[Synset('person.n.01')],[Synset('adult.n.01')],[Synset('woman.n.01')]
lin_similarity,1,0.292728,0.713511,0.90678,0.292728,1,0.729472,0.318423,0.713511,0.729472,1,0.787084,1,0.491005,0.491005,1,0.90678,0.318423,0.787084,1
normalised_lch,1,0.507432,0.618897,0.809449,0.507432,1,0.697983,0.557553,0.618897,0.697983,1,0.697983,1,0.662805,0.662805,1,0.809449,0.557553,0.697983,1
similarity,1,0.166667,0.25,0.5,0.166667,1,0.333333,0.2,0.25,0.333333,1,0.333333,1,0.333333,0.333333,1,0.5,0.2,0.333333,1
wup_similarity,1,0.631579,0.631579,0.631579,0.631579,1,0.666667,0.666667,0.631579,0.666667,1,0.666667,1,0.333333,0.333333,1,0.631579,0.666667,0.666667,0.666667


In [101]:
# OBSERVATIONS 

# It is observed that all the similarity algorithms used are reversable; For example the similarity between girl and man
# is identical to that of man and girl. It is assumed this is because the edges of the lexical trees have the same cost
# either way that they are traversed.

# similarity comparison: [girl, male_child] & [girl, man]
# It was hypothesised that [girl, male_child] should have less similarity than [girl, man],
# however it was found that across similarity functions [girl, man] was more similar.
# This could be due to their relations in the brown corpus.

# wup_similarity
# [girl, man] has the same wup_similarity as [girl, woman] and [girl, male_child]
# wup_similarity does not seem to give the same detail as the other similarity functions.
# This could be because the depth of the LCS in this taxonomy does not vary much.
# Also, there is an anomaly, [woman, woman] gives a similarity not of 1, but of 0.666667
# Although this may suggest that according to the brown corpus not all women are the same,
# this also suggests that the results of the wup_similarity are not reliable.

# lin_similarity
# [girl, man] is shown as less similar to [male_child, man] (0.714, 0.730)
# this similarity may prioritise gender more than age

# similarity 
# [girl, male_child] is less similar than [girl, man] (0.167, 0.25)
# this suggests that age similarity is not caputred in this function.
# In reality, a girl has a more similar age to a male_child than a man
# Also similarity does not seem to go above 0.5 for two synsets that are not equal

# normalised_lch 
# [girl, male_child] has the least similarity
# no similarities lower than 0.5
# It is a logarithmic function so the normalisation may be inaccurate
# It has the highest similarity amongst the other functions for [swim-walk]
# This may be useful for understanding more similarites between verbs.

# These observations highlight some of the properties of the brown corpus, a different corpus may have
# yielded different results.