In [1]:
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import brown, stopwords
import itertools
from collections import Counter
import json

nltk.download('brown')
nltk.download('wordnet')
nltk.download('stopwords')
stopwords = stopwords.words('english')

np.set_printoptions(suppress=True)

[nltk_data] Downloading package brown to /Users/r0g06z5/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/r0g06z5/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/r0g06z5/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Word Sense Disambiguation

The problem of Word Sense Disambiguation (WSD) is identifying the correct sense of word being used in a sentence. In English Language, there can be multiple senses of the same word. 

For example - Apple in a sentence can be used as a fruit or as the name of company.

In [2]:
ambiguous_word = 'ash'
word_senses = wn.synsets(ambiguous_word)
print('Word:', ambiguous_word)
print('Number of senses:', len(word_senses))
print()
for i, word_sense in enumerate(word_senses):
    print(f'Sense {i+1}:', word_sense.definition())

Word: ash
Number of senses: 4

Sense 1: the residue that remains when something is burned
Sense 2: any of various deciduous pinnate-leaved ornamental or timber trees of the genus Fraxinus
Sense 3: strong elastic wood of any of various ash trees; used for furniture and tool handles and sporting goods such as baseball bats
Sense 4: convert into ashes


### Basic Algorithm

Given a sentence and an ambiguous word in it, we are interested in finding the sense of that word. So, we create **sense bags** which is collection of features (hyponyms/hypernyms/definitions or others) for each sense of ambiguous word and a **context bag** which is collection of features for all the words in sentence excluding ambiguous word.

We calculate overlap between context bag and various sense bags (one for each sense). We consider that sense which has maximum overlap.

1. Hyponyms are words of more specific meaning than given word. Example - Apple is hyponym of fruit.
2. Hypernyms are words of broader meaning than given word. Example - Fruit is hypernym of Apple.

In [3]:
def get_sense_context_bag(sample_sentence, ambiguous_word, feature):
    words = sample_sentence.split(' ')
    words = [w.lower() for w in words]
    words.remove(ambiguous_word)

    sense_bag = []
    context_bag = []
    for word in words:
        word_senses = wn.synsets(word)

        if feature == 'hyponym':
            word_hyponyms = list(itertools.chain(*map(lambda x: x.hyponyms(), word_senses)))
            context_bag += word_hyponyms
        if feature == 'hypernym':
            word_hypernyms = list(itertools.chain(*map(lambda x: x.hypernyms(), word_senses)))
            context_bag += word_hypernyms

    ambiguous_word_senses = wn.synsets(ambiguous_word)
    if feature=='hyponym':
        sense_bag = list(map(lambda x: x.hyponyms(), ambiguous_word_senses))
    if feature=='hypernym':
        sense_bag = list(map(lambda x: x.hypernyms(), ambiguous_word_senses))
    return sense_bag, set(context_bag)  

In [4]:
def get_sense(sample_sentence, ambiguous_word, feature):
    sense_bag, context_bag = get_sense_context_bag(sample_sentence, ambiguous_word, feature)

    # count overlap between context bag and various sense bags, one for each sense
    count_overlap = list(map(lambda x: len(set(x) and context_bag), sense_bag))
    sense_idx = count_overlap.index(max(count_overlap))
    sense_dfn = wn.synsets(ambiguous_word)[sense_idx].definition()
    return sense_dfn

In [5]:
sample_sentence = 'I am eating apple'
ambiguous_word = 'apple'
feature = 'hyponym'
sense_dfn = get_sense(sample_sentence, ambiguous_word, feature) 
print('Ambiguous word:', ambiguous_word)   
print('Definition:', sense_dfn)

Ambiguous word: apple
Definition: fruit with red or yellow or green skin and sweet to tart crisp whitish flesh


In [6]:
def preprocess(sense):
    words = sense.definition().split()
    words = [word.lower() for word in words if word.isalnum() and word not in stopwords]
    return words

def Lesk_algorithm(ambiguous_word, context):
    sense_bag = list(map(lambda x: preprocess(x), wn.synsets(ambiguous_word)))

    context_words = context.strip().split(' ')
    context_bag = []
    for context_word in context_words:
        context_bag += list(itertools.chain(*map(lambda x: preprocess(x), wn.synsets(context_word))))
    context_bag = set(context_bag)
    
    count_overlap = list(map(lambda x: len(set(x) and context_bag), sense_bag))
    sense_idx = count_overlap.index(max(count_overlap))
    sense_dfn = wn.synsets(ambiguous_word)[sense_idx].definition()
    return sense_dfn

In [7]:
ambiguous_word = 'ash'
context = 'coal'
sense_dfn = Lesk_algorithm(ambiguous_word, context)
print('Ambiguous word:', ambiguous_word)   
print('Definition:', sense_dfn)

Ambiguous word: ash
Definition: the residue that remains when something is burned


In [8]:
def get_thesaurus(sense):
    return list(set(itertools.chain(*map(lambda x: x.lemma_names(), sense.hypernyms() + sense.hyponyms()))))

def Walker_algorithm(ambiguous_word, context):
    thesaurus_senses = list(map(lambda x: get_thesaurus(x), wn.synsets(ambiguous_word)))
    assert len(thesaurus_senses) == len(wn.synsets(ambiguous_word))

    context_words = context.strip().split(' ')
    thesaurus_contexts = []
    for context_word in context_words:
        thesaurus_contexts += [list(set(itertools.chain(*map(lambda x: get_thesaurus(x), wn.synsets(context_word)))))]
    assert len(thesaurus_contexts) == len(context_words)

    count_overlaps = []
    for thesaurus_sense in thesaurus_senses:
        count_overlap = 0
        for thesaurus_context in thesaurus_contexts:
            if len(set(thesaurus_sense).intersection(set(thesaurus_context))) > 0:
                count_overlap +=1
        count_overlaps.append(count_overlap)
        
    print('Number of overlaps:', count_overlaps)
    sense_idx = count_overlaps.index(max(count_overlaps))
    sense_dfn = wn.synsets(ambiguous_word)[sense_idx].definition()
    return sense_dfn

ambiguous_word = 'burn'
context = 'coal fire flame residue wood combust'

print('Ambiguous word:', ambiguous_word)  
sense_dfn = Walker_algorithm(ambiguous_word, context)
print('Definition:', sense_dfn)

Ambiguous word: burn
Number of overlaps: [0, 0, 2, 0, 1, 2, 1, 3, 1, 2, 0, 0, 0, 0, 1, 0, 0, 1, 0, 2]
Definition: undergo combustion
