# Word Sense Disambiguation using a Sensegram

## Imports and Initializations

We need to import `numpy` for working with arrays, and other libs like `os`, `pickle` and `pprint` for other utility functions.

In [45]:
import os, pprint, pickle, re
import numpy as np
from stop_words import get_stop_words
import nltk

# Shared WordNet lemmatizer; compute_sense_idx uses it to lemmatize context words.
lem = nltk.stem.wordnet.WordNetLemmatizer()
# Pretty-printer configured with a 2-space indent.
pp = pprint.PrettyPrinter(indent=2)

# File read by the testing cell; each line has the form `<sentence>/<target_word>`.
TEST_SENTENCES_PATH = '/Users/sounak/Documents/clg/nlp/nlp-projects/data/wsd/sentences.txt'

## Helper functions

The two helper functions `save_obj` and `load_obj` pickle an object to disk and load it back from the pickle file. They are used to cache the parsed vector dictionaries so that later runs can load them much faster than re-parsing the raw files.

In [47]:
def save_obj(obj, name):
    """Pickle ``obj`` to ``obj/<name>.pkl`` relative to the working directory.

    The ``obj`` directory is created if it does not exist yet.

    Args:
        obj: Any picklable object.
        name: Base name of the cache file (without the ``.pkl`` suffix).
    """
    # exist_ok=True replaces the former "listdir, then mkdir" sequence, which
    # could fail if the directory appeared between the check and the creation.
    os.makedirs('obj', exist_ok=True)
    with open('obj/' + name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Load the object pickled at ``obj/<name>.pkl``.

    This is a best-effort cache lookup: any ordinary failure (missing file,
    truncated or corrupt pickle, ...) yields ``None`` so the caller can rebuild
    the data from its original source.

    Args:
        name: Base name of the cache file (without the ``.pkl`` suffix).

    Returns:
        The unpickled object, or ``None`` if it could not be loaded.
    """
    try:
        with open('obj/' + name + '.pkl', 'rb') as f:
            # NOTE: unpickling can execute arbitrary code; only load cache
            # files that this notebook wrote itself.
            return pickle.load(f)
    except Exception:
        # Deliberately not a bare `except:` so that KeyboardInterrupt and
        # SystemExit still propagate instead of being reported as a cache miss.
        return None

## Loading the Sensegram

In [3]:
# Try the pickled caches first; the raw sensegram file is only parsed when
# either cache is missing or empty.
sense_vecs = load_obj('sense_vecs')
pos_tags = load_obj('pos_tags')

if not (sense_vecs and pos_tags):
    SENSEGRAM_PATH = "/Users/sounak/Documents/clg/nlp/nlp-projects/data/sensegrams_of_wikipedia_cluster"
    # sense_vecs maps (word, pos) -> list of sense vectors, in file order.
    sense_vecs = {}
    # pos_tags collects every POS label seen in the file.
    pos_tags = set()

    # `with` closes the file even if a line fails to parse, and iterating the
    # handle streams the file instead of loading every line into memory.
    with open(SENSEGRAM_PATH, 'r') as f:
        for line in f:
            # Each line is parsed as: "<word>#<pos>\t<sense id>\t<vector literal>".
            t = line.split('\t')
            word, pos = t[0].split('#')
            pos_tags.add(pos)
            # Sense id '0' starts a fresh list for this (word, pos) pair.
            if t[1] == '0':
                sense_vecs[(word, pos)] = []
            # SECURITY: eval() executes whatever expression the file contains.
            # Only use this with a trusted sensegram file.
            sense_vecs[(word, pos)].append(np.array(eval(t[2])))
    save_obj(sense_vecs, 'sense_vecs')
    save_obj(pos_tags, 'pos_tags')

print('sense_vecs have been loaded')

sense_vecs have been loaded


## Loading the Glove Model

In [4]:
# Try the pickled cache first; the GloVe text file is only parsed when the
# cache is missing or empty.
word_vecs = load_obj('word_vecs')

if not word_vecs:
    GLOVE_PATH = "/Users/sounak/Documents/clg/nlp/nlp-projects/data/glove.6B.300d.txt"
    # word_vecs maps a token to its embedding as a NumPy array.
    word_vecs = {}
    # `with` closes the file even if a line fails to parse, and iterating the
    # handle streams the file instead of loading every line into memory.
    # The encoding is set explicitly so decoding does not depend on the
    # platform default (assumes the GloVe file is UTF-8 — confirm for other dumps).
    with open(GLOVE_PATH, 'r', encoding='utf-8') as f:
        for line in f:
            # Each line is parsed as: "<token> <float> <float> ...".
            t = line.split(' ')
            word_vecs[t[0]] = np.array([float(value) for value in t[1:]])
    save_obj(word_vecs, 'word_vecs')

print('word_vecs have been loaded')

word_vecs have been loaded


## Computing Sense

The function `compute_sense_idx` takes a tokenized sentence and the target word, and returns the index of the sense of the target that is used in the given context. If the target does not occur in the sentence, it returns `None`.

This function maximizes the cosine similarity of an aggregate context vector with the vectors of the different senses of the target word. It also doesn't include the stop words in the context. The aggregate context vector is calculated using the lemmatized words in the context after removing the stop words.

In [54]:
# English stop words; compute_sense_idx leaves these out of the context.
stop_words = get_stop_words('en')

def _wordnet_pos(tag):
    """Map a Penn Treebank tag to the WordNet POS letter used for lemmatizing."""
    # WordNetLemmatizer documents 'n', 'v', 'a', 'r' and 's' as POS values, so
    # the lower-cased first letter of a Penn tag ('j' for adjectives, 'c' for
    # numbers, ...) is not always valid. Unknown tags fall back to noun.
    return {'J': 'a', 'V': 'v', 'R': 'r'}.get(tag[:1], 'n')


def compute_sense_idx(sentence, target):
    """Pick the sense of ``target`` that best matches its sentence context.

    The context is every token of ``sentence`` other than ``target`` and the
    stop words. The context tokens are lemmatized, their word vectors are
    added up, and the sense vector of ``target`` with the highest cosine
    similarity to that aggregate is selected.

    Args:
        sentence: List of tokens.
        target: The token to disambiguate.

    Returns:
        ``None`` if ``target`` does not occur in ``sentence``. Otherwise the
        index of the best-matching sense within its ``(target, pos)`` list in
        ``sense_vecs``, or ``-1`` if no sense could be scored (no usable
        context vector, or no non-zero sense vector for ``target``).
    """
    if target not in sentence:
        return None

    tagged = nltk.pos_tag(sentence)
    context_final = [
        (lem.lemmatize(w, _wordnet_pos(pos)), pos)
        for w, pos in tagged
        if w != target and w not in stop_words
    ]
    print(context_final)

    # Context lemmas without an embedding are skipped instead of raising KeyError.
    vectors = [word_vecs[w] for w, _ in context_final if w in word_vecs]
    if not vectors:
        return -1

    # Cosine similarity ignores scale, so the plain sum ranks the senses the
    # same way a mean would. Summing the stacked vectors also avoids
    # hard-coding the embedding dimension.
    context_vec = np.sum(vectors, axis=0)
    context_norm = np.linalg.norm(context_vec)
    if context_norm == 0:
        # A zero context vector has no direction; avoid a 0/0 division.
        return -1

    max_idx = -1
    max_value = float('-inf')
    for pos in pos_tags:
        # Not every POS has an entry for the target; treat those as empty.
        for idx, sense in enumerate(sense_vecs.get((target, pos), ())):
            sense_norm = np.linalg.norm(sense)
            if sense_norm > 0:
                result = np.dot(sense, context_vec) / (sense_norm * context_norm)
                if result > max_value:
                    max_value = result
                    max_idx = idx
    return max_idx

## Tokenizer

This is a light-weight tokenizer for tokenizing the input sentences.

In [6]:
def tokenize(text):
    """Split ``text`` into a list of lowercase tokens.

    Any run of characters other than ASCII letters, characters in the
    ``À-ÿ`` range, digits and apostrophes acts as a separator. Empty pieces
    produced by leading or trailing separators are dropped.
    """
    pieces = re.split(r"[^a-zA-ZÀ-ÿ0-9']+", text)
    return [piece.lower() for piece in pieces if piece != '']

## Testing

The sentences must be provided in the following format, one per line of the file: `<sentence>/<target_word>`.

In [55]:
# Read all test cases up front; `with` closes the file even if reading fails.
with open(TEST_SENTENCES_PATH, 'r') as f:
    lines = f.readlines()

for line in lines:
    # Blank lines (e.g. a trailing newline at the end of the file) carry no
    # test case and would otherwise break the unpacking below.
    if not line.strip():
        continue
    # Split on the LAST '/', so a sentence may itself contain slashes.
    sentence, target = line.rsplit('/', 1)
    sentence = tokenize(sentence)
    target = target.strip()
    print(compute_sense_idx(sentence, target))

[('go', 'VBG'), ('director', 'NN'), ('project', 'NN')]
1
[('tie', 'VBN'), ('tightly', 'RB'), ('ball', 'NN')]
0
[('need', 'VBP'), ('tie', 'VB'), ('cow', 'NN'), ('pole', 'NN')]
1
