# Word Sense Disambiguation using a Sensegram

## Imports and Initializations

We import `numpy` for working with arrays, along with the standard-library modules `os`, `pickle`, and `pprint` for filesystem access, object caching, and pretty-printing.

In [8]:
import ast
import os
import pickle
import pprint

import numpy as np

# Pretty-printer instance that indents nested structures by two spaces per level.
pp = pprint.PrettyPrinter(indent=2)

## Helper functions

In [9]:
def save_obj(obj, name):
    """Pickle ``obj`` to ``obj/<name>.pkl`` relative to the working directory.

    The ``obj`` directory is created if it does not exist yet.

    Args:
        obj: Any picklable Python object.
        name: Base name of the cache file, without the ``.pkl`` extension.

    Raises:
        OSError: If the directory or the file cannot be created.
    """
    # exist_ok=True makes creation idempotent; it replaces scanning the whole
    # directory with os.listdir() and the check-then-create race that came
    # with a separate os.mkdir() call.
    os.makedirs('obj', exist_ok=True)
    with open('obj/' + name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Load the object pickled at ``obj/<name>.pkl``, or return ``None``.

    ``None`` is returned when the cache file is missing, unreadable, or cannot
    be unpickled, so callers can fall back to rebuilding the data.

    Args:
        name: Base name of the cache file, without the ``.pkl`` extension.

    Returns:
        The unpickled object, or ``None`` if no usable cache file exists.
    """
    try:
        with open('obj/' + name + '.pkl', 'rb') as f:
            # NOTE: unpickling can execute arbitrary code; only load cache
            # files that were produced by a trusted source.
            return pickle.load(f)
    # Only the failures that mean "no usable cache" are treated as a miss.
    # A bare ``except`` would also swallow KeyboardInterrupt, SystemExit and
    # MemoryError, hiding an interrupted or failed load as a cache miss.
    except (OSError, EOFError, pickle.UnpicklingError,
            AttributeError, ImportError, IndexError):
        return None

## Loading the Sensegram

In [13]:
# Reuse the pickled cache from an earlier run when both objects are available,
# so the raw sensegram file does not have to be parsed again.
sense_vecs = load_obj('sense_vecs')
pos_tags = load_obj('pos_tags')

if not (sense_vecs and pos_tags):
    # Default location of the raw file; set the SENSEGRAM_PATH environment
    # variable to use a different location without editing this cell.
    SENSEGRAM_PATH = os.environ.get(
        "SENSEGRAM_PATH",
        "/Users/sounak/Documents/clg/nlp/nlp-projects/data/sensegrams_of_wikipedia_cluster",
    )
    sense_vecs = {}
    pos_tags = set()

    # Each line is tab-separated: "<word>#<pos>", a sense id, and the sense
    # vector written as a Python literal (presumably a list of floats --
    # confirm against the data file).
    # The context manager closes the file even if a line fails to parse, and
    # iterating the handle avoids holding every line in memory at once.
    with open(SENSEGRAM_PATH, 'r', encoding='utf-8') as f:
        for line in f:
            t = line.split('\t')
            word, pos = t[0].split('#')
            pos_tags.add(pos)
            # Sense id '0' starts a fresh list for this (word, pos) key; later
            # ids are appended, so a key whose first line is not sense '0'
            # raises KeyError.
            if t[1] == '0':
                sense_vecs[(word, pos)] = []
            # literal_eval accepts only Python literals, so text in the data
            # file can no longer be executed as code the way eval() allowed.
            sense_vecs[(word, pos)].append(np.array(ast.literal_eval(t[2])))
    save_obj(sense_vecs, 'sense_vecs')
    save_obj(pos_tags, 'pos_tags')

print('sense_vecs have been loaded')

sense_vecs have been loaded


## Loading the GloVe Model

In [4]:
# Reuse the pickled cache from an earlier run when it is available, so the raw
# embedding file does not have to be parsed again.
word_vecs = load_obj('word_vecs')

if not word_vecs:
    # Default location of the raw file; set the GLOVE_PATH environment
    # variable to use a different location without editing this cell.
    GLOVE_PATH = os.environ.get(
        "GLOVE_PATH",
        "/Users/sounak/Documents/clg/nlp/nlp-projects/data/glove.6B.300d.txt",
    )
    word_vecs = {}
    # The context manager closes the file even if a line fails to parse, and
    # iterating the handle streams the file instead of materialising every
    # line at once with readlines().
    with open(GLOVE_PATH, 'r', encoding='utf-8') as f:
        for line in f:
            # Space-separated: the token followed by its vector components.
            t = line.split(' ')
            word_vecs[t[0]] = np.array([float(component) for component in t[1:]])
    save_obj(word_vecs, 'word_vecs')

print('word_vecs have been loaded')

word_vecs have been loaded


In [None]:
# Analogy probe: move 'zombie' away from 'bad' and toward 'good', then take the
# raw (unnormalized) dot product of that vector with 'human'.
np.dot(
    word_vecs['human'],
    word_vecs['zombie'] - word_vecs['bad'] + word_vecs['good'],
)

## Computing Sense

In [42]:
# Example sentences, stored as token lists, each containing the target word.
sentences = [
    text.split()
    for text in (
        'i am sitting on a table',
        'table is a necessary furniture',
        'fill the data in the table',
        'the rows and columns of the table are zero value',
    )
]
# Word whose sense is resolved in each sentence.
target = 'table'

def compute_sense_idx(sentence, target, pos_tagging=False):
    """Pick the sense of ``target`` that best matches its sentence context.

    The context vector is the mean of the ``word_vecs`` embeddings of every
    token in ``sentence`` other than ``target``. Every non-zero sense vector
    stored in ``sense_vecs[(target, pos)]``, for each ``pos`` in ``pos_tags``,
    is scored by cosine similarity against that context vector.

    Args:
        sentence: List of tokens.
        target: Token to disambiguate.
        pos_tagging: Accepted for interface compatibility; currently unused.

    Returns:
        ``None`` if ``target`` does not occur in ``sentence``. Otherwise the
        position of the best-scoring sense within its
        ``sense_vecs[(target, pos)]`` list, or ``-1`` when nothing could be
        scored (no context word has an embedding, the context vector is zero,
        or ``target`` has no non-zero sense vector). The winning part of
        speech is not returned, so the index alone is ambiguous when
        ``target`` has senses under more than one tag.
    """
    if target not in sentence:
        return None

    # Context words without an embedding are skipped instead of aborting the
    # whole lookup with a KeyError.
    context_vecs = [
        word_vecs[token]
        for token in sentence
        if token != target and token in word_vecs
    ]
    if not context_vecs:
        # No usable context: avoids the 0/0 division that produced NaNs.
        return -1

    # Averaging the stacked vectors takes the dimensionality from the
    # embeddings themselves instead of assuming 300 components.
    cw = np.mean(context_vecs, axis=0)
    cw_norm = np.linalg.norm(cw)  # loop-invariant, so computed once
    if cw_norm == 0:
        return -1

    max_idx = -1
    max_value = float('-inf')
    # Sorting gives a deterministic visiting order, so exact ties resolve the
    # same way on every run regardless of set iteration order.
    for pos in sorted(pos_tags):
        # A (target, pos) pair without stored senses contributes nothing.
        for idx, sense in enumerate(sense_vecs.get((target, pos), ())):
            sense_norm = np.linalg.norm(sense)
            if sense_norm > 0:
                result = np.dot(sense, cw) / (sense_norm * cw_norm)
                if result > max_value:
                    max_value = result
                    max_idx = idx
    return max_idx

# Print the selected sense index for each example sentence, one per line.
for sense_idx in (compute_sense_idx(tokens, target) for tokens in sentences):
    print(sense_idx)

2
2
0
3
