In [3]:
import numpy as np
import pandas as pd
from collections import defaultdict
from nltk.corpus import wordnet as wn
from tqdm import tqdm

### 1.Extract definitions for all senses-candidates from WordNet (via nltk)

In [4]:
candidates_file = '../WSD_Evaluation_Framework/Data_Validation/candidatesWN30.txt'

In [5]:
def read_candidates(file):
    kv = defaultdict(list)
    wn_ids = set()
    with open(file) as f:
        for line in tqdm(f,position=0):
            s = line[:-1].split('\t')
            assert len(s) >= 3 and s[1] in ['v','a','n','r']
            kv[(s[0],s[1])] = s[2:]
            for t in s[2:]:
                wn_ids.add(t)
    return kv, wn_ids

In [6]:
w_t2wn, wn_id_set = read_candidates(candidates_file)

155287it [00:00, 301669.99it/s]


In [7]:
len(w_t2wn), len(wn_id_set)

(155287, 206941)

In [8]:
w_t2wn[('1000000000000', 'n')]

['1000000000000%1:23:01::', '1000000000000%1:23:00::']

In [9]:
wn2definition = defaultdict(str)
wn2examples = defaultdict(list)

for k in tqdm(w_t2wn.keys(), position=0):
    candidates = wn.synsets(k[0])
    assert len(candidates) > 0
    for c in candidates:
        for l in c.lemmas():
            wn_id = l.key()  # something like shuffle%1:04:00::
            if wn_id in wn_id_set:
                wn2definition[wn_id] = c.definition()  # sentence(str)
                if len(c.examples()) > 0:
                    wn2examples[wn_id] = c.examples()  # list of sentences (save all of them just in case)

100%|██████████| 155287/155287 [00:15<00:00, 10309.72it/s]


In [10]:
len(wn2definition), len(wn2examples)

(206941, 59634)

In [11]:
w_t2wn_definition_examples = defaultdict(list)

for k in tqdm(w_t2wn.keys(), position=0):
    data = [(id, wn2definition[id], wn2examples[id]) for id in w_t2wn[k]]
    if len(data) > 0:
        w_t2wn_definition_examples[k] = data

100%|██████████| 155287/155287 [00:00<00:00, 177977.10it/s]


In [12]:
len(w_t2wn_definition_examples)

155287

In [13]:
w_t2wn_definition_examples[('king', 'n')]

[('king%1:18:00::', 'a male sovereign; ruler of a kingdom', []),
 ('king%1:18:02::', 'a competitor who holds a preeminent position', []),
 ('king%1:18:01::',
  'a very wealthy or powerful businessman',
  ['an oil baron']),
 ('king%1:26:00::',
  'preeminence in a particular category or group or field',
  ['the lion is the king of beasts']),
 ('king%1:18:05::', 'United States woman tennis player (born in 1943)', []),
 ('king%1:18:04::',
  'United States guitar player and singer of the blues (born in 1925)',
  []),
 ('king%1:18:03::',
  'United States charismatic civil rights leader and Baptist minister who campaigned against the segregation of Blacks (1929-1968)',
  []),
 ('king%1:06:02::',
  "a checker that has been moved to the opponent's first row where it is promoted to a piece that is free to move either forward or backward",
  []),
 ('king%1:06:01::',
  'one of the four playing cards in a deck bearing the picture of a king',
  []),
 ('king%1:06:00::', '(chess) the weakest but the m

Pickle mapping (word, tag) -> [(WN_id_1, definition_1, [example_of_definition_1_1,...]), ...(WN_id_N, definition_N, [...])]

In [14]:
w_t2wn_definition_examples = dict(w_t2wn_definition_examples)
np.save('word_tag_2_wordnet_definition_examples.npy', w_t2wn_definition_examples)

### 2.Prepare WSD datasets: train/eval set - SemCor, test sets - Semeval2007, Semeval2013, Semeval2015 
(Parse xml with triplets (word, tag, sample_id))

In [15]:
semcor_data_file = '../WSD_Evaluation_Framework/Training_Corpora/SemCor/semcor.data.xml'

semval7_data_file = '../WSD_Evaluation_Framework/Evaluation_Datasets/semeval2007/semeval2007.data.xml'
semval13_data_file = '../WSD_Evaluation_Framework/Evaluation_Datasets/semeval2013/semeval2013.data.xml'
semval15_data_file = '../WSD_Evaluation_Framework/Evaluation_Datasets/semeval2015/semeval2015.data.xml'

In [16]:
# EXAMPLE from Semeval2007
#
# [('lemma', 'you'), ('pos', 'PRON')]
# [('lemma', 'oct.'), ('pos', 'NOUN')]
# [('lemma', '6'), ('pos', 'NUM')]
# [('lemma', 'editorial'), ('pos', 'NOUN')]
# [('lemma', '``'), ('pos', '.')]
# [('lemma', 'the'), ('pos', 'DET')]
# [('lemma', 'ill'), ('pos', 'NOUN')]
# [('lemma', 'homeless'), ('pos', 'NOUN')]
# [('lemma', '``'), ('pos', '.')]
# [('id', 'd000.s000.t000'), ('lemma', 'refer'), ('pos', 'VERB')]
# [('lemma', 'to'), ('pos', 'PRT')]
# [('id', 'd000.s000.t001'), ('lemma', 'research'), ('pos', 'NOUN')]

In [17]:
import xml.etree.ElementTree

In [18]:
def read_corpus_from_xml(file):

    corpus = []
    corpus_xml = xml.etree.ElementTree.parse(file).getroot()

    for text_xml in tqdm(corpus_xml.findall('text'), position=0):
        for sent_xml in text_xml.findall('sentence'):
            sent = []
            tags = []
            target_ids = []
            for token in sent_xml:
                token = token.items()
                sent.append(token[-2][1])  # 
                tags.append(str(token[-1][1][0]).lower())  # first lowercase letter of tag
                target_ids.append(token[0][1] if len(token) > 2 else None)  #  something like d000.s000.t001 (optional)
            corpus.append({'sentence': sent, 'tags': tags, 'target_ids': target_ids})
            
    return corpus

In [19]:
semeval2007 = read_corpus_from_xml(semval7_data_file)

100%|██████████| 3/3 [00:00<00:00, 759.10it/s]


In [77]:
semeval2013 = read_corpus_from_xml(semval13_data_file)

100%|██████████| 13/13 [00:00<00:00, 1133.01it/s]


In [20]:
print(semeval2007[1]['sentence'][:10])
print(semeval2007[1]['tags'][:10])
print(semeval2007[1]['target_ids'][:10])

['you', 'comment', 'imply', 'we', 'have', 'discover', 'that', 'the', '``', 'principal']
['p', 'n', 'v', 'p', 'v', 'v', 'a', 'd', '.', 'a']
[None, 'd000.s001.t000', 'd000.s001.t001', None, None, 'd000.s001.t002', None, None, None, None]


### Experiment 0. Not fine-tuned raw BERT + best cosine similarity

Sentence 0 - Target lemma,
Sentence 1 - original sentence or each possible definition (tags aren't used)

In [21]:
from pytorch_pretrained_bert import BertTokenizer, BertModel

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [22]:
isinstance('sfdv ', list)

False

In [78]:
corpus = semeval2013
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenize = lambda s: tokenizer.convert_tokens_to_ids(
    tokenizer.wordpiece_tokenizer.tokenize(' '.join(s) if not isinstance(s, str) else s))

problems = []
for sent in tqdm(corpus, position=0):
    ids = tokenize(sent['sentence'])
    tasks = []
    for word, tag, target_id in zip(sent['sentence'], sent['tags'], sent['target_ids']):
        if target_id is not None:
            tasks.append(
                (tokenize(word), target_id,
                 [(sense_data[0] ,tokenize(sense_data[1])) for sense_data in w_t2wn_definition_examples[(word, tag)]]
                ))
    problems.append((ids, tasks))

100%|██████████| 306/306 [00:00<00:00, 1225.54it/s]


In [79]:
len(tokenizer.vocab)

30522

In [25]:
import torch

In [26]:
model = BertModel.from_pretrained('bert-base-uncased')

In [68]:
from scipy.spatial.distance import cosine

In [None]:
def build_sense_embdedding(model, word, sentence):
    tokens_tensor = torch.tensor([word + sentence])
    segments_tensors = torch.tensor([[0] * len(word) + [1] * len(sentence)])

    encoded_layers, _ = model(tokens_tensor, segments_tensors)
    return torch.reshape(torch.mean(
        encoded_layers[-1][:, :len(word), :], dim=1, keepdim=True), shape=(-1,)).detach().numpy()

predicted = []

for problem in tqdm(problems, position=0):
    sentence = problem[0]
    for task in problem[1]:
        word, target_id, senses = task
        labels = [s[0] for s in senses]
        golden_sense = build_sense_embdedding(model, word, sentence)
        definitions = np.array([build_sense_embdedding(model, word, s[1]) for s in senses])
        dists = map(lambda s: cosine(gold_sence,s), definitions)
        predicted.append((target_id, labels[np.argmin(dists)]))

 75%|███████▍  | 229/306 [11:48<03:58,  3.09s/it]

In [72]:
import pandas as pd

In [76]:
pd.DataFrame(predicted).to_csv('baseline_0_semval_2013', index=None, header=None, sep=' ')

#### Results:

Semeval2007 - 55.2 F1

Semeval2013 - tba  F1

Semeval2015 - tba  F1