In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
from nltk.corpus import wordnet as wn
from tqdm import tqdm

import xml.etree.ElementTree

### 1. Extract definitions for all candidate senses from WordNet (via nltk)

In [2]:
# Tab-separated candidates file: word, one-letter POS tag, then one or more sense ids per line
# (the format is enforced by the assert in read_candidates).
candidates_file = '../WSD_Evaluation_Framework/Data_Validation/candidatesWN30.txt'

In [3]:
def read_candidates(file):
    """Read the tab-separated candidate-sense file.

    Every line must contain a word, a one-letter POS tag (one of 'v', 'a',
    'n', 'r') and at least one sense id, separated by tabs.

    Args:
        file: path of the candidates file.

    Returns:
        A pair ``(kv, wn_ids)``: ``kv`` is a ``defaultdict(list)`` mapping
        ``(word, tag)`` to the list of sense ids found on that line, and
        ``wn_ids`` is the set of every sense id seen in the file.

    Raises:
        AssertionError: if a line has fewer than three fields or a tag
            outside the expected set.
    """
    kv = defaultdict(list)
    wn_ids = set()
    with open(file) as f:
        for line in tqdm(f, position=0):
            # Strip only the line terminator. `line[:-1]` would eat the last
            # character of the final sense id when the file does not end
            # with a newline.
            s = line.rstrip('\n').split('\t')
            assert len(s) >= 3 and s[1] in ['v','a','n','r']
            kv[(s[0], s[1])] = s[2:]
            wn_ids.update(s[2:])
    return kv, wn_ids

In [64]:
# Load the (word, tag) -> sense-id list mapping and the set of all sense ids.
w_t2wn, wn_id_set = read_candidates(candidates_file)

155287it [00:00, 169445.94it/s]


In [65]:
# Number of (word, tag) entries and number of distinct sense ids.
len(w_t2wn), len(wn_id_set)

(155287, 206941)

In [66]:
# Spot-check the candidate sense ids stored for one (word, tag) entry.
w_t2wn[('1000000000000', 'n')]

['1000000000000%1:23:01::', '1000000000000%1:23:00::']

In [67]:
# Lookup tables keyed by sense id (e.g. shuffle%1:04:00::):
#   wn2definition -> definition string of the synset owning that sense
#   wn2examples   -> that synset's example sentences (only stored when non-empty)
wn2definition = defaultdict(str)
wn2examples = defaultdict(list)

for word, _tag in tqdm(w_t2wn.keys(), position=0):
    synsets = wn.synsets(word)
    assert len(synsets) > 0
    for synset in synsets:
        for lemma in synset.lemmas():
            sense_key = lemma.key()
            # Only keep senses that appear in the candidates file.
            if sense_key not in wn_id_set:
                continue
            wn2definition[sense_key] = synset.definition()
            examples = synset.examples()
            if len(examples) > 0:
                # Keep every example sentence, not just the first one.
                wn2examples[sense_key] = examples

100%|██████████| 155287/155287 [00:03<00:00, 44894.26it/s]


In [68]:
# Number of sense ids with a definition / with at least one example sentence.
# (The dict built above is `wn2definition`; `wn2sense` is never defined.)
len(wn2definition), len(wn2examples)

(206941, 59634)

In [76]:
# (word, tag) -> [(sense_id, definition, [example, ...]), ...]
w_t2wn_definition_examples = defaultdict(list)

for key, sense_keys in tqdm(w_t2wn.items(), position=0):
    # Look up with .get() instead of indexing: wn2definition / wn2examples are
    # defaultdicts, so `d[k]` would insert an empty entry for every sense id
    # that has no definition/example and silently change the tables' sizes.
    data = [
        (sense_key, wn2definition.get(sense_key, ''), wn2examples.get(sense_key, []))
        for sense_key in sense_keys
    ]
    if len(data) > 0:
        w_t2wn_definition_examples[key] = data

100%|██████████| 155287/155287 [00:01<00:00, 147903.07it/s]


In [81]:
# Number of (word, tag) entries in the assembled mapping.
len(w_t2wn_definition_examples)

155287

In [84]:
# Spot-check the (sense_id, definition, examples) tuples for the noun 'king'.
w_t2wn_definition_examples[('king', 'n')]

[('king%1:18:00::', 'a male sovereign; ruler of a kingdom', []),
 ('king%1:18:02::', 'a competitor who holds a preeminent position', []),
 ('king%1:18:01::',
  'a very wealthy or powerful businessman',
  ['an oil baron']),
 ('king%1:26:00::',
  'preeminence in a particular category or group or field',
  ['the lion is the king of beasts']),
 ('king%1:18:05::', 'United States woman tennis player (born in 1943)', []),
 ('king%1:18:04::',
  'United States guitar player and singer of the blues (born in 1925)',
  []),
 ('king%1:18:03::',
  'United States charismatic civil rights leader and Baptist minister who campaigned against the segregation of Blacks (1929-1968)',
  []),
 ('king%1:06:02::',
  "a checker that has been moved to the opponent's first row where it is promoted to a piece that is free to move either forward or backward",
  []),
 ('king%1:06:01::',
  'one of the four playing cards in a deck bearing the picture of a king',
  []),
 ('king%1:06:00::', '(chess) the weakest but the m

Save (pickle) the mapping (word, tag) -> [(WN_id_1, definition_1, [example_of_definition_1_1, ...]), ..., (WN_id_N, definition_N, [...])]

In [87]:
# The defaultdict is converted to a plain dict and stored by np.save as a pickled
# 0-d object array; read it back with
# np.load('word_tag_2_wordnet_definition_examples.npy', allow_pickle=True).item()
np.save('word_tag_2_wordnet_definition_examples.npy', dict(w_t2wn_definition_examples))

### 2. Prepare WSD datasets SemEval-2007, SemEval-2013, SemEval-2015
Parse the XML, whose tokens carry (word, tag, sample_id) triplets

In [89]:
# Path of the SemEval-2007 corpus XML, relative to the notebook.
semval7_data_file = '../WSD_Evaluation_Framework/Evaluation_Datasets/semeval2007/semeval2007.data.xml'

In [137]:
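# Example attribute lists of token elements (what Element.items() yields), one token per line;
# only the annotated target tokens carry an 'id':
# [('lemma', 'you'), ('pos', 'PRON')]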
# [('lemma', 'oct.'), ('pos', 'NOUN')]
# [('lemma', '6'), ('pos', 'NUM')]
# [('lemma', 'editorial'), ('pos', 'NOUN')]
# [('lemma', '``'), ('pos', '.')]
# [('lemma', 'the'), ('pos', 'DET')]
# [('lemma', 'ill'), ('pos', 'NOUN')]
# [('lemma', 'homeless'), ('pos', 'NOUN')]
# [('lemma', '``'), ('pos', '.')]
# [('id', 'd000.s000.t000'), ('lemma', 'refer'), ('pos', 'VERB')]
# [('lemma', 'to'), ('pos', 'PRT')]
# [('id', 'd000.s000.t001'), ('lemma', 'research'), ('pos', 'NOUN')]

In [162]:
def read_corpus_from_xml(file):
    """Parse a corpus XML file into one record per sentence.

    The root element is searched for ``text`` children, each of which is
    searched for ``sentence`` children; every child element of a sentence is
    treated as a token and must carry ``lemma`` and ``pos`` attributes, and
    may carry an ``id`` attribute.

    Args:
        file: path (or file object) of the XML corpus.

    Returns:
        A list of dicts, one per sentence, with three parallel lists:
        ``'sentence'`` (token lemmas), ``'tags'`` (first letter of each
        token's ``pos``, lower-cased) and ``'target_ids'`` (the token's ``id``
        attribute, or ``None`` when it has none).

    Raises:
        KeyError: if a token lacks a ``lemma`` or ``pos`` attribute.
    """
    corpus = []
    corpus_xml = xml.etree.ElementTree.parse(file).getroot()

    for text_xml in tqdm(corpus_xml.findall('text'), position=0):
        for sent_xml in text_xml.findall('sentence'):
            sent = []
            tags = []
            target_ids = []
            for token in sent_xml:
                # Read attributes by name rather than by position in
                # token.items(): positional access silently returns the wrong
                # field if the attributes are ordered differently or an extra
                # attribute is present.
                sent.append(token.attrib['lemma'])
                # NOTE(review): only the first letter of `pos` is kept, so
                # 'ADJ' and 'ADV' both become 'a', while read_candidates also
                # accepts the tag 'r' -- confirm how adverbs are meant to be
                # matched downstream.
                tags.append(token.attrib['pos'][0].lower())
                # Something like d000.s000.t001; absent on non-target tokens.
                target_ids.append(token.get('id'))
            corpus.append({'sentence': sent, 'tags': tags, 'target_ids': target_ids})

    return corpus

In [163]:
# Parse the SemEval-2007 corpus; the path variable defined earlier is
# `semval7_data_file` (`semval7_datafile` is never assigned).
semeval2007 = read_corpus_from_xml(semval7_data_file)

100%|██████████| 3/3 [00:00<00:00, 959.06it/s]
