In [1]:
from nltk.wsd import lesk
from nltk.corpus import wordnet as wn
from tqdm import tqdm
import pandas as pd
import spacy

# Load the spaCy pipeline 'en_core_web_md'; it is called on every relation
# phrase further down to obtain tokens and their POS tags.
nlp = spacy.load('en_core_web_md')

In [2]:
# Column labels for the headerless gold-alignment TSV (14 fields per row).
data_header = [
    'e1_kb', 'rel_kb', 'rel_id', 'e2_kb',
    'e1_oie', 'rel_oie', 'e2_oie',
    'e1_kb_id', 'e2_kb_id', 'e1_oie_id', 'e2_oie_id',
    'e1_oie_root', 'e2_oie_root', 'label',
]
# Prefix selecting which file to process (commented-out alternative: 'v10/test').
dt_type = 'test'
align_data = pd.read_csv(
    '../{}_align_gold_v2.tsv'.format(dt_type),
    sep='\t',
    header=None,
    names=data_header,
)

In [3]:
# Display the freshly loaded alignment table.
align_data

Unnamed: 0,e1_kb,rel_kb,rel_id,e2_kb,e1_oie,rel_oie,e2_oie,e1_kb_id,e2_kb_id,e1_oie_id,e2_oie_id,e1_oie_root,e2_oie_root,label
0,Cronus,child,P40,Zeus,Cronus,' son is,Zeus,Q44204,Q34201,Q44204,Q34201,,,0
1,A Royal Christmas,distributor,P750,Hallmark Channel,Royal Christmas,aired on,Hallmark Channel,Q18611433,Q15501374,Q18611433,Q15501374,,,0
2,Mystic Ark,developer,P178,Produce,Mystic Ark,also developed by,Produce,Q6948921,Q7247680,Q6948921,Q7247680,,,0
3,Nisqually River,mouth of the watercourse,P403,Puget Sound,Nisqually River,also lived throughout,Puget Sound,Q829444,Q604141,Q829444,Q604141,,,1
4,Big Fish,filming location,P915,Alabama,Big Fish,Apart was entirely shot in,Alabama,Q278997,Q173,Q278997,Q173,,,0
5,Jane Menelaus,spouse,P26,Geoffrey Rush,Jane Menelaus,appeared with,Geoffrey Rush,Q2063601,Q166272,Q2063601,Q166272,,,1
6,Diddy Kong,present in work,P1441,Donkey Kong Country,Diddy Kong,appearing in,Donkey Kong Country,Q10904385,Q518987,Q10904385,Q518987,,,0
7,Deadpool,member of,P463,X-Force,Deadpool,appearing in,X-Force,Q1631090,Q938034,Q1631090,Q938034,,,0
8,Stony Brook Seawolves,represents,P1268,Stony Brook University,Stony Brook Seawolves,are athletic teams of,Stony Brook University,Q7619507,Q969850,Q7619507,Q969850,,,1
9,Garuda,facet of,P1269,Buddhism,Garuda,are birds In,Buddhism,Q188676,Q748,Q188676,Q748,,,0


In [4]:
def spacy_to_wn_tags(pos_tag):
    """Translate a spaCy POS tag string into a WordNet POS constant.

    'ADJ' maps to ``wn.ADJ``; 'ADV' and 'ADP' both map to ``wn.ADV``; any tag
    starting with 'V' maps to ``wn.VERB`` and any tag starting with 'N' maps
    to ``wn.NOUN``. Every other tag yields ``None``.
    """
    if pos_tag == 'ADJ':
        return wn.ADJ
    if pos_tag in ('ADV', 'ADP'):
        return wn.ADV
    # Prefix rules are checked in order: verbs first, then nouns.
    for prefix, wn_pos in (('V', wn.VERB), ('N', wn.NOUN)):
        if pos_tag.startswith(prefix):
            return wn_pos
    return None

# Build one definition string per row: every non-punctuation token of the OIE
# relation phrase is replaced by the first ';'-separated clause of a WordNet
# definition, and the pieces are joined with spaces.
rels_oie_def = []
# Iterate over the column values positionally so the loop does not rely on the
# frame having a default 0..n-1 index.
row_iter = zip(align_data['e1_oie'], align_data['rel_oie'], align_data['e2_oie'])
for e1, rel, e2 in tqdm(row_iter, total=len(align_data)):
    if not isinstance(rel, str):
        # A missing field is loaded by read_csv as NaN, and nlp() expects text.
        # Keep the raw value so the output list stays aligned with the rows.
        rels_oie_def.append(rel)
        continue

    defs = []
    for token in nlp(rel):
        word = token.text
        if word == "'s":
            # The possessive marker is kept verbatim instead of being looked up.
            defs.append(word)
            continue
        if token.pos_ == 'PUNCT':
            continue

        if spacy_to_wn_tags(token.pos_) == wn.ADV:
            # Tags mapped to the adverb POS: take the first adverb synset
            # WordNet returns, without disambiguation.
            syns = wn.synsets(word, pos=wn.ADV)
            syn = syns[0] if syns else None
        else:
            # Everything else: Lesk, with the token and the two entity strings
            # as the context.
            # NOTE(review): the mapped POS is not passed to lesk(), so the
            # selected sense may belong to another part of speech - confirm
            # this is intended before changing it, since it alters the output.
            syn = lesk([word, e1, e2], word)

        if syn is not None:
            defs.append(syn.definition().split('; ')[0])

    # Fall back to the raw phrase when no token produced a definition.
    rels_oie_def.append(' '.join(defs) if defs else rel)

100%|██████████| 400/400 [00:04<00:00, 90.25it/s]


In [5]:
# Number of definition strings produced; the loop above appends exactly one
# entry per row of align_data.
len(rels_oie_def)

400

In [6]:
# Attach the generated definitions as a new column. It is named 'oie_def' to
# match the label this column gets when the file is read back in and when the
# final column order is selected.
align_data['oie_def'] = rels_oie_def

In [7]:
# Write the table, including the added definition column, as a headerless TSV
# without the index.
align_data.to_csv(
    '../{}_align_gold_def_v2.tsv'.format(dt_type),
    sep='\t',
    header=False,
    index=False,
)

In [8]:
# Column labels for the definitions TSV: the 14 original fields plus 'oie_def'.
data_header = [
    'e1_kb', 'rel_kb', 'rel_id', 'e2_kb',
    'e1_oie', 'rel_oie', 'e2_oie',
    'e1_kb_id', 'e2_kb_id', 'e1_oie_id', 'e2_oie_id',
    'e1_oie_root', 'e2_oie_root', 'label', 'oie_def',
]
align_data = pd.read_csv(
    '../{}_align_gold_def_v2.tsv'.format(dt_type),
    sep='\t',
    header=None,
)
# The last cell of this notebook overwrites this same path with a 13-column
# table. Passing names= to read_csv would silently pad the missing columns with
# NaN and put the data under the wrong labels, so check the width first and
# only then attach the labels.
if align_data.shape[1] != len(data_header):
    raise ValueError(
        'expected {} columns in the definitions TSV but found {}; '
        're-run the cell that writes the {}-column file first'.format(
            len(data_header), align_data.shape[1], len(data_header)))
align_data.columns = data_header

In [9]:
# Display the re-loaded table, which now carries the 'oie_def' column.
align_data

Unnamed: 0,e1_kb,rel_kb,rel_id,e2_kb,e1_oie,rel_oie,e2_oie,e1_kb_id,e2_kb_id,e1_oie_id,e2_oie_id,e1_oie_root,e2_oie_root,label,oie_def
0,Cronus,child,P40,Zeus,Cronus,' son is,Zeus,Q44204,Q34201,Q44204,Q34201,,,0,"the divine word of God have an existence, be e..."
1,A Royal Christmas,distributor,P750,Hallmark Channel,Royal Christmas,aired on,Hallmark Channel,Q18611433,Q15501374,Q18611433,Q15501374,,,0,expose to cool or cold air so as to cool or fr...
2,Mystic Ark,developer,P178,Produce,Mystic Ark,also developed by,Produce,Q6948921,Q7247680,Q6948921,Q7247680,,,0,in addition create by training and teaching so...
3,Nisqually River,mouth of the watercourse,P403,Puget Sound,Nisqually River,also lived throughout,Puget Sound,Q829444,Q604141,Q829444,Q604141,,,1,in addition continue to live through hardship ...
4,Big Fish,filming location,P915,Alabama,Big Fish,Apart was entirely shot in,Alabama,Q278997,Q173,Q278997,Q173,,,0,separated or at a distance in place or positio...
5,Jane Menelaus,spouse,P26,Geoffrey Rush,Jane Menelaus,appeared with,Geoffrey Rush,Q2063601,Q166272,Q2063601,Q166272,,,1,give a certain impression or have a certain ou...
6,Diddy Kong,present in work,P1441,Donkey Kong Country,Diddy Kong,appearing in,Donkey Kong Country,Q10904385,Q518987,Q10904385,Q518987,,,0,give a certain impression or have a certain ou...
7,Deadpool,member of,P463,X-Force,Deadpool,appearing in,X-Force,Q1631090,Q938034,Q1631090,Q938034,,,0,give a certain impression or have a certain ou...
8,Stony Brook Seawolves,represents,P1268,Stony Brook University,Stony Brook Seawolves,are athletic teams of,Stony Brook University,Q7619507,Q969850,Q7619507,Q969850,,,1,"have an existence, be extant having a sturdy a..."
9,Garuda,facet of,P1269,Buddhism,Garuda,are birds In,Buddhism,Q188676,Q748,Q188676,Q748,,,0,"have an existence, be extant watch and study b..."


In [10]:
# Final column order: the two '*_oie_root' fields are left out and 'oie_def'
# is placed just before 'label'.
align_data_arranged = align_data.loc[:, [
    'e1_kb', 'rel_kb', 'rel_id', 'e2_kb',
    'e1_oie', 'rel_oie', 'e2_oie',
    'e1_kb_id', 'e2_kb_id', 'e1_oie_id', 'e2_oie_id',
    'oie_def', 'label',
]]

In [11]:
# Write the re-ordered 13-column table as a headerless TSV. This is the same
# path the 15-column table was written to earlier, so that file is replaced.
align_data_arranged.to_csv(
    '../{}_align_gold_def_v2.tsv'.format(dt_type),
    sep='\t',
    header=False,
    index=False,
)