## Data dependencies

In [1]:
# Print the SHA-1 digest of the input frequency file this notebook reads.
!sha1sum ../data/semcor_synfreq.json

e5581f09fa9a56b2bfa80badf9af6e5ddfd1e7b5  ../data/semcor_synfreq.json


## Load Data

In [2]:
import json
from pathlib import Path
from tqdm.auto import tqdm
from nltk.corpus import wordnet as wn

In [3]:
# Map synset name -> frequency, keeping only entries whose frequency exceeds 5.
# dict() is applied to the parsed JSON before filtering, exactly as loaded.
synfreq = {
    name: freq
    for name, freq in dict(
        json.loads(Path("../data/semcor_synfreq.json").read_text())
    ).items()
    if freq > 5
}
def in_synfreq(x):
    """Lazily select the items of ``x`` whose ``name()`` is a key of ``synfreq``.

    Args:
        x: iterable of objects exposing a ``name()`` method.

    Returns:
        A one-shot iterator over the matching items, in their original order.
    """
    return (candidate for candidate in x if candidate.name() in synfreq)

# Candidate pairs are stored as (synset, related synset) tuples:
# (synset, hyponym) for hypernymy and (synset, meronym) for holonymy.
hypernymy_pairs = []
holonymy_pairs = []

for syn_x in tqdm(wn.all_synsets()):
    # Both ends of a pair must be keys of synfreq (i.e. frequency > 5).
    if syn_x.name() in synfreq:
        hypernymy_pairs.extend(
            (syn_x, hypo_x) for hypo_x in in_synfreq(syn_x.hyponyms())
        )
        # Member, substance and part meronyms all count as holonymy, in that order.
        holonymy_pairs.extend(
            (syn_x, mero_x)
            for meronyms in (
                syn_x.member_meronyms(),
                syn_x.substance_meronyms(),
                syn_x.part_meronyms(),
            )
            for mero_x in in_synfreq(meronyms)
        )

0it [00:00, ?it/s]

In [4]:
# Number of collected (hypernymy, holonymy) pairs.
len(hypernymy_pairs), len(holonymy_pairs)

(3036, 278)

In [5]:
def dedup_pairs(pairs):
    """Greedily keep pairs so that no name repeats within a position.

    Pairs are scanned in order. A pair is dropped if its first element's name
    was already kept as a first element, or its second element's name was
    already kept as a second element.

    Args:
        pairs: iterable of ``(a, b)`` tuples whose items expose ``name()``.

    Returns:
        A set of ``(a.name(), b.name())`` tuples for the pairs that were kept.
    """
    seen_first, seen_second = set(), set()
    kept = set()
    for first, second in pairs:
        first_name = first.name()
        if first_name in seen_first:
            continue
        second_name = second.name()
        if second_name in seen_second:
            continue
        seen_first.add(first_name)
        seen_second.add(second_name)
        kept.add((first_name, second_name))
    return kept
# Deduplicate each relation, then order the (name, name) tuples by their first
# name so the resulting lists have a replicable ordering.
hypernymy_pairs_dedup = sorted(
    dedup_pairs(hypernymy_pairs), key=lambda pair: pair[0]
)
holonymy_pairs_dedup = sorted(
    dedup_pairs(holonymy_pairs), key=lambda pair: pair[0]
)

In [6]:
# Number of (hypernymy, holonymy) pairs left after deduplication.
len(hypernymy_pairs_dedup), len(holonymy_pairs_dedup)

(1285, 164)

In [7]:
## Sanity check, per relation: every name in a pair is a key of synfreq, and no
## name occurs more than twice overall (expected: at most once as the first
## element and at most once as the second).
from collections import Counter

for pairs in (hypernymy_pairs_dedup, holonymy_pairs_dedup):
    counter = Counter()
    for upper, lower in pairs:
        assert upper in synfreq and lower in synfreq
        counter.update((upper, lower))
    assert max(counter.values(), default=0) <= 2

In [8]:
Path("../data/pwn_semrel_pairs.json").write_text(
     json.dumps({"hypernymy": hypernymy_pairs_dedup, "holonymy": holonymy_pairs_dedup}))
!sha1sum ../data/pwn_semrel_pairs.json

2ef1e20540a9724ef89f4da60dd883eb6993d60c  ../data/pwn_semrel_pairs.json
