## Data dependencies

In [1]:
!sha1sum ../data/pwn_semrel_pairs.json  # from 12.10

2ef1e20540a9724ef89f4da60dd883eb6993d60c  ../data/pwn_semrel_pairs.json


## Load data

In [2]:
import json
import re
import time
import random
from pathlib import Path
from io import StringIO
from itertools import chain
from tqdm import tqdm
from nltk.corpus import wordnet as wn

In [3]:
# Decode the file explicitly as UTF-8 (the JSON interchange encoding) instead of
# relying on the platform's locale default, so loading behaves the same everywhere.
pwn_pairs = json.loads(
    Path("../data/pwn_semrel_pairs.json").read_text(encoding="utf-8")
)
# Each relation maps to a list of synset-id pairs.
hypernym_pairs = pwn_pairs["hypernymy"]
holonym_pairs = pwn_pairs["holonymy"]

In [4]:
# Show how many pairs were loaded for each relation.
len(hypernym_pairs), len(holonym_pairs)

(1285, 164)

In [5]:
# Prompt template with `{name}` placeholders filled in by `build_prompt`:
#   a_lemma / a_def  - the word being described and its definition
#   semrel           - relation name inserted verbatim into the second sentence
#   b_lemma / b_def  - the related word and its definition
# The literal starts and ends with a newline; `prompt_from_semrel` strips it
# before formatting.
def_task_prompt_base = """
The definition of "{a_lemma}" is {a_def}
The {semrel} of the word "{a_lemma}" is {b_lemma}: {b_def}
"""

def get_lemma_def(synid):
    """Return ``(lemma_name, definition)`` for the WordNet synset ``synid``.

    The lemma name is taken from the first entry of the synset's ``lemmas()``
    list; the definition is the synset's ``definition()`` string.
    """
    synset = wn.synset(synid)
    first_lemma = synset.lemmas()[0]
    return first_lemma.name(), synset.definition()
    
def build_prompt(fstring, **kwargs):
    """Fill ``{name}`` placeholders in ``fstring`` and record where each value lands.

    Args:
        fstring: Template text. Every ``{word}`` occurrence (``\\w+`` between
            braces) is treated as a placeholder.
        **kwargs: Values for the placeholders, looked up by name and converted
            with ``str()``.

    Returns:
        ``(outstr, var_locations)`` where ``outstr`` is the filled-in text and
        ``var_locations`` is a list of ``(name, value, start, end)`` tuples, one
        per placeholder occurrence in template order, such that
        ``outstr[start:end] == value``.

    Raises:
        KeyError: If a placeholder name has no matching keyword argument.
    """
    pieces = []
    var_locations = []
    template_pos = 0   # how far into the template we have consumed
    out_len = 0        # number of characters emitted so far

    # Substituted values are never rescanned: matching runs over the template only.
    for hit in re.finditer(r'\{(\w+)\}', fstring):
        name = hit.group(1)
        value = str(kwargs[name])

        literal = fstring[template_pos:hit.start()]
        pieces.append(literal)
        out_len += len(literal)

        start = out_len
        pieces.append(value)
        out_len += len(value)
        var_locations.append((name, value, start, out_len))

        template_pos = hit.end()

    # Trailing literal text after the last placeholder (or the whole template).
    pieces.append(fstring[template_pos:])
    outstr = "".join(pieces)

    # Sanity check: every recorded span must slice back to its value.
    for _, value, start, end in var_locations:
        assert outstr[start:end] == value

    return outstr, var_locations

def prompt_from_semrel(semrel, pair):
    """Build the definition prompt for one synset pair.

    Args:
        semrel: Relation name, either ``"hypernym"`` or ``"holonym"``; it is
            inserted verbatim into the prompt.
        pair: Two synset ids. The first fills the ``b_lemma``/``b_def`` slots
            of the template (the related word), the second fills
            ``a_lemma``/``a_def`` (the word being described).

    Returns:
        The ``(prompt, var_locations)`` tuple produced by ``build_prompt``.
    """
    assert semrel in ("hypernym", "holonym")
    upper_id, lower_id = pair
    upper_lemma, upper_def = get_lemma_def(upper_id)
    lower_lemma, lower_def = get_lemma_def(lower_id)

    fields = {
        "semrel": semrel,
        "a_lemma": lower_lemma,
        "a_def": lower_def,
        "b_lemma": upper_lemma,
        "b_def": upper_def,
    }
    template = def_task_prompt_base.strip()
    return build_prompt(template, **fields)

In [6]:
# Smoke check: prompt and placeholder spans for the first hypernym pair.
prompt_from_semrel("hypernym", hypernym_pairs[0])

('The definition of "forfeit" is lose (s.th.) or lose the right to (s.th.) by some error, offense, or crime\nThe hypernym of the word "forfeit" is abandon: forsake, leave behind',
 [('a_lemma', 'forfeit', 19, 26),
  ('a_def',
   'lose (s.th.) or lose the right to (s.th.) by some error, offense, or crime',
   31,
   105),
  ('semrel', 'hypernym', 110, 118),
  ('a_lemma', 'forfeit', 132, 139),
  ('b_lemma', 'abandon', 144, 151),
  ('b_def', 'forsake, leave behind', 153, 174)])

In [7]:
# Smoke check: prompt and placeholder spans for the first holonym pair.
prompt_from_semrel("holonym", holonym_pairs[0])

('The definition of "scene" is a subdivision of an act of a play\nThe holonym of the word "scene" is act: a subdivision of a play or opera or ballet',
 [('a_lemma', 'scene', 19, 24),
  ('a_def', 'a subdivision of an act of a play', 29, 62),
  ('semrel', 'holonym', 67, 74),
  ('a_lemma', 'scene', 88, 93),
  ('b_lemma', 'act', 98, 101),
  ('b_def', 'a subdivision of a play or opera or ballet', 103, 145)])

## Make dataset

In [8]:
def shuffle_pairs(data_pairs, rng):
    """
    Re-pair the data by permuting the first elements across pairs.

    The second element of every pair stays at its original position; the first
    elements are shuffled with ``rng`` and re-attached in the shuffled order.
    The input list is not modified. Note that a plain shuffle may leave some
    positions unchanged, so an output pair can coincide with an input pair.

    Args:
        data_pairs (list): A list of two-element sequences.
        rng: Random generator providing ``shuffle`` (e.g. ``random.Random``);
            its state is advanced by the call.

    Returns:
        list: A new list of ``(first, second)`` tuples with the same length as
        ``data_pairs``.
    """
    permuted_firsts = [pair[0] for pair in data_pairs]
    fixed_seconds = [pair[1] for pair in data_pairs]
    # The permutation drawn by shuffle depends only on the list length and the
    # rng state, so shuffling the first elements directly is equivalent to
    # shuffling whole pairs and then taking their first elements.
    rng.shuffle(permuted_firsts)
    return list(zip(permuted_firsts, fixed_seconds))

In [9]:
# Fixed seed so the permuted pairs are reproducible across runs.
rng = random.Random(123)
# Both shuffles draw from the same generator, so their order matters: the
# holonym permutation depends on the rng state left by the hypernym shuffle.
perm_hypernym_pairs = shuffle_pairs(hypernym_pairs, rng)
perm_holonym_pairs = shuffle_pairs(holonym_pairs, rng)
# Accumulator for the dataset records appended by the following cells.
# Re-run this cell before re-running them, otherwise records are appended twice.
datalist = []

### build hypernym pairs

In [10]:
# Append one record per hypernym pair: first the pairs as loaded ("emp"),
# then the pairs whose first elements were shuffled ("perm").
for pair_type, pair_source in (("emp", hypernym_pairs), ("perm", perm_hypernym_pairs)):
    for pair_x in pair_source:
        prompt, var_loc = prompt_from_semrel("hypernym", pair_x)
        record = {
            "semrel": "hypernym",
            "type": pair_type,
            "pair": list(pair_x),
            "prompt": prompt,
            "var_loc": var_loc,
        }
        datalist.append(record)

### build holonym pairs

In [11]:

# Append one record per holonym pair: first the pairs as loaded ("emp"),
# then the pairs whose first elements were shuffled ("perm").
for pair_type, pair_source in (("emp", holonym_pairs), ("perm", perm_holonym_pairs)):
    for pair_x in pair_source:
        prompt, var_loc = prompt_from_semrel("holonym", pair_x)
        record = {
            "semrel": "holonym",
            "type": pair_type,
            "pair": list(pair_x),
            "prompt": prompt,
            "var_loc": var_loc,
        }
        datalist.append(record)

In [12]:
# Serialize all records to a single JSON file. The encoding is stated
# explicitly so the write does not depend on the platform's locale default;
# json.dumps escapes non-ASCII characters by default, so the bytes written are
# the same as before.
out_path = Path("../data/pwn_semrel_dataset.json")
out_path.write_text(json.dumps(datalist), encoding="utf-8")
!sha1sum {str(out_path)}

5236da3d32f4a5a1afe42eedc40e1a0fe7b87edc  ../data/pwn_semrel_dataset.json
