## Motivations

* `cwn-semrels` builds a prompt dataset from CWN semantic relations (hypernymy and holonymy), as the counterpart of the 14.10 PWN semrel dataset.

In [1]:
import re
import random
import json
from pathlib import Path
from io import StringIO
from CwnGraph import CwnImage, CwnSense

# Shared CWN handle used by the cells below
# (presumably the newest available CWN image, per the method name -- TODO confirm).
cwn = CwnImage.latest()

In [2]:
def make_unique(candid_list):
  """Greedily filter pairs so no first or second member is repeated.

  Pairs are visited in order. A pair is kept only when its first member has
  not appeared as the first member of an already kept pair AND its second
  member has not appeared as the second member of an already kept pair.
  Rejected pairs do not mark their members as seen.

  Args:
    candid_list: iterable of 2-item pairs with hashable members.

  Returns:
    list of ``(first, second)`` tuples, in their original relative order.
  """
  seen_first, seen_second = set(), set()
  kept = []
  for first, second in candid_list:
    if first not in seen_first and second not in seen_second:
      seen_first.add(first)
      seen_second.add(second)
      kept.append((first, second))
  return kept

In [3]:

# Collect sense-to-sense relations from CWN.
# Only relations reported in the "forward" direction whose target is a CwnSense
# are kept. Each collected tuple is (sense_x, sense_y) where sense_y is the
# "hyponym" (resp. "meronym") listed on sense_x, i.e. sense_x plays the
# hypernym (resp. holonym) role in the pair.
senses = cwn.get_all_senses()
hypernyms = []
holonyms = []
for sense_x in senses:
  # `direction` (not `dir`) so the builtin dir() is not shadowed in the notebook namespace.
  for rel, sense_y, direction in sense_x.semantic_relations:
    if direction != "forward": continue
    if not isinstance(sense_y, CwnSense): continue
    if rel == "hyponym":
      hypernyms.append((sense_x, sense_y))
    elif rel == "meronym":
      holonyms.append((sense_x, sense_y))

# Keep each sense at most once per side so no sense dominates the dataset.
hyper_pairs = make_unique(hypernyms)
holo_pairs = make_unique(holonyms)


In [4]:
# Sanity check: number of hypernym / holonym pairs left after make_unique.
len(hyper_pairs), len(holo_pairs)

(549, 77)

In [5]:
# Two-line prompt template: line 1 gives the definition of lemma A, line 2
# states that A's {semrel} is lemma B and gives B's definition.
# The surrounding newlines are removed with .strip() where the template is used.
def_task_prompt_base = """
「{a_lemma}」的釋義是{a_def}
「{a_lemma}」的{semrel}是「{b_lemma}」：{b_def}
"""

def build_prompt(fstring, **kwargs):
    """Fill ``{name}`` placeholders in a template and record where each value lands.

    Placeholders are substituted left to right in a single pass, so text
    inserted for one placeholder is never scanned for further placeholders.

    Args:
        fstring: template text containing ``{name}`` placeholders, where
            ``name`` consists of word characters.
        **kwargs: value for every placeholder name; each is converted with ``str``.

    Returns:
        ``(text, var_locations)`` where ``text`` is the filled template and
        ``var_locations`` is a list of ``(name, value, start, end)`` tuples,
        one per placeholder occurrence, such that ``text[start:end] == value``.

    Raises:
        KeyError: if a placeholder name is missing from ``kwargs``.
    """
    pieces = []
    var_locations = []
    cursor = 0    # read position inside the template
    emitted = 0   # number of characters produced so far
    for hit in re.finditer(r'\{(\w+)\}', fstring):
        name = hit.group(1)
        value = str(kwargs[name])

        literal = fstring[cursor:hit.start()]
        pieces.append(literal)
        emitted += len(literal)

        begin = emitted
        pieces.append(value)
        emitted += len(value)
        var_locations.append((name, value, begin, emitted))
        cursor = hit.end()
    pieces.append(fstring[cursor:])

    # Self-check: every recorded span must slice back to its value.
    outstr = "".join(pieces)
    for _, value, begin, end in var_locations:
        assert outstr[begin:end] == value

    return outstr, var_locations

def prompt_from_semrel(semrel, pair):
    """Render the definition prompt for one sense pair.

    Args:
        semrel: relation name shown in the prompt; must be "上位詞" or "整體詞".
        pair: ``(upper, lower)`` senses. The second sense fills the ``a_*``
            slots of the template (the word being described) and the first
            sense fills the ``b_*`` slots (its hypernym / holonym).

    Returns:
        The ``(prompt, var_loc)`` tuple produced by ``build_prompt``.
    """
    assert semrel in ("上位詞", "整體詞")
    upper, lower = pair

    # Only the first lemma of each sense is used in the prompt.
    upper_lemma, lower_lemma = upper.lemmas[0].lemma, lower.lemmas[0].lemma
    upper_def, lower_def = upper.definition, lower.definition

    template = def_task_prompt_base.strip()
    return build_prompt(
        template,
        semrel=semrel,
        a_lemma=lower_lemma,
        a_def=lower_def,
        b_lemma=upper_lemma,
        b_def=upper_def,
    )

In [6]:
def shuffle_pairs(data_pairs, rng):
    """
    Build re-paired data by permuting the first elements across the pairs.

    The second element of every pair stays at its original position, while the
    first elements are redistributed according to a permutation drawn from
    ``rng``. ``data_pairs`` itself is never modified. Because a random
    permutation can have fixed points, some returned pairs may be identical to
    the original ones.

    Args:
        data_pairs (iterable): Pairs (2-item sequences). Lists, tuples and
            other iterables are accepted.
        rng: Random generator exposing ``shuffle(list)``, e.g. ``random.Random``.
            Its state is advanced by the call.

    Returns:
        list: A new list of ``(first, second)`` tuples, same length as the input.
    """
    # Materialize once so tuples/generators work, and so the caller's data is untouched.
    originals = list(data_pairs)
    shuffled = list(originals)
    rng.shuffle(shuffled)
    return [(perm[0], orig[1]) for perm, orig in zip(shuffled, originals)]

In [7]:
# Fixed seed so the permuted pairs are reproducible.
# The same rng is consumed by both calls, so changing their order changes the result.
rng = random.Random(123)
perm_hyper_pairs = shuffle_pairs(hyper_pairs, rng)
perm_holo_pairs = shuffle_pairs(holo_pairs, rng)

In [8]:
# Preview the prompt text and placeholder offsets for the first hypernym pair.
prompt_from_semrel("上位詞", hyper_pairs[0])

('「安」的釋義是心中存有特定態度或想法，常帶有負面涵義。\n「安」的上位詞是「抱」：比喻心中存有特定態度或想法。',
 [('a_lemma', '安', 1, 2),
  ('a_def', '心中存有特定態度或想法，常帶有負面涵義。', 7, 27),
  ('a_lemma', '安', 29, 30),
  ('semrel', '上位詞', 32, 35),
  ('b_lemma', '抱', 37, 38),
  ('b_def', '比喻心中存有特定態度或想法。', 40, 54)])

## Build pairs

In [9]:
# One dict per generated prompt; filled by the loops below and later dumped to JSON.
datalist = []
def make_pair_label(pair):
    """Return ``"<lemma>@<id>,<lemma>@<id>"`` for the two senses of ``pair``.

    Each sense is labelled with its first lemma and its ``id``.
    """
    first, second = pair
    labels = [f"{sense.lemmas[0].lemma}@{sense.id}" for sense in (first, second)]
    return ",".join(labels)

def make_pos_label(pair):
    """Return the ``pos`` attributes of the two senses in ``pair``, comma-separated."""
    first, second = pair
    return "{},{}".format(first.pos, second.pos)

# Each spec is (semrel label stored in the record, record type,
# relation name shown in the prompt, pairs to render).
# "emp" records use the pairs as extracted from CWN; "perm" records use the
# pairs re-paired by shuffle_pairs. The spec order fixes the record order in
# `datalist`: hypernym emp, hypernym perm, holonym emp, holonym perm.
record_specs = [
    ("hypernym", "emp", "上位詞", hyper_pairs),
    ("hypernym", "perm", "上位詞", perm_hyper_pairs),
    ("holonym", "emp", "整體詞", holo_pairs),
    ("holonym", "perm", "整體詞", perm_holo_pairs),
]

for semrel_label, record_type, semrel_zh, pairs in record_specs:
    for pair_x in pairs:
        prompt, var_loc = prompt_from_semrel(semrel_zh, pair_x)
        datalist.append({
            "semrel": semrel_label,
            "type": record_type,
            "pair": make_pair_label(pair_x),
            "pos": make_pos_label(pair_x),
            "prompt": prompt,
            "var_loc": var_loc,
        })

In [10]:
# Inspect the first record (an "emp" hypernym pair).
datalist[:1]

[{'semrel': 'hypernym',
  'type': 'emp',
  'pair': '抱@03001704,安@06535712',
  'pos': 'VJ,VC',
  'prompt': '「安」的釋義是心中存有特定態度或想法，常帶有負面涵義。\n「安」的上位詞是「抱」：比喻心中存有特定態度或想法。',
  'var_loc': [('a_lemma', '安', 1, 2),
   ('a_def', '心中存有特定態度或想法，常帶有負面涵義。', 7, 27),
   ('a_lemma', '安', 29, 30),
   ('semrel', '上位詞', 32, 35),
   ('b_lemma', '抱', 37, 38),
   ('b_def', '比喻心中存有特定態度或想法。', 40, 54)]}]

In [11]:
# Inspect the last record (a "perm" holonym pair).
datalist[-1]

{'semrel': 'holonym',
 'type': 'perm',
 'pair': '舉止@13340201,報紙@03001801',
 'pos': 'Na,Na',
 'prompt': '「報紙」的釋義是定期出版、報導新聞、提供各式訊息的出版品。\n「報紙」的整體詞是「舉止」：言語行為。',
 'var_loc': [('a_lemma', '報紙', 1, 3),
  ('a_def', '定期出版、報導新聞、提供各式訊息的出版品。', 8, 29),
  ('a_lemma', '報紙', 31, 33),
  ('semrel', '整體詞', 35, 38),
  ('b_lemma', '舉止', 40, 42),
  ('b_def', '言語行為。', 44, 49)]}

In [12]:
# Serialize all records into one JSON file. json.dumps is used with its
# defaults, so non-ASCII characters are written as \uXXXX escapes.
# NOTE(review): the parent directory ../data is assumed to exist -- confirm.
out_path = Path("../data/cwn_semrel_dataset.json")
out_path.write_text(json.dumps(datalist))
# Print the SHA-1 of the written file (shell command) to fingerprint this export.
!sha1sum {str(out_path)}

e17555f7349313ce3e70ae764925618cf5b886e1  ../data/cwn_semrel_dataset.json
