# Prepare Rating Data

In [1]:
%load_ext autoreload
%autoreload 2
import sys

# Put the sibling source trees on the import path (once each) so the local
# packages imported in the next cell can be found without installing them.
for local_src in ("../src", "../../pyASBC/src"):
    if local_src not in sys.path:
        sys.path.append(local_src)

In [2]:
import re
import pickle
import random
from datetime import datetime
from pathlib import Path
from itertools import islice, chain, groupby

import numpy as np
import torch
from tqdm.auto import tqdm

from transformers import MT5TokenizerFast
from transformers import DataCollatorForSeq2Seq
import datasets

from CwnGraph import CwnImage
import vec4gloss
from vec4gloss import check_hashes
from vec4gloss import Vec4GlossModel
from pyASBC import Asbc5Corpus

## Data dependencies

```
10.11 -> ..\data\defgen_dataset_cwn\train\dataset.arrow 65a56d
20.21 -> ..\data\models\vec4gloss-defgen-220629-1250\pytorch_model.bin 9f894f
(external) -> ..\data\asbc5_words_pos.pkl 70badc
```

In [3]:
vec4gloss_model_dir = "../data/models/vec4gloss-defgen-220629-1250"

# Report the short hash of every input artifact so it can be compared with
# the values listed under "Data dependencies".
_ = check_hashes([
    "../data/defgen_dataset_cwn/train/dataset.arrow",
    f"{vec4gloss_model_dir}/pytorch_model.bin",
    "../data/asbc5_words_pos.pkl",
])

..\data\defgen_dataset_cwn\train\dataset.arrow 65a56d
..\data\models\vec4gloss-defgen-220629-1250\pytorch_model.bin 9f894f
..\data\asbc5_words_pos.pkl 70badc


## Loading resources

In [4]:
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file from a trusted source.
# The loaded object is consumed below through .most_common(), which yields
# ((word, pos), freq) pairs -- presumably a collections.Counter; confirm.
with open("../data/asbc5_words_pos.pkl", "rb") as fin:
    asbc_words = pickle.load(fin)

In [5]:
# Use CUDA only when a GPU is available and its reported name does not
# contain "GeForce"; in every other case fall back to the CPU.
device = (
    "cuda"
    if torch.cuda.is_available() and "GeForce" not in torch.cuda.get_device_name()
    else "cpu"
)
print("Using", device)

Using cpu


In [6]:
# Definition-generation dataset; its "test" split is sampled later in this notebook.
ds_defgen = datasets.load_from_disk("../data/defgen_dataset_cwn")
# Model and tokenizer are loaded from the same checkpoint directory; the model
# is moved to the device selected above.
model = Vec4GlossModel.from_pretrained(vec4gloss_model_dir).to(device)
tokenizer = MT5TokenizerFast.from_pretrained(vec4gloss_model_dir)
# `gen` is called below with a text string, and its result is handled as a
# string (it is split on "。").
gen = vec4gloss.gen_func(tokenizer, model)

In [7]:
# Version tag of the CWN image to load. `cwn` is used below to list all
# lemmas and to look up the head word of a sense id.
CWN_VER = "v.2022.06.21"
cwn = CwnImage.load(CWN_VER)

## Prepare rating materials

### New words from ASBC

* frequency ≥ 20
* no proper nouns (Nb)
* only words composed of Chinese characters (U+4E00-U+9FFF)

In [8]:
# Corpus handle; example sentences are searched through asbc.iter_sentences().
asbc = Asbc5Corpus("../../pyASBC/data")

In [9]:
def find_example_sentence(target, pos, corpus=None):
    """Return the first corpus sentence containing `target` tagged as `pos`.

    The first matching token of that sentence is wrapped in angle brackets
    (``<target>``) and the words are concatenated without separators.

    Args:
        target: Word to look for.
        pos: Part-of-speech tag the word must carry.
        corpus: Object exposing ``iter_sentences()``, which yields sentences
            as sequences of ``(word, pos, extra)`` triples. Defaults to the
            notebook-level ``asbc`` corpus, so existing two-argument calls
            behave as before.

    Returns:
        The marked sentence as a string, or ``None`` when no sentence in the
        corpus contains the word with that tag.
    """
    if corpus is None:
        corpus = asbc
    for sent in corpus.iter_sentences():
        # Position of the first token matching both the word and its tag.
        hit = next(
            (i for i, (word, tag, _) in enumerate(sent)
             if word == target and tag == pos),
            None,
        )
        if hit is None:
            continue
        words = [tok[0] for tok in sent]
        words[hit] = f"<{target}>"
        return "".join(words)
    # Make the "not found" result explicit; callers test it for truthiness.
    return None

In [10]:
rng_asbc = random.Random(14434)
lemmas = set(cwn.get_all_lemmas().keys())
chpat = re.compile("^[\u4e00-\u9fff]+$")
asbc_pos_list = {"D": [], "N": [], "V": [], "O": []}
for (word, pos), freq in asbc_words.most_common():
    # Keep a (word, pos) pair only if it occurs at least 20 times, is not
    # tagged Nb, consists solely of characters in U+4E00-U+9FFF, and is not
    # already a CWN lemma.
    keep = (
        freq >= 20
        and pos != "Nb"
        and chpat.match(word)
        and word not in lemmas
    )
    if not keep:
        continue
    # Bucket by the first letter of the tag; anything other than D/V/N is "O".
    poscat = pos[0] if pos and pos[0] in "DVN" else "O"
    asbc_pos_list[poscat].append((word, pos))
# Shuffle every bucket with the seeded generator, in dict order.
for bucket in asbc_pos_list.values():
    rng_asbc.shuffle(bucket)

In [11]:
# Number of candidate (word, pos) pairs per POS bucket.
{poscat: len(candidates) for poscat, candidates in asbc_pos_list.items()}

{'D': 285, 'N': 6396, 'V': 4032, 'O': 135}

In [12]:
N_NEW_PER_POS = 5   # accepted ASBC items per POS category
N_FILLERS = 3       # distractor words drawn for each item
# ASCII comma/period -> full-width counterparts; built once, not per item.
punct_table = str.maketrans(",.", "，。")

new_words = {"D": [], "N": [], "V": [], "O": []}
# The context manager closes the bar even if generation raises midway.
with tqdm(total=N_NEW_PER_POS * len(new_words)) as pbar:
    for poscat, pos_buf in new_words.items():
        # Reversed copy, so pop() walks the shuffled bucket front to back
        # without modifying asbc_pos_list itself.
        word_list = asbc_pos_list[poscat][::-1]
        while len(pos_buf) < N_NEW_PER_POS:
            word, pos = word_list.pop()
            # Fillers are drawn before the sentence lookup, so a skipped
            # candidate still consumes its fillers. This order is kept on
            # purpose: changing it would change the sampled materials.
            fillers = [word_list.pop()[0] for _ in range(N_FILLERS)]
            sent = find_example_sentence(word, pos)
            if not sent:
                # No example sentence for this (word, pos): try the next one.
                continue

            # The second "。"-separated segment of the generated text is used
            # as the definition (presumably the first segment is the POS
            # label, as in the dataset's "tgt" field -- confirm).
            deftext = gen(sent).split("。")[1]
            if not deftext.endswith("。"):
                deftext += "。"
            deftext = deftext.translate(punct_table)

            pos_buf.append({
                "from": "ASBC",
                "pos": poscat,
                "target": word,
                "fillers": fillers,
                "definition": deftext,
                # ASBC items are numbered from 50 upwards within each category.
                "item_id": f"{poscat}-{(len(pos_buf)+50):02d}",
            })
            # Tick only for accepted items, so the bar never exceeds its total.
            pbar.update(1)

  0%|          | 0/20 [00:00<?, ?it/s]

In [13]:
# Show the first generated item of every POS category.
{poscat: entries[0] for poscat, entries in new_words.items()}

{'D': {'from': 'ASBC',
  'pos': 'D',
  'target': '一窩蜂',
  'fillers': ['何妨', '同聲', '方才'],
  'definition': '比喻嗜好特定對象的怪獸。',
  'item_id': 'D-50'},
 'N': {'from': 'ASBC',
  'pos': 'N',
  'target': '明牌',
  'fillers': ['西湖', '小生', '磚牆'],
  'definition': '比喻在競爭中被淘汰的對象。',
  'item_id': 'N-50'},
 'V': {'from': 'ASBC',
  'pos': 'V',
  'target': '刺進',
  'fillers': ['放生', '燃起', '與日俱增'],
  'definition': '物體表面或特定部位向外凸出。',
  'item_id': 'V-50'},
 'O': {'from': 'ASBC',
  'pos': 'O',
  'target': '長足',
  'fillers': ['麻辣', '英屬', '駐華'],
  'definition': '比喻事件發展的基礎。',
  'item_id': 'O-50'}}

In [14]:
# Number of ASBC items collected per POS category.
[(poscat, len(entries)) for poscat, entries in new_words.items()]

[('D', 5), ('N', 5), ('V', 5), ('O', 5)]

### Words in evaluation set

* No proper names (Nb)
* 100 words in total: 20 use the definition from CWN and 80 use a model-generated definition. All words are taken from the evaluation set.
* Among the generated items, 20 are nouns, 20 are verbs, 20 are adverbs, and 20 are others.
* The CWN items have the same word-class composition (5 nouns, 5 verbs, 5 adverbs, and 5 others).

In [15]:
eval_data = list(ds_defgen["test"])
for item in eval_data:
    # The text before the first "。" of "tgt" is a comma-separated tag list;
    # "nom" entries are dropped before the label is derived.
    raw_tags = item["tgt"].split("。")[0]
    pos = ",".join(tag for tag in raw_tags.split(",") if tag != "nom")
    if pos == "Nb" or not cwn.from_sense_id(item["cwnid"]).head_word:
        # "X" marks items to ignore: Nb-tagged senses and senses whose head
        # word is empty.
        item["pos"] = "X"
    elif pos and pos[0] in "DVN":
        item["pos"] = pos[0]
    else:
        item["pos"] = "O"

In [16]:
# groupby only merges adjacent items, so sort by the POS label first.
eval_data = sorted(eval_data, key=lambda item: item["pos"])
# (label, items) pairs in sorted label order, without the ignored "X" group.
grouped_data = [
    (pos_label, list(members))
    for pos_label, members in groupby(eval_data, key=lambda item: item["pos"])
    if pos_label != "X"
]

In [17]:
# Number of evaluation items available per POS label.
[(pos_label, len(members)) for pos_label, members in grouped_data]

[('D', 432), ('N', 2797), ('O', 529), ('V', 4368)]

In [18]:
import random

N_PER_POS = 25    # evaluation items sampled per POS category
N_FROM_CWN = 5    # of those, how many keep the CWN definition
# ASCII comma/period -> full-width counterparts; built once, not per item.
punct_table = str.maketrans(",.", "，。")

rng = random.Random(12345)
sampled = {"D": [], "N": [], "V": [], "O": []}
# The context manager closes the bar when the loop ends or raises
# (the bar was previously left open).
with tqdm(total=N_PER_POS * len(grouped_data)) as pbar:
    for pos, data in grouped_data:
        # Shuffle a copy so grouped_data keeps its order across re-runs.
        shuffle_data = data[:]
        rng.shuffle(shuffle_data)
        buf = sampled[pos]
        for i in range(N_PER_POS):
            pbar.update(1)
            data_x = shuffle_data[i]
            target = cwn.from_sense_id(data_x["cwnid"]).head_word
            data_x["target"] = target

            # Filler candidates are the head words of four items taken from
            # the tail of the shuffled list, a different window for each i.
            filler_pool = {
                cwn.from_sense_id(x["cwnid"]).head_word
                for x in shuffle_data[-i*5-5:-i*5-1]
            }
            # Another sense of the same lemma can land in the window; drop it
            # so the target never appears twice among the options. Whenever
            # the target is not among the first three sorted candidates the
            # result is the same as without this step.
            filler_pool.discard(target)
            data_x["fillers"] = sorted(filler_pool)[:3]

            # In both branches the second "。"-separated segment is the definition.
            if i < N_FROM_CWN:
                data_x["from"] = "CWN"
                deftext = data_x["tgt"].split("。")[1]
            else:
                data_x["from"] = "vec4gloss"
                deftext = gen(data_x["src"]).split("。")[1]

            if not deftext.endswith("。"):
                deftext += "。"
            deftext = deftext.translate(punct_table)
            data_x["definition"] = deftext
            data_x["item_id"] = f"{pos}-{i:02d}"
            buf.append(data_x)
    

  0%|          | 0/100 [00:00<?, ?it/s]

In [19]:
# Number of sampled evaluation items per POS category.
[(pos_label, len(entries)) for pos_label, entries in sampled.items()]

[('D', 25), ('N', 25), ('V', 25), ('O', 25)]

## Make rate items

In [20]:
from itertools import chain
rng_item = random.Random(12333)


def make_items(entry_x, idx):
    """Build one multiple-choice rating item from a sampled entry.

    The entry's fillers and its target are shuffled with the seeded
    ``rng_item`` generator and labelled A-D in the shuffled order.

    Args:
        entry_x: Dict with the keys "fillers" (list of words), "target",
            "pos", "from", "item_id" and "definition".
        idx: Position of the entry in the input sequence. It is accepted for
            the caller's convenience but not used.

    Returns:
        Dict with the keys "target", "ans" (label of the correct option),
        "pos", "from", "item_id", "definition" and "options" (the labelled
        candidates joined by single spaces), in that key order.
    """
    target = entry_x["target"]
    # Concatenation creates a new list, so the entry's filler list is not
    # reordered by the shuffle.
    candids = entry_x["fillers"] + [target]
    rng_item.shuffle(candids)
    labelled = " ".join(
        f"{label}.{word}" for label, word in zip("ABCD", candids)
    )
    return {
        "target": target,
        "ans": "ABCD"[candids.index(target)],
        "pos": entry_x["pos"],
        "from": entry_x["from"],
        "item_id": entry_x["item_id"],
        "definition": entry_x["definition"],
        "options": labelled,
    }

# Evaluation-set samples come first, then the ASBC new words; each source
# contributes its per-POS lists in dict order.
ent_iter = chain(sampled.values(), new_words.values())
rate_items = [
    make_items(entry, idx)
    for idx, entry in enumerate(chain.from_iterable(ent_iter))
]
# Randomise the presentation order with the same seeded generator.
rng_item.shuffle(rate_items)

In [21]:
import pandas as pd
# One row per rating item, in the shuffled order of rate_items.
rate_df = pd.DataFrame.from_records(rate_items)
# Re-index from 1 so the index written to the CSV serves as a 1-based item number.
rate_df.index = np.arange(1, len(rate_df)+1)

In [22]:
# Count the items per source ("from") and POS category to check the design.
rate_df.pivot_table(values=["target"], index=["from"], columns=["pos"], aggfunc='count')

Unnamed: 0_level_0,target,target,target,target
pos,D,N,O,V
from,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ASBC,5,5,5,5
CWN,5,5,5,5
vec4gloss,20,20,20,20


In [23]:
def _option_words(options):
    """Map each option label to its word for a string like "A.x B.y C.z D.w"."""
    return dict(opt.split(".", 1) for opt in options.split(" "))


# The option labelled by `ans` must be exactly the target word. A plain
# substring test would also pass when the target merely occurs inside another
# option or when the answer key points at the wrong label.
assert rate_df.apply(
    lambda row: _option_words(row.options).get(row.ans) == row.target, axis=1
).all()
# Every item offers four distinct candidate words. The options are parsed
# instead of stripping the characters "ABCD." from the whole string, which
# would also alter words that contain those characters.
assert rate_df.apply(
    lambda row: len(set(_option_words(row.options).values())) == 4, axis=1
).all()

In [24]:
# Write the rating materials; the 1-based index is kept as the first column.
rate_df.to_csv("../data/rating_materials.csv", index=True)

## Output Hash

```
..\data\rating_materials.csv a750c9
```

In [25]:
# Report the short hash of the generated CSV so it can be compared with the
# value recorded under "Output Hash".
_ = check_hashes([
    "../data/rating_materials.csv"
])

..\data\rating_materials.csv a750c9
