# Prepare Dataset

In [1]:
%load_ext autoreload
%autoreload 2
import sys
if "../src" not in sys.path:
    sys.path.append("../src")

In [2]:
from vec4gloss import check_hashes
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from CwnGraph import CwnBase, CwnImage
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict, DatasetInfo
from datasets import Value, Sequence, Features
from datasets import concatenate_datasets

## Data Dependencies

```
..\data\def_frame_annot_sense_ids.txt 49fc44
```

In [3]:
_ = check_hashes("../data/def_frame_annot_sense_ids.txt")

../data/def_frame_annot_sense_ids.txt 49fc44


In [4]:
excl_sense_ids = set(Path("../data/def_frame_annot_sense_ids.txt").read_text().split())
len(excl_sense_ids)

288

In [5]:
from itertools import islice
list(islice(excl_sense_ids, 3))

['06620705', '05167001', '04085402']

## Generate denoising dataset

In [6]:
rng = np.random.RandomState(12052)

In [7]:
CWN_VER = "v.2022.06.21"

In [8]:
cwn = CwnImage.load(CWN_VER)
senses = []
for sense_x in tqdm(cwn.get_all_senses()):
    if sense_x.id in excl_sense_ids:
        continue
        
    all_exs = sense_x.all_examples()
    # skip those problematic examples (e.g. a list of characters)
    if len(all_exs) > 10: continue
    if all_exs and all(x for x in all_exs):
        senses.append(sense_x)
len(senses), sum(len(set(sense_x.all_examples())) for sense_x in senses)

  0%|          | 0/29433 [00:00<?, ?it/s]

(29021, 92165)

In [9]:
def make_example(sense_x):
    return dict(
        cwnid=sense_x.id,
        word=sense_x.head_word,
        pos=sense_x.pos,
        definition=sense_x.definition,
        examples=sense_x.all_examples(),
    )

In [10]:
sense_data = [make_example(x) for x in senses]
sense_cols = {fld: [sense_x[fld] for sense_x in sense_data] for fld in sense_data[0].keys()}
sense_ds = Dataset.from_dict(sense_cols)

sense_ds = sense_ds.cast(
    Features({
          "cwnid": Value(dtype='string'),
          "word": Value(dtype='string'),
          "pos": Value(dtype='string'),
          "definition": Value(dtype='string'),
          "examples": Sequence(feature=Value(dtype='string'))
         }))

Casting the dataset:   0%|          | 0/3 [00:00<?, ?ba/s]

In [11]:
sense_ds[0]

{'cwnid': '03000101',
 'word': '啊唷',
 'pos': 'I',
 'definition': '表驚訝的語氣。',
 'examples': ['<啊唷>，你把我嚇了一跳！',
  '<啊唷>，這麼多蟑螂和老鼠屎！髒透了！',
  '門砰的一聲，阿姨跳了起來，喊一聲「<啊唷>」。']}

## Definition Corruption

In [12]:
def corrupt_sentence(instance, win=3):
    cwnid = instance["cwnid"]
    text = instance["definition"]
    if len(text) <= win+1:
        return {
            "cwnid": cwnid,
            "src": text, 
            "tgt": text
        }
    cor_x = rng.randint(len(text)-win)
    
    # select corruption site (two sites if longer than 20)
    cor_len = np.clip(rng.poisson(2), 1, 4)
    cor_sites = [(cor_x, cor_len)]
    if len(text) > 20:
        cor_x = ((cor_x+cor_len) + rng.randint(len(text)-cor_len)) % len(text)
        cor_len = np.clip(rng.poisson(2), 1, 4)        
        cor_sites.append((cor_x, cor_len))
    cor_sites = sorted(cor_sites, key=lambda x: x[0])
    
    # generate denoising pairs
    cor_text = ""    
    target_text = ""
    cur_pos = 0
    cor_idx = 0    
    for cor_x, cor_len in cor_sites:
        if cur_pos <= cor_x:            
            sentinel = f" <extra_id_{cor_idx:02d}>"
            cor_text += text[cur_pos:cor_x] + sentinel
            target_text += sentinel + text[cor_x:cor_x+cor_len]
            cor_idx += 1
        cur_pos = cor_x + cor_len
    cor_text += text[cur_pos:]
    target_text += f" <extra_id_{cor_idx:02d}>"
    return {
        "cwnid": cwnid,
        "src": cor_text.strip(), 
        "tgt": target_text.strip()}
    

In [13]:
corrupt_sentence(sense_ds[3])

{'cwnid': '03000202',
 'src': '表痛 <extra_id_00>痛的聲音。',
 'tgt': '<extra_id_00>苦、呼 <extra_id_01>'}

In [14]:
ds_corrupt = sense_ds.map(corrupt_sentence, remove_columns=["word", "pos", "examples", "definition"])

  0%|          | 0/29021 [00:00<?, ?ex/s]

In [15]:
ds_corrupt.info.description = f"vec4gloss denoising dataset based on CWN {CWN_VER}"
ds_corrupt.info.builder_name = "vec4gloss/etc/10.11"
ds_corrupt_split = ds_corrupt.train_test_split(test_size=0.1, generator=np.random.RandomState(3152))

In [16]:
{k: len(v) for k, v in ds_corrupt_split.items()}

{'train': 26118, 'test': 2903}

In [17]:
ds_corrupt_dir = "../data/denoising_dataset_cwn"
ds_corrupt_split.save_to_disk(ds_corrupt_dir)

Flattening the indices:   0%|          | 0/27 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/3 [00:00<?, ?ba/s]

## Definition Generation

In [18]:
def defgen_mapper(instance):    
    cwnid = instance["cwnid"]
    word = instance["word"]
    pos = instance["pos"]
    definition = instance["definition"]
    examples = sorted(set(instance["examples"]))
    pos_prefix = f"{pos}。"
    return {
        "cwnid": [cwnid] * len(examples),            
        "src": [ex for ex in examples],
        "tgt": [pos_prefix+definition]*len(examples)
    }

def flatten_list(instances):    
    return {k: sum(instances[k], [])
            for k in instances.keys()}

In [19]:
ds_defgen = sense_ds.map(defgen_mapper, remove_columns=["word", "pos", "examples", "definition"])\
                    .map(flatten_list, batched=True)

  0%|          | 0/29021 [00:00<?, ?ex/s]

  0%|          | 0/30 [00:00<?, ?ba/s]

In [20]:
print("before filtering: ", len(ds_defgen))
ds_defgen = ds_defgen.filter(lambda x: x["src"].count("<")==x["src"].count(">")==1)
print("after filtering: ", len(ds_defgen))

before filtering:  92165


  0%|          | 0/93 [00:00<?, ?ba/s]

after filtering:  85522


In [21]:
ds_defgen[121]

{'cwnid': '03001704',
 'src': '甚至連她不相信的迷信療法，她也<抱>著一線希望去試試。',
 'tgt': 'VJ。比喻心中存有特定態度或想法。'}

In [22]:
ds_corrupt.info.description = f"vec4gloss definition generation dataset based on CWN {CWN_VER}"
ds_corrupt.info.builder_name = "vec4gloss/etc/10.11"
ds_defgen_split = ds_defgen.train_test_split(test_size=0.1, generator=np.random.RandomState(3152))

In [23]:
{k: len(v) for k, v in ds_defgen_split.items()}

{'train': 76969, 'test': 8553}

In [24]:
ds_defgen_dir = "../data/defgen_dataset_cwn"
ds_defgen_split.save_to_disk(ds_defgen_dir)
# info.write_to_directory(ds_defgen_dir)

Flattening the indices:   0%|          | 0/77 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/9 [00:00<?, ?ba/s]

## Output Hash

```
../data/denoising_dataset_cwn/train/dataset.arrow 4148ef
../data/defgen_dataset_cwn/train/dataset.arrow 65a56d
```

In [25]:
_ = check_hashes([
    "../data/denoising_dataset_cwn/train/dataset.arrow",
    "../data/defgen_dataset_cwn/train/dataset.arrow",
])

../data/denoising_dataset_cwn/train/dataset.arrow 4148ef
../data/defgen_dataset_cwn/train/dataset.arrow 65a56d
