# Vec4Gloss Sandbox

In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to the local packages are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
import sys

# Make the sibling source trees importable. Each directory is appended only
# if it is not already on sys.path, so re-running the cell adds no duplicates.
sys.path.extend([
    src_dir
    for src_dir in ("../src", "../../pyASBC/src")
    if src_dir not in sys.path
])

In [2]:
import re
import pickle
import random
from datetime import datetime
from pathlib import Path
from itertools import islice, chain, groupby

import numpy as np
import torch
from tqdm.auto import tqdm

from transformers import MT5TokenizerFast
from transformers import DataCollatorForSeq2Seq
import datasets

from CwnGraph import CwnImage
import vec4gloss
from vec4gloss import check_hashes
from vec4gloss import Vec4GlossModel
from pyASBC import Asbc5Corpus

## Data dependencies

Note: `defgen_dataset_cwn\train` is used only for checking the data hash; the notebook itself uses the `test` split.
```
10.11 -> ..\data\defgen_dataset_cwn\train\dataset.arrow 65a56d
20.21 -> ..\data\models\vec4gloss-defgen-220629-1250\pytorch_model.bin 9f894f
(external) -> ..\data\asbc5_words_pos.pkl 70badc
```

In [3]:
# Directory of the Vec4Gloss checkpoint; reused below when loading the model
# and the tokenizer.
vec4gloss_model_dir = "../data/models/vec4gloss-defgen-220629-1250"

# Report the hash of the model weights file (the path and a short hash are
# printed in the cell output); the return value is not needed here.
_ = check_hashes([f"{vec4gloss_model_dir}/pytorch_model.bin"])

..\data\models\vec4gloss-defgen-220629-1250\pytorch_model.bin 9f894f


## Loading resources

In [4]:
# Use CUDA only when a GPU is available and its reported device name does not
# contain "GeForce"; in every other case fall back to the CPU.
device = (
    "cuda"
    if torch.cuda.is_available() and "GeForce" not in torch.cuda.get_device_name()
    else "cpu"
)
print("Using", device)

Using cpu


In [5]:
# Definition-generation dataset saved on disk.
ds_defgen = datasets.load_from_disk("../data/defgen_dataset_cwn")

# Tokenizer and model are both restored from the same checkpoint directory;
# the model is moved to the device selected above.
tokenizer = MT5TokenizerFast.from_pretrained(vec4gloss_model_dir)
model = (
    Vec4GlossModel
    .from_pretrained(vec4gloss_model_dir)
    .to(device)
)

# `gen` is the callable used in the cells below: it is given a sentence whose
# target word is wrapped in <...> and the recorded outputs are strings of the
# form "POS。gloss。".
gen = vec4gloss.gen_func(tokenizer, model)

In [7]:
# Version tag of the CWN image to load; `cwn` is queried later with
# `find_lemma`.
CWN_VER = "v.2022.06"
cwn = CwnImage.load(CWN_VER)

In [54]:
# The word wrapped in <...> is the target to gloss: here 開 as in
# "to hold a meeting".
gen("我<開>了一個會。")

'VC。進行會議。'

In [43]:
# Same target word 開, now inside 開口 ("to open one's mouth").
gen("他還沒<開>口。")

'VC。比喻提出要求。'

In [45]:
# Same sentence as the previous cell, with the target moved to 還.
gen("他<還>沒開口。")

'Dfa。表事情尚未完成。'

In [52]:
# Target 彰顯; the next cell looks this word up in CWN.
gen("這<彰顯>出重要的價值。")

'VJ。顯現出後述事物或特質。'

In [53]:
# Look up 彰顯 in the loaded CWN image. The recorded output is an empty list,
# i.e. no lemma was found for the word glossed in the previous cell.
cwn.find_lemma("彰顯")

[]