# Build event token param

In [3]:
%load_ext autoreload
%autoreload 2
import sys

# Put ../src on the module search path, but only once, so re-running
# this cell does not append duplicate entries.
if sys.path.count("../src") == 0:
    sys.path.append("../src")

In [12]:
import re
import pickle
import json
import random
from datetime import datetime
from itertools import chain
from pathlib import Path
from dataclasses import dataclass
from typing import Any, List, Tuple, Dict

import numpy as np
import torch
from tqdm.auto import tqdm

from transformers import MT5TokenizerFast

from CwnGraph import CwnImage
import vec4gloss
from vec4gloss import check_hashes
from vec4gloss import Vec4GlossModel
from vec4gloss import RatingFrameInfo

from vec4gloss import TokenParam, TokenParamFactory

## Data dependencies
```
..\data\rating_materials.n10.raw.pkl 4b7ae3
..\data\models\vec4gloss-defgen-220629-1250\pytorch_model.bin 9f894f
```

In [5]:
# Directory holding the pretrained vec4gloss checkpoint and tokenizer files.
vec4gloss_model_dir = "../data/models/vec4gloss-defgen-220629-1250"

# Report the hashes of the input artifacts so they can be compared with the
# values listed under "Data dependencies". The return value is not needed.
_ = check_hashes(
    ["../data/rating_materials.n10.raw.pkl", f"{vec4gloss_model_dir}/pytorch_model.bin"]
)

..\data\rating_materials.n10.raw.pkl 4b7ae3
..\data\models\vec4gloss-defgen-220629-1250\pytorch_model.bin 9f894f


## Loading resources

In [6]:
# NOTE: unpickling can execute arbitrary code; this is a local artifact whose
# hash is checked earlier in the notebook, not untrusted input.
with open("../data/rating_materials.n10.raw.pkl", "rb") as pkl_file:
    n10_data = pickle.load(pkl_file)

In [7]:
# Keep only the rating items whose "from" field marks them as vec4gloss output.
rating_evals = dict(
    (item_id, item)
    for item_id, item in n10_data.items()
    if item["from"] == "vec4gloss"
)

In [8]:
## Loading model
# Run on CUDA only when a device is available and its name does not contain
# "GeForce"; otherwise fall back to the CPU.
# NOTE(review): the reason for excluding GeForce cards is not stated here — confirm.
if torch.cuda.is_available() and "GeForce" not in torch.cuda.get_device_name():
    device = "cuda"
else:
    device = "cpu"
use_cuda = device == "cuda"
print("Using", device)

# Model and tokenizer are both loaded from the same checkpoint directory.
tokenizer = MT5TokenizerFast.from_pretrained(vec4gloss_model_dir)
model = Vec4GlossModel.from_pretrained(vec4gloss_model_dir).to(device)
gen = vec4gloss.gen_func(tokenizer, model)

Using cpu


## Building TokenParams

In [13]:
# Fix torch's RNG seed so repeated runs of this cell start from the same state.
torch.manual_seed(12345)

# (rating entry, factory) pairs for every entry whose factory could be built.
rating_factory_list: List[Tuple[Dict[str, Any], TokenParamFactory]] = []
# Filled below with (rating entry, token params) tuples.
# NOTE(review): confirm that RatingFrameInfo describes exactly this tuple shape.
rating_frameinfo_list: List[RatingFrameInfo] = []
# Ids of the entries that were skipped because building their factory failed.
empty_senses = []

# Pass 1: build one TokenParamFactory per rating entry from its example
# sentence and target definition.
for entry_id, entry_x in tqdm(rating_evals.items()):
    try:
        example_sentences = [entry_x["example"]]
        tgt_definition = entry_x["tgt"]
        factory_x = TokenParamFactory(example_sentences, tgt_definition, tokenizer, model)
        rating_factory_list.append((entry_x, factory_x))
    except Exception as ex:
        # Best effort: skip the entry, but report why instead of hiding the error.
        empty_senses.append(entry_id)
        print(f"Skipped {entry_id}: {type(ex).__name__}: {ex}")

# Rotate the index list right by one, so position i maps to i - 1 and
# position 0 wraps around to the last entry.
rotated_list = list(range(len(rating_factory_list)))
rotated_list = rotated_list[-1:] + rotated_list[:-1]

# Pass 2: give every factory the mean vector of its rotated neighbour as the
# "replaced" vector, then build its token params.
# Note: with a single factory the neighbour is the factory itself.
for idx, (entry_x, factory_x) in enumerate(tqdm(rating_factory_list)):
    replaced_idx = rotated_list[idx]
    replaced_vec = rating_factory_list[replaced_idx][1].mean_vec
    factory_x.set_replaced_vec(replaced_vec)
    token_params = factory_x.build_all_sequences(dbg=False)
    rating_frameinfo_list.append((entry_x, token_params))

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

In [14]:
# Spot-check one result: the (rating entry, token params) tuple at index 2.
rating_frameinfo_list[2]

({'cwnid': '05142101',
  'src': '有些人聽外國歌曲，剛開始聽的時候，很喜歡。因為他不太懂外文，<聽起來>很順耳。',
  'tgt': 'D。表說話者對所聽到的聲音作評價。',
  'pos': 'D',
  'target': '聽起來',
  'fillers': ['便', '怎能', '換句話說'],
  'example': '有些人聽外國歌曲，剛開始聽的時候，很喜歡。因為他不太懂外文，<聽起來>很順耳。',
  'from': 'vec4gloss',
  'definition': '表說話者主觀的感覺。',
  'item_id': 'D-07'},
 [<TokenParam 　表: F/A/D/R/X 1.00/0.99/1.00/1.00/0.00>,
  <TokenParam 　說: F/A/D/R/X 0.91/0.89/0.86/0.00/0.00>,
  <TokenParam 　話: F/A/D/R/X 1.00/1.00/1.00/1.00/0.00>,
  <TokenParam 　者: F/A/D/R/X 0.97/0.99/0.23/0.99/0.00>,
  <TokenParam 　對: F/A/D/R/X 0.34/0.31/0.01/0.17/0.00>,
  <TokenParam 　所: F/A/D/R/X 0.03/0.07/0.00/0.00/0.00>,
  <TokenParam 　聽: F/A/D/R/X 0.98/0.93/0.12/0.00/0.00>,
  <TokenParam 到的: F/A/D/R/X 1.00/0.99/0.00/1.00/0.00>,
  <TokenParam 　聲: F/A/D/R/X 0.41/0.58/0.95/0.00/0.00>,
  <TokenParam 　音: F/A/D/R/X 1.00/0.99/0.14/0.07/0.00>,
  <TokenParam 　作: F/A/D/R/X 0.01/0.00/0.00/0.03/0.00>,
  <TokenParam 　評: F/A/D/R/X 1.00/1.00/0.00/0.99/0.00>,
  <TokenParam 　價: F/A/D/R/X 0.91/0.88

In [15]:
# Persist the (rating entry, token params) list; the path is reused by the
# output-hash check at the end of the notebook.
rating_params_list_path = "../data/rating_tokenparams_list.pkl"
with Path(rating_params_list_path).open("wb") as out_file:
    pickle.dump(rating_frameinfo_list, out_file)

In [18]:
# Flatten the per-entry token param lists into one stream of token params.
param_iter = chain.from_iterable(x[1] for x in rating_frameinfo_list)

# Keep tokens whose full_prob exceeds .8 and is more than 5 times replaced_prob.
# The ratio test is written as a product rather than
# `full_prob / replaced_prob > 5` so that a replaced_prob of exactly 0
# cannot raise ZeroDivisionError.
sem_tokens = [
    x for x in param_iter
    if x.full_prob > .8 and x.full_prob > 5 * x.replaced_prob
]

## Output Hashes

```
..\data\rating_tokenparams_list.pkl 7f6db0
```

In [16]:
# Report the hash of the pickle written above so it can be compared with the
# value listed under "Output Hashes". The return value is not needed.
_ = check_hashes([rating_params_list_path])

..\data\rating_tokenparams_list.pkl 7f6db0
