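# Build event token param

For every annotated sense, this notebook builds the per-token `TokenParam` sequence of its definition and pickles the resulting list to `../data/annot_tokenparams_list.pkl`.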

In [1]:
%load_ext autoreload
%autoreload 2
import sys

# Put the sibling source directory on the import path exactly once, so that
# re-running this cell does not keep appending duplicates.
src_dir = "../src"
if sys.path.count(src_dir) == 0:
    sys.path.append(src_dir)

In [2]:
import re
import pickle
import json
import random
from datetime import datetime
from pathlib import Path
from dataclasses import dataclass
from typing import List, Tuple

import numpy as np
import torch
from tqdm.auto import tqdm

from transformers import MT5TokenizerFast

from CwnGraph import CwnImage
import vec4gloss
from vec4gloss import check_hashes
from vec4gloss import Vec4GlossModel
from vec4gloss import (
    AnnotFrame, Scheme, 
    AnnotDepInfo,
    AnnotFrameInfo)

from vec4gloss import TokenParam, TokenParamFactory

## Data dependencies
```
..\data\annotation.json 2ed250
..\data\models\vec4gloss-defgen-220629-1250\pytorch_model.bin 9f894f
```

In [3]:
# Checkpoint directory; reused further down to load both the model and the tokenizer.
vec4gloss_model_dir = "../data/models/vec4gloss-defgen-220629-1250"

# Input files this notebook depends on. check_hashes prints each path with a
# short hash so it can be compared against the list in "Data dependencies".
dependency_paths = [
    "../data/annotation.json",
    vec4gloss_model_dir + "/pytorch_model.bin",
]
_ = check_hashes(dependency_paths)

..\data\annotation.json 2ed250
..\data\models\vec4gloss-defgen-220629-1250\pytorch_model.bin 9f894f


## Loading resources

In [4]:
# Read the annotation records (a JSON list of per-sense dicts) from disk.
annotation_path = Path("../data/annotation.json")
with annotation_path.open(encoding="UTF-8") as annotation_file:
    annot_data = json.load(annotation_file)

In [5]:
# Sanity check: number of annotation records and one sample record.
len(annot_data), annot_data[45]

(288,
 {'sense_id': 3048001,
  'head_word': '沿街',
  'POS': 'D',
  'definition': '表同一事件在經過的街道中重複發生。',
  'event_role': 'agent',
  'schemas': [{'type': 'event', 'start': 1, 'end': 5},
   {'type': 'scope', 'start': 5, 'end': 6},
   {'type': 'place', 'start': 6, 'end': 11},
   {'type': 'scope', 'start': 11, 'end': 12},
   {'type': 'mod', 'start': 12, 'end': 14},
   {'type': 'action', 'start': 14, 'end': 16}]})

In [6]:
## Loading model
# Use CUDA only when a device is available and its reported name does not
# contain "GeForce"; in every other case fall back to the CPU.
use_cuda = False
if torch.cuda.is_available():
    use_cuda = "GeForce" not in torch.cuda.get_device_name()

if use_cuda:
    device = "cuda"
else:
    device = "cpu"
print("Using", device)

# The model and the tokenizer are restored from the same checkpoint directory.
model = Vec4GlossModel.from_pretrained(vec4gloss_model_dir)
model = model.to(device)
tokenizer = MT5TokenizerFast.from_pretrained(vec4gloss_model_dir)
# Generation helper bound to this tokenizer/model pair.
gen = vec4gloss.gen_func(tokenizer, model)

Using cpu


In [7]:
# Version tag of the CWN image to load.
CWN_VER = "v.2022.06.21"
# `cwn` is used further down to look up senses (and their example sentences) by sense id.
cwn = CwnImage.load(CWN_VER)

## Annotation Frame

In [8]:
from collections import Counter  # not used in this cell; kept so the name stays available

# Convert every raw annotation record into an AnnotFrame. The "schemas" entry
# (a list of dicts) is replaced by a list of Scheme objects; every other key
# of the record is passed through unchanged as a keyword argument.
annot_frames = []
for record in annot_data:
    frame_fields = dict(record)
    scheme_dicts = frame_fields.pop("schemas")
    frame_fields["schemas"] = [Scheme(**scheme_kwargs) for scheme_kwargs in scheme_dicts]
    annot_frames.append(AnnotFrame(**frame_fields))

In [9]:
# Preview the first 20 annotated frames. Slicing (instead of indexing with
# range(20)) keeps the cell from raising IndexError when fewer than 20 frames
# were loaded.
for frame in annot_frames[:20]:
    frame.show()

<-->表 <neg>沒有 <noun>足夠的時間 <action>進行 <event>後述事件
<-->表 <mod>勉強 <action>去做 <event>後述事件
<-->表 <mod>親身 <action>體驗 <event>後述事件
<-->表 <event>後述事件 <neg>不 <scope>在 <noun>預期中
<-->表 <exist>具備 <action>達到 <event>後述事件 <action>成立 <-->的 <noun>條件
<-->表 <event>後述事件 <scope>在 <time>很短的時間內 <action>完成
<-->表 <event>後述事件 <action>發生 <-->的 <time>時間 <scope>比 <noun>預期 <value>早
<-->表 <action>做 <event>後述事件 <-->的 <degree>程度 <value>非常低
<-->表 <action>做 <event>後述事件 <-->的 <degree>程度 <value>非常高
<-->表比喻 <scope>只 <action>做 <event>後述事件 <-->而 <neg>沒有 <mod>其他 <-->的 <action.NOM>行動
<-->表 <event>後述事件 <neg>不 <scope>在 <noun>預期 <scope>之中 <action>發生
<-->表 <exist>有 <noun>足夠的時間 <action>進行 <event>後述事件
<-->表 <event>後述事件 <mod>一直 <action>持續
<-->表 <action>進行 <event>後述事件 <-->但 <neg>沒有 <action>產生 <noun>預期的效果
<-->表 <instrument>以 <noun>相同一致的方法 <action>進行 <event>後述事件
<-->表比喻 <neg>未 <mod>經多加考慮地 <action>進行 <event>後述事件
<-->表 <action>進行 <event>後述事件 <action>涉及 <-->的 <place>空間範圍 <value>大且數量多
<-->表 <neg>不 <scope>對 <event>後述事件 <action>設定 <noun>任何限制
<

## Building TokenParams

In [10]:
# Seed torch's RNG before building anything so repeated runs start from the same state.
torch.manual_seed(12345)
annot_factory_list: List[Tuple[AnnotFrame, TokenParamFactory]] = []
# Entries appended below are plain (frame, token_params) tuples, so the list is
# annotated as such rather than as List[AnnotFrameInfo].
annot_frameinfo_list: List[Tuple[AnnotFrame, List[TokenParam]]] = []
# Sense ids for which no factory could be built (kept as a flat list of ids).
empty_senses = []
# sense_id -> repr of the exception that caused the sense to be skipped, so
# that dropped senses can be diagnosed instead of disappearing silently.
empty_sense_errors = {}

# Pass 1: build one TokenParamFactory per annotated sense from its CWN example
# sentences and its "<POS>。<definition>" target string.
for annot_x in tqdm(annot_frames):
    try:
        example_sentences = cwn.from_sense_id("{:08d}".format(annot_x.sense_id)).examples
        tgt_definition = "{}。{}".format(annot_x.POS, annot_x.definition)
        factory_x = TokenParamFactory(example_sentences, tgt_definition, tokenizer, model)
    except Exception as ex:
        # Deliberately best-effort: a failing sense is skipped, but the reason is recorded.
        empty_senses.append(annot_x.sense_id)
        empty_sense_errors[annot_x.sense_id] = repr(ex)
    else:
        annot_factory_list.append((annot_x, factory_x))

if empty_senses:
    print("Skipped {} of {} senses; see empty_sense_errors for the reasons".format(
        len(empty_senses), len(annot_frames)))

# Pass 2: rotate the index list right by one, so entry i is paired with entry
# i-1 (and entry 0 with the last one). Each factory therefore receives the
# mean_vec of a *different* sense as its replacement vector.
rotated_list = list(range(len(annot_factory_list)))
rotated_list = rotated_list[-1:] + rotated_list[:-1]

for idx, (annot_x, factory_x) in enumerate(tqdm(annot_factory_list)):
    replaced_idx = rotated_list[idx]
    replaced_vec = annot_factory_list[replaced_idx][1].mean_vec
    factory_x.set_replaced_vec(replaced_vec)
    token_params = factory_x.build_all_sequences(dbg=False)
    annot_frameinfo_list.append((annot_x, token_params))

  0%|          | 0/288 [00:00<?, ?it/s]

  0%|          | 0/244 [00:00<?, ?it/s]

In [11]:
# Inspect one result: an (AnnotFrame, list of TokenParam) pair.
annot_frameinfo_list[2]

(AnnotFrame(sense_id=5166702, POS='D', head_word='額外', definition='表後述事件不在預期中。', event_role='agent', schemas=[Scheme(type='--', start=0, end=1), Scheme(type='event', start=1, end=5), Scheme(type='neg', start=5, end=6), Scheme(type='scope', start=6, end=7), Scheme(type='noun', start=7, end=10)]),
 [<TokenParam 　表: F/A/D/R/X 1.00/1.00/1.00/1.00/0.00>,
  <TokenParam 　後: F/A/D/R/X 0.00/0.00/0.00/0.00/0.00>,
  <TokenParam 　述: F/A/D/R/X 1.00/1.00/1.00/1.00/0.00>,
  <TokenParam 事件: F/A/D/R/X 0.00/0.00/0.00/0.12/0.00>,
  <TokenParam 不在: F/A/D/R/X 0.09/0.04/0.00/0.00/0.00>,
  <TokenParam 　預: F/A/D/R/X 0.98/0.99/0.00/0.90/0.00>,
  <TokenParam 　期: F/A/D/R/X 1.00/1.00/0.55/0.81/0.00>,
  <TokenParam 　中: F/A/D/R/X 0.00/0.00/0.00/0.00/0.00>])

In [12]:
# Persist the (frame, token params) pairs; the path is reused by the hash check below.
annot_params_list_path = "../data/annot_tokenparams_list.pkl"
output_file = Path(annot_params_list_path)
with output_file.open("wb") as pkl_out:
    pickle.dump(annot_frameinfo_list, pkl_out)

In [16]:
from itertools import chain

# NOTE(review): this cell carries execution count 16 between cells 12 and 13,
# i.e. it was run out of order; it only needs annot_frameinfo_list to exist.

# Selection thresholds (previously inline magic numbers).
MIN_FULL_TO_REPLACED_RATIO = 5
MIN_FULL_PROB = .8

# Flatten the per-frame token-param lists into a single stream of tokens.
param_iter = chain.from_iterable(x[1] for x in annot_frameinfo_list)

# Keep tokens whose full_prob exceeds MIN_FULL_PROB and is more than
# MIN_FULL_TO_REPLACED_RATIO times their replaced_prob. The ratio test is
# written as a multiplication so that replaced_prob == 0 cannot trigger a
# division by zero; this assumes the probabilities are non-negative.
sem_tokens = [
    x for x in param_iter
    if x.full_prob > MIN_FULL_PROB
    and x.full_prob > MIN_FULL_TO_REPLACED_RATIO * x.replaced_prob
]

## Output Hashes

```
..\data\annot_tokenparams_list.pkl 1ff297
```

In [13]:

# Print the short hash of the pickle written above so it can be compared with
# the value recorded under "Output Hashes"; the return value is discarded.
_ = check_hashes([annot_params_list_path])

..\data\annot_tokenparams_list.pkl 1ff297
