# Decoder Dependency dev.

In [1]:
# Enable IPython's autoreload extension in mode 2 so imported modules are
# reloaded before each cell runs (useful while editing the local package).
%load_ext autoreload
%autoreload 2
# Add ../src to the import path exactly once; presumably this is where the
# local `vec4gloss` package lives — TODO confirm.
import sys
if "../src" not in sys.path:
    sys.path.append("../src")

In [2]:
import json
import pickle
import random
import re
import traceback
from collections import Counter
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import List, Tuple

import numpy as np
import torch
from tqdm.auto import tqdm

from transformers import MT5TokenizerFast

from CwnGraph import CwnImage
import vec4gloss
from vec4gloss import check_hashes
from vec4gloss import Vec4GlossModel
from vec4gloss import (
    AnnotFrame, Scheme, 
    AnnotDepInfo,
    AnnotFrameInfo)

from vec4gloss import (
    DepWinOutput,
    compute_mean_vec,
    generate_seq_masks,
    compute_dependents,
    select_highest_lr)        

## Data dependencies
```
..\data\annotation.json 2ed250
..\data\models\vec4gloss-defgen-220629-1250\pytorch_model.bin 9f894f
```

In [3]:
# Check the input artifacts against the hashes recorded under
# "Data dependencies" before anything is loaded from them.
vec4gloss_model_dir = "../data/models/vec4gloss-defgen-220629-1250"
required_files = [
    "../data/annotation.json",
    vec4gloss_model_dir + "/pytorch_model.bin",
]
_ = check_hashes(required_files)

..\data\annotation.json 2ed250
..\data\models\vec4gloss-defgen-220629-1250\pytorch_model.bin 9f894f


## Loading resources

In [4]:
# Read the annotation file as UTF-8 text, then parse the JSON payload.
annot_json_text = Path("../data/annotation.json").read_text(encoding="UTF-8")
annot_data = json.loads(annot_json_text)

In [5]:
# Sanity check: number of annotation records plus one sample record (index 45).
len(annot_data), annot_data[45]

(288,
 {'sense_id': 3048001,
  'head_word': '沿街',
  'POS': 'D',
  'definition': '表同一事件在經過的街道中重複發生。',
  'event_role': 'agent',
  'schemas': [{'type': 'event', 'start': 1, 'end': 5},
   {'type': 'scope', 'start': 5, 'end': 6},
   {'type': 'place', 'start': 6, 'end': 11},
   {'type': 'scope', 'start': 11, 'end': 12},
   {'type': 'mod', 'start': 12, 'end': 14},
   {'type': 'action', 'start': 14, 'end': 16}]})

In [6]:
## Loading model
# Run on CUDA only when a device is available and its name does not contain
# "GeForce"; otherwise fall back to the CPU.
# NOTE(review): the reason for excluding GeForce devices is not stated here.
use_cuda = torch.cuda.is_available() and "GeForce" not in torch.cuda.get_device_name()
if use_cuda:
    device = "cuda"
else:
    device = "cpu"
print("Using", device)

# The model and the tokenizer are both read from the same checkpoint directory.
model = Vec4GlossModel.from_pretrained(vec4gloss_model_dir)
model = model.to(device)
tokenizer = MT5TokenizerFast.from_pretrained(vec4gloss_model_dir)
gen = vec4gloss.gen_func(tokenizer, model)

Using cpu


In [7]:
# Load the CWN image for a fixed version tag so sense lookups stay reproducible.
# NOTE(review): whether CwnImage.load reads a local cache or downloads the image
# is not visible from this notebook.
CWN_VER = "v.2022.06.21"
cwn = CwnImage.load(CWN_VER)

## Annotation Frame

In [8]:
def _build_annot_frame(raw_annot: dict) -> AnnotFrame:
    """Convert one raw annotation record into an ``AnnotFrame``.

    Each entry of ``raw_annot["schemas"]`` is expanded into a ``Scheme``; every
    other key of the record is forwarded unchanged to ``AnnotFrame`` as a
    keyword argument. The input record is not modified.
    """
    schemas = [Scheme(**scheme_kwargs) for scheme_kwargs in raw_annot["schemas"]]
    frame_fields = {key: value for key, value in raw_annot.items() if key != "schemas"}
    return AnnotFrame(**frame_fields, schemas=schemas)


# One frame per annotation record, in the same order as `annot_data`.
annot_frames = [_build_annot_frame(raw_annot) for raw_annot in annot_data]

In [9]:
# Preview the first 20 annotation frames via their own show() method.
for frame_no in range(20):
    frame = annot_frames[frame_no]
    frame.show()

<-->表 <neg>沒有 <noun>足夠的時間 <action>進行 <event>後述事件
<-->表 <mod>勉強 <action>去做 <event>後述事件
<-->表 <mod>親身 <action>體驗 <event>後述事件
<-->表 <event>後述事件 <neg>不 <scope>在 <noun>預期中
<-->表 <exist>具備 <action>達到 <event>後述事件 <action>成立 <-->的 <noun>條件
<-->表 <event>後述事件 <scope>在 <time>很短的時間內 <action>完成
<-->表 <event>後述事件 <action>發生 <-->的 <time>時間 <scope>比 <noun>預期 <value>早
<-->表 <action>做 <event>後述事件 <-->的 <degree>程度 <value>非常低
<-->表 <action>做 <event>後述事件 <-->的 <degree>程度 <value>非常高
<-->表比喻 <scope>只 <action>做 <event>後述事件 <-->而 <neg>沒有 <mod>其他 <-->的 <action.NOM>行動
<-->表 <event>後述事件 <neg>不 <scope>在 <noun>預期 <scope>之中 <action>發生
<-->表 <exist>有 <noun>足夠的時間 <action>進行 <event>後述事件
<-->表 <event>後述事件 <mod>一直 <action>持續
<-->表 <action>進行 <event>後述事件 <-->但 <neg>沒有 <action>產生 <noun>預期的效果
<-->表 <instrument>以 <noun>相同一致的方法 <action>進行 <event>後述事件
<-->表比喻 <neg>未 <mod>經多加考慮地 <action>進行 <event>後述事件
<-->表 <action>進行 <event>後述事件 <action>涉及 <-->的 <place>空間範圍 <value>大且數量多
<-->表 <neg>不 <scope>對 <event>後述事件 <action>設定 <noun>任何限制
<

## Decoding

In [10]:
## test generate_seq_masks
# Smoke test on a single frame: build the "<POS>。<definition>" target text,
# keep only its first `tgt_idx` token ids and let generate_seq_masks print its
# debug output for that prefix (dbg=True).
annot_x = annot_frames[34]
print(annot_x.head_word)
tgt_idx = 7
tgt_text = "{}。{}".format(annot_x.POS, annot_x.definition)
# Id 0 is prepended to the truncated sequence; presumably the decoder start
# token — TODO confirm.
tgt_seq = [0, *tokenizer(tgt_text)["input_ids"][:tgt_idx]]
_ = generate_seq_masks(tgt_seq, tokenizer, model, dbg=True, max_win=10)

紛紛
( 8: 8) D。表事件接連不
( 7: 8) D。表事件接連Ｏ
( 6: 8) D。表事件接ＯＯ
( 5: 8) D。表事件ＯＯＯ
( 4: 8) D。表ＯＯＯＯ
( 3: 8) D。ＯＯＯＯＯ
( 3: 7) D。ＯＯＯＯ不
( 3: 6) D。ＯＯＯ連不
( 3: 5) D。ＯＯ接連不
( 3: 4) D。Ｏ事件接連不


## Building decoder data objects

In [11]:
# Token id whose first occurrence marks where dependency analysis starts.
# Presumably the id of the "。" that separates the POS tag from the definition
# in the target text — TODO confirm against the tokenizer vocabulary.
PERIOD_TOKEN_ID = 306

annot_frameinfo_list: List[AnnotFrameInfo] = []
empty_senses = []    # sense ids for which compute_mean_vec returned None
failed_senses = []   # sense ids whose processing raised an exception

for annot_x in tqdm(annot_frames):
    try:
        # CWN sense ids are looked up as zero-padded 8-digit strings.
        cwn_sense = cwn.from_sense_id("{:08d}".format(annot_x.sense_id))
        mean_vec = compute_mean_vec(cwn_sense.examples, tokenizer, model)
        if mean_vec is None:
            # No usable vector for this sense: record it and move on.
            empty_senses.append(annot_x.sense_id)
            continue

        tgt_text = "{}。{}".format(annot_x.POS, annot_x.definition)
        tgt_seq = tokenizer(tgt_text)["input_ids"]
        period_idx = tgt_seq.index(PERIOD_TOKEN_ID)

        # Analyse every position after the first PERIOD_TOKEN_ID, stopping two
        # tokens before the end of the sequence (presumably the closing
        # punctuation and the end-of-sequence token — TODO confirm).
        annot_depinfo = []
        for idx in range(period_idx + 1, len(tgt_seq) - 2):
            depwins = compute_dependents(tgt_seq, idx, mean_vec, tokenizer, model, dbg=False)
            sel_dep_idx = select_highest_lr(depwins)
            annot_depinfo.append(AnnotDepInfo(idx, depwins, sel_dep_idx))
        annot_frameinfo_list.append(AnnotFrameInfo(annot_x, annot_depinfo))
    except Exception as ex:
        # Best effort: one bad frame must not abort the whole run, but keep a
        # record of it so dropped frames can be audited afterwards.
        failed_senses.append(getattr(annot_x, "sense_id", None))
        traceback.print_exc()
        print("Exception: ", ex)

if failed_senses:
    print("Frames skipped because of errors:", len(failed_senses), failed_senses)

  0%|          | 0/288 [00:00<?, ?it/s]

In [12]:
# Inspect one dependency record: entry 5 of the first frame's dep_info list.
annot_frameinfo_list[0].dep_info[5]

<AnnotDepInfo (7) 1.00: ( 3- 8) [ 1.00] 　述 / 表勉強去做後>

In [13]:
## empty_senses
# Number of senses skipped because compute_mean_vec returned None, followed by
# the number of frames that were successfully built.
len(empty_senses), len(annot_frameinfo_list)

(44, 244)

In [14]:
# Persist the decoder dependency objects as a pickle for downstream notebooks.
annot_frameinfo_list_path = "../data/annot_frameinfo_list.pkl"
with Path(annot_frameinfo_list_path).open("wb") as pkl_file:
    pickle.dump(annot_frameinfo_list, pkl_file)

## Output Hashes

```
..\data\annot_frameinfo_list.pkl f25b87
```

In [15]:
# Report the hash of the pickle written to annot_frameinfo_list_path so it can
# be compared with the value recorded under "Output Hashes".
_ = check_hashes([annot_frameinfo_list_path])

..\data\annot_frameinfo_list.pkl f25b87
