# Build dependency info of nouns in rating.n10

In [1]:
%load_ext autoreload
%autoreload 2
import sys
# Make the project's local packages under ../src importable from this notebook.
# The membership guard keeps re-running the cell from adding duplicate entries.
src_dir = "../src"
already_registered = src_dir in sys.path
if not already_registered:
    sys.path.append(src_dir)

In [2]:
import json
import pickle
import random
import re
import traceback
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import List, Tuple

import numpy as np
import torch
from tqdm.auto import tqdm

from transformers import MT5TokenizerFast

from CwnGraph import CwnImage
import vec4gloss
from vec4gloss import check_hashes
from vec4gloss import Vec4GlossModel
from vec4gloss import (
    AnnotDepInfo,
    NounFrameInfo)

from vec4gloss import (
    DepWinOutput,
    compute_mean_vec,
    generate_seq_masks,
    compute_dependents,
    select_highest_lr)        

## Data dependencies
```
30.02 => ..\data\rating_materials.n10.raw.pkl 4b7ae3
20.21 => ..\data\models\vec4gloss-defgen-220629-1250\pytorch_model.bin 9f894f
```

In [3]:
# Directory holding the fine-tuned vec4gloss checkpoint; reused when the model
# and tokenizer are loaded further down.
vec4gloss_model_dir = "../data/models/vec4gloss-defgen-220629-1250"

# Input artifacts this notebook depends on. check_hashes presumably reports a
# short hash per file (see the cell output) to compare against the values
# recorded under "Data dependencies" -- verify against vec4gloss.check_hashes.
model_weights_path = vec4gloss_model_dir + "/pytorch_model.bin"
input_artifacts = ["../data/rating_materials.n10.raw.pkl", model_weights_path]
_ = check_hashes(input_artifacts)

..\data\rating_materials.n10.raw.pkl 4b7ae3
..\data\models\vec4gloss-defgen-220629-1250\pytorch_model.bin 9f894f


## Loading resources

In [4]:
# Load the raw rating materials.
# NOTE: unpickling can execute arbitrary code; only load this file from a trusted source.
n10_path = Path("../data/rating_materials.n10.raw.pkl")
with n10_path.open("rb") as pkl_in:
    n10_data = pickle.load(pkl_in)

In [5]:
# Keep only the items whose id starts with "N" and whose "from" field is
# "vec4gloss"; the "from" lookup is only attempted for ids that pass the
# prefix test.
nouns = dict(
    (item_id, item)
    for item_id, item in n10_data.items()
    if item_id.startswith("N") and item["from"] == "vec4gloss"
)

In [6]:
# Sanity check: number of entries selected into `nouns`.
len(nouns)

20

In [7]:
# Peek at the first three (item_id, entry) pairs to inspect the entry fields.
list(nouns.items())[:3]

[('N-05',
  {'cwnid': '09267502',
   'src': '打造<昆>菜品牌的同時，還應努力提高我市蔬菜產品的市場競爭力和占有率。',
   'tgt': 'Na。昆明的文化。',
   'pos': 'N',
   'target': '昆',
   'fillers': ['文', '文字', '水'],
   'example': '打造<昆>菜品牌的同時，還應努力提高我市蔬菜產品的市場競爭力和占有率。',
   'from': 'vec4gloss',
   'definition': '昆蟲的文化。',
   'item_id': 'N-05'}),
 ('N-06',
  {'cwnid': '09167302',
   'src': '昨天他派宦官已經暗示過張延賞，今天將由張延賞來挑主<樑>力促此事，但韓卻看破他的企圖，一上來便釜底抽薪，打亂了他地部署。',
   'tgt': 'Na。比喻在特定事件中最重要的角色。',
   'pos': 'N',
   'target': '樑',
   'fillers': ['學院', '室友', '弟兄'],
   'example': '昨天他派宦官已經暗示過張延賞，今天將由張延賞來挑主<樑>力促此事，但韓卻看破他的企圖，一上來便釜底抽薪，打亂了他地部署。',
   'from': 'vec4gloss',
   'definition': '比喻在特定事件中最重要的角色。',
   'item_id': 'N-06'}),
 ('N-07',
  {'cwnid': '06586403',
   'src': '當<需要>不變但供給減少時均衡價格會上漲。',
   'tgt': 'Na。在特定時間、市場，依特定價格而獲得滿足的商品量。',
   'pos': 'N',
   'target': '需要',
   'fillers': ['卡', '撢', '料'],
   'example': '當<需要>不變但供給減少時均衡價格會上漲。',
   'from': 'vec4gloss',
   'definition': '特定對象所需要的所有權。',
   'item_id': 'N-07'})]

In [8]:
## Loading model
# Run on CUDA only when a device is available and its name does not contain
# "GeForce"; otherwise fall back to the CPU.
# NOTE(review): the reason for excluding GeForce cards is not visible here -- confirm.
use_cuda = False
if torch.cuda.is_available():
    use_cuda = "GeForce" not in torch.cuda.get_device_name()

if use_cuda:
    device = "cuda"
else:
    device = "cpu"
print("Using", device)

# Model weights and tokenizer are both read from the same checkpoint directory.
model = Vec4GlossModel.from_pretrained(vec4gloss_model_dir)
model = model.to(device)
tokenizer = MT5TokenizerFast.from_pretrained(vec4gloss_model_dir)
gen = vec4gloss.gen_func(tokenizer, model)

Using cpu


## Computing dependency

In [9]:
## test generate_seq_masks
# Smoke test on a single item: print the masks generated for a truncated
# prefix of the target sequence (dbg=True produces the listing below).
entry = nouns["N-07"]
tgt_text = entry["tgt"]
tgt_idx = 9
# Keep the first `tgt_idx` token ids and prepend id 0
# (presumably the decoder start/pad token -- TODO confirm).
tgt_seq = [0] + tokenizer(tgt_text)["input_ids"][:tgt_idx]
_ = generate_seq_masks(
    tgt_seq, tokenizer, model,
    dbg=True, max_win=10)

(10:10) Na。在特定時間、市場,依
( 9:10) Na。在特定時間、市場,Ｏ
( 8:10) Na。在特定時間、市場ＯＯ
( 7:10) Na。在特定時間、ＯＯＯ
( 6:10) Na。在特定時間ＯＯＯＯ
( 5:10) Na。在特定ＯＯＯＯＯ
( 4:10) Na。在ＯＯＯＯＯＯ
( 3:10) Na。ＯＯＯＯＯＯＯ
( 3: 9) Na。ＯＯＯＯＯＯ依
( 3: 8) Na。ＯＯＯＯＯ,依
( 3: 7) Na。ＯＯＯＯ市場,依
( 3: 6) Na。ＯＯＯ、市場,依
( 3: 5) Na。ＯＯ時間、市場,依
( 3: 4) Na。Ｏ特定時間、市場,依


## Building decoder data objects

In [10]:
# Build one NounFrameInfo per noun item: for every token position of the
# definition part of `tgt`, compute the candidate dependency windows and record
# the one chosen by select_highest_lr.

# Token id searched for to locate the end of the POS tag in `tgt`
# (e.g. "Na。..."). NOTE(review): presumably the id of the full-width period
# "。" in this checkpoint's vocabulary -- verify if the tokenizer changes.
PERIOD_TOKEN_ID = 306

noun_frameinfo_list: List[NounFrameInfo] = []
empty_senses = []    # ids of items for which compute_mean_vec returned None
failed_entries = []  # ids of items that raised while being processed
for entry_id, entry_x in tqdm(nouns.items()):
    try:
        annot_depinfo = []
        example = entry_x["example"]
        mean_vec = compute_mean_vec([example], tokenizer, model)
        if mean_vec is None:
            empty_senses.append(entry_id)
            continue

        tgt_text = entry_x["tgt"]
        tgt_seq = tokenizer(tgt_text)["input_ids"]
        # Raises ValueError (handled below) when the separator id is absent.
        period_idx = tgt_seq.index(PERIOD_TOKEN_ID)

        # Start right after the first separator and stop before the last two
        # token ids (presumably the closing period and the end-of-sequence
        # token -- TODO confirm).
        for idx in range(period_idx+1, len(tgt_seq)-2):
            depwins = compute_dependents(tgt_seq, idx, mean_vec, tokenizer, model, dbg=False)
            sel_dep_idx = select_highest_lr(depwins)
            annot_depinfo.append(AnnotDepInfo(idx, depwins, sel_dep_idx))
        noun_frameinfo_list.append(NounFrameInfo(entry_x, annot_depinfo))
    except Exception as ex:
        # Best effort: report the failure and continue with the next item, but
        # remember which item failed so it can be inspected afterwards.
        failed_entries.append(entry_id)
        traceback.print_exc()
        print("Exception: ", ex)

  0%|          | 0/20 [00:00<?, ?it/s]

In [11]:
# Inspect one result: the dependency record at index 2 of the first processed noun.
noun_frameinfo_list[0].dep_info[2]

<AnnotDepInfo (4) 1.00: ( 3- 5) [ 1.00] 文化 / 昆明的>

In [12]:
## empty_senses
# (number of items skipped because compute_mean_vec returned None,
#  number of items for which a NounFrameInfo was built)
len(empty_senses), len(noun_frameinfo_list)

(0, 20)

In [13]:
# Persist the computed frame info for downstream notebooks. The path is kept as
# a plain string because it is passed to check_hashes afterwards.
noun_frameinfo_list_path = "../data/nouns_vec4gloss_frameinfo_list.pkl"
frameinfo_out = Path(noun_frameinfo_list_path)
with frameinfo_out.open("wb") as pkl_out:
    pickle.dump(noun_frameinfo_list, pkl_out)

## Output Hashes

```
..\data\nouns_vec4gloss_frameinfo_list.pkl 9ad739
```

In [14]:
# Report the hash of the file just written so it can be compared with the value
# recorded under "Output Hashes" (exact behavior of check_hashes is defined in
# vec4gloss -- presumably it prints path and short hash, as in the output below).
_ = check_hashes([noun_frameinfo_list_path])

..\data\nouns_vec4gloss_frameinfo_list.pkl 9ad739
