# Decoding Logits - Sandbox

In [1]:
%load_ext autoreload
%autoreload 2
import sys
if "../src" not in sys.path:
    sys.path.append("../src")

In [2]:
from datetime import datetime
from pathlib import Path
from itertools import islice
from tqdm.auto import tqdm
import numpy as np
import torch
from torch.utils.data import DataLoader

import datasets
from datasets import Dataset
from transformers import DataCollatorForSeq2Seq
from transformers import MT5TokenizerFast
from vec4gloss import check_hashes
from vec4gloss import Vec4GlossModel

In [3]:
ds_defgen = datasets.load_from_disk("../data/defgen_dataset_cwn")
vec4gloss_model_dir = "../data/models/vec4gloss-defgen-220628-1546"

In [4]:
tokenizer = MT5TokenizerFast.from_pretrained(vec4gloss_model_dir)

## Preprocess functions

In [5]:
max_length = 256

def get_marked_pos(text):
    assert text.count("<") == text.count(">") == 1
    s, e = text.index("<")+1, text.index(">")    
    assert s != e
    return s, e

def add_marked_pos(ex):
    pos = get_marked_pos(ex["src"])
    return {"decoder_start_markers": pos[0], "decoder_end_markers": pos[1]}

def preprocess_fn(batch):    
    src_batch = tokenizer(batch["src"], 
                          max_length=max_length, truncation=True)
    start_markers = [src_batch.char_to_token(bi,s) 
                     for bi, s in enumerate(batch["decoder_start_markers"])]
    end_markers = [src_batch.char_to_token(bi,e) 
                   for bi, e in enumerate(batch["decoder_end_markers"])]
    
    with tokenizer.as_target_tokenizer():
        tgt_batch = tokenizer(batch["tgt"],
                              max_length=max_length, truncation=True)        
        
    return {
        **src_batch, 
        "decoder_start_markers": start_markers,
        "decoder_end_markers": end_markers,
        "labels": tgt_batch["input_ids"]
    }

In [6]:
drop_columns = ["cwnid", "src", "tgt"]
ds_defgen = (ds_defgen.map(add_marked_pos)
             .map(preprocess_fn, batched=True, remove_columns=drop_columns))
train_ds = ds_defgen["train"]
test_ds = ds_defgen["test"]

Loading cached processed dataset at ../data/defgen_dataset_cwn/train\cache-73645a22723c5115.arrow
Loading cached processed dataset at ../data/defgen_dataset_cwn/test\cache-79eff688f816c6ef.arrow
Loading cached processed dataset at ../data/defgen_dataset_cwn/train\cache-7d7b0c80d6938a82.arrow
Loading cached processed dataset at ../data/defgen_dataset_cwn/test\cache-51609cdc52b73bd9.arrow


## Model

In [7]:
torch.manual_seed(12345)
model = Vec4GlossModel.from_pretrained(vec4gloss_model_dir)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding="longest")

In [8]:
g_cuda = torch.Generator()
g_cuda.manual_seed(211321)
loader = DataLoader(train_ds, batch_size=2, collate_fn=data_collator, shuffle=True, generator=g_cuda)
batches = list(islice(loader, 10))

In [9]:
# labels
batch = batches[0]
tokenizer.batch_decode(torch.where(batch["labels"]>=0, batch["labels"], 0))

['VC。模仿或照原樣重製他人的創意當作自己的。</s>',
 'Nc。美術館的建築物及建築物所在的位置。</s><pad><pad><pad><pad><pad>']

In [10]:
## direct decoding settings
with torch.no_grad():    
    batch = batches[0]
    out = model(**batch)
tokenizer.batch_decode(out.logits.argmax(2))

['VC。比仿特定仿原來的的複的的文字新。作特定作品</s>', 'Nc。美術館的建築物及建築物所在的位置。</s></s> N N N N']

In [11]:
## generation setting
dbg = {}
batch = batches[0]
gen_batch = {k:v for k, v in batch.items() if k not in ("labels", "decoder_input_ids")}
tokenizer.batch_decode(model.generate(**gen_batch))

['<pad> VC。比喻將資料依照特定格式複製。</s>', '<pad> Nc。美術館的建築物及建築物所在的位置。</s>']

In [13]:
genout = model.generate(**gen_batch, return_dict_in_generate=True, 
                        output_scores=True, output_hidden_states=True, 
                        output_attentions=True
                       )

In [14]:
list(genout.keys())

['sequences',
 'scores',
 'encoder_attentions',
 'encoder_hidden_states',
 'decoder_attentions',
 'cross_attentions',
 'decoder_hidden_states']

In [15]:
tokenizer.batch_decode(torch.vstack([genout.scores[i].argmax(1) for i in range(len(genout.scores))]).permute(1,0))

['VC。比喻將資料依照特定格式複製。</s>', 'Nc。美術館的建築物及建築物所在的位置。</s>']

In [16]:
intext = "我們<上>山途中欣賞沿途風景。"
vbatch = tokenizer(intext, return_tensors="pt")
s,e = get_marked_pos(intext)
s = vbatch.char_to_token(s)
e = vbatch.char_to_token(e)
vbatch["decoder_start_markers"] = torch.tensor([s])
vbatch["decoder_end_markers"] = torch.tensor([e])

In [17]:
vgenout = model.generate(**vbatch, max_length=30, 
                         return_dict_in_generate=True,
                         output_scores=True, output_hidden_states=True, 
                        output_attentions=True)
tokenizer.batch_decode(vgenout.sequences)

['<pad> VCL。往高處移動。</s>']

In [18]:
list(vgenout.keys())

['sequences',
 'scores',
 'encoder_attentions',
 'encoder_hidden_states',
 'decoder_attentions',
 'cross_attentions',
 'decoder_hidden_states']

In [19]:
## this is what I expect if decoding only uses a single vector
## It's a tuple(gen length) of tuple (decoder layers) of tensor
## (batch_size, num_heads, sequence_length, sequence_length)
vgenout.cross_attentions[4][7].shape

torch.Size([1, 12, 1, 1])

## split the encoder decoder

In [21]:
def extract_encoder_vector(intext, tokenizer, model):    
    vbatch = tokenizer(intext, return_tensors="pt")
    s,e = get_marked_pos(intext)   
    s = vbatch.char_to_token(s)
    e = vbatch.char_to_token(e)
    vbatch["decoder_start_markers"] = torch.tensor([s])
    vbatch["decoder_end_markers"] = torch.tensor([e])
    encoder = model.get_encoder()
    enc_out = encoder(
            input_ids=vbatch["input_ids"], 
            attention_mask=vbatch["attention_mask"])
    enc_vec = enc_out.last_hidden_state[[0],s:e,:] \
                     .mean(1, keepdim=True)
    return enc_vec

def decode_vector(vec, tokenizer, model, max_length=50):
    vgenout = model.generate(decoder_encoder_vector=vec, bos_token_id=0, max_length=max_length)
    return tokenizer.batch_decode(vgenout[:, 1:-1])[0]
enc_vec = extract_encoder_vector("我們<上>山途中欣賞沿途風景。", tokenizer, model)
decode_vector(enc_vec, tokenizer, model)

'VCL。往高處移動。'

## Morphing vectors

In [22]:
enc_vec1 = extract_encoder_vector("我們<上>山途中欣賞沿途風景。", tokenizer, model)
enc_vec2 = extract_encoder_vector("我們在山裡<野餐>。", tokenizer, model)
delta = enc_vec2 - enc_vec1
for i in np.arange(0, 1.01, 0.2):
    print(f"{i:.2f}", decode_vector(enc_vec1+delta*i, tokenizer, model))

0.00 VCL。往高處移動。
0.20 VCL。從參考位置的外面移到參考位置的裡面。
0.40 VCL。從參考位置的外面移到參考位置的裡面。
0.60 nom,VA。進行餐飲活動。
0.80 nom,VA。人們在約定俗成的固定時間內吃正餐。
1.00 nom,VA。人們在約定俗成的固定時間內吃正餐。


In [23]:
enc_vec1 = extract_encoder_vector("我們上山時，天<突然>下起了大雪。", tokenizer, model)
enc_vec2 = extract_encoder_vector("為什麼我的留言板是<空>的？", tokenizer, model)
delta = enc_vec2 - enc_vec1
for i in np.arange(0, 1.01, 0.2):
    print(f"{i:.2f}", decode_vector(enc_vec1+delta*i, tokenizer, model))

0.00 D。表突然出現在腦海中。
0.20 D。表突然出現在腦海中。
0.40 VH。形容突然出現在螢幕上。
0.60 VH。形容特定對象沒有被使用。
0.80 VH。形容比喻特定對象沒有被使用。
1.00 VH。形容比喻特定對象沒有被使用。


In [24]:
enc_vec1 = extract_encoder_vector("那是一位嬌豔如<花>的少女。", tokenizer, model)
enc_vec2 = extract_encoder_vector("以動畫方式慢速地顯示字母的每一<筆>一劃。", tokenizer, model)
delta = enc_vec2 - enc_vec1
for i in np.arange(0, 1.01, 0.2):
    print(f"{i:.2f}", decode_vector(enc_vec1+delta*i, tokenizer, model))

0.00 Na。植物名,薔薇科花屬,葉卵形,花瓣五片,花瓣五片,花瓣五片,花瓣五片,花瓣五片,花瓣五片,花瓣五
0.20 Na。植物名,薔薇科花屬,葉卵形,花瓣五片,花瓣五片,花瓣五片,花瓣五片,花瓣五片,花瓣五片,花瓣五
0.40 Na。以花為形象製成的人造物。
0.60 Na。筆畫的一種,由左往右的筆畫。
0.80 Na。筆畫的一種,由左往右的筆畫。
1.00 Na。筆畫的一種,由左往右的筆畫。


In [25]:
enc_vec1 = extract_encoder_vector("昨天我的<貓>抓了三隻老鼠。", tokenizer, model)
enc_vec2 = extract_encoder_vector("昨天我的貓<抓>了三隻老鼠。", tokenizer, model)
delta = enc_vec2 - enc_vec1
for i in np.arange(0, 1.01, 0.2):
    print(f"{i:.2f}", decode_vector(enc_vec1+delta*i, tokenizer, model))

0.00 Na。哺乳類動物,偶蹄,腳短,身體肥胖,為主要用於食用的家畜。
0.20 Na。以狗為形象製成的人造物。
0.40 Na。以狗為形象製成的人造物。
0.60 VC。用手或手持工具捕捉後述對象。
0.80 VC。用手或手持工具捕捉後述對象。
1.00 VC。用手或手持工具捕捉後述對象。
