In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from mesh_import import mesh

  from tqdm.autonotebook import tqdm


In [3]:
from mesh.senses import CorpusStreamer

In [4]:
# Corpus accessor used by the next cells, which read its `char_indices` and
# `sentences` attributes. What the constructor loads is not visible here.
streamer = CorpusStreamer()

In [5]:
# Number of keys in the character index. The next cell looks entries up by a
# single character, so this is presumably the count of distinct characters.
len(streamer.char_indices.keys())

6960

In [6]:
# Show the first ten sentences indexed under "嵌". Each sentence is a sequence
# of items whose first element is the surface text, so joining those elements
# reconstructs the raw sentence.
sample_sentence_ids = streamer.char_indices["嵌"][:10]
for sentence_id in sample_sentence_ids:
    sentence_items = streamer.sentences[sentence_id]
    surface_text = "".join(item[0] for item in sentence_items)
    print(surface_text)

捕捉形象世界之美、人與人之間相互施加的不公、男人對女人的壓迫、成人對孩童的嵌制、受迫者臉上明顯可辨的恐懼、小孩的驚奇眼光、以及環境本身恣意肆虐的破壞力等等。
只見對面一座石山上嵌著兩扇鐵鑄的大門。
嵌在大山巖中。
應該用金屬條鑲嵌。
或其他嵌上之文字或圖樣。
恍若象牙雕刻上細緻鑲嵌的紅寶石。
少年的指尖顫慄嵌入百合花心，
終於嵌入皮肉之間，
幻燈片以及馬賽克鑲嵌。
大都集中在赤嵌樓附近的米街。


In [7]:
from transformers import DistilBertModel, DistilBertTokenizerFast

In [9]:
# One checkpoint name feeds both loaders so tokenizer and model stay matched.
model_name = "distilbert-base-multilingual-cased"
# Later cells call `char_to_token` on the tokenizer's output; per the
# transformers docs that alignment API needs a "Fast" tokenizer.
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
model = DistilBertModel.from_pretrained(model_name)

In [10]:
from CwnGraph import CwnBase

In [11]:
# Handle used below to look up senses (`cwn.find_all_senses`). Presumably the
# Chinese Wordnet database; the data source is not visible in this notebook.
cwn = CwnBase()

In [12]:
def find_target(text):
    """Strip ``<...>`` target markers from ``text`` and report the marked spans.

    Example sentences mark the target word by wrapping it in angle brackets,
    e.g. ``"ab<c>de"``. This removes the brackets and returns where each
    marked segment sits in the cleaned string.

    Args:
        text: Sentence that may contain zero or more ``<target>`` markers.

    Returns:
        A ``(clean_text, spans)`` tuple.

        * ``clean_text`` is ``text`` with every ``<`` and ``>`` removed.
        * ``spans`` is ``None`` when ``text`` contains no ``<`` at all;
          otherwise a list of ``(start, end)`` character offsets into
          ``clean_text`` (end exclusive), one per properly closed marker, in
          order of appearance. A ``<`` with no following ``>`` is dropped from
          the text and contributes no span, so the list may be empty.
    """
    head, *marked_parts = text.split("<")
    if not marked_parts:
        # No opening marker: nothing to locate, just drop any stray brackets.
        # (str.replace needs both arguments; the one-argument form raises.)
        return (text.replace("<", "").replace(">", ""), None)

    buf = head
    pos = []
    for tok in marked_parts:
        xs = tok.split(">")
        if len(xs) <= 1:
            # Unclosed "<": keep the text but record no span. Strings are
            # immutable, so concatenate rather than append, and skip the span
            # bookkeeping below.
            buf += tok
            continue
        # The target starts where the cleaned text currently ends and covers
        # everything up to the first ">".
        pos.append((len(buf), len(buf) + len(xs[0])))
        buf += "".join(xs)
    return buf, pos

In [13]:
# Query CWN for the senses of "聽". The result is indexed by position in later
# cells (senses[0], senses[4]).
senses = cwn.find_all_senses("聽")

In [14]:
# Example sentences for the sense at index 4. Each target occurrence is
# wrapped in <...>, which find_target strips and locates.
senses[4].all_examples()

['她拿出一<聽>雲南火腿罐頭，帶著幾分羞怯，幾分纏綿地三到他手裡。',
 '服務生先是端上裝滿冰塊的杯子，然後倒入一<聽>雪碧，最後從一個小碟子裡取出兩個類似於檸檬乾的東西，黑黑的，乾乾的，丟入杯內。',
 '由於擔心自己被狗咬後，兩個月內不再餵孩子母乳，而讓孩子改喝奶粉。一<聽>奶粉152元，兩個月要8<聽>，所以向李某要求1000元賠償金。']

In [15]:
# Clean every example sentence and collect, in parallel tuples, the cleaned
# texts and the character spans of their marked targets.
sense_examples = senses[4].all_examples()
parsed_examples = [find_target(example) for example in sense_examples]
txt_list, pos_list = zip(*parsed_examples)

In [16]:
# Cleaned sentences, with the <...> markers removed.
txt_list

('她拿出一聽雲南火腿罐頭，帶著幾分羞怯，幾分纏綿地三到他手裡。',
 '服務生先是端上裝滿冰塊的杯子，然後倒入一聽雪碧，最後從一個小碟子裡取出兩個類似於檸檬乾的東西，黑黑的，乾乾的，丟入杯內。',
 '由於擔心自己被狗咬後，兩個月內不再餵孩子母乳，而讓孩子改喝奶粉。一聽奶粉152元，兩個月要8聽，所以向李某要求1000元賠償金。')

In [17]:
# Per-sentence lists of (start, end) character offsets of the target.
pos_list

([(4, 5)], [(20, 21)], [(33, 34), (46, 47)])

In [20]:
# Pad every sentence to a common length: the longest sentence's character
# count plus two extra positions (room for the special tokens the tokenizer
# adds around each sequence).
longest_text_len = max(len(sentence) for sentence in txt_list)
input_tensors = tokenizer.batch_encode_plus(
    list(txt_list),
    max_length=longest_text_len + 2,
    pad_to_max_length=True,
    return_tensors="pt",
)


In [21]:
# Map each target's starting character offset to its token position, giving
# (sentence index, token index) pairs, then split the pairs into two parallel
# tuples for tensor indexing.
indices = [
    (sentence_idx, input_tensors.char_to_token(sentence_idx, span[0]))
    for sentence_idx, spans in enumerate(pos_list)
    for span in spans
]
batch_indices, seq_indices = zip(*indices)
indices

[(0, 5), (1, 21), (2, 34), (2, 45)]

In [24]:
# Sentence index of each target; a sentence with two targets appears twice.
batch_indices

(0, 1, 2, 2)

In [22]:
import torch  # local import: the notebook's own `import torch` cell only appears further down

# Inference only: skip autograd bookkeeping, as compute_sense_vec also does,
# so the extracted vectors carry no gradient graph.
with torch.no_grad():
    output = model(**input_tensors)

In [26]:
# Sanity check: the ids at the aligned positions should all be the same id,
# i.e. every position points at the target character.
input_tensors["input_ids"][list(batch_indices), list(seq_indices)]

tensor([6493, 6493, 6493, 6493])

In [29]:
# Decode those ids back to text to confirm each aligned token really is the target.
tokenizer.decode(input_tensors["input_ids"][list(batch_indices), list(seq_indices)].numpy())

'聽 聽 聽 聽'

In [30]:
# Select, for each (sentence, token) pair, the vector at that position in the
# model's first output.
# NOTE(review): for DistilBertModel, output[0] should be the last hidden
# states — confirm against the installed transformers version.
ex_vecs = output[0][batch_indices, seq_indices]

In [31]:
# Average the selected target vectors along dim 0 to get a single vector.
ex_vecs.mean(dim=0)

tensor([ 3.5121e-02, -2.0182e-01,  2.7320e-01,  9.3134e-02,  6.4344e-01,
         5.0692e-01, -2.9535e-01, -6.3205e-02, -1.1674e-01,  5.9907e-01,
         6.3061e-03,  8.8041e-03,  4.8918e-01,  2.8246e-01,  3.9764e-01,
        -2.2272e-01,  7.7137e-01,  3.7720e-01,  3.1480e-01,  2.2762e-01,
        -3.8587e-01, -2.2390e-01,  3.6110e-01,  6.2099e-01, -1.2724e-01,
         2.7379e-01,  4.8247e-01,  4.3855e-01,  6.7397e-01,  1.5576e-01,
         2.8252e-01,  2.4510e-01, -4.5659e-01,  3.3841e-01,  1.9656e-01,
        -3.4483e-01, -2.1604e-01,  2.9457e-02, -1.6058e-01, -2.1672e-01,
        -6.8896e-01,  3.7901e-02,  1.7405e-01,  2.7533e-02,  3.0755e-01,
         3.8229e-02,  3.5489e-01,  6.4658e-01,  8.3770e-02, -7.5271e-01,
         3.8696e-01,  7.0880e-02, -4.9387e-02,  4.5327e-01,  4.0228e-01,
         7.2667e-01,  5.2078e-01, -7.9182e-01,  2.1326e-01, -4.7122e-01,
         1.2816e-01, -3.4977e-01, -3.6761e-02,  7.0867e-01, -1.6343e-01,
        -3.6185e-01,  1.7450e-01,  4.4182e-01,  4.0

In [34]:
import torch

In [35]:
def compute_sense_vec(examples, verbose=True):
    """Average the contextual vectors of the marked target across examples.

    Each example is a sentence whose target occurrences are wrapped in
    ``<...>``. The markers are stripped with ``find_target``, the cleaned
    sentences are encoded as one padded batch, and the model output at the
    token covering the start of every marked target is collected and averaged.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``find_target``.

    Args:
        examples: Iterable of example sentences with ``<...>`` target markers.
        verbose: When true (the default, matching the original behaviour),
            print the decoded target tokens as a visual sanity check.

    Returns:
        A 1-D tensor: the mean over all selected target-token vectors.

    Raises:
        ValueError: If ``examples`` is empty, if no example contains a marked
            target, or if a marked target cannot be aligned to a token.
    """
    parsed = [find_target(x) for x in examples]
    if not parsed:
        raise ValueError("compute_sense_vec needs at least one example sentence")
    txt_list, pos_list = zip(*parsed)

    # Longest sentence's character count plus two extra positions for the
    # special tokens the tokenizer adds around each sequence.
    input_tensors = tokenizer.batch_encode_plus(
        list(txt_list),
        max_length=max(len(x) for x in txt_list) + 2,
        pad_to_max_length=True,
        return_tensors="pt",
    )

    batch_indices = []
    seq_indices = []
    for i, xs in enumerate(pos_list):
        # find_target yields None/[] for a sentence with no marked target;
        # such a sentence simply contributes no vector.
        for x in xs or ():
            token_idx = input_tensors.char_to_token(i, x[0])
            if token_idx is None:
                # A None index would silently misbehave in tensor indexing.
                raise ValueError(
                    "cannot align target at char %d of example %d to a token"
                    % (x[0], i)
                )
            batch_indices.append(i)
            seq_indices.append(token_idx)
    if not batch_indices:
        raise ValueError("no <...>-marked target found in any example")

    # Inference only: no autograd graph is needed for the embeddings.
    with torch.no_grad():
        output = model(**input_tensors)

    if verbose:
        print(tokenizer.decode(input_tensors["input_ids"][batch_indices, seq_indices].numpy()))

    # output[0] is the model's first return value; index it at every
    # (sentence, token) pair and average over the selected vectors.
    ex_vecs = output[0][batch_indices, seq_indices]
    sense_vec = ex_vecs.mean(dim=0)
    return sense_vec

In [36]:
# Sense vector for the sense at index 0 of "聽"; the call also prints the
# decoded target tokens as a sanity check.
compute_sense_vec(senses[0].all_examples())

聽 聽 聽


tensor([-2.0181e-01, -2.8457e-01, -8.3441e-02,  6.7718e-02,  4.5001e-01,
         4.6768e-01, -2.4575e-01, -1.8114e-02, -1.3700e-01,  3.9972e-01,
         3.6378e-01, -5.6755e-01,  2.8658e-01,  4.5391e-01,  2.3958e-01,
         1.3022e-01,  6.3261e-01,  1.2759e-01, -2.5806e-02,  2.9122e-01,
        -1.5269e-01, -1.8227e-01, -4.7937e-02,  1.5334e-01,  8.2083e-02,
         3.1905e-01,  2.4602e-01,  1.0505e-01,  5.2361e-01,  2.8173e-01,
         6.0767e-01,  3.9651e-02, -4.7576e-01,  2.9525e-01,  1.1561e-01,
        -2.1258e-01, -3.2715e-01,  3.5489e-02, -4.7392e-01, -1.6459e-01,
        -6.9454e-01, -2.1962e-01,  9.7888e-02, -1.0657e-01, -3.8266e-02,
        -4.8274e-02,  2.0266e-01,  6.2094e-01, -2.4148e-02, -6.6488e-01,
         6.9578e-01,  3.1110e-01, -1.9590e-01,  2.9522e-02,  4.7166e-01,
         4.8055e-01,  3.5980e-01, -6.6389e-01, -4.4293e-02, -6.0941e-01,
        -1.0726e-01, -5.3521e-01,  1.0045e-01,  4.1892e-01, -3.5342e-01,
        -3.5790e-01,  9.7368e-02,  1.6934e-01,  2.2