In [1]:
import sys
# Add ../src (relative to the notebook's working directory) to the import path.
# NOTE(review): presumably this is where the `morphert` package imported below lives — confirm.
sys.path.append("../src")

In [2]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from morphert.model import *
from tqdm.auto import tqdm
from opencc import OpenCC

In [3]:
# Bind the `convert` method of an OpenCC instance created with the 't2s' config,
# so that t2s(text) returns the converted string (e.g. t2s("電") gives '电' below).
t2s = OpenCC('t2s').convert

In [4]:
# Load a saved NumPy array from the data directory.
# NOTE(review): `hs` is not referenced by any later cell visible in this notebook —
# confirm whether it is still needed.
hs = np.load("../data/ham_seq_768_1k.npy")

In [5]:
# Read the first sheet of the MELD-SCH workbook.
data = pd.read_excel("../data/Tsang-2018-MELD-SCH.xlsx", sheet_name=0)

# Keep only the items whose `length` column equals 2.
bisyll_items = data.loc[data["length"] == 2, :]

# From those, keep the rows with lexicality == 1 and only the columns used later.
# The mask is built from `bisyll_items` itself (not from the parent `data` frame),
# so the filter does not depend on pandas aligning a longer mask by index.
bisyll_wd = bisyll_items.loc[
    bisyll_items["lexicality"] == 1,
    ["id", "word", "lexicality", "N", "RT", "RTSD", "zRT", "zRTSD", "ERR"],
]
bisyll_wd.head()

Unnamed: 0,id,word,lexicality,N,RT,RTSD,zRT,zRTSD,ERR
2040,200001,一切,1,40,610.62025,104.615792,-0.744272,0.294925,0.0
2041,200002,一共,1,40,741.71425,248.347155,-0.309554,0.739127,4.761905
2042,200003,一律,1,39,736.614615,147.96963,-0.409607,0.445567,7.142857
2043,200004,一样,1,40,700.4495,296.422704,-0.484063,0.820917,0.0
2044,200005,一般,1,40,618.66625,139.388386,-0.691929,0.478496,0.0


In [6]:
# (rows, columns) of the filtered frame: rows with length == 2 and lexicality == 1,
# restricted to the nine columns selected above.
bisyll_wd.shape

(10022, 9)

In [7]:
# Unpickle the two-element payload stored in the Tencent 500k file and unpack it
# into `vocabs` and `embs`.
# NOTE(review): pickle.load can execute arbitrary code — only use with a trusted file.
base_dir = "../data"
with open(base_dir + "/tencent_small_500k.pkl", mode="rb") as fin:
    vocabs, embs = pickle.load(fin)

In [9]:
import torch
from transformers import BertTokenizer, BertModel, BertPreTrainedModel

In [10]:
# Number of entries to index in the dataset below.
N = 500000

# `base_dir`, `vocabs` and `embs` come from the earlier cell that unpickles
# tencent_small_500k.pkl; the file is not read a second time here.
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
model = MorphertModel.from_pretrained(base_dir + "/morphert_500k")
collator_fn = DataCollator(tokenizer)
model = model.to("cuda")

# Wrap the first N entries in a MorphertDataset and stack every item's "vec"
# field into one 2-D array (one row per item).
full_ds = MorphertDataset(np.arange(N), vocabs, embs)
full_emb = np.vstack([full_ds[i]["vec"] for i in range(N)])


def in_tencent(x):
    """Return True when `x` is contained in the dataset's `vocabs`."""
    return x in full_ds.vocabs

In [11]:
# Inspect the encoding of a two-character word. The recorded output has four
# input ids (101, 7442, 5582, 102), i.e. one extra id on each side of the word.
tokenizer("電腦")

{'input_ids': [101, 7442, 5582, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [12]:
# Look up the input word embeddings for the four ids the tokenizer returned for
# "電腦" in the previous cell (hard-coded here), as a batch of one on the GPU.
in_embeds = model.bert.embeddings.word_embeddings(torch.tensor([[101, 7442, 5582, 102]]).to("cuda"))

In [13]:
# One batch row, four token positions, and the embedding width (768 in the recorded output).
in_embeds.shape

torch.Size([1, 4, 768])

In [14]:
# Sanity check: feeding pre-computed word embeddings through `inputs_embeds`
# must give the same predictions as feeding the token ids for the same word.
# This is inference only, so run it under no_grad to avoid building and
# retaining autograd graphs on the GPU.
with torch.no_grad():
    out_embeds = model(inputs_embeds=in_embeds)
    out_tokens = model(**tokenizer("電腦", return_tensors="pt").to("cuda"))
torch.allclose(out_embeds.predictions, out_tokens.predictions)

True

In [15]:
from torch.autograd.functional import jacobian
def compute_token_jacobian(tgt_word, tgt_loc, model, tokenizer):
    """Jacobian of the model's predictions w.r.t. one token's input embedding.

    Parameters
    ----------
    tgt_word : str
        Word to analyse; it is passed to ``tokenizer`` as a one-item batch.
    tgt_loc : int
        Zero-based position of the target token inside the word, i.e. not
        counting the leading special token added by the tokenizer.
    model
        Model exposing ``model.bert.embeddings.word_embeddings``, accepting an
        ``inputs_embeds`` keyword, and returning an object with a
        ``predictions`` attribute.
    tokenizer
        Callable returning an object that has ``input_ids`` and ``.to(device)``.

    Returns
    -------
    torch.Tensor
        The result of ``torch.autograd.functional.jacobian``; its shape is the
        shape of ``predictions`` followed by ``(1, embedding_width)``.

    Raises
    ------
    IndexError
        If ``tgt_loc`` does not address a token between the first and last
        positions of the encoded sequence.
    """
    # Run on whatever device the model lives on instead of assuming a GPU.
    device = next(model.parameters()).device
    tokens = tokenizer([tgt_word], return_tensors="pt").to(device)
    input_ids = tokens.input_ids

    # Shift by one to skip the leading [CLS] token. The check below assumes the
    # tokenizer also appends one trailing special token, as the BERT tokenizer
    # used in this notebook does.
    pos = tgt_loc + 1
    n_tokens = input_ids.shape[1]
    if not 1 <= pos <= n_tokens - 2:
        raise IndexError(
            f"tgt_loc={tgt_loc} is out of range for {tgt_word!r} "
            f"({n_tokens - 2} non-special tokens)"
        )

    # The base embeddings are constants of the differentiated function, so
    # they do not need an autograd graph of their own.
    with torch.no_grad():
        base_embeds = model.bert.embeddings.word_embeddings(input_ids)

    def partial_effect(x):
        # Build a fresh input in which only position `pos` is replaced by `x`.
        # Nothing is written in place, so the tensor handed to `jacobian`
        # never aliases the tensor being modified.
        spliced = torch.cat(
            (base_embeds[:, :pos, :], x.unsqueeze(1), base_embeds[:, pos + 1:, :]),
            dim=1,
        )
        return model(inputs_embeds=spliced).predictions

    return jacobian(partial_effect, base_embeds[:, pos, :])

In [16]:
# Check the converter on a single character: "電" comes back as '电'.
# Both forms are matched when the word sets are built in the next cell.
t2s("電")

'电'

In [17]:
# Two-character vocabulary entries with 電/电 as the first character (c1_set)
# or as the second character (c2_set). Both are lists, in vocabulary order.
c1_set = list(filter(lambda w: len(w) == 2 and w[0] in "電电", vocabs))
c2_set = list(filter(lambda w: len(w) == 2 and w[1] in "電电", vocabs))

In [18]:
# Number of two-character vocabulary entries whose first character is 電 or 电.
len(c1_set)

157

In [19]:
# One Jacobian per word in c1_set, taken with respect to the first character
# (tgt_loc=0), squeezed and moved to host memory as a NumPy array.
Js = [
    compute_token_jacobian(w, 0, model, tokenizer).squeeze().cpu().numpy()
    for w in tqdm(c1_set)
]

  0%|          | 0/157 [00:00<?, ?it/s]

In [20]:
from scipy.spatial.distance import pdist, squareform
from itertools import combinations

# Pairwise L1 (cityblock) distance between the Jacobians: sum of absolute
# element-wise differences. The result is a symmetric matrix with a zero diagonal.
if len(Js) < 2:
    # Zero or one Jacobian: there are no pairs, so the matrix is all zeros.
    distJ = np.zeros((len(Js), len(Js)))
else:
    # Flatten each Jacobian to one row and let SciPy compute all pairs at once
    # instead of looping over the pairs in Python.
    flatJ = np.stack([J.ravel() for J in Js])
    distJ = squareform(pdist(flatJ, metric="cityblock"))

In [21]:
# Square matrix with one row and one column per Jacobian in Js.
distJ.shape

(157, 157)

In [22]:
import umap

# distJ already holds pairwise distances, so UMAP must be told to treat its
# input as a precomputed distance matrix. The metric is a constructor option
# of UMAP, not an argument of fit_transform.
# NOTE(review): the recorded ImportError ("Numba needs NumPy 1.21 or less") is
# an environment version mismatch raised while importing umap; it has to be
# fixed in the environment, not in this cell.
umap_inst = umap.UMAP(metric="precomputed")
proj = umap_inst.fit_transform(distJ)

ImportError: Numba needs NumPy 1.21 or less

In [None]:
# Scatter the first two columns of the projection against each other, one dot per word.
plt.plot(*proj[:, :2].T, '.')