In [None]:
!git clone https://github.com/seantyh/morphert

Cloning into 'morphert'...
remote: Enumerating objects: 22, done.[K
remote: Counting objects: 100% (22/22), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 22 (delta 1), reused 22 (delta 1), pack-reused 0[K
Unpacking objects: 100% (22/22), done.


In [None]:
# Third-party dependencies used by this notebook. Versions are not pinned, so they
# resolve to whatever is current at install time.
!pip install -q --progress-bar off transformers umap-learn opencc hdbscan functorch
# gensim is upgraded separately: when this notebook was last run, the preinstalled
# gensim 3.6.0 was replaced by 4.2.0.
# NOTE(review): presumably required for `KeyedVectors.add_vectors` used later — confirm.
!pip install -U gensim

[?25l
[?25h[?25l
[?25h[?25l
[?25h[?25l
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[?25l
[?25h[?25l
[?25h[?25l
[?25h[?25l
[?25h[?25l
[?25h[?25l
[?25h  Building wheel for umap-learn (setup.py) ... [?25l[?25hdone
  Building wheel for pynndescent (setup.py) ... [?25l[?25hdone
  Building wheel for hdbscan (PEP 517) ... [?25l[?25hdone
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
Collecting gensim
  Downloading gensim-4.2.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 1.5 MB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.2.0


In [None]:
import sys
# Put the cloned repository's source directory on the import path so that
# `morphert` can be imported. The membership filter keeps re-runs of this cell
# from appending the same entry twice.
sys.path.extend([p for p in ("./morphert/src",) if p not in sys.path])

In [None]:
import pickle
from pathlib import Path
from itertools import groupby, combinations
from textwrap import wrap
from tqdm.auto import tqdm
from opencc import OpenCC

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist, squareform
from gensim.models import KeyedVectors
import umap
import hdbscan

import torch
from functorch import jacrev, jacfwd
from transformers import BertTokenizer, BertModel, BertPreTrainedModel
from morphert.model import *

In [None]:
# Number of entries indexed below (np.arange(N) / range(N)).
# NOTE(review): presumably the size of the 500k vocabulary pickle — confirm.
N = 500000
# Absolute Google Drive path. NOTE(review): no cell in view mounts Drive; this
# directory must already exist before the notebook is run.
base_dir = Path("/content/drive/MyDrive/LangOn/morphert")
# OpenCC converter built with the "t2s" config (presumably Traditional -> Simplified).
# NOTE(review): not used by any cell visible in this part of the notebook.
t2s = OpenCC("t2s").convert
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that you created yourself or otherwise trust.
with open(base_dir/"tencent_small_500k.pkl", "rb") as fin:
    (vocabs, embs) = pickle.load(fin)   
# Wrap the loaded vocabulary/vectors in a gensim KeyedVectors; 100 is the vector
# size passed to the constructor.
kv = KeyedVectors(100)
kv.add_vectors(vocabs, embs)
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
model = MorphertModel.from_pretrained(base_dir/"morphert_500k")
collator_fn = DataCollator(tokenizer)
# Requires a CUDA device; later cells also move their inputs to "cuda".
model = model.to("cuda")
full_ds = MorphertDataset(np.arange(N), vocabs, embs)
# Stack the "vec" field of the first N dataset items into one 2-D array.
full_emb = np.vstack([full_ds[i]["vec"] for i in range(N)])
# Membership test against the dataset's vocabulary (full_ds.vocabs).
in_tencent = lambda x: x in full_ds.vocabs

Downloading:   0%|          | 0.00/107k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/624 [00:00<?, ?B/s]

In [None]:
tokenizer("電腦")

{'input_ids': [101, 7442, 5582, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [None]:
# Sanity check: feeding the word-embedding lookups through `inputs_embeds` must give
# the same predictions as feeding the token ids directly. The hard-coded ids are the
# tokenizer output for "電腦" displayed by the previous cell.
in_embeds = model.bert.embeddings.word_embeddings(torch.tensor([[101, 7442, 5582, 102]]).to("cuda"))
out_embeds = model(inputs_embeds=in_embeds)
out_tokens = model(**tokenizer("電腦", return_tensors="pt").to("cuda"))
torch.allclose(out_embeds.predictions, out_tokens.predictions)

True

torch.Size([4, 100])

In [None]:
def compute_token_jacobian_functorch(tgt_word, model, tokenizer):
    """Jacobian of the model's predictions w.r.t. the input token embeddings of one word.

    Args:
        tgt_word: the word (string) to analyse; it is tokenized as a batch of one.
        model: module exposing `bert.embeddings.word_embeddings`, accepting
            `inputs_embeds=...`, and returning an object with a `.predictions` tensor.
        tokenizer: callable that, given a list of words and `return_tensors="pt"`,
            returns a batch with `.input_ids` that can be moved with `.to(device)`.

    Returns:
        The tensor produced by `jacrev`; its shape is the shape of `predictions`
        followed by the shape of the input embeddings.
    """
    # Follow the device the model actually lives on instead of hard-coding "cuda",
    # so the function also works for a model kept on CPU or on another GPU.
    device = next(model.parameters()).device
    tokens = tokenizer([tgt_word], return_tensors="pt").to(device)
    in_embeds = model.bert.embeddings.word_embeddings(tokens.input_ids)

    def partial_effect(x):
        # Forward pass expressed as a function of the input embeddings only.
        return model(inputs_embeds=x).predictions

    return jacrev(partial_effect, argnums=0)(in_embeds)

## Load char-noise dataset

In [None]:
import json
# The dataset holds Chinese text and JSON is UTF-8 by specification, so decode
# explicitly instead of relying on the platform's default locale encoding.
with open(base_dir / "char_senses_with_noises.json", "r", encoding="utf-8") as fin:
  char_dataset = json.load(fin)

In [None]:
item_x = char_dataset[0]

In [None]:
item_x

{'noise_words': ['排汗',
  '排除',
  '排擠',
  '排泄',
  '排序',
  '排行',
  '綵排',
  '排成',
  '安排',
  '排氣',
  '排球',
  '肋排',
  '排骨',
  '牛排',
  '排演',
  '排名',
  '豬排',
  '排入',
  '排便',
  '排乾',
  '排水',
  '排擋',
  '排列',
  '排版',
  '排污',
  '排卵',
  '排隊',
  '排斥'],
 'src_char': ['排'],
 'src_sense': ['08060701'],
 'syno_senses': ['08037501', '09297601'],
 'syno_words': ['排放', '排出']}

In [None]:
# Initialize the Jacobian buffer: a dict keyed by word that caches each word's
# Jacobian so it is computed once and reused across dataset items.
J_buf = {}

In [None]:
# Number of dataset items that have more synonym words than noise words,
# i.e. too few noise words to draw a same-sized sample from.
sum(1 for entry in char_dataset if len(entry["noise_words"]) < len(entry["syno_words"]))

20

### Compute Jacobians for all targets and noise samples in the dataset

In [None]:
def mark_target(x, pos):
  """Return `x` with the element at index `pos` wrapped in angle brackets.

  Negative `pos` counts from the end; an out-of-range `pos` raises IndexError.
  """
  wrapped = f"<{x[pos]}>"
  hit = pos % len(x)  # fold a negative index onto its non-negative equivalent
  return "".join(wrapped if i == hit else ch for i, ch in enumerate(x))

def compute_pairwise_distances(tgt_char, words, Js, counter_position=False):
  """L1 distance between Jacobian slices for every unordered pair of words.

  For each word the position of `tgt_char` is found with `index`. With
  `counter_position=True` the slot `1 - position` is compared instead, which is
  the other character only when the word has two characters.

  Args:
    tgt_char: character located in each word.
    words: words to pair up, in `itertools.combinations` order.
    Js: mapping from word to an array indexed as `[:, position, :]`.
    counter_position: compare the `1 - position` slot instead of the target slot.

  Returns:
    List of `(marked_a, marked_b, l1)` tuples; the marked strings wrap the compared
    character in angle brackets and `l1` is the summed absolute difference.
  """
  def _slot(word):
    found_at = word.index(tgt_char)
    return 1 - found_at if counter_position else found_at

  records = []
  for left, right in combinations(words, 2):
    slot_left = _slot(left)
    slot_right = _slot(right)
    gap = Js[left][:, slot_left, :] - Js[right][:, slot_right, :]
    l1 = np.abs(gap).sum()
    records.append((mark_target(left, slot_left), mark_target(right, slot_right), l1))
  return records

def compute_pairwise_emb_distances(words, embs):
  """Cosine similarity between the embeddings of every unordered pair of words.

  Args:
    words: words to pair up, in `itertools.combinations` order.
    embs: mapping-like object supporting `word in embs` and `embs[word]`
      (for example a dict of vectors or a gensim KeyedVectors).

  Returns:
    List of `(word_a, word_b, cosine_similarity)` tuples. Pairs in which either
    word is missing from `embs` are skipped. Despite the function name the third
    field is a similarity: larger means closer.
  """
  dists = []
  for a, b in combinations(words, 2):
    if not (a in embs and b in embs): continue
    vec_a = np.asarray(embs[a])
    vec_b = np.asarray(embs[b])
    # Plain NumPy cosine: dot product over the product of the norms. This keeps the
    # function self-contained instead of reaching for a notebook-global model object
    # just to borrow its cosine helper.
    cossim = np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))
    dists.append((a, b, cossim))
  return dists

In [None]:

import random
# Fixed seed so the noise-word samples drawn below are the same on every run.
rng = random.Random(123)

# syno_stats: one dict of pairwise statistics per processed dataset item.
# pred_embs: word -> predicted embedding; accumulates across all items.
syno_stats = []
pred_embs = {}
for item_x in tqdm(char_dataset):
  # Skip items with fewer noise words than synonym words: rng.sample below needs
  # a population at least as large as k.
  if len(item_x["syno_words"]) > len(item_x["noise_words"]):
    continue
  targets = item_x["syno_words"]
  # Draw as many noise words as there are synonym words.
  noises = rng.sample(item_x["noise_words"], k=len(targets))
  words = targets + noises
  for word in words:
    # Jacobians are cached in J_buf, so each word is computed at most once.
    if word in J_buf: continue
    J = compute_token_jacobian_functorch(word, model, tokenizer).squeeze().detach().cpu().numpy()
    # Keep token positions 1 and 2 (slice 1:3), dropping position 0; for a
    # two-character word these are presumably the two character tokens after [CLS].
    J_buf[word] = J[:,1:3,:]
  
  ## predicted word embeddings
  # NOTE(review): the tokenizer is called without padding, so this presumably relies
  # on every word in the batch tokenizing to the same length — confirm for the dataset.
  with torch.no_grad():    
    preds = model(**tokenizer(words, return_tensors="pt").to("cuda")).predictions.cpu().numpy()
  pred_embs.update({w: preds[i,:] for i, w in enumerate(words)})

  src_char = item_x["src_char"][0]
  # Jacobian-based distances: "target" compares the slot holding src_char,
  # "counter" compares the other slot (counter_position=True).
  syno_target_dists = compute_pairwise_distances(src_char, targets, J_buf, False)
  syno_counter_dists = compute_pairwise_distances(src_char, targets, J_buf, True)
  noise_target_dists = compute_pairwise_distances(src_char, noises, J_buf, False)
  noise_counter_dists = compute_pairwise_distances(src_char, noises, J_buf, True)
  # Embedding-based similarities: "pred" uses the model's predicted vectors,
  # "tenc" uses the vectors stored in kv (loaded from the Tencent pickle).
  syno_pred_dists = compute_pairwise_emb_distances(targets, pred_embs)
  syno_tenc_dists = compute_pairwise_emb_distances(targets, kv)
  noise_pred_dists = compute_pairwise_emb_distances(noises, pred_embs)
  noise_tenc_dists = compute_pairwise_emb_distances(noises, kv)
  syno_item = {
      "src_char": src_char,
      "syno_targets": syno_target_dists,
      "syno_counters": syno_counter_dists,
      "noise_targets": noise_target_dists,
      "noise_counters": noise_counter_dists,
      "syno_pred": syno_pred_dists,
      "syno_tenc": syno_tenc_dists,
      "noise_pred": noise_pred_dists,
      "noise_tenc": noise_tenc_dists,      
  }
  syno_stats.append(syno_item)

  0%|          | 0/436 [00:00<?, ?it/s]

In [None]:
## cache J_buf
# Persist the word -> Jacobian cache to disk so it does not have to be recomputed.
# NOTE(review): no cell in view reads this file back.
with open(base_dir / "syno_contrast_Jbuf.pkl", "wb") as fout:
  pickle.dump(J_buf, fout)

In [None]:
# Persist the per-item pairwise statistics (the list of dicts in syno_stats).
with open(base_dir / "syno_contrast_stats.pkl", "wb") as fout:
  pickle.dump(syno_stats, fout)

In [None]:
J_buf["排泄"].shape

(100, 2, 768)

In [None]:
syno_stats[0]

{'noise_counters': [('排<除>', '<安>排', 491.18207)],
 'noise_pred': [('排除', '安排', 0.5637611)],
 'noise_targets': [('<排>除', '安<排>', 518.7595)],
 'noise_tenc': [('排除', '安排', 0.38079652)],
 'src_char': '排',
 'syno_counters': [('排<放>', '排<出>', 459.33392)],
 'syno_pred': [('排放', '排出', 0.65291655)],
 'syno_targets': [('<排>放', '<排>出', 595.8497)],
 'syno_tenc': [('排放', '排出', 0.63070977)]}

In [None]:
def _mean_third(records):
  """Mean of the third field of each record; NaN when there are no records."""
  values = [rec[2] for rec in records]
  # np.mean([]) would also give NaN but emits RuntimeWarnings; an item with no
  # word pairs is expected here, so return NaN for it explicitly and quietly.
  return np.mean(values) if values else np.nan

# (output column, key in each syno_stats item), in the column order of syno_frame.
SYNO_FRAME_FIELDS = [
    ("syno_target", "syno_targets"),
    ("syno_counter", "syno_counters"),
    ("syno_pred", "syno_pred"),
    ("syno_tenc", "syno_tenc"),
    ("noise_target", "noise_targets"),
    ("noise_counter", "noise_counters"),
    ("noise_pred", "noise_pred"),
    ("noise_tenc", "noise_tenc"),
]

# One row per item: the mean pairwise value for each statistic.
syno_frame_data = []
for item in syno_stats:
  syno_frame_data.append(tuple(_mean_third(item[key]) for _, key in SYNO_FRAME_FIELDS))
syno_frame = pd.DataFrame.from_records(
                          syno_frame_data,
                          columns=[column for column, _ in SYNO_FRAME_FIELDS])

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [None]:
# Column-wise mean over items; pandas skips NaN entries by default.
syno_frame.mean()

syno_target      440.105896
syno_counter     448.861206
syno_pred          0.769539
syno_tenc          0.684650
noise_target     550.764709
noise_counter    603.096252
noise_pred         0.604798
noise_tenc         0.436201
dtype: float64

In [None]:
# Standard error of the mean per column. `DataFrame.sem` divides by the number of
# non-NaN values in each column; dividing the NaN-skipping std by the total row
# count would understate the error for columns that contain NaN (items that
# produced no word pair for that statistic).
syno_frame.sem()

syno_target      7.709994
syno_counter     9.200991
syno_pred        0.009241
syno_tenc        0.009530
noise_target     7.152592
noise_counter    9.438117
noise_pred       0.011030
noise_tenc       0.009513
dtype: float64

In [None]:
# Mean per-item difference between the target-slot and counter-slot distances
# for synonym pairs (negative means the target-slot distance is smaller).
(syno_frame.syno_target - syno_frame.syno_counter).mean()

-8.75553035736084

In [None]:
# Standard error of the paired difference. `Series.sem` uses the number of non-NaN
# differences as the sample size rather than the total number of rows.
(syno_frame.syno_target - syno_frame.syno_counter).sem()

6.258009358996329