In [1]:
!pip install -q transformers wandb

[K     |████████████████████████████████| 4.0 MB 28.8 MB/s 
[K     |████████████████████████████████| 1.8 MB 55.0 MB/s 
[K     |████████████████████████████████| 596 kB 65.5 MB/s 
[K     |████████████████████████████████| 6.6 MB 58.7 MB/s 
[K     |████████████████████████████████| 895 kB 33.0 MB/s 
[K     |████████████████████████████████| 77 kB 7.0 MB/s 
[K     |████████████████████████████████| 181 kB 56.1 MB/s 
[K     |████████████████████████████████| 144 kB 55.1 MB/s 
[K     |████████████████████████████████| 63 kB 1.3 MB/s 
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


In [2]:
!wget https://ai.tencent.com/ailab/nlp/en/data/tencent-ailab-embedding-zh-d100-v0.2.0-s.tar.gz

--2022-04-16 12:28:37--  https://ai.tencent.com/ailab/nlp/en/data/tencent-ailab-embedding-zh-d100-v0.2.0-s.tar.gz
Resolving ai.tencent.com (ai.tencent.com)... 116.128.164.87
Connecting to ai.tencent.com (ai.tencent.com)|116.128.164.87|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 799232206 (762M) [application/octet-stream]
Saving to: ‘tencent-ailab-embedding-zh-d100-v0.2.0-s.tar.gz’


2022-04-16 12:30:43 (6.10 MB/s) - ‘tencent-ailab-embedding-zh-d100-v0.2.0-s.tar.gz’ saved [799232206/799232206]



In [3]:
import numpy as np
from tqdm.auto import tqdm
import pickle
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, BertPreTrainedModel
N = 500000

## Preprocessing Embeddings

In [None]:

embs = np.zeros((N, 100))
vocabs = [""] * N

import gzip
import tarfile
with gzip.open("tencent-ailab-embedding-zh-d100-v0.2.0-s.tar.gz", "r") as gopen:
  tar = tarfile.open(fileobj=gopen)
  tar.next()
  tinfo = tar.next()
  fin = tar.extractfile(tinfo)  
  print(fin.readline())

  for i in tqdm(range(N)):
    ln = fin.readline().decode().strip()
    toks = ln.split(" ")
    vocabs[i] = toks[0]
    embs[i, :] = [float(x) for x in toks[1:]]
embs = embs / np.linalg.norm(embs, axis=1)[:, np.newaxis]

b'2000000 100\n'


  0%|          | 0/500000 [00:00<?, ?it/s]

In [None]:
# write used embeddings to pickle
# with open("drive/MyDrive/LangOn/morphert/tencent_small_500k.pkl", "wb") as fout:
#   pickle.dump((vocabs, embs), fout)

In [4]:
with open("drive/MyDrive/LangOn/morphert/tencent_small_500k.pkl", "rb") as fin:
  (vocabs, embs) = pickle.load(fin)

In [None]:
rng = np.random.RandomState(123)
random_split = np.arange(N)
rng.shuffle(random_split)
train_idxs = random_split[:int(N*.98)]
test_idxs = random_split[int(N*.98):]
print(train_idxs[:10])
print(test_idxs[:10])

[112430 338861 464653 344237 356227  79952 189456 391334 389913  18754]
[143723 366210 147495 155142 356245   4572 460379   8712 124139 309595]


In [None]:
vocabs[6558], embs[6558]

('说什么', array([ 0.11927852, -0.12477621, -0.00780427, -0.20220207, -0.12248462,
         0.09576039, -0.1549168 , -0.05301735,  0.02143643,  0.1200113 ,
        -0.06519454,  0.14092922,  0.12549583, -0.08167701, -0.17365649,
        -0.08256081,  0.03598054, -0.01726389, -0.00291723,  0.13104874,
        -0.03672575, -0.16984083, -0.11822981, -0.02188655, -0.0179612 ,
         0.09294079,  0.13191718,  0.20639436, -0.03133885,  0.02851779,
         0.02657138,  0.07973024, -0.01196658,  0.00117048,  0.03134397,
         0.16240258,  0.0738625 ,  0.0008838 , -0.00090208,  0.02884579,
        -0.05542522, -0.00448262, -0.07189087, -0.11853733, -0.07589301,
         0.15974716,  0.05274018,  0.1863489 ,  0.06927165,  0.01917007,
        -0.11809305, -0.11048221,  0.05883755, -0.01623858, -0.11332046,
        -0.06050496,  0.01738346,  0.15817337, -0.08151137, -0.04895926,
        -0.0893449 , -0.03123172,  0.14792064,  0.0461861 ,  0.0399505 ,
        -0.09982616,  0.11309339, -0.1126666

## Prepare Dataset

In [15]:

class MorphertDataset(Dataset):
  def __init__(self, idxs, vocabs, embs):
    assert max(idxs) < len(vocabs)
    assert max(idxs) < embs.shape[0]
    self.vocabs = vocabs
    self.embs = embs
    self.idxs = idxs

  def __len__(self):
    return len(self.idxs)

  def __getitem__(self, idx):
    idx = self.idxs[idx]
    return {
        "word": self.vocabs[idx],
        "vec": self.embs[idx, :],
    }

  def get_word(self, idx):
    return self.vocabs[self.idxs[idx]]

In [None]:
train_dataset = MorphertDataset(train_idxs, vocabs, embs)
test_dataset = MorphertDataset(test_idxs, vocabs, embs)

In [None]:
len(train_dataset), len(test_dataset)

(490000, 10000)

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

In [8]:
class DataCollator:
  def __init__(self, tokenizer, device=None):
    if not device:
      self.device = "cuda" if torch.cuda.is_available() else "cpu"
    else:
      self.device = device
    self.tokenizer = tokenizer

  def __call__(self, Xs):
    words = [x["word"] for x in Xs]
    vec = np.vstack([x["vec"] for x in Xs])
    vec = torch.tensor(vec, dtype=torch.float32).to(self.device)
    input_batch = self.tokenizer(words, return_tensors="pt", padding="longest")
    input_batch = input_batch.to(self.device)     
    return {
        **input_batch, "labels": vec, "words": words
    }

In [None]:
batch = DataCollator(tokenizer)([test_dataset[i] for i in range(5,10)])

In [None]:
batch["input_ids"], batch["labels"].shape

(tensor([[ 101, 5408,  860,  102,    0,    0,    0],
         [ 101, 7770, 5277, 5307, 3845,  102,    0],
         [ 101, 6822,  671, 3635, 2990, 1285,  102],
         [ 101,  679, 1398, 6235, 2428,  102,    0],
         [ 101, 1059, 4413, 4777, 1355,  102,    0]], device='cuda:0'),
 torch.Size([5, 100]))

In [6]:
import torch.nn as nn
from dataclasses import dataclass

@dataclass
class MorphertOutput:
  loss: float
  predictions: np.ndarray

class MorphertModel(BertPreTrainedModel):
  def __init__(self, config, *args, **kwargs):
    super().__init__(config, **kwargs)
    emb_dim = kwargs.get("emb_dim", 100)
    hdim = self.config.hidden_size
    self.bert = BertModel(config)
    self.proj = nn.Linear(hdim, emb_dim)
  
  def forward(self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs
    ):
        
    outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
    cls_vec = outputs.last_hidden_state[:, 0]
    pred_vec = self.proj(cls_vec)    

    if labels is not None:
      loss_fct = nn.MSELoss()
      loss = loss_fct(pred_vec, labels)
    else:
      loss = float("NaN")

    return MorphertOutput(loss, pred_vec)

## Model training

In [None]:
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mseantyh[0m (use `wandb login --relogin` to force relogin)


True

In [None]:
from tqdm.auto import tqdm
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup

In [None]:
lr = 1e-4
warmup_step = 100
batch_size = 16
nepoch = 1

In [None]:
wandb.init(project="morphert", config={"lr": lr, "warmup": warmup_step, "batch_size": batch_size, "nepoch": nepoch})




VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
loss,█▆▄▃▄▃▃▃▃▃▃▂▂▂▂▃▂▂▂▂▂▃▂▂▂▂▂▂▂▂▁▂▂▁▂▂▂▂▁▁

0,1
loss,0.0031


In [None]:
collator_fn = DataCollator(tokenizer)
train_loader = DataLoader(train_dataset, collate_fn=collator_fn, batch_size=batch_size, shuffle=True)

In [None]:
model = MorphertModel.from_pretrained("bert-base-chinese").to("cuda")
model.train()
optimizer = optim.AdamW(model.parameters(), lr=2e-4)
scheduler = get_linear_schedule_with_warmup(
              optimizer, 
              warmup_step, 
              len(train_dataset)//batch_size*nepoch)
loss_vec = []

for epoch_idx in range(nepoch):
  n_batch = len(train_dataset)//batch_size
  for batch_idx, batch_x in tqdm(enumerate(train_loader), total=n_batch):
    optimizer.zero_grad()    
    out = model(**batch_x)
    loss = out.loss
    loss.backward()
    optimizer.step()
    scheduler.step()
    if batch_idx % 20 == 0:
      wandb.log({"loss": loss.item()})    


Some weights of the model checkpoint at bert-base-chinese were not used when initializing MorphertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing MorphertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MorphertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MorphertModel were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['proj.bias', 'pro

  0%|          | 0/30625 [00:00<?, ?it/s]

In [None]:
base_dir = "/content/drive/My Drive/LangOn/morphert"
model.save_pretrained(base_dir + "/morphert_500k_b16")

In [None]:
import json
with open(base_dir + "/train_test_split_500k.json", "w") as fout:
  json.dump({"train": train_idxs.tolist(), "test": test_idxs.tolist()}, fout)

## Evaluation

In [9]:
base_dir = "/content/drive/My Drive/LangOn/morphert"
model = MorphertModel.from_pretrained(base_dir + "/morphert_500k")
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
collator_fn = DataCollator(tokenizer)
model = model.to("cuda")

In [10]:
import json

with open(base_dir + "/train_test_split_500k.json", "r") as fin:
  idxs_data = json.load(fin)
  train_idxs = idxs_data["train"]
  test_idxs = idxs_data["test"]

In [11]:
from scipy.spatial.distance import cdist
rng = np.random.RandomState(123)

In [12]:
def compute_metric(ref_words, pred_vecs, ref_emb, ds):
  dist_mat = cdist(pred_vecs, ref_emb)  
  pred_idxs = np.argmin(dist_mat, axis=1)
  pred_words = [ds.get_word(x) for x in pred_idxs]
  # print(*[(a,b) for a, b in zip(pred_words, ref_words)], sep="\n")
  return sum((a==b for a, b in zip(pred_words, ref_words)))

### Training set predictions

In [None]:
sub_idxs = np.arange(len(train_idxs))
rng.shuffle(sub_idxs)
sub_train = MorphertDataset(sub_idxs[:10000], vocabs, embs)
sub_train_emb = np.vstack([sub_train[i]["vec"] for i in range(len(sub_train))])

In [None]:
model.eval()
sub_train_loader = DataLoader(sub_train, collate_fn=collator_fn, batch_size=16, shuffle=True)
n_correct = 0
n_items = 0
with torch.no_grad():
  for batch_x in sub_train_loader:        
    out = model(**batch_x)
    ref_words = batch_x["words"]
    pred_vecs = out.predictions.cpu().numpy()
    n_correct_x = compute_metric(ref_words, pred_vecs, sub_train_emb, sub_train)
    n_correct += n_correct_x
    n_items += len(ref_words)
print("Train(subsample) Acc: {:.4f}".format(n_correct / n_items))

Train(subsample) Acc: 0.7062


### Test eval

In [None]:
test_dataset = MorphertDataset(test_idxs, vocabs, embs)
test_emb = np.vstack([test_dataset[i]["vec"] for i in range(len(test_dataset))])
test_loader = DataLoader(test_dataset, collate_fn=collator_fn, batch_size=16, shuffle=False)

In [None]:
model.eval()
n_correct = 0
n_items = 0
with torch.no_grad():
  for batch_x in tqdm(test_loader):        
    out = model(**batch_x)
    ref_words = batch_x["words"]
    pred_vecs = out.predictions.cpu().numpy()
    n_correct_x = compute_metric(ref_words, pred_vecs, test_emb, test_dataset)
    n_correct += n_correct_x
    n_items += len(ref_words)    
print("Test Acc: {:.4f}".format(n_correct / n_items))

  0%|          | 0/625 [00:00<?, ?it/s]

Test Acc: 0.6693


### First K item accuracies
* k = 500, .89
* k = 1000, .82
* k = 2000, .78

In [None]:
k = 2000
test1k_dataset = MorphertDataset(sorted(test_idxs)[:k], vocabs, embs)
test1k_emb = np.vstack([test1k_dataset[i]["vec"] for i in range(len(test1k_dataset))])
test1k_loader = DataLoader(test1k_dataset, collate_fn=collator_fn, batch_size=16, shuffle=False)

In [None]:
model.eval()
n_correct = 0
n_items = 0
with torch.no_grad():
  for batch_x in tqdm(test1k_loader):        
    out = model(**batch_x)
    ref_words = batch_x["words"]
    pred_vecs = out.predictions.cpu().numpy()
    n_correct_x = compute_metric(ref_words, pred_vecs, test1k_emb, test1k_dataset)
    n_correct += n_correct_x
    n_items += len(ref_words)    
print("Test1k Acc: {:.4f}".format(n_correct / n_items))

  0%|          | 0/125 [00:00<?, ?it/s]

Test1k Acc: 0.7795


## Prediction Exploration

In [16]:
full_ds = MorphertDataset(np.arange(N), vocabs, embs)
full_emb = np.vstack([full_ds[i]["vec"] for i in range(N)])

In [17]:
def predict_neighbors(texts):
  model.eval()
  in_batch = tokenizer(texts, padding=True, return_tensors="pt")
  in_batch = in_batch.to("cuda")
  with torch.no_grad():    
    out = model(**in_batch)  
    pred_vecs = out.predictions.cpu().numpy()
    dist_mat = cdist(pred_vecs, full_emb)  
    pred_idxs = np.argsort(dist_mat, axis=1)

    topk = 5
    for i, word in enumerate(texts):
      pred_x = pred_idxs[i, :]
      pred_words = [full_ds.get_word(x) for x in pred_x[:topk]]
      marker = "*" if word not in full_ds.vocabs else " "
      print(marker, word+":", " ".join(pred_words))

In [18]:
predict_neighbors(["政府", "電影", "鍵盤", "歡天喜地", "欢天喜地", "無三不成禮", "曾昱翔"])

  政府: 政府 政府部门 是政府 财政 部门
  電影: 電影 劇情 電視劇 紀錄片 喜劇
  鍵盤: 鍵盤 電腦 開關 機器 鍵
* 歡天喜地: 快樂 開心 歡樂 慶祝 聯
  欢天喜地: 欢天喜地 欢欢喜喜 喜气洋洋 高高兴兴 笑逐颜开
* 無三不成禮: 絕對 罷 禮 應 絕
* 曾昱翔: 吴秀波 杜淳 金世佳 王晓晨 曹骏


In [None]:
predict_neighbors(["元亨利貞", "見龍在田", "见龙在田", "亢龍有悔", "亢龙有悔"])

* 元亨利貞: 國王 親王 帝國 貴族 蘇丹
* 見龍在田: 復仇 誠 俠 劍 命運
* 见龙在田: 料事如神 痛打落水狗 胆小如鼠 贪生怕死 不识时务
* 亢龍有悔: 妇人之仁 置之死地而后生 冲冠一怒为红颜 愚忠 贪生怕死
  亢龙有悔: 冲冠一怒为红颜 妇人之仁 兔死狗烹 置之死地而后生 贪生怕死


In [None]:
predict_neighbors(["網美", "网美", "自組"])

* 網美: 寫真 攝影師 日韓 華麗 廣告
* 网美: 乐蜂网 芭莎 聚美 天猫 丸美
* 自組: 團隊 機器 單獨 自動 模擬


In [None]:
predict_neighbors(["那還用說", "最好是", "很可以"])

* 那還用說: 當然 話說 或許 不過 畢竟
  最好是: 最好 最好就是 最好是 好是 或者
* 很可以: 应该很好 也是可以 很好 很合适 很适合


In [None]:
predict_neighbors(["三言兩語", "三言两语", "絮聒", "靜諡", "四平八穩", "飛鴻雪泥"])

* 三言兩語: 說話 對話 罵 講 詞
  三言两语: 言语 言辞 言词 敷衍 自说自话
* 絮聒: 聒噪 嬉笑 笑语 哀怨 嘻笑
* 靜諡: 詔 親王 肅 長子 將軍
* 四平八穩: 穩 穩定 勢 靜 順利
* 飛鴻雪泥: 劍 猶 鷹 歸 詩


In [None]:
predict_neighbors(["傷肺", "傷荷包", "食詞", "虛詞", "虛化", "創化"])

* 傷肺: 嚴重 傷 伤胃 藥 臟
* 傷荷包: 傷 嚴重 遭罪 伤身 煩
* 食詞: 漢字 詞 語 發音 豬
* 虛詞: 詞 寫作 漢字 說法 說明
* 虛化: 模擬 變形 轉換 運用 轉變
* 創化: 開拓 創 進化 復興 創造


In [None]:
predict_neighbors(["幹勁", "耍廢", "學測", "熱鍋上的螞蟻", "循環悖論", "朋朋"])

* 幹勁: 熱情 干劲 进取心 冲劲 勁
* 耍廢: 鬧 罵 亂 變態 騙
* 學測: 數學 教學 課程 學術 學科
  心安理得: 心安理得 安安稳稳 安心地 心安 平平静静
* 熱鍋上的螞蟻: 悲劇 愛情 瘋 現實 變態
* 循環悖論: 理論 假設 錯誤 現象 觀點
* 朋朋: 友 友人 几位朋友 其他朋友 亲故


In [19]:
predict_neighbors(["奇獸", "文本探勘", "鯛民", "安靜"])

* 奇獸: 精靈 獸 惡魔 傳說 獵人
* 文本探勘: 资料收集 文献检索 数据挖掘 信息检索 课题研究
* 鯛民: 渔民 岛民 土著人 海怪 土人
* 安靜: 靜 孤獨 氣氛 溫柔 歡樂


In [None]:
word = "名古屋大學"
predict_neighbors([word[:i] for i in range(1, len(word)+1)])
predict_neighbors([word[-i:] for i in range(len(word)-1, 0, -1)])

  名: 名 个 名子 名字 位
* 名古: 奇珍 稀世 古老 三绝 八景
  名古屋: 大阪 福冈 名古屋 东京 日本东京
* 名古屋大: 日本东京 福冈 大阪 名古屋 新宿
* 名古屋大學: 國立 學院 教師 東京 大學
* 古屋大學: 國立 學院 教師 學生會 大學
* 屋大學: 學院 國立 倫敦 實驗室 英國
  大學: 大學 學校 學院 中學 小學
  學: 學習 學 讀 習 大學


In [None]:
word = "陳時中"
predict_neighbors([word[:i] for i in range(1, len(word)+1)])
predict_neighbors([word[-i:] for i in range(len(word)-1, 0, -1)])

  陳: 吳 劉 楊 鄧 鄭
* 陳時: 鄧 蕭 吳 鍾 偉
* 陳時中: 偉 當時 民進黨 國民黨 鄧
* 時中: 期間 當中 過程 並且 階段
  中: 中 中的 中有 当中 中都
