In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# (except those excluded with %aimport) before each cell runs, so edits to
# imported project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
# Put ../src on the import path; presumably this is where the local `morphert`
# package imported in the next cell lives — TODO confirm.
sys.path.append("../src")

In [3]:
import pickle
import numpy as np
from tqdm.auto import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import torch
from transformers import BertTokenizer, BertModel, BertPreTrainedModel
from opencc import OpenCC
from morphert.model import *
from morphert.utils import compute_metric
from morphert import (get_predict_neighbors_fn, 
                      get_predict_vectors_fn, 
                      get_print_neighbors_fn,
                      get_predict_from_token_fn)
from morphert import compute_space_indices

# OpenCC converter created from the "t2s" configuration name.
t2s = OpenCC("t2s")
# Number of items used below to index the dataset (np.arange(N)) and to stack
# the embedding matrix (range(N)).
N = 500_000

In [4]:
base_dir = "../data"

# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only load this file if it comes from a trusted source.
with open(base_dir + "/tencent_small_500k.pkl", "rb") as fin:
    (vocabs, embs) = pickle.load(fin)

tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
model = MorphertModel.from_pretrained(base_dir + "/morphert_500k")
collator_fn = DataCollator(tokenizer)

# Use the GPU when one is available and fall back to CPU otherwise, so the
# notebook does not crash on machines without CUDA.
# NOTE(review): the morphert predict helpers may assume CUDA internally —
# confirm before relying on the CPU path.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# Dataset over the first N indices, and the matrix obtained by stacking the
# "vec" entry of every dataset item.
full_ds = MorphertDataset(np.arange(N), vocabs, embs)
full_emb = np.vstack([full_ds[i]["vec"] for i in range(N)])


def in_tencent(x):
    """Return True when `x` is contained in `full_ds.vocabs`."""
    return x in full_ds.vocabs

In [5]:
# Build the prediction helpers from the tokenizer and model loaded above.
# The two neighbor helpers additionally receive the stacked embedding matrix
# (full_emb) and the dataset (full_ds); presumably these define the space in
# which neighbors are looked up — TODO confirm against morphert.
predict_neighbors = get_predict_neighbors_fn(tokenizer, model, full_emb, full_ds)
print_neighbors = get_print_neighbors_fn(tokenizer, model, full_emb, full_ds)
predict_vectors = get_predict_vectors_fn(tokenizer, model)
predict_from_token = get_predict_from_token_fn(tokenizer, model)

In [6]:
# Read the first worksheet (sheet_name=0) of the MELD-SCH workbook into a DataFrame.
data = pd.read_excel("../data/Tsang-2018-MELD-SCH.xlsx", sheet_name=0)

In [50]:
# Keep only the two-character items.
bisyll_items = data.loc[data["length"] == 2, :]

# Rows with lexicality == 2 (the nonwords, judging by the `_nw` suffix).
# The mask is built from bisyll_items itself rather than from `data`, so it no
# longer relies on pandas aligning a longer boolean Series by index.
# .copy() makes bisyll_nw an independent frame, so the columns added in later
# cells do not trigger SettingWithCopyWarning.
nw_columns = ["id", "word", "lexicality", "N", "RT", "RTSD", "zRT", "zRTSD", "ERR"]
bisyll_nw = bisyll_items.loc[bisyll_items["lexicality"] == 2, nw_columns].copy()
bisyll_nw.head()

Unnamed: 0,id,word,lexicality,N,RT,RTSD,zRT,zRTSD,ERR
12062,210023,一讳,2,37,880.347838,340.828969,0.0448,0.926231,7.5
12063,210024,一卖,2,38,907.074474,362.495826,0.252914,1.068396,9.52381
12064,210025,一阶,2,12,1012.0725,463.383231,0.582634,1.604937,71.428571
12065,210026,一视,2,29,904.806207,283.251551,0.254147,0.871846,27.5
12066,210027,一作,2,35,937.799429,362.058181,0.297648,1.103455,12.5


In [54]:
# Collect per-character measures from the items with lexicality == 1.
# The first occurrence of a character wins; later occurrences are ignored.
char_indices = {}
for row in tqdm(bisyll_items.itertuples()):
    if row.lexicality == 1:
        # Positions are 1-based to match the C1*/C2* column names.
        for pos, char in enumerate(row.word, start=1):
            if char not in char_indices:
                prefix = f"C{pos}"
                char_indices[char] = {
                    "stroke": getattr(row, prefix + "stroke"),
                    "nwf": getattr(row, prefix + "nwf"),
                    # Stored as the natural log of the C{pos}cf column.
                    "logcf": np.log(getattr(row, prefix + "cf")),
                    "nom": getattr(row, prefix + "nom"),
                    "nop": getattr(row, prefix + "nop"),
                }
char_indices["我"]

0it [00:00, ?it/s]

{'stroke': 7.0,
 'nwf': 44.0,
 'logcf': 14.537721272509618,
 'nom': 1.0,
 'nop': 1.0}

In [52]:
# Attach the per-character measures to bisyll_nw as C1<measure>/C2<measure>
# columns. Characters missing from char_indices receive 0.
for measure in ("stroke", "nwf", "logcf", "nom", "nop"):
    for pos in (1, 2):
        # One-character slice of each word at this (1-based) position.
        chars_at_pos = bisyll_nw.word.str.slice(pos - 1, pos)
        values = []
        for char in chars_at_pos:
            values.append(char_indices.get(char, {}).get(measure, 0))
        bisyll_nw[f"C{pos}{measure}"] = values

In [53]:
# Preview the first rows, now including the C1*/C2* per-character columns.
bisyll_nw.head()

Unnamed: 0,id,word,lexicality,N,RT,RTSD,zRT,zRTSD,ERR,C1stroke,C2stroke,C1nwf,C2nwf,C1logcf,C2logcf,C1nom,C2nom,C1nop,C2nop
12062,210023,一讳,2,37,880.347838,340.828969,0.0448,0.926231,7.5,1.0,6.0,801.0,7.0,13.311422,4.49981,11.0,2.0,1.0,1.0
12063,210024,一卖,2,38,907.074474,362.495826,0.252914,1.068396,9.52381,1.0,8.0,801.0,54.0,13.311422,9.305014,11.0,4.0,1.0,1.0
12064,210025,一阶,2,12,1012.0725,463.383231,0.582634,1.604937,71.428571,1.0,6.0,801.0,21.0,13.311422,7.336286,11.0,5.0,1.0,1.0
12065,210026,一视,2,29,904.806207,283.251551,0.254147,0.871846,27.5,1.0,8.0,801.0,91.0,13.311422,9.653808,11.0,6.0,1.0,1.0
12066,210027,一作,2,35,937.799429,362.058181,0.297648,1.103455,12.5,1.0,7.0,801.0,185.0,13.311422,11.129731,11.0,10.0,1.0,2.0


## Add morphert indices

In [69]:
from itertools import islice
def batch_fn(it, bsize):
    """Split an iterable into consecutive lists of at most `bsize` items.

    Parameters
    ----------
    it : iterable
        Source of items. Both iterators and re-iterable containers
        (lists, ranges, ...) are accepted.
    bsize : int
        Maximum number of items per batch. With ``bsize == 0`` no batches
        are produced.

    Returns
    -------
    iterator of list
        Yields lists of length `bsize`; the final list may be shorter.
        Iteration stops at the first empty batch.
    """
    # islice must draw from one shared iterator. Calling islice on a list
    # directly would restart from the beginning every time and never stop.
    source = iter(it)
    return iter(lambda: list(islice(source, bsize)), [])

In [70]:
# Sanity check: ten items in batches of three; the last batch holds the remainder.
list(batch_fn(iter(range(10)), 3))

[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]

In [87]:
# Predict the top-50 neighbors for every word in bisyll_nw, batch by batch,
# and store them in predn_map keyed by word.
batch_size = 20
n_words = bisyll_nw.shape[0]
# Ceiling division so the progress bar also counts the final, partial batch
# (floor division reported one batch too few whenever n_words % batch_size != 0).
n_batches = (n_words + batch_size - 1) // batch_size

predn_map = {}
word_it = iter(bisyll_nw["word"])
batch_it = batch_fn(word_it, batch_size)
for batch in tqdm(batch_it, total=n_batches):
    preds = predict_neighbors(batch, topk=50)
    predn_map.update(zip(batch, preds))

  0%|          | 0/501 [00:00<?, ?it/s]

In [88]:
# import pickle
# with open("../data/pred_neighbors_bisyll_nw_MELD_SCH.pkl", "wb") as fout:
#     pickle.dump(predn_map, fout)

In [90]:
spidx_list = []

# One compute_space_indices() call per row, fed with the neighbors stored in
# predn_map for that row's word.
space_indices = [
    compute_space_indices(predn_map.get(row.word))
    for row in tqdm(bisyll_nw.itertuples(), total=bisyll_nw.shape[0])
]

# Pull the three distance attributes out into parallel lists.
dist_mtop5 = [entry.dist_mtop5 for entry in space_indices]
dist_range = [entry.dist_range for entry in space_indices]
dist_mean = [entry.dist_mean for entry in space_indices]

# Same column order as before: dist_mtop5, dist_range, dist_mean.
for column, values in (
    ("dist_mtop5", dist_mtop5),
    ("dist_range", dist_range),
    ("dist_mean", dist_mean),
):
    bisyll_nw[column] = values

  0%|          | 0/10022 [00:00<?, ?it/s]

In [91]:
# Preview the first rows, now including the dist_mtop5 / dist_range / dist_mean columns.
bisyll_nw.head()

Unnamed: 0,id,word,lexicality,N,RT,RTSD,zRT,zRTSD,ERR,C1stroke,...,C2nwf,C1logcf,C2logcf,C1nom,C2nom,C1nop,C2nop,dist_mtop5,dist_range,dist_mean
12062,210023,一讳,2,37,880.347838,340.828969,0.0448,0.926231,7.5,1.0,...,7.0,13.311422,4.49981,11.0,2.0,1.0,1.0,0.798487,0.031312,0.77227
12063,210024,一卖,2,38,907.074474,362.495826,0.252914,1.068396,9.52381,1.0,...,54.0,13.311422,9.305014,11.0,4.0,1.0,1.0,0.80218,0.057408,0.753191
12064,210025,一阶,2,12,1012.0725,463.383231,0.582634,1.604937,71.428571,1.0,...,21.0,13.311422,7.336286,11.0,5.0,1.0,1.0,0.940573,0.079146,0.889373
12065,210026,一视,2,29,904.806207,283.251551,0.254147,0.871846,27.5,1.0,...,91.0,13.311422,9.653808,11.0,6.0,1.0,1.0,0.828356,0.039003,0.792429
12066,210027,一作,2,35,937.799429,362.058181,0.297648,1.103455,12.5,1.0,...,185.0,13.311422,11.129731,11.0,10.0,1.0,2.0,0.745595,0.056122,0.703436


In [92]:
# Write the augmented table to CSV; index=False leaves the DataFrame's row
# index out of the file.
bisyll_nw.to_csv("../data/meld_bisyll_nw.csv", index=False)