In [1]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell runs, so edits to source files are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import sys

# Put ../src on the import path (presumably where the `fluidvec` package
# imported below lives). The membership guard keeps this cell idempotent:
# re-running it no longer stacks duplicate "../src" entries onto sys.path.
if "../src" not in sys.path:
    sys.path.append("../src")

In [3]:
from fluidvec import *

In [4]:
import torch
import pickle
from torch.optim import AdamW
from matplotlib import pyplot as plt
from tqdm.auto import tqdm
from pathlib import Path
from fluidvec.dataset import TrainDataset, get_dataloader
# torch.autograd.set_detect_anomaly(True)

In [5]:
from gensim.models import KeyedVectors

In [7]:
# Checkpoint directory: it is read for the vocabularies, the constructor
# hyperparameters ("hypers.pt") and the trained weights ("model.pt").
model_dir = Path("../data/model/fluidvec-compo.6")
new_vs = VocabSet.load(model_dir)
hypers = torch.load(model_dir / "hypers.pt")

# The three positional arguments are the sizes of the word, character and
# component vocabularies; everything else comes from the saved hypers.
new_model = FluidVecSG(
    len(new_vs.word_vocab),
    len(new_vs.char_vocab),
    len(new_vs.compo_vocab),
    **hypers,
)

# Map the stored tensors onto the CPU while loading the state dict.
# Kept as the last expression so the key-matching report is displayed.
new_model.load_state_dict(
    torch.load(model_dir / "model.pt", map_location=torch.device("cpu"))
)

device:  cpu
n_neg_sample:  5


<All keys matched successfully>

In [8]:
# Empty gensim vector store whose vector width is the model's `dim` hyperparameter.
kv = KeyedVectors(vector_size=hypers["dim"])

In [9]:
# Token lists sorted by their vocabulary index, so that list position follows
# index order. NOTE(review): pairing position i with embedding row i assumes
# the indices are contiguous from 0 -- the asserts below only compare counts.
words, chars, compos = (
    sorted(vocab.vocab.keys(), key=vocab.vocab.get)
    for vocab in (new_vs.word_vocab, new_vs.char_vocab, new_vs.compo_vocab)
)

# Embedding tables as NumPy arrays, detached from the autograd graph.
word_vecs, char_vecs, compo_vecs = (
    emb.weight.detach().numpy()
    for emb in (new_model.word_emb, new_model.char_emb, new_model.compo_emb)
)

# Every vocabulary must have exactly one embedding row per token.
assert word_vecs.shape[0] == len(words)
assert char_vecs.shape[0] == len(chars)
assert compo_vecs.shape[0] == len(compos)

In [10]:
# Put words, characters and components into one shared vector space so that
# similarity queries can mix all three kinds of token.
# NOTE(review): with the default replace=False, add_vectors leaves keys that
# already exist untouched, so a token string shared by several vocabularies
# keeps the vector from the first call -- confirm this is intended.
kv.add_vectors(keys=words, weights=word_vecs)
kv.add_vectors(keys=chars, weights=char_vecs)
kv.add_vectors(keys=compos, weights=compo_vecs)

In [11]:
import pickle

# Load the per-word index mapping. NOTE: unpickling can execute arbitrary
# code, so this file must come from a trusted source.
with open("../data/word_mappings.pkl", "rb") as fin:
    word_mappings = pickle.load(fin)

# Register the two special tokens with the same record layout as the loaded
# entries: lists of component / character indices plus a word index.
word_mappings["<UNK>"] = {'compos': [0], 'chars': [0], 'word': 0}
word_mappings["<PAD>"] = {'compos': [1], 'chars': [1], 'word': 1}

In [12]:
# Nearest neighbours of a whole-word key in the combined vector space.
kv.most_similar(positive="電腦")

[('設備', 0.9537464380264282),
 ('硬體', 0.9529376029968262),
 ('主機', 0.9517901539802551),
 ('應用', 0.9511823654174805),
 ('軟件', 0.9496055841445923),
 ('裝置', 0.9487616419792175),
 ('技術', 0.9484235644340515),
 ('電算', 0.9476643800735474),
 ('程式', 0.9476211667060852),
 ('網路', 0.9466950297355652)]

In [13]:
def make_model_input(word):
    """Return the character and component tokens recorded for ``word``.

    Looks ``word`` up in the module-level ``word_mappings`` and converts the
    stored indices back into tokens with the vocabularies held by ``new_vs``.
    The entry in ``word_mappings`` is left unmodified.

    Args:
        word: A key of ``word_mappings``.

    Returns:
        A new list containing the decoded ``"chars"`` entries followed by the
        decoded ``"compos"`` entries.

    Raises:
        KeyError: If ``word`` is not present in ``word_mappings``.
    """
    mapping = word_mappings[word]
    compo_tokens = list(map(new_vs.compo_vocab.decode, mapping["compos"]))
    char_tokens = list(map(new_vs.char_vocab.decode, mapping["chars"]))
    return char_tokens + compo_tokens

In [31]:
# Show the sub-word tokens (characters first, then components) for one word.
make_model_input(word="人物")

['_人', '物_', '<COMPO_NA>', '⿰0-牛', '⿰1-勿']

In [32]:
# Query with the word's character and component tokens instead of the word
# key itself; all of them are passed together as positive keys.
kv.most_similar(positive=make_model_input("人物"))

[('⿰0-氵', 0.38182327151298523),
 ('⿰0-扌', 0.35032176971435547),
 ('⿰0-亻', 0.34808748960494995),
 ('⿰0-言', 0.3213883936405182),
 ('⿱1-心', 0.3032844662666321),
 ('⿱0-艹', 0.2988269329071045),
 ('⿰0-木', 0.2956547737121582),
 ('⿰1-攵', 0.2914125621318817),
 ('⿰0-糹', 0.27892595529556274),
 ('⿰0-忄', 0.2753599286079407)]

In [30]:
# Query with two hand-picked character tokens as the positive keys.
kv.most_similar(positive=["_人", "設_"])

[('_也_', 0.2797718942165375),
 ('_在_', 0.272450715303421),
 ('_，_', 0.2661602199077606),
 ('_為_', 0.25626084208488464),
 ('紀伯倫', 0.24607113003730774),
 ('對_', 0.2428818941116333),
 ('_及_', 0.2286974936723709),
 ('法管學院', 0.22854962944984436),
 ('安佛羅西', 0.22707603871822357),
 ('廣告法', 0.2257523536682129)]