In [1]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel (mode 2).
%load_ext autoreload
%autoreload 2

In [2]:
# Put the project's source directory on the import path.
# The path is relative, so it is resolved against the kernel's working directory.
import sys
sys.path.append("../src")

In [3]:
from fluidvec import *

In [4]:
import torch
import pickle
from torch.optim import AdamW
from matplotlib import pyplot as plt
from tqdm.auto import tqdm
from pathlib import Path
from fluidvec.dataset import TrainDataset, get_dataloader
# torch.autograd.set_detect_anomaly(True)

In [5]:
from gensim.models import KeyedVectors

In [6]:
# Restore the saved vocabularies, hyper-parameters and trained FluidVecSG weights.
model_dir = Path("../data/model/fluidvec-compo.5")
new_vs = VocabSet.load(model_dir)
# map_location="cpu" lets a checkpoint written on a GPU machine be opened on a
# CPU-only host; load_state_dict then copies the values into the model's own
# parameters, wherever the model placed them.
hypers = torch.load(model_dir/"hypers.pt", map_location="cpu")
new_model = FluidVecSG(
    len(new_vs.word_vocab),
    len(new_vs.char_vocab),
    len(new_vs.compo_vocab),
    **hypers,
)
# Kept as the last expression so the cell still displays the key-matching report.
new_model.load_state_dict(torch.load(model_dir/"model.pt", map_location="cpu"))

device:  cpu
n_neg_sample:  3


<All keys matched successfully>

In [7]:
# Empty vector index whose dimensionality matches the trained embeddings.
kv = KeyedVectors(vector_size=hypers["dim"])

In [8]:
# Order every vocabulary's tokens by the value stored for them in `.vocab`
# (assumed to be the embedding row index -- the length checks below guard this).
word_index = new_vs.word_vocab.vocab
char_index = new_vs.char_vocab.vocab
compo_index = new_vs.compo_vocab.vocab
words = sorted(word_index, key=word_index.get)
chars = sorted(char_index, key=char_index.get)
compos = sorted(compo_index, key=compo_index.get)

# .cpu() is a no-op for CPU tensors but is required before .numpy() when the
# model's parameters live on another device.
word_vecs = new_model.word_emb.weight.detach().cpu().numpy()
char_vecs = new_model.char_emb.weight.detach().cpu().numpy()
compo_vecs = new_model.compo_emb.weight.detach().cpu().numpy()

# Each token list must line up one-to-one with the rows of its embedding matrix.
assert len(words) == word_vecs.shape[0], "word vocab / embedding size mismatch"
assert len(chars) == char_vecs.shape[0], "char vocab / embedding size mismatch"
assert len(compos) == compo_vecs.shape[0], "compo vocab / embedding size mismatch"

In [9]:
# Register words, characters and components in one shared index, in that order.
# NOTE(review): add_vectors defaults to replace=False, so a key that is already
# present keeps its first vector -- confirm that overlaps between the three
# vocabularies (e.g. special tokens or single-character words) are acceptable.
for vocab_keys, vocab_vecs in (
    (words, word_vecs),
    (chars, char_vecs),
    (compos, compo_vecs),
):
    kv.add_vectors(vocab_keys, vocab_vecs)

In [10]:
import pickle

# Load the pickled per-word lookup table.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open("../data/word_mappings.pkl", "rb") as fin:
    word_mappings = pickle.load(fin)

# Add entries for the two special tokens; each one uses the same index
# (0 for <UNK>, 1 for <PAD>) for its compos, chars and word fields.
word_mappings.update({
    token: {'compos': [token_idx], 'chars': [token_idx], 'word': token_idx}
    for token_idx, token in enumerate(("<UNK>", "<PAD>"))
})

In [11]:
# Sanity check: nearest neighbours of a single key in the combined index.
kv.most_similar("電腦")

[('設備', 0.9537464380264282),
 ('硬體', 0.9529376029968262),
 ('主機', 0.9517901539802551),
 ('應用', 0.9511823654174805),
 ('軟件', 0.9496055841445923),
 ('裝置', 0.9487616419792175),
 ('技術', 0.9484235644340515),
 ('電算', 0.9476643800735474),
 ('程式', 0.9476211667060852),
 ('網路', 0.9466950297355652)]

In [12]:
def make_model_input(word):
    """Return the character keys followed by the component keys of ``word``.

    The word's entry in ``word_mappings`` supplies index lists under
    ``"chars"`` and ``"compos"``; each index is turned back into its key
    with the matching vocabulary's ``decode``. The entry itself is not
    modified.

    Raises:
        KeyError: if ``word`` has no entry in ``word_mappings``.
    """
    entry = word_mappings[word]
    compo_keys = [new_vs.compo_vocab.decode(idx) for idx in entry["compos"]]
    char_keys = [new_vs.char_vocab.decode(idx) for idx in entry["chars"]]
    return char_keys + compo_keys

In [22]:
# Show the character and component keys produced for one word.
make_model_input("電腦桌")

['_電', '腦', '桌_', '⿱0-雨', '⿱1-⿻日乚', '⿰0-月', '⿰1-𡿺', '⿱0-⺊', '⿱1-杲']

In [25]:
# Query with a word's character and component keys as the positive examples.
# NOTE(review): the top similarities recorded for this cell are only ~0.27 and
# the neighbours look unrelated -- confirm this composition is meaningful.
kv.most_similar(make_model_input("雜誌"))

[('柚子核', 0.271586537361145),
 ('鬼頭刀', 0.2588353157043457),
 ('⿺0-走', 0.2395337074995041),
 ('悲欣', 0.23821423947811127),
 ('_諱', 0.2362767606973648),
 ('⿸0-䧹', 0.2349133938550949),
 ('梭倫', 0.23455743491649628),
 ('書平', 0.233648419380188),
 ('上軌道', 0.23044215142726898),
 ('界山', 0.22549574077129364)]

In [21]:
# Query with the empty-string key as the only positive example.
# NOTE(review): presumably "" is a key in one of the vocabularies -- confirm.
# This cell's execution count (21) is lower than that of the preceding cells,
# so it was run out of order; re-run the notebook top to bottom.
kv.most_similar([""])

[('馳', 0.9685375690460205),
 ('獸', 0.9677475094795227),
 ('盜', 0.9674520492553711),
 ('鳳', 0.964853048324585),
 ('與', 0.9646254181861877),
 ('虛', 0.9638371467590332),
 ('銳', 0.9635658264160156),
 ('劍', 0.9627208113670349),
 ('則', 0.9626174569129944),
 ('懼', 0.962375283241272)]