In [1]:
import json
from pathlib import Path
from gensim.models import Word2Vec

In [2]:
# Location of the cached, pre-processed corpus (JSON). Later cells read each
# record's "nlp" entry as a list of [token, POS-tag] pairs.
nlp_path = Path("../data/tlds_data_mc4_tw_000.nlp.json")

# Fail fast: without this guard a missing file would leave `data` undefined
# and the notebook would only break later with an unrelated NameError.
if not nlp_path.exists():
    raise FileNotFoundError(
        f"NLP cache file not found: {nlp_path.resolve()} - "
        "generate it before running this notebook"
    )

print("Load from file cache")
# NOTE(review): read_text() decodes with the platform default encoding; pass
# encoding=... explicitly once the file's encoding has been confirmed.
data = json.loads(nlp_path.read_text())

Load from file cache


In [3]:
# Peek at the first ten [token, POS-tag] pairs of the first record.
first_record_tokens = data[0]["nlp"]
print(first_record_tokens[:10])

[['頂級', 'A'], ['手鐲料', 'Na'], ['淡綠色', 'Na'], ['豆種', 'Na'], ['緬甸玉', 'Na'], ['翡翠', 'Na'], ['開運', 'Nd'], ['印章', 'Na'], ['臍帶', 'Na'], ['印章', 'Na']]


In [4]:
# One training sequence per record: every [token, POS-tag] pair is fused into
# a single "<token>.<POS>" key so the same surface form gets one embedding
# per part of speech.
docs = [
    [f"{pair[0]}.{pair[1]}" for pair in record["nlp"]]
    for record in data
]

In [5]:
# Sanity check: how many sequences were built, and the first ten keys of the
# first sequence.
doc_count = len(docs)
(doc_count, docs[0][:10])

(1000,
 ['頂級.A',
  '手鐲料.Na',
  '淡綠色.Na',
  '豆種.Na',
  '緬甸玉.Na',
  '翡翠.Na',
  '開運.Nd',
  '印章.Na',
  '臍帶.Na',
  '印章.Na'])

In [6]:
%%time
w2v_model = Word2Vec(docs)

CPU times: total: 406 ms
Wall time: 546 ms


In [7]:
# Ten nearest neighbours of the key "印章.Na" in the trained embedding space,
# returned as (key, similarity) pairs.
w2v_model.wv.most_similar(positive=["印章.Na"], topn=10)

[('寶寶.Na', 0.9979808330535889),
 ('婚禮.Na', 0.9973534941673279),
 ('香菇.Na', 0.9973424077033997),
 ('頂級.A', 0.9973317384719849),
 ('達人.Na', 0.9972850680351257),
 ('配件.Na', 0.9972473978996277),
 ('精選.VC', 0.9972197413444519),
 ('專線.Na', 0.9971712231636047),
 ('時尚.Na', 0.997072160243988),
 ('大賞.Na', 0.996990978717804)]

In [8]:
# All "<token>.<POS>" keys stored in the trained model, plus their count.
keyed_vectors = w2v_model.wv
vocab = keyed_vectors.index_to_key
len(vocab)

7961

In [9]:
# Group vocabulary keys by lemma: {lemma: ["<lemma>.<POS1>", "<lemma>.<POS2>", ...]}.
# A lemma with more than one entry was seen with several part-of-speech tags.
lemmas = {}
for word_x in vocab:
    # Split on the LAST dot only: the key is "<token>.<POS>", and the token
    # itself may contain dots (e.g. "3.5"), which would make a plain
    # split(".") yield more than two parts and break the unpacking.
    lemma_x, _pos_x = word_x.rsplit(".", 1)
    lemmas.setdefault(lemma_x, []).append(word_x)

In [47]:
# Lemmas that occur under more than one POS tag; single-character lemmas are
# excluded. The first twenty are shown space-separated.
multi_pos_words = [
    lemma
    for lemma, pos_variants in lemmas.items()
    if len(pos_variants) > 1 and len(lemma) > 1
]
" ".join(multi_pos_words[:20])

'可以 教學 生活 服務 學習 計畫 辦理 中心 工作 使用 發展 專業 進行 教授 協助 健康 參與 參加 研究 安全'

In [48]:
# For every POS variant of the chosen lemma, print that variant followed by
# its nearest neighbours (most_similar's default top-n), so the neighbourhoods
# of the different parts of speech can be compared side by side.
target = "發展"
for pos_variant in lemmas[target]:
    neighbours = w2v_model.wv.most_similar(pos_variant)
    neighbour_keys = [pair[0] for pair in neighbours]
    print(pos_variant, ":", " ".join(neighbour_keys))

發展.VC : 服務.VC 技術.Na 推動.VC 合作.VH 人才.Na 人文.Na 專業.VH 創新.VC 協盟.Na 產業.Na
發展.Nv : 原住民族.Na 投資.VC 法規.Na 行銷.Na 農業.Na 都市.Na 工業.Na 設立.VC 貿易.Na 我國.Nc
發展.Na : 觀光.VA 能源.Na 法律.Na 媒體.Na 華人.Na 生態.Na 消費.Na 青年.Na 執行長.Na 和平.VH
