## Train sense embeddings

In [1]:
from pathlib import Path
class TaggedCorpusIter:
    """Restartable iterable over the tokenized lines of every ``*.txt`` file in a directory.

    Each call to ``__iter__`` starts a fresh pass over the files, so the same
    object can be consumed more than once (a plain generator could not be).
    """

    def __init__(self, base_dir):
        """Remember the corpus directory.

        Args:
            base_dir: Directory (``str`` or ``Path``) holding the ``*.txt`` files.
        """
        self.base_dir = Path(base_dir)

    def __iter__(self):
        """Yield one list of whitespace-separated tokens per line.

        Only files directly inside ``base_dir`` are read (no recursion).
        Blank lines are yielded as empty lists.

        Yields:
            list[str]: Tokens of the current line.
        """
        # glob() returns entries in filesystem-dependent order; sorting makes
        # the sentence order stable across runs and machines.
        for file_x in sorted(self.base_dir.glob("*.txt")):
            # `with` closes the handle even when the consumer stops early or
            # an exception propagates out of the loop.
            with file_x.open("r", encoding="UTF-8") as fin:
                for ln in fin:
                    yield ln.strip().split()

In [2]:
from itertools import islice
# Lazy corpus handle: nothing is read from ../data/dt-asbc/ until tag_iter is iterated.
tag_iter = TaggedCorpusIter("../data/dt-asbc/")

In [3]:
%%time
from gensim.models import Word2Vec
# Train on the streamed corpus: 200-dimensional vectors, context window of 5,
# tokens seen fewer than 3 times dropped, 8 worker threads, 5 epochs.
# sg=1 selects skip-gram rather than CBOW (per the gensim Word2Vec documentation).
# NOTE(review): no `seed` is passed and workers > 1, so a re-run presumably will not
# reproduce identical vectors — confirm whether the file hash printed later is expected to match.
model = Word2Vec(sentences=tag_iter, vector_size=200, window=5, 
                 min_count=3, workers=8, sg=1, epochs=5)

CPU times: user 5min 43s, sys: 2.43 s, total: 5min 46s
Wall time: 1min


In [4]:
# Export only the word vectors (model.wv) in binary word2vec format; this cell does not save the full model.
model.wv.save_word2vec_format("../data/dt-asbc.word2vec.kv.bin", binary=True)

In [5]:
from pathlib import Path
import hashlib
# Print a short SHA-1 fingerprint (first 6 hex digits) of each saved artifact,
# tab-separated from its path, so file contents can be compared between runs.
paths = ["../data/dt-asbc.word2vec.kv.bin"]
for path_x in paths:
    # Passing the bytes to the constructor is equivalent to sha1() followed by update().
    h = hashlib.sha1(Path(path_x).read_bytes())
    print(f"{path_x}\t{h.hexdigest()[:6]}")

../data/dt-asbc.word2vec.kv.bin	45e6e2


## KeyedVectors

In [6]:
# Reload edited modules automatically so changes to the local source are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [7]:
import sys
# Make the project's ../src directory importable; the membership check keeps
# re-running this cell from appending the same entry repeatedly.
if "../src" not in sys.path:
    sys.path.append("../src")

In [8]:
from sensEM import SenseKeyedVectors

In [9]:
# Load the vectors exported above into the project's SenseKeyedVectors wrapper.
# NOTE(review): load_from_kv presumably expects the binary word2vec format written earlier — confirm in sensEM.
kv = SenseKeyedVectors.load_from_kv("../data/dt-asbc.word2vec.kv.bin")

In [10]:
# Inspect the nearest neighbours of each sense of "便宜"; the displayed result is a dict
# mapping a sense label (id, frequency, definition) to a list of (word, score) pairs.
kv.show_neighbors_bysenses("便宜")

{'便宜(06763501) f= 110742: 形容物品價錢比預期低。)': [('貴', 0.8054711222648621),
  ('經濟', 0.7892798781394958),
  ('划算', 0.7595694065093994),
  ('貴上', 0.7516638040542603),
  ('低廉', 0.7478659152984619),
  ('價錢', 0.7245354652404785),
  ('昂貴', 0.7211244106292725),
  ('免稅店', 0.7169173955917358),
  ('外帶', 0.7143236398696899),
  ('地攤貨', 0.7070028185844421)],
 '便宜(06763502) f= 77884: 用比預期低的價錢所完成的交易買賣。)': [('賣乖', 0.8740313053131104),
  ('心軟', 0.8577039241790771),
  ('奈何', 0.8567598462104797),
  ('動不了', 0.854430615901947),
  ('雞皮疙瘩', 0.8543466925621033),
  ('要不', 0.8538295030593872),
  ('冷', 0.852941632270813),
  ('一文不值', 0.8529177904129028),
  ('承受不了', 0.8526293635368347),
  ('猜錯', 0.8518838882446289)],
 '便宜(06763503) f= 86647: 使物品價錢降低。)': [('貴', 0.9088633060455322),
  ('零售價', 0.9056693315505981),
  ('掉', 0.8955847024917603),
  ('三０％', 0.8921447396278381),
  ('進口量', 0.8917911648750305),
  ('４０萬', 0.8882931470870972),
  ('九％', 0.8878161311149597),
  ('２萬', 0.8859727382659912),
  ('８０萬', 0.8848020434379578),