In [None]:
from multivec.cython.multivec import BilingualModel, MonolingualModel
import gzip

In [2]:
%%time
# Candidate model files on disk; index 0 selects the one that gets loaded.
corpora = [b"./un16m.bin", b'./un4m.lemma.bin']
corpus = corpora[0]
# The path is handed to BilingualModel as bytes, not str.
model = BilingualModel(corpus)

CPU times: user 11.5 s, sys: 3.33 s, total: 14.9 s
Wall time: 14.8 s


In [3]:
# Inspect the model's `dimension` attribute (presumably the embedding size).
model.dimension

200

In [4]:
# Convenience aliases for the two monolingual halves of the bilingual model.
# Elsewhere in this notebook the source side is queried with English words and
# the target side with Chinese words, hence the names.
en_model = model.src_model
zh_model = model.trg_model

In [5]:
# Chinese query word; the trailing comments keep earlier query words around.
w = '國家' # '資產' # '獲益' #'餵奶' # 
# Ask for the 20 source-side (English) entries closest to the Chinese word.
enlist = model.src_closest(w.encode(), n=20)
# Results are unpacked as (bytes, score) pairs; only the decoded word is shown.
' | '.join([e.decode() for (e, d) in enlist])

'governments | institutions | industrialized | developing | national | regions | neighbouring | subnational | region | regional | capacities | recipient | less-developed | ownership | middle-income | neighboring | Industrialized | strategies | sovereign | economies'

In [6]:
# English query word, looked up against the target (Chinese) side.
w = 'sensibility'
zhlist = model.trg_closest(w.encode(), n=20)
# Results are unpacked as (bytes, score) pairs; only the decoded word is shown.
print([z.decode() for (z, d) in zhlist])

['敏鋭', '氣質', '好奇心', '社會意識', '認知', '特質', '敏感性', '道德觀念', '自我認識', '理解力', '敏感度', '激發起', '思想道德', '感性', '覺悟', '政治覺悟', '解構', '感知', '積極向上', '表達能力']


In [7]:
def analogy(w1, w2, w3):
    '''
    Solves cross-lingual problems of the type:
    w1 : w2 :: w3 : __

    w1 and w2 are looked up in ``model.src_model`` and w3 in
    ``model.trg_model``.  The offset ``w2 - w1`` is added to the vector of w3
    and up to 15 nearest entries of ``model.trg_model`` are requested; the
    three input words are removed from that list.

    Parameters:
        w1, w2 (str): words on the source side that define the relation.
        w3 (str): word on the target side the relation is applied to.

    Returns:
        None.  Prints the completed analogy followed by the candidate list,
        or ':-(' when a lookup fails or no candidate remains.
    '''
    closest_words = []
    try:
        w1v = model.src_model.word_vec(w1.encode())
        w2v = model.src_model.word_vec(w2.encode())
        w3v = model.trg_model.word_vec(w3.encode())
        # Transfer the w1 -> w2 offset onto w3.
        w4v = w3v + (w2v - w1v)
        neighbours = model.trg_model.closest_to_vec(w4v, n=15)
        decoded = [w.decode() for (w, d) in neighbours]
        closest_words = [w for w in decoded if w not in (w1, w2, w3)]
    except Exception:
        # Best effort: any lookup failure (presumably an out-of-vocabulary
        # word) is reported as "no answer".  Catching Exception instead of
        # using a bare ``except:`` lets KeyboardInterrupt and SystemExit
        # propagate, so a long-running query can still be interrupted.
        closest_words = []
    if not closest_words:
        print(':-(')
        return
    print('{} : {} :: {} : {}'.format(w1, w2, w3, closest_words[0]))
    print(closest_words)

In [8]:
# English relation (w1 -> w2) applied to a Chinese word (w3).
w1 = 'Scotland'
w2 = 'Edinburgh'
w3 = '英格蘭'; 
analogy(w1, w2, w3)

Scotland : Edinburgh :: 英格蘭 : 愛丁堡
['愛丁堡', '劍橋', '加的夫', '愛丁堡大學', '諾丁漢', '七海', '曼徹斯特', '格拉斯哥', '新澤西', '達拉斯', '英國倫敦', '約克郡', '鹽湖城', '利物浦']


In [9]:
# Target-side (Chinese) entries closest to an English word; `w` and `zhlist`
# are rebound here, replacing the values from the earlier query cell.
w = 'king'
zhlist = model.trg_closest(w.encode(), n=20)
' | '.join([z.decode() for (z, d) in zhlist])

'國王 | 賈南 | 皇帝 | 之王 | 國父 | 一世 | 統治者 | 公爵 | 諾羅敦 | 陛下 | 托亞 | 登基 | 王室 | 君主 | 女皇 | 大公 | 王朝 | 敬愛 | 女王 | 塔努馬'

In [10]:
def analogy2(w1, w2, w3):
    '''
    Solves cross-lingual problems of the type:
    w1 : w2 :: w3 : __

    Mirror image of ``analogy``: w1 and w2 are looked up in
    ``model.trg_model`` and w3 in ``model.src_model``.  The offset
    ``w2 - w1`` is added to the vector of w3 and up to 15 nearest entries of
    ``model.src_model`` are requested; the three input words are removed from
    that list.

    Parameters:
        w1, w2 (str): words on the target side that define the relation.
        w3 (str): word on the source side the relation is applied to.

    Returns:
        None.  Prints the completed analogy followed by the candidate list,
        or ':-(' when a lookup fails or no candidate remains.
    '''
    closest_words = []
    try:
        w1v = model.trg_model.word_vec(w1.encode())
        w2v = model.trg_model.word_vec(w2.encode())
        w3v = model.src_model.word_vec(w3.encode())
        # Transfer the w1 -> w2 offset onto w3.
        w4v = w3v + (w2v - w1v)
        neighbours = model.src_model.closest_to_vec(w4v, n=15)
        decoded = [w.decode() for (w, d) in neighbours]
        closest_words = [w for w in decoded if w not in (w1, w2, w3)]
    except Exception:
        # Best effort: any lookup failure (presumably an out-of-vocabulary
        # word) is reported as "no answer".  Catching Exception instead of
        # using a bare ``except:`` lets KeyboardInterrupt and SystemExit
        # propagate, so a long-running query can still be interrupted.
        closest_words = []
    if not closest_words:
        print(':-(')
        return
    print('{} : {} :: {} : {}'.format(w1, w2, w3, closest_words[0]))
    print(closest_words)

In [11]:
# NOTE: every group below rebinds w1/w2/w3, so only the LAST triple
# ('精緻', '生活', 'fine') reaches the computation at the bottom of this cell.
# The earlier groups are a record of previously tried inputs.
# adj-modifier [JJ] (w2) + noun [NO] (w1)
w1 = '疾病'
w2 = '先天'
w3 = 'disease'
# adv-modifier (w2) + verb (w1)
w1 = '氣候'
w2 = '正常'
w3 = 'phenomenon'

w1 = '男'
w2 = '國王'
w3 = 'woman'

# base=N, collocate=V
w1 = '根基'# '根本'
w2 = '鞏固' #'動搖'
w3 = 'foundations'

# base=N, collocate=V
w1 = '秘密'# '根本'
w2 = '發現' #'動搖'
w3 = 'secrets'

# base=N, collocate=V
w1 = '人選'# '根本'
w2 = '推薦' #'動搖'
w3 = 'candidate'

# verb [V] (w2) + direct object [DO] (w1)
w1 = '犯罪'
w2 = '打擊'
w3 = 'crime'; 

# verb [V] (w2) + direct object [DO] (w1)
w1 = '耕耘'
w2 = '默默'
w3 = 'work'; 

# verb [V] (w2) + direct object [DO] (w1)
w1 = '名譽'
w2 = '損害'
w3 = 'reputation'; 

w1 = '隱私'
w2 = '侵犯'
w3 = 'privacy'; 

w1 = '精緻'
w2 = '生活'
w3 = 'fine'; 

# Same vector arithmetic as analogy2 (Chinese w1/w2, English w3), done inline:
# 25 neighbours are requested and the input words are NOT filtered out.
w1v = zh_model.word_vec(w1.encode())
w2v = zh_model.word_vec(w2.encode())
w3v = en_model.word_vec(w3.encode())
w4v = w3v + (w2v - w1v)
closest_words = [w.decode() for (w, d) in en_model.closest_to_vec(w4v, n=25)]
print(' | '.join(closest_words))

fine | life | imprisonment | living | live | lives | fines | people | citizens | lived | fined | kip | 20.000.000 | citizen | perpétua | RM10 | inhabitants | 5.000.000 | riel | families | person | Rp | rupees | dong | man


In [12]:
 # Raw (bytes, score) pairs for the vector `w4v` left behind by the previous cell.
 en_model.closest_to_vec(w4v, n=25)

[(b'fine', 0.71402907371521),
 (b'life', 0.6275485157966614),
 (b'imprisonment', 0.5957995653152466),
 (b'living', 0.5903329253196716),
 (b'live', 0.5238180160522461),
 (b'lives', 0.521121621131897),
 (b'fines', 0.5015294551849365),
 (b'people', 0.48274001479148865),
 (b'citizens', 0.46281448006629944),
 (b'lived', 0.45351529121398926),
 (b'fined', 0.4439633786678314),
 (b'kip', 0.4424514174461365),
 (b'20.000.000', 0.4390920400619507),
 (b'citizen', 0.43767303228378296),
 (b'perp\xc3\xa9tua', 0.4332098662853241),
 (b'RM10', 0.4323761463165283),
 (b'inhabitants', 0.4301243722438812),
 (b'5.000.000', 0.4230095446109772),
 (b'riel', 0.42221710085868835),
 (b'families', 0.4221489727497101),
 (b'person', 0.42182105779647827),
 (b'Rp', 0.42133381962776184),
 (b'rupees', 0.41881904006004333),
 (b'dong', 0.4179326891899109),
 (b'man', 0.4169285297393799)]

In [13]:
def collocate(w1, w2, w3):
    '''
    Given:
        Chinese base w1 and Chinese collocate w2
    Find:
        candidates for collocate to English base w3

    w1 and w2 are looked up in ``model.trg_model`` and w3 in
    ``model.src_model``.  The offset ``w2 - w1`` is added to the vector of w3
    and up to 15 nearest entries of ``model.src_model`` are requested; the
    three input words are removed from that list.

    Parameters:
        w1 (str): Chinese base word.
        w2 (str): Chinese collocate of w1.
        w3 (str): English base word.

    Returns:
        None.  Prints 'w1 : w2 :: w3 : best' followed by the candidate list,
        or ':-(' when a lookup fails or no candidate remains.
    '''
    closest_words = []
    try:
        w1v = model.trg_model.word_vec(w1.encode())
        w2v = model.trg_model.word_vec(w2.encode())
        w3v = model.src_model.word_vec(w3.encode())
        # Transfer the base -> collocate offset onto the English base.
        w4v = w3v + (w2v - w1v)
        neighbours = model.src_model.closest_to_vec(w4v, n=15)
        decoded = [w.decode() for (w, d) in neighbours]
        closest_words = [w for w in decoded if w not in (w1, w2, w3)]
    except Exception:
        # Best effort: any lookup failure (presumably an out-of-vocabulary
        # word) is reported as "no answer".  Catching Exception instead of
        # using a bare ``except:`` lets KeyboardInterrupt and SystemExit
        # propagate, so a long-running query can still be interrupted.
        closest_words = []
    if not closest_words:
        print(':-(')
        return
    print('{} : {} :: {} : {}'.format(w1, w2, w3, closest_words[0]))
    print(closest_words)

In [14]:
# NOTE: the groups below rebind w1/w2/w3 one after another, so only the last
# triple ('犯罪', '打擊', 'crime') is actually passed to collocate().
# adj-modifier [JJ] (w2) + noun [NO] (w1)
w1 = '貧窮'
w2 = ''
w1 = '疾病'
w2 = '治療'
w3 = 'disease'
w3 = 'poverty'


# verb [V] (w2) + direct object [DO] (w1)
w1 = '因素'
w2 = '確定'
w3 = 'factors'

w1 = '犯罪'
w2 = '打擊'
w3 = 'crime'
collocate(w1, w2, w3)

犯罪 : 打擊 :: crime : combating
['combating', 'fight', 'combat', 'terrorism', 'trafficking', 'corruption', 'COMBATING', 'combatting', 'Combatting', 'prevention', 'Combating', 'scourge', 'suppression', 'anti-trafficking']


In [15]:
def collocate2(w1, w2, w3):
    '''
    Given:
        English base w1 and English collocate w2
    Find:
        candidates for collocate to Chinese base w3

    This is the reverse direction of ``collocate``: w1 and w2 are looked up
    in ``model.src_model`` and w3 in ``model.trg_model``.  The offset
    ``w2 - w1`` is added to the vector of w3 and up to 15 nearest entries of
    ``model.trg_model`` are requested; the three input words are removed from
    that list.

    Parameters:
        w1 (str): English base word.
        w2 (str): English collocate of w1.
        w3 (str): Chinese base word.

    Returns:
        None.  Prints 'w1 : w2 :: w3 : best' followed by the candidate list,
        or ':-(' when a lookup fails or no candidate remains.
    '''
    closest_words = []
    try:
        w1v = model.src_model.word_vec(w1.encode())
        w2v = model.src_model.word_vec(w2.encode())
        w3v = model.trg_model.word_vec(w3.encode())
        # Transfer the base -> collocate offset onto the Chinese base.
        w4v = w3v + (w2v - w1v)
        neighbours = model.trg_model.closest_to_vec(w4v, n=15)
        decoded = [w.decode() for (w, d) in neighbours]
        closest_words = [w for w in decoded if w not in (w1, w2, w3)]
    except Exception:
        # Best effort: any lookup failure (presumably an out-of-vocabulary
        # word) is reported as "no answer".  Catching Exception instead of
        # using a bare ``except:`` lets KeyboardInterrupt and SystemExit
        # propagate, so a long-running query can still be interrupted.
        closest_words = []
    if not closest_words:
        print(':-(')
        return
    print('{} : {} :: {} : {}'.format(w1, w2, w3, closest_words[0]))
    print(closest_words)

In [16]:
# The first triple is overwritten before use; only the second one
# ('victim', 'abuse', '受害者') is passed to collocate2().
w1 = 'victim'
w2 = 'prominent'
w3 = '犧牲者'

w1 = 'victim'
w2 = 'abuse'
w3 = '受害者'

collocate2(w1, w2, w3)

victim : abuse :: 受害者 : 虐待
['虐待', '性虐待', '剝削', '性暴力', '濫用', '性侵犯', '暴力', '欺凌', '凌辱', '凌虐', '暴力行為', '虐待老人', '性', '家庭暴力']


In [17]:
import numpy as np
from scipy.spatial.distance import cosine as cos_dist
# scipy's `cosine` is a distance (1 - cosine similarity), so a value near 0
# means the English and Chinese vectors point in almost the same direction.
cos_dist(model.src_model.word_vec('disease'.encode()), 
         model.trg_model.word_vec('疾病'.encode()))

0.09458018600005236

In [18]:
# Chinese relation (w1 -> w2) applied to an English word (w3).
w1 = '策略' 
w2 = '嚴厲'  
w3 = 'tactic'
analogy2(w1, w2, w3)

策略 : 嚴厲 :: tactic : harsher
['harsher', 'harshest', 'sanction', 'punishments', 'severest', 'draconian', 'merciless', 'crackdowns', 'harshly', 'punishment', 'brutal', 'ruthless', 'shamelessly', 'callously']


In [19]:
# The first three assignments are overwritten below and never used; only
# ('decision', 'light', '決定') is passed to analogy().
w3 = 'execution'
w1 = 'rank' 
w2 = 'prioritize'  

w1 = 'decision' 
w2 = 'light'  
w3 = '決定'
analogy(w1, w2, w3)

decision : light :: 決定 : 有鑒於
['有鑒於', '鑒於', '銘記', '考慮', '牢記', '加以分析', '無先例', '審慎考慮', '宜于', '審視', '到現', '時局', '須予', '參照', '過高估計']


In [20]:
#en_model.sent_vec(b'Hong Kong') - en_model.word_vec(b'Hong') - en_model.word_vec(b'Kong')


In [21]:
def en2zh(txt, n=5):
    '''
    Return the words produced by ``model.trg_closest`` for ``txt``.

    ``txt`` (an English word, str) is encoded to bytes for the lookup and
    ``n`` is forwarded as the number of neighbours.  Each result is unpacked
    as a (bytes, score) pair; only the decoded words are returned, in the
    order the model gave them.
    '''
    neighbours = model.trg_closest(txt.encode(), n)
    return [word.decode() for (word, _score) in neighbours]

def zh2en(txt, n=5):
    '''
    Return the words produced by ``model.src_closest`` for ``txt``.

    ``txt`` (a Chinese word, str) is encoded to bytes for the lookup and
    ``n`` is forwarded as the number of neighbours.  Each result is unpacked
    as a (bytes, score) pair; only the decoded words are returned, in the
    order the model gave them.
    '''
    neighbours = model.src_closest(txt.encode(), n)
    return [word.decode() for (word, _score) in neighbours]

In [22]:
def enSynonyms(txt, n=5):
    '''
    Return the same-language neighbours of ``txt`` from ``model.src_model``.

    ``txt`` (str) is encoded to bytes and passed with ``n`` to
    ``model.src_model.closest``.  Each result is unpacked as a
    (bytes, score) pair; only the decoded words are returned, in order.
    '''
    neighbours = model.src_model.closest(txt.encode(), n)
    return [word.decode() for (word, _score) in neighbours]

def zhSynonyms(txt, n=5):
    '''
    Return the same-language neighbours of ``txt`` from ``model.trg_model``.

    ``txt`` (str) is encoded to bytes and passed with ``n`` to
    ``model.trg_model.closest``.  Each result is unpacked as a
    (bytes, score) pair; only the decoded words are returned, in order.
    '''
    neighbours = model.trg_model.closest(txt.encode(), n)
    return [word.decode() for (word, _score) in neighbours]


### 英語近義詞

In [23]:
# Whitespace-separated scratch list of English words; only the LAST word in
# the list is queried.
words = '''
embryonic 
'''
words = words.strip().split()
' | '.join(enSynonyms(words[-1], n=25))

'incipient | embryo | infancy | cloned | immature | formative | skeletal | embryos | quails | metabolism | aberration | hepatic | vivo | hypodynamy | germination | vitro | maturation | chromosomal | primitive | hormones | organism | microscopic | inhibitor | germ | hormone'

In [24]:
# Whitespace-separated scratch list of Chinese words; only the LAST word in
# the list is queried.
words = '''
一窩蜂
'''
words = words.strip().split()
' | '.join(zhSynonyms(words[-1], n=25))

'投機者 | 投機性 | 多國公司 | 掠奪性 | 爭先恐後 | 寡頭 | 擠走 | 投機 | 拉動 | 自然而然 | 拋售 | 有權有勢 | 接二連三 | 爭相 | 從眾 | 套利 | 壓低 | 出人意料 | 非正統 | 不擇手段 | 擠出 | 市場化 | 壟斷市場 | 投資商 | 貪得無厭'

In [25]:
# Whitespace-separated scratch list; only the LAST word is searched for.
words = '''
訴訟 灌輸 萌芽 巧取豪奪 名存實亡 盛氣凌人 雞皮疙瘩 不堪回首 
'''
words = words.strip().split()
word   = words[-1]
#print(' | '.join(zh2en(word, 35)))

# NOTE: `corpora` and `corpus` are rebound here to text-file names (str); in
# the model-loading cell the same names held model paths (bytes).
corpora = ['UNv1.0.en-zh.combined.txt', 'nytimes.txt', 'roclaws.zh-en.combined.txt', 'frog.zh-en.combined.txt']
corpus = corpora[1]
# Shell escape: IPython substitutes the Python variables $word and $corpus.
# -P Perl regex, -i ignore case, -B1 also prints the line before each match.
! grep -Pi -B1 -e "$word" $corpus --color=always

His flamboyant present overwrites his distressing past. It’s the eternal sunshine of the spotless Trump.
他招搖過市的當下，遮蓋了他[01;31m[K不堪回首[m[K的過往——完美無瑕的特朗普閃出永恆的光。
[36m[K--[m[K
Today, Kiel is still a seafaring town. Submarines are still built here. Sailors bob on Kiel Sound. Instead of dreadnoughts and battle cruisers, hulking cruise and cargo ships dominate the urban shoreline. But the city is a mosaic of ill-matched architecture and buildings that have changed purpose. Wilhelm’s proud Naval Academy, for instance, is now the parliamentary seat of Schleswig-Holstein, the state of which Kiel is the capital. The mishmash lends a defeated air to a place still pondering how to remember and interpret a dreadful 20th century.
今天的基爾，依然是座港口城市。潛水艇還在這裡建造。一群群水手徜徉在基爾的海灣。不同於昔日滿眼無畏艦、巡洋戰艦的場景，今天，龐大的遊船、貨船佔據着這座城市的海岸線。這裡的建築年代、風格迥異，早已遠離了舊日的各種用途。比如，威廉二世當年威風凜凜的海軍學院，如今已成為石勒蘇益格-荷爾斯泰因州（Schleswig-Holstein，首府為基爾）的議會大樓。如此古今交疊的混雜，給這座仍然對[01;31m[K不堪回首[m[K的20世紀試圖進行反思和詮釋的城市，籠罩了一種迷茫與挫敗的氛圍。
[36m[K--[m[K
Germany,

In [26]:
%%time
# Whitespace-separated scratch list of English words; only the LAST word is
# echoed and then looked up on the Chinese side (50 neighbours).
words = '''
embryonic 
'''
words = words.strip().split()
w = words[-1]; print(w)
' | '.join(en2zh(w, n=50))

embryonic
CPU times: user 646 ms, sys: 11.9 ms, total: 658 ms
Wall time: 652 ms


'萌芽 | 幹細胞 | 初級階段 | 胚胎 | 發育階段 | 牙齦 | 大腦 | 類固醇 | 初步階段 | 孕育 | 授精 | 無性繁殖 | 動物模型 | 體外 | 性腺 | 雌激素 | 垂體 | 代謝 | 嬰兒期 | 細胞 | 乳汁 | 體細胞 | 染色體 | 失重 | 發育畸形 | 雛形 | 腎癌 | 囊腫 | 開發階段 | 現階段 | 細胞培養 | 配子 | 高級階段 | 激素 | 受精 | 先天 | 克隆人 | 生殖細胞 | 癌 | 晚期 | 人工呼吸 | 單細胞 | 分泌 | 為時尚早 | 支氣管 | 腫瘤 | 克隆 | 胚 | 已近尾聲 | 斑馬魚'

In [27]:
# Whitespace-separated scratch list of Chinese words; only the LAST word
# ('萌芽') is queried for its 25 same-language neighbours.
words = '''
夠用 足夠 萬劫 不復 問題 轉換 提供 深邃 萌芽 
'''
words = words.strip().split()
' | '.join(zhSynonyms(words[-1], 25))

'初級階段 | 起步 | 沉淪 | 休眠狀態 | 消退 | 壽終正寢 | 消亡 | 牙齦 | 醞釀 | 孕育 | 發芽 | 後期 | 初具 | 苗頭 | 潛伏 | 開花 | 初創 | 慢慢 | 消散 | 深陷 | 湧現 | 無政府 | 初期 | 捲土重來 | 方興未艾'