In [1]:
from multivec import BilingualModel, MonolingualModel

In [2]:
from opencc import OpenCC
# Two OpenCC converters, one per direction: the 's2t' config converts
# Simplified Chinese to Traditional, 't2s' converts Traditional to Simplified.
ocs2t = OpenCC('s2t')
oct2s = OpenCC('t2s')

In [3]:
# Load the bilingual model from disk; the path is passed as a bytes literal.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
model = BilingualModel(b'/mnt/c/NLP/collo/un4m.lemma.bin')

In [4]:
# NOTE: a bare expression that is not the last line of the cell is evaluated
# but not displayed, so the next line produces no output.
model.dimension
#model.get_counts()
#get_counts()
# get_counts() is called on the source-side model; the result is indexed
# with a bytes key (presumably the word's corpus frequency — TODO confirm).
a = model.src_model.get_counts()
a[b'border']

12083

In [5]:
# The following is unnecessary since the separate monolingual models are already built into the bilingual model
#en_model = MonolingualModel(b'/mnt/c/NLP/collo/un4m.lemma.en.bin')
#zh_model = MonolingualModel(b'/mnt/c/NLP/collo/un4m.lemma.zhs.bin')

In [6]:
# Shorthand handles for the two monolingual halves of the bilingual model.
# Judging by the lookups in this notebook, the source side is English and the
# target side is (Simplified) Chinese.
zh_model = model.trg_model
en_model = model.src_model

In [7]:
# Cross-lingual lookup: choose a Traditional Chinese query word by index,
# convert it to Simplified, and print the 12 closest source-side words.
w = ['恨', '程度'][1]
enlist = model.src_closest(oct2s.convert(w).encode('utf-8'), n=12)
print([word.decode('utf-8') for word, _score in enlist])

['degree', 'extent', 'intensity', 'severity', 'magnitude', 'vary', 'depend', 'great', 'size', 'largely', 'gravity', 'considerably']


In [8]:
def enSynonyms(txt, n=5):
    """Return up to `n` nearest neighbours of `txt` from `en_model`.

    `txt` is a str; it is UTF-8 encoded for the lookup and every neighbour
    is decoded back to str. The second element of each (word, score) pair
    returned by `closest` is discarded.
    """
    return [word.decode() for word, _score in en_model.closest(txt.encode(), n)]

def zhSynonyms(txt, n=5):
    """Return up to `n` nearest neighbours of `txt` from `zh_model`.

    The query is passed through `oct2s` before the lookup and each neighbour
    is passed through `ocs2t` afterwards, so callers supply and receive the
    same script. The second element of each (word, score) pair is discarded.
    """
    query = oct2s.convert(txt).encode()
    return [ocs2t.convert(raw.decode()) for raw, _score in zh_model.closest(query, n)]


In [9]:
# Pick one query word per language by list index (change the index to try another).
wen = ['basis', 'fixture', 'widely'][2]
wzh = ['恨', '程度', '前提'][2]
# Only the English lookup is active; the Chinese one is left commented out.
enSynonyms(wen, n=15)
#zhSynonyms(wzh, n=15)

['broadly',
 'widespread',
 'disseminate',
 'extensively',
 'universally',
 'commonly',
 'generally',
 'wide',
 'broad',
 'extensive',
 'publicize',
 'dissemination',
 'readily',
 'publicity',
 'diffuse']

In [10]:
# English query word chosen by index; print its 20 closest target-side words,
# converted with `ocs2t` for display.
w = [
    'farmland', 'pasture', 'entrench', 'scope', 'unduly',
    'extrapolate', 'demographic', 'fixture', 'sequential', 'widely',
][9]
zhlist = model.trg_closest(w.encode('utf-8'), n=20)
print([ocs2t.convert(raw.decode('utf-8')) for raw, _score in zhlist])

['廣爲', '人知', '散發', '廣泛', '廣大', '傳播', '流傳', '普遍', '散播', '普遍地', '公衆', '涉面', '家喻戶曉', '大範圍', '廣予', '大衆', '廣爲人知', '更廣', '廣泛地', '散佈']


In [11]:
def analogy(w1, w2, w3):
    '''
    Solve and print a cross-lingual analogy of the type:
        w1 : w2 :: w3 : __

    w1 and w2 are looked up in `en_model`; w3 is converted with `oct2s` and
    looked up in `zh_model`. Candidates are the 15 nearest `zh_model`
    neighbours of  vec(w3) + (vec(w2) - vec(w1)), converted with `ocs2t`,
    minus any candidate that equals one of the three inputs.

    Prints the best candidate followed by the full candidate list, or ':-('
    when no candidate is available (for example when a lookup fails).
    Returns None.
    '''
    closest_words = []
    try:
        w1v = en_model.word_vec(w1.encode('utf8'))
        w2v = en_model.word_vec(w2.encode('utf8'))
        w3v = zh_model.word_vec(oct2s.convert(w3).encode('utf8'))
        w4v = w3v + (w2v - w1v)
        inputs = (w1, w2, w3)
        neighbours = (
            ocs2t.convert(raw.decode('utf8'))
            for (raw, _score) in zh_model.closest_to_vec(w4v, n=15)
        )
        closest_words = [cand for cand in neighbours if cand not in inputs]
    except Exception as err:
        # Still best-effort, but report the cause instead of hiding it, and
        # let KeyboardInterrupt / SystemExit propagate (a bare `except:`
        # would swallow those too).
        print('analogy({!r}, {!r}, {!r}) failed: {!r}'.format(w1, w2, w3, err))
    if not closest_words:
        print(':-(')
    else:
        print('{} : {} :: {} : {}'.format(w1, w2, w3, closest_words[0]))
        print(closest_words)

In [12]:
# English pair man -> king, applied to the Chinese word 女.
w1, w2, w3 = 'man', 'king', '女'
analogy(w1, w2, w3)

man : king :: 女 : 王后
['王后', '議員', '女皇', '國王', '公主', '負責人', '奧克蘭', '愛丁堡', '錫蘭', '親王', '陛下', '議員們', '雪蘭莪州', '全國']


In [13]:
# English pair man -> doctor, applied to the Chinese word 女.
w1, w2, w3 = 'man', 'doctor', '女'
analogy(w1, w2, w3)

man : doctor :: 女 : 醫生
['醫生', '護士', '心理學家', '助產士', '醫務', '醫師', '藥劑師', '獄醫', '醫學院', '工作者', '內科', '就醫', '看醫生', '行醫']


In [14]:
# English pair male -> lawyer, applied to the Chinese word 女性.
w1, w2, w3 = 'male', 'lawyer', '女性'
analogy(w1, w2, w3)

male : lawyer :: 女性 : 律師
['律師', '辯護人', '律師們', '法官', '醫生', '公設', 'Bulgan', 'Robbin-Coker', '公證員', '執業', '心理學家', '記者', '工作者', '辯護', 'K-12']


In [15]:
# Other triples tried in this cell:
#   ('smear', 'libel', '注意'), ('paltry', 'significance', '平凡')
w1, w2, w3 = 'hangar', 'airplane', '車庫'
analogy(w1, w2, w3)

hangar : airplane :: 車庫 : 停車場
['停車場', '停車位', '客機', '停在', '起降', '旅客', '行人', '上車', '載客', '走道', '登機', '乘客', '停車', '運輸機']


In [16]:
# Reverse direction: the offset is taken between two Chinese words (w1, w2)
# and applied to an English word (w3); neighbours are searched in en_model.
w1, w2, w3 = '犯罪', '打擊', 'crime'

# Alternative triple:
#w1, w2, w3 = '疾病', '先天', 'disease'

w1v = zh_model.word_vec(oct2s.convert(w1).encode('utf8'))
w2v = zh_model.word_vec(oct2s.convert(w2).encode('utf8'))
w3v = en_model.word_vec(w3.encode('utf8'))
w4v = w3v + (w2v - w1v)
closest_words = [
    (word.decode(), score)
    for word, score in en_model.closest_to_vec(w4v, n=15)
]
closest_words

[('terrorism', 0.5728011727333069),
 ('fight', 0.566207230091095),
 ('combat', 0.5621457695960999),
 ('crime', 0.5423240661621094),
 ('combating', 0.54147869348526),
 ('corruption', 0.5165905356407166),
 ('trafficking', 0.47082212567329407),
 ('scourge', 0.47066861391067505),
 ('drug-', 0.456204354763031),
 ('CORRUPTION', 0.4367782175540924),
 ('evil', 0.41706717014312744),
 ('suppress', 0.4149443805217743),
 ('bulwark', 0.41396206617355347),
 ('a/54/37', 0.4136147201061249),
 ('menace', 0.41042423248291016)]

In [17]:
# Same Chinese-offset / English-target query, with 25 neighbours.
# The first triple used to be assigned and then immediately overwritten
# (a dead store); it is kept here as a commented-out alternative instead.
#w1, w2, w3 = '榮', '殊', 'honour'
w1, w2, w3 = '現實', '殘酷', 'reality'

w1v = zh_model.word_vec(oct2s.convert(w1).encode('utf8'))
w2v = zh_model.word_vec(oct2s.convert(w2).encode('utf8'))
w3v = en_model.word_vec(w3.encode('utf8'))
w4v = w3v + (w2v - w1v)
closest_words = [
    (word.decode(), score)
    for word, score in en_model.closest_to_vec(w4v, n=25)
]
closest_words

[('brutal', 0.7233069539070129),
 ('cruelty', 0.7097128629684448),
 ('merciless', 0.6643000841140747),
 ('barbarity', 0.6310014128684998),
 ('brutality', 0.6243674755096436),
 ('tyrannical', 0.6238163709640503),
 ('inhumane', 0.6227946281433105),
 ('atrocious', 0.6057685017585754),
 ('appalling', 0.6051443219184875),
 ('savage', 0.5973511934280396),
 ('barbarous', 0.5931642651557922),
 ('vile', 0.5890387892723083),
 ('savagery', 0.5860393047332764),
 ('oppression', 0.5858331918716431),
 ('inhuman', 0.5800017714500427),
 ('cruelly', 0.5778768062591553),
 ('brutally', 0.5777223706245422),
 ('barbaric', 0.5751890540122986),
 ('atrocity', 0.5711759924888611),
 ('terrible', 0.5679929256439209),
 ('humiliating', 0.5676820874214172),
 ('horrendous', 0.5666106343269348),
 ('humiliation', 0.5654634237289429),
 ('ordeal', 0.5634199380874634),
 ('ruthless', 0.5585915446281433)]

In [18]:
# English pair rain -> heavy, applied to the Chinese word 雨.
w1, w2, w3 = 'rain', 'heavy', '雨'
analogy(w1, w2, w3)

rain : heavy :: 雨 : 重型
['重型', '繁重', '沉重', '過重', '重物', '負擔', '搬運', '攜帶', '過多', '扛', '龐大', '背上', '嚴', '軍用型', '揹負']


In [19]:
# English pair reform -> swift, applied to the Chinese word 改革.
w1, w2, w3 = 'reform', 'swift', '改革'
analogy(w1, w2, w3)

reform : swift :: 改革 : 迅速
['迅速', '快速', '迅捷', '立即', '從速', '很快', '更快', '反應', '儘速', '快捷', '迅即', '加快', '馬上', '果斷', '響應']


In [20]:
# English pair investigation -> relentless, applied to the Chinese word 調查.
w1, w2, w3 = 'investigation', 'relentless', '調查'
analogy(w1, w2, w3)

investigation : relentless :: 調查 : 無情
['無情', '奮鬥', '掙扎', '不懈地', '堅韌不拔', '光榮', '鍥而不捨', '狡猾', '老套', '英勇', '富足', '呈正', '再現', '中庸', '無與倫比']


In [21]:
# English pair honor -> distinct, applied to the Chinese word 榮譽.
w1, w2, w3 = 'honor', 'distinct', '榮譽'
analogy(w1, w2, w3)

honor : distinct :: 榮譽 : 區別於
['區別於', '有別於', '獨特性', '稱謂', '屬性', '特徵', '不同於', '特性', '與衆不同', '截然不同', '獨特', '嘉獎', '稱號', '殊榮']


In [22]:
# English pair organisation -> nonprofit, applied to the Chinese word 機構.
w1, w2, w3 = 'organisation', 'nonprofit', '機構'
analogy(w1, w2, w3)

organisation : nonprofit :: 機構 : 機關
['機關', '部門', '機制', '實體', '系統', '署', '體制', '構', '組織', '第49/178', '部委', '職能', '當局', '領域']


In [23]:
# Cross-lingual similarity between one English and one Chinese word.
#import numpy as np
# NOTE: w1 and w2 are rebound from str to UTF-8 bytes in this cell, so they
# no longer hold str values for any cell run afterwards.
w1 = 'nonprofit'
w1 = w1.encode('utf8')
w2 =  '女孩'
w2 = oct2s.convert(w2).encode('utf8')
# x and y are only needed by the commented-out norm at the bottom of the cell.
x = en_model.word_vec(w1)
y = zh_model.word_vec(w2)
# Print the 10 closest target-side words for w1 with their scores.
# NOTE: the loop variables w and s remain bound at notebook scope afterwards.
for (w, s) in model.trg_closest(w1, n=10):
    print(ocs2t.convert(w.decode('utf8')), s)
# Last expression of the cell: displayed as the cell's result.
model.similarity(w1,w2)
#print(np.linalg.norm(x-y))


非盈利 0.5742483139038086
非贏利 0.5694575309753418
營利性 0.5489251613616943
非營利 0.5425974726676941
非贏利性 0.5210208296775818
非盈利性 0.5199751853942871
非營利性 0.511893093585968
慈善 0.5034415125846863
organization 0.49441787600517273
non-governmental 0.47668296098709106


-0.030717693269252777

In [24]:
import math
def euclidean_dist(p1, p2):
    """Return the Euclidean (L2) distance between two equal-length vectors.

    Parameters
    ----------
    p1, p2 : sequences of numbers (list, tuple, 1-D array, ...)

    Returns
    -------
    float
        sqrt(sum((a - b) ** 2)) over corresponding components.

    Raises
    ------
    ValueError
        If the two vectors do not have the same length.
    """
    if len(p1) != len(p2):
        # zip() would silently truncate to the shorter vector.
        raise ValueError('vectors must have the same length: {} != {}'.format(len(p1), len(p2)))
    return math.sqrt(sum((a - b) ** 2 for a, b in zip(p1, p2)))

In [25]:
def en2zht(txt, n=5):
    """Return up to `n` target-side words closest to the source-side word `txt`.

    `txt` is UTF-8 encoded for `model.trg_closest`; each result is decoded
    and passed through `ocs2t`. The second element of each (word, score)
    pair is discarded.
    """
    hits = model.trg_closest(txt.encode('utf8'), n)
    return [ocs2t.convert(raw.decode('utf8')) for raw, _score in hits]

def zht2en(txt, n=5):
    """Return up to `n` source-side words closest to the target-side word `txt`.

    `txt` is passed through `oct2s` and UTF-8 encoded for
    `model.src_closest`; each result is decoded to str. The second element
    of each (word, score) pair is discarded.
    """
    query = oct2s.convert(txt).encode('utf8')
    return [raw.decode('utf8') for raw, _score in model.src_closest(query, n)]

In [26]:
# NOTE: this rebinds `enSynonyms`, which an earlier cell of this notebook
# already defines with equivalent behaviour.
def enSynonyms(txt, n=5):
    """Look up `txt` in `en_model` and return up to `n` neighbours as str.

    The query is UTF-8 encoded and every neighbour is UTF-8 decoded; the
    second element of each (word, score) pair is dropped.
    """
    query = txt.encode('utf8')
    neighbours = en_model.closest(query, n)
    return [raw.decode('utf8') for raw, _score in neighbours]

def zhtSynonyms(txt, n=5):
    """Look up `txt` in `zh_model` and return up to `n` neighbours as str.

    The query is passed through `oct2s` and UTF-8 encoded; every neighbour
    is decoded and passed through `ocs2t`. The second element of each
    (word, score) pair is dropped.
    """
    query = oct2s.convert(txt).encode('utf8')
    neighbours = zh_model.closest(query, n)
    return [ocs2t.convert(raw.decode('utf8')) for raw, _score in neighbours]


In [27]:
# Monolingual neighbours of 'promote', using the function's default n.
enSynonyms('promote')

['foster', 'promotion', 'facilitate', 'stimulate', 'enhance']

In [28]:
# Monolingual neighbours of a Chinese word, asking for 15 results.
zhtSynonyms('針對', n=15)

['對付',
 '對',
 '針對性',
 '對象',
 '施暴',
 '涉及',
 '矛頭',
 '瞄準',
 '面向',
 '因應',
 '對準',
 '指向',
 '具體',
 '如對',
 '受害']

In [29]:
# Chinese query words keyed by integer index; change the index in the last
# line to try another one.
# NOTE(review): '穩健' appears twice (keys 7 and 18).
words = {0:'壓倒性', 1:'打擊', 2:'犯罪', 3:'報到', 4:'别有用心',
         5:'紛亂',   6:'凌駕', 7:'穩健', 8:'款項', 9:'封建',
         10:'基礎', 11:'變遷', 12:'動態', 13:'鞭策', 14:'督促',
         15:'永恆', 16:'法則', 17:'亙古不變', 18:'穩健', 19:'維護',
         20:'檢舉', 21:'天才', 22:'繼續', 23:'推廣', 24:'宣導',
         25:'崗位', 26:'思維', 27:'幹部', 28:'持續', 29:'永續', 30:'焚燒',
         31:'阻絕', 32:'斷絕', 33:'體現', 34:'用途', 35:'著作', 36:'針對',
         37:'結合'
        }
# Print the 15 closest source-side words for words[12] on one comma-separated line.
print(', '.join(zht2en(words[12], 15)))

dynamic, evolution, trend, evolve, demographic, crustal, recent, dynamically, landscape, dynamical, abreast, interrelationship, tectonic, migration, emerge


In [30]:
# Rebinds `words` to English query words keyed by integer index.
words = {0:'shirk', 1:'credibility', 2:'legitimacy', 3:'cynical', 4:'permeate',
         5:'tergiversate', 6:'eternal', 7:'crusade', 8:'splinter', 9:'whining',
         10:'product'
        }
# Show the 10 closest target-side words for words[6].
en2zht(words[6], n=10)

['永恆', '熱愛', '夢想', '神聖', '尊崇', '神', '美麗', '象徵', '靈魂', '祝福']

In [31]:
# Rebinds `words` again; keys 7-14 are empty-string placeholders.
words = {0:'推廣', 1:'抱怨', 2:'作品', 3:'格局', 4:'事業',
         5:'掛牌', 6:'定位', 7:'', 8:'', 9:'',
         10:'', 11:'', 12:'', 13:'', 14:''}

# For the same word: print the monolingual neighbours, then display the
# cross-lingual ones as the cell's result.
print(zhtSynonyms(words[3], 12))
zht2en(words[3], 12)

['型態', '趨勢', '趨向', '模式', '規律', '型式', '動態', '變換', '動向', '形態', '變化', '失衡']


['pattern',
 'trend',
 'change',
 'evolution',
 'evolve',
 'changed',
 'demographic',
 'skewed',
 'consumption',
 'dynamic',
 'pyramid',
 'landscape']

In [32]:
similarity_ngrams?

Object `similarity_ngrams` not found.


In [33]:
# Compare a space-separated English phrase with a Chinese phrase (converted
# with `oct2s`); both are passed to the model as UTF-8 bytes.
w1 = 'similar today activity'.encode('utf8')
w2 = oct2s.convert('類似 活動 今天').encode('utf8')
model.similarity_ngrams(w1, w2)

0.14274300634860992

$$
r = \sqrt{x^2 + y^2}
$$

In [34]:
# Inspect what `oct2s` produces for a space-separated phrase.
w2 =  '類似 活動'
# The encode step is left disabled so the converted str is displayed as-is.
#w2 = oct2s.convert(w2).encode('utf8')
oct2s.convert(w2)

'类似 活动'

In [35]:
# Same phrase pair with sentence-final punctuation tokens added, scored with
# the bag-of-words similarity instead of the n-gram one.
w1 = 'similar today activity .'.encode('utf8')
w2 = oct2s.convert('類似 活動 今天 。').encode('utf8')
model.similarity_bag_of_words(w1, w2)

0.810400128364563

In [36]:
# An English word looked up directly in the target-side model. The result
# mixes English and UTF-8 encoded Chinese tokens, so the target vocabulary
# evidently contains both. Results are raw (bytes, score) pairs.
w = 'racism'
model.trg_model.closest(w.encode(), 20)

[(b'intolerance', 0.7335898876190186),
 (b'\xe6\x8e\x92\xe5\xa4\x96\xe4\xb8\xbb\xe4\xb9\x89', 0.6756048798561096),
 (b'\xe4\xb8\x8d\xe5\xae\xbd\xe5\xae\xb9', 0.6631632447242737),
 (b'\xe5\xbc\x82\xe5\xb7\xb1', 0.6553645133972168),
 (b'A/57/204', 0.6120954155921936),
 (b'Racism', 0.5789712071418762),
 (b'\xe5\xbf\x83\xe7\x90\x86.', 0.5706595182418823),
 (b'\xe6\x8e\x92\xe5\xa4\x96', 0.5623793005943298),
 (b'\xe4\xbb\x87\xe8\xa7\x86', 0.5460746884346008),
 (b'\xe9\x98\xbf\xe6\xb1\x89\xe6\xb1\x89\xe4\xbd\x90', 0.5401766300201416),
 (b'56/267', 0.5376922488212585),
 (b'\xe5\x8f\x8d\xe6\xad\xa7\xe8\xa7\x86', 0.5332741737365723),
 (b'\xe5\xae\xb9\xe5\xbf\x8d', 0.5312885046005249),
 (b'\xe7\xa7\x8d\xe6\x97\x8f', 0.5269216299057007),
 (b'\xe5\x8f\x8d\xe7\xa7\x8d\xe6\x97\x8f', 0.5264531970024109),
 (b'\xe8\x8e\xab\xe9\x87\x8c\xe6\x96\xaf\xc2\xb7\xe6\xa0\xbc\xe8\x8e\xb1\xe8\x8e\xb1\xc2\xb7\xe9\x98\xbf\xe6\xb1\x89\xe6\xb1\x89\xe4\xbd\x90',
  0.5204504728317261),
 (b'\xe4\xb8\x8d\xe5\xae\xb9\xe5\x

In [37]:
# Display the model's `window_size` attribute.
model.window_size

8

In [38]:
# Display the vector the target-side model holds for the English token 'policy'.
model.trg_model.word_vec('policy'.encode())

array([-9.26872715e-02,  4.45970073e-02,  3.88409290e-03,  2.05608383e-02,
        3.64419609e-01, -3.87760364e-02, -1.87767167e-02, -1.88943714e-01,
       -1.04873581e-02,  1.51205003e-01, -2.56903589e-01, -8.46443027e-02,
        2.05461964e-01, -6.24464601e-02, -2.54521906e-01,  4.23786044e-03,
       -3.28986526e-01, -1.32155731e-01,  4.63660061e-02, -2.53398150e-01,
       -1.40020534e-01,  1.47980303e-01, -2.28821799e-01, -5.03258184e-02,
       -3.08285296e-01,  2.22403612e-02, -2.34639451e-01,  3.44327956e-01,
       -8.60769153e-02,  5.24819680e-02,  1.15766719e-01, -2.98959851e-01,
        1.50657147e-01,  1.74558297e-01, -9.36421752e-03, -9.59974602e-02,
       -4.16328251e-01,  3.28109354e-01,  4.22030967e-03, -4.08003516e-02,
        3.13971601e-02, -2.82063801e-02, -1.97839588e-01, -4.90760691e-02,
        2.64514200e-02, -2.38319784e-01,  1.81103237e-02, -2.17561692e-01,
       -9.01859775e-02, -1.67960837e-01, -1.07588060e-01,  1.84607133e-01,
        3.27855200e-01,  

In [39]:
# Attribute access only (no call): this displays the function object and
# does not save anything.
model.src_model.save_sent_vectors

<function MonolingualModel.save_sent_vectors>