In [1]:
import numpy as np
from sklearn.cluster import KMeans

from gensim.models.keyedvectors import KeyedVectors
import procrustes
from scipy.stats import ortho_group, wasserstein_distance

import os
import sys
import urllib.request
import requests
import time
import datetime
import pickle
import json

In [2]:
def embedding_load():
    """
    Load pretrained embedding vec
    """
    en_model = KeyedVectors.load_word2vec_format('./fasttext/wiki.en.vec')
    ko_model = KeyedVectors.load_word2vec_format('./fasttext/wiki.ko.vec')
    
    return en_model, ko_model

In [3]:
en_model, ko_model = embedding_load()

In [9]:
with open('./data/ko_noun_dict(vocab_size-1250).pkl', "rb") as f:
    ko_noun_dict = pickle.load(f)
    
with open('./data/en_noun_dict(vocab_size-1250).pkl', "rb") as f:
    en_noun_dict = pickle.load(f)
    
with open('./data/ko_verb_dict(vocab_size-547).pkl', "rb") as f:
    ko_verb_dict = pickle.load(f)
    
with open('./data/en_verb_dict(vocab_size-547).pkl', "rb") as f:
    en_verb_dict = pickle.load(f)

In [70]:
ko_data = np.array(list(ko_noun_dict.values()) + list(ko_verb_dict.values()))
en_data = np.array(list(en_noun_dict.values()) + list(en_verb_dict.values())) 
ko_noun_data = np.array(list(ko_noun_dict.values()))
en_noun_data = np.array(list(en_noun_dict.values())) 
ko_verb_data = np.array(list(ko_verb_dict.values()))
en_verb_data = np.array(list(en_verb_dict.values())) 

In [59]:
len(ko_data)

1797

In [67]:
result_orth = procrustes.orthogonal(ko_data, en_data, scale=False, translate=False)
result_generic = procrustes.generic(ko_data, en_data, scale=False, translate=False)
# result_orth_2side = procrustes.orthogonal_2sided(ko_data, en_data, scale=False, translate=False)
result_orth_2side_2 = procrustes.orthogonal_2sided(ko_data, en_data, single=False, scale=False, translate=False)
result_permut = procrustes.permutation(ko_data, en_data)
# result_soft = procrustes.softassign(ko_data, en_data, kopt=2)
result_rotate = procrustes.rotational(ko_data, en_data)
result_symmetric = procrustes.symmetric(ko_data, en_data)

# result_kopt = procrustes.kopt.ko

In [81]:
def train_procrustes(ko_data, en_data, print_f=True):
    
#     ko_data = ko_data[:1000]
#     en_data = en_data[:500]
    
    result_orth = procrustes.orthogonal(ko_data, en_data, scale=False, translate=False)
    result_generic = procrustes.generic(ko_data, en_data, scale=False, translate=False)
    # result_orth_2side = procrustes.orthogonal_2sided(ko_data, en_data, scale=False, translate=False)
    result_orth_2side_2 = procrustes.orthogonal_2sided(ko_data, en_data, single=False, scale=False, translate=False)
    result_permut = procrustes.permutation(ko_data, en_data)
    # result_soft = procrustes.softassign(ko_data, en_data, kopt=2)
    result_rotate = procrustes.rotational(ko_data, en_data)
    result_symmetric = procrustes.symmetric(ko_data, en_data)
    print(len(ko_data))
    if print_f:
        print('Orth error : ', result_orth.error / len(ko_data))
        print('Generic error : ', result_generic.error/ len(ko_data))
        print('Permute error : ', result_permut.error/ len(ko_data))
        print('Rotate error : ', result_rotate.error/ len(ko_data))
        print('Symmetric error : ', result_symmetric.error/ len(ko_data))
    
    else:
        return 0 

In [82]:
train_procrustes(ko_noun_data, en_noun_data)
train_procrustes(ko_verb_data, en_verb_data)

1250
Orth error :  16.40404037708212
Generic error :  9.524618462204234
Permute error :  33.82008071566402
Rotate error :  16.404044191924118
Symmetric error :  12.009193847268763
547
Orth error :  12.555398170736241
Generic error :  4.819750563867699
Permute error :  23.232146164195456
Rotate error :  12.555442788835053
Symmetric error :  8.104849233360943


In [68]:
print('orth :',  result_orth.error / 1797)
result_generic.error / 1797, result_orth_2side_2.error / 1797, result_permut.error / 1797, result_rotate.error / 1797, result_symmetric.error/ 1797

orth : 17.63603402100375


(10.306013171183766,
 0.8793697731406674,
 31.84782558862164,
 17.63611423973675,
 12.12190783264706)

In [16]:
result_generic

 error: 18519.905668617226
 new_a: array([[-0.46317  ,  0.35326  , -0.61864  , ..., -0.40615  ,  0.30542  ,
        -0.52318  ],
       [ 0.21332  , -0.0042801, -0.22429  , ..., -0.39047  ,  0.077551 ,
        -0.39871  ],
       [-0.16752  , -0.16467  , -0.35177  , ..., -0.15186  , -0.017687 ,
        -0.62178  ],
       ...,
       [-0.053361 , -0.26477  , -0.18502  , ..., -0.34708  , -0.20677  ,
        -0.109    ],
       [-0.33763  ,  0.22257  , -0.47841  , ...,  0.11032  , -0.10531  ,
        -0.067513 ],
       [ 0.0045994, -0.016532 , -0.059695 , ..., -0.041754 , -0.13798  ,
         0.13473  ]], dtype=float32)
 new_b: array([[ 0.039245 ,  0.026392 , -0.08336  , ...,  0.74819  , -0.24179  ,
         0.61913  ],
       [ 0.13463  , -0.14842  , -0.0069594, ..., -0.022981 ,  0.42084  ,
        -0.01948  ],
       [-0.14917  , -0.090087 , -0.4054   , ..., -0.060252 ,  0.10322  ,
         0.2516   ],
       ...,
       [-0.16571  , -0.024714 , -0.20383  , ...,  0.55517  ,  0.082316 

In [17]:
sample_change_vec = np.dot(ko_model.vectors[:5000], result_generic.t)
sample_change_vocab = list(ko_model.vocab)[:5000]

In [39]:
sample_change_vocab[4000:5020]

['되었다고',
 '관계는',
 '필요로',
 'arm',
 '포',
 '사고로',
 '들어간',
 '시와',
 '동일',
 '대한제국의',
 '끝으로',
 '주기',
 '샐러맨더',
 '身',
 '크기가',
 '만화가',
 '궤도',
 '막',
 '상태에',
 '않는다는',
 '基',
 '디젤',
 '있다가',
 'set',
 '본부',
 '계기가',
 '가스',
 '신화의',
 '구간은',
 '거기에',
 '섬식',
 '아테네',
 '엔딩',
 '모양으로',
 '값을',
 'air',
 '일부로',
 '하다가',
 '특집',
 '徐',
 '숫자',
 '의석',
 'facebook',
 '체중',
 'msl',
 '불러',
 '참가하였다',
 'nba',
 'に',
 '상황',
 '남긴',
 '탄생',
 '不',
 'mp',
 '사이를',
 '대중',
 'youtube',
 '자유를',
 '이해',
 '기록하며',
 '전원',
 'pp',
 '현역',
 '제외',
 '일어난다',
 '후한',
 '관측',
 '정원',
 '담은',
 '記',
 '않았던',
 '매',
 '땅을',
 '的',
 '출판',
 '명예',
 '하',
 '청년',
 '靑',
 '방송된',
 '루수',
 '청주',
 '다큐멘터리',
 '없어서',
 '주전',
 '카메라',
 '定',
 '곧바로',
 '정도가',
 '집에서',
 '배경',
 '혼자',
 'der',
 '퍼시픽',
 '민영화에',
 '이중',
 '설치되었다',
 '하우스',
 '영구',
 '발매한',
 '표현',
 '라이선스',
 '찾기',
 '천체',
 '샌프란시스코',
 '트윈스',
 '개인적인',
 '전적',
 '세계에',
 '직책',
 '시즈오카',
 '주어진',
 '미군',
 '사유',
 '확대',
 '河',
 '특수한',
 '중국에서',
 '예정',
 '편의',
 '리더',
 '참',
 '西',
 '코난',
 '이끌었다',
 '닌텐도',
 '인구의',
 'mr',
 '지나치게',
 '이어지는',
 '가을',
 '시조

In [18]:
en_model.add(sample_change_vocab, sample_change_vec, replace=True)

In [46]:
en_model.wv.most_similar('관계는', topn=100)

  if __name__ == '__main__':


[('관계가', 0.8391407132148743),
 ('관계를', 0.7764248251914978),
 ('관계에', 0.7076321244239807),
 ('관련이', 0.6967226266860962),
 ('관계', 0.6891700625419617),
 ('직접적인', 0.6761817932128906),
 ('상황이', 0.6239739656448364),
 ('외교', 0.6234864592552185),
 ('태도를', 0.6218520998954773),
 ('사이가', 0.6161556243896484),
 ('입장을', 0.6150044202804565),
 ('그러나', 0.6109069585800171),
 ('서로', 0.6048915386199951),
 ('유지하고', 0.5998672246932983),
 ('관련하여', 0.5971484780311584),
 ('간의', 0.5947040319442749),
 ('만나', 0.5940428972244263),
 ('하면서', 0.5927482843399048),
 ('이는', 0.5892982482910156),
 ('일은', 0.5884603261947632),
 ('그리하여', 0.587111234664917),
 ('관계로', 0.5869655609130859),
 ('결과는', 0.5868374109268188),
 ('전혀', 0.586111307144165),
 ('하지만', 0.5860042572021484),
 ('관심이', 0.5847064852714539),
 ('반응을', 0.5808483362197876),
 ('노력을', 0.5787227153778076),
 ('상황을', 0.5786213278770447),
 ('관련된', 0.5782474875450134),
 ('상호', 0.5781601667404175),
 ('정부는', 0.5779614448547363),
 ('성격이', 0.5774530172348022),
 ('공식적인', 0.57712

In [47]:
tic = time.time()
change_vec = np.dot(ko_model.vectors, result_generic.t)
change_vocab = list(ko_model.vocab)
en_model.add(change_vocab, change_vec, replace=True)
tok = time.time()
tictok = str(datetime.timedelta(seconds=tok-tic))
print(f'Transform and Add process end. Time spend : {tictok}')

Transform and Add process end. Time spend : 0:00:08.790378


In [58]:
en_model.wv.most_similar('프랑크')

  if __name__ == '__main__':


IndexError: index 2522956 is out of bounds for axis 0 with size 2522955

In [54]:
en_model.add(change_vocab, change_vec, replace=True)

In [57]:
en_model.index2word[2522956]

'프랑크'

In [None]:
len(en_model.vocab)

In [None]:
en_model.word_vec('사라졌다')