In [1]:
import numpy as np
from sklearn.cluster import KMeans

from gensim.models.keyedvectors import KeyedVectors
import procrustes
from scipy.stats import ortho_group, wasserstein_distance

import os
import sys
import urllib.request
import requests
import time
import datetime
import pickle
import json

In [2]:
def embedding_load():
    """
    Load pretrained embedding vec
    """
    en_model = KeyedVectors.load_word2vec_format('./fasttext/wiki.en.vec')
    ko_model = KeyedVectors.load_word2vec_format('./fasttext/wiki.ko.vec')
    
    return en_model, ko_model

In [113]:
en_model, ko_model = embedding_load()

In [9]:
with open('./data/ko_noun_dict(vocab_size-1250).pkl', "rb") as f:
    ko_noun_dict = pickle.load(f)
    
with open('./data/en_noun_dict(vocab_size-1250).pkl', "rb") as f:
    en_noun_dict = pickle.load(f)
    
with open('./data/ko_verb_dict(vocab_size-547).pkl', "rb") as f:
    ko_verb_dict = pickle.load(f)
    
with open('./data/en_verb_dict(vocab_size-547).pkl', "rb") as f:
    en_verb_dict = pickle.load(f)

In [70]:
ko_data = np.array(list(ko_noun_dict.values()) + list(ko_verb_dict.values()))
en_data = np.array(list(en_noun_dict.values()) + list(en_verb_dict.values())) 
ko_noun_data = np.array(list(ko_noun_dict.values()))
en_noun_data = np.array(list(en_noun_dict.values())) 
ko_verb_data = np.array(list(ko_verb_dict.values()))
en_verb_data = np.array(list(en_verb_dict.values())) 

In [59]:
len(ko_data)

1797

In [67]:
result_orth = procrustes.orthogonal(ko_data, en_data, scale=False, translate=False)
result_generic = procrustes.generic(ko_data, en_data, scale=False, translate=False)
# result_orth_2side = procrustes.orthogonal_2sided(ko_data, en_data, scale=False, translate=False)
result_orth_2side_2 = procrustes.orthogonal_2sided(ko_data, en_data, single=False, scale=False, translate=False)
result_permut = procrustes.permutation(ko_data, en_data)
# result_soft = procrustes.softassign(ko_data, en_data, kopt=2)
result_rotate = procrustes.rotational(ko_data, en_data)
result_symmetric = procrustes.symmetric(ko_data, en_data)

# result_kopt = procrustes.kopt.ko

In [87]:
def train_procrustes(ko_data, en_data, print_f=True, return_state='Generic'):
    
#     ko_data = ko_data[:1000]
#     en_data = en_data[:500]
    
    result_orth = procrustes.orthogonal(ko_data, en_data, scale=False, translate=False)
    result_generic = procrustes.generic(ko_data, en_data, scale=False, translate=False)
    # result_orth_2side = procrustes.orthogonal_2sided(ko_data, en_data, scale=False, translate=False)
    result_orth_2side_2 = procrustes.orthogonal_2sided(ko_data, en_data, single=False, scale=False, translate=False)
    result_permut = procrustes.permutation(ko_data, en_data)
    # result_soft = procrustes.softassign(ko_data, en_data, kopt=2)
    result_rotate = procrustes.rotational(ko_data, en_data)
    result_symmetric = procrustes.symmetric(ko_data, en_data)
    print(len(ko_data))
    if print_f:
        print('Orth error : ', result_orth.error / len(ko_data))
        print('Generic error : ', result_generic.error/ len(ko_data))
        print('Permute error : ', result_permut.error/ len(ko_data))
        print('Rotate error : ', result_rotate.error/ len(ko_data))
        print('Symmetric error : ', result_symmetric.error/ len(ko_data))
        if return_state == 'Generic':
            return result_generic
    
    else:
        return 0 

In [88]:
noun_ko2en = train_procrustes(ko_noun_data, en_noun_data)
noun_en2ko = train_procrustes(en_noun_data, ko_noun_data)
verb_ko2en = train_procrustes(ko_verb_data, en_verb_data)
verb_en2ko = train_procrustes(en_verb_data, ko_verb_data)

1250
Orth error :  16.40404037708212
Generic error :  9.524618462204234
Permute error :  33.82008071566402
Rotate error :  16.404044191924118
Symmetric error :  12.009193847268763
1250
Orth error :  16.40404037708212
Generic error :  9.711636035447986
Permute error :  33.82008071566402
Rotate error :  16.40404296303643
Symmetric error :  12.906590739041302
547
Orth error :  12.555398170736241
Generic error :  4.819750563867699
Permute error :  23.232146164195456
Rotate error :  12.555442788835053
Symmetric error :  8.104849233360943
547
Orth error :  12.55539585898217
Generic error :  5.309752554069588
Permute error :  23.232146164195456
Rotate error :  12.555441739279884
Symmetric error :  9.216309826209493


In [115]:
# sample_change_vec = np.dot(ko_model.vectors[:5000], result_generic.t)
# sample_change_vocab = list(ko_model.vocab)[:5000]
change_vec_noun = np.dot(en_model.vectors, noun_en2ko.t)
change_vec_verb = np.dot(en_model.vectors, verb_en2ko.t)

In [116]:
change_vec_noun.shape, change_vec_verb.shape

((2519370, 300), (2519370, 300))

In [130]:
from scipy.stats import wasserstein_distance

distance = 0
for i in range(50000):
    distance += wasserstein_distance(change_vec_noun[i], change_vec_verb[i])
print(distance/50000)

0.1432145169224416


In [122]:
en_model.vocab

{',': <gensim.models.keyedvectors.Vocab at 0x7fe278aedb70>,
 '.': <gensim.models.keyedvectors.Vocab at 0x7fe278ca7a90>,
 'the': <gensim.models.keyedvectors.Vocab at 0x7fe278aedba8>,
 '</s>': <gensim.models.keyedvectors.Vocab at 0x7fe278ca7780>,
 'of': <gensim.models.keyedvectors.Vocab at 0x7fe278aedc18>,
 '-': <gensim.models.keyedvectors.Vocab at 0x7fe278ca7710>,
 'in': <gensim.models.keyedvectors.Vocab at 0x7fe278aedc50>,
 'and': <gensim.models.keyedvectors.Vocab at 0x7fe278ca7748>,
 "'": <gensim.models.keyedvectors.Vocab at 0x7fe278aedcc0>,
 ')': <gensim.models.keyedvectors.Vocab at 0x7fe278ca7ef0>,
 '(': <gensim.models.keyedvectors.Vocab at 0x7fe278aedcf8>,
 'to': <gensim.models.keyedvectors.Vocab at 0x7fe278ca7dd8>,
 'a': <gensim.models.keyedvectors.Vocab at 0x7fe278aedd68>,
 'is': <gensim.models.keyedvectors.Vocab at 0x7fe278ca7c50>,
 'was': <gensim.models.keyedvectors.Vocab at 0x7fe278aeddd8>,
 'on': <gensim.models.keyedvectors.Vocab at 0x7fe278ca7828>,
 's': <gensim.models.keyed

In [114]:
ko_model.vectors.shape

(879129, 300)

In [100]:
help(ko_model.add)

Help on method add in module gensim.models.keyedvectors:

add(entities, weights, replace=False) method of gensim.models.keyedvectors.Word2VecKeyedVectors instance
    Append entities and theirs vectors in a manual way.
    If some entity is already in the vocabulary, the old vector is kept unless `replace` flag is True.
    
    Parameters
    ----------
    entities : list of str
        Entities specified by string ids.
    weights: list of numpy.ndarray or numpy.ndarray
        List of 1D np.array vectors or a 2D np.array of vectors.
    replace: bool, optional
        Flag indicating whether to replace vectors for entities which already exist in the vocabulary,
        if True - replace vectors, otherwise - keep old vectors.



In [131]:
tmp_ko_model.add(list(en_model.vocab), change_vec_verb)

In [133]:
ko_model.add(list(en_model.vocab), change_vec_verb)

In [134]:
ko_model.vectors.shape

(3319279, 300)

In [135]:
ko_model.save('./data/emb_en2ko_verb.w2v')

In [18]:
en_model.add(sample_change_vocab, sample_change_vec, replace=True)

In [46]:
en_model.wv.most_similar('관계는', topn=100)

  if __name__ == '__main__':


[('관계가', 0.8391407132148743),
 ('관계를', 0.7764248251914978),
 ('관계에', 0.7076321244239807),
 ('관련이', 0.6967226266860962),
 ('관계', 0.6891700625419617),
 ('직접적인', 0.6761817932128906),
 ('상황이', 0.6239739656448364),
 ('외교', 0.6234864592552185),
 ('태도를', 0.6218520998954773),
 ('사이가', 0.6161556243896484),
 ('입장을', 0.6150044202804565),
 ('그러나', 0.6109069585800171),
 ('서로', 0.6048915386199951),
 ('유지하고', 0.5998672246932983),
 ('관련하여', 0.5971484780311584),
 ('간의', 0.5947040319442749),
 ('만나', 0.5940428972244263),
 ('하면서', 0.5927482843399048),
 ('이는', 0.5892982482910156),
 ('일은', 0.5884603261947632),
 ('그리하여', 0.587111234664917),
 ('관계로', 0.5869655609130859),
 ('결과는', 0.5868374109268188),
 ('전혀', 0.586111307144165),
 ('하지만', 0.5860042572021484),
 ('관심이', 0.5847064852714539),
 ('반응을', 0.5808483362197876),
 ('노력을', 0.5787227153778076),
 ('상황을', 0.5786213278770447),
 ('관련된', 0.5782474875450134),
 ('상호', 0.5781601667404175),
 ('정부는', 0.5779614448547363),
 ('성격이', 0.5774530172348022),
 ('공식적인', 0.57712

In [47]:
tic = time.time()
change_vec = np.dot(ko_model.vectors, result_generic.t)
change_vocab = list(ko_model.vocab)
en_model.add(change_vocab, change_vec, replace=True)
tok = time.time()
tictok = str(datetime.timedelta(seconds=tok-tic))
print(f'Transform and Add process end. Time spend : {tictok}')

Transform and Add process end. Time spend : 0:00:08.790378


In [58]:
en_model.wv.most_similar('프랑크')

  if __name__ == '__main__':


IndexError: index 2522956 is out of bounds for axis 0 with size 2522955

In [54]:
en_model.add(change_vocab, change_vec, replace=True)

In [57]:
en_model.index2word[2522956]

'프랑크'

In [None]:
len(en_model.vocab)

In [None]:
en_model.word_vec('사라졌다')