In [8]:
import numpy as np
from gensim.models import Word2Vec, KeyedVectors
from gensim.models import TranslationMatrix, translation_matrix

def normalised(mat, axis=-1, order=2):
        """Scale `mat` so that every slice along `axis` has unit `order`-norm.

        Slices whose norm is exactly zero are divided by 1 instead, so they
        come back unchanged rather than as NaN/inf.
        """
        lengths = np.linalg.norm(mat, ord=order, axis=axis, keepdims=True)
        # Guard the division: a zero norm is replaced by 1.
        zero_mask = lengths == 0
        lengths[zero_mask] = 1
        return mat / lengths
    
def export_to_KeyedVectors(transformed_source_model, target_model, file_path):
        """Write the transformed source space and the target space into one word2vec text file.

        Parameters
        ----------
        transformed_source_model
            Object exposing `.mat` (2-D array, one row per word) and
            `.index2word` (list of words in row order). In this notebook it is
            the result of `TranslationMatrix.apply_transmat`.
        target_model
            Object exposing `.init_sims()`, `.wv.vectors_norm` and
            `.wv.index2word`. In this notebook it is a gensim `Word2Vec` model.
        file_path
            Destination path. The file is written as UTF-8 text.

        The output starts with a "<rows> <dims>" header line followed by one
        "<word> <v1> <v2> ..." line per word: first all source words (rows of
        `.mat` after row-wise L2 normalisation), then all target words (rows of
        `vectors_norm`). A word present in both vocabularies is written twice,
        each time with the vector of the language it came from.

        Raises
        ------
        ValueError
            If the number of words differs from the number of vector rows.
        """
        # gensim 3.x fills `wv.vectors_norm` when init_sims() is called.
        target_model.init_sims()

        print("transform_source_model.shape = ", transformed_source_model.mat.shape)
        print("target_model.shape = ", target_model.wv.vectors_norm.shape)

        two_languages_index2word = transformed_source_model.index2word + target_model.wv.index2word
        normalised_transform_embedding = normalised(transformed_source_model.mat)
        print("Normalization is finished!")

        two_languages_word_vector = np.concatenate(
            (normalised_transform_embedding, target_model.wv.vectors_norm), axis=0)
        print("concatenate_model.shape = ", two_languages_word_vector.shape)

        n_rows, n_dims = two_languages_word_vector.shape
        if len(two_languages_index2word) != n_rows:
            raise ValueError(
                "vocabulary size (%d) does not match number of vectors (%d)"
                % (len(two_languages_index2word), n_rows))

        # `with` closes the file even if a write fails; explicit UTF-8 keeps the
        # output independent of the locale (the vocabulary contains CJK words).
        with open(file_path, 'w', encoding='utf-8') as out:
            out.write(str(n_rows) + " " + str(n_dims))
            out.write("\n")
            # Pair each word with its own row by position. Looking the row up
            # with list.index() is quadratic and returns the first occurrence,
            # which gives the wrong vector for a word that occurs in both lists.
            for each_word, vector in zip(two_languages_index2word, two_languages_word_vector):
                out.write(each_word + " ")
                out.write(' '.join(map(str, vector)) + "\n")

        print("KeyedVectors have exported to", file_path)

# English & Chinese Mapping

In [3]:
# Paths of the monolingual models. Despite the ".vec" suffix they are read
# below with Word2Vec.load, i.e. as saved gensim Word2Vec models.
chinese_vec_file = "../out/MLDoc/MLDoc-chinese-word2vec_NV_s100w5m15n10.vec"
english_vec_file = "../out/MLDoc/MLDoc-english-word2vec_NV_s100w5m15n10sam1e-5.vec"

# Source (Chinese) and target (English) models used by the mapping cells below.
chinese_model = Word2Vec.load(chinese_vec_file)
english_model = Word2Vec.load(english_vec_file)

In [4]:
# Build the (chinese, english) seed dictionary for the translation matrix.
# Each line of zh-en.txt is read as "<chinese> <english>" (space separated);
# a pair is kept only if both words are in the respective model vocabulary.

# Sets give O(1) membership tests; `word in index2word` scans a list per lookup.
chinese_vocab = set(chinese_model.wv.index2word)
english_vocab = set(english_model.wv.index2word)

word_pairs = []

# `with` guarantees the file is closed; explicit UTF-8 avoids locale-dependent decoding.
with open("/home/ponshane/Downloads/zh-en.txt", "r", encoding="utf-8") as f:
    for line in f:
        fields = line.rstrip("\n").split(" ")
        if len(fields) < 2:
            # Blank or malformed line: there is no pair to read.
            continue
        chinese_word = fields[0]
        english_word = fields[1].lower()
        if chinese_word in chinese_vocab and english_word in english_vocab:
            word_pairs.append((chinese_word, english_word))

In [5]:
# Number of dictionary pairs that passed the vocabulary filter.
len(word_pairs)

911

In [None]:
# Fit a translation matrix with the Chinese vectors as source and the English
# vectors as target, using the seed pairs collected above.
trans_model = TranslationMatrix(chinese_model.wv, english_model.wv)
trans_model.train(word_pairs)
# Sanity check: translate two Chinese words and show their 10 nearest candidates.
trans_model.translate(["增加", "政府"], topn=10)

In [49]:
# Wrap the raw Chinese vectors and their word list in a translation_matrix.Space,
# the type that apply_transmat is given below.
chinese_model_space = translation_matrix.Space(chinese_model.wv.vectors, index2word=chinese_model.wv.index2word)
# Map the whole Chinese space with the trained translation matrix.
transformed_Chinese_model = trans_model.apply_transmat(chinese_model_space)

In [58]:
# Write the transformed Chinese vectors followed by the English vectors into one file.
export_to_KeyedVectors(transformed_Chinese_model, english_model, "../out/MLDoc/Chinese_English_wordvectors.vec")

transform_source_model.shape =  (6733, 100)
target_model.shape =  (19982, 100)
Normalization is finished!
concatenate_model.shape =  (26715, 100)
KeyedVectors have exported to ../out/MLDoc/Chinese_English_wordvectors.vec


In [18]:
# Test phase: read the exported file back as KeyedVectors to check that it parses.
Concatenated_model = KeyedVectors.load_word2vec_format("../out/MLDoc/Chinese_English_wordvectors.vec")

In [31]:
# Nearest neighbours of an English word in the joint space; Chinese words in
# the result indicate how well the two languages were mapped together.
Concatenated_model.most_similar("government", topn=20)

[('goverment', 0.8741818070411682),
 ('cabinet', 0.7273430824279785),
 ('减税', 0.7260865569114685),
 ('pledge', 0.7130036354064941),
 ('parliament', 0.70453280210495),
 ('minister', 0.7030107975006104),
 ('overspend', 0.6900051236152649),
 ('reform', 0.6828949451446533),
 ('administration', 0.6777451634407043),
 ('devolve', 0.6767010688781738),
 ('opposition', 0.6755403280258179),
 ('urge', 0.6747902631759644),
 ('克拉克', 0.6732919216156006),
 ('coalition', 0.6729280948638916),
 ('财政', 0.6727367043495178),
 ('预算案', 0.6697919368743896),
 ('政府', 0.6610081791877747),
 ('追加预算', 0.6601465940475464),
 ('dovish', 0.6584186553955078),
 ('fino', 0.6577595472335815)]

In [82]:
# Export to the embedding projector: one TSV holding the vectors and one TSV
# holding per-word metadata, both in the same row order.
# `KeyedVectors.wv` is a deprecated alias of the object itself in gensim 3.x,
# so the word list is read from the KeyedVectors directly.
concatenated_words = Concatenated_model.index2word

file_path = "../out/MLDoc/Concatenated_embeddings.tsv"
# `with` closes the file even on error; explicit UTF-8 keeps output locale-independent.
with open(file_path, 'w', encoding='utf-8') as out:
    for each_word in concatenated_words:
        out.write('\t'.join(map(str, Concatenated_model[each_word])) + "\n")

# export to metadata file
file_path = "../out/MLDoc/Concatenated_metadata.tsv"
# Set lookup instead of scanning the index2word list once per word.
chinese_vocab = set(chinese_model.wv.index2word)
with open(file_path, 'w', encoding='utf-8') as out:
    out.write("word\tlanguage\n")
    for each_word in concatenated_words:
        if each_word in chinese_vocab:
            out.write(each_word+"\tchinese\n")
        else:
            out.write(each_word+"\tenglish\n")



# English & Chinese Mapping - Version II (using a hand-crafted dictionary)

In [5]:
# Paths of the monolingual models (read with Word2Vec.load despite the ".vec" suffix).
chinese_vec_file = "../out/MLDoc/MLDoc-chinese-word2vec_NV_s100w5m15n10.vec"
english_vec_file = "../out/MLDoc/MLDoc-english-word2vec_NV_s100w5m15n10sam1e-5.vec"

chinese_model = Word2Vec.load(chinese_vec_file)
english_model = Word2Vec.load(english_vec_file)

# Also dump both spaces in word2vec format; these .txt files are the ones read
# back with FastVector in the Procrustes section further down.
chinese_model.wv.save_word2vec_format("/home/ponshane/work_dir/CLTM-Experiments/Data/MLDoc/word-vectors/MLDoc-chinese-word2vec_NV_s100w5m15n10.txt")
english_model.wv.save_word2vec_format("/home/ponshane/work_dir/CLTM-Experiments/Data/MLDoc/word-vectors/MLDoc-english-word2vec_NV_s100w5m15n10sam1e-5.txt")

In [11]:
# Build the (chinese, english) seed dictionary from the hand-crafted file.
# Each line is read as "<english>,<chinese>" (comma separated); a pair is kept
# only if both words are in the respective model vocabulary.

# Sets give O(1) membership tests; `word in index2word` scans a list per lookup.
chinese_vocab = set(chinese_model.wv.index2word)
english_vocab = set(english_model.wv.index2word)

word_pairs = []

# `with` guarantees the file is closed; explicit UTF-8 avoids locale-dependent decoding.
with open("/home/ponshane/work_dir/CLTM-Experiments/Data/MLDoc/hand-craft-control-dictionary.txt", "r", encoding="utf-8") as f:
    for line in f:
        fields = line.rstrip("\n").split(",")
        if len(fields) < 2:
            # Blank or malformed line: there is no pair to read.
            continue
        chinese_word = fields[1]
        english_word = fields[0].lower()
        if chinese_word in chinese_vocab and english_word in english_vocab:
            word_pairs.append((chinese_word, english_word))

print(len(word_pairs))

In [None]:
# Fit a translation matrix (Chinese source, English target) on the pairs from
# the hand-crafted dictionary.
trans_model = TranslationMatrix(chinese_model.wv, english_model.wv)
trans_model.train(word_pairs)
# Sanity check: translate two Chinese words and show their 10 nearest candidates.
trans_model.translate(["增加", "政府"], topn=10)

In [14]:
# Wrap the raw Chinese vectors and their word list in a translation_matrix.Space,
# the type that apply_transmat is given below.
chinese_model_space = translation_matrix.Space(chinese_model.wv.vectors, index2word=chinese_model.wv.index2word)
# Map the whole Chinese space with the translation matrix trained on the hand-crafted pairs.
transformed_Chinese_model = trans_model.apply_transmat(chinese_model_space)

In [16]:
# Write the transformed Chinese vectors followed by the English vectors into one file.
export_to_KeyedVectors(transformed_Chinese_model, english_model, "/home/ponshane/work_dir/CLTM-Experiments/Data/MLDoc/word-vectors/Hand_craft_Chinese_English_wordvectors.vec")

transform_source_model.shape =  (6733, 100)
target_model.shape =  (19982, 100)
Normalization is finished!
concatenate_model.shape =  (26715, 100)
KeyedVectors have exported to /home/ponshane/work_dir/CLTM-Experiments/Data/MLDoc/word-vectors/Hand_craft_Chinese_English_wordvectors.vec


In [12]:
# Test phase: read the exported file back as KeyedVectors to check that it parses.
Concatenated_handcraft_model = KeyedVectors.load_word2vec_format("/home/ponshane/work_dir/CLTM-Experiments/Data/MLDoc/word-vectors/Hand_craft_Chinese_English_wordvectors.vec")

In [13]:
# Similarity between two Chinese words, then the nearest neighbours of an
# English word in the joint space (compare with the first mapping above).
print(Concatenated_handcraft_model.similarity("国会议员", "议员"))
Concatenated_handcraft_model.most_similar("government", topn=20)

0.8889483322831442


[('goverment', 0.8741818070411682),
 ('cabinet', 0.7273430824279785),
 ('pledge', 0.7130036354064941),
 ('国会议员', 0.7065216302871704),
 ('parliament', 0.70453280210495),
 ('minister', 0.7030107975006104),
 ('努力', 0.6921780705451965),
 ('法案', 0.6915416717529297),
 ('overspend', 0.6900051236152649),
 ('reform', 0.6828949451446533),
 ('主张', 0.6807999610900879),
 ('税制', 0.6798949241638184),
 ('行政', 0.6795258522033691),
 ('administration', 0.6777451634407043),
 ('devolve', 0.6767010688781738),
 ('opposition', 0.6755403280258179),
 ('urge', 0.6747902631759644),
 ('coalition', 0.6729280948638916),
 ('减税', 0.6719828248023987),
 ('白宫', 0.666300892829895)]

In [32]:
# Export to the embedding projector: one TSV holding the vectors and one TSV
# holding per-word metadata, both in the same row order.
# `KeyedVectors.wv` is a deprecated alias of the object itself in gensim 3.x,
# so the word list is read from the KeyedVectors directly.
handcraft_words = Concatenated_handcraft_model.index2word

file_path = "/home/ponshane/Desktop/Concatenated_Handcraft_embeddings.tsv"
# `with` closes the file even on error; explicit UTF-8 keeps output locale-independent.
with open(file_path, 'w', encoding='utf-8') as out:
    for each_word in handcraft_words:
        out.write('\t'.join(map(str, Concatenated_handcraft_model[each_word])) + "\n")

# export to metadata file
file_path = "/home/ponshane/Desktop/Concatenated_Handcraft_metadata.tsv"
# Set lookup instead of scanning the index2word list once per word.
chinese_vocab = set(chinese_model.wv.index2word)
with open(file_path, 'w', encoding='utf-8') as out:
    out.write("word\tlanguage\n")
    for each_word in handcraft_words:
        if each_word in chinese_vocab:
            out.write(each_word+"\tchinese\n")
        else:
            out.write(each_word+"\tenglish\n")

  """
  


# Using Procrustes alignment to align space

In [5]:
import numpy as np

import sys
# set python syspath to point out location of our self-writing module
sys.path.append("/home/ponshane/work_dir/CLTM/src/codebase/")

from fasttext import FastVector

# from https://stackoverflow.com/questions/21030391/how-to-normalize-array-numpy
def normalized(a, axis=-1, order=2):
    """Return `a` divided by its `order`-norm taken along `axis`.

    Norms equal to zero are replaced by 1 before dividing, so all-zero slices
    are returned unchanged instead of turning into NaN.
    """
    norms = np.linalg.norm(a, order, axis)
    # A fully reduced input yields a scalar; promote it so it can be masked.
    norms = np.atleast_1d(norms)
    is_zero = norms == 0
    norms[is_zero] = 1
    # Re-insert the reduced axis so the division broadcasts against `a`.
    return a / np.expand_dims(norms, axis)

def make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary):
    """
    Collect the paired vectors of every usable translation pair.

    source_dictionary / target_dictionary map words to vectors and support
    `in` and `[]` (FastVector objects in this notebook). bilingual_dictionary
    is an iterable of (source_word, target_word) tuples. Pairs with a word
    missing from either dictionary are skipped. Returns two numpy arrays whose
    rows correspond to each other.
    """
    usable_pairs = [
        (source_dictionary[src_word], target_dictionary[tgt_word])
        for (src_word, tgt_word) in bilingual_dictionary
        if src_word in source_dictionary and tgt_word in target_dictionary
    ]
    source_rows = [src_vec for src_vec, _ in usable_pairs]
    target_rows = [tgt_vec for _, tgt_vec in usable_pairs]
    return np.array(source_rows), np.array(target_rows)

def learn_transformation(source_matrix, target_matrix, normalize_vectors=True):
    """
    Learn the orthogonal matrix that aligns the source vectors to the target vectors.

    Both inputs are numpy arrays of shape (dictionary_length,
    embedding_dimension) whose rows are paired through the bilingual
    dictionary. When normalize_vectors is true, the rows of both matrices are
    passed through `normalized` first. Returns a square matrix of size
    embedding_dimension.
    """
    src, tgt = source_matrix, target_matrix
    if normalize_vectors:
        src = normalized(src)
        tgt = normalized(tgt)

    # SVD of the cross-covariance; numpy returns the right factor already transposed.
    left, _singular_values, right_t = np.linalg.svd(src.T @ tgt)

    # Dropping the singular values leaves the orthogonal part of the map.
    return left @ right_t

def load_bilingual_dict(dictionary_path, reverse_source_target = False, sperator = " "):
    """
    Read a bilingual dictionary file into a list of (source, target) tuples.

    dictionary_path: UTF-8 text file with one translation pair per line.
    reverse_source_target: when truthy, the two columns are swapped, i.e. each
        tuple is (second_field, first_field).
    sperator: string separating the two words on a line (parameter name kept
        as is for backward compatibility).

    Only the first two fields of a line are used. Blank lines are skipped.
    Raises IndexError for a non-blank line that has fewer than two fields.
    """
    bilingual_dict = []
    # Explicit UTF-8: the dictionaries contain CJK text and must not depend on the locale.
    with open(dictionary_path, "r", encoding="utf-8") as infile:
        for line_number, line in enumerate(infile, start=1):
            content = line.rstrip("\r\n")
            if not content.strip():
                # Tolerate empty lines (e.g. a trailing newline at end of file).
                continue
            temp = content.split(sperator)
            if len(temp) < 2:
                raise IndexError(
                    "line %d of %s does not contain two %r-separated fields: %r"
                    % (line_number, dictionary_path, sperator, content))
            # Plain truthiness: comparing with `== True` / `== False` silently
            # dropped every pair for flag values such as "yes" or None.
            if reverse_source_target:
                bilingual_dict.append((temp[1], temp[0]))
            else:
                bilingual_dict.append((temp[0], temp[1]))
    return bilingual_dict

def export_to_required_input_of_CLTM(en_dictionary, zh_dictionary, outpath):
    """
    Write both dictionaries into a single word2vec-style text file.

    en_dictionary / zh_dictionary: objects exposing `n_words`, `id2word` and
        `[]` lookup by token (FastVector objects in this notebook);
        `en_dictionary.n_dim` supplies the dimension written in the header.
    outpath: destination path, written as UTF-8.

    The first line is "<total_words> <n_dim>". It is followed by one
    "<token> <v1> <v2> ..." line per token, all English tokens first and then
    all Chinese tokens, with components formatted to six decimals.
    """
    # Read the header values before opening, so a bad argument does not leave
    # a truncated file behind.
    vocab_sizes = en_dictionary.n_words + zh_dictionary.n_words
    header = str(vocab_sizes) + " " + str(en_dictionary.n_dim) + "\n"

    # `with` closes the file even if a lookup fails midway; explicit UTF-8
    # keeps the output independent of the locale.
    with open(outpath, "w", encoding="utf-8") as fout:
        fout.write(header)
        for dictionary in (en_dictionary, zh_dictionary):
            for token in dictionary.id2word:
                vector_as_string = " ".join("%.6f" % number for number in dictionary[token])
                fout.write(token + " " + vector_as_string + "\n")
    
def isEnglish(s):
    """Return True when `s` contains only ASCII characters, False otherwise.

    Despite the name this is an ASCII check, not a language detector: any
    character outside ASCII (accented Latin letters included) gives False,
    and the empty string gives True.
    """
    # Every non-ASCII character encodes to UTF-8 bytes >= 0x80.
    return all(byte < 0x80 for byte in s.encode(encoding='utf-8'))

In [3]:
# Load both monolingual spaces from the word2vec text files as FastVector objects.
zh_dictionary = FastVector(vector_file='/home/ponshane/work_dir/CLTM-Experiments/Data/MLDoc/word-vectors/MLDoc-chinese-word2vec_NV_s100w5m15n10.txt')
en_dictionary = FastVector(vector_file='/home/ponshane/work_dir/CLTM-Experiments/Data/MLDoc/word-vectors/MLDoc-english-word2vec_NV_s100w5m15n10sam1e-5.txt')

# Cross-lingual similarity BEFORE alignment (baseline for the second print below).
# NOTE(review): "govenment" looks like a misspelling of "government"; the recorded
# output shows a value, so it is presumably a token of the English vocabulary -
# confirm which word was intended.
print(FastVector.cosine_similarity(en_dictionary["govenment"], zh_dictionary["政党"]))

# Load the hand-crafted dictionary; the columns are swapped (reverse_source_target)
# so that the first element of each pair is looked up in zh_dictionary below.
bilingual_dict = load_bilingual_dict(dictionary_path="/home/ponshane/work_dir/CLTM-Experiments/Data/MLDoc/hand-craft-control-dictionary.txt",
                                     reverse_source_target= True, sperator=",")

# Form the training matrices: Chinese is the source, English the target.
source_matrix, target_matrix = make_training_matrices(
    zh_dictionary, en_dictionary, bilingual_dict)

# Learn the transformation and apply it to the Chinese dictionary.
# NOTE(review): apply_transform appears to modify zh_dictionary in place (its
# return value is not used), so the lookup below would see the aligned vectors.
transform = learn_transformation(source_matrix, target_matrix)
zh_dictionary.apply_transform(transform)

# Same similarity AFTER alignment.
print(FastVector.cosine_similarity(en_dictionary["govenment"], zh_dictionary["政党"]))

reading word vectors from /home/ponshane/work_dir/CLTM-Experiments/Data/MLDoc/word-vectors/MLDoc-chinese-word2vec_NV_s100w5m15n10.txt
reading word vectors from /home/ponshane/work_dir/CLTM-Experiments/Data/MLDoc/word-vectors/MLDoc-english-word2vec_NV_s100w5m15n10sam1e-5.txt
-0.18624678268317557
0.6779499719647438


In [6]:
# Export the aligned spaces: every English vector and every (transformed)
# Chinese vector goes into one word2vec-style text file.
outpath = "/home/ponshane/work_dir/CLTM-Experiments/Data/MLDoc/word-vectors/Procrustes_Hand_craft_Chinese_English_wordvectors.vec"
export_to_required_input_of_CLTM(en_dictionary, zh_dictionary, outpath)

In [9]:
# Read the exported file back as KeyedVectors to check that it parses.
Concatenated_Procrustes_handcraft_model = KeyedVectors.load_word2vec_format(outpath)

In [14]:
# Same checks as for the translation-matrix mapping: similarity between two
# Chinese words, then the nearest neighbours of an English word.
print(Concatenated_Procrustes_handcraft_model.similarity("国会议员", "议员"))
Concatenated_Procrustes_handcraft_model.most_similar("government", topn=20)

0.9045977079702342


[('goverment', 0.874181866645813),
 ('cabinet', 0.7273430824279785),
 ('国会议员', 0.7221354842185974),
 ('减税', 0.7218571901321411),
 ('pledge', 0.7130035758018494),
 ('parliament', 0.7045326828956604),
 ('minister', 0.7030106782913208),
 ('overspend', 0.6900051832199097),
 ('歧见', 0.6886298656463623),
 ('reform', 0.6828948259353638),
 ('主张', 0.6817994117736816),
 ('administration', 0.6777451038360596),
 ('devolve', 0.6767009496688843),
 ('税制', 0.6764950752258301),
 ('opposition', 0.6755402088165283),
 ('urge', 0.6747902035713196),
 ('coalition', 0.672927975654602),
 ('援助', 0.6701107025146484),
 ('会面', 0.6621206998825073),
 ('议员', 0.6615113019943237)]

In [15]:
# Export to the embedding projector: one TSV holding the vectors and one TSV
# holding per-word metadata, both in the same row order.
# `KeyedVectors.wv` is a deprecated alias of the object itself in gensim 3.x,
# so the word list is read from the KeyedVectors directly.
procrustes_words = Concatenated_Procrustes_handcraft_model.index2word

file_path = "/home/ponshane/Desktop/Concatenated_Procrustes_Handcraft_embeddings.tsv"
# `with` closes the file even on error; explicit UTF-8 keeps output locale-independent.
with open(file_path, 'w', encoding='utf-8') as out:
    for each_word in procrustes_words:
        out.write('\t'.join(map(str, Concatenated_Procrustes_handcraft_model[each_word])) + "\n")

# export to metadata file
file_path = "/home/ponshane/Desktop/Concatenated_Procrustes_Handcraft_metadata.tsv"
# Set lookup instead of scanning the id2word sequence once per word.
chinese_vocab = set(zh_dictionary.id2word)
with open(file_path, 'w', encoding='utf-8') as out:
    out.write("word\tlanguage\n")
    for each_word in procrustes_words:
        if each_word in chinese_vocab:
            out.write(each_word+"\tchinese\n")
        else:
            out.write(each_word+"\tenglish\n")

  """
  


# English & Japanese Mapping

In [2]:
# Paths of the monolingual models. Despite the ".vec" suffix they are read
# below with Word2Vec.load, i.e. as saved gensim Word2Vec models.
japanese_vec_file = "../out/MLDoc/MLDoc-japanese-word2vec_NV_s100w5m15n10sam1e-4.vec"
english_vec_file = "../out/MLDoc/MLDoc-english-word2vec_NV_s100w5m15n10sam1e-5.vec"

# Source (Japanese) and target (English) models used by the mapping cells below.
japanese_model = Word2Vec.load(japanese_vec_file)
english_model = Word2Vec.load(english_vec_file)

In [9]:
# Build the (japanese, english) seed dictionary for the translation matrix.
# Each line of ja-en.txt is read as "<japanese>\t<english>" (tab separated);
# a pair is kept only if both words are in the respective model vocabulary.

# Sets give O(1) membership tests; `word in index2word` scans a list per lookup.
japanese_vocab = set(japanese_model.wv.index2word)
english_vocab = set(english_model.wv.index2word)

word_pairs = []

# `with` guarantees the file is closed; explicit UTF-8 avoids locale-dependent decoding.
with open("/home/ponshane/Downloads/ja-en.txt", "r", encoding="utf-8") as f:
    for line in f:
        fields = line.rstrip("\n").split("\t")
        if len(fields) < 2:
            # Blank or malformed line: there is no pair to read.
            continue
        japanese_word = fields[0]
        english_word = fields[1].lower()
        if japanese_word in japanese_vocab and english_word in english_vocab:
            word_pairs.append((japanese_word, english_word))

In [27]:
# Number of pairs that passed the vocabulary filter, plus a sample of them.
# The sample shows that one Japanese word may be paired with several English words.
print(len(word_pairs))
word_pairs[100:125]

3030


[('終了', 'quit'),
 ('終了', 'finish'),
 ('終了', 'end'),
 ('終了', 'exit'),
 ('中心', 'center'),
 ('中心', 'centre'),
 ('内容', 'content'),
 ('文化', 'culture'),
 ('イギリス', 'england'),
 ('中国', 'china'),
 ('指定', 'designation'),
 ('指定', 'specify'),
 ('位置', 'position'),
 ('人物', 'person'),
 ('公開', 'public'),
 ('公開', 'publish'),
 ('最終', 'last'),
 ('最終', 'final'),
 ('確認', 'confirmation'),
 ('確認', 'confirm'),
 ('一般', 'general'),
 ('歴史', 'history'),
 ('選挙', 'election'),
 ('計画', 'planning'),
 ('計画', 'plan')]

In [17]:
# Fit a translation matrix with the Japanese vectors as source and the English
# vectors as target, using the seed pairs collected above.
trans_model = TranslationMatrix(japanese_model.wv, english_model.wv)
trans_model.train(word_pairs)
# Optional sanity check (disabled): translate two Japanese words.
#trans_model.translate(["存在", "研究"], topn=10)

In [18]:
# Wrap the raw Japanese vectors and their word list in a translation_matrix.Space,
# the type that apply_transmat is given below.
japanese_model_space = translation_matrix.Space(japanese_model.wv.vectors, index2word=japanese_model.wv.index2word)
# Map the whole Japanese space with the trained translation matrix.
transformed_Japanese_model = trans_model.apply_transmat(japanese_model_space)

In [20]:
# Write the transformed Japanese vectors followed by the English vectors into one file.
export_to_KeyedVectors(transformed_Japanese_model, english_model, "../out/MLDoc/Japanese_English_wordvectors.vec")

transform_source_model.shape =  (12894, 100)
target_model.shape =  (19982, 100)
Normalization is finished!
concatenate_model.shape =  (32876, 100)
KeyedVectors have exported to ../out/MLDoc/Japanese_English_wordvectors.vec


In [23]:
# Read the exported file back as KeyedVectors to check that it parses.
# Note: this rebinds Concatenated_model, which held the Chinese-English space earlier.
Concatenated_model = KeyedVectors.load_word2vec_format("../out/MLDoc/Japanese_English_wordvectors.vec")

In [36]:
# Export to the embedding projector: one TSV holding the vectors and one TSV
# holding per-word metadata, both in the same row order.
# `KeyedVectors.wv` is a deprecated alias of the object itself in gensim 3.x,
# so the word list is read from the KeyedVectors directly.
concatenated_words = Concatenated_model.index2word

file_path = "../out/MLDoc/Concatenated_ja-en_embeddings.tsv"
# `with` closes the file even on error; explicit UTF-8 keeps output locale-independent.
with open(file_path, 'w', encoding='utf-8') as out:
    for each_word in concatenated_words:
        out.write('\t'.join(map(str, Concatenated_model[each_word])) + "\n")

# export to metadata file
file_path = "../out/MLDoc/Concatenated_ja-en_metadata.tsv"
# Set lookup instead of scanning the index2word list once per word.
japanese_vocab = set(japanese_model.wv.index2word)
with open(file_path, 'w', encoding='utf-8') as out:
    out.write("word\tlanguage\n")
    for each_word in concatenated_words:
        if each_word in japanese_vocab:
            out.write(each_word+"\tjapanese\n")
        else:
            out.write(each_word+"\tenglish\n")

