In [1]:
import numpy as np
from gensim.models import Word2Vec, KeyedVectors
from gensim.models import TranslationMatrix, translation_matrix

def normalised(mat, axis=-1, order=2):
    """Scale ``mat`` so that every vector along ``axis`` has unit ``order``-norm.

    Vectors whose norm is exactly zero are divided by 1 instead, so they come
    back unchanged rather than triggering a division by zero.
    """
    lengths = np.linalg.norm(mat, ord=order, axis=axis, keepdims=True)
    # Swap zero lengths for 1 in place so the division below is always defined.
    np.putmask(lengths, lengths == 0, 1)
    return mat / lengths
    
def export_to_KeyedVectors(transformed_source_model, target_model, file_path):
    """Write the transformed source vectors and the target vectors to one text file.

    The rows of ``transformed_source_model.mat`` are L2-normalised and stacked
    on top of ``target_model.wv.vectors_norm``. The output starts with a
    ``"<rows> <columns>"`` header line, followed by one
    ``"<word> <v1> <v2> ..."`` line per row, in that stacked order.

    Parameters
    ----------
    transformed_source_model
        Object exposing ``mat`` (2-D array, one row per word) and
        ``index2word`` (the words in row order).
    target_model
        Object exposing ``init_sims()``, ``wv.vectors_norm`` and
        ``wv.index2word``.
    file_path
        Destination path; an existing file is overwritten.

    Raises
    ------
    ValueError
        If the number of words does not match the number of vector rows.
    """
    target_model.init_sims()

    print("transform_source_model.shape = ", transformed_source_model.mat.shape)
    print("target_model.shape = ", target_model.wv.vectors_norm.shape)

    two_languages_index2word = (list(transformed_source_model.index2word)
                                + list(target_model.wv.index2word))
    normalised_transform_embedding = normalised(transformed_source_model.mat)
    print("Normalization is finished!")

    two_languages_word_vector = np.concatenate(
        (normalised_transform_embedding, target_model.wv.vectors_norm), axis=0)
    print("concatenate_model.shape = ", two_languages_word_vector.shape)

    row_count, dimension = two_languages_word_vector.shape
    if len(two_languages_index2word) != row_count:
        # Fail before touching the file: a mismatch would make the header lie
        # about the number of rows that follow.
        raise ValueError(
            "vocabulary size (%d) does not match number of vectors (%d)"
            % (len(two_languages_index2word), row_count))

    # UTF-8 is explicit so non-ASCII words do not depend on the system locale;
    # the context manager closes the file even if a write fails.
    with open(file_path, 'w', encoding='utf-8') as out:
        out.write(str(row_count) + " " + str(dimension))
        out.write("\n")
        # Pair every word with its own row. Looking rows up with list.index()
        # was quadratic and, for a word present in both vocabularies, wrote the
        # first occurrence's vector for the second occurrence as well.
        for each_word, vector in zip(two_languages_index2word, two_languages_word_vector):
            out.write(each_word + " ")
            out.write(' '.join(map(str, vector)) + "\n")

    print("KeyedVectors have exported to", file_path)

In [2]:
# Locations of the saved Chinese and English models.
# NOTE(review): these are absolute, machine-specific paths.
chinese_vec_file = "/home/ponshane/jupyter_working_dir/cross-lingual-topic-analysis/UM_Corpus_vectors/2018-02-19-ponshane-um-corpus-chinese-NEWS-word2vec_NV_s100w5m15n10s1e-04.vec"
english_vec_file = "/home/ponshane/jupyter_working_dir/cross-lingual-topic-analysis/UM_Corpus_vectors/2018-02-19-ponshane-um-corpus-english-NEWS-word2vec_NV_s100w5m15n10s8e-05.vec"

# Load both files with Word2Vec.load; later cells read their vectors through `.wv`.
chinese_model = Word2Vec.load(chinese_vec_file)
english_model = Word2Vec.load(english_vec_file)

In [18]:
# Print the shape of each model's vector matrix: English first, then Chinese.
print(english_model.wv.vectors.shape)
print(chinese_model.wv.vectors.shape)

(21571, 100)
(29818, 100)


In [8]:
# Read the hand-crafted dictionary (each line: Chinese word, a space, English word)
# and keep only the pairs whose two words exist in the respective model vocabularies.

# Sets give constant-time membership tests; testing against the index2word
# lists directly scans the whole vocabulary for every dictionary entry.
chinese_vocab = set(chinese_model.wv.index2word)
english_vocab = set(english_model.wv.index2word)

word_pairs = []

# UTF-8 is explicit so the Chinese entries do not depend on the system locale;
# the context manager closes the file even if a line fails to parse.
with open("./UM-Corpus-hand-craft-zh-en-3000.txt", "r", encoding="utf-8") as f:
    for line in f:
        fields = line.rstrip("\n").split(" ")
        if len(fields) < 2:
            # Blank or malformed line: there is no pair to extract.
            continue
        chinese_word = fields[0]
        # English words are lower-cased before the vocabulary lookup.
        english_word = fields[1].lower()
        if chinese_word in chinese_vocab and english_word in english_vocab:
            word_pairs.append((chinese_word, english_word))

In [9]:
# Number of (chinese_word, english_word) pairs collected in word_pairs.
len(word_pairs)

2746

In [None]:
# Build a TranslationMatrix from the Chinese (source) and English (target) keyed vectors
# and train it on the filtered dictionary pairs.
trans_model = TranslationMatrix(chinese_model.wv, english_model.wv)
trans_model.train(word_pairs)
# Sanity check: request the top 10 translation candidates for two Chinese words.
trans_model.translate(["增加", "政府"], topn=10)

In [49]:
# Wrap the Chinese vectors and their words in a translation_matrix.Space,
# the object handed to apply_transmat below.
chinese_model_space = translation_matrix.Space(chinese_model.wv.vectors, index2word=chinese_model.wv.index2word)
# Apply the trained translation matrix to the wrapped Chinese space.
transformed_Chinese_model = trans_model.apply_transmat(chinese_model_space)

In [58]:
# Write the transformed Chinese vectors together with the English model's vectors to one .vec file.
export_to_KeyedVectors(transformed_Chinese_model, english_model, "/home/ponshane/work_dir/CLTM-Experiments/Data/UM-Corpus/word-vectors/Hand_Craft_Chinese_English_WordVectors.vec")

transform_source_model.shape =  (6733, 100)
target_model.shape =  (19982, 100)
Normalization is finished!
concatenate_model.shape =  (26715, 100)
KeyedVectors have exported to ../out/MLDoc/Chinese_English_wordvectors.vec


In [16]:
# test phase
Concatenated_model = KeyedVectors.load_word2vec_format("/home/ponshane/work_dir/CLTM-Experiments/Data/UM-Corpus/word-vectors/Hand_Craft_Chinese_English_WordVectors.vec")

In [17]:
# List the 20 entries of the concatenated model most similar to "government".
Concatenated_model.most_similar("government", topn=20)

[('政府', 0.9339044094085693),
 ('pledged', 0.894436240196228),
 ('granting', 0.8840190172195435),
 ('美国政府', 0.8780467510223389),
 ('mandate', 0.8754358887672424),
 ('联邦政府', 0.8733851909637451),
 ('governments', 0.8718053102493286),
 ('proposals', 0.8657378554344177),
 ('immigration', 0.8610117435455322),
 ('几比', 0.860572099685669),
 ('authorities', 0.8599981069564819),
 ('authority', 0.8594396114349365),
 ('捐助国', 0.8571164608001709),
 ('donors', 0.853731632232666),
 ('达成协议', 0.8526170253753662),
 ('控制权', 0.8514776229858398),
 ('reforms', 0.8506940603256226),
 ('demanding', 0.8499373197555542),
 ('legislation', 0.846879243850708),
 ('bailout', 0.8448300361633301)]

In [82]:
# export to embedding projector: one tab-separated vector per line, in vocabulary order
file_path = "./UM-Corpus_handcraft_Concatenated_embeddings.tsv"
# Concatenated_model is already a KeyedVectors object, so its vocabulary is read
# directly (as most_similar is called directly on it elsewhere in this notebook).
with open(file_path, 'w', encoding='utf-8') as out:
    for each_word in Concatenated_model.index2word:
        out.write('\t'.join(map(str, Concatenated_model[each_word])) + "\n")

# export to metadata file: a header row, then one "<word>\t<language>" row per
# vocabulary entry, in the same order as the embeddings file above
file_path = "./UM-Corpus_handcraft_Concatenated_metadata.tsv"
# A set makes each language lookup constant-time; testing against the
# index2word list rescans the whole Chinese vocabulary for every word.
chinese_vocabulary = set(chinese_model.wv.index2word)
# UTF-8 is explicit so the Chinese words do not depend on the system locale.
with open(file_path, 'w', encoding='utf-8') as out:
    out.write("word\tlanguage\n")
    for each_word in Concatenated_model.index2word:
        # Any word not found in the Chinese vocabulary is labelled English.
        if each_word in chinese_vocabulary:
            out.write(each_word+"\tchinese\n")
        else:
            out.write(each_word+"\tenglish\n")



In [19]:
import sys
# Add the directory containing our own modules to sys.path so they can be imported
sys.path.append("/home/ponshane/work_dir/CLTM/src/codebase/")

from Docs_Input_Generator import *

In [20]:
# Output directory and word-vector file handed to export_selected_documents
# (presumably provided by the Docs_Input_Generator star import — verify).
# NOTE(review): vector_path names a different file than the .vec file exported
# earlier in this notebook — confirm this is the intended input.
root_path = "/home/ponshane/work_dir/temp/"
vector_path = "/home/ponshane/work_dir/CLTM-Experiments/Data/UM-Corpus/word-vectors/80dim-vec-hand-craft.txt"
show_up_dict = export_selected_documents(word_vector_path=vector_path, output_path=root_path,
                                             doc_num=25000)

Already process 2500 documents
Time elapsed (hh:mm:ss.ms) 0:00:04.221036
Already process 5000 documents
Time elapsed (hh:mm:ss.ms) 0:00:04.243962
Already process 7500 documents
Time elapsed (hh:mm:ss.ms) 0:00:04.266402
Already process 10000 documents
Time elapsed (hh:mm:ss.ms) 0:00:06.057480
Already process 12500 documents
Time elapsed (hh:mm:ss.ms) 0:00:06.082258
Already process 15000 documents
Time elapsed (hh:mm:ss.ms) 0:00:06.106703
Already process 17500 documents
Time elapsed (hh:mm:ss.ms) 0:00:06.130905
Already process 20000 documents
Time elapsed (hh:mm:ss.ms) 0:00:07.872977
Already process 22500 documents
Time elapsed (hh:mm:ss.ms) 0:00:07.898923
Already process 25000 documents
Time elapsed (hh:mm:ss.ms) 0:00:07.924207
Already process 25000 documents
Time elapsed (hh:mm:ss.ms) 0:00:07.924236
