In [1]:
import numpy as np

import sys
# add the directory containing our own modules to Python's module search path
sys.path.append("/home/ponshane/work_dir/CLTM/src/codebase/")

from fasttext import FastVector

# from https://stackoverflow.com/questions/21030391/how-to-normalize-array-numpy
def normalized(a, axis=-1, order=2):
    """Scale the vectors of `a` along `axis` to unit length.

    The length is the `order`-norm (Euclidean by default). Vectors whose norm
    is exactly zero are divided by 1 instead, so they are returned unchanged
    rather than producing NaNs.
    """
    norms = np.atleast_1d(np.linalg.norm(a, ord=order, axis=axis))
    # Replace zero norms by 1 so the division below is a no-op for zero vectors.
    zero_mask = norms == 0
    norms[zero_mask] = 1
    divisor = np.expand_dims(norms, axis)
    return a / divisor

def make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary):
    """
    Build the paired training matrices for learning the alignment.

    source_dictionary / target_dictionary are the word-vector lookups of the
    source and target languages (they must support `in` and `[]`).
    bilingual_dictionary is a list of translation pairs
    [(source_word, target_word), ...]. Pairs for which either word is missing
    from its dictionary are skipped.

    Returns two numpy arrays whose i-th rows are the vectors of the i-th
    retained translation pair.
    """
    aligned_pairs = [
        (source_dictionary[src_word], target_dictionary[tgt_word])
        for src_word, tgt_word in bilingual_dictionary
        if src_word in source_dictionary and tgt_word in target_dictionary
    ]

    source_rows = [pair[0] for pair in aligned_pairs]
    target_rows = [pair[1] for pair in aligned_pairs]
    return np.array(source_rows), np.array(target_rows)

def learn_transformation(source_matrix, target_matrix, normalize_vectors=True):
    """
    Learn the orthogonal map that aligns the source space to the target space.

    source_matrix and target_matrix are numpy arrays of shape
    (dictionary_length, embedding_dimension) holding the paired word vectors
    of the bilingual dictionary. When normalize_vectors is true, each row is
    first scaled to unit length.

    Returns a square (embedding_dimension, embedding_dimension) matrix.
    """
    if normalize_vectors:
        source_matrix, target_matrix = normalized(source_matrix), normalized(target_matrix)

    # SVD of the cross-covariance between the two sets of paired vectors.
    cross_covariance = np.matmul(source_matrix.T, target_matrix)
    left, _singular_values, right_t = np.linalg.svd(cross_covariance)

    # Dropping the singular values leaves the orthogonal part of the map.
    return np.matmul(left, right_t)

def load_bilingual_dict(dictionary_path):
    """
    Read a bilingual dictionary file into a list of translation pairs.

    Each line of the file is expected to hold a source word and a target word
    separated by whitespace (spaces or tabs). The file is read as UTF-8 so
    that non-ASCII words load the same way regardless of the system locale.
    Blank lines and lines with fewer than two fields are skipped.

    Returns a list of (source_word, target_word) tuples in file order.
    """
    bilingual_dict = []
    with open(dictionary_path, "r", encoding="utf-8") as infile:
        for line in infile:
            # split() with no argument handles tabs, repeated spaces and
            # trailing whitespace, which split(" ") would turn into empty fields.
            fields = line.split()
            if len(fields) < 2:
                continue
            bilingual_dict.append((fields[0], fields[1]))
    return bilingual_dict

def _write_embedding_rows(fout, dictionary):
    """Write one "<token> <v1> <v2> ..." line to `fout` for every token in `dictionary.id2word`."""
    for token in dictionary.id2word:
        vector_as_string = " ".join("%.6f" % number for number in dictionary[token])
        fout.write(token + " " + vector_as_string + "\n")


def export_to_required_input_of_CLTM(en_dictionary, zh_dictionary, outpath):
    """
    Export both vocabularies to a single text file.

    Every line has the form "<token> <v1> <v2> ... <vd>" with components
    formatted to six decimal places. All tokens of en_dictionary are written
    first, followed by all tokens of zh_dictionary, each in `id2word` order.

    The file is written as UTF-8 (the tokens include non-ASCII words) and is
    closed even if formatting or writing fails part-way through.
    """
    with open(outpath, "w", encoding="utf-8") as fout:
        _write_embedding_rows(fout, en_dictionary)
        _write_embedding_rows(fout, zh_dictionary)
    
def isEnglish(s):
    """Return True when the string `s` consists solely of ASCII characters, else False."""
    utf8_bytes = s.encode(encoding='utf-8')
    try:
        # Any non-ASCII character produces bytes >= 0x80, which ASCII decoding rejects.
        utf8_bytes.decode('ascii')
        return True
    except UnicodeDecodeError:
        return False

# Align the two word spaces

In [2]:
# Load the English and Chinese 300-dimensional word-vector files.
# max_vocab_size presumably caps how many words are read from each file —
# TODO confirm against the FastVector implementation.
en_dictionary = FastVector(vector_file='/home/ponshane/Downloads/cc.en.300.vec', max_vocab_size=100000)
zh_dictionary = FastVector(vector_file='/home/ponshane/Downloads/cc.zh.300.vec', max_vocab_size=100000)

reading word vectors from /home/ponshane/Downloads/cc.en.300.vec
reading word vectors from /home/ponshane/Downloads/cc.zh.300.vec


In [3]:
# Baseline: cosine similarity of an English/Chinese word pair before the two spaces are aligned.
print(FastVector.cosine_similarity(en_dictionary["divide"], zh_dictionary["分裂"]))

0.028019036355238686


In [4]:
# Load the en-zh translation pairs as a list of (source_word, target_word) tuples.
bilingual_dict = load_bilingual_dict(dictionary_path="/home/ponshane/Downloads/en-zh.0-5000.txt")

In [5]:
# form the training matrices: row i of each matrix holds the vectors of the
# i-th translation pair whose words both exist in their dictionaries
# (English is the source language, Chinese the target)
source_matrix, target_matrix = make_training_matrices(
    en_dictionary, zh_dictionary, bilingual_dict)

In [6]:
# learn and apply the transformation
# NOTE(review): apply_transform appears to update en_dictionary in place (its
# result is not assigned, yet the similarity printed afterwards changes), so
# re-running this cell would apply the transform a second time — confirm
# against the FastVector implementation and reload the vectors before re-running.
transform = learn_transformation(source_matrix, target_matrix)
en_dictionary.apply_transform(transform)

In [7]:
# Same word pair as before the alignment; compare with the earlier value to see its effect.
print(FastVector.cosine_similarity(en_dictionary["divide"], zh_dictionary["分裂"]))

0.44382278145384374


# Build a language classifier to estimate the language effect of the word vectors

In [8]:
# build a classifier
# Training data: every English vector followed by every Chinese vector;
# label 0 marks an English row and label 1 a Chinese row.
# NOTE(review): the saved output below reports 4,000,000 rows, which does not
# match two vocabularies capped at max_vocab_size=100000 — it may come from an
# earlier run with a different cap; confirm by re-running.
from sklearn.linear_model import LogisticRegression
samples = np.concatenate((en_dictionary.embed, zh_dictionary.embed), axis=0)
target = [0] * en_dictionary.embed.shape[0] + [1] * zh_dictionary.embed.shape[0]
print(samples.shape, len(target))

(4000000, 300) 4000000


In [9]:
# Fit a logistic-regression language classifier on the word vectors.
classifier = LogisticRegression()  # all hyper-parameters left at their defaults
classifier.fit(samples, target)  # the fitted estimator's repr is echoed as the cell output; the result is not assigned

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [10]:
# Drop the references to the training arrays; the cells below only use the fitted classifier.
del samples, target

# Export the required input of CLTM
1. 100% aligned space
2. 10%, 20%, 30%, 40%, 50%, 60%, 70%, 80%, 90% aligned space
3. check the word types of training input document and word types of aligned space

In [11]:
# 100% aligned space
# Write every English and Chinese vector, with all dimensions kept, to one text file.
outpath = "/home/ponshane/work_dir/CLTM-Experiments/Data/Fasttext/100perc-en-zh-wiki-space.txt"
export_to_required_input_of_CLTM(en_dictionary, zh_dictionary, outpath)

In [12]:
# 10%, 20%, 30%, 40%, 50%, 60%, 70%, 80%, 90% aligned space
#
# For each percentage, keep only that share of the embedding dimensions: the
# dimensions with the most negative and the most positive language-classifier
# coefficients are removed in equal numbers, and both vocabularies are then
# exported to "<perc>perc-en-zh-wiki-space.txt".

def _write_subspace_rows(fout, dictionary, sub_matrix):
    """Write one "<token> <v1> <v2> ..." line per word of `dictionary`, taking vectors from rows of `sub_matrix`."""
    for token_id in dictionary.word2id.values():
        vector_as_string = " ".join("%.6f" % number for number in sub_matrix[token_id, :])
        fout.write(dictionary.id2word[token_id] + " " + vector_as_string + "\n")


# The coefficient ranking does not depend on the percentage, so compute it once.
coef_order = np.argsort(classifier.coef_)[0]
embedding_dim = en_dictionary.embed.shape[1]

for perc in [10, 20, 30, 40, 50, 60, 70, 80, 90]:
    # Number of dimensions to drop from EACH end of the ranking. Integer
    # arithmetic avoids float rounding: dim * (1 - perc/100) / 2.
    topRemovedN = (embedding_dim * (100 - perc)) // 200

    # The upper slice uses an explicit start index because `coef_order[-0:]`
    # would select every dimension when topRemovedN is 0.
    removal_dimension_list = (
        list(coef_order[:topRemovedN])
        + list(coef_order[coef_order.size - topRemovedN:])
    )

    en_sub = np.delete(en_dictionary.embed, removal_dimension_list, 1)
    zh_sub = np.delete(zh_dictionary.embed, removal_dimension_list, 1)

    outpath = "/home/ponshane/work_dir/CLTM-Experiments/Data/Fasttext/" + str(perc) + "perc-en-zh-wiki-space.txt"
    # UTF-8 is set explicitly because the tokens include Chinese words; the
    # context manager closes the file even if a write fails.
    with open(outpath, "w", encoding="utf-8") as fout:
        _write_subspace_rows(fout, en_dictionary, en_sub)
        _write_subspace_rows(fout, zh_dictionary, zh_sub)

# for MLDoc

In [21]:
# check the word types of training input document and word types of aligned space

doc_path = "/home/ponshane/work_dir/CLTM-Experiments/Data/MLDoc/CLTM-MLDoc.txt"

# Pass 1: collect every token that has no vector in either dictionary.
# The corpus contains Chinese text, so the encoding is set explicitly instead
# of relying on the system locale.
missing_words = set()
with open(doc_path, "r", encoding="utf-8") as handler:
    for line in handler:
        for word in line.strip("\n").split(" "):
            if word not in en_dictionary.word2id and word not in zh_dictionary.word2id:
                missing_words.add(word)

# Pass 2: rewrite the corpus line by line with the out-of-vocabulary tokens removed.
new_doc_path = "/home/ponshane/work_dir/CLTM-Experiments/Data/MLDoc/CLTM-MLDoc-fileterd-by-fasttext.txt"
with open(doc_path, "r", encoding="utf-8") as handler, open(new_doc_path, "w", encoding="utf-8") as newer:
    for line in handler:
        temp = [word for word in line.strip("\n").split(" ") if word not in missing_words]
        one_line = " ".join(temp) + "\n"
        newer.write(one_line)

In [22]:
# Of the out-of-vocabulary tokens, count how many are pure ASCII.
eng_num = sum(1 for word in missing_words if isEnglish(word))
print(len(missing_words), eng_num)

4273 1913


# for UM-Corpus 25K

In [3]:
# check the word types of training input document and word types of aligned space
doc_path = "/home/ponshane/work_dir/CLTM-Experiments/Data/UM-Corpus/50K-sampled-docs/selected50KDos.txt"

# Pass 1: collect every token that has no vector in either dictionary.
# The corpus contains Chinese text, so the encoding is set explicitly instead
# of relying on the system locale.
missing_words = set()
with open(doc_path, "r", encoding="utf-8") as handler:
    for line in handler:
        for word in line.strip("\n").split(" "):
            if word not in en_dictionary.word2id and word not in zh_dictionary.word2id:
                missing_words.add(word)

# Pass 2: rewrite the corpus line by line with the out-of-vocabulary tokens removed.
new_doc_path = "/home/ponshane/work_dir/CLTM-Experiments/Data/UM-Corpus/50K-sampled-docs/selected50KDos-fileterd-by-fasttext.txt"
with open(doc_path, "r", encoding="utf-8") as handler, open(new_doc_path, "w", encoding="utf-8") as newer:
    for line in handler:
        temp = [word for word in line.strip("\n").split(" ") if word not in missing_words]
        one_line = " ".join(temp) + "\n"
        newer.write(one_line)

In [4]:
# Of the out-of-vocabulary tokens, count how many are pure ASCII.
eng_num = sum(1 for word in missing_words if isEnglish(word))
print(len(missing_words), eng_num)

5583 1810
