In [1]:
from numpy.random import random_sample
from collections import Counter

# Export the 100% (fully) translated corpus

In [2]:
# Split the document list into the two PLDA input files. Lines are routed by
# their 0-based index: even lines are written with the 'EN' label and odd lines
# with the 'CN' label. An odd line is numbered idx-1 so that it carries the same
# ID as the even line directly before it.
#
# All three handles are managed by `with`, so they are closed even if the loop
# raises, and the input is streamed instead of being loaded with readlines().
with open("/home/ponshane/jupyter_working_dir/cross-lingual-topic-analysis/LFTM-input-files/2018-09-27-ponshane-50K-document-list.txt",
          'r') as file, \
        open("../out/PLDA_Inputs/50K-cn-docs.txt", "w") as cn_docs, \
        open("../out/PLDA_Inputs/50K-en-docs.txt", "w") as en_docs:
    for idx, doc in enumerate(file):
        if idx % 2 != 0:
            cn_docs.write(str(idx-1) + '\t' + 'CN' + '\t' + doc)
        else:
            en_docs.write(str(idx) + '\t' + 'EN' + '\t' + doc)

# Define a helper function to export a partially translated corpus (X% of the sampled translations kept)

In [2]:
def generate_incomplete_translated_corpus(file_path, num_of_doc, export_file_name, sampling_rate=0.5):
    """Export a partially translated EN/CN corpus as two PLDA input files.

    The input file is read line by line. Lines with an even 0-based index are
    written to the EN file and lines with an odd index to the CN file; both
    lines of a pair are labelled with the index of the even (EN) line.

    For every pair, exactly one side is picked at random (probability 0.5 each)
    as the "sampling target". The target keeps its text with probability
    ``sampling_rate``; otherwise it is written with an empty text field. The
    other side of the pair is always written unchanged.

    Args:
        file_path: Path of the input document list.
        num_of_doc: Number of lines in the input file; must be even. It is
            supplied by the caller instead of being counted from the file.
        export_file_name: Prefix of the two output files, which are created as
            ``../out/PLDA_Inputs/<prefix>50K-cn-docs.txt`` and
            ``../out/PLDA_Inputs/<prefix>50K-en-docs.txt``.
        sampling_rate: Probability of keeping the text of a sampling target.

    Raises:
        AssertionError: If ``num_of_doc`` is odd.
        IndexError: If the input file has more than ``num_of_doc`` lines.

    Prints a ``Counter`` of the sampling-target flags and the observed keep
    ratio ``1 - dropped / (num_of_doc / 2)``, which should be close to
    ``sampling_rate``.
    """
    ###
    # Block 1: decide which side of every pair is the sampling target
    ###

    number_of_sentences = num_of_doc

    # The pairing logic below breaks if the number of lines is not even.
    assert number_of_sentences % 2 == 0

    sampling_sentence_index = [False]*number_of_sentences
    # Use 0.5 as the cut-off so the English and the Chinese side are equally
    # likely to become the one that may be dropped.
    for idx in range(0, number_of_sentences, 2):
        if random_sample() < 0.5:
            # the English sentence is the sampling target
            sampling_sentence_index[idx] = True
        else:
            # the Chinese sentence is the sampling target
            sampling_sentence_index[idx+1] = True

    ###
    # Block 2: write both output files
    ###

    cn_path = "../out/PLDA_Inputs/"+export_file_name+"50K-cn-docs.txt"
    en_path = "../out/PLDA_Inputs/"+export_file_name+"50K-en-docs.txt"

    sample_out_count = 0

    # `with` closes all three files, including when an exception is raised
    # part-way through (the input handle used to be left open).
    with open(file_path, 'r') as source, \
            open(cn_path, "w") as cn_docs, \
            open(en_path, "w") as en_docs:
        for idx, doc in enumerate(source):
            if idx % 2 != 0:
                # A CN line is written with idx-1 so that its ID matches the
                # paired EN line (required by PLDA).
                target, pair_id, lang = cn_docs, idx-1, 'CN'
            else:
                target, pair_id, lang = en_docs, idx, 'EN'

            text = doc
            # A random number is drawn only for sampling targets; the target
            # is blanked out when the draw is not below sampling_rate.
            if sampling_sentence_index[idx] and not (random_sample() < sampling_rate):
                sample_out_count += 1
                text = '\n'

            target.write(str(pair_id) + '\t' + lang + '\t' + text)

    # Sanity check of the sampling: one side of each pair is always kept and
    # the other side is kept with probability sampling_rate, so the printed
    # ratio should be close to sampling_rate.
    print(Counter(sampling_sentence_index), 1-(sample_out_count/(num_of_doc/2)))

In [57]:
# Smoke test: a single run that keeps 75% of the sampled translations.
generate_incomplete_translated_corpus(
    file_path="/home/ponshane/jupyter_working_dir/cross-lingual-topic-analysis/LFTM-input-files/2018-09-27-ponshane-50K-document-list.txt",
    num_of_doc=50000,
    export_file_name="75percKeep-",
    sampling_rate=0.75,
)

Counter({False: 25000, True: 25000}) 0.75176


# Generate partial document pairs
1. Call the function `generate_incomplete_translated_corpus`.
2. Loop over each threshold to produce the partially translated document pairs.

In [3]:
# Percentages of sampled translations to keep; one pair of output files is
# written per percentage.
threshold_list = [10, 25, 50, 75, 90]
document_path = "/home/ponshane/jupyter_working_dir/cross-lingual-topic-analysis/LFTM-input-files/2018-09-27-ponshane-50K-document-list.txt"
for each_threshold in threshold_list:
    # Convert the percentage into the probability passed as sampling_rate.
    keep_probability = each_threshold/100
    print("Now on threshold: ", str(keep_probability))
    generate_incomplete_translated_corpus(
        file_path=document_path,
        num_of_doc=50000,
        export_file_name="{}percKeep-".format(each_threshold),
        sampling_rate=keep_probability,
    )

Now on threshold:  0.1
Counter({True: 25000, False: 25000}) 0.10155999999999998
Now on threshold:  0.25
Counter({False: 25000, True: 25000}) 0.24731999999999998
Now on threshold:  0.5
Counter({True: 25000, False: 25000}) 0.49628000000000005
Now on threshold:  0.75
Counter({False: 25000, True: 25000}) 0.7472799999999999
Now on threshold:  0.9
Counter({True: 25000, False: 25000}) 0.90036
