In [None]:
import logging
import sys
sys.path.append('..')

from ptm import JointGibbsLDA, JointCorpus
from ptm.utils import get_top_words
from codebase.topic_evaluator import *

import random

In [None]:
# Build the joint corpus from the English (source) and Chinese (target)
# UM-Corpus text files.
source_corpus_path = "../out/JointLDA_Inputs/50K_English_UM_Corpus.txt"
target_corpus_path = "../out/JointLDA_Inputs/50K_Chinese_UM_Corpus.txt"
corpus = JointCorpus(
    source_corpus_file=source_corpus_path,
    target_corpus_file=target_corpus_path,
)

In [None]:
# Filter the translation CSV down to one-to-one (source, target) word pairs
# whose two words both occur in the corpus vocabularies.
updated_source_dict = {}  # source word -> running index
updated_target_dict = {}  # target word -> running index

# (source_word, target_word) pairs in first-seen order; used to rebuild the
# vocabulary for querying JointLDA's topic-word distribution.
reconcatenate_dict = []

# `with` guarantees the handle is closed even if a row fails to parse
# (e.g. a line with fewer than two comma-separated fields).
# NOTE(review): the file is read with the platform default encoding -- confirm
# that matches how the CSV was written.
with open("../out/JointLDA_Inputs/um-corpus-news-top-translation.csv") as f:
    for raw_line in f:
        fields = raw_line.rstrip("\n").split(",")
        # Column order matters: field 0 is the source word (lower-cased here),
        # field 1 is the target word.
        source_word = fields[0].lower()
        target_word = fields[1]

        in_corpus = (target_word in corpus.target_dict.token2id.keys()
                     and source_word in corpus.source_dict.token2id.keys())
        if not in_corpus:
            continue

        # Keep a pair only if neither word was already used by an earlier pair,
        # so every word takes part in at most one translation pair.
        if target_word not in updated_target_dict and source_word not in updated_source_dict:
            updated_target_dict[target_word] = len(updated_target_dict)
            updated_source_dict[source_word] = len(updated_source_dict)
            reconcatenate_dict.append((source_word, target_word))

assert len(updated_target_dict) == len(updated_source_dict) == len(reconcatenate_dict)

In [None]:
# Write a random subset of the translation pairs (each pair is kept with
# probability 0.1) as "source,target" lines.
# The context manager flushes and closes the file even if a write fails, so
# the following cell never reads a half-written file through a leaked handle.
# NOTE(review): the RNG is not seeded, so the sampled subset differs between
# runs -- seed `random` beforehand if reproducible results are required.
with open("../out/JointLDA_Inputs/10perc-um-corpus-news-top-translation.csv", "w") as random_pair_file:
    for each_pair in reconcatenate_dict:
        if random.random() > 0.9:
            random_pair_file.write(each_pair[0] + "," + each_pair[1] + "\n")

In [None]:
# Hand the sampled translation-pair file to the corpus, then build the
# trainable corpus.
# NOTE(review): `update_doctionary` is the method name as exposed by the
# project API; presumably it rebuilds the dictionaries from the pair file --
# confirm against ptm.JointCorpus.
sampled_pairs_path = "../out/JointLDA_Inputs/10perc-um-corpus-news-top-translation.csv"
corpus.update_doctionary(sampled_pairs_path)
corpus.convert_raw_corpus_to_trainable_corpus()

In [None]:
# Configure and fit the joint LDA model.
# `n_topic` is reused by later cells when iterating over topics.
n_topic = 20
model = JointGibbsLDA(
    n_doc=len(corpus.docs),
    n_concept=corpus.n_concept,
    n_s_vocab=corpus.n_s_vocab,
    n_t_vocab=corpus.n_t_vocab,
    n_topic=n_topic,
)
model.fit(corpus.docs, corpus.language_flags, max_iter=100)

In [None]:
# Print the 30 top words returned by get_top_words for every topic.
for topic_idx in range(n_topic):
    topic_top_words = get_top_words(model.TW, corpus.reconcatenate_dict, topic_idx, n_words=30)
    print('Topic', topic_idx, ': ', topic_top_words)

In [None]:
def split_topics_by_languages(n_topic, n_words, model, corpus):
    """Split each topic's top words into a source-language and a target-language list.

    For every topic index in ``range(n_topic)`` the top ``n_words`` entries are
    fetched with ``get_top_words(model.TW, corpus.reconcatenate_dict, ...)`` and
    routed as follows:

    * a tuple contributes its first element to the source list and its second
      element to the target list;
    * a string made only of ASCII characters goes to the source list;
    * any other string goes to the target list;
    * entries of any other type are ignored.

    Args:
        n_topic: number of topics to iterate over.
        n_words: number of top words requested per topic.
        model: object exposing the topic-word data as ``model.TW``.
        corpus: object exposing ``corpus.reconcatenate_dict``.

    Returns:
        ``(source_topic_list, target_topic_list)``: two lists with one inner
        list of words per topic, in topic order.
    """
    def _is_ascii(text):
        # Same criterion as a successful ``text.encode("ascii")``.
        return all(ord(ch) < 128 for ch in text)

    source_topic_list, target_topic_list = [], []
    for topic_idx in range(n_topic):
        entries = get_top_words(model.TW, corpus.reconcatenate_dict, topic_idx, n_words=n_words)
        source_words, target_words = [], []
        for entry in entries:
            if isinstance(entry, tuple):
                # Paired entry: (source word, target word).
                source_words.append(entry[0])
                target_words.append(entry[1])
                continue
            if not isinstance(entry, str):
                continue
            bucket = source_words if _is_ascii(entry) else target_words
            bucket.append(entry)
        source_topic_list.append(source_words)
        target_topic_list.append(target_words)
    return source_topic_list, target_topic_list

In [None]:
# Split the top 100 words of every topic into per-language lists for evaluation.
# Pass the same `n_topic` the model was trained with instead of a second
# hard-coded 20, so changing the topic count in the training cell cannot leave
# this call iterating over the wrong number of topics.
source_topic_list, target_topic_list = split_topics_by_languages(
    n_topic=n_topic, n_words=100, model=model, corpus=corpus)

In [None]:
# Load the document co-occurrence statistics used by the coherence metric.
# NOTE(review): this is an absolute, machine-specific path, unlike the relative
# "../out/..." paths used elsewhere in this notebook -- consider making it
# relative or configurable.
corpus_file = "/home/ponshane/work_dir/CLTM/src/out/CLTM_Inputs/2018-12-19/selected50KDos.txt"
# The second returned value is not needed here and is discarded.
(cooccurence_matrix,
 _,
 compound_dictionary,
 num_of_documents) = documents_to_cooccurence_matrix(corpus_file, is_pickle=False)

In [None]:
# Report NPMI coherence and average Jaccard similarity between the two
# languages' topics at top-k cut-offs 5, 10, ..., 50.
for topk in range(5, 55, 5):
    npmi_coherence = coherence_score(
        cn_topic=target_topic_list,
        en_topic=source_topic_list,
        topk=topk,
        cooccurence_matrix=cooccurence_matrix,
        compound_dictionary=compound_dictionary,
        num_of_documents=num_of_documents,
        coherence_method="npmi",
    )
    jaccard_similarity = avg_jaccard_similarity_between_topics(
        target_topic_list, source_topic_list, topk)
    print(topk, npmi_coherence, jaccard_similarity)