In [1]:
import logging
import sys
sys.path.append('..')

from ptm import JointGibbsLDA, JointCorpus
from ptm.utils import get_top_words
from codebase.topic_evaluator import *

In [2]:
# Build the bilingual corpus. Judging by the file names, the source side is
# English and the target side is Chinese.
corpus = JointCorpus(
    source_corpus_file="../out/JointLDA_Inputs/50K_English_UM_Corpus.txt",
    target_corpus_file="../out/JointLDA_Inputs/50K_Chinese_UM_Corpus.txt",
)

# Load the translation CSV into the corpus (presumably the source<->target
# word pairs; "doctionary" is the method's actual spelling in the library).
corpus.update_doctionary("../out/JointLDA_Inputs/um-corpus-news-top-translation.csv")

# Turn the raw text into the indexed form the model consumes; the training
# cell reads `corpus.docs` and `corpus.language_flags` afterwards.
corpus.convert_raw_corpus_to_trainable_corpus()

2019-01-14 18:12:45 INFO:JointCorpus:size of concept: 2720, size of source vocab: 9695, size of target vocab: 18449
2019-01-14 18:12:45 INFO:JointCorpus:Successfully generate idx corpus 'self.docs' and language flags 'self.language_flags'


In [3]:
# Fit the joint LDA model on the prepared corpus.
# NOTE(review): no random seed is set anywhere in this notebook; if the sampler
# is stochastic (likely for Gibbs sampling), re-runs will not reproduce the
# numbers below exactly -- TODO confirm how JointGibbsLDA draws random numbers.
n_topic = 20
model = JointGibbsLDA(
    n_doc=len(corpus.docs),
    n_concept=corpus.n_concept,
    n_s_vocab=corpus.n_s_vocab,
    n_t_vocab=corpus.n_t_vocab,
    n_topic=n_topic,
)
model.fit(corpus.docs, corpus.language_flags, max_iter=100)

2019-01-14 18:12:52 INFO:JointGibbsLDA:[ITER] 0,	elapsed time:2.86,	log_likelihood:-3658856.63
2019-01-14 18:12:55 INFO:JointGibbsLDA:[ITER] 1,	elapsed time:3.04,	log_likelihood:-3468978.38
2019-01-14 18:12:58 INFO:JointGibbsLDA:[ITER] 2,	elapsed time:3.08,	log_likelihood:-3375433.12
2019-01-14 18:13:01 INFO:JointGibbsLDA:[ITER] 3,	elapsed time:3.11,	log_likelihood:-3315378.07
2019-01-14 18:13:04 INFO:JointGibbsLDA:[ITER] 4,	elapsed time:3.25,	log_likelihood:-3272095.78
2019-01-14 18:13:08 INFO:JointGibbsLDA:[ITER] 5,	elapsed time:3.26,	log_likelihood:-3234532.18
2019-01-14 18:13:11 INFO:JointGibbsLDA:[ITER] 6,	elapsed time:3.50,	log_likelihood:-3205299.62
2019-01-14 18:13:14 INFO:JointGibbsLDA:[ITER] 7,	elapsed time:3.29,	log_likelihood:-3181762.67
2019-01-14 18:13:17 INFO:JointGibbsLDA:[ITER] 8,	elapsed time:3.46,	log_likelihood:-3159860.80
2019-01-14 18:13:21 INFO:JointGibbsLDA:[ITER] 9,	elapsed time:3.19,	log_likelihood:-3139396.37
2019-01-14 18:13:24 INFO:JointGibbsLDA:[ITER] 10,	

In [5]:
# Print the 50 top-ranked entries of every topic.
# Entries are printed as returned instead of being comma-joined: the recorded
# output shows a mix of (source, target) tuples and plain strings, and
# str.join() only accepts strings.
for ti in range(n_topic):
    top_words = get_top_words(model.TW, corpus.reconcatenate_dict, ti, n_words=50)
    print('Topic', ti, ': ', top_words)

Topic 0 :  [('year', '年度') ('world', '世界') 'percent' ('food', '食物') '可以' 'billion'
 ('people', '人们') 'million' ('school', '学校') ('bank', '银行') '时候' '没有'
 ('child', '孩子') ('economy', '经济体') ('america', '全美') ('group', '集团') '一起'
 '这样' '可能' 'half' ('japan', '日本') ('happening', '情况') '这种' '表示'
 ('china', '中国') ('plant', '植物') ('tax', '税收') ('like', '喜欢') 'high'
 ('budget', '预算') ('participate', '参加') ('planting', '种植') ('life', '生活')
 'thousand' '有些' 'seven' ('rate', '房价') ('party', '派对') '不能'
 ('student', '学生') ('protection', '保护') ('think', '认为') '看到'
 ('suggest', '建议') ('development', '开发') '一种' ('marry', '结婚')
 ('united', '团结') ('growth', '增长速度') 'cut']
Topic 1 :  [('people', '人们') '工作' '美国' ('student', '学生') '大学' ('obtain', '获得')
 ('men', '男士') ('water', '水中') ('project', '项目') '公司' ('work', '做事')
 'found' ('provide', '提供') ('need', '需要') ('way', '途径') '通过'
 ('team', '团队') ('time', '时间') '得到' 'ice' ('hope', '希望') ('use', '使用')
 ('data', '数据') '亿美元' '申请' ('future', '未来') '可以' ('invest

In [24]:
def split_topics_by_languages(n_topic, n_words, model, corpus):
    """Separate each topic's top entries into source- and target-language lists.

    For every topic index in ``range(n_topic)`` the top ``n_words`` entries are
    fetched with ``get_top_words(model.TW, corpus.reconcatenate_dict, ...)`` and
    routed as follows:

    * a tuple contributes its first element to the source list and its second
      element to the target list (presumably a translated word pair);
    * a plain string goes to the source list when it is pure ASCII, otherwise
      to the target list;
    * anything else is ignored.

    Because tuples feed both lists and plain strings feed only one, the two
    per-topic lists may have different lengths.

    Args:
        n_topic: Number of topics to walk through.
        n_words: Number of top entries requested per topic.
        model: Object exposing a ``TW`` attribute that is passed to
            ``get_top_words``.
        corpus: Object exposing a ``reconcatenate_dict`` attribute that is
            passed to ``get_top_words``.

    Returns:
        A ``(source_topic_list, target_topic_list)`` pair; each is a list with
        one inner list of words per topic, in the order the entries were
        returned.
    """
    source_topic_list, target_topic_list = [], []

    for topic_idx in range(n_topic):
        src_words, tgt_words = [], []
        entries = get_top_words(model.TW, corpus.reconcatenate_dict, topic_idx, n_words=n_words)

        for entry in entries:
            # A tuple carries one word for each language.
            if isinstance(entry, tuple):
                src_words.append(entry[0])
                tgt_words.append(entry[1])
                continue

            # Only plain strings are classified; other types are skipped.
            if not isinstance(entry, str):
                continue

            # ASCII-encodable strings count as source language, the rest as target.
            try:
                entry.encode("ascii")
            except UnicodeEncodeError:
                tgt_words.append(entry)
            else:
                src_words.append(entry)

        source_topic_list.append(src_words)
        target_topic_list.append(tgt_words)

    return source_topic_list, target_topic_list

In [32]:
# Split every topic's top-100 entries into per-language word lists for the
# evaluation below.
# Reuse `n_topic` from the training cell instead of repeating the literal 20,
# so changing the topic count there cannot leave this call out of sync with
# the fitted model.
source_topic_list, target_topic_list = split_topics_by_languages(
    n_topic=n_topic, n_words=100, model=model, corpus=corpus
)

In [30]:
# Build the co-occurrence matrix and its companion objects for coherence scoring.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere unless the file exists at the same location. It may be reachable
# as a path relative to the notebook, like the other inputs -- TODO confirm.
corpus_file = "/home/ponshane/work_dir/CLTM/src/out/CLTM_Inputs/2018-12-19/selected50KDos.txt"

# The second returned value is not used in the visible cells.
(
    cooccurence_matrix,
    _,
    compound_dictionary,
    num_of_documents,
) = documents_to_cooccurence_matrix(corpus_file, is_pickle=False)

<class 'scipy.sparse.csc.csc_matrix'>
(27673, 25000) (27673, 27673)


In [34]:
# Score the topics at cut-offs 5, 10, ..., 50: NPMI coherence plus the average
# Jaccard similarity between topics. Each printed row is
# "<cut-off> <coherence> <jaccard>".
for each_top in range(5, 55, 5):
    # The target-language lists are passed as `cn_topic` and the
    # source-language lists as `en_topic`.
    c_s = coherence_score(
        cn_topic=target_topic_list,
        en_topic=source_topic_list,
        topk=each_top,
        cooccurence_matrix=cooccurence_matrix,
        compound_dictionary=compound_dictionary,
        num_of_documents=num_of_documents,
        coherence_method="npmi",
    )
    j_s = avg_jaccard_similarity_between_topics(target_topic_list, source_topic_list, each_top)
    print(each_top, c_s, j_s)

5 0.041470876278132444 0.015263157894736848
10 -0.02753921104196132 0.009210526315789478
15 -0.07723562105547122 0.007368421052631582
20 -0.10883361012970212 0.0061842105263157894
25 -0.12168628597710082 0.005368421052631581
30 -0.142465876496626 0.004736842105263162
35 -0.15651740657686633 0.0042857142857142825
40 -0.17582171465747526 0.00407894736842105
45 -0.18762349767385794 0.0036842105263157877
50 -0.19970412525621675 0.0034210526315789492
