In [None]:
import logging
import os
import pickle

from ptm import JointGibbsLDA, JointCorpus

# Use 50K UM-Corpus and infer [10, 20, 30, 40, 50] topics

In [None]:
# Build the joint English/Chinese corpus from the 50K-document UM-Corpus sample.
corpus_files = {
    "source_corpus_file": "/home/ponshane/work_dir/CLTM-Experiments/Data/UM-Corpus/50K-sampled-docs/50K_English_UM_Corpus.txt",
    "target_corpus_file": "/home/ponshane/work_dir/CLTM-Experiments/Data/UM-Corpus/50K-sampled-docs/50K_Chinese_UM_Corpus.txt",
}
corpus = JointCorpus(**corpus_files)

# Hand the translation CSV to the corpus, then run the conversion step.
# (Both are ptm project calls; presumably they populate the docs / vocabulary
# attributes that the training cell reads — confirm against ptm.)
corpus.update_doctionary("/home/ponshane/work_dir/CLTM-Experiments/Data/UM-Corpus/50K-sampled-docs/um-corpus-news-top-translation.csv")
corpus.convert_raw_corpus_to_trainable_corpus()

In [None]:
# Train one JointGibbsLDA model per topic count (10, 20, 30, 40, 50) on the
# corpus held in `corpus`, and pickle each fitted model to the results folder.
output_dir = '/home/ponshane/work_dir/CLTM-Experiments/Results/UM-Corpus-50K-sampled-docs/model-comparison/'
# Create the destination up front: otherwise a missing directory only surfaces
# at open() time, after a fit of up to 1000 iterations has already been spent.
os.makedirs(output_dir, exist_ok=True)

for n_topic in range(10, 51, 10):
    model = JointGibbsLDA(n_doc=len(corpus.docs), n_concept=corpus.n_concept, n_s_vocab=corpus.n_s_vocab,
                          n_t_vocab=corpus.n_t_vocab, n_topic=n_topic)
    model.fit(corpus.docs, corpus.language_flags, max_iter=1000)

    # One pickle per topic count, e.g. "JointLDA-10topics.pickle".
    file_name = "JointLDA-" + str(n_topic) + "topics.pickle"
    with open(os.path.join(output_dir, file_name), 'wb') as handle:
        pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Scratch check: display the pickle path used for the 10-topic model.
n_topic = 10
file_name = "".join(["JointLDA-", str(n_topic), "topics.pickle"])
"".join(['/home/ponshane/work_dir/CLTM-Experiments/Results/UM-Corpus-50K-sampled-docs/model-comparison/', file_name])

# Fix the number of topics at 10 and use [100K, 200K, 400K, 800K] UM-Corpus

In [None]:
# Corpus-size sweep: with the topic count fixed at 10, rebuild the corpus for
# each UM-Corpus sample (100K, 200K, 400K, 800K documents), fit one model and
# pickle it into that sample's results folder.
n_topic = 10  # constant for the whole sweep, so set once outside the loop
for size in [100, 200, 400, 800]:
    source = "/home/ponshane/work_dir/CLTM-Experiments/Data/UM-Corpus/{}K-sampled-docs/{}K_English_UM_Corpus.txt".format(size, size)
    target = "/home/ponshane/work_dir/CLTM-Experiments/Data/UM-Corpus/{}K-sampled-docs/{}K_Chinese_UM_Corpus.txt".format(size, size)
    print(source)
    print(target)

    # Ensure the destination exists before any expensive work: otherwise a
    # missing folder only fails at open(), after the fit has already run.
    file_path = '/home/ponshane/work_dir/CLTM-Experiments/Results/UM-Corpus-{}K-sampled-docs/'.format(size)
    os.makedirs(file_path, exist_ok=True)

    # prepare corpus
    # NOTE(review): every size reuses the dictionary stored under
    # 50K-sampled-docs — confirm this is intentional.
    corpus = JointCorpus(source_corpus_file=source, target_corpus_file=target)
    corpus.update_doctionary("/home/ponshane/work_dir/CLTM-Experiments/Data/UM-Corpus/50K-sampled-docs/um-corpus-news-top-translation.csv")
    corpus.convert_raw_corpus_to_trainable_corpus()

    model = JointGibbsLDA(n_doc=len(corpus.docs), n_concept=corpus.n_concept, n_s_vocab=corpus.n_s_vocab,
                          n_t_vocab=corpus.n_t_vocab, n_topic=n_topic)
    model.fit(corpus.docs, corpus.language_flags, max_iter=1000)

    file_name = "JointLDA-" + str(n_topic) + "topics.pickle"
    with open(os.path.join(file_path, file_name), 'wb') as handle:
        pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Use MLDoc and infer [10, 20, 30, 40, 50] topics

In [None]:
# Build the joint English/Chinese corpus from the MLDoc text files.
corpus_files = {
    "source_corpus_file": "/home/ponshane/work_dir/CLTM-Experiments/Data/MLDoc/MLDoc-English.txt",
    "target_corpus_file": "/home/ponshane/work_dir/CLTM-Experiments/Data/MLDoc/MLDoc-Chinese.txt",
}
corpus = JointCorpus(**corpus_files)

# Hand the EN/ZH dictionary CSV to the corpus, then run the conversion step.
# (ptm project calls; exact semantics are defined there — confirm if in doubt.)
corpus.update_doctionary("/home/ponshane/work_dir/CLTM-Experiments/Data/MLDoc/MLDoc_EN_ZH_dictionaries.csv")
corpus.convert_raw_corpus_to_trainable_corpus()

In [None]:
# Train one JointGibbsLDA model per topic count (10, 20, 30, 40, 50) on the
# corpus held in `corpus`, and pickle each fitted model to the MLDoc results folder.
output_dir = '/home/ponshane/work_dir/CLTM-Experiments/Results/MLDoc/model-comparison/'
# Create the destination up front: otherwise a missing directory only surfaces
# at open() time, after a fit of up to 1000 iterations has already been spent.
os.makedirs(output_dir, exist_ok=True)

for n_topic in range(10, 51, 10):
    model = JointGibbsLDA(n_doc=len(corpus.docs), n_concept=corpus.n_concept, n_s_vocab=corpus.n_s_vocab,
                          n_t_vocab=corpus.n_t_vocab, n_topic=n_topic)
    model.fit(corpus.docs, corpus.language_flags, max_iter=1000)

    # One pickle per topic count, e.g. "JointLDA-10topics.pickle".
    file_name = "JointLDA-" + str(n_topic) + "topics.pickle"
    with open(os.path.join(output_dir, file_name), 'wb') as handle:
        pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Hand-crafted dictionary: UM-Corpus

In [None]:
# Print every entry of the hand-crafted UM-Corpus dictionary with its first two
# space-separated fields swapped and comma-joined ("second,first").
# NOTE(review): this output looks like the content expected at
# ./JointLDA-UM-Corpus-hand-craft-zh-en-3000.txt, which a later cell loads, but
# no cell visible here writes that file — confirm how it is produced.
dictionary_path = "/home/ponshane/work_dir/CLTM-Experiments/Data/UM-Corpus/UM-Corpus-hand-craft-zh-en-3000.txt"

# Decode explicitly rather than relying on the locale default, which differs
# between machines. Assumes the file is UTF-8 — TODO confirm.
with open(dictionary_path, "r", encoding="utf-8") as handler:
    for line_no, line in enumerate(handler, start=1):
        entry = line.strip("\n")
        if not entry:
            # A blank (e.g. trailing) line carries no pair; skip it instead of
            # failing with an IndexError below.
            continue
        temp = entry.split(" ")
        if len(temp) < 2:
            # Fail with the offending line rather than a bare IndexError.
            raise ValueError(
                "line {} of {} is not a space-separated pair: {!r}".format(line_no, dictionary_path, entry)
            )
        print(temp[1] + "," + temp[0])

In [None]:
# Build the joint corpus from the 50K UM-Corpus sample again, this time paired
# with the hand-crafted dictionary instead of the translation CSV.
corpus_files = {
    "source_corpus_file": "/home/ponshane/work_dir/CLTM-Experiments/Data/UM-Corpus/50K-sampled-docs/50K_English_UM_Corpus.txt",
    "target_corpus_file": "/home/ponshane/work_dir/CLTM-Experiments/Data/UM-Corpus/50K-sampled-docs/50K_Chinese_UM_Corpus.txt",
}
corpus = JointCorpus(**corpus_files)

# NOTE(review): relative path, so it is resolved against the kernel's working
# directory; no cell visible here writes this file — confirm it exists first.
corpus.update_doctionary("./JointLDA-UM-Corpus-hand-craft-zh-en-3000.txt")
corpus.convert_raw_corpus_to_trainable_corpus()

In [None]:
# Train one JointGibbsLDA model per topic count (10, 20, 30, 40, 50) on the
# corpus held in `corpus`, and pickle each fitted model to the hand-craft
# comparison folder for UM-Corpus 50K.
output_dir = '/home/ponshane/work_dir/CLTM-Experiments/Results/UM-Corpus-50K-sampled-docs/hand-craft-mapping-comparison/'
# Create the destination up front: otherwise a missing directory only surfaces
# at open() time, after a fit of up to 1000 iterations has already been spent.
os.makedirs(output_dir, exist_ok=True)

for n_topic in range(10, 51, 10):
    model = JointGibbsLDA(n_doc=len(corpus.docs), n_concept=corpus.n_concept, n_s_vocab=corpus.n_s_vocab,
                          n_t_vocab=corpus.n_t_vocab, n_topic=n_topic)
    model.fit(corpus.docs, corpus.language_flags, max_iter=1000)

    # One pickle per topic count, e.g. "JointLDA-hand-craft-10topics.pickle".
    file_name = "JointLDA-hand-craft-" + str(n_topic) + "topics.pickle"
    with open(os.path.join(output_dir, file_name), 'wb') as handle:
        pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Hand-crafted dictionary: MLDoc

In [None]:
# Disabled preview of the hand-crafted MLDoc control dictionary.
# The code below is wrapped in a string literal, so nothing in it executes:
# the cell only evaluates that string. If re-enabled, it would read the file
# line by line, split each line on "," and print the first two fields.
"""
with open("/home/ponshane/work_dir/CLTM-Experiments/Data/MLDoc/hand-craft-control-dictionary.txt", "r") as handler:
    for line in handler:
        line = line.rstrip("\n").split(",")
        print(line[0], line[1])
"""

In [None]:
# Build the joint corpus from the MLDoc text files again, this time paired
# with the hand-crafted control dictionary.
corpus_files = {
    "source_corpus_file": "/home/ponshane/work_dir/CLTM-Experiments/Data/MLDoc/MLDoc-English.txt",
    "target_corpus_file": "/home/ponshane/work_dir/CLTM-Experiments/Data/MLDoc/MLDoc-Chinese.txt",
}
corpus = JointCorpus(**corpus_files)

# Hand the control dictionary to the corpus, then run the conversion step.
# (ptm project calls; exact semantics are defined there — confirm if in doubt.)
corpus.update_doctionary("/home/ponshane/work_dir/CLTM-Experiments/Data/MLDoc/hand-craft-control-dictionary.txt")
corpus.convert_raw_corpus_to_trainable_corpus()

In [None]:
# Train one JointGibbsLDA model per topic count (10, 20, 30, 40, 50) on the
# corpus held in `corpus`, and pickle each fitted model to the hand-craft
# comparison folder for MLDoc.
output_dir = '/home/ponshane/work_dir/CLTM-Experiments/Results/MLDoc/hand-craft-mapping-comparison/'
# Create the destination up front: otherwise a missing directory only surfaces
# at open() time, after a fit of up to 1000 iterations has already been spent.
os.makedirs(output_dir, exist_ok=True)

for n_topic in range(10, 51, 10):
    model = JointGibbsLDA(n_doc=len(corpus.docs), n_concept=corpus.n_concept, n_s_vocab=corpus.n_s_vocab,
                          n_t_vocab=corpus.n_t_vocab, n_topic=n_topic)
    model.fit(corpus.docs, corpus.language_flags, max_iter=1000)

    # One pickle per topic count, e.g. "JointLDA-hand-craft-10topics.pickle".
    file_name = "JointLDA-hand-craft-" + str(n_topic) + "topics.pickle"
    with open(os.path.join(output_dir, file_name), 'wb') as handle:
        pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)