In [67]:
import sys
# Add the project's src directory to sys.path so that our own modules can be imported
sys.path.append("/home/ponshane/work_dir/CLTM/src")

from codebase.PMLDA import *
from gensim.models.wrappers import LdaMallet

import pandas as pd
import numpy as np
from sklearn.model_selection import ParameterGrid
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import parfit.parfit as pf
import os

In [64]:
def transform_pd_to_numpy_data(Corpus, language, source_language="English"):
    """Split a tagged corpus DataFrame into train/dev/test NumPy arrays.

    Training and development rows are taken from ``source_language``; test
    rows are taken from ``language``.

    Parameters
    ----------
    Corpus : pandas.DataFrame
        Must provide the columns ``"Class"``, ``"Data_type"`` (values
        ``"train"``, ``"dev"``, ``"test"``), ``"language"`` and ``"theta"``
        (one sequence of numbers per row).
    language : str
        Value of the ``"language"`` column used to select the test rows.
    source_language : str, optional
        Value of the ``"language"`` column used to select the train and dev
        rows. Defaults to ``"English"``, which is what this function used
        before the parameter existed.

    Returns
    -------
    Data_Object : dict
        Keys ``x_train``, ``y_train``, ``x_dev``, ``y_dev``, ``x_test`` and
        ``y_test``. The ``x_*`` entries are float arrays built from
        ``"theta"``; the ``y_*`` entries hold the raw ``"Class"`` values.
    class_dictionary : dict
        Maps every distinct class label to a consecutive integer, in the
        order ``Corpus.groupby('Class')`` yields the labels.
    """
    # Series.items() replaces Series.iteritems(), which is no longer
    # available in recent pandas releases.
    class_dictionary = {}
    for key, _ in Corpus.groupby('Class').size().items():
        class_dictionary[key] = len(class_dictionary)

    def _select(data_type, lang):
        # Return (features, labels) for the rows of one split in one language.
        rows = Corpus[(Corpus["Data_type"] == data_type) &
                      (Corpus["language"] == lang)]
        features = np.array(rows["theta"].values.tolist(), dtype="float")
        labels = np.array(rows["Class"])
        return features, labels

    Data_Object = {}
    Data_Object["x_train"], Data_Object["y_train"] = _select("train", source_language)
    Data_Object["x_dev"], Data_Object["y_dev"] = _select("dev", source_language)
    Data_Object["x_test"], Data_Object["y_test"] = _select("test", language)

    return Data_Object, class_dictionary

# MLDoc English to Chinese

In [24]:
# Inputs for PMLDA: the two monolingual Mallet LDA models and the word-vector file.
# Note that the Chinese ("-cn") model is passed as the source and the
# English ("-en") model as the target.
CN_MODEL_PATH = "../out/Mallet_Mono_LDA/MLDoc-Chinese-vs-English-iter500-alpha01-cn.model"
EN_MODEL_PATH = "../out/Mallet_Mono_LDA/MLDoc-English-vs-Chinese-iter500-alpha01-en.model"
VECTOR_PATH = "../out/CLTM_Inputs/90dim-MLDoc-engAndchi.txt"

pm = PMLDA(
    source_model_path=CN_MODEL_PATH,
    target_model_path=EN_MODEL_PATH,
    vector_path=VECTOR_PATH,
)

pm.train(top_n_representative_words=50, num_of_topic=10)

INFO:root:Start to initialize PMLDA
INFO:gensim.utils:loading LdaMallet object from ../out/Mallet_Mono_LDA/MLDoc-Chinese-vs-English-iter500-alpha01-cn.model
INFO:gensim.utils:loading id2word recursively from ../out/Mallet_Mono_LDA/MLDoc-Chinese-vs-English-iter500-alpha01-cn.model.id2word.* with mmap=None
INFO:gensim.utils:loaded ../out/Mallet_Mono_LDA/MLDoc-Chinese-vs-English-iter500-alpha01-cn.model
INFO:gensim.utils:loading LdaMallet object from ../out/Mallet_Mono_LDA/MLDoc-English-vs-Chinese-iter500-alpha01-en.model
INFO:gensim.utils:loading id2word recursively from ../out/Mallet_Mono_LDA/MLDoc-English-vs-Chinese-iter500-alpha01-en.model.id2word.* with mmap=None
INFO:gensim.utils:loaded ../out/Mallet_Mono_LDA/MLDoc-English-vs-Chinese-iter500-alpha01-en.model
INFO:gensim.models.utils_any2vec:loading projection weights from ../out/CLTM_Inputs/90dim-MLDoc-engAndchi.txt
INFO:gensim.models.utils_any2vec:loaded (26715, 90) matrix from ../out/CLTM_Inputs/90dim-MLDoc-engAndchi.txt
INFO:root

[3, 2, 4, 2, 3, 2]


In [25]:
# Display the mapping from cluster index to the labels of its member topics
# (labels look like 'C2' / 'E4'); presumably populated by pm.train() — confirm.
pm.member_of_clusters

{0: ['C2', 'C8', 'E4'],
 1: ['C6', 'E1'],
 2: ['C1', 'C7', 'E3', 'E9'],
 3: ['C3', 'C4'],
 4: ['C0', 'C5', 'C9'],
 5: ['E2', 'E6'],
 6: ['E7'],
 7: ['E8'],
 8: ['E5'],
 9: ['E0']}

In [26]:
# Prepare the lookup tables needed to recalculate theta.
# Key: original (monolingual) topic index; value: index of the cross-lingual
# topic (cluster) that the original topic was assigned to.
chinese_topic_dictionary = {}
english_topic_dictionary = {}
for cross_index, original_indexes in pm.member_of_clusters.items():
    for each_member in original_indexes:
        # Member labels carry a language letter ("C" or "E") plus the topic number.
        if "C" in each_member:
            target_dictionary, language_letter = chinese_topic_dictionary, "C"
        elif "E" in each_member:
            target_dictionary, language_letter = english_topic_dictionary, "E"
        else:
            # Labels with neither letter are ignored.
            continue
        target_dictionary[int(each_member.strip(language_letter))] = cross_index

In [55]:
# Paths to the Mallet doc-topics output files, one per language
# (presumably one line of topic proportions per document — confirm).
chinese_doc_topics = "../out/Mallet_Mono_LDA/MLDoc-Chinese-vs-English-topic10-doctopics.txt"
english_doc_topics = "../out/Mallet_Mono_LDA/MLDoc-English-vs-Chinese-topic10-doctopics.txt"

# Load the Chinese LdaMallet model. recalculate_theta() calls this object's
# read_doctopics() to parse the doc-topics files, including the English one.
chinese_model = LdaMallet.load("../out/Mallet_Mono_LDA/MLDoc-Chinese-vs-English-iter500-alpha01-cn.model")

INFO:gensim.utils:loading LdaMallet object from ../out/Mallet_Mono_LDA/MLDoc-Chinese-vs-English-iter500-alpha01-cn.model
INFO:gensim.utils:loading id2word recursively from ../out/Mallet_Mono_LDA/MLDoc-Chinese-vs-English-iter500-alpha01-cn.model.id2word.* with mmap=None
INFO:gensim.utils:loaded ../out/Mallet_Mono_LDA/MLDoc-Chinese-vs-English-iter500-alpha01-cn.model


In [56]:
def recalculate_theta(mapping_dictionary, original_theta_file, model=None,
                      num_cross_topics=None):
    """Re-express per-document topic weights over cross-lingual topics.

    Every document read from ``original_theta_file`` is a sequence of
    ``(topic_id, weight)`` tuples. Each weight is added to the slot of the
    cross-lingual topic that ``mapping_dictionary`` assigns to ``topic_id``.

    Parameters
    ----------
    mapping_dictionary : dict
        Maps an original topic index to a cross-lingual topic index.
    original_theta_file : str
        Path handed to ``model.read_doctopics(fname=...)``.
    model : object, optional
        Any object exposing ``read_doctopics(fname=...)``. When omitted, the
        notebook-level ``chinese_model`` is used, as before this parameter
        existed.
    num_cross_topics : int, optional
        Length of every returned vector. Defaults to
        ``len(mapping_dictionary)``, the previous behaviour.

    Returns
    -------
    list of list of float
        One vector per document, in the order the reader yields them.

    Raises
    ------
    KeyError
        If a document contains a topic id missing from ``mapping_dictionary``.
    IndexError
        If a mapped cross-lingual index is >= ``num_cross_topics``.
    """
    if model is None:
        # Backward-compatible fallback to the model loaded in the notebook.
        model = chinese_model
    if num_cross_topics is None:
        num_cross_topics = len(mapping_dictionary)

    all_transformed_theta = []

    original_theta = model.read_doctopics(fname=original_theta_file)
    for each_theta in original_theta:
        transformed_theta = [0.0] * num_cross_topics

        # Use the topic id stored in the tuple, not the tuple's position:
        # the list may omit topics or list them out of index order, in which
        # case the position does not identify the topic.
        for topic_id, weight in each_theta:
            transformed_theta[mapping_dictionary[topic_id]] += weight

        all_transformed_theta.append(transformed_theta)

    return all_transformed_theta

In [65]:
# Remap each language's document-topic vectors onto the cross-lingual topic indexes.
chinese_transformed_theta = recalculate_theta(chinese_topic_dictionary, chinese_doc_topics)
english_transformed_theta = recalculate_theta(english_topic_dictionary, english_doc_topics)
# Concatenate with the Chinese documents first, then the English documents.
# NOTE(review): this order has to match the row order of the corpus DataFrame
# that receives these vectors as its "theta" column — confirm.
PMLDA_theta = chinese_transformed_theta + english_transformed_theta

In [66]:
# Load the tagged English + Chinese MLDoc corpus back from disk.
# (read_pickle should only ever be pointed at files we produced ourselves.)
Corpus = pd.read_pickle("../out/MLDoc/tagged_englishAndchinese_corpus_pd.pkl")

# Logistic-regression settings to search over; only C actually varies.
grid = dict(
    C=[0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1e0],
    penalty=['l2'],
    n_jobs=[-1],
    solver=['lbfgs'],
    multi_class=['ovr'],
)

paramGrid = ParameterGrid(grid)

# Work on a copy so the loaded corpus itself is left untouched.
each_corpus = Corpus.copy()
each_corpus["theta"] = PMLDA_theta

# Train/dev rows are English; test rows are Chinese.
Data_Object, _ = transform_pd_to_numpy_data(each_corpus, language="Chinese")

x_train, y_train = Data_Object["x_train"], Data_Object["y_train"]
x_dev, y_dev = Data_Object["x_dev"], Data_Object["y_dev"]
x_test, y_test = Data_Object["x_test"], Data_Object["y_test"]

# Fit one model per grid point and keep the one with the best dev accuracy.
bestModel, bestScore, _, _ = pf.bestFit(
    LogisticRegression, paramGrid,
    x_train, y_train, x_dev, y_dev,
    metric=accuracy_score, scoreLabel="Accuracy", showPlot=False,
)

# Accuracy of the selected model on the test split.
acc = bestModel.score(X=x_test, y=y_test)
print(acc)

-------------FITTING MODELS-------------


[Parallel(n_jobs=-1)]: Batch computation too fast (0.0284s.) Setting batch_size=14.
[Parallel(n_jobs=-1)]: Done   2 out of  11 | elapsed:    0.1s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done   4 out of  11 | elapsed:    0.1s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   6 out of  11 | elapsed:    0.2s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   8 out of  11 | elapsed:    0.2s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:    0.2s finished


-------------SCORING MODELS-------------
This model/metric cannot use predict_proba. Using predict for scoring instead.
This model/metric cannot use predict_proba. Using predict for scoring instead.
This model/metric cannot use predict_proba. Using predict for scoring instead.
This model/metric cannot use predict_proba. Using predict for scoring instead.
This model/metric cannot use predict_proba. Using predict for scoring instead.
This model/metric cannot use predict_proba. Using predict for scoring instead.
This model/metric cannot use predict_proba. Using predict for scoring instead.
This model/metric cannot use predict_proba. Using predict for scoring instead.
This model/metric cannot use predict_proba. Using predict for scoring instead.
This model/metric cannot use predict_proba. Using predict for scoring instead.
This model/metric cannot use predict_proba. Using predict for scoring instead.
0.43


[Parallel(n_jobs=-1)]: Done   8 out of  11 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:    0.1s finished
