In [52]:
import logging
import sys
sys.path.append('..')

from ptm import JointGibbsLDA, JointCorpus
from ptm.utils import get_top_words
from codebase.topic_evaluator import *
from MLDoc_Class_predictor_by_theta import transform_pd_to_numpy_data

import pandas as pd
import numpy as np

In [50]:
def normalize(v):
    """Scale a vector by its L1 norm so that its absolute values sum to one.

    Parameters
    ----------
    v : array-like
        Numeric vector (integer count rows are accepted and promoted to float).

    Returns
    -------
    numpy.ndarray
        ``v / sum(|v|)``; an all-zero input yields an all-zero float array of
        the same shape instead of dividing by zero.
    """
    v = np.asarray(v)
    # np.finfo / true division only make sense for inexact dtypes; integer
    # inputs (e.g. raw counts) are promoted so the zero case cannot raise.
    if not np.issubdtype(v.dtype, np.inexact):
        v = v.astype(float)
    norm = np.linalg.norm(v, ord=1)
    if norm == 0:
        # A zero L1 norm means every entry is zero; there is nothing to scale.
        return np.zeros_like(v)
    return v / norm

In [None]:
# Load the tagged English + Chinese MLDoc corpus as a pandas object.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# from a trusted location.
Corpus = pd.read_pickle("../out/MLDoc/Shuffled_RS168_tagged_englishAndchinese_corpus_pd.pkl")

In [9]:
# source_idx and target_idx remember the correspondence between a row's pandas
# index and that document's line position in the per-language corpus file.
source_idx = dict()
target_idx = dict()

# `with` guarantees both files are flushed and closed even if a row raises
# part-way through the export.
with open("../out/JointLDA_Inputs/MLDoc_English.txt", "w") as source_file, \
        open("../out/JointLDA_Inputs/MLDoc_Chinese.txt", "w") as target_file:
    # Iterate the two needed columns directly; rows in any other language
    # are skipped, exactly as before.
    for idx, language, tokens in zip(Corpus.index, Corpus["language"], Corpus["extracted_text"]):
        if language == "English":
            source_idx[idx] = len(source_idx)
            source_file.write(" ".join(tokens) + "\n")
        elif language == "Chinese":
            target_idx[idx] = len(target_idx)
            target_file.write(" ".join(tokens) + "\n")

In [11]:
# prepare corpus
# Build the joint corpus from the two per-language text files
# (one space-joined document per line).
corpus = JointCorpus(source_corpus_file="../out/JointLDA_Inputs/MLDoc_English.txt",
                     target_corpus_file="../out/JointLDA_Inputs/MLDoc_Chinese.txt")

In [None]:
# Filter the bilingual dictionary down to the pairs whose English word is in
# the source vocabulary AND whose Chinese word is in the target vocabulary,
# and write them as "source_word,target_word" lines.
# NOTE(review): the dictionary path is an absolute, machine-specific location;
# consider moving the file next to the other inputs.
source_vocab = corpus.source_dict.token2id
target_vocab = corpus.target_dict.token2id
n_pairs = 0

# `with` closes both files even if a line raises part-way through.
with open("/home/ponshane/Downloads/zh-en.txt", "r") as en_cn_dict, \
        open("../out/JointLDA_Inputs/MLDoc_EN_ZH_dictionaries.csv", "w") as f:
    # Stream the file line by line instead of materialising it with readlines().
    for line in en_cn_dict:
        fields = line.split()
        if len(fields) < 2:
            # Blank or malformed line: skip it instead of raising IndexError.
            continue
        # Each dictionary line is "<target (Chinese) word> <source (English) word>".
        target_word, source_word = fields[0], fields[1]
        if source_word in source_vocab and target_word in target_vocab:
            f.write(source_word + "," + target_word + "\n")
            n_pairs += 1

# One summary line instead of printing every kept pair.
print(n_pairs, "dictionary pairs kept")

In [29]:
# Register the filtered bilingual pairs with the corpus, then build the
# index-based training corpus. Per the log output recorded for this cell, this
# generates `self.docs` and `self.language_flags` on the corpus object.
# NOTE(review): `update_doctionary` is spelled as called here — presumably the
# library's own method name; confirm before "fixing" it.
corpus.update_doctionary("../out/JointLDA_Inputs/MLDoc_EN_ZH_dictionaries.csv")
corpus.convert_raw_corpus_to_trainable_corpus()

2019-01-04 16:18:06 INFO:JointCorpus:size of concept: 514, size of source vocab: 11021, size of target vocab: 6688
2019-01-04 16:18:06 INFO:JointCorpus:Successfully generate idx corpus 'self.docs' and language flags 'self.language_flags'


In [None]:
# train model
# n_topic is reused by later cells (e.g. the top-word listing).
# NOTE(review): no random seed is set anywhere in this notebook; if the
# sampler is stochastic, results will differ between runs — confirm and seed.
n_topic=10
model = JointGibbsLDA(n_doc=len(corpus.docs), n_concept=corpus.n_concept, n_s_vocab=corpus.n_s_vocab,
                      n_t_vocab=corpus.n_t_vocab, n_topic=n_topic)
# Fit on the index corpus and its per-document language flags for 100 iterations.
model.fit(corpus.docs, corpus.language_flags, max_iter=100)
# ~ 11 mins

In [None]:
# Debug inspection: displays the whole pandas-index -> Chinese-corpus-position
# mapping (one entry per Chinese document, so the output can be very long).
target_idx

In [126]:
# Collect one L1-normalised row of model.DT per corpus entry, in Corpus order.
# Target documents are addressed at an offset of len(source_idx), i.e. the
# rows of model.DT are assumed to hold all source documents first, then the
# target documents — TODO confirm against JointCorpus.
n_source_docs = len(source_idx)
thetas = []
for pd_index in Corpus.index:
    if pd_index in source_idx:
        dt_row = source_idx[pd_index]
    elif pd_index in target_idx:
        dt_row = n_source_docs + target_idx[pd_index]
    else:
        # Index belongs to neither language mapping: no theta is recorded.
        continue
    thetas.append(normalize(model.DT[dt_row, :]))

In [41]:
from sklearn.model_selection import ParameterGrid
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import parfit.parfit as pf

In [127]:
# Hyper-parameter grid for the logistic-regression classifier: only the
# regularisation constant C varies; penalty, solver and multi_class are fixed.
grid = {
        'C': [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1e0],
        'penalty': ['l2'],
        'solver': ['lbfgs'],
        'multi_class': ['ovr']}

paramGrid = ParameterGrid(grid)

    
# Attach the per-document topic distributions as a new column.
# NOTE: this mutates Corpus in place; it requires len(thetas) == len(Corpus).
Corpus["theta"] = thetas
# NOTE(review): transform_pd_to_numpy_data is a project helper; judging by its
# use below it returns a mapping with x/y entries for train, dev and test —
# confirm what language="Chinese" selects.
Data_Object, _ = transform_pd_to_numpy_data(Corpus, language="Chinese")
# Search the grid: models are fitted on the train split and scored on the dev
# split with accuracy_score.
bestModel, bestScore, _, _ = pf.bestFit(LogisticRegression, paramGrid,
           Data_Object["x_train"], Data_Object["y_train"], Data_Object["x_dev"], Data_Object["y_dev"],
           metric = accuracy_score, scoreLabel = "Accuracy", showPlot=False)

# Score the selected model on the test split and print it next to the best
# dev score.
# NOTE(review): the recorded output (0.15825 test vs 0.85 dev) shows a very
# large gap between the two splits — worth investigating.
acc = bestModel.score(X=Data_Object["x_test"], y=Data_Object["y_test"])
print(acc, bestScore)

-------------FITTING MODELS-------------


[Parallel(n_jobs=-1)]: Batch computation too fast (0.0321s.) Setting batch_size=12.
[Parallel(n_jobs=-1)]: Done   2 out of  11 | elapsed:    0.1s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done   4 out of  11 | elapsed:    0.1s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   6 out of  11 | elapsed:    0.2s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   8 out of  11 | elapsed:    0.2s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:    0.2s finished


-------------SCORING MODELS-------------
This model/metric cannot use predict_proba. Using predict for scoring instead.
This model/metric cannot use predict_proba. Using predict for scoring instead.
This model/metric cannot use predict_proba. Using predict for scoring instead.
This model/metric cannot use predict_proba. Using predict for scoring instead.
This model/metric cannot use predict_proba. Using predict for scoring instead.
This model/metric cannot use predict_proba. Using predict for scoring instead.
This model/metric cannot use predict_proba. Using predict for scoring instead.
This model/metric cannot use predict_proba. Using predict for scoring instead.
This model/metric cannot use predict_proba. Using predict for scoring instead.
This model/metric cannot use predict_proba. Using predict for scoring instead.
This model/metric cannot use predict_proba. Using predict for scoring instead.
0.15825 0.85


[Parallel(n_jobs=-1)]: Done   8 out of  11 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:    0.1s finished


In [57]:
# Print, for every topic, the 30 top words as returned by get_top_words.
for topic_id in range(n_topic):
    words_for_topic = get_top_words(model.TW, corpus.reconcatenate_dict, topic_id, n_words=30)
    print('Topic', topic_id, ': ', words_for_topic)

Topic 0 :  ['中国' 'say' 'cent' '增长' 'price' '企业' 'tonne' ('oil', '石油') '亿元' '报导' '北京'
 'trader' '路透社' '日电' '今年' ('month', '月份') '亿美元' '同比' '投资' 'barrel' '发展'
 '万吨' ('increase', '增加') 'gas' ('week', '星期') '经济' '指出' '人民币' '国家'
 ('exit', '出口')]
Topic 1 :  ['say' 'market' 'trade' ('dollar', '美元') 'trader' 'dealer' '表示' 'close'
 ('week', '星期') 'day' '路透社' 'point' 'future' '日电' 'yen' 'end' '中国' 'mark'
 'price' 'level' ('rise', '上升') 'see' ('fall', '秋天') ('contract', '合同')
 ('sell', '出售') ('stock', '股票') '美国' 'analyst' 'trading' 'gold']
Topic 2 :  ['say' ('corporate', '公司') '香港' '投资' 'police' ('people', '人民') '路透社'
 ('government', '政府') '日电' 'force' 'official' ('peace', '平安') 'kill'
 ('city', '城市') ('shanghai', '上海') 'rebel' '集团' '台湾' 'attack' 'report'
 'town' '港元' 'tell' 'spokesman' 'troop' 'security' '项目' 'refugee'
 ('capital', '首都') 'area']
Topic 3 :  ['表示' '经济' '路透社' '成长' '央行' '可能' '预期' '日本' '日电' '指出' '美国' '德国'
 ('rate', '利率') 'win' '认为' ('current', '目前') '分析师' '显示' '维持' '准备' 'say'
 '经济学家'

In [None]:
# Inspect the training-corpus entry 114 positions past the source documents —
# presumably target (Chinese) document 114; verify the source-then-target layout.
corpus.docs[len(source_idx)+114]

In [125]:
# Sample target_idx entries (pandas index: position in the Chinese corpus): 745: 0, 2068: 1, 1171: 95, 5415: 114, 50: 224
#target_idx

In [None]:
# Look up every token of the document at pandas index 5415 in corpus.target_dict.
# NOTE(review): what target_dict[...] returns for a token is not visible here — confirm.
[corpus.target_dict[word] for word in Corpus.loc[5415]["extracted_text"]]

In [None]:
# Show the extracted tokens stored for the document at pandas index 50.
Corpus.loc[50]["extracted_text"]

In [104]:
# Sum the per-document topic distributions separately for each language.
# The accumulators must start at zero: np.empty returns uninitialised memory,
# so accumulating onto it with += can silently add garbage to the totals.
# Their length follows n_topic instead of a hard-coded 10.
ch_arr = np.zeros(n_topic)
en_arr = np.zeros(n_topic)
# Iterate only the two needed columns; rows in any other language are ignored.
for language, theta in zip(Corpus["language"], Corpus["theta"]):
    if language == "Chinese":
        ch_arr += theta
    elif language == "English":
        en_arr += theta

In [107]:
# Average topic distribution of the Chinese documents: summed thetas divided by
# len(corpus.target_corpus) (presumably the number of target documents — confirm).
ch_arr / len(corpus.target_corpus)

array([0.12033252, 0.1129089 , 0.08639831, 0.14099912, 0.07939396,
       0.05317105, 0.11453081, 0.12498181, 0.05801409, 0.10943612])

In [108]:
# Average topic distribution of the English documents: summed thetas divided by
# len(corpus.source_corpus) (presumably the number of source documents — confirm).
en_arr / len(corpus.source_corpus)

array([0.06346818, 0.11426893, 0.07896215, 0.0539864 , 0.12387346,
       0.12879408, 0.12989076, 0.070026  , 0.10865746, 0.12823925])