In [1]:
import configparser
import logging
import pickle
from urllib.parse import quote_plus

from gensim import corpora, models
from pymongo import MongoClient

from codebase.topic_utilities import make_perplexity_plots

In [2]:
# Load the connection settings from the local config file and connect to MongoDB.
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it parsed, so fail here with a clear message instead of a KeyError below.
if not config.read('./config.ini'):
    raise FileNotFoundError("Could not read './config.ini'")

# Connection settings from the [Mongo] section.
MongoServer = config["Mongo"]["URI"]
MongoDB = config["Mongo"]["Database"]
MongoUser = config["Mongo"]["User"]
MongoPW = config["Mongo"]["PW"]

# Percent-escape the credentials: characters such as '@', ':', '/' or '%' in a
# user name or password would otherwise corrupt the connection string.
# NOTE(review): assumes config.ini stores the raw, unescaped credentials.
uri = (
    "mongodb://" + quote_plus(MongoUser) + ":" + quote_plus(MongoPW)
    + "@" + MongoServer
    + "/?authSource=" + MongoDB + "&authMechanism=SCRAM-SHA-1"
)

client = MongoClient(uri)
target_collection = client[MongoDB]["FirstWeek_March"]

In [4]:
%%time
docs = []
for i, doc in enumerate(target_collection.find({"text":{"$exists": True}})):
    docs.append(doc["tokens"])

CPU times: user 4.11 s, sys: 1.39 s, total: 5.51 s
Wall time: 20.3 s


In [11]:
%%time
phrases = models.phrases.Phrases(docs, min_count=20, scoring="npmi", threshold=0.6)
bigram = models.phrases.Phraser(phrases)  # construct faster model (this is only an wrapper)
bigram_docs = [bigram[doc] for doc in docs]

CPU times: user 27.7 s, sys: 126 ms, total: 27.8 s
Wall time: 27.8 s


In [13]:
%%time
clean_bigram_docs = []
for doc in bigram_docs:
    clean_bigram_docs.append([word for word in doc if len(word) > 2])

CPU times: user 1.24 s, sys: 7.65 ms, total: 1.25 s
Wall time: 1.24 s


In [19]:
%%time
token_dictionary = corpora.Dictionary(clean_bigram_docs)
token_dictionary.filter_extremes(no_below=20, no_above=0.5)
token_dictionary.compactify()

CPU times: user 6.47 s, sys: 6.28 ms, total: 6.48 s
Wall time: 6.48 s


In [20]:
# Convert every cleaned document to its bag-of-words representation.
corpus = list(map(token_dictionary.doc2bow, clean_bigram_docs))

In [22]:
%%time

# n_topics 為主題數設定
n_topics = 50

# flag 可設成任何能幫助你紀錄的字串
# 建議以 日期-主題數 的方式來記憶
flag = "Covid19-FirstWeek_March-" + str(n_topics) + "-topics"

logging.basicConfig(filename='./logs/'+ flag +'.log',
                    format="%(asctime)s:%(levelname)s:%(message)s",
                    level=logging.INFO)

# 這段操作可以讓模型保有 5% 的資料來做測試資料，以便衡量模型是否收斂
estimated_chunk_size = int(len(corpus) * 0.095)

# 套用 gensim.models 的 LdaMulticore 以多核心加速訓練主題模型
# 以下設定大致無須調校，唯獨 passes (對等於神經模型的 epoch) 在大文本的情況下可能要增加
# 配合 make_perplexity_plots 來觀察是否收斂，若否則可以考慮增加
ldamodel = models.ldamulticore.LdaMulticore(corpus=corpus, num_topics=n_topics, id2word = token_dictionary,\
                                    chunksize=estimated_chunk_size, alpha=1/n_topics, eta=0.01,\
                                    passes=50, eval_every=10)

logging.shutdown()

CPU times: user 21min 25s, sys: 1min, total: 22min 25s
Wall time: 22min 11s


In [23]:
# Read this run's training log and write the perplexity plot next to it as a PDF.
make_perplexity_plots(
    log_path='./logs/' + flag + '.log',
    output_path='./logs/' + flag + '.pdf',
)

In [24]:
# Once the model is confirmed to have converged, save it for later use.
ldamodel.save(fname="./models/" + flag)

In [25]:
%%time
import pyLDAvis
import pyLDAvis.gensim as gensimvis

# 透過 pyLDAvis 的 gensimvis 將模型轉換成 LDAvis 需求格式
vis_data = gensimvis.prepare(ldamodel, corpus, token_dictionary, sort_topics=False)
pyLDAvis.save_html(data=vis_data, fileobj="./models/"+ flag +".html")

CPU times: user 1min 47s, sys: 599 ms, total: 1min 47s
Wall time: 1min 54s
