In [1]:
import os

from gensim.corpora import Dictionary, MmCorpus
from gensim.models import TfidfModel
from gensim.models.nmf import Nmf

from codebase.utils import MongoConnector

In [2]:
# Output folders: corpora_path receives the dictionary, the serialized corpus
# and the metadata CSV; model_path receives the fitted models.
corpora_path, model_path = "./corpora/", "./models/"
# File-name prefix for the artifacts this notebook writes.
fileTag = "FirstWeek-March-Tweets"

## Step 1: Build the Dictionary object

In [3]:
# MongoConnector is a project helper (codebase.utils); presumably it reads the
# connection settings from ./config.ini -- confirm against its implementation.
conn = MongoConnector("./config.ini")
# NOTE(review): this looks like it selects the "FirstWeek_March" collection that
# the later conn.data_streaming_from_collection(...) calls iterate over -- confirm.
conn.get_collection_cursor("FirstWeek_March")

In [4]:
%%time
# Build the gensim Dictionary (token <-> integer id mapping) from every stored
# document that has a "tokens" field.
query = {"tokens":{"$exists": True}}

# Lazily yield one token list per document, dropping single-character tokens.
# Nothing is materialised: documents are pulled from the collection one at a
# time while the dictionary consumes the generator.
token_stream = (
    [tok for tok in doc["tokens"] if len(tok) > 1]
    for doc in conn.data_streaming_from_collection(query=query)
)

dct = Dictionary()
# A single add_documents() call over the whole stream. Calling it once per
# tweet with a one-element list (as before) restarts gensim's document counter
# on every call, so its periodic progress/prune check and its end-of-batch
# bookkeeping ran for every tweet instead of once per batch.
dct.add_documents(token_stream)

CPU times: user 43.2 s, sys: 482 ms, total: 43.7 s
Wall time: 56 s


In [5]:
# Prune the vocabulary in place and report its size before and after.
vocab_size_before = len(dct)
print("Original size of vocabs: {}".format(vocab_size_before))
# control the vocabulary: no_below / no_above are gensim's document-frequency
# bounds; keep_n is set to the current vocabulary size, so it imposes no
# additional cap on top of those two bounds.
dct.filter_extremes(
    no_below=20,
    no_above=0.5,
    keep_n=vocab_size_before,
    keep_tokens=None,
)
print("Truncated size of vocabs: {}".format(len(dct)))

Original size of vocabs: 335223
Truncated size of vocabs: 29667


## Step 2: Apply the TF-IDF representation

In [6]:
%%time
# Second pass over the same documents: convert each one to a bag-of-words
# vector and, for every vector that is kept, write one metadata row
# (position in bow_corpus, tweet id, creation time) to
# <corpora_path><fileTag>-Meta.csv.
query = {"tokens":{"$exists": True}}
bow_corpus = []
position_index = 0

# Make sure the output folder exists so the cell works on a fresh checkout.
os.makedirs(corpora_path, exist_ok=True)

# The context manager guarantees the CSV is flushed and closed even if the
# loop raises part-way through (e.g. a document without "created_at");
# previously the handle leaked and buffered rows could be lost.
with open("{}{}-Meta.csv".format(corpora_path, fileTag), "w") as meta_wf:
    meta_wf.write("position_index,id_str,created_time\n")
    for doc in conn.data_streaming_from_collection(query=query):
        # gensim's Dictionary.doc2bow will ignore words that are not in dictionary by default
        bow_per_doc = dct.doc2bow(doc["tokens"])
        # Keep only documents with more than 4 bag-of-words entries.
        if len(bow_per_doc) > 4:
            # assumes doc["created_at"] is a datetime-like object -- it must provide strftime()
            timestamp = doc["created_at"].strftime("%Y-%m-%d %H:%M:%S")
            meta_wf.write("{},{},{}\n".format(position_index, doc["id_str"], timestamp))
            bow_corpus.append(bow_per_doc)
            # position_index always equals the index of the row just appended to bow_corpus
            position_index += 1

CPU times: user 36.6 s, sys: 929 ms, total: 37.5 s
Wall time: 45.7 s


In [10]:
%%time
# Fit the TF-IDF weighting on the bag-of-words corpus.
tfidf_model = TfidfModel(corpus=bow_corpus)
# Apply the fitted model to the whole corpus to obtain its TF-IDF view.
tfidf_corpus = tfidf_model[bow_corpus]

CPU times: user 2.21 s, sys: 0 ns, total: 2.21 s
Wall time: 2.2 s


## Step 3: Train NMF to extract topic patterns

In [11]:
import logging

# Route INFO-level (and above) log records to the notebook output, formatted
# as "time:LEVEL:message", so training progress is visible while the model fits.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s:%(levelname)s:%(message)s",
)

In [None]:
%%time
# Fit a 50-topic NMF model on the TF-IDF corpus; id2word lets the model report
# topics as words rather than integer ids.
# random_state pins the random initialisation so that re-running the notebook
# reproduces the same topics (without it every run yields different topics and
# the topic ids inspected below stop being comparable between runs).
nmf = Nmf(tfidf_corpus, id2word=dct, num_topics=50, random_state=42)
# CPU times: user 21min 12s, sys: 12min, total: 33min 13s
# Wall time: 5min 29s

In [21]:
# phi of one topic: the 20 highest-weighted (word, weight) pairs of topic 9
nmf.show_topic(
    topicid=9,
    topn=20,
)

[('county', 0.14236108682730447),
 ('officials', 0.019101592353724235),
 ('king', 0.018942966867407436),
 ('two', 0.01594921330650367),
 ('presumptive', 0.014868525580645197),
 ('santa', 0.012602169478004594),
 ('questions', 0.012228678584015079),
 ('florida', 0.011518394265381204),
 ('montgomery', 0.009187398302580222),
 ('new', 0.008964702096575038),
 ('man', 0.008872473133647307),
 ('broward', 0.008868353411455773),
 ('clara', 0.008840548435683908),
 ('resident', 0.007922327904062918),
 ('lee', 0.007636380220626515),
 ('los', 0.007213491226627287),
 ('angeles', 0.007131223085317641),
 ('department', 0.00705121820635023),
 ('residents', 0.0069208504385192245),
 ('positive', 0.006912661213467376)]

In [22]:
# theta of one document: (topic id, weight) pairs for the document at
# position 7 of the TF-IDF corpus
doc_tfidf = tfidf_corpus[7]
nmf[doc_tfidf]

[(7, 0.10944433115677271),
 (21, 0.021792774115770488),
 (24, 0.8298059263318602),
 (34, 0.015225289333366417)]

## Step 4: Model persistence

In [None]:
# Persist every artifact needed to reuse the pipeline without retraining:
# the dictionary and the TF-IDF corpus go to corpora_path, the fitted TF-IDF
# and NMF models go to model_path.

# Create the output folders first: the save calls do not create missing
# directories, and failing here after a long training run is easily avoided.
os.makedirs(corpora_path, exist_ok=True)
os.makedirs(model_path, exist_ok=True)

dct.save('{}{}.dict'.format(corpora_path,fileTag))
MmCorpus.serialize('{}{}-tf-idf.mm'.format(corpora_path,fileTag), tfidf_corpus)

tfidf_model.save('{}{}-tf-idf.model'.format(model_path,fileTag))

# Derive the suffix from the trained model so the file name cannot drift from
# the number of topics actually fitted (yields "-50topics" for a 50-topic model).
model_suffix = "-{}topics".format(nmf.num_topics)
nmf.save("{}{}{}.model".format(model_path,fileTag,model_suffix))

# To reload the artifacts later (a serialized Matrix Market corpus is opened
# with the MmCorpus constructor, not with .load):
# dct = Dictionary.load('{}{}.dict'.format(corpora_path, fileTag))
# tfidf_corpus = MmCorpus('{}{}-tf-idf.mm'.format(corpora_path, fileTag))
# tfidf_model = TfidfModel.load('{}{}-tf-idf.model'.format(model_path, fileTag))
# nmf = Nmf.load("{}{}{}.model".format(model_path, fileTag, model_suffix))