In [1]:
import logging
import multiprocessing

import ujson
from tqdm import tqdm 
from gensim.models.ldamulticore import LdaMulticore

In [2]:
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO  # ipython sometimes messes up the logging setup; restore

In [3]:
path_data = '/ssd/ms3u14/'
path_preprocessed = path_data + 'news_cleaned_2018_02_13_all.preprocessed.jsonl'

In [4]:
def jsonl_iterator(path):
    with tqdm() as progress:
        with open(path, 'r') as in_jsonl:
            for line in in_jsonl:
                yield ujson.loads(line)
                progress.update()

In [5]:
def iterate_corpus(word2id):
    for article in jsonl_iterator(path_preprocessed):
        yield [word2id[word] for word in article['content']]

In [6]:
def generate_id2word_count():
    last_id = 0
    word2id_count = {}
    for article in jsonl_iterator(path_preprocessed):
        for word in article['content']:
            if word not in word2id_count:
                word2id_count[word] = {'id': last_id, 'count': 0}
                last_id += 1
            
            word2id_count[word]['count'] += 1
                
    return word2id_count

In [8]:
word2id_count = generate_id2word_count()

6669116it [17:30, 6348.94it/s] 


In [9]:
len(word2id_count)

5858972

In [22]:
word2id = dict([(w, ic['id']) for w, ic in word2id_count.items() if ic['count'] > 80])

In [23]:
len(word2id)

216704

In [24]:
id2word = dict([(i, w) for w, i in word2id.items()])

In [25]:
class NewsCorpus():
#     def __init__(self, word2id):
#         self.word2id = word2id
    
    def __iter__(self):
        for article in jsonl_iterator(path_preprocessed):
            yield [self.word2id[word] for word in article['content'] if word in self.word2id]
#             yield [self.word2id[word] for word in article['content'] if word in self.word2id]
    
    def __len__(self):
        return 6669116

In [27]:
lda = LdaMulticore(NewsCorpus(word2id), num_topics=250, workers=6, id2word=id2word)

INFO : using symmetric alpha at 0.004
INFO : using symmetric eta at 0.004
INFO : using serial LDA version on this node
INFO : running online LDA training, 250 topics, 1 passes over the supplied corpus of 6669116 documents, updating every 12000 documents, evaluating every ~120000 documents, iterating 50x with a convergence threshold of 0.001000
INFO : training LDA model using 6 processes


OSError: [Errno 12] Cannot allocate memory