In [1]:
import logging
import multiprocessing

import ujson
from tqdm import tqdm 
from gensim.models.tfidfmodel import TfidfModel
from gensim.corpora import Dictionary

In [2]:
# Route log records to the notebook output in "<LEVEL> : <message>" form.
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
# IPython sometimes installs its own root handler first, which turns basicConfig() into a
# no-op; force the root level back to INFO. Use setLevel() rather than assigning `.level`
# directly so the logging module's own bookkeeping for level changes is run.
logging.getLogger().setLevel(logging.INFO)

In [3]:
# Base directory holding the input data and the artefacts written by this notebook.
path_data = '/ssd/ms3u14/'
# Input read by jsonl_iterator(): one JSON document per line.
path_preprocessed = path_data + 'news_cleaned_2018_02_13_all.preprocessed.jsonl'
# Despite the ".corpus" suffix, this file is where the gensim Dictionary is saved/loaded.
# Derived from path_data (like path_preprocessed) so the base directory is defined once.
path_corpus = path_data + 'news_cleaned_2018_02_13_all.corpus'

In [8]:
def jsonl_iterator(path):
    """Lazily yield one decoded JSON value per non-blank line of a JSON Lines file.

    A ``tqdm`` progress bar (created without a total) is advanced by one after each
    record has been handed to the consumer and the consumer asks for the next one.

    Args:
        path: Path of the JSON Lines file to read.

    Yields:
        The value returned by ``ujson.loads`` for each non-blank line.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the machine's
    # locale default encoding.
    with tqdm() as progress, open(path, 'r', encoding='utf-8') as in_jsonl:
        for line in in_jsonl:
            # A whitespace-only line (e.g. a stray trailing newline) is not valid JSON
            # and would make ujson.loads raise; skip it instead of aborting the pass.
            if not line.strip():
                continue
            yield ujson.loads(line)
            progress.update()
                
def articles_iterator():
    """Yield the ``'content'`` field of every record in the preprocessed JSONL file.

    Records are streamed from ``path_preprocessed`` through ``jsonl_iterator``; nothing
    is read until the first item is requested.
    """
    records = jsonl_iterator(path_preprocessed)
    for record in records:
        content = record['content']
        yield content
                
class NewsCorpus():
    """Re-iterable bag-of-words view over the preprocessed news articles.

    Every iteration streams the articles again through ``articles_iterator()`` and
    converts each one with ``self.dictionary.doc2bow``.
    """

    def __init__(self, dictionary=None):
        """Wrap an existing dictionary, or build one when none is supplied.

        Args:
            dictionary: Object providing ``doc2bow`` (e.g. a gensim ``Dictionary``).
                When omitted or ``None``, a new ``Dictionary`` is built from all
                articles, which requires a full pass over the input file.
        """
        # Test identity against None rather than truthiness: an explicitly supplied
        # dictionary that happens to be empty (and therefore falsy if it defines
        # __len__) must be kept, not silently replaced by a full rebuild.
        if dictionary is None:
            dictionary = Dictionary(articles_iterator())
        self.dictionary = dictionary

    def __iter__(self):
        """Yield the ``doc2bow`` result for each article, in file order."""
        for article in articles_iterator():
            yield self.dictionary.doc2bow(article)

    def __len__(self):
        """Return the fixed, hard-coded document count."""
        # NOTE(review): presumably the number of records in the preprocessed file;
        # it is not derived from the data, so confirm it if the input changes.
        return 6669116

In [None]:
# One-off step: build the dictionary from all articles and persist it to path_corpus.
# ``dictionary=None`` is passed explicitly to request a fresh build; a bare
# ``NewsCorpus()`` only works if the constructor gives the parameter a default.
corpus = NewsCorpus(dictionary=None)
corpus.dictionary.save(path_corpus)

In [9]:
# Reuse the dictionary previously saved at path_corpus instead of rebuilding it.
corpus = NewsCorpus(dictionary=Dictionary.load(path_corpus))

INFO : loading Dictionary object from /ssd/ms3u14/news_cleaned_2018_02_13_all.corpus
INFO : loaded /ssd/ms3u14/news_cleaned_2018_02_13_all.corpus


In [10]:
# Fit TF-IDF by streaming the whole corpus once to collect document frequencies
# (the recorded run below was still going after ~44 minutes).
# NOTE(review): TfidfModel also accepts a prebuilt ``dictionary=``; passing
# corpus.dictionary might avoid this pass — confirm it yields the same statistics first.
model = TfidfModel(corpus=corpus)

INFO : collecting document frequencies
0it [00:00, ?it/s]INFO : PROGRESS: processing document #0
9911it [00:05, 1712.06it/s]INFO : PROGRESS: processing document #10000
19815it [00:10, 2144.76it/s]INFO : PROGRESS: processing document #20000
29839it [00:14, 1822.97it/s]INFO : PROGRESS: processing document #30000
40000it [00:19, 2264.16it/s]INFO : PROGRESS: processing document #40000
49917it [00:23, 1820.70it/s]INFO : PROGRESS: processing document #50000
59845it [00:27, 2828.35it/s]INFO : PROGRESS: processing document #60000
69848it [00:31, 2221.62it/s]INFO : PROGRESS: processing document #70000
79770it [00:36, 2396.72it/s]INFO : PROGRESS: processing document #80000
89799it [00:40, 2365.98it/s]INFO : PROGRESS: processing document #90000
99927it [00:44, 2475.46it/s]INFO : PROGRESS: processing document #100000
109942it [00:49, 1967.38it/s]INFO : PROGRESS: processing document #110000
119863it [00:53, 3186.67it/s]INFO : PROGRESS: processing document #120000
129803it [00:57, 2716.76it/s]INFO :

1109913it [07:24, 4819.12it/s]INFO : PROGRESS: processing document #1110000
1119765it [07:26, 3485.39it/s]INFO : PROGRESS: processing document #1120000
1129745it [07:29, 4580.17it/s]INFO : PROGRESS: processing document #1130000
1139989it [07:33, 1560.01it/s]INFO : PROGRESS: processing document #1140000
1149982it [07:40, 1755.46it/s]INFO : PROGRESS: processing document #1150000
1159993it [07:45, 1992.69it/s]INFO : PROGRESS: processing document #1160000
1169714it [07:50, 2389.72it/s]INFO : PROGRESS: processing document #1170000
1179960it [07:54, 2706.57it/s]INFO : PROGRESS: processing document #1180000
1189948it [07:57, 2336.46it/s]INFO : PROGRESS: processing document #1190000
1199944it [08:00, 3380.62it/s]INFO : PROGRESS: processing document #1200000
1209975it [08:04, 2190.04it/s]INFO : PROGRESS: processing document #1210000
1219861it [08:09, 2562.53it/s]INFO : PROGRESS: processing document #1220000
1229644it [08:12, 4419.06it/s]INFO : PROGRESS: processing document #1230000
1239897it [0

2189933it [15:07, 2593.77it/s]INFO : PROGRESS: processing document #2190000
2199846it [15:12, 2134.75it/s]INFO : PROGRESS: processing document #2200000
2209976it [15:16, 2785.13it/s]INFO : PROGRESS: processing document #2210000
2219795it [15:19, 2419.51it/s]INFO : PROGRESS: processing document #2220000
2229781it [15:23, 3078.09it/s]INFO : PROGRESS: processing document #2230000
2239767it [15:27, 2878.56it/s]INFO : PROGRESS: processing document #2240000
2249718it [15:30, 2830.83it/s]INFO : PROGRESS: processing document #2250000
2259782it [15:34, 2513.86it/s]INFO : PROGRESS: processing document #2260000
2269937it [15:39, 2238.64it/s]INFO : PROGRESS: processing document #2270000
2279910it [15:42, 1968.58it/s]INFO : PROGRESS: processing document #2280000
2289855it [15:46, 2321.97it/s]INFO : PROGRESS: processing document #2290000
2299788it [15:51, 2200.09it/s]INFO : PROGRESS: processing document #2300000
2309882it [15:54, 3550.70it/s]INFO : PROGRESS: processing document #2310000
2319802it [1

3269886it [22:13, 2667.48it/s]INFO : PROGRESS: processing document #3270000
3279912it [22:16, 2465.61it/s]INFO : PROGRESS: processing document #3280000
3289935it [22:20, 2969.37it/s]INFO : PROGRESS: processing document #3290000
3299793it [22:24, 2291.41it/s]INFO : PROGRESS: processing document #3300000
3309874it [22:28, 2483.97it/s]INFO : PROGRESS: processing document #3310000
3319780it [22:31, 2710.24it/s]INFO : PROGRESS: processing document #3320000
3329816it [22:35, 2693.21it/s]INFO : PROGRESS: processing document #3330000
3339908it [22:39, 2493.27it/s]INFO : PROGRESS: processing document #3340000
3349851it [22:43, 2262.10it/s]INFO : PROGRESS: processing document #3350000
3359893it [22:47, 2888.40it/s]INFO : PROGRESS: processing document #3360000
3369848it [22:51, 2773.85it/s]INFO : PROGRESS: processing document #3370000
3379748it [22:55, 2897.12it/s]INFO : PROGRESS: processing document #3380000
3389840it [22:58, 3241.75it/s]INFO : PROGRESS: processing document #3390000
3399805it [2

4349907it [28:40, 1455.29it/s]INFO : PROGRESS: processing document #4350000
4359893it [28:46, 1440.29it/s]INFO : PROGRESS: processing document #4360000
4369849it [28:52, 1762.82it/s]INFO : PROGRESS: processing document #4370000
4379794it [28:56, 2664.98it/s]INFO : PROGRESS: processing document #4380000
4389766it [29:00, 2544.89it/s]INFO : PROGRESS: processing document #4390000
4399964it [29:04, 3422.36it/s]INFO : PROGRESS: processing document #4400000
4409861it [29:07, 2766.93it/s]INFO : PROGRESS: processing document #4410000
4419699it [29:10, 3216.36it/s]INFO : PROGRESS: processing document #4420000
4429961it [29:14, 3476.08it/s]INFO : PROGRESS: processing document #4430000
4439773it [29:17, 3016.23it/s]INFO : PROGRESS: processing document #4440000
4449924it [29:20, 2558.08it/s]INFO : PROGRESS: processing document #4450000
4459877it [29:24, 2932.29it/s]INFO : PROGRESS: processing document #4460000
4469834it [29:28, 2253.64it/s]INFO : PROGRESS: processing document #4470000
4479970it [2

5429842it [35:26, 2667.79it/s]INFO : PROGRESS: processing document #5430000
5439934it [35:29, 2694.76it/s]INFO : PROGRESS: processing document #5440000
5449766it [35:33, 2540.24it/s]INFO : PROGRESS: processing document #5450000
5459801it [35:37, 2683.13it/s]INFO : PROGRESS: processing document #5460000
5469957it [35:41, 2667.26it/s]INFO : PROGRESS: processing document #5470000
5479898it [35:45, 2520.71it/s]INFO : PROGRESS: processing document #5480000
5489731it [35:49, 2756.25it/s]INFO : PROGRESS: processing document #5490000
5499849it [35:52, 2537.03it/s]INFO : PROGRESS: processing document #5500000
5509802it [35:56, 2720.33it/s]INFO : PROGRESS: processing document #5510000
5519724it [36:00, 2846.54it/s]INFO : PROGRESS: processing document #5520000
5529937it [36:04, 2569.12it/s]INFO : PROGRESS: processing document #5530000
5539941it [36:08, 2784.39it/s]INFO : PROGRESS: processing document #5540000
5549751it [36:11, 2818.38it/s]INFO : PROGRESS: processing document #5550000
5559795it [3

6509841it [42:57, 1760.42it/s]INFO : PROGRESS: processing document #6510000
6519862it [43:02, 2171.25it/s]INFO : PROGRESS: processing document #6520000
6529986it [43:07, 2082.93it/s]INFO : PROGRESS: processing document #6530000
6539968it [43:12, 2071.49it/s]INFO : PROGRESS: processing document #6540000
6549912it [43:16, 1924.95it/s]INFO : PROGRESS: processing document #6550000
6559889it [43:21, 2140.87it/s]INFO : PROGRESS: processing document #6560000
6569892it [43:26, 1994.14it/s]INFO : PROGRESS: processing document #6570000
6579956it [43:32, 1838.19it/s]INFO : PROGRESS: processing document #6580000
6589834it [43:37, 1645.32it/s]INFO : PROGRESS: processing document #6590000
6599890it [43:43, 1573.86it/s]INFO : PROGRESS: processing document #6600000
6609851it [43:48, 1564.85it/s]INFO : PROGRESS: processing document #6610000
6619954it [43:54, 1700.45it/s]INFO : PROGRESS: processing document #6620000
6629893it [44:00, 1664.86it/s]INFO : PROGRESS: processing document #6630000
6639965it [4