In [1]:
import gensim
import logging

# Logging code taken from http://rare-technologies.com/word2vec-tutorial/
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Loading & Pre-processing Data

In [2]:
from nltk.tokenize import RegexpTokenizer # tokenizing
from nltk.corpus import stopwords  # list of stop words

In [24]:
## https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/deepir.ipynb
import re

contractions = re.compile(r"'|-|\"")
# all non alphanumeric
symbols = re.compile(r'(\W+)', re.U)
# single character removal
singles = re.compile(r'(\s\S\s)', re.I|re.U)
# separators (any whitespace)
seps = re.compile(r'\s+')
# tokenizer
tokenizer = RegexpTokenizer(r'\w+') # tokens separated by white spice
# stop words
stops = set(stopwords.words('english')) # list of english stop words

# cleaner (order matters)
def clean(text, rmv_stop_words=True, return_tokens=False): 
    text = text.lower()
    text = contractions.sub('', text)
    text = symbols.sub(r' \1 ', text)
    text = singles.sub(' ', text)
    text = seps.sub(' ', text)
    tokens = tokenizer.tokenize(text)     # tokenize
    if rmv_stop_words:
        tokens = [i for i in tokens if not i in stops] # remove stop words
        text = ' '.join(tokens)
    if return_tokens:
        return tokens
    return text

# sentence splitter
#alteos = re.compile(r'([!\?])')
#def sentences(l):
#    l = alteos.sub(r' \1 .', l).rstrip("(\.)*\n")
#    return l.split(".")

In [3]:
tokenizer = RegexpTokenizer(r'\w+') # tokens separated by white space -- very naive, fist pass 
stops = set(stopwords.words('english')) # list of english stop words
lemma = WordNetLemmatizer()

def clean(title, rmv_stop_words=False):
    tokens = tokenizer.tokenize(title.lower())  # TODO: smarter tokenizer to allow for code
    # spell check
    # phrase detection
    # entity detection
    if rmv_stop_words:
        tokens = [i for i in tokens if not i in stops] # remove stop words
    normalized = [lemma.lemmatize(token) for token in tokens] # lemma
    return normalized

In [28]:
path = "/Users/stevenfelix/Documents/DataScience_local/Insight/"
file = 'posts_titles_50M.txt'
corpus = []

# compile and pre-process corpus serially
with open(path+file, 'r') as f:
    for line in f:
        corpus.append(clean(line, rmv_stop_words=True, return_tokens=True))

In [29]:
print(len(corpus))
print(corpus[200])

17565207
['csv', 'file', 'imports', 'net']


In [30]:
#dictionary = gensim.corpora.Dictionary(corpus)
#print(len(dictionary))
#dictionary.filter_extremes(no_below=2)#no_above = .5) # can play with this
#len(dictionary)

# Model Training

In [None]:
model_nostop = gensim.models.word2vec.Word2Vec(corpus_nostop, sg=1, size=200, window=5, min_count=2)

In [8]:
num_doc='50M'
sg=1
size=250
window=5
min_count=3
model_full = gensim.models.word2vec.Word2Vec(corpus, sg=sg, size=size, window=window, min_count=min_count)

2018-01-18 10:12:34,471 : INFO : collecting all words and their counts
2018-01-18 10:12:34,472 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-01-18 10:12:34,517 : INFO : PROGRESS: at sentence #10000, processed 87251 words, keeping 7064 word types
2018-01-18 10:12:34,564 : INFO : PROGRESS: at sentence #20000, processed 178244 words, keeping 10377 word types
2018-01-18 10:12:34,611 : INFO : PROGRESS: at sentence #30000, processed 268294 words, keeping 12855 word types
2018-01-18 10:12:34,663 : INFO : PROGRESS: at sentence #40000, processed 357470 words, keeping 14950 word types
2018-01-18 10:12:34,730 : INFO : PROGRESS: at sentence #50000, processed 444951 words, keeping 16862 word types
2018-01-18 10:12:34,779 : INFO : PROGRESS: at sentence #60000, processed 531723 words, keeping 18607 word types
2018-01-18 10:12:34,824 : INFO : PROGRESS: at sentence #70000, processed 619805 words, keeping 20183 word types
2018-01-18 10:12:34,869 : INFO : PROGRESS: at se

2018-01-18 10:12:38,211 : INFO : PROGRESS: at sentence #710000, processed 6080504 words, keeping 78585 word types
2018-01-18 10:12:38,272 : INFO : PROGRESS: at sentence #720000, processed 6164893 words, keeping 79263 word types
2018-01-18 10:12:38,333 : INFO : PROGRESS: at sentence #730000, processed 6249699 words, keeping 79909 word types
2018-01-18 10:12:38,414 : INFO : PROGRESS: at sentence #740000, processed 6334495 words, keeping 80596 word types
2018-01-18 10:12:38,478 : INFO : PROGRESS: at sentence #750000, processed 6419771 words, keeping 81260 word types
2018-01-18 10:12:38,535 : INFO : PROGRESS: at sentence #760000, processed 6504866 words, keeping 81904 word types
2018-01-18 10:12:38,611 : INFO : PROGRESS: at sentence #770000, processed 6588967 words, keeping 82578 word types
2018-01-18 10:12:38,663 : INFO : PROGRESS: at sentence #780000, processed 6673573 words, keeping 83240 word types
2018-01-18 10:12:38,710 : INFO : PROGRESS: at sentence #790000, processed 6758292 words,

2018-01-18 10:12:43,277 : INFO : PROGRESS: at sentence #1420000, processed 12068791 words, keeping 121793 word types
2018-01-18 10:12:43,324 : INFO : PROGRESS: at sentence #1430000, processed 12151939 words, keeping 122317 word types
2018-01-18 10:12:43,373 : INFO : PROGRESS: at sentence #1440000, processed 12235349 words, keeping 122851 word types
2018-01-18 10:12:43,422 : INFO : PROGRESS: at sentence #1450000, processed 12318336 words, keeping 123400 word types
2018-01-18 10:12:43,474 : INFO : PROGRESS: at sentence #1460000, processed 12402604 words, keeping 123917 word types
2018-01-18 10:12:43,525 : INFO : PROGRESS: at sentence #1470000, processed 12484821 words, keeping 124491 word types
2018-01-18 10:12:43,585 : INFO : PROGRESS: at sentence #1480000, processed 12568217 words, keeping 125043 word types
2018-01-18 10:12:43,638 : INFO : PROGRESS: at sentence #1490000, processed 12651007 words, keeping 125578 word types
2018-01-18 10:12:43,680 : INFO : PROGRESS: at sentence #1500000,

2018-01-18 10:12:47,082 : INFO : PROGRESS: at sentence #2130000, processed 18085536 words, keeping 158685 word types
2018-01-18 10:12:47,141 : INFO : PROGRESS: at sentence #2140000, processed 18169842 words, keeping 159161 word types
2018-01-18 10:12:47,217 : INFO : PROGRESS: at sentence #2150000, processed 18255896 words, keeping 159695 word types
2018-01-18 10:12:47,271 : INFO : PROGRESS: at sentence #2160000, processed 18341502 words, keeping 160198 word types
2018-01-18 10:12:47,334 : INFO : PROGRESS: at sentence #2170000, processed 18427481 words, keeping 160706 word types
2018-01-18 10:12:47,379 : INFO : PROGRESS: at sentence #2180000, processed 18512852 words, keeping 161238 word types
2018-01-18 10:12:47,424 : INFO : PROGRESS: at sentence #2190000, processed 18598625 words, keeping 161697 word types
2018-01-18 10:12:47,476 : INFO : PROGRESS: at sentence #2200000, processed 18683888 words, keeping 162198 word types
2018-01-18 10:12:47,536 : INFO : PROGRESS: at sentence #2210000,

2018-01-18 10:12:50,764 : INFO : PROGRESS: at sentence #2840000, processed 24158500 words, keeping 192712 word types
2018-01-18 10:12:50,815 : INFO : PROGRESS: at sentence #2850000, processed 24243117 words, keeping 193228 word types
2018-01-18 10:12:50,872 : INFO : PROGRESS: at sentence #2860000, processed 24328944 words, keeping 193725 word types
2018-01-18 10:12:50,920 : INFO : PROGRESS: at sentence #2870000, processed 24414681 words, keeping 194189 word types
2018-01-18 10:12:50,972 : INFO : PROGRESS: at sentence #2880000, processed 24499709 words, keeping 194683 word types
2018-01-18 10:12:51,018 : INFO : PROGRESS: at sentence #2890000, processed 24584538 words, keeping 195148 word types
2018-01-18 10:12:51,067 : INFO : PROGRESS: at sentence #2900000, processed 24670374 words, keeping 195561 word types
2018-01-18 10:12:51,116 : INFO : PROGRESS: at sentence #2910000, processed 24754918 words, keeping 195995 word types
2018-01-18 10:12:51,163 : INFO : PROGRESS: at sentence #2920000,

2018-01-18 10:12:54,908 : INFO : PROGRESS: at sentence #3550000, processed 30224236 words, keeping 213830 word types
2018-01-18 10:12:54,959 : INFO : PROGRESS: at sentence #3560000, processed 30307811 words, keeping 213830 word types
2018-01-18 10:12:55,009 : INFO : PROGRESS: at sentence #3570000, processed 30392288 words, keeping 213830 word types
2018-01-18 10:12:55,075 : INFO : PROGRESS: at sentence #3580000, processed 30476806 words, keeping 213830 word types
2018-01-18 10:12:55,131 : INFO : PROGRESS: at sentence #3590000, processed 30561519 words, keeping 213830 word types
2018-01-18 10:12:55,177 : INFO : PROGRESS: at sentence #3600000, processed 30646247 words, keeping 213830 word types
2018-01-18 10:12:55,231 : INFO : PROGRESS: at sentence #3610000, processed 30730479 words, keeping 213830 word types
2018-01-18 10:12:55,285 : INFO : PROGRESS: at sentence #3620000, processed 30814630 words, keeping 213830 word types
2018-01-18 10:12:55,343 : INFO : PROGRESS: at sentence #3630000,

2018-01-18 10:12:59,417 : INFO : PROGRESS: at sentence #4260000, processed 36263046 words, keeping 213830 word types
2018-01-18 10:12:59,469 : INFO : PROGRESS: at sentence #4270000, processed 36348174 words, keeping 213830 word types
2018-01-18 10:12:59,520 : INFO : PROGRESS: at sentence #4280000, processed 36432178 words, keeping 213830 word types
2018-01-18 10:12:59,573 : INFO : PROGRESS: at sentence #4290000, processed 36517345 words, keeping 213830 word types
2018-01-18 10:12:59,623 : INFO : PROGRESS: at sentence #4300000, processed 36601491 words, keeping 213830 word types
2018-01-18 10:12:59,682 : INFO : PROGRESS: at sentence #4310000, processed 36685749 words, keeping 213830 word types
2018-01-18 10:12:59,738 : INFO : PROGRESS: at sentence #4320000, processed 36770312 words, keeping 213830 word types
2018-01-18 10:12:59,793 : INFO : PROGRESS: at sentence #4330000, processed 36854142 words, keeping 213830 word types
2018-01-18 10:12:59,848 : INFO : PROGRESS: at sentence #4340000,

2018-01-18 10:13:04,006 : INFO : PROGRESS: at sentence #4970000, processed 42224598 words, keeping 213830 word types
2018-01-18 10:13:04,075 : INFO : PROGRESS: at sentence #4980000, processed 42308406 words, keeping 213830 word types
2018-01-18 10:13:04,145 : INFO : PROGRESS: at sentence #4990000, processed 42392760 words, keeping 213830 word types
2018-01-18 10:13:04,223 : INFO : PROGRESS: at sentence #5000000, processed 42477875 words, keeping 213830 word types
2018-01-18 10:13:04,300 : INFO : PROGRESS: at sentence #5010000, processed 42562960 words, keeping 213830 word types
2018-01-18 10:13:04,375 : INFO : PROGRESS: at sentence #5020000, processed 42647763 words, keeping 213830 word types
2018-01-18 10:13:04,435 : INFO : PROGRESS: at sentence #5030000, processed 42731705 words, keeping 213830 word types
2018-01-18 10:13:04,516 : INFO : PROGRESS: at sentence #5040000, processed 42816214 words, keeping 213830 word types
2018-01-18 10:13:04,597 : INFO : PROGRESS: at sentence #5050000,

2018-01-18 10:13:08,475 : INFO : PROGRESS: at sentence #5680000, processed 48288767 words, keeping 213830 word types
2018-01-18 10:13:08,533 : INFO : PROGRESS: at sentence #5690000, processed 48374585 words, keeping 213830 word types
2018-01-18 10:13:08,597 : INFO : PROGRESS: at sentence #5700000, processed 48460548 words, keeping 213830 word types
2018-01-18 10:13:08,657 : INFO : PROGRESS: at sentence #5710000, processed 48546797 words, keeping 213830 word types
2018-01-18 10:13:08,715 : INFO : PROGRESS: at sentence #5720000, processed 48632019 words, keeping 213830 word types
2018-01-18 10:13:08,769 : INFO : PROGRESS: at sentence #5730000, processed 48717477 words, keeping 213830 word types
2018-01-18 10:13:08,823 : INFO : PROGRESS: at sentence #5740000, processed 48803140 words, keeping 213830 word types
2018-01-18 10:13:08,875 : INFO : PROGRESS: at sentence #5750000, processed 48887203 words, keeping 213830 word types
2018-01-18 10:13:08,926 : INFO : PROGRESS: at sentence #5760000,

2018-01-18 10:13:12,471 : INFO : PROGRESS: at sentence #6390000, processed 54341337 words, keeping 213830 word types
2018-01-18 10:13:12,531 : INFO : PROGRESS: at sentence #6400000, processed 54425178 words, keeping 213830 word types
2018-01-18 10:13:12,590 : INFO : PROGRESS: at sentence #6410000, processed 54509664 words, keeping 213830 word types
2018-01-18 10:13:12,648 : INFO : PROGRESS: at sentence #6420000, processed 54594298 words, keeping 213830 word types
2018-01-18 10:13:12,711 : INFO : PROGRESS: at sentence #6430000, processed 54678323 words, keeping 213830 word types
2018-01-18 10:13:12,771 : INFO : PROGRESS: at sentence #6440000, processed 54763432 words, keeping 213830 word types
2018-01-18 10:13:12,833 : INFO : PROGRESS: at sentence #6450000, processed 54848398 words, keeping 213830 word types
2018-01-18 10:13:12,889 : INFO : PROGRESS: at sentence #6460000, processed 54932994 words, keeping 213830 word types
2018-01-18 10:13:12,952 : INFO : PROGRESS: at sentence #6470000,

2018-01-18 10:13:16,642 : INFO : PROGRESS: at sentence #7100000, processed 60360943 words, keeping 236221 word types
2018-01-18 10:13:16,703 : INFO : PROGRESS: at sentence #7110000, processed 60446982 words, keeping 236710 word types
2018-01-18 10:13:16,758 : INFO : PROGRESS: at sentence #7120000, processed 60531800 words, keeping 237152 word types
2018-01-18 10:13:16,811 : INFO : PROGRESS: at sentence #7130000, processed 60616045 words, keeping 237577 word types
2018-01-18 10:13:16,874 : INFO : PROGRESS: at sentence #7140000, processed 60700686 words, keeping 238027 word types
2018-01-18 10:13:16,938 : INFO : PROGRESS: at sentence #7150000, processed 60787138 words, keeping 238489 word types
2018-01-18 10:13:16,986 : INFO : PROGRESS: at sentence #7160000, processed 60871234 words, keeping 238849 word types
2018-01-18 10:13:17,053 : INFO : PROGRESS: at sentence #7170000, processed 60956077 words, keeping 239318 word types
2018-01-18 10:13:17,118 : INFO : PROGRESS: at sentence #7180000,

2018-01-18 10:13:20,781 : INFO : PROGRESS: at sentence #7810000, processed 66393369 words, keeping 266604 word types
2018-01-18 10:13:20,854 : INFO : PROGRESS: at sentence #7820000, processed 66478370 words, keeping 267027 word types
2018-01-18 10:13:20,915 : INFO : PROGRESS: at sentence #7830000, processed 66562919 words, keeping 267462 word types
2018-01-18 10:13:20,974 : INFO : PROGRESS: at sentence #7840000, processed 66646796 words, keeping 267853 word types
2018-01-18 10:13:21,039 : INFO : PROGRESS: at sentence #7850000, processed 66732168 words, keeping 268288 word types
2018-01-18 10:13:21,096 : INFO : PROGRESS: at sentence #7860000, processed 66819316 words, keeping 268685 word types
2018-01-18 10:13:21,157 : INFO : PROGRESS: at sentence #7870000, processed 66906399 words, keeping 269146 word types
2018-01-18 10:13:21,220 : INFO : PROGRESS: at sentence #7880000, processed 66992310 words, keeping 269597 word types
2018-01-18 10:13:21,280 : INFO : PROGRESS: at sentence #7890000,

2018-01-18 10:13:24,795 : INFO : PROGRESS: at sentence #8520000, processed 72485408 words, keeping 295728 word types
2018-01-18 10:13:24,845 : INFO : PROGRESS: at sentence #8530000, processed 72572066 words, keeping 296134 word types
2018-01-18 10:13:24,894 : INFO : PROGRESS: at sentence #8540000, processed 72657369 words, keeping 296559 word types
2018-01-18 10:13:24,948 : INFO : PROGRESS: at sentence #8550000, processed 72742327 words, keeping 296948 word types
2018-01-18 10:13:25,004 : INFO : PROGRESS: at sentence #8560000, processed 72828384 words, keeping 297407 word types
2018-01-18 10:13:25,058 : INFO : PROGRESS: at sentence #8570000, processed 72914920 words, keeping 297796 word types
2018-01-18 10:13:25,118 : INFO : PROGRESS: at sentence #8580000, processed 73000332 words, keeping 298202 word types
2018-01-18 10:13:25,176 : INFO : PROGRESS: at sentence #8590000, processed 73086749 words, keeping 298657 word types
2018-01-18 10:13:25,228 : INFO : PROGRESS: at sentence #8600000,

2018-01-18 10:13:28,724 : INFO : PROGRESS: at sentence #9230000, processed 78593188 words, keeping 324760 word types
2018-01-18 10:13:28,798 : INFO : PROGRESS: at sentence #9240000, processed 78679014 words, keeping 325139 word types
2018-01-18 10:13:28,865 : INFO : PROGRESS: at sentence #9250000, processed 78764841 words, keeping 325558 word types
2018-01-18 10:13:28,937 : INFO : PROGRESS: at sentence #9260000, processed 78851348 words, keeping 325905 word types
2018-01-18 10:13:28,995 : INFO : PROGRESS: at sentence #9270000, processed 78938524 words, keeping 326315 word types
2018-01-18 10:13:29,072 : INFO : PROGRESS: at sentence #9280000, processed 79024383 words, keeping 326711 word types
2018-01-18 10:13:29,126 : INFO : PROGRESS: at sentence #9290000, processed 79110382 words, keeping 327107 word types
2018-01-18 10:13:29,195 : INFO : PROGRESS: at sentence #9300000, processed 79196535 words, keeping 327482 word types
2018-01-18 10:13:29,250 : INFO : PROGRESS: at sentence #9310000,

2018-01-18 10:13:32,864 : INFO : PROGRESS: at sentence #9940000, processed 84712758 words, keeping 352054 word types
2018-01-18 10:13:32,927 : INFO : PROGRESS: at sentence #9950000, processed 84799873 words, keeping 352438 word types
2018-01-18 10:13:32,984 : INFO : PROGRESS: at sentence #9960000, processed 84884837 words, keeping 352793 word types
2018-01-18 10:13:33,046 : INFO : PROGRESS: at sentence #9970000, processed 84969911 words, keeping 353213 word types
2018-01-18 10:13:33,099 : INFO : PROGRESS: at sentence #9980000, processed 85055741 words, keeping 353587 word types
2018-01-18 10:13:33,149 : INFO : PROGRESS: at sentence #9990000, processed 85142305 words, keeping 353974 word types
2018-01-18 10:13:33,206 : INFO : PROGRESS: at sentence #10000000, processed 85228942 words, keeping 354354 word types
2018-01-18 10:13:33,259 : INFO : PROGRESS: at sentence #10010000, processed 85313648 words, keeping 354726 word types
2018-01-18 10:13:33,322 : INFO : PROGRESS: at sentence #100200

2018-01-18 10:13:36,871 : INFO : PROGRESS: at sentence #10640000, processed 90760271 words, keeping 371099 word types
2018-01-18 10:13:36,933 : INFO : PROGRESS: at sentence #10650000, processed 90845285 words, keeping 371099 word types
2018-01-18 10:13:36,987 : INFO : PROGRESS: at sentence #10660000, processed 90930504 words, keeping 371099 word types
2018-01-18 10:13:37,044 : INFO : PROGRESS: at sentence #10670000, processed 91015328 words, keeping 371099 word types
2018-01-18 10:13:37,097 : INFO : PROGRESS: at sentence #10680000, processed 91100569 words, keeping 371099 word types
2018-01-18 10:13:37,156 : INFO : PROGRESS: at sentence #10690000, processed 91183820 words, keeping 371099 word types
2018-01-18 10:13:37,289 : INFO : PROGRESS: at sentence #10700000, processed 91268326 words, keeping 371099 word types
2018-01-18 10:13:37,385 : INFO : PROGRESS: at sentence #10710000, processed 91352766 words, keeping 371099 word types
2018-01-18 10:13:37,452 : INFO : PROGRESS: at sentence #

2018-01-18 10:13:41,500 : INFO : PROGRESS: at sentence #11340000, processed 96713260 words, keeping 371099 word types
2018-01-18 10:13:41,564 : INFO : PROGRESS: at sentence #11350000, processed 96798025 words, keeping 371099 word types
2018-01-18 10:13:41,737 : INFO : PROGRESS: at sentence #11360000, processed 96882943 words, keeping 371099 word types
2018-01-18 10:13:41,805 : INFO : PROGRESS: at sentence #11370000, processed 96968088 words, keeping 371099 word types
2018-01-18 10:13:41,860 : INFO : PROGRESS: at sentence #11380000, processed 97052860 words, keeping 371099 word types
2018-01-18 10:13:41,915 : INFO : PROGRESS: at sentence #11390000, processed 97138907 words, keeping 371099 word types
2018-01-18 10:13:41,974 : INFO : PROGRESS: at sentence #11400000, processed 97223794 words, keeping 371099 word types
2018-01-18 10:13:42,025 : INFO : PROGRESS: at sentence #11410000, processed 97308227 words, keeping 371099 word types
2018-01-18 10:13:42,084 : INFO : PROGRESS: at sentence #

2018-01-18 10:13:46,819 : INFO : PROGRESS: at sentence #12040000, processed 102594249 words, keeping 371099 word types
2018-01-18 10:13:46,936 : INFO : PROGRESS: at sentence #12050000, processed 102678092 words, keeping 371099 word types
2018-01-18 10:13:47,048 : INFO : PROGRESS: at sentence #12060000, processed 102762708 words, keeping 371099 word types
2018-01-18 10:13:47,114 : INFO : PROGRESS: at sentence #12070000, processed 102846785 words, keeping 371099 word types
2018-01-18 10:13:47,182 : INFO : PROGRESS: at sentence #12080000, processed 102931472 words, keeping 371099 word types
2018-01-18 10:13:47,347 : INFO : PROGRESS: at sentence #12090000, processed 103015647 words, keeping 371099 word types
2018-01-18 10:13:47,445 : INFO : PROGRESS: at sentence #12100000, processed 103100601 words, keeping 371099 word types
2018-01-18 10:13:47,520 : INFO : PROGRESS: at sentence #12110000, processed 103184742 words, keeping 371099 word types
2018-01-18 10:13:47,609 : INFO : PROGRESS: at se

2018-01-18 10:13:52,401 : INFO : PROGRESS: at sentence #12730000, processed 108475543 words, keeping 371099 word types
2018-01-18 10:13:52,477 : INFO : PROGRESS: at sentence #12740000, processed 108561307 words, keeping 371099 word types
2018-01-18 10:13:52,601 : INFO : PROGRESS: at sentence #12750000, processed 108646520 words, keeping 371099 word types
2018-01-18 10:13:52,675 : INFO : PROGRESS: at sentence #12760000, processed 108733560 words, keeping 371099 word types
2018-01-18 10:13:52,755 : INFO : PROGRESS: at sentence #12770000, processed 108818406 words, keeping 371099 word types
2018-01-18 10:13:52,831 : INFO : PROGRESS: at sentence #12780000, processed 108904930 words, keeping 371099 word types
2018-01-18 10:13:52,902 : INFO : PROGRESS: at sentence #12790000, processed 108991479 words, keeping 371099 word types
2018-01-18 10:13:52,976 : INFO : PROGRESS: at sentence #12800000, processed 109077323 words, keeping 371099 word types
2018-01-18 10:13:53,051 : INFO : PROGRESS: at se

2018-01-18 10:13:57,138 : INFO : PROGRESS: at sentence #13420000, processed 114366744 words, keeping 371099 word types
2018-01-18 10:13:57,197 : INFO : PROGRESS: at sentence #13430000, processed 114451761 words, keeping 371099 word types
2018-01-18 10:13:57,262 : INFO : PROGRESS: at sentence #13440000, processed 114536539 words, keeping 371099 word types
2018-01-18 10:13:57,354 : INFO : PROGRESS: at sentence #13450000, processed 114621977 words, keeping 371099 word types
2018-01-18 10:13:57,423 : INFO : PROGRESS: at sentence #13460000, processed 114706458 words, keeping 371099 word types
2018-01-18 10:13:57,505 : INFO : PROGRESS: at sentence #13470000, processed 114791040 words, keeping 371099 word types
2018-01-18 10:13:57,598 : INFO : PROGRESS: at sentence #13480000, processed 114876340 words, keeping 371099 word types
2018-01-18 10:13:57,677 : INFO : PROGRESS: at sentence #13490000, processed 114961869 words, keeping 371099 word types
2018-01-18 10:13:57,756 : INFO : PROGRESS: at se

2018-01-18 10:14:02,796 : INFO : PROGRESS: at sentence #14110000, processed 120222529 words, keeping 371099 word types
2018-01-18 10:14:02,871 : INFO : PROGRESS: at sentence #14120000, processed 120306361 words, keeping 371099 word types
2018-01-18 10:14:02,949 : INFO : PROGRESS: at sentence #14130000, processed 120391479 words, keeping 371099 word types
2018-01-18 10:14:03,047 : INFO : PROGRESS: at sentence #14140000, processed 120476501 words, keeping 371099 word types
2018-01-18 10:14:03,163 : INFO : PROGRESS: at sentence #14150000, processed 120560896 words, keeping 371099 word types
2018-01-18 10:14:03,293 : INFO : PROGRESS: at sentence #14160000, processed 120645018 words, keeping 371099 word types
2018-01-18 10:14:03,414 : INFO : PROGRESS: at sentence #14170000, processed 120729626 words, keeping 371099 word types
2018-01-18 10:14:03,510 : INFO : PROGRESS: at sentence #14180000, processed 120814949 words, keeping 371099 word types
2018-01-18 10:14:03,609 : INFO : PROGRESS: at se

2018-01-18 10:14:08,323 : INFO : PROGRESS: at sentence #14800000, processed 126080861 words, keeping 371099 word types
2018-01-18 10:14:08,396 : INFO : PROGRESS: at sentence #14810000, processed 126165528 words, keeping 371099 word types
2018-01-18 10:14:08,464 : INFO : PROGRESS: at sentence #14820000, processed 126250794 words, keeping 371099 word types
2018-01-18 10:14:08,528 : INFO : PROGRESS: at sentence #14830000, processed 126336330 words, keeping 371099 word types
2018-01-18 10:14:08,606 : INFO : PROGRESS: at sentence #14840000, processed 126421328 words, keeping 371099 word types
2018-01-18 10:14:08,667 : INFO : PROGRESS: at sentence #14850000, processed 126505585 words, keeping 371099 word types
2018-01-18 10:14:08,736 : INFO : PROGRESS: at sentence #14860000, processed 126590870 words, keeping 371099 word types
2018-01-18 10:14:08,796 : INFO : PROGRESS: at sentence #14870000, processed 126675640 words, keeping 371099 word types
2018-01-18 10:14:08,858 : INFO : PROGRESS: at se

2018-01-18 10:14:14,241 : INFO : PROGRESS: at sentence #15490000, processed 131986624 words, keeping 371099 word types
2018-01-18 10:14:14,311 : INFO : PROGRESS: at sentence #15500000, processed 132072845 words, keeping 371099 word types
2018-01-18 10:14:14,374 : INFO : PROGRESS: at sentence #15510000, processed 132157895 words, keeping 371099 word types
2018-01-18 10:14:14,476 : INFO : PROGRESS: at sentence #15520000, processed 132243066 words, keeping 371099 word types
2018-01-18 10:14:14,575 : INFO : PROGRESS: at sentence #15530000, processed 132329350 words, keeping 371099 word types
2018-01-18 10:14:14,666 : INFO : PROGRESS: at sentence #15540000, processed 132415365 words, keeping 371099 word types
2018-01-18 10:14:14,965 : INFO : PROGRESS: at sentence #15550000, processed 132500795 words, keeping 371099 word types
2018-01-18 10:14:15,050 : INFO : PROGRESS: at sentence #15560000, processed 132586278 words, keeping 371099 word types
2018-01-18 10:14:15,119 : INFO : PROGRESS: at se

2018-01-18 10:14:20,869 : INFO : PROGRESS: at sentence #16180000, processed 137920115 words, keeping 371099 word types
2018-01-18 10:14:20,959 : INFO : PROGRESS: at sentence #16190000, processed 138006408 words, keeping 371099 word types
2018-01-18 10:14:21,033 : INFO : PROGRESS: at sentence #16200000, processed 138092273 words, keeping 371099 word types
2018-01-18 10:14:21,112 : INFO : PROGRESS: at sentence #16210000, processed 138178103 words, keeping 371099 word types
2018-01-18 10:14:21,202 : INFO : PROGRESS: at sentence #16220000, processed 138264726 words, keeping 371099 word types
2018-01-18 10:14:21,278 : INFO : PROGRESS: at sentence #16230000, processed 138351422 words, keeping 371099 word types
2018-01-18 10:14:21,368 : INFO : PROGRESS: at sentence #16240000, processed 138437802 words, keeping 371099 word types
2018-01-18 10:14:21,560 : INFO : PROGRESS: at sentence #16250000, processed 138522636 words, keeping 371099 word types
2018-01-18 10:14:21,660 : INFO : PROGRESS: at se

2018-01-18 10:14:27,020 : INFO : PROGRESS: at sentence #16870000, processed 143865887 words, keeping 371099 word types
2018-01-18 10:14:27,088 : INFO : PROGRESS: at sentence #16880000, processed 143952815 words, keeping 371099 word types
2018-01-18 10:14:27,158 : INFO : PROGRESS: at sentence #16890000, processed 144038733 words, keeping 371099 word types
2018-01-18 10:14:27,225 : INFO : PROGRESS: at sentence #16900000, processed 144124181 words, keeping 371099 word types
2018-01-18 10:14:27,302 : INFO : PROGRESS: at sentence #16910000, processed 144210325 words, keeping 371099 word types
2018-01-18 10:14:27,364 : INFO : PROGRESS: at sentence #16920000, processed 144297825 words, keeping 371099 word types
2018-01-18 10:14:27,431 : INFO : PROGRESS: at sentence #16930000, processed 144384643 words, keeping 371099 word types
2018-01-18 10:14:27,497 : INFO : PROGRESS: at sentence #16940000, processed 144470736 words, keeping 371099 word types
2018-01-18 10:14:27,566 : INFO : PROGRESS: at se

2018-01-18 10:14:34,018 : INFO : PROGRESS: at sentence #17560000, processed 149810434 words, keeping 371099 word types
2018-01-18 10:14:34,058 : INFO : collected 371099 word types from a corpus of 149855462 raw words and 17565207 sentences
2018-01-18 10:14:34,064 : INFO : Loading a fresh vocabulary
2018-01-18 10:14:37,109 : INFO : min_count=3 retains 244261 unique words (65% of original 371099, drops 126838)
2018-01-18 10:14:37,111 : INFO : min_count=3 leaves 149601786 word corpus (99% of original 149855462, drops 253676)
2018-01-18 10:14:38,020 : INFO : deleting the raw counts dictionary of 371099 items
2018-01-18 10:14:38,071 : INFO : sample=0.001 downsamples 45 most-common words
2018-01-18 10:14:38,072 : INFO : downsampling leaves estimated 120858757 word corpus (80.8% of prior 149601786)
2018-01-18 10:14:38,075 : INFO : estimated required memory for 244261 words and 250 dimensions: 610652500 bytes
2018-01-18 10:14:39,081 : INFO : resetting layer weights
2018-01-18 10:14:43,649 : IN

2018-01-18 10:15:56,793 : INFO : PROGRESS: at 2.99% examples, 246331 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:15:57,844 : INFO : PROGRESS: at 3.03% examples, 246441 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:15:58,876 : INFO : PROGRESS: at 3.07% examples, 246281 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:15:59,890 : INFO : PROGRESS: at 3.11% examples, 246292 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:16:00,927 : INFO : PROGRESS: at 3.16% examples, 246643 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:16:01,990 : INFO : PROGRESS: at 3.20% examples, 246697 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:16:03,017 : INFO : PROGRESS: at 3.25% examples, 246868 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:16:04,029 : INFO : PROGRESS: at 3.29% examples, 247076 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:16:05,048 : INFO : PROGRESS: at 3.33% examples, 246566 words/s, in_qsize 4, out_qsize 1
2018-01-18 10:16:06,074 : INFO : PROGRESS: at 3.37% examples, 246833 words/s, in_q

2018-01-18 10:17:19,727 : INFO : PROGRESS: at 6.38% examples, 245984 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:17:20,762 : INFO : PROGRESS: at 6.42% examples, 246210 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:17:21,798 : INFO : PROGRESS: at 6.47% examples, 246433 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:17:22,834 : INFO : PROGRESS: at 6.52% examples, 246658 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:17:23,847 : INFO : PROGRESS: at 6.57% examples, 246861 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:17:24,860 : INFO : PROGRESS: at 6.61% examples, 247064 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:17:25,903 : INFO : PROGRESS: at 6.66% examples, 247267 words/s, in_qsize 4, out_qsize 1
2018-01-18 10:17:26,913 : INFO : PROGRESS: at 6.71% examples, 247468 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:17:27,963 : INFO : PROGRESS: at 6.76% examples, 247660 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:17:28,997 : INFO : PROGRESS: at 6.80% examples, 247821 words/s, in_q

2018-01-18 10:18:42,719 : INFO : PROGRESS: at 9.94% examples, 250888 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:18:43,720 : INFO : PROGRESS: at 9.98% examples, 250648 words/s, in_qsize 6, out_qsize 1
2018-01-18 10:18:44,721 : INFO : PROGRESS: at 10.01% examples, 250577 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:18:45,883 : INFO : PROGRESS: at 10.05% examples, 250173 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:18:46,910 : INFO : PROGRESS: at 10.09% examples, 250111 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:18:47,941 : INFO : PROGRESS: at 10.13% examples, 250145 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:18:48,962 : INFO : PROGRESS: at 10.17% examples, 250090 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:18:49,977 : INFO : PROGRESS: at 10.21% examples, 250106 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:18:50,993 : INFO : PROGRESS: at 10.26% examples, 250221 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:18:52,104 : INFO : PROGRESS: at 10.29% examples, 249848 words

2018-01-18 10:20:05,195 : INFO : PROGRESS: at 13.07% examples, 245569 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:20:06,200 : INFO : PROGRESS: at 13.12% examples, 245652 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:20:07,209 : INFO : PROGRESS: at 13.16% examples, 245757 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:20:08,246 : INFO : PROGRESS: at 13.21% examples, 245867 words/s, in_qsize 4, out_qsize 1
2018-01-18 10:20:09,250 : INFO : PROGRESS: at 13.26% examples, 245977 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:20:10,255 : INFO : PROGRESS: at 13.31% examples, 246060 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:20:11,262 : INFO : PROGRESS: at 13.35% examples, 246067 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:20:12,270 : INFO : PROGRESS: at 13.39% examples, 246147 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:20:13,272 : INFO : PROGRESS: at 13.44% examples, 246231 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:20:14,289 : INFO : PROGRESS: at 13.49% examples, 246278 wor

2018-01-18 10:21:26,803 : INFO : PROGRESS: at 16.61% examples, 248771 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:21:27,838 : INFO : PROGRESS: at 16.66% examples, 248853 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:21:28,891 : INFO : PROGRESS: at 16.71% examples, 248924 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:21:29,916 : INFO : PROGRESS: at 16.76% examples, 249011 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:21:30,917 : INFO : PROGRESS: at 16.80% examples, 249073 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:21:31,923 : INFO : PROGRESS: at 16.85% examples, 249151 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:21:32,959 : INFO : PROGRESS: at 16.90% examples, 249211 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:21:33,994 : INFO : PROGRESS: at 16.95% examples, 249291 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:21:35,045 : INFO : PROGRESS: at 17.00% examples, 249361 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:21:36,062 : INFO : PROGRESS: at 17.04% examples, 249431 wor

2018-01-18 10:22:48,436 : INFO : PROGRESS: at 20.26% examples, 252597 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:22:49,436 : INFO : PROGRESS: at 20.30% examples, 252688 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:22:50,442 : INFO : PROGRESS: at 20.35% examples, 252744 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:22:51,445 : INFO : PROGRESS: at 20.40% examples, 252785 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:22:52,486 : INFO : PROGRESS: at 20.45% examples, 252873 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:22:53,491 : INFO : PROGRESS: at 20.50% examples, 252961 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:22:54,517 : INFO : PROGRESS: at 20.55% examples, 253037 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:22:55,527 : INFO : PROGRESS: at 20.59% examples, 253087 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:22:56,539 : INFO : PROGRESS: at 20.64% examples, 253138 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:22:57,582 : INFO : PROGRESS: at 20.69% examples, 253188 wor

2018-01-18 10:24:10,324 : INFO : PROGRESS: at 23.84% examples, 254144 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:24:11,341 : INFO : PROGRESS: at 23.88% examples, 254154 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:24:12,357 : INFO : PROGRESS: at 23.93% examples, 254206 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:24:13,380 : INFO : PROGRESS: at 23.97% examples, 254241 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:24:14,423 : INFO : PROGRESS: at 24.02% examples, 254283 words/s, in_qsize 4, out_qsize 1
2018-01-18 10:24:15,442 : INFO : PROGRESS: at 24.07% examples, 254321 words/s, in_qsize 4, out_qsize 1
2018-01-18 10:24:16,472 : INFO : PROGRESS: at 24.12% examples, 254355 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:24:17,518 : INFO : PROGRESS: at 24.16% examples, 254340 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:24:18,531 : INFO : PROGRESS: at 24.19% examples, 254243 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:24:19,547 : INFO : PROGRESS: at 24.24% examples, 254256 wor

2018-01-18 10:25:32,061 : INFO : PROGRESS: at 27.48% examples, 255883 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:25:33,109 : INFO : PROGRESS: at 27.53% examples, 255919 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:25:34,113 : INFO : PROGRESS: at 27.57% examples, 255959 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:25:35,115 : INFO : PROGRESS: at 27.62% examples, 256010 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:25:36,143 : INFO : PROGRESS: at 27.67% examples, 256028 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:25:37,206 : INFO : PROGRESS: at 27.70% examples, 255919 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:25:38,223 : INFO : PROGRESS: at 27.74% examples, 255916 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:25:39,236 : INFO : PROGRESS: at 27.79% examples, 255963 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:25:40,255 : INFO : PROGRESS: at 27.84% examples, 256020 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:25:41,258 : INFO : PROGRESS: at 27.89% examples, 256071 wor

2018-01-18 10:26:53,830 : INFO : PROGRESS: at 31.19% examples, 258036 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:26:54,843 : INFO : PROGRESS: at 31.23% examples, 258053 words/s, in_qsize 4, out_qsize 1
2018-01-18 10:26:55,896 : INFO : PROGRESS: at 31.28% examples, 258078 words/s, in_qsize 6, out_qsize 1
2018-01-18 10:26:56,959 : INFO : PROGRESS: at 31.32% examples, 258045 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:26:57,974 : INFO : PROGRESS: at 31.36% examples, 258028 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:26:58,976 : INFO : PROGRESS: at 31.40% examples, 257972 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:27:00,020 : INFO : PROGRESS: at 31.43% examples, 257880 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:27:01,039 : INFO : PROGRESS: at 31.47% examples, 257851 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:27:02,090 : INFO : PROGRESS: at 31.50% examples, 257746 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:27:03,110 : INFO : PROGRESS: at 31.54% examples, 257718 wor

2018-01-18 10:28:15,616 : INFO : PROGRESS: at 34.81% examples, 258978 words/s, in_qsize 4, out_qsize 1
2018-01-18 10:28:16,619 : INFO : PROGRESS: at 34.86% examples, 259007 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:28:17,623 : INFO : PROGRESS: at 34.90% examples, 259025 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:28:18,650 : INFO : PROGRESS: at 34.95% examples, 259026 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:28:19,673 : INFO : PROGRESS: at 34.99% examples, 259027 words/s, in_qsize 6, out_qsize 1
2018-01-18 10:28:20,675 : INFO : PROGRESS: at 35.04% examples, 259046 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:28:21,710 : INFO : PROGRESS: at 35.08% examples, 259054 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:28:22,760 : INFO : PROGRESS: at 35.13% examples, 259048 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:28:23,785 : INFO : PROGRESS: at 35.17% examples, 259059 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:28:24,798 : INFO : PROGRESS: at 35.22% examples, 259074 wor

2018-01-18 10:29:37,254 : INFO : PROGRESS: at 38.49% examples, 260214 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:29:38,291 : INFO : PROGRESS: at 38.54% examples, 260236 words/s, in_qsize 4, out_qsize 1
2018-01-18 10:29:39,335 : INFO : PROGRESS: at 38.58% examples, 260257 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:29:40,361 : INFO : PROGRESS: at 38.63% examples, 260282 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:29:41,383 : INFO : PROGRESS: at 38.67% examples, 260282 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:29:42,385 : INFO : PROGRESS: at 38.72% examples, 260306 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:29:43,393 : INFO : PROGRESS: at 38.77% examples, 260320 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:29:44,409 : INFO : PROGRESS: at 38.81% examples, 260340 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:29:45,419 : INFO : PROGRESS: at 38.86% examples, 260362 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:29:46,446 : INFO : PROGRESS: at 38.90% examples, 260369 wor

2018-01-18 10:30:59,366 : INFO : PROGRESS: at 42.15% examples, 260995 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:31:00,368 : INFO : PROGRESS: at 42.20% examples, 261025 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:31:01,402 : INFO : PROGRESS: at 42.25% examples, 261054 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:31:02,414 : INFO : PROGRESS: at 42.30% examples, 261088 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:31:03,418 : INFO : PROGRESS: at 42.34% examples, 261109 words/s, in_qsize 6, out_qsize 1
2018-01-18 10:31:04,427 : INFO : PROGRESS: at 42.39% examples, 261128 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:31:05,441 : INFO : PROGRESS: at 42.44% examples, 261138 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:31:06,443 : INFO : PROGRESS: at 42.48% examples, 261167 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:31:07,447 : INFO : PROGRESS: at 42.53% examples, 261195 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:31:08,453 : INFO : PROGRESS: at 42.58% examples, 261223 wor

2018-01-18 10:32:21,184 : INFO : PROGRESS: at 45.66% examples, 260765 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:32:22,193 : INFO : PROGRESS: at 45.70% examples, 260776 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:32:23,195 : INFO : PROGRESS: at 45.75% examples, 260788 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:32:24,214 : INFO : PROGRESS: at 45.79% examples, 260780 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:32:25,240 : INFO : PROGRESS: at 45.84% examples, 260794 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:32:26,244 : INFO : PROGRESS: at 45.88% examples, 260806 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:32:27,277 : INFO : PROGRESS: at 45.93% examples, 260811 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:32:28,280 : INFO : PROGRESS: at 45.98% examples, 260830 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:32:29,292 : INFO : PROGRESS: at 46.02% examples, 260825 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:32:30,320 : INFO : PROGRESS: at 46.06% examples, 260823 wor

2018-01-18 10:33:42,516 : INFO : PROGRESS: at 49.35% examples, 261711 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:33:43,537 : INFO : PROGRESS: at 49.39% examples, 261710 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:33:44,564 : INFO : PROGRESS: at 49.44% examples, 261729 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:33:45,597 : INFO : PROGRESS: at 49.49% examples, 261746 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:33:46,613 : INFO : PROGRESS: at 49.53% examples, 261760 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:33:47,646 : INFO : PROGRESS: at 49.58% examples, 261778 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:33:48,671 : INFO : PROGRESS: at 49.63% examples, 261797 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:33:49,714 : INFO : PROGRESS: at 49.67% examples, 261813 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:33:50,729 : INFO : PROGRESS: at 49.72% examples, 261820 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:33:51,746 : INFO : PROGRESS: at 49.77% examples, 261841 wor

2018-01-18 10:35:04,023 : INFO : PROGRESS: at 53.06% examples, 262727 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:35:05,039 : INFO : PROGRESS: at 53.11% examples, 262745 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:35:06,043 : INFO : PROGRESS: at 53.16% examples, 262760 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:35:07,073 : INFO : PROGRESS: at 53.20% examples, 262770 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:35:08,124 : INFO : PROGRESS: at 53.25% examples, 262781 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:35:09,136 : INFO : PROGRESS: at 53.30% examples, 262801 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:35:10,173 : INFO : PROGRESS: at 53.35% examples, 262816 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:35:11,173 : INFO : PROGRESS: at 53.40% examples, 262838 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:35:12,223 : INFO : PROGRESS: at 53.45% examples, 262843 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:35:13,235 : INFO : PROGRESS: at 53.50% examples, 262863 wor

2018-01-18 10:36:25,554 : INFO : PROGRESS: at 56.83% examples, 263713 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:36:26,606 : INFO : PROGRESS: at 56.88% examples, 263723 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:36:27,630 : INFO : PROGRESS: at 56.93% examples, 263733 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:36:28,666 : INFO : PROGRESS: at 56.98% examples, 263746 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:36:29,670 : INFO : PROGRESS: at 57.02% examples, 263760 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:36:30,684 : INFO : PROGRESS: at 57.07% examples, 263771 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:36:31,689 : INFO : PROGRESS: at 57.12% examples, 263778 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:36:32,708 : INFO : PROGRESS: at 57.16% examples, 263788 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:36:33,739 : INFO : PROGRESS: at 57.21% examples, 263802 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:36:34,756 : INFO : PROGRESS: at 57.26% examples, 263818 wor

2018-01-18 10:37:47,162 : INFO : PROGRESS: at 60.58% examples, 264616 words/s, in_qsize 4, out_qsize 1
2018-01-18 10:37:48,182 : INFO : PROGRESS: at 60.62% examples, 264630 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:37:49,189 : INFO : PROGRESS: at 60.67% examples, 264646 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:37:50,192 : INFO : PROGRESS: at 60.72% examples, 264657 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:37:51,203 : INFO : PROGRESS: at 60.77% examples, 264673 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:37:52,210 : INFO : PROGRESS: at 60.81% examples, 264690 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:37:53,229 : INFO : PROGRESS: at 60.86% examples, 264698 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:37:54,254 : INFO : PROGRESS: at 60.91% examples, 264712 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:37:55,256 : INFO : PROGRESS: at 60.95% examples, 264712 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:37:56,265 : INFO : PROGRESS: at 61.00% examples, 264722 wor

2018-01-18 10:39:08,787 : INFO : PROGRESS: at 64.39% examples, 265554 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:39:09,824 : INFO : PROGRESS: at 64.44% examples, 265569 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:39:10,861 : INFO : PROGRESS: at 64.49% examples, 265579 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:39:11,881 : INFO : PROGRESS: at 64.54% examples, 265591 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:39:12,893 : INFO : PROGRESS: at 64.58% examples, 265600 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:39:13,910 : INFO : PROGRESS: at 64.63% examples, 265608 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:39:14,914 : INFO : PROGRESS: at 64.68% examples, 265613 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:39:15,919 : INFO : PROGRESS: at 64.72% examples, 265628 words/s, in_qsize 4, out_qsize 1
2018-01-18 10:39:16,930 : INFO : PROGRESS: at 64.77% examples, 265636 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:39:17,950 : INFO : PROGRESS: at 64.82% examples, 265649 wor

2018-01-18 10:40:30,278 : INFO : PROGRESS: at 68.12% examples, 266084 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:40:31,312 : INFO : PROGRESS: at 68.17% examples, 266094 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:40:32,315 : INFO : PROGRESS: at 68.22% examples, 266109 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:40:33,323 : INFO : PROGRESS: at 68.27% examples, 266118 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:40:34,329 : INFO : PROGRESS: at 68.31% examples, 266128 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:40:35,337 : INFO : PROGRESS: at 68.36% examples, 266137 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:40:36,369 : INFO : PROGRESS: at 68.41% examples, 266142 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:40:37,432 : INFO : PROGRESS: at 68.46% examples, 266147 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:40:38,448 : INFO : PROGRESS: at 68.49% examples, 266118 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:40:39,451 : INFO : PROGRESS: at 68.54% examples, 266123 wor

2018-01-18 10:41:52,080 : INFO : PROGRESS: at 71.86% examples, 266648 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:41:53,131 : INFO : PROGRESS: at 71.90% examples, 266653 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:41:54,147 : INFO : PROGRESS: at 71.95% examples, 266663 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:41:55,188 : INFO : PROGRESS: at 72.00% examples, 266669 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:41:56,195 : INFO : PROGRESS: at 72.04% examples, 266681 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:41:57,215 : INFO : PROGRESS: at 72.09% examples, 266691 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:41:58,246 : INFO : PROGRESS: at 72.14% examples, 266700 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:41:59,255 : INFO : PROGRESS: at 72.19% examples, 266712 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:42:00,273 : INFO : PROGRESS: at 72.24% examples, 266723 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:42:01,285 : INFO : PROGRESS: at 72.28% examples, 266730 wor

2018-01-18 10:43:13,710 : INFO : PROGRESS: at 75.63% examples, 267214 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:43:14,715 : INFO : PROGRESS: at 75.68% examples, 267222 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:43:15,730 : INFO : PROGRESS: at 75.72% examples, 267229 words/s, in_qsize 4, out_qsize 1
2018-01-18 10:43:16,752 : INFO : PROGRESS: at 75.77% examples, 267239 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:43:17,759 : INFO : PROGRESS: at 75.82% examples, 267252 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:43:18,775 : INFO : PROGRESS: at 75.87% examples, 267258 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:43:19,796 : INFO : PROGRESS: at 75.92% examples, 267268 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:43:20,811 : INFO : PROGRESS: at 75.96% examples, 267270 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:43:21,853 : INFO : PROGRESS: at 76.00% examples, 267248 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:43:22,854 : INFO : PROGRESS: at 76.04% examples, 267234 wor

2018-01-18 10:44:35,363 : INFO : PROGRESS: at 79.38% examples, 267743 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:44:36,370 : INFO : PROGRESS: at 79.42% examples, 267723 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:44:37,376 : INFO : PROGRESS: at 79.47% examples, 267743 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:44:38,380 : INFO : PROGRESS: at 79.52% examples, 267746 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:44:39,439 : INFO : PROGRESS: at 79.57% examples, 267754 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:44:40,487 : INFO : PROGRESS: at 79.61% examples, 267760 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:44:41,496 : INFO : PROGRESS: at 79.66% examples, 267757 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:44:42,530 : INFO : PROGRESS: at 79.71% examples, 267765 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:44:43,598 : INFO : PROGRESS: at 79.75% examples, 267767 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:44:44,631 : INFO : PROGRESS: at 79.80% examples, 267762 wor

2018-01-18 10:45:56,772 : INFO : PROGRESS: at 83.09% examples, 268048 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:45:57,777 : INFO : PROGRESS: at 83.14% examples, 268050 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:45:58,814 : INFO : PROGRESS: at 83.19% examples, 268057 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:45:59,828 : INFO : PROGRESS: at 83.23% examples, 268067 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:46:00,837 : INFO : PROGRESS: at 83.28% examples, 268074 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:46:01,877 : INFO : PROGRESS: at 83.33% examples, 268080 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:46:02,888 : INFO : PROGRESS: at 83.38% examples, 268091 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:46:03,889 : INFO : PROGRESS: at 83.42% examples, 268095 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:46:04,904 : INFO : PROGRESS: at 83.47% examples, 268105 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:46:05,920 : INFO : PROGRESS: at 83.52% examples, 268106 wor

2018-01-18 10:47:18,604 : INFO : PROGRESS: at 86.73% examples, 268038 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:47:19,605 : INFO : PROGRESS: at 86.77% examples, 268009 words/s, in_qsize 4, out_qsize 1
2018-01-18 10:47:20,650 : INFO : PROGRESS: at 86.81% examples, 267985 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:47:21,677 : INFO : PROGRESS: at 86.85% examples, 267985 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:47:22,695 : INFO : PROGRESS: at 86.90% examples, 267986 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:47:23,708 : INFO : PROGRESS: at 86.94% examples, 267967 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:47:24,713 : INFO : PROGRESS: at 86.98% examples, 267958 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:47:25,731 : INFO : PROGRESS: at 87.02% examples, 267959 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:47:26,772 : INFO : PROGRESS: at 87.07% examples, 267965 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:47:27,800 : INFO : PROGRESS: at 87.12% examples, 267969 wor

2018-01-18 10:48:40,841 : INFO : PROGRESS: at 90.23% examples, 267607 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:48:41,855 : INFO : PROGRESS: at 90.28% examples, 267616 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:48:42,874 : INFO : PROGRESS: at 90.32% examples, 267609 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:48:43,889 : INFO : PROGRESS: at 90.36% examples, 267606 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:48:44,918 : INFO : PROGRESS: at 90.40% examples, 267570 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:48:45,951 : INFO : PROGRESS: at 90.43% examples, 267533 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:48:46,978 : INFO : PROGRESS: at 90.47% examples, 267509 words/s, in_qsize 4, out_qsize 1
2018-01-18 10:48:48,012 : INFO : PROGRESS: at 90.51% examples, 267492 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:48:49,014 : INFO : PROGRESS: at 90.54% examples, 267468 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:48:50,040 : INFO : PROGRESS: at 90.59% examples, 267464 wor

2018-01-18 10:50:02,635 : INFO : PROGRESS: at 93.68% examples, 267126 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:50:03,642 : INFO : PROGRESS: at 93.73% examples, 267120 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:50:04,650 : INFO : PROGRESS: at 93.77% examples, 267127 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:50:05,680 : INFO : PROGRESS: at 93.82% examples, 267130 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:50:06,698 : INFO : PROGRESS: at 93.87% examples, 267139 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:50:07,705 : INFO : PROGRESS: at 93.92% examples, 267145 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:50:08,706 : INFO : PROGRESS: at 93.96% examples, 267152 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:50:09,750 : INFO : PROGRESS: at 94.01% examples, 267150 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:50:10,752 : INFO : PROGRESS: at 94.05% examples, 267145 words/s, in_qsize 6, out_qsize 0
2018-01-18 10:50:11,773 : INFO : PROGRESS: at 94.10% examples, 267153 wor

2018-01-18 10:51:24,666 : INFO : PROGRESS: at 97.42% examples, 267423 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:51:25,671 : INFO : PROGRESS: at 97.46% examples, 267425 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:51:26,682 : INFO : PROGRESS: at 97.51% examples, 267416 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:51:27,723 : INFO : PROGRESS: at 97.55% examples, 267425 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:51:28,740 : INFO : PROGRESS: at 97.60% examples, 267430 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:51:29,755 : INFO : PROGRESS: at 97.65% examples, 267435 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:51:30,791 : INFO : PROGRESS: at 97.70% examples, 267441 words/s, in_qsize 4, out_qsize 1
2018-01-18 10:51:31,802 : INFO : PROGRESS: at 97.74% examples, 267442 words/s, in_qsize 4, out_qsize 1
2018-01-18 10:51:32,812 : INFO : PROGRESS: at 97.79% examples, 267448 words/s, in_qsize 5, out_qsize 0
2018-01-18 10:51:33,831 : INFO : PROGRESS: at 97.83% examples, 267453 wor

In [31]:
num_doc='50M'
sg=0
size=250
window=5
min_count=3
hs=1
negative=0
model_full = gensim.models.word2vec.Word2Vec(corpus, sg=sg, size=size, window=window, min_count=min_count, hs=hs, negative=0)

2018-01-24 13:07:18,117 : INFO : collecting all words and their counts
2018-01-24 13:07:18,122 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-01-24 13:07:18,229 : INFO : PROGRESS: at sentence #10000, processed 53499 words, keeping 8072 word types
2018-01-24 13:07:18,313 : INFO : PROGRESS: at sentence #20000, processed 108631 words, keeping 12072 word types
2018-01-24 13:07:18,379 : INFO : PROGRESS: at sentence #30000, processed 164079 words, keeping 15024 word types
2018-01-24 13:07:18,465 : INFO : PROGRESS: at sentence #40000, processed 219488 words, keeping 17523 word types
2018-01-24 13:07:18,546 : INFO : PROGRESS: at sentence #50000, processed 274392 words, keeping 19823 word types
2018-01-24 13:07:18,607 : INFO : PROGRESS: at sentence #60000, processed 328944 words, keeping 21940 word types
2018-01-24 13:07:18,658 : INFO : PROGRESS: at sentence #70000, processed 383988 words, keeping 23848 word types
2018-01-24 13:07:18,707 : INFO : PROGRESS: at se

2018-01-24 13:07:21,753 : INFO : PROGRESS: at sentence #710000, processed 3892917 words, keeping 95066 word types
2018-01-24 13:07:21,796 : INFO : PROGRESS: at sentence #720000, processed 3947720 words, keeping 95880 word types
2018-01-24 13:07:21,840 : INFO : PROGRESS: at sentence #730000, processed 4002743 words, keeping 96651 word types
2018-01-24 13:07:21,881 : INFO : PROGRESS: at sentence #740000, processed 4057675 words, keeping 97490 word types
2018-01-24 13:07:21,929 : INFO : PROGRESS: at sentence #750000, processed 4112925 words, keeping 98289 word types
2018-01-24 13:07:21,972 : INFO : PROGRESS: at sentence #760000, processed 4167845 words, keeping 99080 word types
2018-01-24 13:07:22,020 : INFO : PROGRESS: at sentence #770000, processed 4222332 words, keeping 99886 word types
2018-01-24 13:07:22,064 : INFO : PROGRESS: at sentence #780000, processed 4277218 words, keeping 100670 word types
2018-01-24 13:07:22,106 : INFO : PROGRESS: at sentence #790000, processed 4332093 words

2018-01-24 13:07:24,986 : INFO : PROGRESS: at sentence #1420000, processed 7790453 words, keeping 148007 word types
2018-01-24 13:07:25,025 : INFO : PROGRESS: at sentence #1430000, processed 7845002 words, keeping 148650 word types
2018-01-24 13:07:25,071 : INFO : PROGRESS: at sentence #1440000, processed 7899647 words, keeping 149300 word types
2018-01-24 13:07:25,112 : INFO : PROGRESS: at sentence #1450000, processed 7953866 words, keeping 149974 word types
2018-01-24 13:07:25,153 : INFO : PROGRESS: at sentence #1460000, processed 8008757 words, keeping 150620 word types
2018-01-24 13:07:25,195 : INFO : PROGRESS: at sentence #1470000, processed 8062331 words, keeping 151299 word types
2018-01-24 13:07:25,242 : INFO : PROGRESS: at sentence #1480000, processed 8116935 words, keeping 151979 word types
2018-01-24 13:07:25,292 : INFO : PROGRESS: at sentence #1490000, processed 8171416 words, keeping 152641 word types
2018-01-24 13:07:25,332 : INFO : PROGRESS: at sentence #1500000, process

2018-01-24 13:07:28,295 : INFO : PROGRESS: at sentence #2130000, processed 11729569 words, keeping 193039 word types
2018-01-24 13:07:28,343 : INFO : PROGRESS: at sentence #2140000, processed 11784842 words, keeping 193611 word types
2018-01-24 13:07:28,387 : INFO : PROGRESS: at sentence #2150000, processed 11841227 words, keeping 194251 word types
2018-01-24 13:07:28,433 : INFO : PROGRESS: at sentence #2160000, processed 11897064 words, keeping 194868 word types
2018-01-24 13:07:28,476 : INFO : PROGRESS: at sentence #2170000, processed 11953218 words, keeping 195482 word types
2018-01-24 13:07:28,525 : INFO : PROGRESS: at sentence #2180000, processed 12009217 words, keeping 196130 word types
2018-01-24 13:07:28,566 : INFO : PROGRESS: at sentence #2190000, processed 12064977 words, keeping 196708 word types
2018-01-24 13:07:28,615 : INFO : PROGRESS: at sentence #2200000, processed 12120898 words, keeping 197295 word types
2018-01-24 13:07:28,661 : INFO : PROGRESS: at sentence #2210000,

2018-01-24 13:07:31,598 : INFO : PROGRESS: at sentence #2840000, processed 15718595 words, keeping 234651 word types
2018-01-24 13:07:31,646 : INFO : PROGRESS: at sentence #2850000, processed 15774461 words, keeping 235276 word types
2018-01-24 13:07:31,698 : INFO : PROGRESS: at sentence #2860000, processed 15831321 words, keeping 235869 word types
2018-01-24 13:07:31,743 : INFO : PROGRESS: at sentence #2870000, processed 15887752 words, keeping 236417 word types
2018-01-24 13:07:31,791 : INFO : PROGRESS: at sentence #2880000, processed 15943874 words, keeping 237004 word types
2018-01-24 13:07:31,838 : INFO : PROGRESS: at sentence #2890000, processed 15999987 words, keeping 237562 word types
2018-01-24 13:07:31,890 : INFO : PROGRESS: at sentence #2900000, processed 16056669 words, keeping 238047 word types
2018-01-24 13:07:31,933 : INFO : PROGRESS: at sentence #2910000, processed 16112519 words, keeping 238588 word types
2018-01-24 13:07:31,985 : INFO : PROGRESS: at sentence #2920000,

2018-01-24 13:07:35,024 : INFO : PROGRESS: at sentence #3550000, processed 19669587 words, keeping 260218 word types
2018-01-24 13:07:35,073 : INFO : PROGRESS: at sentence #3560000, processed 19723601 words, keeping 260218 word types
2018-01-24 13:07:35,126 : INFO : PROGRESS: at sentence #3570000, processed 19778143 words, keeping 260218 word types
2018-01-24 13:07:35,198 : INFO : PROGRESS: at sentence #3580000, processed 19832759 words, keeping 260218 word types
2018-01-24 13:07:35,271 : INFO : PROGRESS: at sentence #3590000, processed 19887479 words, keeping 260218 word types
2018-01-24 13:07:35,341 : INFO : PROGRESS: at sentence #3600000, processed 19942079 words, keeping 260218 word types
2018-01-24 13:07:35,412 : INFO : PROGRESS: at sentence #3610000, processed 19996412 words, keeping 260218 word types
2018-01-24 13:07:35,470 : INFO : PROGRESS: at sentence #3620000, processed 20050731 words, keeping 260218 word types
2018-01-24 13:07:35,526 : INFO : PROGRESS: at sentence #3630000,

2018-01-24 13:07:38,898 : INFO : PROGRESS: at sentence #4260000, processed 23571783 words, keeping 260218 word types
2018-01-24 13:07:38,944 : INFO : PROGRESS: at sentence #4270000, processed 23627068 words, keeping 260218 word types
2018-01-24 13:07:39,016 : INFO : PROGRESS: at sentence #4280000, processed 23681511 words, keeping 260218 word types
2018-01-24 13:07:39,077 : INFO : PROGRESS: at sentence #4290000, processed 23736733 words, keeping 260218 word types
2018-01-24 13:07:39,125 : INFO : PROGRESS: at sentence #4300000, processed 23791324 words, keeping 260218 word types
2018-01-24 13:07:39,167 : INFO : PROGRESS: at sentence #4310000, processed 23846201 words, keeping 260218 word types
2018-01-24 13:07:39,221 : INFO : PROGRESS: at sentence #4320000, processed 23901102 words, keeping 260218 word types
2018-01-24 13:07:39,263 : INFO : PROGRESS: at sentence #4330000, processed 23955495 words, keeping 260218 word types
2018-01-24 13:07:39,316 : INFO : PROGRESS: at sentence #4340000,

2018-01-24 13:07:42,294 : INFO : PROGRESS: at sentence #4970000, processed 27468270 words, keeping 260218 word types
2018-01-24 13:07:42,333 : INFO : PROGRESS: at sentence #4980000, processed 27523278 words, keeping 260218 word types
2018-01-24 13:07:42,384 : INFO : PROGRESS: at sentence #4990000, processed 27578765 words, keeping 260218 word types
2018-01-24 13:07:42,425 : INFO : PROGRESS: at sentence #5000000, processed 27634524 words, keeping 260218 word types
2018-01-24 13:07:42,477 : INFO : PROGRESS: at sentence #5010000, processed 27690286 words, keeping 260218 word types
2018-01-24 13:07:42,518 : INFO : PROGRESS: at sentence #5020000, processed 27746009 words, keeping 260218 word types
2018-01-24 13:07:42,567 : INFO : PROGRESS: at sentence #5030000, processed 27801293 words, keeping 260218 word types
2018-01-24 13:07:42,619 : INFO : PROGRESS: at sentence #5040000, processed 27857036 words, keeping 260218 word types
2018-01-24 13:07:42,659 : INFO : PROGRESS: at sentence #5050000,

2018-01-24 13:07:45,630 : INFO : PROGRESS: at sentence #5680000, processed 31436585 words, keeping 260218 word types
2018-01-24 13:07:45,674 : INFO : PROGRESS: at sentence #5690000, processed 31492659 words, keeping 260218 word types
2018-01-24 13:07:45,724 : INFO : PROGRESS: at sentence #5700000, processed 31549315 words, keeping 260218 word types
2018-01-24 13:07:45,774 : INFO : PROGRESS: at sentence #5710000, processed 31605780 words, keeping 260218 word types
2018-01-24 13:07:45,820 : INFO : PROGRESS: at sentence #5720000, processed 31661839 words, keeping 260218 word types
2018-01-24 13:07:45,875 : INFO : PROGRESS: at sentence #5730000, processed 31717835 words, keeping 260218 word types
2018-01-24 13:07:45,921 : INFO : PROGRESS: at sentence #5740000, processed 31774157 words, keeping 260218 word types
2018-01-24 13:07:45,977 : INFO : PROGRESS: at sentence #5750000, processed 31829317 words, keeping 260218 word types
2018-01-24 13:07:46,030 : INFO : PROGRESS: at sentence #5760000,

2018-01-24 13:07:49,182 : INFO : PROGRESS: at sentence #6390000, processed 35428934 words, keeping 260218 word types
2018-01-24 13:07:49,242 : INFO : PROGRESS: at sentence #6400000, processed 35484747 words, keeping 260218 word types
2018-01-24 13:07:49,299 : INFO : PROGRESS: at sentence #6410000, processed 35540952 words, keeping 260218 word types
2018-01-24 13:07:49,349 : INFO : PROGRESS: at sentence #6420000, processed 35597254 words, keeping 260218 word types
2018-01-24 13:07:49,398 : INFO : PROGRESS: at sentence #6430000, processed 35653145 words, keeping 260218 word types
2018-01-24 13:07:49,446 : INFO : PROGRESS: at sentence #6440000, processed 35709549 words, keeping 260218 word types
2018-01-24 13:07:49,501 : INFO : PROGRESS: at sentence #6450000, processed 35766129 words, keeping 260218 word types
2018-01-24 13:07:49,556 : INFO : PROGRESS: at sentence #6460000, processed 35822231 words, keeping 260218 word types
2018-01-24 13:07:49,606 : INFO : PROGRESS: at sentence #6470000,

2018-01-24 13:07:52,980 : INFO : PROGRESS: at sentence #7100000, processed 39423315 words, keeping 287146 word types
2018-01-24 13:07:53,042 : INFO : PROGRESS: at sentence #7110000, processed 39480369 words, keeping 287725 word types
2018-01-24 13:07:53,109 : INFO : PROGRESS: at sentence #7120000, processed 39536930 words, keeping 288247 word types
2018-01-24 13:07:53,177 : INFO : PROGRESS: at sentence #7130000, processed 39592989 words, keeping 288771 word types
2018-01-24 13:07:53,226 : INFO : PROGRESS: at sentence #7140000, processed 39649260 words, keeping 289310 word types
2018-01-24 13:07:53,282 : INFO : PROGRESS: at sentence #7150000, processed 39706352 words, keeping 289863 word types
2018-01-24 13:07:53,335 : INFO : PROGRESS: at sentence #7160000, processed 39762111 words, keeping 290334 word types
2018-01-24 13:07:53,394 : INFO : PROGRESS: at sentence #7170000, processed 39818476 words, keeping 290885 word types
2018-01-24 13:07:53,449 : INFO : PROGRESS: at sentence #7180000,

2018-01-24 13:07:56,712 : INFO : PROGRESS: at sentence #7810000, processed 43435321 words, keeping 323748 word types
2018-01-24 13:07:56,765 : INFO : PROGRESS: at sentence #7820000, processed 43491825 words, keeping 324255 word types
2018-01-24 13:07:56,814 : INFO : PROGRESS: at sentence #7830000, processed 43548044 words, keeping 324803 word types
2018-01-24 13:07:56,863 : INFO : PROGRESS: at sentence #7840000, processed 43604088 words, keeping 325282 word types
2018-01-24 13:07:56,917 : INFO : PROGRESS: at sentence #7850000, processed 43661138 words, keeping 325796 word types
2018-01-24 13:07:56,973 : INFO : PROGRESS: at sentence #7860000, processed 43718580 words, keeping 326287 word types
2018-01-24 13:07:57,018 : INFO : PROGRESS: at sentence #7870000, processed 43776203 words, keeping 326817 word types
2018-01-24 13:07:57,074 : INFO : PROGRESS: at sentence #7880000, processed 43833325 words, keeping 327367 word types
2018-01-24 13:07:57,123 : INFO : PROGRESS: at sentence #7890000,

2018-01-24 13:08:00,556 : INFO : PROGRESS: at sentence #8520000, processed 47481436 words, keeping 359076 word types
2018-01-24 13:08:00,603 : INFO : PROGRESS: at sentence #8530000, processed 47538702 words, keeping 359557 word types
2018-01-24 13:08:00,656 : INFO : PROGRESS: at sentence #8540000, processed 47595066 words, keeping 360081 word types
2018-01-24 13:08:00,709 : INFO : PROGRESS: at sentence #8550000, processed 47651814 words, keeping 360558 word types
2018-01-24 13:08:00,762 : INFO : PROGRESS: at sentence #8560000, processed 47709316 words, keeping 361099 word types
2018-01-24 13:08:00,804 : INFO : PROGRESS: at sentence #8570000, processed 47766694 words, keeping 361588 word types
2018-01-24 13:08:00,859 : INFO : PROGRESS: at sentence #8580000, processed 47823363 words, keeping 362081 word types
2018-01-24 13:08:00,906 : INFO : PROGRESS: at sentence #8590000, processed 47880719 words, keeping 362615 word types
2018-01-24 13:08:00,960 : INFO : PROGRESS: at sentence #8600000,

2018-01-24 13:08:04,144 : INFO : PROGRESS: at sentence #9230000, processed 51532458 words, keeping 394025 word types
2018-01-24 13:08:04,200 : INFO : PROGRESS: at sentence #9240000, processed 51589139 words, keeping 394472 word types
2018-01-24 13:08:04,246 : INFO : PROGRESS: at sentence #9250000, processed 51646453 words, keeping 394972 word types
2018-01-24 13:08:04,303 : INFO : PROGRESS: at sentence #9260000, processed 51704030 words, keeping 395410 word types
2018-01-24 13:08:04,348 : INFO : PROGRESS: at sentence #9270000, processed 51761765 words, keeping 395929 word types
2018-01-24 13:08:04,405 : INFO : PROGRESS: at sentence #9280000, processed 51818660 words, keeping 396417 word types
2018-01-24 13:08:04,455 : INFO : PROGRESS: at sentence #9290000, processed 51875701 words, keeping 396910 word types
2018-01-24 13:08:04,505 : INFO : PROGRESS: at sentence #9300000, processed 51932768 words, keeping 397369 word types
2018-01-24 13:08:04,555 : INFO : PROGRESS: at sentence #9310000,

2018-01-24 13:08:07,914 : INFO : PROGRESS: at sentence #9940000, processed 55593201 words, keeping 427275 word types
2018-01-24 13:08:07,968 : INFO : PROGRESS: at sentence #9950000, processed 55651002 words, keeping 427745 word types
2018-01-24 13:08:08,014 : INFO : PROGRESS: at sentence #9960000, processed 55707454 words, keeping 428147 word types
2018-01-24 13:08:08,078 : INFO : PROGRESS: at sentence #9970000, processed 55764431 words, keeping 428649 word types
2018-01-24 13:08:08,135 : INFO : PROGRESS: at sentence #9980000, processed 55821610 words, keeping 429108 word types
2018-01-24 13:08:08,180 : INFO : PROGRESS: at sentence #9990000, processed 55878999 words, keeping 429577 word types
2018-01-24 13:08:08,237 : INFO : PROGRESS: at sentence #10000000, processed 55936119 words, keeping 430028 word types
2018-01-24 13:08:08,281 : INFO : PROGRESS: at sentence #10010000, processed 55992573 words, keeping 430476 word types
2018-01-24 13:08:08,336 : INFO : PROGRESS: at sentence #100200

2018-01-24 13:08:11,656 : INFO : PROGRESS: at sentence #10640000, processed 59551853 words, keeping 450305 word types
2018-01-24 13:08:11,706 : INFO : PROGRESS: at sentence #10650000, processed 59606488 words, keeping 450305 word types
2018-01-24 13:08:11,759 : INFO : PROGRESS: at sentence #10660000, processed 59661100 words, keeping 450305 word types
2018-01-24 13:08:11,806 : INFO : PROGRESS: at sentence #10670000, processed 59715586 words, keeping 450305 word types
2018-01-24 13:08:11,855 : INFO : PROGRESS: at sentence #10680000, processed 59770432 words, keeping 450305 word types
2018-01-24 13:08:11,905 : INFO : PROGRESS: at sentence #10690000, processed 59824221 words, keeping 450305 word types
2018-01-24 13:08:11,951 : INFO : PROGRESS: at sentence #10700000, processed 59878700 words, keeping 450305 word types
2018-01-24 13:08:12,005 : INFO : PROGRESS: at sentence #10710000, processed 59933360 words, keeping 450305 word types
2018-01-24 13:08:12,051 : INFO : PROGRESS: at sentence #

2018-01-24 13:08:15,401 : INFO : PROGRESS: at sentence #11340000, processed 63396373 words, keeping 450305 word types
2018-01-24 13:08:15,467 : INFO : PROGRESS: at sentence #11350000, processed 63451412 words, keeping 450305 word types
2018-01-24 13:08:15,537 : INFO : PROGRESS: at sentence #11360000, processed 63506481 words, keeping 450305 word types
2018-01-24 13:08:15,589 : INFO : PROGRESS: at sentence #11370000, processed 63561707 words, keeping 450305 word types
2018-01-24 13:08:15,643 : INFO : PROGRESS: at sentence #11380000, processed 63616653 words, keeping 450305 word types
2018-01-24 13:08:15,695 : INFO : PROGRESS: at sentence #11390000, processed 63672349 words, keeping 450305 word types
2018-01-24 13:08:15,752 : INFO : PROGRESS: at sentence #11400000, processed 63727470 words, keeping 450305 word types
2018-01-24 13:08:15,807 : INFO : PROGRESS: at sentence #11410000, processed 63782104 words, keeping 450305 word types
2018-01-24 13:08:15,864 : INFO : PROGRESS: at sentence #

2018-01-24 13:08:19,466 : INFO : PROGRESS: at sentence #12040000, processed 67236930 words, keeping 450305 word types
2018-01-24 13:08:19,514 : INFO : PROGRESS: at sentence #12050000, processed 67292211 words, keeping 450305 word types
2018-01-24 13:08:19,564 : INFO : PROGRESS: at sentence #12060000, processed 67347610 words, keeping 450305 word types
2018-01-24 13:08:19,619 : INFO : PROGRESS: at sentence #12070000, processed 67402737 words, keeping 450305 word types
2018-01-24 13:08:19,677 : INFO : PROGRESS: at sentence #12080000, processed 67458052 words, keeping 450305 word types
2018-01-24 13:08:19,724 : INFO : PROGRESS: at sentence #12090000, processed 67513128 words, keeping 450305 word types
2018-01-24 13:08:19,778 : INFO : PROGRESS: at sentence #12100000, processed 67568657 words, keeping 450305 word types
2018-01-24 13:08:19,827 : INFO : PROGRESS: at sentence #12110000, processed 67623877 words, keeping 450305 word types
2018-01-24 13:08:19,872 : INFO : PROGRESS: at sentence #

2018-01-24 13:08:22,952 : INFO : PROGRESS: at sentence #12740000, processed 71141930 words, keeping 450305 word types
2018-01-24 13:08:23,004 : INFO : PROGRESS: at sentence #12750000, processed 71197900 words, keeping 450305 word types
2018-01-24 13:08:23,048 : INFO : PROGRESS: at sentence #12760000, processed 71254932 words, keeping 450305 word types
2018-01-24 13:08:23,097 : INFO : PROGRESS: at sentence #12770000, processed 71310505 words, keeping 450305 word types
2018-01-24 13:08:23,144 : INFO : PROGRESS: at sentence #12780000, processed 71366992 words, keeping 450305 word types
2018-01-24 13:08:23,195 : INFO : PROGRESS: at sentence #12790000, processed 71423549 words, keeping 450305 word types
2018-01-24 13:08:23,241 : INFO : PROGRESS: at sentence #12800000, processed 71479808 words, keeping 450305 word types
2018-01-24 13:08:23,289 : INFO : PROGRESS: at sentence #12810000, processed 71536558 words, keeping 450305 word types
2018-01-24 13:08:23,340 : INFO : PROGRESS: at sentence #

2018-01-24 13:08:26,475 : INFO : PROGRESS: at sentence #13440000, processed 75078424 words, keeping 450305 word types
2018-01-24 13:08:26,526 : INFO : PROGRESS: at sentence #13450000, processed 75135080 words, keeping 450305 word types
2018-01-24 13:08:26,574 : INFO : PROGRESS: at sentence #13460000, processed 75190799 words, keeping 450305 word types
2018-01-24 13:08:26,624 : INFO : PROGRESS: at sentence #13470000, processed 75246889 words, keeping 450305 word types
2018-01-24 13:08:26,683 : INFO : PROGRESS: at sentence #13480000, processed 75303475 words, keeping 450305 word types
2018-01-24 13:08:26,730 : INFO : PROGRESS: at sentence #13490000, processed 75360184 words, keeping 450305 word types
2018-01-24 13:08:26,777 : INFO : PROGRESS: at sentence #13500000, processed 75416082 words, keeping 450305 word types
2018-01-24 13:08:26,827 : INFO : PROGRESS: at sentence #13510000, processed 75472870 words, keeping 450305 word types
2018-01-24 13:08:26,884 : INFO : PROGRESS: at sentence #

2018-01-24 13:08:30,208 : INFO : PROGRESS: at sentence #14140000, processed 79018932 words, keeping 450305 word types
2018-01-24 13:08:30,268 : INFO : PROGRESS: at sentence #14150000, processed 79074860 words, keeping 450305 word types
2018-01-24 13:08:30,323 : INFO : PROGRESS: at sentence #14160000, processed 79130889 words, keeping 450305 word types
2018-01-24 13:08:30,376 : INFO : PROGRESS: at sentence #14170000, processed 79187152 words, keeping 450305 word types
2018-01-24 13:08:30,432 : INFO : PROGRESS: at sentence #14180000, processed 79243644 words, keeping 450305 word types
2018-01-24 13:08:30,485 : INFO : PROGRESS: at sentence #14190000, processed 79299102 words, keeping 450305 word types
2018-01-24 13:08:30,542 : INFO : PROGRESS: at sentence #14200000, processed 79355223 words, keeping 450305 word types
2018-01-24 13:08:30,597 : INFO : PROGRESS: at sentence #14210000, processed 79411584 words, keeping 450305 word types
2018-01-24 13:08:30,652 : INFO : PROGRESS: at sentence #

2018-01-24 13:08:34,138 : INFO : PROGRESS: at sentence #14840000, processed 82970813 words, keeping 450305 word types
2018-01-24 13:08:34,196 : INFO : PROGRESS: at sentence #14850000, processed 83027175 words, keeping 450305 word types
2018-01-24 13:08:34,251 : INFO : PROGRESS: at sentence #14860000, processed 83083678 words, keeping 450305 word types
2018-01-24 13:08:34,312 : INFO : PROGRESS: at sentence #14870000, processed 83140356 words, keeping 450305 word types
2018-01-24 13:08:34,367 : INFO : PROGRESS: at sentence #14880000, processed 83196608 words, keeping 450305 word types
2018-01-24 13:08:34,428 : INFO : PROGRESS: at sentence #14890000, processed 83252707 words, keeping 450305 word types
2018-01-24 13:08:34,487 : INFO : PROGRESS: at sentence #14900000, processed 83309745 words, keeping 450305 word types
2018-01-24 13:08:34,541 : INFO : PROGRESS: at sentence #14910000, processed 83366699 words, keeping 450305 word types
2018-01-24 13:08:34,602 : INFO : PROGRESS: at sentence #

2018-01-24 13:08:38,167 : INFO : PROGRESS: at sentence #15540000, processed 86953310 words, keeping 450305 word types
2018-01-24 13:08:38,221 : INFO : PROGRESS: at sentence #15550000, processed 87010181 words, keeping 450305 word types
2018-01-24 13:08:38,272 : INFO : PROGRESS: at sentence #15560000, processed 87067241 words, keeping 450305 word types
2018-01-24 13:08:38,332 : INFO : PROGRESS: at sentence #15570000, processed 87124517 words, keeping 450305 word types
2018-01-24 13:08:38,383 : INFO : PROGRESS: at sentence #15580000, processed 87182335 words, keeping 450305 word types
2018-01-24 13:08:38,437 : INFO : PROGRESS: at sentence #15590000, processed 87239373 words, keeping 450305 word types
2018-01-24 13:08:38,489 : INFO : PROGRESS: at sentence #15600000, processed 87296245 words, keeping 450305 word types
2018-01-24 13:08:38,543 : INFO : PROGRESS: at sentence #15610000, processed 87353358 words, keeping 450305 word types
2018-01-24 13:08:38,596 : INFO : PROGRESS: at sentence #

2018-01-24 13:08:41,948 : INFO : PROGRESS: at sentence #16240000, processed 90948285 words, keeping 450305 word types
2018-01-24 13:08:42,001 : INFO : PROGRESS: at sentence #16250000, processed 91004815 words, keeping 450305 word types
2018-01-24 13:08:42,053 : INFO : PROGRESS: at sentence #16260000, processed 91062042 words, keeping 450305 word types
2018-01-24 13:08:42,107 : INFO : PROGRESS: at sentence #16270000, processed 91119496 words, keeping 450305 word types
2018-01-24 13:08:42,166 : INFO : PROGRESS: at sentence #16280000, processed 91176386 words, keeping 450305 word types
2018-01-24 13:08:42,223 : INFO : PROGRESS: at sentence #16290000, processed 91232946 words, keeping 450305 word types
2018-01-24 13:08:42,276 : INFO : PROGRESS: at sentence #16300000, processed 91289517 words, keeping 450305 word types
2018-01-24 13:08:42,329 : INFO : PROGRESS: at sentence #16310000, processed 91346694 words, keeping 450305 word types
2018-01-24 13:08:42,378 : INFO : PROGRESS: at sentence #

2018-01-24 13:08:45,783 : INFO : PROGRESS: at sentence #16940000, processed 94951491 words, keeping 450305 word types
2018-01-24 13:08:45,830 : INFO : PROGRESS: at sentence #16950000, processed 95008221 words, keeping 450305 word types
2018-01-24 13:08:45,889 : INFO : PROGRESS: at sentence #16960000, processed 95065402 words, keeping 450305 word types
2018-01-24 13:08:45,939 : INFO : PROGRESS: at sentence #16970000, processed 95122897 words, keeping 450305 word types
2018-01-24 13:08:46,001 : INFO : PROGRESS: at sentence #16980000, processed 95180315 words, keeping 450305 word types
2018-01-24 13:08:46,059 : INFO : PROGRESS: at sentence #16990000, processed 95236762 words, keeping 450305 word types
2018-01-24 13:08:46,122 : INFO : PROGRESS: at sentence #17000000, processed 95293196 words, keeping 450305 word types
2018-01-24 13:08:46,179 : INFO : PROGRESS: at sentence #17010000, processed 95350536 words, keeping 450305 word types
2018-01-24 13:08:46,227 : INFO : PROGRESS: at sentence #

2018-01-24 13:08:53,452 : INFO : constructing a huffman tree from 295222 words
2018-01-24 13:09:08,254 : INFO : built huffman tree with maximum node depth 25
2018-01-24 13:09:08,389 : INFO : resetting layer weights
2018-01-24 13:09:14,847 : INFO : training model with 3 workers on 295222 vocabulary and 250 features, using sg=0 hs=1 sample=0.001 negative=0 window=5
2018-01-24 13:09:15,908 : INFO : PROGRESS: at 0.06% examples, 265749 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:09:16,949 : INFO : PROGRESS: at 0.13% examples, 284705 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:09:17,961 : INFO : PROGRESS: at 0.20% examples, 296581 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:09:18,976 : INFO : PROGRESS: at 0.27% examples, 302316 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:09:19,978 : INFO : PROGRESS: at 0.34% examples, 302726 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:09:21,016 : INFO : PROGRESS: at 0.40% examples, 302803 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:09:22,04

2018-01-24 13:10:34,531 : INFO : PROGRESS: at 5.15% examples, 299351 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:10:35,555 : INFO : PROGRESS: at 5.22% examples, 299090 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:10:36,582 : INFO : PROGRESS: at 5.28% examples, 299176 words/s, in_qsize 6, out_qsize 1
2018-01-24 13:10:37,585 : INFO : PROGRESS: at 5.36% examples, 299458 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:10:38,591 : INFO : PROGRESS: at 5.42% examples, 299270 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:10:39,604 : INFO : PROGRESS: at 5.48% examples, 299057 words/s, in_qsize 6, out_qsize 1
2018-01-24 13:10:40,622 : INFO : PROGRESS: at 5.53% examples, 298390 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:10:41,650 : INFO : PROGRESS: at 5.59% examples, 297924 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:10:42,652 : INFO : PROGRESS: at 5.66% examples, 297879 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:10:43,674 : INFO : PROGRESS: at 5.72% examples, 297878 words/s, in_q

2018-01-24 13:11:57,254 : INFO : PROGRESS: at 10.33% examples, 296266 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:11:58,254 : INFO : PROGRESS: at 10.40% examples, 296358 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:11:59,272 : INFO : PROGRESS: at 10.46% examples, 296418 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:12:00,274 : INFO : PROGRESS: at 10.53% examples, 296506 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:12:01,289 : INFO : PROGRESS: at 10.60% examples, 296624 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:12:02,293 : INFO : PROGRESS: at 10.66% examples, 296707 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:12:03,323 : INFO : PROGRESS: at 10.73% examples, 296855 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:12:04,350 : INFO : PROGRESS: at 10.80% examples, 297005 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:12:05,404 : INFO : PROGRESS: at 10.87% examples, 297107 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:12:06,410 : INFO : PROGRESS: at 10.94% examples, 297289 wor

2018-01-24 13:13:18,769 : INFO : PROGRESS: at 15.72% examples, 300505 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:13:19,787 : INFO : PROGRESS: at 15.79% examples, 300568 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:13:20,816 : INFO : PROGRESS: at 15.86% examples, 300654 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:13:21,850 : INFO : PROGRESS: at 15.93% examples, 300697 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:13:22,854 : INFO : PROGRESS: at 16.00% examples, 300776 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:13:23,854 : INFO : PROGRESS: at 16.07% examples, 300896 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:13:24,855 : INFO : PROGRESS: at 16.14% examples, 300976 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:13:25,884 : INFO : PROGRESS: at 16.21% examples, 301022 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:13:26,915 : INFO : PROGRESS: at 16.28% examples, 301177 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:13:27,959 : INFO : PROGRESS: at 16.36% examples, 301279 wor

2018-01-24 13:14:40,489 : INFO : PROGRESS: at 21.07% examples, 302153 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:14:41,496 : INFO : PROGRESS: at 21.14% examples, 302213 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:14:42,508 : INFO : PROGRESS: at 21.22% examples, 302355 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:14:43,544 : INFO : PROGRESS: at 21.29% examples, 302443 words/s, in_qsize 5, out_qsize 1
2018-01-24 13:14:44,554 : INFO : PROGRESS: at 21.36% examples, 302557 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:14:45,580 : INFO : PROGRESS: at 21.44% examples, 302624 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:14:46,614 : INFO : PROGRESS: at 21.51% examples, 302684 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:14:47,628 : INFO : PROGRESS: at 21.58% examples, 302792 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:14:48,652 : INFO : PROGRESS: at 21.66% examples, 302889 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:14:49,669 : INFO : PROGRESS: at 21.73% examples, 302963 wor

2018-01-24 13:16:02,055 : INFO : PROGRESS: at 26.62% examples, 304890 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:16:03,071 : INFO : PROGRESS: at 26.70% examples, 304969 words/s, in_qsize 5, out_qsize 1
2018-01-24 13:16:04,101 : INFO : PROGRESS: at 26.77% examples, 305039 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:16:05,120 : INFO : PROGRESS: at 26.84% examples, 305046 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:16:06,123 : INFO : PROGRESS: at 26.91% examples, 305087 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:16:07,126 : INFO : PROGRESS: at 26.97% examples, 305106 words/s, in_qsize 6, out_qsize 1
2018-01-24 13:16:08,136 : INFO : PROGRESS: at 27.04% examples, 305098 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:16:09,145 : INFO : PROGRESS: at 27.10% examples, 305089 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:16:10,188 : INFO : PROGRESS: at 27.17% examples, 305078 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:16:11,193 : INFO : PROGRESS: at 27.23% examples, 305072 wor

2018-01-24 13:17:23,597 : INFO : PROGRESS: at 32.05% examples, 306318 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:17:24,619 : INFO : PROGRESS: at 32.12% examples, 306380 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:17:25,634 : INFO : PROGRESS: at 32.19% examples, 306407 words/s, in_qsize 5, out_qsize 1
2018-01-24 13:17:26,664 : INFO : PROGRESS: at 32.27% examples, 306464 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:17:27,670 : INFO : PROGRESS: at 32.34% examples, 306515 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:17:28,694 : INFO : PROGRESS: at 32.42% examples, 306555 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:17:29,694 : INFO : PROGRESS: at 32.49% examples, 306589 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:17:30,706 : INFO : PROGRESS: at 32.56% examples, 306617 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:17:31,726 : INFO : PROGRESS: at 32.63% examples, 306657 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:17:32,738 : INFO : PROGRESS: at 32.70% examples, 306703 wor

2018-01-24 13:18:45,165 : INFO : PROGRESS: at 37.51% examples, 307124 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:18:46,192 : INFO : PROGRESS: at 37.57% examples, 307100 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:18:47,204 : INFO : PROGRESS: at 37.64% examples, 307102 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:18:48,224 : INFO : PROGRESS: at 37.71% examples, 307115 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:18:49,250 : INFO : PROGRESS: at 37.77% examples, 307109 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:18:50,260 : INFO : PROGRESS: at 37.84% examples, 307112 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:18:51,266 : INFO : PROGRESS: at 37.91% examples, 307117 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:18:52,280 : INFO : PROGRESS: at 37.95% examples, 306921 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:18:53,288 : INFO : PROGRESS: at 38.00% examples, 306843 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:18:54,341 : INFO : PROGRESS: at 38.06% examples, 306742 wor

2018-01-24 13:20:06,651 : INFO : PROGRESS: at 43.00% examples, 308079 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:20:07,661 : INFO : PROGRESS: at 43.06% examples, 308082 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:20:08,671 : INFO : PROGRESS: at 43.13% examples, 308087 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:20:09,681 : INFO : PROGRESS: at 43.20% examples, 308090 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:20:10,710 : INFO : PROGRESS: at 43.27% examples, 308100 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:20:11,735 : INFO : PROGRESS: at 43.34% examples, 308125 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:20:12,744 : INFO : PROGRESS: at 43.41% examples, 308173 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:20:13,750 : INFO : PROGRESS: at 43.48% examples, 308222 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:20:14,800 : INFO : PROGRESS: at 43.55% examples, 308236 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:20:15,816 : INFO : PROGRESS: at 43.62% examples, 308210 wor

2018-01-24 13:21:28,190 : INFO : PROGRESS: at 48.58% examples, 309264 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:21:29,197 : INFO : PROGRESS: at 48.65% examples, 309277 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:21:30,199 : INFO : PROGRESS: at 48.72% examples, 309306 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:21:31,219 : INFO : PROGRESS: at 48.79% examples, 309326 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:21:32,220 : INFO : PROGRESS: at 48.86% examples, 309367 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:21:33,222 : INFO : PROGRESS: at 48.93% examples, 309381 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:21:34,232 : INFO : PROGRESS: at 49.00% examples, 309431 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:21:35,246 : INFO : PROGRESS: at 49.07% examples, 309454 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:21:36,250 : INFO : PROGRESS: at 49.15% examples, 309506 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:21:37,330 : INFO : PROGRESS: at 49.22% examples, 309513 wor

2018-01-24 13:22:49,711 : INFO : PROGRESS: at 54.18% examples, 310466 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:22:50,713 : INFO : PROGRESS: at 54.24% examples, 310458 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:22:51,740 : INFO : PROGRESS: at 54.31% examples, 310452 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:22:52,755 : INFO : PROGRESS: at 54.36% examples, 310334 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:22:53,771 : INFO : PROGRESS: at 54.42% examples, 310286 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:22:54,837 : INFO : PROGRESS: at 54.47% examples, 310208 words/s, in_qsize 6, out_qsize 1
2018-01-24 13:22:55,910 : INFO : PROGRESS: at 54.54% examples, 310184 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:22:56,953 : INFO : PROGRESS: at 54.60% examples, 310161 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:22:57,981 : INFO : PROGRESS: at 54.68% examples, 310190 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:22:58,996 : INFO : PROGRESS: at 54.75% examples, 310223 wor

2018-01-24 13:24:11,180 : INFO : PROGRESS: at 59.71% examples, 311312 words/s, in_qsize 5, out_qsize 1
2018-01-24 13:24:12,185 : INFO : PROGRESS: at 59.78% examples, 311331 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:24:13,199 : INFO : PROGRESS: at 59.85% examples, 311348 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:24:14,247 : INFO : PROGRESS: at 59.91% examples, 311342 words/s, in_qsize 5, out_qsize 2
2018-01-24 13:24:15,267 : INFO : PROGRESS: at 59.98% examples, 311346 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:24:16,316 : INFO : PROGRESS: at 60.05% examples, 311343 words/s, in_qsize 6, out_qsize 1
2018-01-24 13:24:17,335 : INFO : PROGRESS: at 60.12% examples, 311351 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:24:18,341 : INFO : PROGRESS: at 60.20% examples, 311375 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:24:19,347 : INFO : PROGRESS: at 60.27% examples, 311397 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:24:20,384 : INFO : PROGRESS: at 60.34% examples, 311420 wor

2018-01-24 13:25:32,657 : INFO : PROGRESS: at 65.47% examples, 312668 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:25:33,673 : INFO : PROGRESS: at 65.54% examples, 312674 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:25:34,698 : INFO : PROGRESS: at 65.61% examples, 312677 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:25:35,702 : INFO : PROGRESS: at 65.68% examples, 312687 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:25:36,738 : INFO : PROGRESS: at 65.75% examples, 312686 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:25:37,748 : INFO : PROGRESS: at 65.82% examples, 312693 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:25:38,785 : INFO : PROGRESS: at 65.89% examples, 312703 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:25:39,798 : INFO : PROGRESS: at 65.97% examples, 312719 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:25:40,837 : INFO : PROGRESS: at 66.04% examples, 312737 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:25:41,882 : INFO : PROGRESS: at 66.11% examples, 312753 wor

2018-01-24 13:26:54,161 : INFO : PROGRESS: at 70.98% examples, 313028 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:26:55,183 : INFO : PROGRESS: at 71.04% examples, 312993 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:26:56,203 : INFO : PROGRESS: at 71.10% examples, 312995 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:26:57,209 : INFO : PROGRESS: at 71.17% examples, 312991 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:26:58,213 : INFO : PROGRESS: at 71.24% examples, 312997 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:26:59,227 : INFO : PROGRESS: at 71.31% examples, 313010 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:27:00,241 : INFO : PROGRESS: at 71.38% examples, 313021 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:27:01,291 : INFO : PROGRESS: at 71.45% examples, 313031 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:27:02,315 : INFO : PROGRESS: at 71.52% examples, 313040 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:27:03,320 : INFO : PROGRESS: at 71.58% examples, 313038 wor

2018-01-24 13:28:15,482 : INFO : PROGRESS: at 76.59% examples, 313661 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:28:16,523 : INFO : PROGRESS: at 76.66% examples, 313648 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:28:17,527 : INFO : PROGRESS: at 76.72% examples, 313628 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:28:18,536 : INFO : PROGRESS: at 76.79% examples, 313624 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:28:19,561 : INFO : PROGRESS: at 76.86% examples, 313624 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:28:20,583 : INFO : PROGRESS: at 76.92% examples, 313616 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:28:21,626 : INFO : PROGRESS: at 76.99% examples, 313602 words/s, in_qsize 6, out_qsize 1
2018-01-24 13:28:22,627 : INFO : PROGRESS: at 77.06% examples, 313617 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:28:23,660 : INFO : PROGRESS: at 77.13% examples, 313631 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:28:24,684 : INFO : PROGRESS: at 77.20% examples, 313630 wor

2018-01-24 13:29:37,133 : INFO : PROGRESS: at 82.23% examples, 314304 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:29:38,154 : INFO : PROGRESS: at 82.30% examples, 314314 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:29:39,195 : INFO : PROGRESS: at 82.38% examples, 314326 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:29:40,203 : INFO : PROGRESS: at 82.45% examples, 314347 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:29:41,216 : INFO : PROGRESS: at 82.52% examples, 314367 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:29:42,253 : INFO : PROGRESS: at 82.60% examples, 314380 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:29:43,266 : INFO : PROGRESS: at 82.67% examples, 314415 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:29:44,296 : INFO : PROGRESS: at 82.75% examples, 314438 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:29:45,307 : INFO : PROGRESS: at 82.82% examples, 314458 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:29:46,329 : INFO : PROGRESS: at 82.89% examples, 314467 wor

2018-01-24 13:30:58,698 : INFO : PROGRESS: at 87.71% examples, 314209 words/s, in_qsize 4, out_qsize 1
2018-01-24 13:30:59,724 : INFO : PROGRESS: at 87.79% examples, 314215 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:31:00,768 : INFO : PROGRESS: at 87.85% examples, 314203 words/s, in_qsize 6, out_qsize 1
2018-01-24 13:31:01,826 : INFO : PROGRESS: at 87.92% examples, 314201 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:31:02,853 : INFO : PROGRESS: at 87.99% examples, 314200 words/s, in_qsize 4, out_qsize 1
2018-01-24 13:31:03,873 : INFO : PROGRESS: at 88.06% examples, 314186 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:31:04,897 : INFO : PROGRESS: at 88.12% examples, 314179 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:31:05,941 : INFO : PROGRESS: at 88.19% examples, 314174 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:31:06,959 : INFO : PROGRESS: at 88.26% examples, 314175 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:31:07,995 : INFO : PROGRESS: at 88.33% examples, 314164 wor

2018-01-24 13:32:20,200 : INFO : PROGRESS: at 93.19% examples, 314281 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:32:21,203 : INFO : PROGRESS: at 93.26% examples, 314280 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:32:22,216 : INFO : PROGRESS: at 93.33% examples, 314277 words/s, in_qsize 6, out_qsize 1
2018-01-24 13:32:23,238 : INFO : PROGRESS: at 93.41% examples, 314292 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:32:24,242 : INFO : PROGRESS: at 93.47% examples, 314284 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:32:25,268 : INFO : PROGRESS: at 93.54% examples, 314271 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:32:26,296 : INFO : PROGRESS: at 93.61% examples, 314264 words/s, in_qsize 6, out_qsize 0
2018-01-24 13:32:27,312 : INFO : PROGRESS: at 93.68% examples, 314261 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:32:28,353 : INFO : PROGRESS: at 93.74% examples, 314244 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:32:29,382 : INFO : PROGRESS: at 93.81% examples, 314238 wor

2018-01-24 13:33:41,835 : INFO : PROGRESS: at 98.66% examples, 314266 words/s, in_qsize 5, out_qsize 1
2018-01-24 13:33:42,875 : INFO : PROGRESS: at 98.73% examples, 314268 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:33:43,887 : INFO : PROGRESS: at 98.80% examples, 314264 words/s, in_qsize 6, out_qsize 1
2018-01-24 13:33:44,911 : INFO : PROGRESS: at 98.87% examples, 314263 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:33:45,923 : INFO : PROGRESS: at 98.93% examples, 314259 words/s, in_qsize 4, out_qsize 1
2018-01-24 13:33:46,948 : INFO : PROGRESS: at 99.00% examples, 314252 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:33:47,971 : INFO : PROGRESS: at 99.07% examples, 314251 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:33:49,015 : INFO : PROGRESS: at 99.13% examples, 314246 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:33:50,035 : INFO : PROGRESS: at 99.20% examples, 314246 words/s, in_qsize 5, out_qsize 0
2018-01-24 13:33:51,064 : INFO : PROGRESS: at 99.27% examples, 314245 wor

####  some considerations:
 - revisit tokenization  / spell checking / entity recognition
 - stop words?
 - things to adjust: vector size, negative sampling, min_count
 - review more notes on logs

## Save model

In [32]:
path = "/Users/stevenfelix/Documents/DataScience_local/Insight/"
file = 'model_full_{}_sg{}_sz{}_win{}_min{}_hs{}_neg{}'.format(num_doc,sg,size,window,min_count,hs,negative)
model_full.save(path+file)
model_full.wv.save_word2vec_format(path+file+'_kv')

2018-01-24 13:34:01,567 : INFO : saving Word2Vec object under /Users/stevenfelix/Documents/DataScience_local/Insight/model_full_50M_sg0_sz250_win5_min3_hs1_neg0, separately None
2018-01-24 13:34:01,577 : INFO : storing np array 'syn0' to /Users/stevenfelix/Documents/DataScience_local/Insight/model_full_50M_sg0_sz250_win5_min3_hs1_neg0.wv.syn0.npy
2018-01-24 13:34:02,494 : INFO : not storing attribute syn0norm
2018-01-24 13:34:02,502 : INFO : storing np array 'syn1' to /Users/stevenfelix/Documents/DataScience_local/Insight/model_full_50M_sg0_sz250_win5_min3_hs1_neg0.syn1.npy
2018-01-24 13:34:03,039 : INFO : not storing attribute cum_table
2018-01-24 13:34:11,425 : INFO : saved /Users/stevenfelix/Documents/DataScience_local/Insight/model_full_50M_sg0_sz250_win5_min3_hs1_neg0
2018-01-24 13:34:11,429 : INFO : storing 295222x250 projection weights into /Users/stevenfelix/Documents/DataScience_local/Insight/model_full_50M_sg0_sz250_win5_min3_hs1_neg0_kv


## brief tests

In [10]:
model_nostop.most_similar(['iterate'])

NameError: name 'model_nostop' is not defined

In [33]:
model_full.most_similar(['iterate'])

2018-01-24 14:22:31,017 : INFO : precomputing L2-norms of word weight vectors


[('iterating', 0.8353675007820129),
 ('looping', 0.6400182247161865),
 ('traverse', 0.5869097709655762),
 ('flatten', 0.5416625738143921),
 ('enumerate', 0.5304164290428162),
 ('traversing', 0.4963976740837097),
 ('rearrange', 0.48345401883125305),
 ('reorder', 0.46991872787475586),
 ('sorting', 0.4689270257949829),
 ('enumerating', 0.4686262309551239)]

In [57]:
model_full.most_similar(['pyspark'])

[('rdd', 0.38008955121040344),
 ('poky', 0.3452593684196472),
 ('mrjob', 0.34351515769958496),
 ('scriptswindows', 0.3395163416862488),
 ('bsddb', 0.3378233015537262),
 ('datastructures', 0.3373665511608124),
 ('rmr2', 0.337257981300354),
 ('mllib', 0.33450764417648315),
 ('cassandradriver', 0.3343200981616974),
 ('yarn', 0.32895198464393616)]

In [44]:
model_full.most_similar(['nodetool'])

[('commitlogheader', 0.5858930349349976),
 ('json2sstable', 0.5270707011222839),
 ('cfstats', 0.5217332243919373),
 ('rcassandra', 0.5164926648139954),
 ('randompartitioner', 0.5140954852104187),
 ('sstableloader', 0.5115439295768738),
 ('cqlsh', 0.5114718675613403),
 ('cassandrajar', 0.5046938061714172),
 ('noavailablehostsexception', 0.5021253228187561),
 ('aquiles', 0.49960777163505554)]

**synonyms work pretty well!!!**

In [12]:
model_nostop.predict_output_word(['iterate','dataframe','rows'])

NameError: name 'model_nostop' is not defined

In [13]:
x = model_full.predict_output_word(['iterate','dataframe','rows'])

In [14]:
x

[('panda', 0.0070507326),
 ('dataframe', 0.0058084298),
 ('dataframes', 0.0043647736),
 ('pairwise', 0.0010982126),
 ('over', 0.0010611849),
 ('multiindex', 0.0010397913),
 ('subset', 0.00093480962),
 ('subsetting', 0.00092987594),
 ('tuples', 0.00075485575),
 ('numpy', 0.00072131585)]

In [None]:
q="How to find all questions that were duplicates of another question".split()
model_full.predict_output_word(q)