In [38]:
import gensim
from gensim.models import word2vec
import logging
from gensim.parsing.preprocessing import strip_punctuation, strip_numeric, strip_multiple_whitespaces
from gensim.utils import decode_htmlentities
import time
import re
import numpy as np

In [39]:
u = 'E tu vivrai nel terrore - L&#x27;aldil&#xE0; (1981)'
s = decode_htmlentities(u)

In [40]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [41]:
def read_input(input_file):
    logging.info("reading file {0}...this may take a while".format(input_file))
    with open(input_file, 'r') as f:
        for i, line in enumerate(f):
            if (i % 10000 == 0):
                logging.info("read {0} reviews".format(i))
                line = decode_htmlentities(line)
                line = re.sub('[!"#%&\'\(\)*+,-./:;<=>?@\[\]\^_`{|}~1234567890’”“′‘\\\]','', line)
                line = strip_punctuation(line)
                line = strip_numeric(line)
                line = strip_multiple_whitespaces(line)
                #print(line)
            # do some pre-processing and return list of words for each review text
            yield( gensim.utils.simple_preprocess(line, deacc=True, min_len=3))          

In [37]:
start = time.time()
documents = list(read_input ('/home/tpradhan/DBLP/others/word2vectraincorpus.txt'))
logging.info ("Done reading data file")
print(time.time()-start)

#print("The preprocessed word2vec text2 is :\n", documents)

2018-07-05 00:25:46,120 : INFO : reading file /home/tpradhan/DBLP/others/test2.txt...this may take a while
2018-07-05 00:25:46,122 : INFO : read 0 reviews
2018-07-05 00:25:46,124 : INFO : reading file /home/tpradhan/DBLP/others/test3.txt...this may take a while
2018-07-05 00:25:46,125 : INFO : read 0 reviews
2018-07-05 00:25:46,126 : INFO : Done reading data file


 author relations between articles ie two articles with the same authors The rationale underlying OUR METHOD is that researchers 
author relations between articles ie two articles with the same authors The rationale underlying our method is that researchers
0.006928443908691406
The preprocessed word2vec text2 is :
 [['author', 'relations', 'between', 'articles', 'two', 'articles', 'with', 'the', 'same', 'authors', 'the', 'rationale', 'underlying', 'our', 'method', 'that', 'researchers']]
The preprocessed word2vec text3 is :
 [['author', 'relations', 'between', 'articles', 'two', 'articles', 'with', 'the', 'same', 'authors', 'the', 'rationale', 'underlying', 'our', 'method', 'that', 'researchers']]


In [5]:
start=time.time()
model = gensim.models.Word2Vec(documents, size=100, alpha=0.025, window=5, min_count=5,
                               sample=0.001, seed=1, workers=55, 
                               min_alpha=0.0001, sg=1, hs=0, negative=5, iter=5, 
                               sorted_vocab=1, batch_words=10000, 
                               compute_loss=True)
model.train(documents, total_examples=len(documents), epochs=7)
print(time.time()-start)

2018-06-03 15:17:18,107 : INFO : collecting all words and their counts
2018-06-03 15:17:18,109 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-06-03 15:18:09,166 : INFO : collected 726100 word types from a corpus of 222189061 raw words and 1378 sentences
2018-06-03 15:18:09,168 : INFO : Loading a fresh vocabulary
2018-06-03 15:18:38,088 : INFO : min_count=2 retains 420324 unique words (57% of original 726100, drops 305776)
2018-06-03 15:18:38,089 : INFO : min_count=2 leaves 221883285 word corpus (99% of original 222189061, drops 305776)
2018-06-03 15:18:39,149 : INFO : deleting the raw counts dictionary of 726100 items
2018-06-03 15:18:39,195 : INFO : sample=0.001 downsamples 21 most-common words
2018-06-03 15:18:39,196 : INFO : downsampling leaves estimated 212840290 word corpus (95.9% of prior 221883285)
2018-06-03 15:18:40,940 : INFO : estimated required memory for 420324 words and 156 dimensions: 734726352 bytes
2018-06-03 15:18:40,941 : INFO : reset

2018-06-03 15:18:48,263 : INFO : worker thread finished; awaiting finish of 27 more threads
2018-06-03 15:18:48,283 : INFO : worker thread finished; awaiting finish of 26 more threads
2018-06-03 15:18:48,313 : INFO : worker thread finished; awaiting finish of 25 more threads
2018-06-03 15:18:48,316 : INFO : worker thread finished; awaiting finish of 24 more threads
2018-06-03 15:18:48,318 : INFO : worker thread finished; awaiting finish of 23 more threads
2018-06-03 15:18:48,325 : INFO : worker thread finished; awaiting finish of 22 more threads
2018-06-03 15:18:48,329 : INFO : worker thread finished; awaiting finish of 21 more threads
2018-06-03 15:18:48,331 : INFO : worker thread finished; awaiting finish of 20 more threads
2018-06-03 15:18:48,332 : INFO : worker thread finished; awaiting finish of 19 more threads
2018-06-03 15:18:48,333 : INFO : worker thread finished; awaiting finish of 18 more threads
2018-06-03 15:18:48,334 : INFO : worker thread finished; awaiting finish of 17 m

2018-06-03 15:18:51,197 : INFO : worker thread finished; awaiting finish of 45 more threads
2018-06-03 15:18:51,201 : INFO : worker thread finished; awaiting finish of 44 more threads
2018-06-03 15:18:51,206 : INFO : worker thread finished; awaiting finish of 43 more threads
2018-06-03 15:18:51,225 : INFO : worker thread finished; awaiting finish of 42 more threads
2018-06-03 15:18:51,295 : INFO : worker thread finished; awaiting finish of 41 more threads
2018-06-03 15:18:51,317 : INFO : worker thread finished; awaiting finish of 40 more threads
2018-06-03 15:18:51,337 : INFO : worker thread finished; awaiting finish of 39 more threads
2018-06-03 15:18:51,340 : INFO : worker thread finished; awaiting finish of 38 more threads
2018-06-03 15:18:51,342 : INFO : worker thread finished; awaiting finish of 37 more threads
2018-06-03 15:18:51,344 : INFO : worker thread finished; awaiting finish of 36 more threads
2018-06-03 15:18:51,400 : INFO : worker thread finished; awaiting finish of 35 m

2018-06-03 15:18:53,866 : INFO : worker thread finished; awaiting finish of 63 more threads
2018-06-03 15:18:53,867 : INFO : worker thread finished; awaiting finish of 62 more threads
2018-06-03 15:18:53,868 : INFO : worker thread finished; awaiting finish of 61 more threads
2018-06-03 15:18:53,869 : INFO : worker thread finished; awaiting finish of 60 more threads
2018-06-03 15:18:53,870 : INFO : worker thread finished; awaiting finish of 59 more threads
2018-06-03 15:18:53,871 : INFO : worker thread finished; awaiting finish of 58 more threads
2018-06-03 15:18:53,884 : INFO : worker thread finished; awaiting finish of 57 more threads
2018-06-03 15:18:53,931 : INFO : worker thread finished; awaiting finish of 56 more threads
2018-06-03 15:18:53,941 : INFO : worker thread finished; awaiting finish of 55 more threads
2018-06-03 15:18:53,949 : INFO : worker thread finished; awaiting finish of 54 more threads
2018-06-03 15:18:53,967 : INFO : worker thread finished; awaiting finish of 53 m

2018-06-03 15:18:56,195 : INFO : worker thread finished; awaiting finish of 78 more threads
2018-06-03 15:18:56,208 : INFO : worker thread finished; awaiting finish of 77 more threads
2018-06-03 15:18:56,217 : INFO : worker thread finished; awaiting finish of 76 more threads
2018-06-03 15:18:56,218 : INFO : worker thread finished; awaiting finish of 75 more threads
2018-06-03 15:18:56,222 : INFO : worker thread finished; awaiting finish of 74 more threads
2018-06-03 15:18:56,225 : INFO : worker thread finished; awaiting finish of 73 more threads
2018-06-03 15:18:56,240 : INFO : worker thread finished; awaiting finish of 72 more threads
2018-06-03 15:18:56,250 : INFO : worker thread finished; awaiting finish of 71 more threads
2018-06-03 15:18:56,253 : INFO : worker thread finished; awaiting finish of 70 more threads
2018-06-03 15:18:56,255 : INFO : worker thread finished; awaiting finish of 69 more threads
2018-06-03 15:18:56,256 : INFO : worker thread finished; awaiting finish of 68 m

2018-06-03 15:18:58,632 : INFO : worker thread finished; awaiting finish of 94 more threads
2018-06-03 15:18:58,681 : INFO : worker thread finished; awaiting finish of 93 more threads
2018-06-03 15:18:58,754 : INFO : worker thread finished; awaiting finish of 92 more threads
2018-06-03 15:18:58,766 : INFO : worker thread finished; awaiting finish of 91 more threads
2018-06-03 15:18:58,791 : INFO : worker thread finished; awaiting finish of 90 more threads
2018-06-03 15:18:58,796 : INFO : worker thread finished; awaiting finish of 89 more threads
2018-06-03 15:18:58,799 : INFO : worker thread finished; awaiting finish of 88 more threads
2018-06-03 15:18:58,804 : INFO : worker thread finished; awaiting finish of 87 more threads
2018-06-03 15:18:58,831 : INFO : worker thread finished; awaiting finish of 86 more threads
2018-06-03 15:18:58,880 : INFO : worker thread finished; awaiting finish of 85 more threads
2018-06-03 15:18:58,887 : INFO : worker thread finished; awaiting finish of 84 m

2018-06-03 15:18:59,518 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-06-03 15:18:59,525 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-06-03 15:18:59,533 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-06-03 15:18:59,545 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-06-03 15:18:59,546 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-03 15:18:59,558 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-03 15:18:59,563 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-03 15:18:59,563 : INFO : EPOCH - 5 : training on 222189061 raw words (1274960 effective words) took 2.6s, 491031 effective words/s
2018-06-03 15:18:59,564 : INFO : training on a 1110945305 raw words (6374789 effective words) took 14.4s, 444227 effective words/s
2018-06-03 15:18:59,566 : INFO : training model with 102 workers on 420324 vocabulary and 156

2018-06-03 15:19:02,447 : INFO : worker thread finished; awaiting finish of 26 more threads
2018-06-03 15:19:02,450 : INFO : worker thread finished; awaiting finish of 25 more threads
2018-06-03 15:19:02,457 : INFO : worker thread finished; awaiting finish of 24 more threads
2018-06-03 15:19:02,488 : INFO : worker thread finished; awaiting finish of 23 more threads
2018-06-03 15:19:02,498 : INFO : worker thread finished; awaiting finish of 22 more threads
2018-06-03 15:19:02,499 : INFO : worker thread finished; awaiting finish of 21 more threads
2018-06-03 15:19:02,519 : INFO : worker thread finished; awaiting finish of 20 more threads
2018-06-03 15:19:02,527 : INFO : worker thread finished; awaiting finish of 19 more threads
2018-06-03 15:19:02,552 : INFO : worker thread finished; awaiting finish of 18 more threads
2018-06-03 15:19:02,554 : INFO : worker thread finished; awaiting finish of 17 more threads
2018-06-03 15:19:02,572 : INFO : worker thread finished; awaiting finish of 16 m

2018-06-03 15:19:05,113 : INFO : worker thread finished; awaiting finish of 42 more threads
2018-06-03 15:19:05,113 : INFO : worker thread finished; awaiting finish of 41 more threads
2018-06-03 15:19:05,114 : INFO : worker thread finished; awaiting finish of 40 more threads
2018-06-03 15:19:05,114 : INFO : worker thread finished; awaiting finish of 39 more threads
2018-06-03 15:19:05,115 : INFO : worker thread finished; awaiting finish of 38 more threads
2018-06-03 15:19:05,115 : INFO : worker thread finished; awaiting finish of 37 more threads
2018-06-03 15:19:05,116 : INFO : worker thread finished; awaiting finish of 36 more threads
2018-06-03 15:19:05,116 : INFO : worker thread finished; awaiting finish of 35 more threads
2018-06-03 15:19:05,117 : INFO : worker thread finished; awaiting finish of 34 more threads
2018-06-03 15:19:05,117 : INFO : worker thread finished; awaiting finish of 33 more threads
2018-06-03 15:19:05,118 : INFO : worker thread finished; awaiting finish of 32 m

2018-06-03 15:19:08,006 : INFO : worker thread finished; awaiting finish of 58 more threads
2018-06-03 15:19:08,011 : INFO : worker thread finished; awaiting finish of 57 more threads
2018-06-03 15:19:08,035 : INFO : worker thread finished; awaiting finish of 56 more threads
2018-06-03 15:19:08,037 : INFO : worker thread finished; awaiting finish of 55 more threads
2018-06-03 15:19:08,037 : INFO : worker thread finished; awaiting finish of 54 more threads
2018-06-03 15:19:08,086 : INFO : worker thread finished; awaiting finish of 53 more threads
2018-06-03 15:19:08,094 : INFO : worker thread finished; awaiting finish of 52 more threads
2018-06-03 15:19:08,097 : INFO : worker thread finished; awaiting finish of 51 more threads
2018-06-03 15:19:08,099 : INFO : worker thread finished; awaiting finish of 50 more threads
2018-06-03 15:19:08,153 : INFO : worker thread finished; awaiting finish of 49 more threads
2018-06-03 15:19:08,156 : INFO : worker thread finished; awaiting finish of 48 m

2018-06-03 15:19:13,014 : INFO : worker thread finished; awaiting finish of 90 more threads
2018-06-03 15:19:13,116 : INFO : worker thread finished; awaiting finish of 89 more threads
2018-06-03 15:19:13,131 : INFO : worker thread finished; awaiting finish of 88 more threads
2018-06-03 15:19:13,134 : INFO : worker thread finished; awaiting finish of 87 more threads
2018-06-03 15:19:13,137 : INFO : worker thread finished; awaiting finish of 86 more threads
2018-06-03 15:19:13,289 : INFO : EPOCH 5 - PROGRESS: at 76.63% examples, 329851 words/s, in_qsize 77, out_qsize 17
2018-06-03 15:19:13,292 : INFO : worker thread finished; awaiting finish of 85 more threads
2018-06-03 15:19:13,294 : INFO : worker thread finished; awaiting finish of 84 more threads
2018-06-03 15:19:13,295 : INFO : worker thread finished; awaiting finish of 83 more threads
2018-06-03 15:19:13,344 : INFO : worker thread finished; awaiting finish of 82 more threads
2018-06-03 15:19:13,347 : INFO : worker thread finished; 

2018-06-03 15:19:14,160 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-03 15:19:14,162 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-03 15:19:14,167 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-03 15:19:14,168 : INFO : EPOCH - 5 : training on 222189061 raw words (1274953 effective words) took 3.0s, 431738 effective words/s
2018-06-03 15:19:15,226 : INFO : EPOCH 6 - PROGRESS: at 49.35% examples, 158122 words/s, in_qsize 163, out_qsize 0
2018-06-03 15:19:15,885 : INFO : worker thread finished; awaiting finish of 101 more threads
2018-06-03 15:19:15,900 : INFO : worker thread finished; awaiting finish of 100 more threads
2018-06-03 15:19:15,931 : INFO : worker thread finished; awaiting finish of 99 more threads
2018-06-03 15:19:15,933 : INFO : worker thread finished; awaiting finish of 98 more threads
2018-06-03 15:19:15,949 : INFO : worker thread finished; awaiting finish of 97 more threads
2018-06-03 

2018-06-03 15:19:17,026 : INFO : worker thread finished; awaiting finish of 18 more threads
2018-06-03 15:19:17,057 : INFO : worker thread finished; awaiting finish of 17 more threads
2018-06-03 15:19:17,077 : INFO : worker thread finished; awaiting finish of 16 more threads
2018-06-03 15:19:17,087 : INFO : worker thread finished; awaiting finish of 15 more threads
2018-06-03 15:19:17,089 : INFO : worker thread finished; awaiting finish of 14 more threads
2018-06-03 15:19:17,113 : INFO : worker thread finished; awaiting finish of 13 more threads
2018-06-03 15:19:17,132 : INFO : worker thread finished; awaiting finish of 12 more threads
2018-06-03 15:19:17,134 : INFO : worker thread finished; awaiting finish of 11 more threads
2018-06-03 15:19:17,144 : INFO : worker thread finished; awaiting finish of 10 more threads
2018-06-03 15:19:17,149 : INFO : worker thread finished; awaiting finish of 9 more threads
2018-06-03 15:19:17,152 : INFO : worker thread finished; awaiting finish of 8 mor

2018-06-03 15:19:18,825 : INFO : worker thread finished; awaiting finish of 33 more threads
2018-06-03 15:19:18,843 : INFO : worker thread finished; awaiting finish of 32 more threads
2018-06-03 15:19:18,900 : INFO : worker thread finished; awaiting finish of 31 more threads
2018-06-03 15:19:18,909 : INFO : worker thread finished; awaiting finish of 30 more threads
2018-06-03 15:19:18,913 : INFO : worker thread finished; awaiting finish of 29 more threads
2018-06-03 15:19:18,915 : INFO : worker thread finished; awaiting finish of 28 more threads
2018-06-03 15:19:18,934 : INFO : worker thread finished; awaiting finish of 27 more threads
2018-06-03 15:19:18,946 : INFO : worker thread finished; awaiting finish of 26 more threads
2018-06-03 15:19:18,964 : INFO : worker thread finished; awaiting finish of 25 more threads
2018-06-03 15:19:18,978 : INFO : worker thread finished; awaiting finish of 24 more threads
2018-06-03 15:19:18,981 : INFO : worker thread finished; awaiting finish of 23 m

121.03422093391418


In [6]:
model.save('/tmp/word2vec_model')

2018-06-03 15:20:23,817 : INFO : saving Word2Vec object under /tmp/word2vec_model, separately None
2018-06-03 15:20:23,819 : INFO : storing np array 'vectors' to /tmp/word2vec_model.wv.vectors.npy
2018-06-03 15:20:24,007 : INFO : not storing attribute vectors_norm
2018-06-03 15:20:24,009 : INFO : storing np array 'syn1neg' to /tmp/word2vec_model.trainables.syn1neg.npy
2018-06-03 15:20:24,160 : INFO : not storing attribute cum_table
2018-06-03 15:20:25,504 : INFO : saved /tmp/word2vec_model


In [8]:
model.wv.most_similar(positive=['learning'])

2018-06-03 16:15:55,732 : INFO : precomputing L2-norms of word weight vectors


[('learner', 0.8040422201156616),
 ('learners', 0.7816915512084961),
 ('learn', 0.7726538181304932),
 ('learned', 0.7591537833213806),
 ('rl', 0.7579692602157593),
 ('judged', 0.744957685470581),
 ('reinforcement', 0.7405967116355896),
 ('supervised', 0.7404361963272095),
 ('training', 0.7347362041473389),
 ('instructor', 0.7325141429901123)]