In [1]:
# imports needed and set up logging
import gzip
import gensim 
import logging

# Root logger: show INFO and above, formatted as "<time> : <level> : <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [14]:
from gensim.models.callbacks import CallbackAny2Vec

In [19]:
class EpochLogger(CallbackAny2Vec):
    '''Callback that prints epoch boundaries and records the training loss.

    Attributes:
        epoch: zero-based counter, incremented at the end of every epoch.
        losses: the value of ``model.get_latest_training_loss()`` captured at
            the end of each epoch, in order. In the run recorded in this
            notebook these values keep growing from epoch to epoch, i.e. they
            behave like a running total rather than a per-epoch figure.
        epoch_losses: increase of the recorded loss over the previous epoch
            (the first entry equals ``losses[0]``). Under the running-total
            behaviour noted above, this is the loss attributable to each
            epoch on its own.

    NOTE(review): the deltas assume one instance is used for a single training
    run; reusing an instance across runs would mix counters -- confirm before
    doing so.
    '''
    def __init__(self):
        self.epoch = 0
        self.losses = list()
        self.epoch_losses = list()

    def on_epoch_begin(self, model):
        '''Announce the start of the current epoch.'''
        print("Epoch #{} start".format(self.epoch))

    def on_epoch_end(self, model):
        '''Print and store the loss reported by ``model`` for this epoch.'''
        # Query the model once so the printed and the stored value are
        # guaranteed to be the same number.
        loss = model.get_latest_training_loss()
        print("Epoch #{} end".format(self.epoch))
        print("Loss: {0}".format(loss))

        # First epoch has nothing to subtract from.
        previous = self.losses[-1] if self.losses else 0.0
        self.losses.append(loss)
        self.epoch_losses.append(loss - previous)
        self.epoch += 1

In [3]:
# Path of the gzip-compressed input that read_input() is called with.
data_file = "reviews_data.txt.gz"

In [4]:
def read_input(input_file, log_every=10000):
    """Lazily tokenize a gzip-compressed file, one line at a time.

    Args:
        input_file: path of the gzip file; it is opened in binary mode, so
            each line reaches the tokenizer as bytes.
        log_every: emit a progress message every ``log_every`` lines
            (line 0 included). A falsy value (``0`` or ``None``) turns the
            progress messages off. Defaults to 10000.

    Yields:
        The result of ``gensim.utils.simple_preprocess`` for each line
        (a list of words per review).
    """
    # Lazy %-style arguments: the message is only built if INFO is enabled.
    logging.info("reading file %s...this may take a while", input_file)

    with gzip.open(input_file, 'rb') as f:
        for i, line in enumerate(f):
            if log_every and i % log_every == 0:
                logging.info("read %s reviews", i)
            # do some pre-processing and return a list of words for each review text
            yield gensim.utils.simple_preprocess(line)

# Drain the generator into memory: every review becomes a series of words,
# so `documents` ends up as a list of lists.
documents = [tokens for tokens in read_input(data_file)]
logging.info("Done reading data file")

2018-10-10 18:02:22,874 : INFO : reading file reviews_data.txt.gz...this may take a while
2018-10-10 18:02:22,877 : INFO : read 0 reviews
2018-10-10 18:02:26,103 : INFO : read 10000 reviews
2018-10-10 18:02:29,134 : INFO : read 20000 reviews
2018-10-10 18:02:32,881 : INFO : read 30000 reviews
2018-10-10 18:02:36,218 : INFO : read 40000 reviews
2018-10-10 18:02:40,193 : INFO : read 50000 reviews
2018-10-10 18:02:44,740 : INFO : read 60000 reviews
2018-10-10 18:02:47,754 : INFO : read 70000 reviews
2018-10-10 18:02:50,454 : INFO : read 80000 reviews
2018-10-10 18:02:53,343 : INFO : read 90000 reviews
2018-10-10 18:02:56,366 : INFO : read 100000 reviews
2018-10-10 18:02:59,340 : INFO : read 110000 reviews
2018-10-10 18:03:02,313 : INFO : read 120000 reviews
2018-10-10 18:03:05,371 : INFO : read 130000 reviews
2018-10-10 18:03:08,617 : INFO : read 140000 reviews
2018-10-10 18:03:11,434 : INFO : read 150000 reviews
2018-10-10 18:03:15,467 : INFO : read 160000 reviews
2018-10-10 18:03:18,612

In [23]:
# compute_loss=True is what lets the callback read a training loss; the
# callback instance is kept so its recorded losses can be inspected afterwards.
# NOTE(review): `size` is the gensim 3.x keyword; newer gensim releases are
# understood to call it `vector_size` -- confirm against the installed version.
epoch_logger = EpochLogger()
model = gensim.models.Word2Vec(
    documents,
    size=50,
    window=5,
    min_count=2,
    workers=10,
    compute_loss=True,
    sg=1,
    negative=10,
    callbacks=[epoch_logger],
)

2018-10-10 18:45:23,297 : INFO : collecting all words and their counts
2018-10-10 18:45:23,299 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-10-10 18:45:23,738 : INFO : PROGRESS: at sentence #10000, processed 1655714 words, keeping 25777 word types
2018-10-10 18:45:24,163 : INFO : PROGRESS: at sentence #20000, processed 3317863 words, keeping 35016 word types
2018-10-10 18:45:24,641 : INFO : PROGRESS: at sentence #30000, processed 5264072 words, keeping 47518 word types
2018-10-10 18:45:25,083 : INFO : PROGRESS: at sentence #40000, processed 7081746 words, keeping 56675 word types
2018-10-10 18:45:25,566 : INFO : PROGRESS: at sentence #50000, processed 9089491 words, keeping 63744 word types
2018-10-10 18:45:26,030 : INFO : PROGRESS: at sentence #60000, processed 11013723 words, keeping 76781 word types
2018-10-10 18:45:26,434 : INFO : PROGRESS: at sentence #70000, processed 12637525 words, keeping 83194 word types
2018-10-10 18:45:26,804 : INFO : PROG

Epoch #0 start


2018-10-10 18:45:37,682 : INFO : EPOCH 1 - PROGRESS: at 0.22% examples, 68816 words/s, in_qsize 20, out_qsize 0
2018-10-10 18:45:38,745 : INFO : EPOCH 1 - PROGRESS: at 1.68% examples, 249172 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:45:39,749 : INFO : EPOCH 1 - PROGRESS: at 3.31% examples, 330272 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:45:40,768 : INFO : EPOCH 1 - PROGRESS: at 4.68% examples, 348549 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:45:41,770 : INFO : EPOCH 1 - PROGRESS: at 5.95% examples, 357348 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:45:42,812 : INFO : EPOCH 1 - PROGRESS: at 7.31% examples, 364590 words/s, in_qsize 18, out_qsize 1
2018-10-10 18:45:43,828 : INFO : EPOCH 1 - PROGRESS: at 8.65% examples, 371096 words/s, in_qsize 18, out_qsize 1
2018-10-10 18:45:44,831 : INFO : EPOCH 1 - PROGRESS: at 9.74% examples, 373783 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:45:45,838 : INFO : EPOCH 1 - PROGRESS: at 10.81% examples, 376821 words/s, in_qsize

2018-10-10 18:46:49,410 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-10-10 18:46:49,435 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-10-10 18:46:49,462 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-10-10 18:46:49,469 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-10-10 18:46:49,481 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-10-10 18:46:49,486 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-10-10 18:46:49,513 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-10-10 18:46:49,526 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-10-10 18:46:49,527 : INFO : EPOCH - 1 : training on 41519355 raw words (30349990 effective words) took 72.9s, 416311 effective words/s


Epoch #0 end
Loss: 37190996.0
Epoch #1 start


2018-10-10 18:46:50,550 : INFO : EPOCH 2 - PROGRESS: at 1.34% examples, 423690 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:46:51,562 : INFO : EPOCH 2 - PROGRESS: at 2.84% examples, 439539 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:46:52,586 : INFO : EPOCH 2 - PROGRESS: at 4.32% examples, 436265 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:46:53,599 : INFO : EPOCH 2 - PROGRESS: at 5.72% examples, 433934 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:46:54,615 : INFO : EPOCH 2 - PROGRESS: at 7.10% examples, 430738 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:46:55,639 : INFO : EPOCH 2 - PROGRESS: at 8.38% examples, 425414 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:46:56,642 : INFO : EPOCH 2 - PROGRESS: at 9.59% examples, 424028 words/s, in_qsize 20, out_qsize 0
2018-10-10 18:46:57,655 : INFO : EPOCH 2 - PROGRESS: at 10.61% examples, 418161 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:46:58,659 : INFO : EPOCH 2 - PROGRESS: at 11.64% examples, 416264 words/s, in_qsi

2018-10-10 18:47:57,624 : INFO : EPOCH 2 - PROGRESS: at 99.98% examples, 445687 words/s, in_qsize 1, out_qsize 1
2018-10-10 18:47:57,624 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-10-10 18:47:57,631 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-10-10 18:47:57,632 : INFO : EPOCH - 2 : training on 41519355 raw words (30350596 effective words) took 68.1s, 445738 effective words/s


Epoch #1 end
Loss: 67386632.0
Epoch #2 start


2018-10-10 18:47:58,651 : INFO : EPOCH 3 - PROGRESS: at 1.44% examples, 462971 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:47:59,659 : INFO : EPOCH 3 - PROGRESS: at 2.84% examples, 441909 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:48:00,677 : INFO : EPOCH 3 - PROGRESS: at 4.41% examples, 448424 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:48:01,691 : INFO : EPOCH 3 - PROGRESS: at 5.94% examples, 455123 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:48:02,700 : INFO : EPOCH 3 - PROGRESS: at 7.46% examples, 455359 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:48:03,702 : INFO : EPOCH 3 - PROGRESS: at 8.89% examples, 457165 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:48:04,716 : INFO : EPOCH 3 - PROGRESS: at 10.10% examples, 455615 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:48:05,723 : INFO : EPOCH 3 - PROGRESS: at 11.39% examples, 456586 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:48:06,742 : INFO : EPOCH 3 - PROGRESS: at 12.48% examples, 453621 words/s, in_qs

2018-10-10 18:49:06,353 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-10-10 18:49:06,361 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-10-10 18:49:06,367 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-10-10 18:49:06,368 : INFO : EPOCH - 3 : training on 41519355 raw words (30349703 effective words) took 68.7s, 441657 effective words/s


Epoch #2 end
Loss: 69895328.0
Epoch #3 start


2018-10-10 18:49:07,427 : INFO : EPOCH 4 - PROGRESS: at 1.14% examples, 352477 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:49:08,429 : INFO : EPOCH 4 - PROGRESS: at 2.64% examples, 404786 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:49:09,454 : INFO : EPOCH 4 - PROGRESS: at 4.14% examples, 415549 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:49:10,470 : INFO : EPOCH 4 - PROGRESS: at 5.60% examples, 421393 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:49:11,504 : INFO : EPOCH 4 - PROGRESS: at 7.07% examples, 424817 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:49:12,524 : INFO : EPOCH 4 - PROGRESS: at 8.63% examples, 433810 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:49:13,535 : INFO : EPOCH 4 - PROGRESS: at 9.88% examples, 438740 words/s, in_qsize 20, out_qsize 0
2018-10-10 18:49:14,542 : INFO : EPOCH 4 - PROGRESS: at 11.17% examples, 442691 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:49:15,553 : INFO : EPOCH 4 - PROGRESS: at 12.39% examples, 444757 words/s, in_qsi

2018-10-10 18:50:11,374 : INFO : EPOCH - 4 : training on 41519355 raw words (30349402 effective words) took 65.0s, 466945 effective words/s


Epoch #3 end
Loss: 72454664.0
Epoch #4 start


2018-10-10 18:50:12,390 : INFO : EPOCH 5 - PROGRESS: at 1.00% examples, 324337 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:50:13,392 : INFO : EPOCH 5 - PROGRESS: at 2.19% examples, 345771 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:50:14,408 : INFO : EPOCH 5 - PROGRESS: at 3.46% examples, 356044 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:50:15,428 : INFO : EPOCH 5 - PROGRESS: at 4.65% examples, 355759 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:50:16,441 : INFO : EPOCH 5 - PROGRESS: at 5.81% examples, 355338 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:50:17,455 : INFO : EPOCH 5 - PROGRESS: at 7.07% examples, 358881 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:50:18,460 : INFO : EPOCH 5 - PROGRESS: at 8.35% examples, 365757 words/s, in_qsize 20, out_qsize 0
2018-10-10 18:50:19,468 : INFO : EPOCH 5 - PROGRESS: at 9.44% examples, 365592 words/s, in_qsize 19, out_qsize 0
2018-10-10 18:50:20,470 : INFO : EPOCH 5 - PROGRESS: at 10.65% examples, 375191 words/s, in_qsiz

Epoch #4 end
Loss: 75034864.0


In [27]:
# Show the first recorded loss, then how much the recorded value grew in each
# of the four following epochs.
recorded = epoch_logger.losses
print(recorded[0])
for k in range(1, 5):
    print(recorded[k] - recorded[k - 1])

37190996.0
30195636.0
2508696.0
2559336.0
2580200.0


In [8]:
# words most similar to 'dirty' (no topn given, so the library default applies)
w1 = 'dirty'
model.wv.most_similar(positive=w1)

2018-10-10 18:13:59,731 : INFO : precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):


[('filthy', 0.9375041127204895),
 ('grubby', 0.885326087474823),
 ('dusty', 0.8807424306869507),
 ('unclean', 0.8704223036766052),
 ('stained', 0.8664363622665405),
 ('grimy', 0.8569716811180115),
 ('dingy', 0.8497437238693237),
 ('moldy', 0.848162829875946),
 ('smelly', 0.8481365442276001),
 ('threadbare', 0.847484290599823)]

In [9]:
# the 6 words most similar to 'polite'
w1 = ['polite']
model.wv.most_similar(positive=w1, topn=6)

  if np.issubdtype(vec.dtype, np.int):


[('courteous', 0.9687690734863281),
 ('curteous', 0.9472550749778748),
 ('cordial', 0.9467061758041382),
 ('curtious', 0.9369388818740845),
 ('professional', 0.9234247207641602),
 ('friendly', 0.9214097261428833)]

In [12]:
# the 10 words most similar to 'bank'
w1 = ['bank']
model.wv.most_similar(positive=w1, topn=10)

  if np.issubdtype(vec.dtype, np.int):


[('barclays', 0.6669918298721313),
 ('expedition', 0.65492182970047),
 ('charleston', 0.6254096031188965),
 ('zion', 0.6123912334442139),
 ('mission', 0.6088446378707886),
 ('coast', 0.6049933433532715),
 ('sedona', 0.6008588075637817),
 ('cutty', 0.5940651297569275),
 ('consulate', 0.5903049111366272),
 ('expo', 0.5848873257637024)]

In [13]:
# the 10 words closest to 'bank' once 'river' is passed as a negative example
w1 = ['bank']
w2 = ['river']
model.wv.most_similar(positive=w1, negative=w2, topn=10)

  if np.issubdtype(vec.dtype, np.int):


[('client', 0.607180655002594),
 ('expire', 0.5592370629310608),
 ('presenting', 0.5585870146751404),
 ('emailing', 0.5563965439796448),
 ('yearly', 0.5540558695793152),
 ('completing', 0.5474334955215454),
 ('pending', 0.5394956469535828),
 ('swiping', 0.5350539088249207),
 ('visa', 0.5329407453536987),
 ('bankrupt', 0.5328477025032043)]