### Import libraries

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import sys
import os
sys.path.append(os.path.join(sys.path[0], '..'))
import pickle
from tqdm import tqdm

from tools.corpus import SimpleCorp
from utils.tokenizers import stem_tokenizer
from utils.doc2vec import Doc2Vec
from utils.metrics import mrr
from utils.inverse_index import InverseIndex

from gensim.models import Word2Vec
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

import logging
# Log records are rendered as "LEVEL: message"; the root logger is lowered to
# INFO so that progress messages emitted through `logging` become visible.
logging.basicConfig(format='%(levelname)s: %(message)s')
logging.getLogger().setLevel(logging.INFO)
logging.info('Doc2vec learning...')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Anton\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anton\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
INFO: Doc2vec learning...


### Data preparation

In [2]:
# Root data folder; the serialized corpora live in its "corp" sub-folder.
PATH_TO_FILES = './data'
corp_dir = os.path.join(PATH_TO_FILES, "corp")

# Three corpora are loaded by name from the same folder.
# NOTE(review): only `tokenized_corp` is consumed by the cells visible here;
# the other two are presumably used elsewhere - confirm before removing.
tokenized_corp = SimpleCorp().load('codexes_tokenized_corp_articles', corp_dir)
simple_corp = SimpleCorp().load('codexes_corp_articles', corp_dir)
simple_corp_art_names = SimpleCorp().load('codexes_corp_art_names', corp_dir)

In [3]:
class iter_helper:
    """Iterable adapter that exposes only the texts of a corpus.

    `corp` is expected to yield 2-item pairs ``(doc_id, doc_text)``. Each
    call to ``__iter__`` creates a new generator that walks ``self.corp``
    again and yields the second item of every pair, so the object can be
    passed to consumers that iterate over their input more than once.
    """

    def __init__(self, corp):
        # Keep a reference only; nothing is copied or consumed here.
        self.corp = corp

    def __iter__(self):
        # The id is unpacked (so malformed items still fail) but not used.
        yield from (text for _doc_id, text in self.corp)

In [4]:
# Wrap the tokenized corpus so it can be iterated repeatedly as plain document
# texts (the doc ids are dropped by iter_helper).
tokenized_corp_iterator = iter_helper(tokenized_corp)

### Word2Vec model 

In [5]:
# Train word embeddings on the tokenized corpus: 300-dimensional vectors,
# context window of 10, and min_count=1 so no token is dropped for being rare.
# NOTE(review): `size=` is the gensim 3.x keyword (newer releases spell it
# `vector_size`) - pin the gensim version or update when upgrading.
w2v_model = Word2Vec(min_count=1, size=300, window=10, workers=10)
# Vocabulary building and training each iterate over the corpus, which is why
# a re-iterable wrapper is passed rather than a one-shot generator.
w2v_model.build_vocab(tokenized_corp_iterator)
# NOTE(review): no seed is set and 10 worker threads are used, so the learned
# vectors presumably differ between runs - confirm whether that matters.
w2v_model.train(tokenized_corp_iterator, total_examples=w2v_model.corpus_count, epochs=30)

INFO: collecting all words and their counts
INFO: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO: collected 12315 word types from a corpus of 1172543 raw words and 6322 sentences
INFO: Loading a fresh vocabulary
INFO: effective_min_count=1 retains 12315 unique words (100% of original 12315, drops 0)
INFO: effective_min_count=1 leaves 1172543 word corpus (100% of original 1172543, drops 0)
INFO: deleting the raw counts dictionary of 12315 items
INFO: sample=0.001 downsamples 62 most-common words
INFO: downsampling leaves estimated 1027528 word corpus (87.6% of prior 1172543)
INFO: estimated required memory for 12315 words and 300 dimensions: 35713500 bytes
INFO: resetting layer weights
INFO: training model with 10 workers on 12315 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=10
INFO: EPOCH 1 - PROGRESS: at 50.93% examples, 668264 words/s, in_qsize 19, out_qsize 0
INFO: worker thread finished; awaiting finish of 9 more threads
INFO: 

INFO: worker thread finished; awaiting finish of 0 more threads
INFO: EPOCH - 9 : training on 1172543 raw words (1027418 effective words) took 1.3s, 763757 effective words/s
INFO: EPOCH 10 - PROGRESS: at 57.81% examples, 682967 words/s, in_qsize 20, out_qsize 1
INFO: worker thread finished; awaiting finish of 9 more threads
INFO: worker thread finished; awaiting finish of 8 more threads
INFO: worker thread finished; awaiting finish of 7 more threads
INFO: worker thread finished; awaiting finish of 6 more threads
INFO: worker thread finished; awaiting finish of 5 more threads
INFO: worker thread finished; awaiting finish of 4 more threads
INFO: worker thread finished; awaiting finish of 3 more threads
INFO: worker thread finished; awaiting finish of 2 more threads
INFO: worker thread finished; awaiting finish of 1 more threads
INFO: worker thread finished; awaiting finish of 0 more threads
INFO: EPOCH - 10 : training on 1172543 raw words (1027165 effective words) took 1.4s, 752179 effec

INFO: worker thread finished; awaiting finish of 3 more threads
INFO: worker thread finished; awaiting finish of 2 more threads
INFO: worker thread finished; awaiting finish of 1 more threads
INFO: worker thread finished; awaiting finish of 0 more threads
INFO: EPOCH - 19 : training on 1172543 raw words (1027362 effective words) took 1.3s, 777544 effective words/s
INFO: EPOCH 20 - PROGRESS: at 75.43% examples, 794513 words/s, in_qsize 19, out_qsize 0
INFO: worker thread finished; awaiting finish of 9 more threads
INFO: worker thread finished; awaiting finish of 8 more threads
INFO: worker thread finished; awaiting finish of 7 more threads
INFO: worker thread finished; awaiting finish of 6 more threads
INFO: worker thread finished; awaiting finish of 5 more threads
INFO: worker thread finished; awaiting finish of 4 more threads
INFO: worker thread finished; awaiting finish of 3 more threads
INFO: worker thread finished; awaiting finish of 2 more threads
INFO: worker thread finished; awa

INFO: worker thread finished; awaiting finish of 6 more threads
INFO: worker thread finished; awaiting finish of 5 more threads
INFO: worker thread finished; awaiting finish of 4 more threads
INFO: worker thread finished; awaiting finish of 3 more threads
INFO: worker thread finished; awaiting finish of 2 more threads
INFO: worker thread finished; awaiting finish of 1 more threads
INFO: worker thread finished; awaiting finish of 0 more threads
INFO: EPOCH - 29 : training on 1172543 raw words (1027493 effective words) took 1.3s, 798338 effective words/s
INFO: EPOCH 30 - PROGRESS: at 69.25% examples, 760806 words/s, in_qsize 19, out_qsize 0
INFO: worker thread finished; awaiting finish of 9 more threads
INFO: worker thread finished; awaiting finish of 8 more threads
INFO: worker thread finished; awaiting finish of 7 more threads
INFO: worker thread finished; awaiting finish of 6 more threads
INFO: worker thread finished; awaiting finish of 5 more threads
INFO: worker thread finished; awa

(30825625, 35176290)

In [6]:
# Number of words in the trained model's vocabulary.
# NOTE(review): `wv.vocab` is the gensim 3.x attribute - confirm on upgrade.
len(w2v_model.wv.vocab)

12315

### TF-IDF weights model

In [7]:
# Build the gensim Dictionary from one pass over the tokenized documents.
dct = Dictionary(tokenized_corp_iterator)  # fit dictionary
# Initialise the TF-IDF model from the dictionary (no second corpus pass is
# made here). `model` is later handed to Doc2Vec as `tfidf_model`.
model = TfidfModel(dictionary=dct)  # fit model

INFO: adding document #0 to Dictionary(0 unique tokens: [])
INFO: built Dictionary(12315 unique tokens: ['1', '2', '3', '4', '5']...) from 6322 documents (total 1172543 corpus positions)


### Doc2Vec model

In [8]:
# Document-vector wrapper built from the trained word vectors and the TF-IDF
# model (presumably used to weight word vectors - see utils.doc2vec).
d2v = Doc2Vec(w2v_model, tfidf_model=model, tokenizer=stem_tokenizer)

# Sanity check: embed the first corpus document and show the vector length.
# NOTE(review): the meaning of the positional `True` flag is defined in
# utils.doc2vec and is not visible here - confirm.
first_doc = next(iter(tokenized_corp_iterator))
len(d2v.get_sentence_vector(first_doc, True))

300

In [9]:
# Second wrapper built without a TF-IDF model (the name suggests plain
# averaging of word vectors - TODO confirm against utils.doc2vec).
d2v_mean = Doc2Vec(w2v_model, tokenizer=stem_tokenizer)

# Sanity check: embed the first corpus document and show the vector length.
first_doc = next(iter(tokenized_corp_iterator))
len(d2v_mean.get_sentence_vector(first_doc, True))

300

### Save doc2vec

In [10]:
# Persist both Doc2Vec wrappers (the one built with the TF-IDF model and the
# one built without it) as pickle files under `basedir`.
basedir = r'./data'
# exist_ok=True replaces the separate exists() check: it is a no-op when the
# directory is already there and avoids the check-then-create race.
os.makedirs(basedir, exist_ok=True)

# Both target paths are derived from `basedir` so the output location is
# defined in exactly one place.
with open(os.path.join(basedir, 'doc2vec.model'), 'wb') as f:
    pickle.dump(d2v, f)

with open(os.path.join(basedir, 'doc2vec_mean.model'), 'wb') as f:
    pickle.dump(d2v_mean, f)

In [11]:
# Round-trip check: read back the pickle written to ./data/doc2vec.model.
# NOTE: pickle.load can execute arbitrary code during deserialization; only
# load model files that come from a trusted source.
with open(r'./data/doc2vec.model', 'rb') as f:
    d2v2 = pickle.load(f)

In [12]:
# The reloaded model should still produce a vector for the first corpus
# document; the displayed value is that vector's length.
len(d2v2.get_sentence_vector(next(iter(tokenized_corp_iterator)), True))

300