### Import libraries

In [1]:
import nmslib
import numpy as np
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import sys
import os
sys.path.append(os.path.join(sys.path[0], '..'))
import pickle
from tqdm import tqdm

from tools.corpus import SimpleCorp
from utils.tokenizers import stem_tokenizer
from utils.doc2vec import Doc2Vec
from utils.metrics import mrr
from utils.inverse_index import InverseIndex

import logging
# Root-logger setup: records are formatted as "LEVEL: message" and the root
# level is set to INFO; the final call logs that index preparation is starting.
logging.basicConfig(format='%(levelname)s: %(message)s')
logging.getLogger().setLevel(logging.INFO)
logging.info('NMS index preparing...')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Anton\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anton\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
INFO: NMS index preparing...


### Data preparation

In [2]:
# Single root for every artefact this cell reads; defined first so that the
# model files and the corpora are all resolved relative to the same directory.
PATH_TO_FILES = './data'

# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file.
# Only load model files that were produced by this project / a trusted source.
with open(os.path.join(PATH_TO_FILES, 'doc2vec.model'), 'rb') as f:
    d2v = pickle.load(f)
with open(os.path.join(PATH_TO_FILES, 'doc2vec_mean.model'), 'rb') as f:
    d2v_mean = pickle.load(f)

# Three corpora stored under "<PATH_TO_FILES>/corp". Judging by the names these
# are the tokenized articles, the raw articles and the article names
# (NOTE(review): inferred from the file names only — confirm against SimpleCorp).
tokenized_corp = SimpleCorp().load('codexes_tokenized_corp_articles', os.path.join(PATH_TO_FILES, "corp"))
simple_corp = SimpleCorp().load('codexes_corp_articles', os.path.join(PATH_TO_FILES, "corp"))
simple_corp_art_names = SimpleCorp().load('codexes_corp_art_names', os.path.join(PATH_TO_FILES, "corp"))

In [3]:
# Embed every corpus entry twice: once with `d2v`, once with `d2v_mean`.
# Only element [1] of an entry is handed to the model (presumably the token
# list — TODO confirm); the meaning of the second positional argument `True`
# is defined by get_sentence_vector and is not visible in this notebook.
docs_vectors = [
    d2v.get_sentence_vector(entry[1], True)
    for entry in tqdm(tokenized_corp)
]
docs_vectors_mean = [
    d2v_mean.get_sentence_vector(entry[1], True)
    for entry in tqdm(tokenized_corp)
]

# Convert the per-document vector lists into NumPy arrays.
np_docs_vectors = np.array(docs_vectors)
np_docs_vectors_mean = np.array(docs_vectors_mean)

100%|████████████████████████████████████████████████████████████████████████████| 6322/6322 [00:06<00:00, 1012.55it/s]
100%|████████████████████████████████████████████████████████████████████████████| 6322/6322 [00:04<00:00, 1488.40it/s]


### Index building

In [4]:
def _build_hnsw_index(vectors):
    """Create an nmslib HNSW index in the cosine-similarity space over `vectors`.

    The data points are added in one batch, the index is built with the
    ``{'post': 2}`` index-time parameter and progress printing enabled, and
    the populated index object is returned.
    """
    hnsw = nmslib.init(method='hnsw', space='cosinesimil')
    hnsw.addDataPointBatch(vectors)
    hnsw.createIndex({'post': 2}, print_progress=True)
    return hnsw


# One index per embedding variant: `index` over the d2v vectors and
# `index_mean` over the d2v_mean vectors.
index = _build_hnsw_index(np_docs_vectors)
index_mean = _build_hnsw_index(np_docs_vectors_mean)

INFO: M                   = 16
INFO: indexThreadQty      = 4
INFO: efConstruction      = 200
INFO: maxM			          = 16
INFO: maxM0			          = 32
INFO: mult                = 0.360674
INFO: skip_optimized_index= 0
INFO: delaunay_type       = 2
INFO: Set HNSW query-time parameters:
INFO: ef(Search)         =20
INFO: algoType           =2
INFO: 
The vector space is CosineSimilarity
INFO: Vector length=300
INFO: searchMethod			  = 3
INFO: Making optimized index
INFO: Finished making optimized index
INFO: Maximum level = 3
INFO: Total memory allocated for optimized index+data: 8 Mb
INFO: M                   = 16
INFO: indexThreadQty      = 4
INFO: efConstruction      = 200
INFO: maxM			          = 16
INFO: maxM0			          = 32
INFO: mult                = 0.360674
INFO: skip_optimized_index= 0
INFO: delaunay_type       = 2
INFO: Set HNSW query-time parameters:
INFO: ef(Search)         =20
INFO: algoType           =2
INFO: 
The vector space is CosineSimilarity
INFO: Vector length=300
IN

In [5]:
# Sanity check for both indexes: querying with the stored vector of document 10
# should return id 10 among its 5 nearest neighbours.
# `index_mean` is checked last, so `ids` / `distances` end up holding its result.
for checked_index, checked_vectors in (
    (index, np_docs_vectors),
    (index_mean, np_docs_vectors_mean),
):
    ids, distances = checked_index.knnQuery(checked_vectors[10], k=5)
    if 10 in ids:
        result_ = 'Sanity check successful'
    else:
        result_ = 'Sanity check failed'
    print(result_)

Sanity check successful
Sanity check successful


In [6]:
# Display the neighbour ids and distances left over from the most recent
# knnQuery call (the index_mean query for vector 10, k=5).
ids, distances

(array([  10,   56,  704,  699, 5164]),
 array([0.        , 0.2852506 , 0.3022762 , 0.31626922, 0.32325107],
       dtype=float32))

In [7]:
# Directory that receives both serialized indexes.
path_to_save = r'./data/index'

# exist_ok=True creates the directory (and any missing parents) when needed and
# is a no-op when it already exists, which avoids the check-then-create race of
# a separate os.path.exists() test.
os.makedirs(path_to_save, exist_ok=True)

# Persist both indexes; save_data=True is passed so the indexed data points are
# written together with the index (see the nmslib saveIndex documentation).
index.saveIndex(path_to_save + '/index_nmslib', save_data=True)
index_mean.saveIndex(path_to_save + '/index_mean_nmslib', save_data=True)

INFO: writing 8522056 bytes
INFO: writing 8522056 bytes
