### Import libraries

In [1]:
import nmslib
import numpy as np
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import sys
import os
sys.path.append(os.path.join(sys.path[0], '..'))
import pickle
from tqdm import tqdm
import collections

from tools.corpus import PlotCorpus
from utils.doc2vec import Doc2Vec
from tools.film_card import FilmCard
from utils.metrics import mrr
from utils.inverse_index import InverseIndex
from utils.tokenizers import lemma_tokenizer

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Anton\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anton\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Data preparation

In [2]:
# Unpickle the two embedding models and the film collection.
# NOTE(review): unpickling presumably needs the project classes imported in the
# first cell (e.g. Doc2Vec, FilmCard) to be importable — confirm.
with open(r'./data/doc2vec.model', 'rb') as model_file:
    d2v = pickle.load(model_file)
with open(r'./data/doc2vec_mean.model', 'rb') as mean_model_file:
    d2v_mean = pickle.load(mean_model_file)
with open(r'./data/dataset/films_wiki.pickle', 'rb') as films_file:
    films_wiki = pickle.load(films_file)

In [3]:
# Sanity check: the vector produced for the first film's plot should have
# exactly 300 components.
first_plot_vector = d2v.get_sentence_vector(films_wiki[0].get_box('plot'), False)
len(first_plot_vector) == 300

True

In [4]:
# Embed every film's 'plot' box with the first model and stack the result
# into a NumPy array for indexing.
films_vectors = [
    d2v.get_sentence_vector(card.get_box('plot'), False)
    for card in tqdm(films_wiki)
]
# Same procedure with the second ("mean") model.
films_vectors_mean = [
    d2v_mean.get_sentence_vector(card.get_box('plot'), False)
    for card in tqdm(films_wiki)
]
np_films_vectors = np.array(films_vectors)
np_films_vectors_mean = np.array(films_vectors_mean)

100%|███████████████████████████████████████████████████████████████████████████| 34886/34886 [05:28<00:00, 106.34it/s]
100%|███████████████████████████████████████████████████████████████████████████| 34886/34886 [04:57<00:00, 117.19it/s]


### Index building

In [5]:
def build_hnsw_index(vectors):
    """Create and return an nmslib HNSW index over `vectors`.

    The index uses the 'cosinesimil' space and is built with the
    {'post': 2} index parameter, printing progress while it is created.
    """
    hnsw = nmslib.init(method='hnsw', space='cosinesimil')
    hnsw.addDataPointBatch(vectors)
    hnsw.createIndex({'post': 2}, print_progress=True)
    return hnsw


# One index per embedding variant: vectors from d2v and vectors from d2v_mean.
index = build_hnsw_index(np_films_vectors)
index_mean = build_hnsw_index(np_films_vectors_mean)

In [6]:
# Sanity check: querying an index with a film's own vector should return that
# film among its 5 nearest neighbours. Run once per index; `ids` and
# `distances` end up holding the result of the index_mean query.
probe_id = 10
for ann_index, vectors in ((index, films_vectors), (index_mean, films_vectors_mean)):
    ids, distances = ann_index.knnQuery(vectors[probe_id], k=5)
    if probe_id in ids:
        print('Sanity check successful')
    else:
        print('Sanity check failed')

Sanity check successful
Sanity check successful


In [7]:
# Display the neighbour ids and distances from the most recent knnQuery call
# in the previous cell (the query against index_mean).
ids, distances

(array([  10, 5409,  148, 5322, 4700]),
 array([0.        , 0.20693189, 0.21419895, 0.21472907, 0.21527648],
       dtype=float32))

In [8]:
# Persist both indexes (together with their data points) under ./data/index.
path_to_save = r'./data/index'
# exist_ok=True creates the directory when it is missing and does nothing when
# it already exists, replacing the check-then-create os.path.exists() pattern.
os.makedirs(path_to_save, exist_ok=True)
index.saveIndex(path_to_save + '/index_nmslib', save_data=True)
index_mean.saveIndex(path_to_save + '/index_mean_nmslib', save_data=True)