### Import libraries

In [1]:
import nmslib
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import sys
import os
sys.path.append(os.path.join(sys.path[0], '..'))
import pickle
from tqdm import tqdm
import collections

from tools.corpus import PlotCorpus
from utils.doc2vec import Doc2Vec
from tools.film_card import FilmCard
from utils.metrics import mrr
from utils.inverse_index import InverseIndex
from utils.tokenizers import lemma_tokenizer

from gensim.models import Word2Vec
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

import logging
# Log records are rendered as "LEVEL: message".
logging.basicConfig(format='%(levelname)s: %(message)s')
# The threshold is set on the root logger directly (not via basicConfig),
# because basicConfig does nothing when the root logger already has handlers.
logging.getLogger().setLevel(logging.INFO)
logging.info('Prediction and metrics')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Anton\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anton\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
INFO: Prediction and metrics


### Data preparation

In [2]:
def _unpickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: unpickling can execute arbitrary code; only load trusted,
    # locally produced files here.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Two pickled doc2vec models: `d2v` and its "mean" variant `d2v_mean`.
d2v = _unpickle(r'./data/doc2vec.model')
d2v_mean = _unpickle(r'./data/doc2vec_mean.model')

# Film collections; judging by the file names, these hold Wikipedia and IMDb
# film records respectively.
films_wiki = _unpickle(r'./data/dataset/films_wiki.pickle')
films_imdb = _unpickle(r'./data/dataset/films_imdb.pickle')

logging.info('Data loaded!')

INFO: Data loaded!


### Index loading

In [3]:
# Directory that holds the previously saved nmslib indices.
path_to_save = r'./data/index'
index_file = path_to_save + '/index_nmslib'
index_mean_file = path_to_save + '/index_mean_nmslib'

# This cell only *loads* indices, so a missing directory/file is an error.
# Fail early with a message that names the missing file, instead of creating
# an empty directory and letting nmslib fail on the load.
for required_file in (index_file, index_mean_file):
    if not os.path.isfile(required_file):
        raise FileNotFoundError(
            f"nmslib index file not found: {required_file} "
            "(build and save the indices before running this notebook)"
        )

# NOTE(review): method/space presumably have to match the settings used when
# the indices were built — confirm against the index-building notebook.
index = nmslib.init(method='hnsw', space='cosinesimil')
index_mean = nmslib.init(method='hnsw', space='cosinesimil')
index.loadIndex(index_file)
index_mean.loadIndex(index_mean_file)

INFO: Loading index from ./data/index/index_nmslib
INFO: Loading optimized index.
INFO: searchMethod: 3
INFO: Total: 34886, Memory per object: 1348
INFO: Finished loading index
INFO: Set HNSW query-time parameters:
INFO: ef(Search)         =20
INFO: algoType           =2
INFO: Loading index from ./data/index/index_mean_nmslib
INFO: Loading optimized index.
INFO: searchMethod: 3
INFO: Total: 34886, Memory per object: 1348
INFO: Finished loading index
INFO: Set HNSW query-time parameters:
INFO: ef(Search)         =20
INFO: algoType           =2


### Search and metrics

In [4]:
# Embed every IMDb plot with each of the two doc2vec models; the resulting
# lists are aligned with `films_imdb` (one vector per film, same order).
# NOTE(review): the meaning of the positional `False` flag is defined by
# Doc2Vec.get_sentence_vector — confirm there.
imdb_vectors = [
    d2v.get_sentence_vector(imdb_film.get_box('plot'), False)
    for imdb_film in tqdm(films_imdb)
]
imdb_vectors_mean = [
    d2v_mean.get_sentence_vector(imdb_film.get_box('plot'), False)
    for imdb_film in tqdm(films_imdb)
]

100%|██████████████████████████████████████████████████████████████████████████████████| 86/86 [00:03<00:00, 22.30it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 86/86 [00:02<00:00, 41.17it/s]


In [5]:
# Query `index` with every IMDb plot vector and collect, per film, the titles
# of the 5 nearest Wikipedia entries together with the expected title.
# zip() would silently truncate on a length mismatch, so check it up front.
assert len(films_imdb) == len(imdb_vectors), "expected one query vector per IMDb film"

preds = []    # per film: titles of the retrieved neighbours, in the order returned
cors = []     # per film: the expected (IMDb) title
correct_ = 0  # number of films whose title appears among the retrieved titles

# zip() has no length, so pass `total` to give tqdm a proper progress bar.
for film, film_vector in tqdm(zip(films_imdb, imdb_vectors), total=len(imdb_vectors)):
    # knnQuery returns (ids, distances); only the ids are needed here.
    ids, _distances = index.knnQuery(film_vector, k=5)
    correct_name = film.get_box('title')
    # The returned ids are used directly as positions/keys into films_wiki.
    found_names = [films_wiki[id_].get_box('title') for id_ in ids]
    preds.append(found_names)
    cors.append(correct_name)
    if correct_name in found_names:
        correct_ += 1

all_ = len(cors)  # number of evaluated films

86it [00:00, 6630.82it/s]


In [6]:
# Coverage: share of films whose expected title is among the retrieved titles.
cov = correct_ / all_
# Both metric lines are emitted by a single print call, one per line.
print(
    f"TFIDFIndex Coverage = {correct_}/{all_} = {cov:.3f}",
    f"TFIDFIndex MRR = {mrr(cors, preds):.3f}",
    sep="\n",
)

TFIDFIndex Coverage = 79/86 = 0.919
TFIDFIndex MRR = 0.902


In [7]:
# Query `index_mean` with every "mean" IMDb plot vector and collect, per film,
# the titles of the 5 nearest Wikipedia entries together with the expected title.
# zip() would silently truncate on a length mismatch, so check it up front.
assert len(films_imdb) == len(imdb_vectors_mean), "expected one query vector per IMDb film"

preds = []    # per film: titles of the retrieved neighbours, in the order returned
cors = []     # per film: the expected (IMDb) title
correct_ = 0  # number of films whose title appears among the retrieved titles

# zip() has no length, so pass `total` to give tqdm a proper progress bar.
for film, film_vector in tqdm(zip(films_imdb, imdb_vectors_mean), total=len(imdb_vectors_mean)):
    # knnQuery returns (ids, distances); only the ids are needed here.
    ids, _distances = index_mean.knnQuery(film_vector, k=5)
    correct_name = film.get_box('title')
    # The returned ids are used directly as positions/keys into films_wiki.
    found_names = [films_wiki[id_].get_box('title') for id_ in ids]
    preds.append(found_names)
    cors.append(correct_name)
    if correct_name in found_names:
        correct_ += 1

all_ = len(cors)  # number of evaluated films

86it [00:00, 5386.95it/s]


In [8]:
# Coverage: share of films whose expected title is among the retrieved titles.
cov = correct_ / all_
# Both metric lines are emitted by a single print call, one per line.
print(
    f"MeanIndex Coverage = {correct_}/{all_} = {cov:.3f}",
    f"MeanIndex MRR = {mrr(cors, preds):.3f}",
    sep="\n",
)

MeanIndex Coverage = 49/86 = 0.570
MeanIndex MRR = 0.481
