### Import libraries

In [1]:
import nmslib
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import sys
import os
sys.path.append(os.path.join(sys.path[0], '..'))
import pickle
from tqdm import tqdm

from tools.corpus import SimpleCorp
from utils.tokenizers import stem_tokenizer
from utils.doc2vec import Doc2Vec
from utils.metrics import mrr
from utils.inverse_index import InverseIndex

from gensim.models import Word2Vec
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

import logging
# Log records are rendered as "<LEVEL>: <message>"; the root logger is then
# opened up to INFO so the progress messages below are shown.
logging.basicConfig(format='%(levelname)s: %(message)s')
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
logging.info('Prediction and metrics')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Anton\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anton\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
INFO: Prediction and metrics


### Data preparation

In [2]:
# Single source of truth for the data directory. It is defined before first use
# so that the model files and the corpora are all resolved from the same place.
PATH_TO_FILES = './data'
CORP_DIR = os.path.join(PATH_TO_FILES, "corp")

# SECURITY NOTE: pickle.load can execute arbitrary code contained in the file.
# Only load model files that were produced by this project; never unpickle
# files obtained from an untrusted source.
with open(os.path.join(PATH_TO_FILES, 'doc2vec.model'), 'rb') as f:
    d2v = pickle.load(f)
with open(os.path.join(PATH_TO_FILES, 'doc2vec_mean.model'), 'rb') as f:
    d2v_mean = pickle.load(f)

# Three corpora stored under the same "corp" directory.
tokenized_corp = SimpleCorp().load('codexes_tokenized_corp_articles', CORP_DIR)
simple_corp = SimpleCorp().load('codexes_corp_articles', CORP_DIR)
simple_corp_art_names = SimpleCorp().load('codexes_corp_art_names', CORP_DIR)

logging.info('Data loaded!')

INFO: Data loaded!


### Index loading

In [3]:
# Directory holding the prebuilt nmslib indexes (variable name kept as in the
# original cell so that anything referring to it keeps working).
path_to_save = r'./data/index'
index_path = os.path.join(path_to_save, 'index_nmslib')
index_mean_path = os.path.join(path_to_save, 'index_mean_nmslib')

# This cell only reads indexes, so it must not create the directory: an empty
# directory would merely postpone the failure to loadIndex. Fail early with a
# message that names the missing file instead.
for required_path in (index_path, index_mean_path):
    if not os.path.isfile(required_path):
        raise FileNotFoundError(
            f"nmslib index file not found: {required_path!r}; "
            "build and save the index before running this notebook"
        )

# NOTE(review): method/space presumably have to match the settings the indexes
# were built with — confirm against the notebook that creates them.
index = nmslib.init(method='hnsw', space='cosinesimil')
index_mean = nmslib.init(method='hnsw', space='cosinesimil')
index.loadIndex(index_path)
index_mean.loadIndex(index_mean_path)

INFO: Loading index from ./data/index/index_nmslib
INFO: Loading optimized index.
INFO: searchMethod: 3
INFO: Total: 6322, Memory per object: 1348
INFO: Finished loading index
INFO: Set HNSW query-time parameters:
INFO: ef(Search)         =20
INFO: algoType           =2
INFO: Loading index from ./data/index/index_mean_nmslib
INFO: Loading optimized index.
INFO: searchMethod: 3
INFO: Total: 6322, Memory per object: 1348
INFO: Finished loading index
INFO: Set HNSW query-time parameters:
INFO: ef(Search)         =20
INFO: algoType           =2


### Searching and metrics 

In [4]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
from tools.pravoved_recognizer import Request

# Requests used for evaluation below; each one is read for its 'question',
# 'codex' and 'norm' entries.
pravoved_path = os.path.join("./data", "pravoved_one_answer.json")
pravoved = Request.load(pravoved_path)

1429


In [5]:
# Position of an entry in the tokenized corpus -> first field of that entry.
# The search cells use the ids returned by knnQuery as keys and compare the
# values with (codex, norm) pairs, so the first field is presumably that pair
# and the index ids presumably follow corpus order — TODO confirm.
num2id = dict(enumerate(entry[0] for entry in tokenized_corp))

In [6]:
# For every request, embed the question with d2v and fetch its 5 nearest
# neighbours from `index`; collect the retrieved entries and the expected one.
preds, cors = [], []
for pravoved_obj in tqdm(pravoved):
    query_vec = d2v.get_sentence_vector(pravoved_obj['question'], False)
    neighbour_ids, neighbour_dists = index.knnQuery(query_vec, k=5)
    preds.append([num2id[neighbour] for neighbour in neighbour_ids])
    cors.append((pravoved_obj['codex'], pravoved_obj['norm']))

# A request is a hit when its expected (codex, norm) pair is among the
# retrieved entries; all_ is the number of evaluated requests.
all_ = len(cors)
correct_ = sum(expected in retrieved for expected, retrieved in zip(cors, preds))

100%|█████████████████████████████████████████████████████████████████████████████| 1429/1429 [00:05<00:00, 274.18it/s]


In [7]:
# Coverage: share of requests whose expected entry was among the retrieved ones.
cov = correct_ / all_
print(f"TFIDFIndex Coverage = {correct_}/{all_} = {cov:.3f}")

# Ranking quality as reported by the project's mrr helper
# (presumably mean reciprocal rank — confirm in utils.metrics).
mrr_score = mrr(cors, preds)
print(f"TFIDFIndex MRR = {mrr_score:.3f}")

TFIDFIndex Coverage = 153/1429 = 0.107
TFIDFIndex MRR = 0.054


In [8]:
# Same evaluation as for the first index, but with the d2v_mean model and
# index_mean: 5 nearest neighbours per question.
preds, cors = [], []
for pravoved_obj in tqdm(pravoved):
    query_vec = d2v_mean.get_sentence_vector(pravoved_obj['question'], False)
    neighbour_ids, neighbour_dists = index_mean.knnQuery(query_vec, k=5)
    preds.append([num2id[neighbour] for neighbour in neighbour_ids])
    cors.append((pravoved_obj['codex'], pravoved_obj['norm']))

# A request is a hit when its expected (codex, norm) pair is among the
# retrieved entries; all_ is the number of evaluated requests.
all_ = len(cors)
correct_ = sum(expected in retrieved for expected, retrieved in zip(cors, preds))

100%|█████████████████████████████████████████████████████████████████████████████| 1429/1429 [00:05<00:00, 283.42it/s]


In [9]:
# Coverage: share of requests whose expected entry was among the retrieved ones.
cov = correct_ / all_
print(f"MeanIndex Coverage = {correct_}/{all_} = {cov:.3f}")

# Ranking quality as reported by the project's mrr helper
# (presumably mean reciprocal rank — confirm in utils.metrics).
mrr_score = mrr(cors, preds)
print(f"MeanIndex MRR = {mrr_score:.3f}")

MeanIndex Coverage = 67/1429 = 0.047
MeanIndex MRR = 0.025
