### Import libraries

In [1]:
import nmslib
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import collections

import sys
import os
# Add the parent of sys.path[0] to the import path before the project-local
# imports below (tools.*, utils.*) are executed.
# NOTE(review): presumably sys.path[0] is the notebook's directory here —
# confirm; its value depends on how the kernel was started.
sys.path.append(os.path.join(sys.path[0], '..'))
import pickle
from tqdm import tqdm

from tools.corpus import SimpleCorp
from utils.tokenizers import stem_tokenizer
from utils.doc2vec import Doc2Vec
from utils.metrics import mrr
from utils.inverse_index import InverseIndex

import logging
# Format records as "LEVEL: message", then force the root logger to INFO
# (setLevel is applied explicitly so it takes effect regardless of basicConfig).
logging.basicConfig(format='%(levelname)s: %(message)s')
logging.getLogger().setLevel(logging.INFO)
logging.info('Baseline pipeline...')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Anton\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anton\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
INFO: Baseline pipeline...


### Data and corpus preparation

In [2]:
# Root folder for all data files used by this notebook.
PATH_TO_FILES = './data'

# All three corpora are stored in the same "corp" sub-folder.
corp_dir = os.path.join(PATH_TO_FILES, "corp")

# Each corpus is loaded into a fresh SimpleCorp by its stored name.
tokenized_corp = SimpleCorp().load('codexes_tokenized_corp_articles', corp_dir)
simple_corp = SimpleCorp().load('codexes_corp_articles', corp_dir)
simple_corp_art_names = SimpleCorp().load('codexes_corp_art_names', corp_dir)

### Building inverse index

In [3]:
# Create the inverse index; stem_tokenizer is handed to the constructor
# (presumably used to tokenize incoming queries — confirm in InverseIndex).
index = InverseIndex(stem_tokenizer)
# Build the index from tokenized_corp; tokenized=True is passed since this is
# the 'codexes_tokenized_corp_articles' corpus loaded in the previous cell.
index.build_on(tokenized_corp, tokenized=True)

100%|███████████████████████████████████████████████████████████████████████████| 6322/6322 [00:00<00:00, 22086.85it/s]


In [4]:
# Smoke test of the index with a single query. The query text is Russian for
# "Symmetric adjustments are carried out in the manner established by this article".
sample_query = 'Симметричные корректировки осуществляются в порядке, установленном настоящей статьей'
index.search(sample_query, topN=5, threshold=0.1, metric='f_measure')

[[('20', '93'), 0.47058823529411764],
 [('20', '251'), 0.4],
 [('9', '52'), 0.3846153846153846],
 [('7', '49'), 0.3529411764705882],
 [('20', '322'), 0.3529411764705882]]

### Computing metrics

In [5]:
# NOTE(review): this import would conventionally live in the imports cell at
# the top of the notebook; it is kept here so this cell stays self-contained.
from tools.pravoved_recognizer import Request

# Load the labelled question set used for evaluation. The path is built from
# PATH_TO_FILES (defined in the data-preparation cell) rather than repeating
# the "./data" literal, so the data folder is configured in exactly one place.
pravoved = Request.load(os.path.join(PATH_TO_FILES, "pravoved_one_answer.json"))

1429


In [6]:
# Evaluate retrieval on the labelled questions: for every question, query the
# inverse index for up to 5 candidates and check whether the labelled
# (codex, norm) pair is among the returned ids.
preds = []     # per question: candidate ids in the order index.search returned them
cors = []      # per question: the labelled (codex, norm) pair
correct_ = 0   # number of questions whose labelled pair was among the candidates
all_ = 0       # number of questions evaluated
for pravoved_obj in tqdm(pravoved):
    res = index.search(pravoved_obj['question'], topN=5, threshold=0.1, metric='f_measure')
    # Each result item is indexed as [id, score]; only the ids feed the metrics,
    # so the scores are not collected.
    ids = [a[0] for a in res]
    correct_name = (pravoved_obj['codex'], pravoved_obj['norm'])
    preds.append(ids)
    cors.append(correct_name)
    # NOTE(review): this membership test is type-sensitive — the sample search
    # output shows ids as tuples of strings; confirm 'codex'/'norm' are strings too.
    if correct_name in ids:
        correct_ += 1
    all_ += 1

100%|█████████████████████████████████████████████████████████████████████████████| 1429/1429 [00:11<00:00, 121.19it/s]


In [7]:
# Coverage: share of questions whose labelled pair was among the returned candidates.
cov = correct_ / all_
print(f"Coverage = {correct_}/{all_} = {cov:.3f}")

# MRR is computed by the project helper from the labelled pairs and the candidate lists.
mrr_score = mrr(cors, preds)
print(f"MRR = {mrr_score:.3f}")

Coverage = 51/1429 = 0.036
MRR = 0.021
