### Import libraries

In [1]:
import nmslib
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import sys
import os
sys.path.append(os.path.join(sys.path[0], '..'))
import pickle
from tqdm import tqdm

from tools.corpus import PlotCorpus
from utils.doc2vec import Doc2Vec
from tools.film_card import FilmCard
from utils.tokenizers import lemma_tokenizer
from utils.metrics import mrr
from utils.inverse_index import InverseIndex

import logging
# Configure notebook logging: records are rendered as "LEVEL: message",
# and the root logger is set to INFO so logging.info() calls are emitted.
logging.basicConfig(format='%(levelname)s: %(message)s')
# Set the level explicitly on the root logger; basicConfig() does nothing
# when the root logger already has handlers installed.
logging.getLogger().setLevel(logging.INFO)
logging.info('Baseline pipeline...')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Anton\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anton\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
INFO: Baseline pipeline...


### Data and corpus preparation

In [2]:
# Load the two pickled film collections used by the rest of the notebook:
#   films_wiki - the films the index is built on,
#   films_imdb - the films used as queries when computing the metrics.
# NOTE: pickle.load can execute arbitrary code while unpickling, so these
# files must come from a trusted source.
wiki_pickle_path = r'./data/dataset/films_wiki.pickle'
imdb_pickle_path = r'./data/dataset/films_imdb.pickle'

with open(wiki_pickle_path, 'rb') as wiki_file:
    films_wiki = pickle.load(wiki_file)

with open(imdb_pickle_path, 'rb') as imdb_file:
    films_imdb = pickle.load(imdb_file)

logging.info('Data loaded!')

INFO: Data loaded!


In [3]:
# Build the corpus as a list of (title, plot) pairs, one per film in
# films_wiki, in the same order as films_wiki.
corpus = [
    (wiki_film.get_box('title'), wiki_film.get_box('plot'))
    for wiki_film in films_wiki
]

### Building the inverse index

In [4]:
# Build the inverse index over the (title, plot) corpus. The corpus is passed
# as raw text (tokenized=False) together with lemma_tokenizer.
index = InverseIndex(lemma_tokenizer)
index.build_on(corpus, tokenized=False)

# Sanity check: query the index with the plot of a film that is in the corpus
# and display the top-5 hits next to that film's own title.
probe_film = films_wiki[10]
index.search(probe_film.get_box('plot'), topN=5, threshold=0.1, metric='f_measure'), probe_film.get_box('title')

100%|███████████████████████████████████████████████████████████████████████████| 34886/34886 [02:42<00:00, 214.74it/s]


([['Dream of a Rarebit Fiend', 1.0],
  ['No Escape', 0.46376811594202905],
  ['Catacombs', 0.4242424242424243],
  ['The Hole', 0.4197530864197531],
  ['Split', 0.411764705882353]],
 'Dream of a Rarebit Fiend')

### Computing metrics

In [5]:
# Evaluate the index on the IMDb films: each film's plot is used as a query
# and the returned titles are compared with the film's own title.
#
# Produces, for the reporting cell below:
#   preds    - per query, the list of titles returned by the search
#   cors     - per query, the expected (correct) title
#   correct_ - number of queries whose expected title is among the results
#   all_     - total number of queries evaluated
preds = []
cors = []
correct_ = 0
for film in tqdm(films_imdb):
    res = index.search(film.get_box('plot'), topN=5, threshold=0.1, metric='f_measure')
    # Each hit is a [title, score] pair; only the title is needed here.
    ids = [hit[0] for hit in res]
    correct_name = film.get_box('title')
    preds.append(ids)
    cors.append(correct_name)
    if correct_name in ids:
        correct_ += 1

# One entry is appended to cors per query, so its length is the query count.
all_ = len(cors)

100%|██████████████████████████████████████████████████████████████████████████████████| 86/86 [00:10<00:00,  8.09it/s]


In [6]:
# Report the evaluation metrics:
#   Coverage - share of queries whose correct title appears in the results
#   MRR      - value returned by mrr(cors, preds)
# Guard the division so an empty evaluation set reports 0.000 coverage
# instead of raising ZeroDivisionError.
cov = correct_ / all_ if all_ else 0.0
print(f"Coverage = {correct_}/{all_} = {cov:.3f}")
print(f"MRR = {mrr(cors, preds):.3f}")

Coverage = 43/86 = 0.500
MRR = 0.375
