In [1]:
from pathlib import Path
import re
import time
from functools import partial

import pandas as pd
import lucene

from src.helpers import process_query_results
from src.database import make_database, get_vocabulary
from src.query import query_database
from src.evaluation import map_at_k, mar_at_k, retrieve_top_k_docs
from src.analyzer import StemmingAnalyzer


In [2]:
base_path = Path("/root/data/")
base_small_doc_path = base_path / "full_docs_small"
base_large_doc_path = base_path / "full_docs"

small_docs = {int(re.search(r'\d+', doc_path.name).group()): doc_path for doc_path in base_small_doc_path.glob("*.txt")}
large_docs = {int(re.search(r'\d+', doc_path.name).group()): doc_path for doc_path in base_large_doc_path.glob("*.txt")}

small_queries = pd.read_csv(base_path / "dev_small_queries - dev_small_queries.csv", index_col="Query number").to_dict()["Query"]
small_query_results = pd.read_csv(base_path / "dev_query_results_small.csv", index_col="Query_number")
small_query_results = process_query_results(small_queries, small_query_results)

large_queries = pd.read_csv(base_path / "dev_queries.tsv", delimiter="\t", index_col="Query number").to_dict()["Query"]
large_query_results = pd.read_csv(base_path / "dev_query_results.csv", index_col="Query_number")
large_query_results = process_query_results(large_queries, large_query_results)

test_queries = pd.read_csv(base_path / "queries.csv", delimiter="\t", index_col="Query number").to_dict()["Query"]




In [3]:
lucene.initVM()

<jcc.JCCEnv at 0xffff4b499cf0>

In [4]:
data_dir = "/root/data/full_docs"
index_dir = "/root/index/large_stem"
output_dir = "/root/pylucene/results"
similarity = "BM25"
analyzer = StemmingAnalyzer()

In [5]:
start_time = time.time()
make_database(doc_paths=large_docs, index_directory=index_dir, similarity=similarity, custom_analyzer=analyzer)
end_time = time.time()

vocab_size = len(get_vocabulary(index_dir=index_dir))
print(f"Done after: {end_time - start_time:.2f}s with vocab size: {vocab_size}")

Indexing documents: 100%|██████████████████████████████████████████████████████████████| 501676/501676 [07:36<00:00, 1099.40it/s]


Done after: 463.65s with vocab size: 14809778


In [6]:
results = query_database(index_directory=index_dir, query_str="Milestones", top_k=5)
print(results)

[]


In [7]:
k_values = [1, 3, 5, 10]
query_function = partial(query_database, similarity=similarity, custom_analyzer=analyzer)

for k in k_values:
    map_k = map_at_k(large_queries, large_query_results, index_dir, k, query_function=query_function)
    mar_k = mar_at_k(large_queries, large_query_results, index_dir, k, query_function=query_function)
    print(f"MAP@{k}: {map_k}")
    print(f"MAR@{k}: {mar_k}")

Processing queries for MAP@K=1: 100%|███████████████████████████████████████████████████████| 5193/5193 [00:21<00:00, 239.83it/s]
Processing queries for MAR@K=1: 100%|███████████████████████████████████████████████████████| 5193/5193 [00:15<00:00, 328.03it/s]


MAP@1: 0.8559599460812632
MAR@1: 0.058029357735236965


Processing queries for MAP@K=3: 100%|███████████████████████████████████████████████████████| 5193/5193 [00:23<00:00, 224.50it/s]
Processing queries for MAR@K=3: 100%|███████████████████████████████████████████████████████| 5193/5193 [00:22<00:00, 235.75it/s]


MAP@3: 0.7776493998331087
MAR@3: 0.15678590986487664


Processing queries for MAP@K=5: 100%|███████████████████████████████████████████████████████| 5193/5193 [00:25<00:00, 200.57it/s]
Processing queries for MAR@K=5: 100%|███████████████████████████████████████████████████████| 5193/5193 [00:26<00:00, 193.76it/s]


MAP@5: 0.7174658193722319
MAR@5: 0.23929158477212806


Processing queries for MAP@K=10: 100%|██████████████████████████████████████████████████████| 5193/5193 [00:35<00:00, 147.62it/s]
Processing queries for MAR@K=10: 100%|██████████████████████████████████████████████████████| 5193/5193 [00:37<00:00, 140.27it/s]

MAP@10: 0.5968996726362411
MAR@10: 0.39343746443743743





In [8]:
k = 10

results = retrieve_top_k_docs(test_queries, index_directory=index_dir, query_function=query_function, k=k)

results = [(q_id, doc_id) for q_id, doc_ids in results.items() for doc_id in doc_ids]
# pd.DataFrame(sorted(results, key=lambda x: x[0]), columns=["Query_number", "doc_number"]).to_csv(Path(output_dir)/"results.csv", index=False)

Retrieving top-10 documents: 100%|██████████████████████████████████████████████████████████| 5793/5793 [00:38<00:00, 150.89it/s]


In [9]:
get_vocabulary(index_dir)

['',
 'a',
 'aa',
 'aaa',
 'aaaa',
 'aaaaa',
 'aaaaaa',
 'aaaaaaa',
 'aaaaaaaa',
 'aaaaaaaaaa',
 'aaaaaaaaaaa',
 'aaaaaaaaaaaa',
 'aaaaaaaaaaaaaaaaaa',
 'aaaaaaaaaaaaaaaaaaa',
 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa',
 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaeu',
 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaahhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh',
 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaah',
 'aaaaaaaaaaaaaaaaagh',
 'aaaaaaaaaaaaaaaahelp',
 'aaaaaaaaaaaaaaaandglu',
 'aaaaaaaaaaaaaahhhhhhh',
 'aaaaaaaaaaaaagh',
 'aaaaaaaaaaaaiii',
 'aaaaaaaaaaaar',
 'aaaaaaaaaaaatopsimultan',
 'aaaaaaaaaaahhh',
 'aaaaaaaaaaawwwwwwwwwwwwwwweeeeeeeeeeesoooooooooooooooooooooooommmmmmmmmmmmmmmmeeeeeeeeeereplyvinitaseptemb',
 'aaaaaaaaaahhhhhh',
 'aaaaaaaaaf',
 'aaaaaaaaah',
 'aaaaaaaaahhhhh',
 'aaaaaaaaal',
 'aaaaaaaaau',
 'aaaaaaaaavg',
 'aaaaaaaab',
 'aaaaaaaabk',
 'aaaaaaaac',
 'aaaaaaaadn',
 'aaaaaaaagh',
 'aaaaaaaah',
 'aaaaaaaahhhhh',
 'aaaaaaaakgi'