In [3]:
import os
import pickle
import random
import re
from functools import partial
from pathlib import Path

import numpy as np
import pandas as pd

from src.database import Database, PositionalDatabase
from src.evaluation import map_at_k, mar_at_k
from src.helpers import process_query_results
from src.preprocessing import read, tokenize, preprocess
from src.query import query_database, pos_query_database

In [4]:
# Root directory of the assignment data. Set the IR_ASSIGNMENT_DIR environment
# variable to run the notebook on another machine; the fallback is the
# original hard-coded location, so existing setups keep working unchanged.
base_path = Path(
    os.environ.get(
        "IR_ASSIGNMENT_DIR",
        "/Users/stighellemans/Desktop/Information_Retrieval/assignments/assignment 1",
    )
)
base_small_doc_path = base_path / "full_docs_small"
base_large_doc_path = base_path / "full_docs"


def _index_docs_by_id(doc_dir):
    """Return ``{document ID: path}`` for the ``*.txt`` files directly in ``doc_dir``.

    The ID is the first run of digits in the file name, converted to ``int``.
    Files whose name contains no digits are skipped; previously such a file
    made ``re.search(...)`` return ``None`` and the cell crashed with
    ``AttributeError``. If two files yield the same ID, the one globbed last wins.
    """
    docs_by_id = {}
    for doc_path in doc_dir.glob("*.txt"):
        id_match = re.search(r'\d+', doc_path.name)
        if id_match is None:
            continue
        docs_by_id[int(id_match.group())] = doc_path
    return docs_by_id


small_docs = _index_docs_by_id(base_small_doc_path)
large_docs = _index_docs_by_id(base_large_doc_path)

# Dev queries as {query number: query text}, plus the matching result tables,
# which are passed through the project helper `process_query_results`.
small_queries = pd.read_csv(base_path / "dev_small_queries - dev_small_queries.csv", index_col="Query number").to_dict()["Query"]
small_query_results = pd.read_csv(base_path / "dev_query_results_small.csv", index_col="Query_number")
small_query_results = process_query_results(small_queries, small_query_results)

large_queries = pd.read_csv(base_path / "dev_queries.tsv", delimiter="\t", index_col="Query number").to_dict()["Query"]
large_query_results = pd.read_csv(base_path / "dev_query_results.csv", index_col="Query_number")
large_query_results = process_query_results(large_queries, large_query_results)

# NOTE(review): queries.csv is parsed as tab-separated despite its .csv
# extension — confirm this matches the actual file format.
test_queries = pd.read_csv(base_path / "queries.csv", delimiter="\t", index_col="Query number").to_dict()["Query"]




In [1]:
# Draw a small random sample of the small corpus for quick experiments.
n_subset = 10

# A dedicated, seeded generator over the sorted IDs makes the sample identical
# on every run, independent of filesystem glob order and of other uses of the
# global `random` state.
subset_rng = random.Random(42)

# `sample` draws without replacement, so no document appears twice; the size is
# capped so a corpus smaller than n_subset does not raise ValueError.
sampled_ids = subset_rng.sample(sorted(small_docs), min(n_subset, len(small_docs)))

# Key by the real document ID (as `large_subset` does) rather than by a loop
# counter, so the subset keeps the same {doc ID: path} shape as `small_docs`.
subset = {doc_id: small_docs[doc_id] for doc_id in sampled_ids}

KeyboardInterrupt: 

In [14]:
# First 10000 entries of `large_docs` (in its iteration order), kept as a
# {doc ID: path} mapping so indexing can be tried on a smaller corpus.
large_subset = dict(list(large_docs.items())[:10000])

In [None]:


# Build the database over the 10000-document subset with `preprocess` as the
# tokenizer. n_processes is presumably the worker-process count — confirm in
# src.database.
db = Database(
    tokenize_fn=preprocess,
    docs=large_subset,
    n_processes=8,
)

Processing Documents:   5%|▌         | 541/10000 [00:14<04:01, 39.12it/s] 

442565

In [10]:
from tqdm import tqdm

# Feed every document of the small corpus into the existing `db`, one
# (doc ID, path) pair at a time, with a progress bar. The method is invoked
# through the `Database` class explicitly, exactly as before.
for doc_id, doc_path in tqdm(small_docs.items()):
    Database.process_single_document(db, (doc_id, doc_path))

100%|██████████| 1557/1557 [01:46<00:00, 14.64it/s]


In [12]:
# Restore a previously pickled database into `db`.
# NOTE: unpickling can execute arbitrary code — only load files you created.
db_pickle_path = Path("./results/preprocessing_database.pkl")
with db_pickle_path.open("rb") as pickle_file:
    db = pickle.load(pickle_file)

In [3]:
# Reload `db` from the pickle on disk.
# NOTE(review): this repeats the load performed by the previous cell; one of
# the two is probably a leftover from an earlier session.
# NOTE: unpickling can execute arbitrary code — only load files you created.
with open("./results/preprocessing_database.pkl", mode="rb") as db_file:
    db = pickle.load(db_file)

In [None]:
# Smoke test: run one free-text query against `db` and display the result.
example_query = "hello how are you my friend"
query_database(db, example_query)

In [4]:
# Evaluate `query_database` on the small dev queries: print MAP@k and MAR@k
# for each cut-off in k_values.
k_values = [3, 10]

# Arguments shared by both metrics; only the cut-off k varies per iteration.
small_eval_args = (small_queries, small_query_results, db, query_database)

for k in k_values:
    map_score = map_at_k(*small_eval_args, k)
    mar_score = mar_at_k(*small_eval_args, k)
    print(f"MAP@{k}: {map_score}", f"MAR@{k}: {mar_score}", sep="\n")

Processing queries for MAP@K=3: 100%|██████████| 248/248 [00:05<00:00, 42.76it/s]
Processing queries for MAP@K=3: 100%|██████████| 248/248 [00:00<00:00, 627.75it/s]


MAP@3: 0.27284946236559143
MAR@3: 0.8064516129032258


Processing queries for MAP@K=10: 100%|██████████| 248/248 [00:04<00:00, 58.68it/s] 
Processing queries for MAP@K=10: 100%|██████████| 248/248 [00:00<00:00, 740.48it/s]

MAP@10: 0.0907258064516129
MAR@10: 0.8891129032258065





In [None]:
# Build the positional database over the full corpus, using `tokenize` as the
# tokenizer (the non-positional `db` cell in this notebook uses `preprocess`).
pos_db = PositionalDatabase(
    tokenize_fn=tokenize,
    docs=large_docs,
)

Processing Documents:  16%|█▌        | 78431/501676 [10:17<04:15, 1658.16it/s] 

In [None]:
# Persist the positional database so later sessions can load it instead of
# re-processing the whole corpus.
pos_db_pickle_path = Path("./results/preprocessing_pos_database.pkl")

# Create ./results first: opening the file for writing raises
# FileNotFoundError when the directory is missing, which would throw away the
# result of the long-running build just before it is saved.
pos_db_pickle_path.parent.mkdir(parents=True, exist_ok=True)

with pos_db_pickle_path.open("wb") as file:
    pickle.dump(pos_db, file)

In [None]:
# Restore the positional database written by the dump cell.
# NOTE: unpickling can execute arbitrary code — only load files you created.
pos_db_pickle_path = Path("./results/preprocessing_pos_database.pkl")
with pos_db_pickle_path.open("rb") as pickle_file:
    pos_db = pickle.load(pickle_file)

In [None]:
# Evaluate positional querying on the large dev queries: print MAP@k and MAR@k
# for each cut-off in k_values.
k_values = [3, 10]

# Bind the positional-query settings once so the metric helpers receive a
# callable with these two keyword arguments already fixed.
query_function = partial(pos_query_database, q_fraction=1, boost_factor=1)

for k in k_values:
    # The generator is consumed in order, so MAP is computed before MAR.
    map_score, mar_score = (
        metric(large_queries, large_query_results, pos_db, query_function, k)
        for metric in (map_at_k, mar_at_k)
    )
    print(f"MAP@{k}: {map_score}")
    print(f"MAR@{k}: {mar_score}")