In [1]:
import os
import pickle
import re
from functools import partial
from pathlib import Path

import numpy as np
import pandas as pd

from src.database import Database, PositionalDatabase
from src.evaluation import map_at_k, mar_at_k
from src.helpers import process_query_results
from src.preprocessing import read, tokenize, preprocess
from src.query import query_database, pos_query_database

In [2]:
# Root folder of the assignment data. Set the IR_ASSIGNMENT_DIR environment
# variable to point somewhere else; the original location is the fallback.
base_path = Path(
    os.environ.get(
        "IR_ASSIGNMENT_DIR",
        "/Users/stighellemans/Desktop/Information_Retrieval/assignments/assignment 1",
    )
)
base_doc_path = base_path / "full_docs_small"

# Map document id -> whatever `read` returns for that file. The id is the
# first run of digits in the file name; *.txt files whose name contains no
# digits are skipped instead of crashing on `None.group()`.
doc_id_pattern = re.compile(r'\d+')
id_matches = (
    (doc_id_pattern.search(doc_path.name), doc_path)
    for doc_path in base_doc_path.glob("*.txt")
)
docs = {
    int(id_match.group()): read(doc_path)
    for id_match, doc_path in id_matches
    if id_match is not None
}

# Small dev set: {query number: value of the "Query" column}, plus its results
# table passed through `process_query_results` together with the queries.
small_queries = pd.read_csv(base_path / "dev_small_queries - dev_small_queries.csv", index_col="Query number").to_dict()["Query"]
small_query_results = pd.read_csv(base_path / "dev_query_results_small.csv", index_col="Query_number")
small_query_results = process_query_results(small_queries, small_query_results)

# Large dev set: same layout, but the query file is parsed as tab-separated.
large_queries = pd.read_csv(base_path / "dev_queries.tsv", delimiter="\t", index_col="Query number").to_dict()["Query"]
large_query_results = pd.read_csv(base_path / "dev_query_results.csv", index_col="Query_number")
large_query_results = process_query_results(large_queries, large_query_results)

# Test queries: parsed as tab-separated even though the file is named .csv.
test_queries = pd.read_csv(base_path / "queries.csv", delimiter="\t", index_col="Query number").to_dict()["Query"]




In [None]:
import random

n_subset = 10
subset_seed = 42  # fixed seed so the same subset is drawn on every run

# Build the list of document ids once instead of once per draw.
doc_ids = list(docs.keys())
# Dedicated generator: seeding it does not touch the global `random` state.
subset_rng = random.Random(subset_seed)

# Keys are 0..n_subset-1, not the original document ids.
# NOTE(review): draws are with replacement, so the same document can appear
# under several keys — confirm this is intended.
subset = {i: docs[subset_rng.choice(doc_ids)] for i in range(n_subset)}

In [None]:
# Build the (non-positional) database over all loaded documents, using
# `preprocess` as the tokenizer.
db = Database(
    docs=docs,
    tokenize_fn=preprocess,
)

In [3]:
# On-disk pickle of `db`. Nothing else in this notebook writes this file, so
# this cell creates it when it is missing (mirroring the dump cell that exists
# for the positional database) and loads it otherwise.
db_cache_path = Path("./results/preprocessing_database.pkl")

if db_cache_path.exists():
    # SECURITY: pickle.load can execute arbitrary code — only load files that
    # were produced by this project.
    with db_cache_path.open("rb") as file:
        db = pickle.load(file)
else:
    # No pickle yet: persist the `db` built by the `Database(...)` cell so
    # later sessions can load it. Raises NameError if that cell was skipped.
    db_cache_path.parent.mkdir(parents=True, exist_ok=True)
    with db_cache_path.open("wb") as file:
        pickle.dump(db, file)

In [None]:
# Read `db` back from its pickle file (same file as the load cell above).
# pickle.load can execute arbitrary code, so only use trusted files.
with Path("./results/preprocessing_database.pkl").open(mode="rb") as pickle_stream:
    db = pickle.load(pickle_stream)

In [None]:
# Report MAP@k and MAR@k on the large dev queries for k = 3 and k = 10,
# querying `db` through `query_database`.
k_values = [3, 10]

for k in k_values:
    # Both metrics take the same arguments; MAP is computed before MAR.
    map_k, mar_k = (
        metric(large_queries, large_query_results, db, query_database, k)
        for metric in (map_at_k, mar_at_k)
    )
    print(f"MAP@{k}: {map_k}", f"MAR@{k}: {mar_k}", sep="\n")

In [None]:
# Build the positional database over the same documents, again using
# `preprocess` as the tokenizer.
pos_db = PositionalDatabase(
    docs=docs,
    tokenize_fn=preprocess,
)

In [None]:
# Persist `pos_db` so later sessions can load it instead of rebuilding it.
pos_db_path = Path("./results/preprocessing_pos_database.pkl")

# Create ./results on a clean checkout; opening the file for writing would
# otherwise fail with FileNotFoundError.
pos_db_path.parent.mkdir(parents=True, exist_ok=True)

with pos_db_path.open("wb") as file:
    pickle.dump(pos_db, file)

In [None]:
# Restore `pos_db` from its pickle file.
# pickle.load can execute arbitrary code, so only use trusted files.
with Path("./results/preprocessing_pos_database.pkl").open(mode="rb") as pickle_stream:
    pos_db = pickle.load(pickle_stream)

In [None]:
# Report MAP@k and MAR@k on the large dev queries for k = 3 and k = 10,
# this time against the positional database.
k_values = [3, 10]

# Bind q_fraction and boost_factor up front; the resulting callable is what
# the evaluation helpers receive as their query function.
query_function = partial(pos_query_database, boost_factor=1, q_fraction=1)

for k in k_values:
    shared_args = (large_queries, large_query_results, pos_db, query_function, k)
    map_k = map_at_k(*shared_args)
    mar_k = mar_at_k(*shared_args)
    print(f"MAP@{k}: {map_k}")
    print(f"MAR@{k}: {mar_k}")