In [1]:
import os
import pickle
import re
from functools import partial
from pathlib import Path

import numpy as np
import pandas as pd

from src.preprocessing import read, tokenize, preprocess
from src.helpers import process_query_results
from src.database import Database, PositionalDatabase
from src.query import query_database, pos_query_database
from src.evaluation import map_at_k, mar_at_k

In [2]:
# Root folder of the assignment data. Set the IR_ASSIGNMENT_DIR environment variable to
# run the notebook on another machine; the default is the original hard-coded location.
base_path = Path(os.environ.get(
    "IR_ASSIGNMENT_DIR",
    "/Users/stighellemans/Desktop/Information_Retrieval/assignments/assignment 1",
))
base_small_doc_path = base_path / "full_docs_small"
base_large_doc_path = base_path / "full_docs"


def _index_docs_by_id(doc_dir):
    """Map document IDs to file paths for every ``*.txt`` file in ``doc_dir``.

    The ID is the first run of digits found in the file name.

    Args:
        doc_dir: ``pathlib.Path`` of the directory to scan (non-recursive).

    Returns:
        dict mapping ``int`` document ID to the file's ``Path``. Files whose
        name contains no digits are skipped, because no ID can be derived
        from them.
    """
    docs_by_id = {}
    for doc_path in doc_dir.glob("*.txt"):
        id_match = re.search(r'\d+', doc_path.name)
        if id_match is None:
            # A stray .txt file without a number would otherwise crash the
            # whole load with an AttributeError on `.group()`.
            continue
        docs_by_id[int(id_match.group())] = doc_path
    return docs_by_id


small_docs = _index_docs_by_id(base_small_doc_path)
large_docs = _index_docs_by_id(base_large_doc_path)

# Development queries and their expected results for the small collection.
small_queries = pd.read_csv(base_path / "dev_small_queries - dev_small_queries.csv", index_col="Query number").to_dict()["Query"]
small_query_results = pd.read_csv(base_path / "dev_query_results_small.csv", index_col="Query_number")
small_query_results = process_query_results(small_queries, small_query_results)

# Development queries and their expected results for the full collection
# (the query file is tab-separated).
large_queries = pd.read_csv(base_path / "dev_queries.tsv", delimiter="\t", index_col="Query number").to_dict()["Query"]
large_query_results = pd.read_csv(base_path / "dev_query_results.csv", index_col="Query_number")
large_query_results = process_query_results(large_queries, large_query_results)

# Test queries: tab-separated despite the .csv extension.
test_queries = pd.read_csv(base_path / "queries.csv", delimiter="\t", index_col="Query number").to_dict()["Query"]




In [13]:
# Keep the documents at positions 90000-189999 (in dict order) of the full collection,
# i.e. at most 100,000 documents.
large_subset = dict(list(large_docs.items())[90000:190000])

In [None]:
# Largest len(read(doc)) over the subset, computed lazily so the per-document
# lengths are never held in a list.
max(map(len, map(read, large_subset.values())))

In [3]:
# Build the database over the full collection, tokenising with `preprocess`.
# The recorded progress output shows each 5000-document batch taking close to two
# minutes, so this cell is expensive to re-run.
db = Database(
    tokenize_fn=preprocess,
    docs=large_docs,
    n_processes=8,
    batch_size=5000,
)

Processing Batch 0: 100%|██████████| 5000/5000 [01:49<00:00, 45.76it/s]
Processing Batch 1: 100%|██████████| 5000/5000 [01:50<00:00, 45.22it/s]
Processing Batch 2: 100%|██████████| 5000/5000 [01:47<00:00, 46.70it/s]
Processing Batch 3: 100%|██████████| 5000/5000 [01:51<00:00, 45.03it/s]
Processing Batch 4: 100%|██████████| 5000/5000 [01:48<00:00, 46.03it/s]
Processing Batch 5: 100%|██████████| 5000/5000 [01:50<00:00, 45.14it/s]
Processing Batch 6: 100%|██████████| 5000/5000 [01:50<00:00, 45.34it/s]
Processing Batch 7: 100%|██████████| 5000/5000 [01:52<00:00, 44.51it/s]
Processing Batch 8: 100%|██████████| 5000/5000 [01:56<00:00, 42.88it/s]
Processing Batch 9: 100%|██████████| 5000/5000 [01:57<00:00, 42.73it/s]
Processing Batch 10: 100%|██████████| 5000/5000 [01:52<00:00, 44.41it/s]
Processing Batch 11: 100%|██████████| 5000/5000 [01:55<00:00, 43.41it/s]
Processing Batch 12: 100%|██████████| 5000/5000 [01:59<00:00, 41.91it/s]
Processing Batch 13: 100%|██████████| 5000/5000 [01:54<00:00,

In [8]:
# Pair every indexed term with the first element of its inverted-index entry.
vocab = [(term, entry[0]) for term, entry in db.inverted_index.items()]

# Count the terms that consist of alphabetic characters only.
sum(1 for term, _ in vocab if term.isalpha())

6475080

In [18]:
# Show test query 1 next to its preprocessed form to inspect what `preprocess` keeps.
# In the recorded output only 'display' survives: the dot-joined words
# ("how.close", "to.cell.tower") produce no tokens.
test_queries[1], preprocess(test_queries[1])

('how to display how.close you are to.cell.tower', ['display'])

In [19]:
# Experiment: collapse every run of non-letter characters (dots, digits, punctuation,
# whitespace) into a single space before preprocessing, so that dot-joined words such
# as "how.close" are separated. Note that the pattern is ASCII-only: digits and
# non-ASCII letters are removed as well.
# `re` is already imported in the notebook's first cell, so it is not imported again here.
text = "how.to.display how.close you are to.cell.tower"
clean_text = re.sub(r'[^a-zA-Z]+', ' ', text)
print(clean_text)

how to display how close you are to cell tower


In [20]:
# Preprocess the cleaned text; in the recorded output the previously dot-joined
# words ('close', 'cell', 'tower') now come through as tokens.
preprocess(clean_text)

['display', 'close', 'cell', 'tower']

In [5]:
# Persist the built database so the expensive build does not have to be repeated.
db_pickle_path = Path("./results/preprocessing_large_database.pkl")
# Create ./results if needed: opening the file for writing fails with FileNotFoundError
# when the directory is missing, and the freshly built database would stay unsaved.
db_pickle_path.parent.mkdir(parents=True, exist_ok=True)
with db_pickle_path.open("wb") as file:
    pickle.dump(db, file)

In [7]:
# Display the vocabulary size as reported by the database itself.
db.vocab_size()

6475081

In [3]:
# Restore the previously pickled database instead of rebuilding it.
# Only load pickle files you created yourself: unpickling can execute arbitrary code.
with Path("./results/preprocessing_large_database.pkl").open("rb") as pickle_file:
    db = pickle.load(pickle_file)

In [None]:
# Evaluate retrieval with the plain database at two cut-offs (k = 3 and k = 10).
k_values = [3, 10]

for k in k_values:
    # Pass the loaded database object, as the positional evaluation cell does with
    # `pos_db`. This notebook never writes a file named "db.pickle" (the database is
    # saved under ./results/), so a path literal here cannot refer to the loaded index.
    # NOTE(review): assumes map_at_k / mar_at_k take the database object as their
    # third argument - confirm against src.evaluation.
    map_k = map_at_k(large_queries, large_query_results, db, query_database, k)
    mar_k = mar_at_k(large_queries, large_query_results, db, query_database, k)
    print(f"MAP@{k}: {map_k}")
    print(f"MAR@{k}: {mar_k}")

Processing queries for MAP@K=3:  48%|████▊     | 2494/5193 [03:46<06:29,  6.92it/s]

In [None]:
# Build the positional database over the full collection, tokenising with `preprocess`.
# Unlike the plain Database cell, no n_processes argument is passed here.
pos_db = PositionalDatabase(
    tokenize_fn=preprocess,
    docs=large_docs,
    batch_size=5000,
)

In [None]:
# NOTE(review): this only evaluates the attribute and displays it; nothing is called.
# If `merge_partial_indices` is a method that must run before the database is saved,
# it needs to be invoked with `()` - confirm its signature in src.database.
pos_db.merge_partial_indices

In [None]:
# Persist the positional database so it does not have to be rebuilt.
pos_db_pickle_path = Path("./results/preprocessing_large_pos_database.pkl")
# Create ./results if needed: opening the file for writing fails with FileNotFoundError
# when the directory is missing, and the freshly built database would stay unsaved.
pos_db_pickle_path.parent.mkdir(parents=True, exist_ok=True)
with pos_db_pickle_path.open("wb") as file:
    pickle.dump(pos_db, file)

In [None]:
# Restore the previously pickled positional database instead of rebuilding it.
# Only load pickle files you created yourself: unpickling can execute arbitrary code.
with Path("./results/preprocessing_large_pos_database.pkl").open("rb") as pickle_file:
    pos_db = pickle.load(pickle_file)

In [None]:
# Evaluate retrieval with the positional database at two cut-offs (k = 3 and k = 10).
k_values = [3, 10]

# Bind the positional query's q_fraction and boost_factor up front.
query_function = partial(pos_query_database, q_fraction=1, boost_factor=1)

# Arguments shared by both metrics; only the cut-off k changes per iteration.
eval_args = (large_queries, large_query_results, pos_db, query_function)

for k in k_values:
    map_k = map_at_k(*eval_args, k)
    mar_k = mar_at_k(*eval_args, k)
    print(f"MAP@{k}: {map_k}", f"MAR@{k}: {mar_k}", sep="\n")