In [1]:
from pathlib import Path
import pandas as pd
import re
import numpy as np
from functools import partial
import pickle

from src.preprocessing import read, tokenize, preprocess
from src.helpers import process_query_results
from src.database import Database, PositionalDatabase
from src.query import query_database, query_pos_database
from src.evaluation import map_at_k, mar_at_k, retrieve_top_k_docs

In [2]:
base_path = Path("/Users/stighellemans/Desktop/Information_Retrieval/assignments/assignment 1")
base_small_doc_path = base_path / "full_docs_small"
base_large_doc_path = base_path / "full_docs"

small_docs = {int(re.search(r'\d+', doc_path.name).group()): doc_path for doc_path in base_small_doc_path.glob("*.txt")}
large_docs = {int(re.search(r'\d+', doc_path.name).group()): doc_path for doc_path in base_large_doc_path.glob("*.txt")}

small_queries = pd.read_csv(base_path / "dev_small_queries - dev_small_queries.csv", index_col="Query number").to_dict()["Query"]
small_query_results = pd.read_csv(base_path / "dev_query_results_small.csv", index_col="Query_number")
small_query_results = process_query_results(small_queries, small_query_results)

large_queries = pd.read_csv(base_path / "dev_queries.tsv", delimiter="\t", index_col="Query number").to_dict()["Query"]
large_query_results = pd.read_csv(base_path / "dev_query_results.csv", index_col="Query_number")
large_query_results = process_query_results(large_queries, large_query_results)

test_queries = pd.read_csv(base_path / "queries.csv", delimiter="\t", index_col="Query number").to_dict()["Query"]




In [3]:
db = Database(tokenize_fn=preprocess, index_path="./results/large_database.idx")
db.build_index(large_docs, batch_size=5000, n_processes=8)

Processing Batch 0: 100%|██████████| 5000/5000 [02:00<00:00, 41.63it/s]
Processing Batch 1: 100%|██████████| 5000/5000 [01:58<00:00, 42.32it/s]
Processing Batch 2: 100%|██████████| 5000/5000 [01:55<00:00, 43.32it/s]
Processing Batch 3: 100%|██████████| 5000/5000 [02:00<00:00, 41.39it/s]
Processing Batch 4: 100%|██████████| 5000/5000 [02:02<00:00, 40.82it/s]
Processing Batch 5: 100%|██████████| 5000/5000 [01:56<00:00, 42.74it/s]
Processing Batch 6: 100%|██████████| 5000/5000 [01:56<00:00, 43.06it/s]
Processing Batch 7: 100%|██████████| 5000/5000 [01:58<00:00, 42.22it/s]
Processing Batch 8: 100%|██████████| 5000/5000 [02:01<00:00, 41.20it/s]
Processing Batch 9: 100%|██████████| 5000/5000 [02:01<00:00, 41.16it/s]
Processing Batch 10: 100%|██████████| 5000/5000 [01:58<00:00, 42.34it/s]
Processing Batch 11: 100%|██████████| 5000/5000 [02:01<00:00, 41.22it/s]
Processing Batch 12: 100%|██████████| 5000/5000 [02:03<00:00, 40.58it/s]
Processing Batch 13: 100%|██████████| 5000/5000 [02:01<00:00,

In [3]:
db = Database(tokenize_fn=preprocess, index_path="./results/small_database.idx")

Precomputing doc norms: 100%|██████████| 80249/80249 [00:01<00:00, 61367.70it/s]


In [6]:
with open("./results/large_database.pkl", "wb") as f:
    pickle.dump(db, f)

In [14]:
with open("./results/large_database.pkl", "rb") as f:
    db = pickle.load(f)

In [4]:
pos_db = PositionalDatabase(tokenize_fn=preprocess, index_path="./results/large_pos_database.idx")
pos_db.build_index(large_docs, batch_size=5000, n_processes=8)

Processing Batch 0: 100%|██████████| 5000/5000 [02:02<00:00, 40.85it/s]
Processing Batch 1: 100%|██████████| 5000/5000 [02:04<00:00, 40.30it/s]
Processing Batch 2: 100%|██████████| 5000/5000 [02:01<00:00, 41.20it/s]
Processing Batch 3: 100%|██████████| 5000/5000 [02:03<00:00, 40.44it/s]
Processing Batch 4: 100%|██████████| 5000/5000 [01:59<00:00, 41.80it/s]
Processing Batch 5: 100%|██████████| 5000/5000 [02:01<00:00, 41.08it/s]
Processing Batch 6: 100%|██████████| 5000/5000 [02:01<00:00, 41.17it/s]
Processing Batch 7: 100%|██████████| 5000/5000 [02:01<00:00, 41.01it/s]
Processing Batch 8: 100%|██████████| 5000/5000 [02:05<00:00, 39.99it/s]
Processing Batch 9: 100%|██████████| 5000/5000 [02:05<00:00, 39.75it/s]
Processing Batch 10: 100%|██████████| 5000/5000 [02:01<00:00, 41.24it/s]
Processing Batch 11: 100%|██████████| 5000/5000 [02:04<00:00, 40.13it/s]
Processing Batch 12: 100%|██████████| 5000/5000 [02:07<00:00, 39.32it/s]
Processing Batch 13: 100%|██████████| 5000/5000 [02:04<00:00,

In [4]:
pos_db = PositionalDatabase(tokenize_fn=preprocess, index_path="./results/large_pos_database.idx")

In [7]:
with open("./results/large_pos_database.pkl", "wb") as f:
    pickle.dump(pos_db, f)

In [3]:
with open("./results/large_pos_database.pkl", "rb") as f:
    pos_db = pickle.load(f)

In [8]:
# Example usage for k = 3 and k = 10
k_values = [3, 10]

for k in k_values:
    map_k = map_at_k(large_queries, large_query_results,db, query_database, k)
    mar_k = mar_at_k(large_queries, large_query_results, db, query_database, k)
    print(f"MAP@{k}: {map_k}")
    print(f"MAR@{k}: {mar_k}")

Processing queries for MAP@K=3: 100%|██████████| 5193/5193 [03:49<00:00, 22.67it/s]
Processing queries for MAR@K=3: 100%|██████████| 5193/5193 [03:46<00:00, 22.92it/s]


MAP@3: 0.2560498106425316
MAR@3: 0.05082440237713469


Processing queries for MAP@K=10: 100%|██████████| 5193/5193 [03:47<00:00, 22.78it/s]
Processing queries for MAR@K=10: 100%|██████████| 5193/5193 [03:44<00:00, 23.08it/s]


MAP@10: 0.21399961486616598
MAR@10: 0.1397437579318776


In [6]:

query = large_queries[174249]

query_database(db, query)

[(110428, 0.13039241974236912),
 (26065, 0.12308827262898157),
 (334958, 0.12120736839958948),
 (317399, 0.12013743992211287),
 (469876, 0.1128595989327455),
 (256924, 0.11186291844722618),
 (426083, 0.10500754032174614),
 (156745, 0.10455386150514911),
 (91736, 0.10308367942879),
 (335758, 0.09959282165980102),
 (233629, 0.09943126673909417),
 (277998, 0.097653214250352),
 (423852, 0.09588289071889174),
 (54394, 0.09558562451161848),
 (288334, 0.09535184393748417),
 (322768, 0.09514035859680282),
 (216737, 0.09379714328220218),
 (30463, 0.09359226730050545),
 (245810, 0.09306111357304483),
 (471142, 0.09299357161445121),
 (371627, 0.09271802423079788),
 (14363, 0.09054970271642503),
 (60080, 0.08976784635801048),
 (276362, 0.08878861532414008),
 (204093, 0.08666717247405101),
 (263960, 0.08587555631449025),
 (341055, 0.08539473318854547),
 (359669, 0.08326093277852184),
 (26639, 0.08322881161401835),
 (446170, 0.08300992762671347),
 (15040, 0.08287776543462654),
 (315477, 0.0825055711

In [4]:
pos_db.inverted_index

1. a: df=30, positional_postings={ 327854: [24], 149462: [105], 260673: [88], 341443: [2211], 474270: [1869, 2401], ... }
2. aa: df=4628, positional_postings={ 204055: [238, 492], 284655: [18, 169, 171], 232281: [107], 453424: [311], 124329: [15, 97, 99], ... }
3. aaa: df=1525, positional_postings={ 159498: [341], 480668: [99, 100], 231950: [404], 352207: [345], 280427: [16], ... }
4. aaaa: df=108, positional_postings={ 312637: [129], 139797: [90], 284912: [11742], 440511: [40], 122598: [1355, 1361], ... }
5. aaaaa: df=31, positional_postings={ 92749: [946, 981, 1003, 1033, 1050], 449906: [378, 647, 685, 788], 436650: [830, 886, 906, 948, 971], 500650: [47017], 475792: [1151], ... }
6. aaaaaa: df=18, positional_postings={ 59758: [550], 92749: [824], 214244: [125], 271208: [116], 288036: [1058], ... }
7. aaaaaaa: df=4, positional_postings={ 23923: [392, 403, 481, 488, 502, 512, 514, 555, 587, 594, 606, 616, 618], 294004: [498, 1410], 88309: [17, 52, 60], 225732: [532] }
8. aaaaaaaa: df=

In [11]:
# Example usage for k = 3 and k = 10
k_values = [3, 10]

query_function = partial(query_pos_database, q_fraction=1, boost_factor=1)

for k in k_values:
    map_k = map_at_k(large_queries, large_query_results, pos_db, query_function, k)
    mar_k = mar_at_k(large_queries, large_query_results, pos_db, query_function, k)
    print(f"MAP@{k}: {map_k}")
    print(f"MAR@{k}: {mar_k}")

Processing queries for MAP@K=3: 100%|██████████| 5193/5193 [48:17<00:00,  1.79it/s]  
Processing queries for MAR@K=3: 100%|██████████| 5193/5193 [47:31<00:00,  1.82it/s]  


MAP@3: 0.42056614673599074
MAR@3: 0.08363313319709792


Processing queries for MAP@K=10: 100%|██████████| 5193/5193 [47:41<00:00,  1.82it/s]  
Processing queries for MAR@K=10: 100%|██████████| 5193/5193 [47:33<00:00,  1.82it/s]  


MAP@10: 0.32501444251877526
MAR@10: 0.2125475318423913


In [15]:
from tqdm import tqdm
k = 10

results = [(q_id, doc_id) for q_id, q in tqdm(test_queries.items()) for doc_id, _ in query_pos_database(pos_db, q)[:k]]
pd.DataFrame(results, columns=["Query_number", "doc_number"]).to_csv("./results/pos_results.csv", index=False)

  0%|          | 11/5793 [00:19<2:04:54,  1.30s/it]

KeyboardInterrupt: 

In [4]:
query_function = partial(query_pos_database, q_fraction=1, boost_factor=1)
k = 10

results = retrieve_top_k_docs(test_queries, pos_db, query_function, k)

results = [(q_id, doc_id) for q_id, doc_ids in results.items() for doc_id in doc_ids]
pd.DataFrame(sorted(results, key=lambda x: x[0]), columns=["Query_number", "doc_number"]).to_csv("./results/pos_results.csv", index=False)

Retrieving top-k documents: 100%|██████████| 5793/5793 [53:02<00:00,  1.82it/s]  


In [7]:
list(d1.keys())

['term1', 'term2', 'term3']

In [1]:
from src.index import InvertedIndex

dbs = []
for i, d in enumerate([d1, d2, d3, d4, d5]):
    InvertedIndex.write_partial_index(d, f"db_{i}.idx")
    dbs.append(f"db_{i}.idx")

NameError: name 'd1' is not defined

In [7]:
InvertedIndex.merge_partial_indices(dbs, output_path="test_merged.idx")

In [2]:
testdb = InvertedIndex("test_merged.idx")
testdb._num_terms_to_show = 5
print(testdb)

1. term1: df=49, postings={ 1: 5, 2: 53, 3: 83, 4: 16, 10: 3, ... }
2. term10: df=17, postings={ 1: 25, 3: 54, 5: 92 }
3. term2: df=51, postings={ 2: 35, 10: 12, 6: 518, 1: 16, 5: 45, ... }
4. term3: df=5, postings={ 8: 2, 19: 5, 32: 124 }
5. term4: df=24, postings={ 5: 11, 8: 36, 11: 157, 4: 10, 16: 33 }
...
Vocabulary size: 8, File size: 433.00 Bytes
