In [1]:
from pathlib import Path
import pandas as pd
import re
import numpy as np

from src.preprocessing import read, tokenize
from src.helpers import tokens_to_term_freqs
from src.database import Database
from src.query import query_database
from src.evaluation import map_at_k, mar_at_k

In [2]:
test_db_tokens = {
    "doc1": ["hey", "hello", "how", "are", "you", "hello", "hello"],
    "doc2": ["what", "is", "your", "name"],
    "doc3": ["hey", "how", "are", "you", "doing"]
}

test_db = {
    "doc1": "hey hello how are you hello hello",
    "doc2": "what is your name",
    "doc3": "hey how you doing"
}


# def tokenize(text):
#     return text.split()

for t in test_db.values():
    print(tokenize(t))

['hey', 'hello', 'how', 'are', 'you', 'hello', 'hello']
['what', 'is', 'your', 'name']
['hey', 'how', 'you', 'doing']


In [2]:
base_path = Path("/Users/stighellemans/Desktop/Information_Retrieval/assignments/assignment 1")
base_doc_path = base_path / "full_docs_small"


docs = {int(re.search(r'\d+', doc_path.name).group()): read(doc_path) for doc_path in base_doc_path.glob("*.txt")}

queries = pd.read_csv(base_path / "dev_small_queries - dev_small_queries.csv", index_col="Query number")
query_results = pd.read_csv(base_path / "dev_query_results_small.csv")
query_results.index = query_results["Query_number"]



In [55]:
query_results.loc[1031861]

Unnamed: 0_level_0,Query_number,doc_number
Query_number,Unnamed: 1_level_1,Unnamed: 2_level_1
1031861,1031861,547
1031861,1031861,1162


In [3]:
db = Database(tokenize_fn=tokenize, docs=docs)

In [4]:
# Example usage for k = 3 and k = 10
k_values = [3, 10]

for k in k_values:
    map_k = map_at_k(queries, query_results,db, k)
    mar_k = mar_at_k(queries, query_results, db, k)
    print(f"MAP@{k}: {map_k}")
    print(f"MAR@{k}: {mar_k}")

MAP@3: 0.23924731182795697
MAR@3: 0.7056451612903226
MAP@10: 0.0842741935483871
MAR@10: 0.8286290322580645


In [13]:
def calc_min_distances(q_tokens, database):
    # remove tokens not in vocab
    q_tokens = [term for term in q_tokens if term in database.inverted_index]

    # get pos indices to calculate a distance
    doc_pos = {}
    for q_term in q_tokens:
        term_info = database.inverted_index[q_term]

        for post in term_info.posting_list:
            doc_id = post.doc_id
            if doc_id not in doc_pos:
                doc_pos[doc_id] = []
            doc_pos[doc_id].append(post.pos_idxs)

    return  {doc_id: min_distance(doc_pos[doc_id]) for doc_id in doc_pos}


def min_distance(d_pos_idxs):
    min_dist = float("inf")
    for i in range(len(d_pos_idxs)):
        for j in range(i + 1, len(d_pos_idxs)):
            for pos1 in d_pos_idxs[i]:
                for pos2 in d_pos_idxs[j]:
                    dist = abs(pos1 - pos2)
                    if dist < min_dist:
                        min_dist = dist
    return min_dist if min_dist != float("inf") else None


dists = calc_min_distances(["hey", "hello", "how", "are", "you", "hello", "hello"], db)

import math
alpha = 1

for id, dist in dists.items():
    if dist is not None:
        print(math.log10(alpha + math.exp(-dists[id])))

3.293064506389929e-10
0.0029164387928812723
0.00014566513694377057
0.05512413479491801
0.13604782228086496
0.3010299956639812
2.2188210677867902e-12
1.9716491414368737e-05
6.6142961023123485e-09
0.0
0.13604782228086496
0.05512413479491801
0.13604782228086496
0.007882413541646564
0.13604782228086496
0.3010299956639812
0.3010299956639812
0.13604782228086496
0.3010299956639812
0.3010299956639812
0.3010299956639812
0.3010299956639812
0.3010299956639812
0.3010299956639812
0.00014566513694377057
4.887340265718772e-08
1.9716491414368737e-05
3.611281841545078e-07
0.007882413541646564
0.007882413541646564
0.13604782228086496
0.0
0.0
0.13604782228086496
0.13604782228086496
0.05512413479491801
0.13604782228086496
5.359289005162458e-05
0.13604782228086496
8.95147673940449e-10
0.13604782228086496
0.13604782228086496
0.021101218678769414
0.0
0.13604782228086496
0.13604782228086496
8.95147673940449e-10
1.6395206287789552e-11
0.13604782228086496
0.021101218678769414
0.13604782228086496
0.1360478222808

In [93]:
# Let's implement the positional indexing mechanism and MinDist computation in Python.

# Sample document collection (doc_id: [list of terms in the document])
documents = {
    1: ["t1", "t2", "t3", "t1", "t2", "t3"],
    2: ["t1", "t1", "t2", "t3", "t2", "t1"],
    3: ["t3", "t1", "t2", "t1", "t2", "t3"],
}

# Step 1: Create a positional index
def create_positional_index(docs):
    positional_index = {}
    for doc_id, terms in docs.items():
        for pos, term in enumerate(terms):
            if term not in positional_index:
                positional_index[term] = {}
            if doc_id not in positional_index[term]:
                positional_index[term][doc_id] = []
            positional_index[term][doc_id].append(pos)
    return positional_index

# Step 2: Get positions of query terms in a document
def get_term_positions(query, pos_index):
    query_positions = {}
    for term in query:
        if term in pos_index:
            query_positions[term] = pos_index[term]
    return query_positions

# Step 3: Compute MinDist for a given query in a document
def compute_min_dist(query, doc_id, term_positions):
    positions_in_doc = [term_positions[term][doc_id] for term in query if doc_id in term_positions[term]]
    min_dist = float('inf')
    for i in range(len(positions_in_doc)):
        for j in range(i + 1, len(positions_in_doc)):
            for pos1 in positions_in_doc[i]:
                for pos2 in positions_in_doc[j]:
                    dist = abs(pos1 - pos2)
                    if dist < min_dist:
                        min_dist = dist
    return min_dist if min_dist != float('inf') else None

# Step 4: Rank documents by MinDist
def rank_documents_by_mindist(query, docs, pos_index):
    term_positions = get_term_positions(query, pos_index)
    doc_scores = {}
    for doc_id in docs:
        if all(doc_id in term_positions[term] for term in query):
            min_dist = compute_min_dist(query, doc_id, term_positions)
            if min_dist is not None:
                doc_scores[doc_id] = min_dist
    ranked_docs = sorted(doc_scores.items(), key=lambda item: item[1])
    return ranked_docs

# Now let's run this on the sample documents with a sample query
query = ["t1", "t2", "t3"]
pos_index = create_positional_index(documents)
print(pos_index)
ranked_docs = rank_documents_by_mindist(query, documents, pos_index)
print(ranked_docs)

ranked_docs


{'t1': {1: [0, 3], 2: [0, 1, 5], 3: [1, 3]}, 't2': {1: [1, 4], 2: [2, 4], 3: [2, 4]}, 't3': {1: [2, 5], 2: [3], 3: [0, 5]}}
[(1, 1), (2, 1), (3, 1)]


[(1, 1), (2, 1), (3, 1)]