In [87]:
import os
import re
from pathlib import Path

import numpy as np
import pandas as pd

from src.database import Database
from src.evaluation import map_at_k, mar_at_k
from src.helpers import tokens_to_term_freqs
from src.preprocessing import read, tokenize
from src.query import query_database

In [2]:
# Token lists for three toy documents, keyed by document name.
# NOTE(review): "doc3" here contains "are", while the raw text stored for
# "doc3" in `test_db` below does not — confirm which of the two is intended.
test_db_tokens = {
    "doc1": ["hey", "hello", "how", "are", "you", "hello", "hello"],
    "doc2": ["what", "is", "your", "name"],
    "doc3": ["hey", "how", "are", "you", "doing"]
}

# Raw text of the toy documents, keyed by the same document names.
test_db = {
    "doc1": "hey hello how are you hello hello",
    "doc2": "what is your name",
    "doc3": "hey how you doing"
}


# Disabled whitespace-only stand-in for `tokenize`. With it commented out,
# the `tokenize` name used below is not redefined in this cell.
# def tokenize(text):
#     return text.split()

# Print the token list that `tokenize` produces for each raw toy document.
for t in test_db.values():
    print(tokenize(t))

['hey', 'hello', 'how', 'are', 'you', 'hello', 'hello']
['what', 'is', 'your', 'name']
['hey', 'how', 'you', 'doing']


In [34]:
# Root folder of the assignment data. Set the IR_ASSIGNMENT_DIR environment
# variable to point the notebook at another checkout; when it is unset the
# original machine-specific location is used, so existing runs are unaffected.
base_path = Path(
    os.environ.get(
        "IR_ASSIGNMENT_DIR",
        "/Users/stighellemans/Desktop/Information_Retrieval/assignments/assignment 1",
    )
)
base_doc_path = base_path / "full_docs_small"


# Map each document's numeric id (the first run of digits in its file name)
# to whatever `read` returns for that file.
docs = {}
for doc_path in base_doc_path.glob("*.txt"):
    id_match = re.search(r'\d+', doc_path.name)
    if id_match is None:
        # Name the offending file instead of failing with an opaque
        # "'NoneType' object has no attribute 'group'".
        raise ValueError(f"No numeric document id in file name: {doc_path.name!r}")
    docs[int(id_match.group())] = read(doc_path)

queries = pd.read_csv(base_path / "dev_small_queries - dev_small_queries.csv", index_col="Query number")
# Index the expected results by query id; drop=False keeps "Query_number"
# available as a regular column as well, exactly as before.
query_results = pd.read_csv(base_path / "dev_query_results_small.csv").set_index("Query_number", drop=False)



In [19]:
# Print the text of every query. Iterating the "Query" column directly avoids
# building a Series per row via `iterrows()` and drops the unused index variable.
for query_text in queries["Query"]:
    print(query_text)

types of road hugger tires
what agency can i report a scammer concerning my computer
what airport is in wilder ky
what are isotopes quizlet
what are the products and by products of photosynthesis?
echo alexa what does it do
ecrater phone number
what caused conflict in the middle east
what did liliuokalani write
what did maria theresa do for the serfs
what dissolves plaque on teeth
what does acquired taste mean
what does draw weight mean archery
what does gyrene mean
what does hair tint do
what family do squirrels belong to
what flavor is space jam starhunter
what is (prospectus
what is a demand fee
what is a pedicle flap
what is a situational purpose
what is a tag sale
colorado routing number loveland colorado
commercial buildings definition
what is badger inc in penn yan ny
what is hydroxyz pam used for
what is ion plating on jewelry
what is maxim
what is pododermatitis
what is prop
what is qd?
what is schmorl's node
what is social impact assessment
what is stop order process
what is 

In [55]:
# Show the rows of `query_results` whose index label is 1031861, used here as
# an example query id (the frame is indexed by "Query_number").
query_results.loc[1031861]

Unnamed: 0_level_0,Query_number,doc_number
Query_number,Unnamed: 1_level_1,Unnamed: 2_level_1
1031861,1031861,547
1031861,1031861,1162


In [4]:
# Build the project `Database` over the loaded `docs`, passing `tokenize` as
# its tokenizer function.
# NOTE(review): presumably this indexes the whole collection up front —
# confirm in src.database.
db = Database(tokenize_fn=tokenize, docs=docs)

In [89]:
# Report MAP@k and MAR@k over all queries for each cutoff k.
# NOTE(review): in the recorded run MAP@10 is lower than MAP@3 and close to
# MAR@k / k, which looks more like plain precision@k than average precision —
# confirm the definition used by `map_at_k` in src.evaluation.
k_values = [3, 10]

for k in k_values:
    map_k = map_at_k(queries, query_results, db, k)
    mar_k = mar_at_k(queries, query_results, db, k)
    print(f"MAP@{k}: {map_k}", f"MAR@{k}: {mar_k}", sep="\n")

MAP@3: 0.23924731182795697
MAR@3: 0.7056451612903226
MAP@10: 0.0842741935483871
MAR@10: 0.8286290322580645


In [93]:
# Let's implement the positional indexing mechanism and MinDist computation in Python.

# Sample document collection for the positional-index demo.
# Maps doc_id -> ordered list of the terms in that document; a term's position
# is its 0-based offset in the list.
documents = {
    1: ["t1", "t2", "t3", "t1", "t2", "t3"],
    2: ["t1", "t1", "t2", "t3", "t2", "t1"],
    3: ["t3", "t1", "t2", "t1", "t2", "t3"],
}

# Step 1: Create a positional index
def create_positional_index(docs):
    """Build a positional inverted index for a tokenized collection.

    Args:
        docs: Mapping of doc_id to the ordered list of terms in that document.

    Returns:
        Nested dict ``{term: {doc_id: [positions]}}``. Positions are 0-based
        offsets into the document's term list, in ascending order.
    """
    index = {}
    for doc_id, terms in docs.items():
        for offset, term in enumerate(terms):
            # Create the per-term and per-document containers on first sight.
            postings = index.setdefault(term, {})
            postings.setdefault(doc_id, []).append(offset)
    return index

# Step 2: Look up the postings (doc_id -> positions) of every query term that occurs in the index
def get_term_positions(query, pos_index):
    """Select the index entries that belong to the query's terms.

    Args:
        query: Iterable of query terms.
        pos_index: Positional index ``{term: {doc_id: [positions]}}``.

    Returns:
        Dict mapping each query term found in ``pos_index`` to its postings
        (the same objects as in ``pos_index``, not copies). Query terms that
        are not in the index are left out.
    """
    return {term: pos_index[term] for term in query if term in pos_index}

# Step 3: Compute MinDist for a given query in a document
def compute_min_dist(query, doc_id, term_positions):
    """Return the smallest gap between occurrences of two different query terms.

    Only the query terms that actually occur in ``doc_id`` take part. The
    distance between two occurrences is the absolute difference of their
    positions.

    Args:
        query: Iterable of query terms. Repeated terms are considered once.
        doc_id: Identifier of the document to score.
        term_positions: Mapping ``{term: {doc_id: [positions]}}``. It may lack
            some query terms (for example terms that are not in the index).

    Returns:
        The minimum distance, or ``None`` when fewer than two distinct query
        terms occur in the document.
    """
    # Deduplicate while keeping order: pairing a term with itself would
    # compare each position to itself and report a meaningless distance of 0.
    unique_terms = dict.fromkeys(query)

    # .get() guards against query terms that are missing from term_positions,
    # which previously raised KeyError.
    positions_in_doc = [
        term_positions[term][doc_id]
        for term in unique_terms
        if doc_id in term_positions.get(term, {})
    ]

    # All position pairs drawn from two different terms.
    distances = (
        abs(pos1 - pos2)
        for i, first in enumerate(positions_in_doc)
        for second in positions_in_doc[i + 1:]
        for pos1 in first
        for pos2 in second
    )
    return min(distances, default=None)

# Step 4: Rank documents by MinDist
def rank_documents_by_mindist(query, docs, pos_index):
    """Rank the documents that contain every query term by ascending MinDist.

    Args:
        query: Iterable of query terms.
        docs: Iterable of doc_ids (for example a dict keyed by doc_id).
        pos_index: Positional index ``{term: {doc_id: [positions]}}``.

    Returns:
        List of ``(doc_id, min_dist)`` tuples, smallest distance first. Ties
        keep the iteration order of ``docs``. Documents missing any query term,
        or for which no distance can be computed, are left out. The list is
        empty when some query term does not occur in the index at all.
    """
    term_positions = get_term_positions(query, pos_index)
    doc_scores = {}
    for doc_id in docs:
        # term_positions only holds the query terms present in the index, so
        # look terms up with .get(): a term unknown to the index means no
        # document can match, rather than a KeyError.
        if all(doc_id in term_positions.get(term, {}) for term in query):
            min_dist = compute_min_dist(query, doc_id, term_positions)
            if min_dist is not None:
                doc_scores[doc_id] = min_dist
    # sorted() is stable, so equal distances keep their insertion order.
    return sorted(doc_scores.items(), key=lambda item: item[1])

# Run the pipeline on the sample documents with a sample query.
query = ["t1", "t2", "t3"]
# Build the index once and print it so its {term: {doc_id: [positions]}}
# layout can be inspected.
pos_index = create_positional_index(documents)
print(pos_index)
# Each ranking entry is a (doc_id, min_dist) pair, smallest distance first.
ranked_docs = rank_documents_by_mindist(query, documents, pos_index)
print(ranked_docs)

# Bare last expression: the notebook also renders the ranking as the cell's
# output value (it repeats what the print above already showed).
ranked_docs


{'t1': {1: [0, 3], 2: [0, 1, 5], 3: [1, 3]}, 't2': {1: [1, 4], 2: [2, 4], 3: [2, 4]}, 't3': {1: [2, 5], 2: [3], 3: [0, 5]}}
[(1, 1), (2, 1), (3, 1)]


[(1, 1), (2, 1), (3, 1)]