In [1]:
from typing import List, Set, Dict
from dataclasses import dataclass

@dataclass
class SearchEvaluation:
    """
    Represents a single search evaluation: what the system returned for one
    query, paired with the ground truth for that query.

    predicted: Ordered list of items our search system returned
               (index 0 is rank 1).
    validation: Set of items that are actually relevant. Any other non-None
                iterable (list, tuple, ...) is converted to a set when the
                object is created, so set operations such as
                ``validation.intersection(...)`` always work.
    """
    predicted: List[int]
    validation: Set[int]

    def __post_init__(self) -> None:
        # The annotation promises a set, but dataclasses do not enforce it.
        # Normalise list/tuple input here so downstream set operations do not
        # fail; an existing set/frozenset (or None) is left untouched so the
        # caller's object identity is preserved.
        if self.validation is not None and not isinstance(
            self.validation, (set, frozenset)
        ):
            self.validation = set(self.validation)

In [2]:
def precision(eval: SearchEvaluation) -> float:
    """
    Fraction of the distinct returned items that are relevant.

    Duplicates in ``predicted`` are collapsed first, so both the hit count
    and the denominator are taken over unique items. An empty result list
    scores 0.0.

    Real-world example:
    predicted = [PyCharm, VSCode, Sublime, Atom, Eclipse]
    validation = {PyCharm, VSCode, Jupyter, Spyder}
    Two of the five returned items are relevant: 2/5 = 0.4 (40% precision)
    """
    # Nothing returned: define precision as 0 instead of dividing by zero.
    if not eval.predicted:
        return 0.0

    unique_results = set(eval.predicted)
    hits = eval.validation.intersection(unique_results)
    return len(hits) / len(unique_results)

In [3]:
def precision_at_k(eval: SearchEvaluation, k: int) -> float:
    """
    Precision restricted to the first k returned items.

    The result list is cut to its first k entries and handed to
    ``precision``; the relevant set is passed through unchanged. When fewer
    than k items were returned, only the items actually present are scored.
    A non-positive k or an empty result list scores 0.0.

    Real-world example:
    For k=3:
    predicted = [PyCharm, VSCode, Sublime, Atom, Eclipse]
    validation = {PyCharm, VSCode, Jupyter, Spyder}
    Only [PyCharm, VSCode, Sublime] is considered
    Result: 2/3 ≈ 0.67 (67% precision at k=3)
    """
    # A cutoff of zero or less selects nothing.
    if k <= 0:
        return 0.0
    # No results to score.
    if not eval.predicted:
        return 0.0

    truncated = SearchEvaluation(eval.predicted[:k], eval.validation)
    return precision(truncated)

In [4]:
def ap_at_k(eval: SearchEvaluation, k: int) -> float:
    """
    Calculate position-aware precision (average precision) up to position k.

    Walks the first k returned items in rank order. Each time a relevant
    item is seen for the first time, the precision at that rank
    (hits so far / rank) is recorded. The recorded values are summed and
    divided by min(k, number of relevant items), i.e. the largest number of
    hits that could fit in the top k.

    An item repeated in ``predicted`` is only counted at its first
    occurrence; later repeats still occupy a rank but are never a new hit.
    Without this, a result list such as [A, A, A] with validation {A} would
    score 3.0 instead of 1.0.

    Returns 0.0 when k <= 0, when nothing was returned, or when none of the
    top-k items is relevant.

    Real-world example:
    predicted = [PyCharm, Sublime, VSCode]  (k=3)
    validation = {PyCharm, VSCode}

    Let's break it down:
    Position 1 (PyCharm): 1/1 = 1.0 (found a relevant item)
    Position 2 (Sublime): No change (not relevant)
    Position 3 (VSCode): 2/3 ≈ 0.67 (found second relevant item)

    AP@3 = (1.0 + 2/3) / 2 ≈ 0.833
    """
    if k <= 0 or not eval.predicted:
        return 0.0

    seen = set()          # items already scored, so repeats are not re-counted
    num_relevant = 0      # distinct relevant items found so far
    precision_sum = 0.0   # running sum of precision-at-rank over the hits

    for rank, item in enumerate(eval.predicted[:k], start=1):
        if item in seen:
            # A repeated item consumes a rank but cannot be a new hit.
            continue
        seen.add(item)
        if item in eval.validation:
            num_relevant += 1
            precision_sum += num_relevant / rank

    if num_relevant == 0:
        # Also covers an empty validation set, avoiding a division by zero.
        return 0.0

    return precision_sum / min(k, len(eval.validation))

In [5]:
def map_at_k(evaluations: List[SearchEvaluation], k: int) -> float:
    """
    Mean of the per-search AP@k values across a collection of searches.

    Every search contributes equally: AP@k is computed for each one with
    ``ap_at_k`` and the results are averaged. An empty collection scores 0.0.

    Example scenario:
    Search 1: "python ide" → AP@3 = 0.83
    Search 2: "python web frameworks" → AP@3 = 0.92
    Search 3: "python data science" → AP@3 = 0.76

    MAP@3 = (0.83 + 0.92 + 0.76) / 3 ≈ 0.84
    """
    # No searches to average over.
    if not evaluations:
        return 0.0

    total = 0
    for single_search in evaluations:
        total += ap_at_k(single_search, k)
    return total / len(evaluations)

In [6]:
# Sample searches: each entry pairs a ranked result list with its relevant set.
# Items are plain integer IDs; the product names below are illustrative only.
searches = [
    # Search: "python ide"
    # Returned IDs in rank order (1=PyCharm, 2=VSCode, etc.);
    # relevant IDs are 1, 3, 5 (PyCharm, Sublime, Eclipse).
    SearchEvaluation(predicted=[1, 2, 3, 4, 5], validation={1, 3, 5}),
    # Search: "python web frameworks"
    # Returned IDs in rank order (2=Django, 4=Flask, etc.);
    # relevant IDs are 1, 2, 3.
    # NOTE(review): with the labels above, ID 4 (Flask) is NOT in the relevant
    # set, so only Django among the labelled items counts as a hit here --
    # confirm whether the labels or the set reflect the intent.
    SearchEvaluation(predicted=[2, 4, 1, 3, 5], validation={1, 2, 3}),
]

In [7]:
k = 3  # Rank cutoff: only the top 3 results of each search are scored
# Per-search report: a header line followed by both metrics at cutoff k
for i, search in enumerate(searches, start=1):
    print(
        f"Search {i}:",
        f"  Precision@{k}: {precision_at_k(search, k):.3f}",
        f"  AP@{k}: {ap_at_k(search, k):.3f}",
        sep="\n",
    )
# System-level report: mean of the per-search AP@k values
print(f"\nOverall System MAP@{k}: {map_at_k(searches, k):.3f}")

Search 1:
  Precision@3: 0.667
  AP@3: 0.556
Search 2:
  Precision@3: 0.667
  AP@3: 0.556

Overall System MAP@3: 0.556
