# v22.1 Comprehensive Model Evaluation

## Overview

This notebook provides comprehensive evaluation of the Neural Sparse v22.1 model by comparing three search methods:

1. **BM25** - Traditional term-frequency baseline
2. **Semantic Search** - Dense embeddings using BGE-M3
3. **Neural Sparse v22.1** - Trained sparse retrieval model (the v22.0 and v21.4 checkpoints are evaluated alongside it when their files are present)

## Evaluation Metrics

- **NDCG@10** - Normalized Discounted Cumulative Gain at 10
- **Recall@1, Recall@5, Recall@10** - Fraction of queries whose ground-truth document appears in the top 1, 5, or 10 results (each query has exactly one relevant document)
- **MRR** - Mean Reciprocal Rank

In [None]:
import sys
from pathlib import Path


def find_project_root() -> Path:
    """Locate the project root by walking upward from the working directory.

    The first directory (starting with the working directory itself) that
    contains a ``pyproject.toml`` or a ``src`` entry is returned.  If no
    ancestor qualifies, the grandparent of the working directory is used.
    """
    start = Path.cwd()
    for candidate in (start, *start.parents):
        has_marker = (
            (candidate / "pyproject.toml").exists()
            or (candidate / "src").exists()
        )
        if has_marker:
            return candidate
    # No marker anywhere up the tree: fall back to two levels above the cwd.
    return Path.cwd().parent.parent


# Resolve the project root once and put it first on sys.path so that
# project-local packages can be imported from this notebook.
PROJECT_ROOT = find_project_root()
sys.path.insert(0, str(PROJECT_ROOT))

import os
# Set before `transformers` is imported below.
# NOTE(review): presumably silences the tokenizers fork/parallelism warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import json
import math
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from typing import Dict, List, Tuple, Optional, Any
from collections import defaultdict
from dataclasses import dataclass, field
from transformers import AutoTokenizer, AutoModelForMaskedLM
from tqdm.auto import tqdm

# Record the environment this evaluation ran in.
print(f"Project root: {PROJECT_ROOT}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Evaluation date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

## 1. Configuration

In [None]:
# Hugging Face identifiers: sparse-model backbone and dense baseline encoder.
MODEL_NAME = "skt/A.X-Encoder-base"
SEMANTIC_MODEL_NAME = "BAAI/bge-m3"

# Checkpoint locations for every sparse-model version compared here.
_outputs_root = PROJECT_ROOT / "outputs"
V22_1_MODEL_PATH = _outputs_root / "v22.1" / "best_model.pt"
V22_0_MODEL_PATH = _outputs_root / "v22.0_infonce" / "best_model.pt"
V21_4_MODEL_PATH = _outputs_root / "v21.4_korean_enhanced" / "best_model.pt"

# Validation triplets (JSON lines).
VALIDATION_DATA_PATH = PROJECT_ROOT / "data" / "v22.0" / "validation_triplets.jsonl"

# All artifacts produced by this notebook are written here.
OUTPUT_DIR = _outputs_root / "v22.1"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Prefer the GPU when one is visible to torch.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Evaluation settings
MAX_EVAL_SAMPLES = 500  # Number of queries to evaluate
TOP_K_VALUES = [1, 5, 10]
BATCH_SIZE = 32

print(f"Device: {DEVICE}")
print("\nModel paths:")
for _label, _ckpt in (
    ("v22.1", V22_1_MODEL_PATH),
    ("v22.0", V22_0_MODEL_PATH),
    ("v21.4", V21_4_MODEL_PATH),
):
    print(f"  {_label}: {_ckpt} (exists: {_ckpt.exists()})")
print(f"\nValidation data: {VALIDATION_DATA_PATH} (exists: {VALIDATION_DATA_PATH.exists()})")

## 2. Data Structures and Metrics

In [None]:
@dataclass
class EvalSample:
    """One evaluation query together with its positive and negative documents.

    ``doc_id`` starts at 0 and is overwritten by ``load_validation_data`` with
    the corpus index of ``positive_doc``; retrievers use it as the ground truth.
    """
    # Query text.
    query: str
    # Text of the document that should be retrieved for the query.
    positive_doc: str
    # Text of the contrasting document from the same triplet.
    negative_doc: str
    # Corpus index of ``positive_doc`` (placeholder 0 until assigned).
    doc_id: int = 0


@dataclass
class RetrievalResult:
    """Ranked output of one retrieval method for a single query."""
    # Position of the query in the evaluated sample list.
    query_id: int
    # Corpus indices of the retrieved documents, best match first.
    retrieved_doc_ids: List[int]
    # Similarity scores aligned with ``retrieved_doc_ids``.
    scores: List[float]
    # Corpus index of the document that should have been retrieved.
    ground_truth_id: int


@dataclass
class EvaluationMetrics:
    """Averaged retrieval metrics for one method, plus the per-query rows."""
    method_name: str
    ndcg_at_10: float = 0.0
    recall_at_1: float = 0.0
    recall_at_5: float = 0.0
    recall_at_10: float = 0.0
    mrr: float = 0.0
    per_query_results: List[Dict] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return the aggregate values as a plain dict.

        Per-query rows are not included; only their count is reported under
        ``num_queries``.
        """
        summary: Dict[str, Any] = {"method_name": self.method_name}
        for attr in ("ndcg_at_10", "recall_at_1", "recall_at_5", "recall_at_10", "mrr"):
            summary[attr] = getattr(self, attr)
        summary["num_queries"] = len(self.per_query_results)
        return summary

In [None]:
class MetricsCalculator:
    """Stateless helpers for ranking metrics: DCG/nDCG, Recall@k and reciprocal rank."""

    @staticmethod
    def dcg_at_k(relevances: List[int], k: int) -> float:
        """
        Calculate Discounted Cumulative Gain at k.

        DCG@k = sum_{i=1}^{k} (2^{rel_i} - 1) / log_2(i + 1)

        Args:
            relevances: Relevance grade of each retrieved item, best rank first.
            k: Cutoff rank. A non-positive cutoff yields 0.0.

        Returns:
            The DCG of the first ``k`` items (0.0 for an empty list).
        """
        if k <= 0:
            # A negative k would otherwise slice from the end of the list.
            return 0.0
        return sum(
            ((2**rel - 1) / math.log2(rank + 1)
             for rank, rel in enumerate(relevances[:k], start=1)),
            0.0,
        )

    @staticmethod
    def ndcg_at_k(relevances: List[int], k: int) -> float:
        """
        Calculate Normalized Discounted Cumulative Gain at k.

        nDCG@k = DCG@k / IDCG@k

        The ideal ordering is derived from the supplied ``relevances`` only,
        so relevant items missing from the list do not enter the ideal DCG.

        Returns:
            The nDCG value, or 0.0 when the ideal DCG is zero.
        """
        dcg = MetricsCalculator.dcg_at_k(relevances, k)
        # Ideal DCG: the same grades sorted so the most relevant come first.
        idcg = MetricsCalculator.dcg_at_k(sorted(relevances, reverse=True), k)
        if idcg == 0:
            return 0.0
        return dcg / idcg

    @staticmethod
    def recall_at_k(
        retrieved_ids: List[int],
        relevant_ids: set,
        k: int,
    ) -> float:
        """Return the fraction of ``relevant_ids`` found in the first ``k`` results.

        Returns 0.0 when ``relevant_ids`` is empty.
        """
        if not relevant_ids:
            return 0.0
        hits = len(set(retrieved_ids[:k]) & relevant_ids)
        return hits / len(relevant_ids)

    @staticmethod
    def reciprocal_rank(
        retrieved_ids: List[int],
        relevant_ids: set,
    ) -> float:
        """Return 1/rank of the first relevant result, or 0.0 if none is relevant."""
        for rank, doc_id in enumerate(retrieved_ids, start=1):
            if doc_id in relevant_ids:
                return 1.0 / rank
        return 0.0

    @staticmethod
    def compute_all_metrics(
        results: "List[RetrievalResult]",
        method_name: str,
    ) -> "EvaluationMetrics":
        """Compute per-query and averaged metrics from retrieval results.

        Each result is scored against its single ``ground_truth_id``.

        Args:
            results: One retrieval result per evaluated query.
            method_name: Label stored on the returned metrics object.

        Returns:
            An ``EvaluationMetrics`` whose aggregates are plain ``float``
            means over the queries. With no results, every aggregate keeps
            its 0.0 default instead of becoming NaN.
        """
        if not results:
            # np.mean([]) would emit a RuntimeWarning and return NaN.
            return EvaluationMetrics(method_name=method_name)

        per_query = []
        for result in results:
            retrieved = result.retrieved_doc_ids
            relevant_ids = {result.ground_truth_id}

            # Binary relevance: 1 for the ground-truth document, 0 otherwise.
            relevances = [1 if doc_id in relevant_ids else 0 for doc_id in retrieved]

            try:
                ground_truth_rank = retrieved.index(result.ground_truth_id) + 1
            except ValueError:
                # Sentinel: ground truth is absent from the retrieved list.
                ground_truth_rank = -1

            per_query.append({
                "query_id": result.query_id,
                "ndcg@10": MetricsCalculator.ndcg_at_k(relevances, 10),
                "recall@1": MetricsCalculator.recall_at_k(retrieved, relevant_ids, 1),
                "recall@5": MetricsCalculator.recall_at_k(retrieved, relevant_ids, 5),
                "recall@10": MetricsCalculator.recall_at_k(retrieved, relevant_ids, 10),
                "mrr": MetricsCalculator.reciprocal_rank(retrieved, relevant_ids),
                "ground_truth_rank": ground_truth_rank,
            })

        def _mean(key: str) -> float:
            # Cast so the aggregates are builtin floats, as annotated.
            return float(np.mean([row[key] for row in per_query]))

        return EvaluationMetrics(
            method_name=method_name,
            ndcg_at_10=_mean("ndcg@10"),
            recall_at_1=_mean("recall@1"),
            recall_at_5=_mean("recall@5"),
            recall_at_10=_mean("recall@10"),
            mrr=_mean("mrr"),
            per_query_results=per_query,
        )


print("MetricsCalculator defined.")

## 3. Load Validation Data

In [None]:
def load_validation_data(
    path: Path,
    max_samples: int = 500,
) -> "Tuple[List[EvalSample], List[str], Dict[str, int]]":
    """
    Load validation triplets from a JSON-lines file and build the corpus.

    Each record supplies the query under ``anchor`` (or ``query``), the
    positive under ``positive`` (or ``positive_doc``) and the negative under
    ``negative`` (or ``negative_doc``). Blank lines and records missing any
    of the three fields are skipped.

    Documents get their ids in first-seen order, so the corpus layout (and
    therefore every reported doc id) is identical from run to run.

    Args:
        path: Location of the JSON-lines file.
        max_samples: Maximum number of valid samples to load.

    Returns:
        samples: List of evaluation samples
        corpus: List of unique documents
        doc_to_id: Mapping from document text to ID

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    samples = []
    # A dict keeps insertion order; a set would reorder documents between
    # interpreter runs because of string hash randomization.
    doc_to_id: Dict[str, int] = {}

    with open(path, "r", encoding="utf-8") as f:
        for raw_line in f:
            if len(samples) >= max_samples:
                break
            line = raw_line.strip()
            if not line:
                continue
            item = json.loads(line)
            query = item.get("anchor", item.get("query", ""))
            positive = item.get("positive", item.get("positive_doc", ""))
            negative = item.get("negative", item.get("negative_doc", ""))

            if not (query and positive and negative):
                continue

            for doc in (positive, negative):
                doc_to_id.setdefault(doc, len(doc_to_id))

            samples.append(EvalSample(
                query=query,
                positive_doc=positive,
                negative_doc=negative,
                doc_id=doc_to_id[positive],
            ))

    corpus = list(doc_to_id)

    print(f"Loaded {len(samples)} samples")
    print(f"Corpus size: {len(corpus)} unique documents")

    return samples, corpus, doc_to_id


# Read the validation triplets and derive the shared retrieval corpus.
eval_samples, corpus, doc_to_id = load_validation_data(
    path=VALIDATION_DATA_PATH,
    max_samples=MAX_EVAL_SAMPLES,
)

In [None]:
# Preview the first three samples (texts cut to 80 characters).
print("Sample queries:")
for i, sample in enumerate(eval_samples[:3]):
    _preview_lines = (
        f"\n[{i+1}] Query: {sample.query[:80]}...",
        f"    Positive: {sample.positive_doc[:80]}...",
        f"    Ground truth doc_id: {sample.doc_id}",
    )
    print(*_preview_lines, sep="\n")

## 4. BM25 Baseline

In [None]:
# rank_bm25 is optional: record whether it could be imported.
try:
    from rank_bm25 import BM25Okapi
except ImportError:
    BM25_AVAILABLE = False
    print("rank_bm25 not installed. Install with: pip install rank-bm25")
else:
    BM25_AVAILABLE = True
    print("rank_bm25 is available.")

In [None]:
class BM25Retriever:
    """Lexical baseline backed by ``BM25Okapi`` over the evaluation corpus."""

    def __init__(self, corpus: List[str]):
        """Tokenize every document and build the BM25 index."""
        self.corpus = corpus
        # Whitespace words plus character bigrams; a dedicated Korean
        # tokenizer would likely work better.
        self.bm25 = BM25Okapi([self._tokenize(doc) for doc in corpus])

    def _tokenize(self, text: str) -> List[str]:
        """Return the whitespace words of ``text`` followed by each word's character bigrams."""
        words = text.split()
        bigrams = [
            word[pos:pos + 2]
            for word in words
            for pos in range(len(word) - 1)
        ]
        return words + bigrams

    def search(
        self,
        query: str,
        top_k: int = 10,
    ) -> Tuple[List[int], List[float]]:
        """Return the ids and scores of the ``top_k`` best-scoring documents."""
        scores = self.bm25.get_scores(self._tokenize(query))

        # Descending order by score, truncated to the requested depth.
        ranked = np.argsort(scores)[::-1][:top_k]

        return ranked.tolist(), [scores[idx] for idx in ranked]

    def evaluate(
        self,
        samples: List[EvalSample],
        top_k: int = 10,
    ) -> List[RetrievalResult]:
        """Run every sample's query and collect one result per sample."""
        results = []

        for query_id, sample in enumerate(tqdm(samples, desc="BM25 search")):
            hit_ids, hit_scores = self.search(sample.query, top_k=top_k)
            outcome = RetrievalResult(
                query_id=query_id,
                retrieved_doc_ids=hit_ids,
                scores=hit_scores,
                ground_truth_id=sample.doc_id,
            )
            results.append(outcome)

        return results


# Index the corpus only when the optional dependency is present.
if BM25_AVAILABLE:
    print("Building BM25 index...")
    bm25_retriever = BM25Retriever(corpus)
    print("BM25 index built.")

In [None]:
# Score the BM25 baseline, or record that it was skipped.
if not BM25_AVAILABLE:
    bm25_metrics = None
    print("BM25 evaluation skipped (rank_bm25 not installed).")
else:
    print("Evaluating BM25...")
    bm25_results = bm25_retriever.evaluate(eval_samples, top_k=10)
    bm25_metrics = MetricsCalculator.compute_all_metrics(bm25_results, "BM25")

    print("\nBM25 Results:")
    for _label, _value in (
        ("NDCG@10", bm25_metrics.ndcg_at_10),
        ("Recall@1", bm25_metrics.recall_at_1),
        ("Recall@5", bm25_metrics.recall_at_5),
        ("Recall@10", bm25_metrics.recall_at_10),
        ("MRR", bm25_metrics.mrr),
    ):
        print(f"  {_label}: {_value:.4f}")

## 5. Semantic Search (Dense Embeddings)

In [None]:
# sentence_transformers is optional: record whether it could be imported.
try:
    from sentence_transformers import SentenceTransformer
except ImportError:
    SBERT_AVAILABLE = False
    print("sentence_transformers not installed.")
    print("Install with: pip install sentence-transformers")
else:
    SBERT_AVAILABLE = True
    print("sentence_transformers is available.")

In [None]:
class SemanticRetriever:
    """Semantic search using dense embeddings.

    Corpus embeddings are computed once at construction time; queries are
    compared to them by dot product of normalized vectors.
    """

    def __init__(
        self,
        model_name: str,
        corpus: List[str],
        device: torch.device,
        batch_size: int = 32,
    ):
        """Load the encoder and precompute normalized corpus embeddings.

        Args:
            model_name: Model identifier passed to ``SentenceTransformer``.
            corpus: Documents to index; result ids are positions in this list.
            device: Device the encoder runs on.
            batch_size: Batch size used when encoding.
        """
        self.model = SentenceTransformer(model_name, device=str(device))
        self.corpus = corpus
        self.device = device
        self.batch_size = batch_size

        # Precompute corpus embeddings
        print("Computing corpus embeddings...")
        self.corpus_embeddings = self.model.encode(
            corpus,
            batch_size=batch_size,
            show_progress_bar=True,
            convert_to_tensor=True,
            normalize_embeddings=True,
        )
        print(f"Corpus embeddings shape: {self.corpus_embeddings.shape}")

    def search(
        self,
        query: str,
        top_k: int = 10,
    ) -> Tuple[List[int], List[float]]:
        """Return ids and scores of the best matches for one query.

        At most ``top_k`` results are returned; fewer when the corpus is
        smaller than ``top_k``.
        """
        query_embedding = self.model.encode(
            [query],
            convert_to_tensor=True,
            normalize_embeddings=True,
        )

        # Dot product of normalized vectors, i.e. cosine similarity.
        similarities = torch.mm(
            query_embedding, self.corpus_embeddings.T
        ).squeeze(0)

        # torch.topk raises when k exceeds the number of candidates.
        k = max(0, min(top_k, similarities.shape[0]))
        top_scores, top_indices = torch.topk(similarities, k=k)

        return top_indices.cpu().tolist(), top_scores.cpu().tolist()

    def evaluate(
        self,
        samples: "List[EvalSample]",
        top_k: int = 10,
    ) -> "List[RetrievalResult]":
        """Retrieve for every sample's query in one batched pass.

        Returns:
            One result per sample, in input order; an empty list when there
            are no samples.
        """
        if not samples:
            return []

        # Batch encode queries
        print("Encoding queries...")
        queries = [s.query for s in samples]
        query_embeddings = self.model.encode(
            queries,
            batch_size=self.batch_size,
            show_progress_bar=True,
            convert_to_tensor=True,
            normalize_embeddings=True,
        )

        # Compute all similarities at once
        print("Computing similarities...")
        all_similarities = torch.mm(
            query_embeddings, self.corpus_embeddings.T
        )

        # Clamp k so a corpus smaller than top_k does not make topk raise.
        k = max(0, min(top_k, all_similarities.shape[1]))
        top_scores, top_indices = torch.topk(all_similarities, k=k, dim=1)

        # Move to host memory once instead of once per query.
        id_rows = top_indices.cpu().tolist()
        score_rows = top_scores.cpu().tolist()

        return [
            RetrievalResult(
                query_id=i,
                retrieved_doc_ids=id_rows[i],
                scores=score_rows[i],
                ground_truth_id=sample.doc_id,
            )
            for i, sample in enumerate(samples)
        ]


# Only announces the upcoming load; the retriever itself is constructed in
# the next cell.
if SBERT_AVAILABLE:
    print(f"\nLoading semantic model: {SEMANTIC_MODEL_NAME}")
    print("This may take a while for the first time...")

In [None]:
# Build the dense retriever and score it, or record that it was skipped.
if not SBERT_AVAILABLE:
    semantic_metrics = None
    print("Semantic evaluation skipped.")
else:
    semantic_retriever = SemanticRetriever(
        model_name=SEMANTIC_MODEL_NAME,
        corpus=corpus,
        device=DEVICE,
        batch_size=BATCH_SIZE,
    )

    print("\nEvaluating Semantic Search...")
    semantic_results = semantic_retriever.evaluate(eval_samples, top_k=10)
    semantic_metrics = MetricsCalculator.compute_all_metrics(
        semantic_results, f"Semantic ({SEMANTIC_MODEL_NAME.split('/')[-1]})"
    )

    print("\nSemantic Search Results:")
    for _label, _value in (
        ("NDCG@10", semantic_metrics.ndcg_at_10),
        ("Recall@1", semantic_metrics.recall_at_1),
        ("Recall@5", semantic_metrics.recall_at_5),
        ("Recall@10", semantic_metrics.recall_at_10),
        ("MRR", semantic_metrics.mrr),
    ):
        print(f"  {_label}: {_value:.4f}")

## 6. Neural Sparse Model

In [None]:
class SPLADEModel(nn.Module):
    """Masked-LM backbone with SPLADE-style sparse pooling for Korean retrieval."""

    def __init__(self, model_name: str = "skt/A.X-Encoder-base"):
        super().__init__()
        backbone = AutoModelForMaskedLM.from_pretrained(model_name)
        self.model = backbone
        self.config = backbone.config
        self.relu = nn.ReLU()

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Turn masked-LM logits into sparse term weights.

        Returns:
            sparse_repr: [batch_size, vocab_size]
            token_weights: [batch_size, seq_len]
        """
        logits = self.model(input_ids=input_ids, attention_mask=attention_mask).logits

        # Saturating activation: log(1 + ReLU(logits)).
        activations = torch.log1p(self.relu(logits))

        # Zero out every padded position before pooling.
        keep = attention_mask.unsqueeze(-1).float()
        masked = activations * keep

        # Max over the sequence gives one weight per vocabulary entry;
        # max over the vocabulary gives one weight per input position.
        sparse_repr = masked.max(dim=1).values
        token_weights, _ = masked.max(dim=-1)

        return sparse_repr, token_weights


def load_splade_model(
    checkpoint_path: Path,
    model_name: str,
    device: torch.device,
) -> Optional[SPLADEModel]:
    """Restore a trained SPLADE model from a checkpoint file.

    Returns ``None`` (after printing a notice) when the checkpoint file does
    not exist; otherwise the model is placed on ``device`` in eval mode.
    """
    if checkpoint_path.exists():
        model = SPLADEModel(model_name)
        # NOTE(review): weights_only=False unpickles arbitrary objects; only
        # load checkpoints from a trusted source.
        state = torch.load(checkpoint_path, map_location=device, weights_only=False)
        model.load_state_dict(state["model_state_dict"])
        model = model.to(device)
        model.eval()

        print(f"Loaded model from: {checkpoint_path}")
        return model

    print(f"Checkpoint not found: {checkpoint_path}")
    return None


# The tokenizer comes from the backbone identifier and is passed to every
# sparse retriever built below.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(f"Tokenizer loaded: {MODEL_NAME}")

In [None]:
class NeuralSparseRetriever:
    """Neural sparse retrieval using SPLADE model.

    Corpus representations are computed once at construction time and kept
    on the CPU; queries are scored against them by dot product.
    """

    def __init__(
        self,
        model: "SPLADEModel",
        tokenizer,
        corpus: List[str],
        device: torch.device,
        batch_size: int = 32,
    ):
        """Store the model/tokenizer and precompute corpus representations.

        Args:
            model: Sparse encoder; called as ``model(input_ids, attention_mask)``.
            tokenizer: Tokenizer matching ``model``.
            corpus: Documents to index; result ids are positions in this list.
            device: Device the tokenized batches are moved to.
            batch_size: Number of texts encoded per forward pass.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.corpus = corpus
        self.device = device
        self.batch_size = batch_size

        # Precompute corpus sparse representations
        print("Computing corpus sparse representations...")
        self.corpus_sparse = self._encode_batch(corpus)
        print(f"Corpus sparse shape: {self.corpus_sparse.shape}")

    @torch.no_grad()
    def _encode_batch(
        self,
        texts: List[str],
    ) -> torch.Tensor:
        """Encode texts to sparse representations, one row per text (on CPU)."""
        all_sparse = []

        for i in tqdm(range(0, len(texts), self.batch_size), desc="Encoding"):
            batch_texts = texts[i:i + self.batch_size]

            inputs = self.tokenizer(
                batch_texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            )
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            sparse_repr, _ = self.model(
                inputs["input_ids"],
                inputs["attention_mask"],
            )
            # Collect on the CPU so the full matrix need not fit on the device.
            all_sparse.append(sparse_repr.cpu())

        return torch.cat(all_sparse, dim=0)

    def search(
        self,
        query: str,
        top_k: int = 10,
    ) -> Tuple[List[int], List[float]]:
        """Return ids and scores of the best matches for one query.

        At most ``top_k`` results are returned; fewer when the corpus is
        smaller than ``top_k``.
        """
        query_sparse = self._encode_batch([query])

        # Dot product similarity for sparse vectors
        scores = torch.mm(query_sparse, self.corpus_sparse.T).squeeze(0)

        # torch.topk raises when k exceeds the number of candidates.
        k = max(0, min(top_k, scores.shape[0]))
        top_scores, top_indices = torch.topk(scores, k=k)

        return top_indices.tolist(), top_scores.tolist()

    def evaluate(
        self,
        samples: "List[EvalSample]",
        top_k: int = 10,
    ) -> "List[RetrievalResult]":
        """Retrieve for every sample's query in one batched pass.

        Returns:
            One result per sample, in input order; an empty list when there
            are no samples.
        """
        if not samples:
            # Encoding an empty list would fail on torch.cat([]).
            return []

        # Encode all queries
        queries = [s.query for s in samples]
        query_sparse = self._encode_batch(queries)

        # Compute all scores at once
        print("Computing scores...")
        all_scores = torch.mm(query_sparse, self.corpus_sparse.T)

        # Clamp k so a corpus smaller than top_k does not make topk raise.
        k = max(0, min(top_k, all_scores.shape[1]))
        top_scores, top_indices = torch.topk(all_scores, k=k, dim=1)

        id_rows = top_indices.tolist()
        score_rows = top_scores.tolist()

        return [
            RetrievalResult(
                query_id=i,
                retrieved_doc_ids=id_rows[i],
                scores=score_rows[i],
                ground_truth_id=sample.doc_id,
            )
            for i, sample in enumerate(samples)
        ]

In [None]:
# Load every sparse checkpoint that exists on disk, keyed by version label.
neural_sparse_models = {}

# Newest version first. All available checkpoints are kept; this is not a
# first-match fallback.
model_paths = [
    ("v22.1", V22_1_MODEL_PATH),
    ("v22.0", V22_0_MODEL_PATH),
    ("v21.4", V21_4_MODEL_PATH),
]

for name, path in model_paths:
    model = load_splade_model(path, MODEL_NAME, DEVICE)
    if model is None:
        continue
    neural_sparse_models[name] = model

print(f"\nLoaded {len(neural_sparse_models)} Neural Sparse models:")
for name in neural_sparse_models.keys():
    print(f"  - {name}")

In [None]:
# Score each loaded sparse model against the shared corpus and samples.
neural_sparse_metrics = {}

for name, model in neural_sparse_models.items():
    _banner = "=" * 60
    print(f"\n{_banner}")
    print(f"Evaluating Neural Sparse {name}...")
    print(_banner)

    # A fresh retriever re-encodes the corpus with this model's weights.
    retriever = NeuralSparseRetriever(
        model=model,
        tokenizer=tokenizer,
        corpus=corpus,
        device=DEVICE,
        batch_size=BATCH_SIZE,
    )

    results = retriever.evaluate(eval_samples, top_k=10)
    metrics = MetricsCalculator.compute_all_metrics(results, f"Neural Sparse {name}")
    neural_sparse_metrics[name] = metrics

    print(f"\nNeural Sparse {name} Results:")
    for _label, _value in (
        ("NDCG@10", metrics.ndcg_at_10),
        ("Recall@1", metrics.recall_at_1),
        ("Recall@5", metrics.recall_at_5),
        ("Recall@10", metrics.recall_at_10),
        ("MRR", metrics.mrr),
    ):
        print(f"  {_label}: {_value:.4f}")

## 7. Results Comparison

In [None]:
def print_comparison_table(all_metrics: Dict[str, EvaluationMetrics]):
    """Print one fixed-width row of metrics per method."""
    rule = "=" * 80
    column_titles = ("NDCG@10", "Recall@1", "Recall@5", "Recall@10", "MRR")

    print("\n" + rule)
    print("v22.1 Evaluation Results")
    print(rule)

    # Header: left-aligned method column, right-aligned metric columns.
    header_cells = [f"{'Method':<30}"] + [f"{title:>10}" for title in column_titles]
    print(" ".join(header_cells))
    print("-" * 80)

    # One row per method, in dictionary order.
    for name, metrics in all_metrics.items():
        values = (
            metrics.ndcg_at_10,
            metrics.recall_at_1,
            metrics.recall_at_5,
            metrics.recall_at_10,
            metrics.mrr,
        )
        row_cells = [f"{name:<30}"] + [f"{value:>10.4f}" for value in values]
        print(" ".join(row_cells))

    print(rule)


# Gather whichever methods were actually evaluated, baselines first.
all_metrics = {}

for _label, _candidate in (
    ("BM25", bm25_metrics),
    ("Semantic (BGE-M3)", semantic_metrics),
):
    if _candidate:
        all_metrics[_label] = _candidate

all_metrics.update(
    {f"Neural Sparse {name}": metrics for name, metrics in neural_sparse_metrics.items()}
)

# Side-by-side table of every collected method.
print_comparison_table(all_metrics)

In [None]:
# Visualization
def plot_metrics_comparison(all_metrics: Dict[str, EvaluationMetrics]):
    """Draw a grouped bar chart (one group per metric, one bar per method).

    The figure is saved as ``evaluation_comparison.png`` under ``OUTPUT_DIR``
    and then shown.
    """
    metric_fields = (
        ("NDCG@10", "ndcg_at_10"),
        ("Recall@1", "recall_at_1"),
        ("Recall@5", "recall_at_5"),
        ("Recall@10", "recall_at_10"),
        ("MRR", "mrr"),
    )
    tick_labels = [title for title, _ in metric_fields]
    n_methods = len(list(all_metrics.keys()))

    fig, ax = plt.subplots(figsize=(12, 6))

    positions = np.arange(len(tick_labels))
    # Each group of bars occupies 80% of the slot around its tick.
    bar_width = 0.8 / n_methods

    palette = plt.cm.Set2(np.linspace(0, 1, n_methods))

    for idx, (method, metrics) in enumerate(all_metrics.items()):
        heights = [getattr(metrics, attr) for _, attr in metric_fields]
        # Center the group of bars on the tick position.
        shift = (idx - n_methods / 2 + 0.5) * bar_width
        bars = ax.bar(positions + shift, heights, bar_width, label=method, color=palette[idx])

        # Write each value just above its bar.
        for bar, val in zip(bars, heights):
            anchor = (bar.get_x() + bar.get_width() / 2, bar.get_height())
            ax.annotate(
                f'{val:.2f}',
                xy=anchor,
                xytext=(0, 3),
                textcoords="offset points",
                ha='center', va='bottom',
                fontsize=8,
                rotation=45,
            )

    ax.set_xlabel('Metric')
    ax.set_ylabel('Score')
    ax.set_title('v22.1 Model Comparison: Retrieval Metrics')
    ax.set_xticks(positions)
    ax.set_xticklabels(tick_labels)
    ax.legend(loc='upper left', bbox_to_anchor=(1.02, 1))
    ax.set_ylim(0, 1.1)
    ax.grid(axis='y', alpha=0.3)

    plt.tight_layout()

    # Persist the chart before displaying it.
    fig_path = OUTPUT_DIR / "evaluation_comparison.png"
    plt.savefig(fig_path, dpi=150, bbox_inches='tight')
    print(f"Figure saved to: {fig_path}")

    plt.show()


# Nothing to draw when no method was evaluated.
if all_metrics:
    plot_metrics_comparison(all_metrics)

## 8. Per-Query Analysis (Error Debugging)

In [None]:
def analyze_failures(
    metrics: "EvaluationMetrics",
    samples: "List[EvalSample]",
    max_failures: int = 10,
) -> List[Dict[str, Any]]:
    """Report queries whose ground truth was not retrieved in the top 10.

    A query counts as a failure when its per-query ``recall@10`` is 0.

    Args:
        metrics: Evaluation output whose ``per_query_results`` are inspected.
        samples: Samples the metrics were computed on, indexed by ``query_id``.
        max_failures: Maximum number of failures printed in detail.

    Returns:
        One dict per failed query (all failures, not just the printed ones).
    """
    failures = []

    for result in metrics.per_query_results:
        if result["recall@10"] != 0:
            continue
        query_id = result["query_id"]
        sample = samples[query_id]
        failures.append({
            "query_id": query_id,
            "query": sample.query,
            "positive_doc": sample.positive_doc,
            "mrr": result["mrr"],
            "ground_truth_rank": result["ground_truth_rank"],
        })

    total = len(samples)
    print(f"\nTotal failures (not in top-10): {len(failures)}/{total}")
    if total:
        print(f"Failure rate: {len(failures)/total*100:.1f}%")
    else:
        # Avoid dividing by zero when nothing was evaluated.
        print("Failure rate: n/a (no samples)")

    if failures:
        print(f"\nSample failures (showing {min(max_failures, len(failures))}):")
        for i, f in enumerate(failures[:max_failures]):
            print(f"\n[{i+1}] Query: {f['query'][:80]}...")
            print(f"    Expected: {f['positive_doc'][:80]}...")
            print(f"    Actual rank: {f['ground_truth_rank']}")

    return failures


# Inspect the misses of whichever sparse model has the highest MRR.
if neural_sparse_metrics:
    best_model_name = max(
        neural_sparse_metrics,
        key=lambda version: neural_sparse_metrics[version].mrr,
    )
    print(f"\nAnalyzing failures for: Neural Sparse {best_model_name}")
    failures = analyze_failures(
        metrics=neural_sparse_metrics[best_model_name],
        samples=eval_samples,
    )

In [None]:
def analyze_difficulty_breakdown(
    metrics: "EvaluationMetrics",
    samples: "List[EvalSample]",
):
    """Print mean MRR and Recall@10 grouped by query length in characters.

    Buckets: short (< 20), medium (20-49) and long (>= 50). Empty buckets
    are omitted from the table.

    Args:
        metrics: Evaluation output whose ``per_query_results`` are grouped.
        samples: Samples the metrics were computed on, indexed by ``query_id``.
    """
    # Labels state the exact boundaries applied below.
    buckets: Dict[str, List[Dict]] = {
        "Short (<20)": [],
        "Medium (20-49)": [],
        "Long (>=50)": [],
    }

    for result in metrics.per_query_results:
        query_len = len(samples[result["query_id"]].query)

        if query_len < 20:
            label = "Short (<20)"
        elif query_len < 50:
            label = "Medium (20-49)"
        else:
            label = "Long (>=50)"
        buckets[label].append(result)

    print("\nPerformance by Query Length:")
    print("=" * 60)
    print(f"{'Category':<20} {'Count':>10} {'MRR':>10} {'R@10':>10}")
    print("-" * 60)

    for name, queries in buckets.items():
        if queries:
            avg_mrr = np.mean([q["mrr"] for q in queries])
            avg_recall = np.mean([q["recall@10"] for q in queries])
            print(f"{name:<20} {len(queries):>10} {avg_mrr:>10.4f} {avg_recall:>10.4f}")

    print("=" * 60)


# Break down the best sparse model's scores by query length.
if neural_sparse_metrics:
    analyze_difficulty_breakdown(
        metrics=neural_sparse_metrics[best_model_name],
        samples=eval_samples,
    )

## 9. Save Results

In [None]:
# Summarize the run (aggregates only, no per-query rows) and write it to JSON.
results_dict = {
    "evaluation_date": datetime.now().isoformat(),
    "num_samples": len(eval_samples),
    "corpus_size": len(corpus),
    "methods": {
        method: method_metrics.to_dict()
        for method, method_metrics in all_metrics.items()
    },
}

json_path = OUTPUT_DIR / "evaluation_results.json"
with json_path.open("w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII text readable in the file.
    json.dump(results_dict, f, indent=2, ensure_ascii=False)

print(f"Results saved to: {json_path}")

In [None]:
# Generate Markdown report
def generate_markdown_report(
    all_metrics: "Dict[str, EvaluationMetrics]",
    output_path: Path,
    num_samples: Optional[int] = None,
    corpus_size: Optional[int] = None,
):
    """Generate detailed markdown evaluation report.

    Args:
        all_metrics: Metrics per method, keyed by the label shown in the table.
        output_path: File the report is written to (UTF-8).
        num_samples: Number of evaluated queries to report. When omitted, the
            length of the notebook-level ``eval_samples`` is used.
        corpus_size: Number of corpus documents to report. When omitted, the
            length of the notebook-level ``corpus`` is used.

    Returns:
        The report text that was written.
    """
    # Explicit values keep the function usable outside the notebook session;
    # the globals remain the fallback so existing calls behave as before.
    if num_samples is None:
        num_samples = len(eval_samples)
    if corpus_size is None:
        corpus_size = len(corpus)

    lines = [
        "# v22.1 Model Evaluation Report",
        "",
        f"**Evaluation Date:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "",
        "## Summary",
        "",
        f"- **Evaluation Samples:** {num_samples}",
        f"- **Corpus Size:** {corpus_size} unique documents",
        "",
        "## Results",
        "",
        "| Method | NDCG@10 | Recall@1 | Recall@5 | Recall@10 | MRR |",
        "|--------|---------|----------|----------|-----------|-----|",
    ]

    # One table row per method, in dictionary order.
    for name, metrics in all_metrics.items():
        lines.append(
            f"| {name} | "
            f"{metrics.ndcg_at_10:.4f} | "
            f"{metrics.recall_at_1:.4f} | "
            f"{metrics.recall_at_5:.4f} | "
            f"{metrics.recall_at_10:.4f} | "
            f"{metrics.mrr:.4f} |"
        )

    lines.extend([
        "",
        "## Methodology",
        "",
        "### BM25",
        "- Traditional term-frequency based retrieval",
        "- Tokenization: whitespace + character bigrams",
        "",
        "### Semantic Search (BGE-M3)",
        "- Dense embedding model: BAAI/bge-m3",
        "- Similarity: cosine similarity",
        "",
        "### Neural Sparse",
        "- Base model: skt/A.X-Encoder-base",
        "- Architecture: SPLADE-style sparse retrieval",
        "- Similarity: dot product",
        "",
        "## Metrics Definition",
        "",
        "- **NDCG@10:** Normalized Discounted Cumulative Gain at 10",
        "- **Recall@K:** Fraction of relevant documents retrieved in top-K",
        "- **MRR:** Mean Reciprocal Rank (1/rank of first relevant document)",
        "",
    ])

    # Highlight the method with the highest MRR, if any method was evaluated.
    if all_metrics:
        best_name, best_metrics = max(all_metrics.items(), key=lambda x: x[1].mrr)
        lines.extend([
            "## Best Model",
            "",
            f"**{best_name}** achieved the highest MRR of {best_metrics.mrr:.4f}",
            "",
        ])

    report_text = "\n".join(lines)

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(report_text)

    print(f"Report saved to: {output_path}")
    return report_text


# Write the markdown report and echo its first 1500 characters.
report_path = OUTPUT_DIR / "evaluation_report.md"
report = generate_markdown_report(all_metrics, report_path)

_preview_rule = "=" * 60
print("\n" + _preview_rule)
print("Report Preview:")
print(_preview_rule)
print(report[:1500])

## 10. Final Summary

In [None]:
_summary_rule = "=" * 80

print("\n" + _summary_rule)
print("EVALUATION COMPLETE")
print(_summary_rule)

# Artifacts written by the earlier cells.
print("\nOutput Files:")
for _artifact in (
    "evaluation_results.json",
    "evaluation_report.md",
    "evaluation_comparison.png",
):
    print(f"  - {OUTPUT_DIR / _artifact}")

# Winner per headline metric (first method wins ties, as max() keeps the first).
print("\nKey Findings:")
if all_metrics:
    best_model = max(all_metrics.items(), key=lambda pair: pair[1].mrr)
    print(f"  - Best model by MRR: {best_model[0]} ({best_model[1].mrr:.4f})")

    best_ndcg = max(all_metrics.items(), key=lambda pair: pair[1].ndcg_at_10)
    print(f"  - Best model by NDCG@10: {best_ndcg[0]} ({best_ndcg[1].ndcg_at_10:.4f})")

    best_recall = max(all_metrics.items(), key=lambda pair: pair[1].recall_at_10)
    print(f"  - Best model by Recall@10: {best_recall[0]} ({best_recall[1].recall_at_10:.4f})")

print("\n" + _summary_rule)

## Appendix: Statistical Significance Testing (Optional)

In [None]:
from scipy import stats


def paired_t_test(
    metrics_a: "EvaluationMetrics",
    metrics_b: "EvaluationMetrics",
    metric_name: str = "mrr",
    alpha: float = 0.05,
) -> Dict:
    """
    Perform a paired t-test between two models on one per-query metric.

    Scores are read from ``per_query_results`` of each argument and paired by
    position. NOTE(review): this assumes both result lists are in the same
    query order - confirm against how the metrics objects are built.

    Args:
        metrics_a: Metrics of the first model (needs ``per_query_results``
            and ``method_name``).
        metrics_b: Metrics of the second model.
        metric_name: Key looked up in every per-query result entry.
        alpha: Significance level compared against the p-value.

    Returns:
        Dictionary with keys ``metric``, ``model_a``, ``model_b``, ``mean_a``,
        ``mean_b``, ``mean_diff``, ``t_statistic``, ``p_value`` and
        ``significant``. When the test cannot be run (different sample counts
        or fewer than two pairs) the same keys are still present, with NaN
        statistics and ``significant=False``, plus an ``error`` message, so
        callers can index the result without checking first.
    """
    scores_a = [r[metric_name] for r in metrics_a.per_query_results]
    scores_b = [r[metric_name] for r in metrics_b.per_query_results]

    nan = float("nan")
    # np.mean([]) warns and yields NaN; short-circuit to keep output quiet.
    mean_a = float(np.mean(scores_a)) if scores_a else nan
    mean_b = float(np.mean(scores_b)) if scores_b else nan

    result = {
        "metric": metric_name,
        "model_a": metrics_a.method_name,
        "model_b": metrics_b.method_name,
        "mean_a": mean_a,
        "mean_b": mean_b,
        "mean_diff": mean_a - mean_b,
        "t_statistic": nan,
        "p_value": nan,
        "significant": False,
    }

    if len(scores_a) != len(scores_b):
        result["error"] = "Different number of samples"
        return result

    if len(scores_a) < 2:
        # A paired t-test has n - 1 degrees of freedom, so one pair is not enough.
        result["error"] = "Need at least 2 paired samples"
        return result

    t_stat, p_value = stats.ttest_rel(scores_a, scores_b)

    # Plain Python scalars keep the result JSON-serialisable. A NaN p-value
    # (e.g. all differences are zero) compares False and is never significant.
    result["t_statistic"] = float(t_stat)
    result["p_value"] = float(p_value)
    result["significant"] = bool(p_value < alpha)
    return result


# Perform significance tests if we have multiple methods
if len(all_metrics) >= 2:
    from itertools import combinations

    print("\nStatistical Significance Tests (MRR):")
    print("=" * 70)

    # Every unordered pair of methods, in the insertion order of all_metrics.
    for metrics_a, metrics_b in combinations(all_metrics.values(), 2):
        result = paired_t_test(metrics_a, metrics_b, "mrr")

        # paired_t_test signals an unusable pair through an "error" entry; the
        # statistic keys may then be absent, so report the pair and move on
        # instead of indexing them.
        if "error" in result:
            print(
                f"{metrics_a.method_name} vs {metrics_b.method_name}: "
                f"skipped ({result['error']})"
            )
            continue

        sig_marker = "*" if result.get("significant", False) else ""
        print(
            f"{result['model_a']} vs {result['model_b']}: "
            f"diff={result['mean_diff']:.4f}, "
            f"p={result['p_value']:.4f}{sig_marker}"
        )

    print("\n* indicates statistically significant (p < 0.05)")
else:
    print("\nSkipping significance tests (need at least 2 methods).")

## 11. Amazon OpenSearch Service Evaluation (Production)

This section evaluates the models using Amazon OpenSearch Service.

**Authentication:** AWS default credentials via boto3 (IAM-based)

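**Requirements:**
- Environment variables: `OPENSEARCH_HOST` (required; this section is skipped when it is unset), `AWS_REGION` (defaults to `us-east-1`)
- Optional environment variables: `OPENSEARCH_PORT`, `OPENSEARCH_USE_SSL`, `OPENSEARCH_BM25_INDEX`, `OPENSEARCH_DENSE_INDEX`, `OPENSEARCH_SPARSE_INDEX`
- Python packages: `opensearch-py`, `requests-aws4auth`, `boto3`, `python-dotenv`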

In [None]:
# Amazon OpenSearch Service Client Setup
# Third-party packages used by this section: python-dotenv (reads the .env
# file) and boto3 (supplies the AWS credentials used to sign requests).
from dotenv import load_dotenv
import boto3

# Load environment variables from the .env file at the project root so the
# configuration lookups that follow can pick up the OpenSearch / AWS settings.
load_dotenv(PROJECT_ROOT / ".env")

# Connection settings, read from the process environment with defaults.
OPENSEARCH_HOST = os.environ.get("OPENSEARCH_HOST", "")
OPENSEARCH_PORT = int(os.environ.get("OPENSEARCH_PORT", "443"))
OPENSEARCH_REGION = os.environ.get("AWS_REGION", "us-east-1")
# Only the literal string "true" (any casing) enables SSL.
OPENSEARCH_USE_SSL = os.environ.get("OPENSEARCH_USE_SSL", "true").lower() == "true"

# Names of the benchmark indices, each overridable through the environment.
BM25_INDEX = os.environ.get("OPENSEARCH_BM25_INDEX", "benchmark-bm25-v22")
DENSE_INDEX = os.environ.get("OPENSEARCH_DENSE_INDEX", "benchmark-dense-v22")
SPARSE_INDEX = os.environ.get("OPENSEARCH_SPARSE_INDEX", "benchmark-sparse-v22")

# Configured means: a host is set and it is not the template placeholder value.
OPENSEARCH_CONFIGURED = OPENSEARCH_HOST not in (
    "",
    "your-domain.us-east-1.es.amazonaws.com",
)

print(
    "OpenSearch Configuration:",
    f"  Host: {OPENSEARCH_HOST or '(not configured)'}",
    f"  Port: {OPENSEARCH_PORT}",
    f"  Region: {OPENSEARCH_REGION}",
    f"  SSL: {OPENSEARCH_USE_SSL}",
    f"  Configured: {OPENSEARCH_CONFIGURED}",
    sep="\n",
)

In [None]:
class OpenSearchServiceClient:
    """
    Amazon OpenSearch Service client with AWS IAM authentication.

    Uses default AWS credentials from boto3.Session() - typically from:
    - Environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    - AWS credentials file (~/.aws/credentials)
    - IAM role (when running on EC2/Lambda)

    Failures are reported with ``print`` and turned into a neutral return
    value (``None`` client, ``False``, ``0`` or ``[]``) rather than raised, so
    notebook cells can keep running when the cluster is unavailable.
    """

    def __init__(
        self,
        host: str,
        port: int = 443,
        region: str = "us-east-1",
        use_ssl: bool = True,
    ):
        """
        Initialize OpenSearch client with AWS authentication.

        Args:
            host: OpenSearch endpoint host name.
            port: Endpoint port.
            region: AWS region used when signing requests.
            use_ssl: Whether to connect over SSL.

        ``self.client`` is ``None`` when the connection could not be set up.
        """
        self.host = host
        self.port = port
        self.region = region
        self.use_ssl = use_ssl
        self.client = self._create_client()

    def _create_client(self):
        """
        Create OpenSearch client with AWS4Auth.

        Returns:
            A connected ``OpenSearch`` client, or ``None`` when a required
            package is missing, no AWS credentials are found, or the
            connection test fails.
        """
        try:
            # boto3 is imported here as well so that a missing package gets
            # the same install hint as the other two dependencies.
            import boto3
            from opensearchpy import OpenSearch, RequestsHttpConnection
            from requests_aws4auth import AWS4Auth
        except ImportError as e:
            print(f"Missing required packages: {e}")
            print("Install with: pip install opensearch-py requests-aws4auth boto3")
            return None

        # Get AWS credentials from default credential chain
        credentials = boto3.Session().get_credentials()
        if not credentials:
            print("ERROR: No AWS credentials found. Configure AWS credentials first.")
            return None

        aws_auth = AWS4Auth(
            credentials.access_key,
            credentials.secret_key,
            self.region,
            "es",  # Service name for OpenSearch
            session_token=credentials.token,
        )

        client = OpenSearch(
            hosts=[{"host": self.host, "port": self.port}],
            http_auth=aws_auth,
            use_ssl=self.use_ssl,
            verify_certs=True,
            connection_class=RequestsHttpConnection,
            timeout=120,
        )

        # Test connection
        try:
            info = client.info()
            print(f"Connected to OpenSearch: {info['version']['distribution']} {info['version']['number']}")
        except Exception as e:
            print(f"Connection test failed: {e}")
            return None

        return client

    def _create_index(self, index_name: str, body: Dict, kind: str) -> bool:
        """Create index_name with body unless it exists; kind only labels the log messages."""
        if not self.client:
            return False

        if self.client.indices.exists(index=index_name):
            print(f"Index {index_name} already exists.")
            return True

        try:
            self.client.indices.create(index=index_name, body=body)
            print(f"Created {kind}: {index_name}")
            return True
        except Exception as e:
            print(f"Failed to create {kind} {index_name}: {e}")
            return False

    def _run_search(
        self,
        index_name: str,
        query: Dict,
        top_k: int,
        error_label: str,
    ) -> List[Tuple[str, float]]:
        """Run query against index_name and return (doc_id, score) pairs; [] on any error."""
        body = {"query": query, "size": top_k}
        try:
            response = self.client.search(index=index_name, body=body)
            hits = response["hits"]["hits"]
            return [(hit["_source"]["doc_id"], hit["_score"]) for hit in hits]
        except Exception as e:
            print(f"{error_label}: {e}")
            return []

    def create_bm25_index(self, index_name: str) -> bool:
        """
        Create BM25 index with Korean analyzer.

        Returns:
            True when the index exists afterwards (created now or already
            present), False when there is no client or creation failed.
        """
        body = {
            "settings": {
                "analysis": {
                    "analyzer": {
                        "korean_analyzer": {
                            "type": "custom",
                            "tokenizer": "nori_tokenizer",
                        }
                    }
                },
                "number_of_shards": 2,
                "number_of_replicas": 1,
            },
            "mappings": {
                "properties": {
                    "doc_id": {"type": "keyword"},
                    "content": {"type": "text", "analyzer": "korean_analyzer"},
                }
            },
        }
        return self._create_index(index_name, body, "index")

    def create_sparse_index(self, index_name: str, dimension: int = 50000) -> bool:
        """
        Create sparse vector index for Neural Sparse.

        Args:
            index_name: Name of the index to create.
            dimension: Unused. The ``rank_features`` mapping below takes no
                size, so the parameter is kept only for call compatibility.

        Returns:
            True when the index exists afterwards, False otherwise.
        """
        body = {
            "settings": {
                "number_of_shards": 2,
                "number_of_replicas": 1,
            },
            "mappings": {
                "properties": {
                    "doc_id": {"type": "keyword"},
                    "content": {"type": "text"},
                    "sparse_embedding": {
                        "type": "rank_features",
                    },
                }
            },
        }
        return self._create_index(index_name, body, "sparse index")

    def index_documents(
        self,
        index_name: str,
        documents: List[Dict],
        batch_size: int = 100,
    ) -> int:
        """
        Bulk index documents.

        Each document is stored under its ``doc_id`` value, falling back to
        its position in ``documents`` when that key is missing.

        Args:
            index_name: Target index.
            documents: Documents to send as ``_source``.
            batch_size: Number of documents per bulk request.

        Returns:
            Number of documents the cluster reported as indexed; 0 when there
            is no client.
        """
        if not self.client:
            return 0

        from opensearchpy.helpers import bulk

        indexed = 0
        for start in tqdm(range(0, len(documents), batch_size), desc=f"Indexing to {index_name}"):
            actions = [
                {
                    "_index": index_name,
                    "_id": doc.get("doc_id", start + offset),
                    "_source": doc,
                }
                for offset, doc in enumerate(documents[start:start + batch_size])
            ]

            try:
                # raise_on_error=False returns per-document failures instead
                # of raising, so documents that did succeed in a partly failed
                # batch are still counted.
                success, errors = bulk(self.client, actions, raise_on_error=False)
            except Exception as e:
                print(f"Bulk indexing error: {e}")
                continue

            indexed += success
            if errors:
                print(
                    f"Bulk indexing: {len(errors)} document(s) failed in batch "
                    f"starting at {start}; first error: {errors[0]}"
                )

        # Refresh so the new documents are searchable right away. A failure
        # here is reported like every other error in this class rather than
        # discarding the count gathered above.
        try:
            self.client.indices.refresh(index=index_name)
        except Exception as e:
            print(f"Failed to refresh index {index_name}: {e}")
        return indexed

    def search_bm25(
        self,
        index_name: str,
        query: str,
        top_k: int = 10,
    ) -> List[Tuple[str, float]]:
        """
        BM25 search.

        Runs a ``match`` query on the ``content`` field.

        Returns:
            Up to ``top_k`` ``(doc_id, score)`` pairs in the order returned by
            the cluster; ``[]`` when there is no client or the search fails.
        """
        if not self.client:
            return []

        return self._run_search(
            index_name,
            {"match": {"content": query}},
            top_k,
            "Search error",
        )

    def search_sparse(
        self,
        index_name: str,
        sparse_vector: Dict[str, float],
        top_k: int = 10,
    ) -> List[Tuple[str, float]]:
        """
        Sparse vector search using rank_features.

        Builds one ``rank_feature`` clause per entry of ``sparse_vector``,
        boosted by that entry's weight, and combines them with ``should``.

        Returns:
            Up to ``top_k`` ``(doc_id, score)`` pairs; ``[]`` when there is no
            client, the vector is empty, or the search fails.
        """
        if not self.client:
            return []

        if not sparse_vector:
            # An empty `should` list carries no ranking signal, so any hits
            # would be arbitrary; return nothing without a round-trip.
            return []

        # Build rank_feature query
        should_clauses = [
            {"rank_feature": {"field": f"sparse_embedding.{token}", "boost": weight}}
            for token, weight in sparse_vector.items()
        ]

        return self._run_search(
            index_name,
            {"bool": {"should": should_clauses}},
            top_k,
            "Sparse search error",
        )


# Build the client only when a real host is configured. os_client is bound to
# None first, so it is always defined for the cells that follow.
os_client = None
if not OPENSEARCH_CONFIGURED:
    print("\nOpenSearch not configured. Set OPENSEARCH_HOST in .env to enable.")
    print("Skipping OpenSearch evaluation.")
else:
    print("\nInitializing Amazon OpenSearch Service client...")
    os_client = OpenSearchServiceClient(
        host=OPENSEARCH_HOST,
        port=OPENSEARCH_PORT,
        region=OPENSEARCH_REGION,
        use_ssl=OPENSEARCH_USE_SSL,
    )

In [None]:
# Index corpus to Amazon OpenSearch Service and run evaluation


def _encode_sparse_dict(text: str, sparse_model, min_weight: float = 0.01) -> Dict[str, float]:
    """
    Encode one text with a sparse model and keep only weights above min_weight.

    Uses the notebook-level ``tokenizer`` and ``DEVICE``.

    Args:
        text: Document or query text.
        sparse_model: Callable taking ``(input_ids, attention_mask)`` and
            returning a pair whose first element is the sparse representation.
        min_weight: Entries at or below this value are dropped.

    Returns:
        Mapping from position in the model's output vector (as a string) to
        its weight. NOTE(review): positions are presumably tokenizer
        vocabulary ids - confirm against the model definition.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

    with torch.no_grad():
        sparse_repr, _ = sparse_model(inputs["input_ids"], inputs["attention_mask"])

    # One text per call, so row 0 is the only row of the batch.
    sparse_vec = sparse_repr[0].cpu()
    nonzero_indices = torch.nonzero(sparse_vec > min_weight).squeeze(-1)
    return {str(idx.item()): float(sparse_vec[idx]) for idx in nonzero_indices}


if os_client and os_client.client:
    print("\n" + "=" * 60)
    print("Amazon OpenSearch Service Evaluation")
    print("=" * 60)

    # 1. Create BM25 index and index documents
    print("\n[1/3] Setting up BM25 index...")
    if os_client.create_bm25_index(BM25_INDEX):
        # Document ids are corpus positions, stored as strings.
        bm25_docs = [
            {"doc_id": str(i), "content": doc}
            for i, doc in enumerate(corpus)
        ]
        indexed = os_client.index_documents(BM25_INDEX, bm25_docs)
        print(f"Indexed {indexed} documents to {BM25_INDEX}")

    # 2. Create Sparse index and index documents
    print("\n[2/3] Setting up Neural Sparse index...")
    # Set only after the sparse index exists and the corpus has been sent.
    # The query loop checks this flag instead of neural_sparse_models, so it
    # never uses an unbound (or stale) `model` when index setup failed.
    sparse_ready = False
    if neural_sparse_models and os_client.create_sparse_index(SPARSE_INDEX):
        # Takes the first model in neural_sparse_models (insertion order);
        # no quality comparison is made here.
        best_model_key = next(iter(neural_sparse_models))
        model = neural_sparse_models[best_model_key]

        # Encode corpus to sparse vectors
        print(f"Encoding corpus with {best_model_key}...")
        sparse_docs = [
            {
                "doc_id": str(i),
                "content": doc,
                "sparse_embedding": _encode_sparse_dict(doc, model),
            }
            for i, doc in enumerate(tqdm(corpus, desc="Encoding corpus"))
        ]

        indexed = os_client.index_documents(SPARSE_INDEX, sparse_docs)
        print(f"Indexed {indexed} sparse documents to {SPARSE_INDEX}")
        sparse_ready = True

    # 3. Evaluate on OpenSearch
    print("\n[3/3] Running evaluation on OpenSearch...")

    os_bm25_results = []
    os_sparse_results = []

    for i, sample in enumerate(tqdm(eval_samples, desc="OpenSearch Eval")):
        # BM25 search; ids come back as strings and are converted to the
        # integer corpus positions used for ground truth.
        bm25_hits = os_client.search_bm25(BM25_INDEX, sample.query, top_k=10)
        bm25_doc_ids = [int(doc_id) for doc_id, _ in bm25_hits]
        bm25_scores = [score for _, score in bm25_hits]

        os_bm25_results.append(RetrievalResult(
            query_id=i,
            retrieved_doc_ids=bm25_doc_ids,
            scores=bm25_scores,
            ground_truth_id=sample.doc_id,
        ))

        # Sparse search (only if the sparse index was prepared above)
        if sparse_ready:
            query_sparse_dict = _encode_sparse_dict(sample.query, model)

            sparse_hits = os_client.search_sparse(SPARSE_INDEX, query_sparse_dict, top_k=10)
            sparse_doc_ids = [int(doc_id) for doc_id, _ in sparse_hits]
            sparse_scores = [score for _, score in sparse_hits]

            os_sparse_results.append(RetrievalResult(
                query_id=i,
                retrieved_doc_ids=sparse_doc_ids,
                scores=sparse_scores,
                ground_truth_id=sample.doc_id,
            ))

    # Compute metrics for OpenSearch results
    os_bm25_metrics = MetricsCalculator.compute_all_metrics(os_bm25_results, "OpenSearch BM25 (Nori)")

    print(f"\nOpenSearch BM25 (Nori) Results:")
    print(f"  NDCG@10: {os_bm25_metrics.ndcg_at_10:.4f}")
    print(f"  Recall@1: {os_bm25_metrics.recall_at_1:.4f}")
    print(f"  Recall@10: {os_bm25_metrics.recall_at_10:.4f}")
    print(f"  MRR: {os_bm25_metrics.mrr:.4f}")

    # Record BM25 whether or not a sparse run exists, so the baseline is kept
    # in all_metrics when no sparse model is available.
    all_metrics["OpenSearch BM25 (Nori)"] = os_bm25_metrics

    if os_sparse_results:
        os_sparse_metrics = MetricsCalculator.compute_all_metrics(os_sparse_results, "OpenSearch Neural Sparse")

        print(f"\nOpenSearch Neural Sparse Results:")
        print(f"  NDCG@10: {os_sparse_metrics.ndcg_at_10:.4f}")
        print(f"  Recall@1: {os_sparse_metrics.recall_at_1:.4f}")
        print(f"  Recall@10: {os_sparse_metrics.recall_at_10:.4f}")
        print(f"  MRR: {os_sparse_metrics.mrr:.4f}")

        # Add to all_metrics for comparison
        all_metrics["OpenSearch Neural Sparse"] = os_sparse_metrics

    print("\n" + "=" * 60)
    print("OpenSearch Evaluation Complete")
    print("=" * 60)
else:
    print("\nSkipping OpenSearch evaluation (not configured or connection failed).")