# Scientific Claim Retrieval: Complete Evaluation Workflow

This notebook:
1. Evaluates all models on the dev set
2. Identifies the best performing model
3. Generates test set predictions using the best model

In [1]:
# Import required libraries
from evaluator import evaluate_models
import json
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


## Step 1: Define Full Configuration for All Models

Available Models:
```
        # Traditional methods
        'bm25',              # Basic BM25
        'tfidf',             # Basic TF-IDF
        
        # Representation learning  
        'custom_retriever',          # Dense semantic embeddings
        'vector_store',      # Vector database retrieval
        
        # Reranking methods
        'bm25_reranker',     # BM25 + neural reranking
        'tfidf_reranker',    # TF-IDF + neural reranking
        'custom_retriever_reranker', # Two-stage with neural reranking
        'vector_store_reranker', # Vector store + reranking
        
        # Hybrid methods
        'multi_stage_hybrid',      # retrieve, fuse, rerank
        
        # Query expansion
        'query_expansion'    # LLM-enhanced queries
```

In [2]:
DEV_CONFIG = {
    # Data paths
    'collection_path': 'data/subtask4b_collection_data.pkl',
    'query_path': 'data/subtask4b_query_tweets_dev.tsv',
    
    # Available model categories and names:
    'models': [
        # Traditional methods
        'bm25',              # Basic BM25
        'tfidf',             # Basic TF-IDF
        
        # Representation learning  
        'custom_retriever',          # Dense semantic embeddings
        'vector_store',      # Vector database retrieval
        
        # Reranking methods
        'bm25_reranker',     # BM25 + neural reranking
        'tfidf_reranker',    # TF-IDF + neural reranking
        'custom_retriever_reranker', # Two-stage with neural reranking
        'vector_store_reranker', # Vector store + reranking
        
        # Hybrid methods
        'multi_stage_hybrid',      # retrieve, fuse, rerank
        
        # Query expansion
        'query_expansion'    # LLM-enhanced queries
    ], 
    
    # Output settings
    'output_dir': f'results/eval_{datetime.now().strftime("%Y%m%d_%H%M%S")}',
    
    # Model configuration
    'top_k': 5,
    'mrr_k': [1, 5, 10],
    'collection_columns': ['title', 'abstract'],
    
    # Model-specific settings
    'embedding_model': 'sentence-transformers/allenai-specter',
    'vectordb_model': 'all-minilm', # 'nomic-embed-text', #  'all-minilm', // momic performs worse
    'reranker_model': 'cross-encoder/ms-marco-MiniLM-L-6-v2', # 'BAAI/bge-reranker-base', #
    
    # Hybrid settings
    'rrf_k': 60,
    'sparse_weight': 0.6,
    
    # Performance settings
    'candidate_count': 50,
    'batch_size': 32,
    'reranker_batch_size': 8,
    'use_gpu': True,

    # Sampling for testing
    'sample_size': None, # tweets
    'collection_sample_size': None, # abstracts
    
    'cache_dir': 'cache',
    'show_progress': False,
}

## Step 2: Evaluate All Models on Dev Set

In [3]:
# Run evaluation
dev_results = evaluate_models(DEV_CONFIG)

print(f"\nEvaluation completed! Results saved to: {dev_results['output_dir']}")

2025-05-30 12:19:59,829 - INFO - Running bm25...
Processing bm25: 100%|██████████| 1400/1400 [00:40<00:00, 34.92it/s]
2025-05-30 12:20:41,180 - INFO - bm25 MRR@5: 0.5590
2025-05-30 12:20:41,184 - INFO - Running tfidf...
Processing tfidf: 100%|██████████| 1400/1400 [00:08<00:00, 159.25it/s]
2025-05-30 12:20:52,803 - INFO - tfidf MRR@5: 0.5092
2025-05-30 12:20:52,806 - INFO - Running custom_retriever...
Processing custom_retriever: 100%|██████████| 1400/1400 [00:26<00:00, 52.38it/s]
2025-05-30 12:24:50,521 - INFO - custom_retriever MRR@5: 0.3252
2025-05-30 12:24:50,524 - INFO - Running vector_store...
Creating documents: 100%|██████████| 7718/7718 [00:00<00:00, 47741.27it/s]
Processing vector_store: 100%|██████████| 1400/1400 [00:19<00:00, 70.02it/s]
2025-05-30 12:26:17,841 - INFO - vector_store MRR@5: 0.4847
2025-05-30 12:26:17,845 - INFO - Running bm25_reranker...
Processing bm25_reranker: 100%|██████████| 1400/1400 [11:09<00:00,  2.09it/s]
2025-05-30 12:37:39,703 - INFO - bm25_reranke

Hybrid retriever using device: mps
Building hybrid retrieval indices...
Building sparse index (BM25)...
Building dense embeddings...
Building FAISS index...


Processing multi_stage_hybrid: 100%|██████████| 1400/1400 [10:57<00:00,  2.13it/s]
2025-05-30 13:32:06,508 - INFO - multi_stage_hybrid MRR@5: 0.6209
2025-05-30 13:32:06,512 - INFO - Running query_expansion...
Creating documents: 100%|██████████| 7718/7718 [00:00<00:00, 46389.20it/s]
Processing query_expansion: 100%|██████████| 1400/1400 [42:44<00:00,  1.83s/it]
2025-05-30 14:15:59,298 - INFO - query_expansion MRR@5: 0.4920



Evaluation completed! Results saved to: results/eval_20250530_121959


## Step 3: Display Results and Find Best Model

In [4]:
# Display results for all models
print("=== Dev Set Evaluation Results ===")

if dev_results['metrics']:
    for model_name, metrics in dev_results['metrics'].items():
        print(f"\n{model_name}:")
        print(f"  MRR@1: {metrics[1]:.4f}")
        print(f"  MRR@5: {metrics[5]:.4f}")
        print(f"  MRR@10: {metrics[10]:.4f}")
    
    # Find best model based on MRR@5
    best_model = max(dev_results['metrics'].items(), key=lambda x: x[1][5])[0]
    best_score = dev_results['metrics'][best_model][5]
    
    print("\n" + "=" * 30)
    print(f"Best model: {best_model} (MRR@5: {best_score:.4f})")
    print("=" * 30)
else:
    print("No evaluation metrics available (test set mode)")

=== Dev Set Evaluation Results ===

bm25:
  MRR@1: 0.5114
  MRR@5: 0.5590
  MRR@10: 0.5590

tfidf:
  MRR@1: 0.4407
  MRR@5: 0.5092
  MRR@10: 0.5092

custom_retriever:
  MRR@1: 0.2671
  MRR@5: 0.3252
  MRR@10: 0.3252

vector_store:
  MRR@1: 0.4100
  MRR@5: 0.4847
  MRR@10: 0.4847

bm25_reranker:
  MRR@1: 0.5579
  MRR@5: 0.6069
  MRR@10: 0.6069

tfidf_reranker:
  MRR@1: 0.5636
  MRR@5: 0.6189
  MRR@10: 0.6189

custom_retriever_reranker:
  MRR@1: 0.5236
  MRR@5: 0.5594
  MRR@10: 0.5594

vector_store_reranker:
  MRR@1: 0.5564
  MRR@5: 0.5820
  MRR@10: 0.5820

multi_stage_hybrid:
  MRR@1: 0.5721
  MRR@5: 0.6209
  MRR@10: 0.6209

query_expansion:
  MRR@1: 0.4221
  MRR@5: 0.4920
  MRR@10: 0.4920

Best model: multi_stage_hybrid (MRR@5: 0.6209)


## Step 4: Create Configuration for Best Model

In [None]:
# Extract relevant configuration parameters for the best model
print(f"Creating test configuration for model: {best_model}")

# Start with the base configuration
TEST_CONFIG = {

    'collection_path': DEV_CONFIG['collection_path'],
    'query_path': 'data/subtask4b_query_tweets_test.tsv',  # Test set for final submission

    # Use only the best model
    'models': [best_model],

    'output_dir': f'results/test_{best_model}_{datetime.now().strftime("%Y%m%d_%H%M%S")}',
    
    # Copy relevant settings from dev config
    'top_k': DEV_CONFIG['top_k'],
    'collection_columns': DEV_CONFIG['collection_columns'],
    'cache_dir': DEV_CONFIG['cache_dir'],
    'batch_size': DEV_CONFIG['batch_size'],
    'use_gpu': DEV_CONFIG['use_gpu'],
}

# Add settings based on the best model type
if 'langchain' in best_model:
    TEST_CONFIG['langchain_embedding'] = DEV_CONFIG['vectordb_model']
    TEST_CONFIG['candidate_count'] = DEV_CONFIG['candidate_count']
    
    if 'reranker' in best_model:
        TEST_CONFIG['reranker_model'] = DEV_CONFIG['reranker_model']
        TEST_CONFIG['reranker_batch_size'] = DEV_CONFIG['reranker_batch_size']
    
    if 'query_expansion' in best_model:
        TEST_CONFIG['sample_for_expansion'] = DEV_CONFIG['sample_for_expansion']

elif best_model == 'dense':
    TEST_CONFIG['embedding_model'] = DEV_CONFIG['embedding_model']

elif best_model == 'neural_rerank':
    TEST_CONFIG['reranker_model'] = DEV_CONFIG['reranker_model']
    TEST_CONFIG['reranker_batch_size'] = DEV_CONFIG['reranker_batch_size']
    TEST_CONFIG['candidate_count'] = DEV_CONFIG['candidate_count']

print("\nTest configuration created:")
print(json.dumps(TEST_CONFIG, indent=2))

Creating test configuration for model: multi_stage_hybrid

Test configuration created:
{
  "collection_path": "data/subtask4b_collection_data.pkl",
  "query_path": "data/subtask4b_query_tweets_test.tsv",
  "models": [
    "multi_stage_hybrid"
  ],
  "output_dir": "results/test_multi_stage_hybrid_20250530_141559",
  "top_k": 5,
  "collection_columns": [
    "title",
    "abstract"
  ],
  "cache_dir": "cache",
  "batch_size": 32,
  "use_gpu": true
}


## Step 5: Generate Test Set Predictions

In [6]:
print(f"Generating test predictions using {best_model}...")

# Run prediction on test set
test_results = evaluate_models(TEST_CONFIG)

print("\nTest predictions completed!")
print(f"Prediction file saved to: {test_results['output_dir']}")

2025-05-30 14:15:59,344 - INFO - Running multi_stage_hybrid...


Generating test predictions using multi_stage_hybrid...
Hybrid retriever using device: mps


Processing multi_stage_hybrid: 100%|██████████| 1446/1446 [23:12<00:00,  1.04it/s]


Test predictions completed!
Prediction file saved to: results/test_multi_stage_hybrid_20250530_141559





## Step 6: Save Complete Results Summary

In [7]:
# Create summary of the entire evaluation process
summary = {
    'evaluation_date': datetime.now().isoformat(),
    'dev_results': {
        'metrics': dev_results['metrics'],
        'best_model': best_model,
        'best_score': best_score,
        'output_dir': dev_results['output_dir']
    },
    'test_results': {
        'model_used': best_model,
        'output_dir': test_results['output_dir'],
        'config': TEST_CONFIG
    }
}

summary_file = f'results/evaluation_summary_{datetime.now().strftime("%Y%m%d_%H%M%S")}.json'
with open(summary_file, 'w') as f:
    json.dump(summary, f, indent=2)

print(f"\nComplete evaluation summary saved to: {summary_file}")

print("\n=== EVALUATION COMPLETE ===")
print(f"1. Evaluated {len(dev_results['metrics'])} models on dev set")
print(f"2. Best model: {best_model} (MRR@5: {best_score:.4f})")
print("3. Test predictions generated and saved")
print("\nAll results are in the 'results' directory.")


Complete evaluation summary saved to: results/evaluation_summary_20250530_143926.json

=== EVALUATION COMPLETE ===
1. Evaluated 10 models on dev set
2. Best model: multi_stage_hybrid (MRR@5: 0.6209)
3. Test predictions generated and saved

All results are in the 'results' directory.
