# Scientific Claim Retrieval: Complete Evaluation Workflow

This notebook:
1. Evaluates the models enabled in `DEV_CONFIG['models']` on the dev set
2. Identifies the best performing model
3. Generates test set predictions using the best model

In [2]:
# Import required libraries
from evaluator import evaluate_models
import json
from datetime import datetime

## Step 1: Define Full Configuration for All Models

Available Models:
```
    'bm25': BM25Retriever,
    'enhanced_bm25': EnhancedBM25Retriever,
    'tfidf': TfidfRetriever,

    'dense': DenseRetriever,
    'neural_rerank': NeuralReranker,
    'hybrid_rerank': HybridNeuralReranker,

    'langchain_rag': LangChainRAGRetriever,
    'langchain_reranker': LangChainRerankerRetriever,
    'langchain_query_expansion': LangChainQueryExpansionRetriever,

    'hybrid_retriever': HybridRetriever
```

In [None]:
# Settings for the dev-set run; the whole mapping is passed to evaluate_models().
DEV_CONFIG = dict(
    # --- Input data ---
    collection_path='data/subtask4b_collection_data.pkl',
    query_path='data/subtask4b_query_tweets_dev.tsv',

    # --- Models evaluated in this run ---
    # Currently switched off: bm25, enhanced_bm25, tfidf, dense,
    # neural_rerank, hybrid_rerank, hybrid_retriever
    models=['langchain_rag', 'langchain_reranker'],

    # --- Output: one timestamped directory per execution of this cell ---
    output_dir=f'results/dev_{datetime.now().strftime("%Y%m%d_%H%M%S")}',

    # --- Evaluation ---
    top_k=5,
    mrr_k=[1, 5, 10],  # cutoffs reported as MRR@k in the results cell
    collection_columns=['title', 'abstract', 'authors', 'journal', 'publish_time'],

    # --- Model identifiers ---
    embedding_model='sentence-transformers/allenai-specter',
    vectordb_model='nomic-embed-text',  # alternative: 'all-minilm'
    reranker_model='cross-encoder/ms-marco-MiniLM-L-6-v2',  # alternative: 'BAAI/bge-reranker-base'

    # --- Hybrid retrieval ---
    rrf_k=60,           # Reciprocal Rank Fusion constant
    sparse_weight=0.6,  # weight of the sparse retriever in the hybrid

    # --- Performance ---
    candidate_count=75,
    batch_size=32,
    reranker_batch_size=8,
    use_gpu=True,

    # --- Sampling (None = use the full datasets) ---
    sample_size=150,
    collection_sample_size=1000,

    # --- Cache ---
    cache_dir='cache',

    show_progress=False,
)

## Step 2: Evaluate All Models on Dev Set

In [6]:
# Run evaluation: pass the dev configuration to evaluate_models() and keep the returned
# mapping; later cells read its 'metrics' and 'output_dir' entries.
dev_results = evaluate_models(DEV_CONFIG)

# Report the output location returned by the run.
print(f"\nEvaluation completed! Results saved to: {dev_results['output_dir']}")

2025-05-29 13:50:58,478 - INFO - Running langchain_rag...
Creating documents: 100%|██████████| 1000/1000 [00:00<00:00, 40926.43it/s]
  retrieved_docs = self.base_retriever.get_relevant_documents(query_text)[:top_k]
Processing langchain_rag: 100%|██████████| 150/150 [00:02<00:00, 58.97it/s]
2025-05-29 13:51:19,210 - INFO - langchain_rag MRR@5: 0.7014
2025-05-29 13:51:19,212 - INFO - Running langchain_reranker...
Creating documents: 100%|██████████| 1000/1000 [00:00<00:00, 34781.52it/s]
Processing langchain_reranker: 100%|██████████| 150/150 [02:24<00:00,  1.04it/s]
2025-05-29 13:54:08,329 - INFO - langchain_reranker MRR@5: 0.7351



Evaluation completed! Results saved to: results/dev_20250529_135056


## Step 3: Display Results and Find Best Model

In [None]:
# Display results for all models and select the winner by MRR@5.
#
# This cell defines `best_model` and `best_score` for the cells that follow. Both are reset
# to None up front so that they are defined on every path, and so that re-running the cell
# on a run without metrics cannot leave a stale winner from an earlier execution in the kernel.
best_model = None
best_score = None

print("=== Dev Set Evaluation Results ===")

if dev_results['metrics']:
    for model_name, metrics in dev_results['metrics'].items():
        print(f"\n{model_name}:")
        # Each model's metrics are indexed by the integer cutoff k.
        for cutoff in (1, 5, 10):
            print(f"  MRR@{cutoff}: {metrics[cutoff]:.4f}")
    
    # Find best model based on MRR@5 (on a tie, max() keeps the first model encountered)
    best_model = max(dev_results['metrics'].items(), key=lambda x: x[1][5])[0]
    best_score = dev_results['metrics'][best_model][5]
    
    print("\n" + "=" * 30)
    print(f"Best model: {best_model} (MRR@5: {best_score:.4f})")
    print("=" * 30)
else:
    # No metrics to rank: best_model / best_score stay None.
    print("No evaluation metrics available (test set mode)")

## Step 4: Create Configuration for Best Model

In [None]:
# Build the test-set configuration around the best dev model.
#
# Shared settings are copied from DEV_CONFIG; model-specific settings are then added
# depending on which model won. The dev sampling keys ('sample_size',
# 'collection_sample_size') are not copied, exactly as before.

# Fail early with a clear message instead of a TypeError on the substring checks below.
if not best_model:
    raise ValueError("No best model selected; run the dev evaluation cell first.")

print(f"Creating test configuration for model: {best_model}")

# Start with the base configuration
TEST_CONFIG = {
    # Update paths for test set
    'collection_path': DEV_CONFIG['collection_path'],
    'query_path': 'data/subtask4b_query_tweets_test.tsv',  # Test set for final submission

    # Use only the best model
    'models': [best_model],
    
    # New output directory for test predictions
    'output_dir': f'results/test_{best_model}_{datetime.now().strftime("%Y%m%d_%H%M%S")}',
    
    # Copy relevant settings from dev config
    'top_k': DEV_CONFIG['top_k'],
    'collection_columns': DEV_CONFIG['collection_columns'],
    'cache_dir': DEV_CONFIG['cache_dir'],
    'batch_size': DEV_CONFIG['batch_size'],
    'use_gpu': DEV_CONFIG['use_gpu'],
}

# Add model-specific settings based on the best model type
if 'langchain' in best_model:
    TEST_CONFIG['langchain_embedding'] = DEV_CONFIG['vectordb_model']
    # The dev run supplied this value under 'vectordb_model'; forward it under that key too
    # so the test run sees the same setting the dev run saw.
    # NOTE(review): which of the two keys the evaluator reads is not visible here - confirm.
    TEST_CONFIG['vectordb_model'] = DEV_CONFIG['vectordb_model']
    TEST_CONFIG['candidate_count'] = DEV_CONFIG['candidate_count']
    
    if 'reranker' in best_model:
        TEST_CONFIG['reranker_model'] = DEV_CONFIG['reranker_model']
        TEST_CONFIG['reranker_batch_size'] = DEV_CONFIG['reranker_batch_size']
    
    # DEV_CONFIG defines no 'sample_for_expansion' entry, so an unconditional lookup raised
    # KeyError whenever the query-expansion model won. Copy it only when it is present.
    if 'query_expansion' in best_model and 'sample_for_expansion' in DEV_CONFIG:
        TEST_CONFIG['sample_for_expansion'] = DEV_CONFIG['sample_for_expansion']

elif best_model == 'dense':
    TEST_CONFIG['embedding_model'] = DEV_CONFIG['embedding_model']

elif best_model == 'neural_rerank':
    TEST_CONFIG['reranker_model'] = DEV_CONFIG['reranker_model']
    TEST_CONFIG['reranker_batch_size'] = DEV_CONFIG['reranker_batch_size']
    TEST_CONFIG['candidate_count'] = DEV_CONFIG['candidate_count']

elif best_model in ('hybrid_rerank', 'hybrid_retriever'):
    # These names previously received no model-specific settings at all. Forward the dev
    # values so the test run is configured like the run that selected the model.
    # NOTE(review): which of these keys each hybrid model reads is not visible in this
    # notebook - confirm against the evaluator.
    for setting in ('embedding_model', 'candidate_count', 'rrf_k', 'sparse_weight'):
        TEST_CONFIG[setting] = DEV_CONFIG[setting]
    if best_model == 'hybrid_rerank':
        TEST_CONFIG['reranker_model'] = DEV_CONFIG['reranker_model']
        TEST_CONFIG['reranker_batch_size'] = DEV_CONFIG['reranker_batch_size']

print("\nTest configuration created:")
print(json.dumps(TEST_CONFIG, indent=2))

## Step 5: Generate Test Set Predictions

In [None]:
# Announce which model produces the test-set predictions.
print(f"Generating test predictions using {best_model}...")

# Same entry point as the dev run, this time driven by TEST_CONFIG.
test_results = evaluate_models(TEST_CONFIG)

# Both closing lines are emitted by one call; `sep` supplies the line break between them.
print(
    "\nTest predictions completed!",
    f"Prediction file saved to: {test_results['output_dir']}",
    sep="\n",
)

## Step 6: Save Complete Results Summary

In [None]:
# Collect the outcome of both runs (dev comparison and test predictions) in one record.
summary = dict(
    evaluation_date=datetime.now().isoformat(),
    dev_results=dict(
        metrics=dev_results['metrics'],
        best_model=best_model,
        best_score=best_score,
        output_dir=dev_results['output_dir'],
    ),
    test_results=dict(
        model_used=best_model,
        output_dir=test_results['output_dir'],
        config=TEST_CONFIG,
    ),
)

# Write the record as indented JSON under a timestamped file name.
summary_file = f'results/evaluation_summary_{datetime.now().strftime("%Y%m%d_%H%M%S")}.json'
with open(summary_file, 'w') as summary_fh:
    json.dump(summary, summary_fh, indent=2)

print(f"\nComplete evaluation summary saved to: {summary_file}")

# Closing recap of the three workflow stages.
print("\n=== EVALUATION COMPLETE ===")
evaluated_count = len(dev_results['metrics'])
print(f"1. Evaluated {evaluated_count} models on dev set")
print(f"2. Best model: {best_model} (MRR@5: {best_score:.4f})")
print("3. Test predictions generated and saved")
print("\nAll results are in the 'results' directory.")