# Scientific Claim Retrieval: Complete Evaluation Workflow

This notebook:
1. Evaluates all models on the dev set
2. Identifies the best performing model
3. Generates test set predictions using the best model

In [1]:
# Import required libraries
from evaluator import evaluate_models
import json
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


## Step 1: Define Full Configuration for All Models

Available Models:
```
    'bm25': BM25Retriever,
    'enhanced_bm25': EnhancedBM25Retriever,
    'tfidf': TfidfRetriever,
    'enhanced_tfidf': EnhancedTfidfRetriever,
    'hybrid_tfidf_bm25': HybridTfidfBM25Retriever,

    'dense': DenseRetriever,
    'neural_rerank': NeuralReranker,
    'distilled_rerank': DistilledNeuralReranker,
    'hybrid_rerank': HybridNeuralReranker,
    'contrastive_rerank': ContrastiveReranker,

    'langchain_rag': LangChainRAGRetriever,
    'langchain_reranker': LangChainRerankerRetriever,
    'langchain_query_expansion': LangChainQueryExpansionRetriever,
```

In [None]:
DEV_CONFIG = {
    # Data paths
    'collection_path': 'data/subtask4b_collection_data.pkl',
    'query_path': 'data/subtask4b_query_tweets_dev.tsv',
    
    # Evaluate ALL models including advanced ones
    'models': ['all'], 
    
    # Output directory with timestamp
    'output_dir': f'results/dev_{datetime.now().strftime("%Y%m%d_%H%M%S")}',
    
    # Evaluation settings
    'top_k': 5,
    'mrr_k': [1, 5, 10],
    'collection_columns': ['title', 'abstract'],
    
    # Existing model settings
    'embedding_model': 'sentence-transformers/allenai-specter',
    'vectordb_model': 'nomic-embed-text',
    'reranker_model': 'cross-encoder/ms-marco-MiniLM-L-6-v2',
    
    # Knowledge distillation settings
    'teacher_model': 'cross-encoder/ms-marco-MiniLM-L-12-v2',
    'student_model': 'cross-encoder/ms-marco-TinyBERT-L-2-v2',
    
    # Hybrid retrieval settings
    'rrf_k': 60,  # Reciprocal Rank Fusion constant
    'sparse_weight': 0.5,  # Weight for sparse retrieval in hybrid
    
    # Contrastive learning settings
    'contrastive_base_model': 'sentence-transformers/all-MiniLM-L6-v2',
    
    # Performance settings
    'candidate_count': 100,
    'batch_size': 32,
    'reranker_batch_size': 8,
    'use_gpu': True,
    'sample_for_expansion': 100,
    
    # Data sampling (None = use full datasets)
    'sample_size': None,
    'collection_sample_size': None,
    
    # Cache directory
    'cache_dir': 'cache'
}

## Step 2: Evaluate All Models on Dev Set

In [3]:
print("Starting evaluation of all models on dev set...")

# Run evaluation
dev_results = evaluate_models(DEV_CONFIG)

print(f"\nEvaluation completed! Results saved to: {dev_results['output_dir']}")

Starting evaluation of all models on dev set...


2025-05-10 06:35:51,467 - INFO - Running bm25...
Processing bm25_baseline batches: 100%|██████████| 2/2 [00:00<00:00, 77.71it/s]
2025-05-10 06:35:51,532 - INFO - bm25 MRR@5: 0.1000
2025-05-10 06:35:51,534 - INFO - Running enhanced_bm25...
Preprocessing documents: 100%|██████████| 100/100 [00:00<00:00, 592.99it/s]
Processing enhanced_bm25 batches: 100%|██████████| 2/2 [00:00<00:00, 12.09it/s]
2025-05-10 06:35:51,891 - INFO - enhanced_bm25 MRR@5: 0.1000
2025-05-10 06:35:51,892 - INFO - Running tfidf...
Vectorizing documents: 100%|██████████| 100/100 [00:00<00:00, 7408.08it/s]
Processing tfidf_baseline batches: 100%|██████████| 2/2 [00:00<00:00, 91.60it/s]
2025-05-10 06:35:51,946 - INFO - tfidf MRR@5: 0.1100
2025-05-10 06:35:51,947 - INFO - Running enhanced_tfidf...
Preprocessing documents: 100%|██████████| 100/100 [00:00<00:00, 598.46it/s]
Creating TF-IDF matrix: 100%|██████████| 100/100 [00:00<00:00, 5383.59it/s]
Processing enhanced_tfidf batches: 100%|██████████| 2/2 [00:00<00:00, 30.0


Evaluation completed! Results saved to: results/dev_20250510_063550


## Step 3: Display Results and Find Best Model

In [4]:
# Display results for all models
print("=== Dev Set Evaluation Results ===")

if dev_results['metrics']:
    for model_name, metrics in dev_results['metrics'].items():
        print(f"\n{model_name}:")
        print(f"  MRR@1: {metrics[1]:.4f}")
        print(f"  MRR@5: {metrics[5]:.4f}")
        print(f"  MRR@10: {metrics[10]:.4f}")
    
    # Find best model based on MRR@5
    best_model = max(dev_results['metrics'].items(), key=lambda x: x[1][5])[0]
    best_score = dev_results['metrics'][best_model][5]
    
    print("\n" + "=" * 30)
    print(f"Best model: {best_model} (MRR@5: {best_score:.4f})")
    print("=" * 30)
else:
    print("No evaluation metrics available (test set mode)")

=== Dev Set Evaluation Results ===

bm25:
  MRR@1: 0.1000
  MRR@5: 0.1000
  MRR@10: 0.1000

enhanced_bm25:
  MRR@1: 0.0800
  MRR@5: 0.1000
  MRR@10: 0.1000

tfidf:
  MRR@1: 0.1000
  MRR@5: 0.1100
  MRR@10: 0.1100

enhanced_tfidf:
  MRR@1: 0.1200
  MRR@5: 0.1200
  MRR@10: 0.1200

hybrid_tfidf_bm25:
  MRR@1: 0.1000
  MRR@5: 0.1000
  MRR@10: 0.1000

dense:
  MRR@1: 0.1200
  MRR@5: 0.1200
  MRR@10: 0.1200

neural_rerank:
  MRR@1: 0.1000
  MRR@5: 0.1000
  MRR@10: 0.1000

distilled_rerank:
  MRR@1: 0.1200
  MRR@5: 0.1200
  MRR@10: 0.1200

hybrid_rerank:
  MRR@1: 0.1200
  MRR@5: 0.1200
  MRR@10: 0.1200

contrastive_rerank:
  MRR@1: 0.1200
  MRR@5: 0.1200
  MRR@10: 0.1200

langchain_rag:
  MRR@1: 0.1200
  MRR@5: 0.1200
  MRR@10: 0.1200

langchain_reranker:
  MRR@1: 0.1200
  MRR@5: 0.1200
  MRR@10: 0.1200

langchain_query_expansion:
  MRR@1: 0.1200
  MRR@5: 0.1200
  MRR@10: 0.1200

Best model: enhanced_tfidf (MRR@5: 0.1200)


## Step 4: Create Configuration for Best Model

In [5]:
# Extract relevant configuration parameters for the best model
print(f"Creating test configuration for model: {best_model}")

# Start with the base configuration
TEST_CONFIG = {
    # Update paths for test set
    'collection_path': DEV_CONFIG['collection_path'],
    'query_path': 'data/subtask4b_query_tweets_test.tsv',  # Test set
    
    # Use only the best model
    'models': [best_model],
    
    # New output directory for test predictions
    'output_dir': f'results/test_{best_model}_{datetime.now().strftime("%Y%m%d_%H%M%S")}',
    
    # Copy relevant settings from dev config
    'top_k': DEV_CONFIG['top_k'],
    'collection_columns': DEV_CONFIG['collection_columns'],
    'cache_dir': DEV_CONFIG['cache_dir'],
    'batch_size': DEV_CONFIG['batch_size'],
    'use_gpu': DEV_CONFIG['use_gpu'],
}

# Add model-specific settings based on the best model type
if 'langchain' in best_model:
    TEST_CONFIG['langchain_embedding'] = DEV_CONFIG['langchain_embedding']
    TEST_CONFIG['candidate_count'] = DEV_CONFIG['candidate_count']
    
    if 'reranker' in best_model:
        TEST_CONFIG['reranker_model'] = DEV_CONFIG['reranker_model']
        TEST_CONFIG['reranker_batch_size'] = DEV_CONFIG['reranker_batch_size']
    
    if 'query_expansion' in best_model:
        TEST_CONFIG['sample_for_expansion'] = DEV_CONFIG['sample_for_expansion']

elif best_model == 'dense':
    TEST_CONFIG['embedding_model'] = DEV_CONFIG['embedding_model']

elif best_model == 'neural_rerank':
    TEST_CONFIG['reranker_model'] = DEV_CONFIG['reranker_model']
    TEST_CONFIG['reranker_batch_size'] = DEV_CONFIG['reranker_batch_size']
    TEST_CONFIG['candidate_count'] = DEV_CONFIG['candidate_count']

# For BM25 models, no additional config needed

print("\nTest configuration created:")
print(json.dumps(TEST_CONFIG, indent=2))

Creating test configuration for model: enhanced_tfidf

Test configuration created:
{
  "collection_path": "data/subtask4b_collection_data.pkl",
  "query_path": "data/subtask4b_query_tweets_test.tsv",
  "models": [
    "enhanced_tfidf"
  ],
  "output_dir": "results/test_enhanced_tfidf_20250510_064322",
  "top_k": 5,
  "collection_columns": [
    "title",
    "abstract"
  ],
  "cache_dir": "cache",
  "batch_size": 32,
  "use_gpu": true
}


## Step 5: Generate Test Set Predictions

In [6]:
print(f"Generating test predictions using {best_model}...")

# Run prediction on test set
test_results = evaluate_models(TEST_CONFIG)

print("\nTest predictions completed!")
print(f"Prediction file saved to: {test_results['output_dir']}")

Generating test predictions using enhanced_tfidf...


2025-05-10 06:43:23,036 - INFO - Running enhanced_tfidf...
Preprocessing documents: 100%|██████████| 7718/7718 [00:11<00:00, 648.78it/s]
Creating TF-IDF matrix: 100%|██████████| 7718/7718 [00:01<00:00, 5011.03it/s]
Processing enhanced_tfidf batches: 100%|██████████| 46/46 [00:25<00:00,  1.81it/s]


Test predictions completed!
Prediction file saved to: results/test_enhanced_tfidf_20250510_064322





## Step 6: Save Complete Results Summary

In [7]:
# Create summary of the entire evaluation process
summary = {
    'evaluation_date': datetime.now().isoformat(),
    'dev_results': {
        'metrics': dev_results['metrics'],
        'best_model': best_model,
        'best_score': best_score,
        'output_dir': dev_results['output_dir']
    },
    'test_results': {
        'model_used': best_model,
        'output_dir': test_results['output_dir'],
        'config': TEST_CONFIG
    }
}

# Save summary to file
summary_file = f'results/evaluation_summary_{datetime.now().strftime("%Y%m%d_%H%M%S")}.json'
with open(summary_file, 'w') as f:
    json.dump(summary, f, indent=2)

print(f"\nComplete evaluation summary saved to: {summary_file}")

# Final summary
print("\n=== EVALUATION COMPLETE ===")
print(f"1. Evaluated {len(dev_results['metrics'])} models on dev set")
print(f"2. Best model: {best_model} (MRR@5: {best_score:.4f})")
print("3. Test predictions generated and saved")
print("\nAll results are in the 'results' directory.")


Complete evaluation summary saved to: results/evaluation_summary_20250510_064404.json

=== EVALUATION COMPLETE ===
1. Evaluated 13 models on dev set
2. Best model: enhanced_tfidf (MRR@5: 0.1200)
3. Test predictions generated and saved

All results are in the 'results' directory.


In [8]:
'''from ollama import chat
from pydantic import BaseModel

class ScientificQueryExpansion(BaseModel):
    """Schema for scientific query expansions"""
    title: str 
    abstract: str


scientific_tweet = """We should track the long-term effects of these vaccines closely, particularly when given to otherwise healthy people."""
prompt = f"Create one scientific variation out of your imagination of this tweet in form of a short abstract - only take information given from this tweet.: {scientific_tweet}"

response = chat(
  messages=[
    {
      'role': 'user',
      'content': prompt,
    }
  ],
  model='llama3.2',
  options={'temperature': 0.15},
  format=ScientificQueryExpansion.model_json_schema(),
)

country = ScientificQueryExpansion.model_validate_json(response.message.content)


print(country.title) # if expansion 1 contaions a title. skip as it is not needed
print(country.abstract)
'''

'from ollama import chat\nfrom pydantic import BaseModel\n\nclass ScientificQueryExpansion(BaseModel):\n    """Schema for scientific query expansions"""\n    title: str \n    abstract: str\n\n\nscientific_tweet = """We should track the long-term effects of these vaccines closely, particularly when given to otherwise healthy people."""\nprompt = f"Create one scientific variation out of your imagination of this tweet in form of a short abstract - only take information given from this tweet.: {scientific_tweet}"\n\nresponse = chat(\n  messages=[\n    {\n      \'role\': \'user\',\n      \'content\': prompt,\n    }\n  ],\n  model=\'llama3.2\',\n  options={\'temperature\': 0.15},\n  format=ScientificQueryExpansion.model_json_schema(),\n)\n\ncountry = ScientificQueryExpansion.model_validate_json(response.message.content)\n\n\nprint(country.title) # if expansion 1 contaions a title. skip as it is not needed\nprint(country.abstract)\n'