In [3]:
from paper_quest import evaluate_models

def run_evaluation():
    """
    Run the scientific claim retrieval evaluation with custom configuration.

    Calls ``evaluate_models`` on a sample of the query set and the paper
    collection, prints each evaluated model's scores at cutoffs 1, 5 and 10
    (labelled MRR@k), and reports the model with the highest score at
    cutoff 5.

    Returns:
        The mapping returned by ``evaluate_models``: model name -> scores,
        where scores is a mapping read here with the integer keys 1, 5
        and 10. Returned unchanged, including when it is empty.
    """
    # Extra keyword arguments forwarded verbatim to evaluate_models
    config = {
        'cache_dir': 'cache_temp',
        'embedding_model': 'sentence-transformers/allenai-specter',
        'langchain_embedding': 'nomic-embed-text',
        'reranker_model': 'cross-encoder/ms-marco-MiniLM-L-6-v2',
        'use_gpu': True,
    }

    # Run evaluation with specific models
    results = evaluate_models(
        collection_path='data/subtask4b_collection_data.pkl',
        query_path='data/subtask4b_query_tweets_dev.tsv',
        # Full model list, kept for reference:
        # models_to_run=['bm25', 'enhanced_bm25', 'dense', 'neural_rerank', 'langchain_rag', 'langchain_reranker', 'langchain_query_expansion'],
        models_to_run=["langchain_rag", "langchain_query_expansion"],
        output_dir='results',
        collection_columns=['title', 'abstract', 'authors'],
        sample_size=200,  # None for full query dataset
        collection_sample_size=100,  # None for full paper collection
        **config
    )

    print("\n=== Final Results ===")
    for model, scores in results.items():
        # A cutoff missing from a model's scores is shown as 'N/A'
        for k in (1, 5, 10):
            print(f"{model} MRR@{k}: {scores.get(k, 'N/A')}")
        print("-" * 30)

    # Determine best model by its score at cutoff 5 (a missing score counts as 0).
    # max() raises ValueError on an empty iterable, so handle "no models
    # evaluated" explicitly instead of crashing after the run has finished.
    if results:
        best_model = max(results, key=lambda name: results[name].get(5, 0))
        print(f"\nBest model: {best_model}")
    else:
        print("\nBest model: N/A (no results returned)")

    return results

# Entry point: run the evaluation only when this module is executed directly,
# not when it is imported.
if __name__ == "__main__":
    run_evaluation()

2025-05-09 14:59:13,065 - INFO - Loading collection data from: data/subtask4b_collection_data.pkl
2025-05-09 14:59:13,639 - INFO - Loading query data from: data/subtask4b_query_tweets_dev.tsv
2025-05-09 14:59:13,643 - INFO - Sampling 100 papers from collection (from 7718 total)
2025-05-09 14:59:13,647 - INFO - Loading query data from: data/subtask4b_query_tweets_dev.tsv
2025-05-09 14:59:13,651 - INFO - Sampling 200 queries
2025-05-09 14:59:13,651 - INFO - Collection size: 100
2025-05-09 14:59:13,652 - INFO - Query set size: 200
2025-05-09 14:59:13,652 - INFO - 
=== Running langchain_rag ===
2025-05-09 14:59:13,662 - INFO - Retrieving documents for 200 queries...
Processing batches:   0%|          | 0/1 [00:00<?, ?it/s]2025-05-09 14:59:14,001 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
2025-05-09 14:59:14,024 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
2025-05-09 14:59:14,050 - INFO - HTTP Request: POST http://127.0.0


=== Final Results ===
langchain_rag MRR@1: 0.1
langchain_rag MRR@5: 0.10416666666666669
langchain_rag MRR@10: 0.10416666666666669
------------------------------
langchain_query_expansion MRR@1: 0.105
langchain_query_expansion MRR@5: 0.1085
langchain_query_expansion MRR@10: 0.1085
------------------------------

Best model: langchain_query_expansion
