In [1]:
from paper_quest import evaluate_models

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Shared settings for every run in this notebook. The mapping is unpacked with
# ``**DEFAULT_CONFIG`` into each ``evaluate_models`` call, so every key here is
# handed over as a keyword argument; writing it in keyword form makes that
# explicit. What each option does is defined by ``paper_quest.evaluate_models``
# (not visible here) — confirm there before changing a value.
DEFAULT_CONFIG = dict(
    cache_dir="cache_temp",
    embedding_model="sentence-transformers/allenai-specter",
    langchain_embedding="nomic-embed-text",
    reranker_model="cross-encoder/ms-marco-MiniLM-L-6-v2",
    candidate_count=100,
    batch_size=32,
    use_gpu=True,
    sample_for_expansion=100,
)

In [None]:
def run_test_predictions():
    """
    Produce predictions for the test-set queries (final submission run).

    Invokes ``evaluate_models`` for the ``langchain_rag`` model with the test
    query file and the collection pickle, layering ``DEFAULT_CONFIG`` on top
    of the run-specific options, then prints a short completion notice.

    Returns:
        The value returned by ``evaluate_models``, passed through untouched.
    """
    # Options specific to this run; DEFAULT_CONFIG is merged in at call time.
    test_run_options = dict(
        collection_path="data/subtask4b_collection_data.pkl",
        query_path="data/subtask4b_query_tweets_test.tsv",
        models_to_run=["langchain_rag"],
        output_dir="results",
        collection_columns=["title", "abstract", "authors"],
        top_k=5,
        sample_size=None,             # None -> whole test set
        collection_sample_size=None,  # None -> whole collection
        mrr_k=[1, 5, 10],             # author's note: ignored for the test set
    )
    predictions = evaluate_models(**test_run_options, **DEFAULT_CONFIG)

    # Two-line notice emitted in one call; the text matches output_dir above.
    print(
        "\nPredictions completed successfully!",
        "Check the 'results' directory for your submission file.",
        sep="\n",
    )
    return predictions


def run_dev_evaluation():
    """
    Run evaluation on the dev set to check model performance.

    Calls ``evaluate_models`` for the ``langchain_rag`` model with the dev
    query file and the collection pickle (plus ``DEFAULT_CONFIG``), prints one
    ``MRR@k`` line per model for every requested cutoff, and names the model
    with the highest MRR@5.

    Returns:
        Whatever ``evaluate_models`` returns, unchanged. The report assumes a
        mapping of model name -> {k: score} (that is how it is indexed below);
        confirm against ``paper_quest.evaluate_models``. An empty or ``None``
        result is returned as-is after a short notice instead of raising.
    """
    # Single source of truth for the cutoffs: the same values are requested
    # from evaluate_models and used to build the report, so they cannot drift.
    mrr_cutoffs = (1, 5, 10)
    ranking_cutoff = 5  # models are compared by MRR@5

    results = evaluate_models(
        collection_path="data/subtask4b_collection_data.pkl",
        query_path="data/subtask4b_query_tweets_dev.tsv",
        models_to_run=["langchain_rag"],
        output_dir="results",
        collection_columns=["title", "abstract", "authors"],
        top_k=5,
        sample_size=None,             # full dev set
        collection_sample_size=None,  # full collection
        mrr_k=list(mrr_cutoffs),      # fresh list, as in the original call
        **DEFAULT_CONFIG
    )

    print("\n=== Dev Set Results ===")

    # Guard BEFORE touching results.items(): previously the emptiness check
    # came after the loop, so a None result crashed with AttributeError.
    if not results:
        print("No evaluation results were returned.")
        return results

    for model, scores in results.items():
        for k in mrr_cutoffs:
            print(f"{model} MRR@{k}: {scores.get(k, 'N/A')}")
        print("-" * 30)

    # A model without a score at the ranking cutoff is treated as 0.
    best_model = max(
        results.items(),
        key=lambda item: item[1].get(ranking_cutoff, 0),
    )[0]
    print(f"\nBest model: {best_model}")

    return results


# Driver for this cell: dev evaluation first, then test-set predictions.
# The recorded output below the cell shows the dev banner being printed, i.e.
# the __main__ guard held when the cell was executed in the notebook.
if __name__ == "__main__":
    print("=== Running evaluation on dev set ===")
    # Kept in a top-level name so the scores stay inspectable after the run.
    dev_results = run_dev_evaluation()

    print("\n=== Running predictions on test set ===")
    # Both runs pass output_dir="results" to evaluate_models.
    test_results = run_test_predictions()

2025-05-09 23:39:19,993 - INFO - Loading collection data from: data/subtask4b_collection_data.pkl


=== Running evaluation on dev set ===


2025-05-09 23:39:20,553 - INFO - Loading query data from: data/subtask4b_query_tweets_dev.tsv
2025-05-09 23:39:20,557 - INFO - Collection size: 7718
2025-05-09 23:39:20,558 - INFO - Query set size: 1400
2025-05-09 23:39:20,558 - INFO - 
=== Running langchain_rag ===
Creating documents: 100%|██████████| 7718/7718 [00:00<00:00, 37911.80it/s]
2025-05-09 23:39:20,773 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
