# PubMed Search Testing Notebook

This notebook tests the PubMed search and article retrieval functionality.

## Overview
- Test PubMed API integration
- Search for recent articles
- Retrieve article details
- Validate data quality

In [None]:
# Setup: extend the import path so the project's modules can be imported,
# then load the shared imports for this notebook.
import sys
import os
# NOTE: '../src' is a relative path, so it is resolved against the kernel's
# current working directory — presumably the notebook's own folder; run the
# notebook from there or the project imports in the next cell will fail.
sys.path.append('../src')

import asyncio
import pandas as pd
from pathlib import Path
import json

In [None]:
# Import our modules
from pubmed.searcher import PubMedSearcher
from utils.config import load_config
from utils.logger import setup_logger, get_logger

# Setup logging: configure the project logger at INFO level, then obtain a
# logger named after this module for use in the notebook.
# NOTE(review): setup_logger/get_logger are project helpers; their exact
# behavior (handlers, formatting) is not visible here.
setup_logger(level="INFO")
logger = get_logger(__name__)

## 1. Configure PubMed Access

**Important**: Set your email in `config/config.yaml` or the `.env` file before running the cells below; NCBI requires a valid email for PubMed API access.

In [None]:
# Load the project configuration and echo the PubMed-related settings so
# they can be checked by eye before any request is sent.
config = load_config()
print("PubMed configuration:")
pubmed_cfg = config['pubmed']

# (label, config key, unit suffix) for each setting, in display order.
for label, key, suffix in (
    ("Email", 'email', ""),
    ("Base URL", 'base_url', ""),
    ("Rate limit", 'rate_limit_delay', "s"),
    ("Max articles per week", 'max_articles_per_week', ""),
):
    print(f"{label}: {pubmed_cfg[key]}{suffix}")

# The placeholder address means the email was never customised.
email_is_placeholder = pubmed_cfg['email'] == 'your-email@example.com'
if email_is_placeholder:
    print("\n⚠️  WARNING: Please update your email in config/config.yaml or .env file")
    print("NCBI requires a valid email for API access")

In [None]:
# Initialize searcher: build the PubMedSearcher from the loaded configuration.
# The resulting `searcher` object is reused by every search/fetch cell below.
searcher = PubMedSearcher(config)
print("PubMed searcher initialized")

## 2. Test Basic Search

Start with a simple search to test the API connection.

In [None]:
# Test basic search with neuroscience terms
test_terms = ["neuroscience", "physiology"]
print(f"Testing search with terms: {test_terms}")

try:
    pmids = await searcher.search_recent_articles(
        query_terms=test_terms,
        days_back=7,
        max_results=10  # Small number for testing
    )
    
    print(f"✅ Search successful! Found {len(pmids)} articles")
    print(f"Sample PMIDs: {pmids[:5]}")
    
except Exception as e:
    print(f"❌ Search failed: {e}")
    pmids = []

## 3. Test Article Detail Retrieval

In [None]:
# Test fetching details for found articles
if pmids:
    print(f"Fetching details for {len(pmids)} articles...")
    
    try:
        articles = await searcher.fetch_article_details(pmids)
        print(f"✅ Retrieved details for {len(articles)} articles")
        
        if articles:
            sample = articles[0]
            print("\n📄 Sample article:")
            print(f"PMID: {sample.pmid}")
            print(f"Title: {sample.title}")
            print(f"Authors: {', '.join(sample.authors[:3]) if sample.authors else 'No authors'}")
            print(f"Journal: {sample.journal}")
            print(f"Publication Date: {sample.publication_date}")
            print(f"DOI: {sample.doi}")
            print(f"Abstract length: {len(sample.abstract) if sample.abstract else 0} characters")
            print(f"MeSH terms: {sample.mesh_terms[:5] if sample.mesh_terms else 'None'}")
            
            if sample.abstract:
                print(f"\nAbstract preview: {sample.abstract[:200]}...")
        
    except Exception as e:
        print(f"❌ Detail retrieval failed: {e}")
        articles = []
else:
    print("⏭️  Skipping detail retrieval (no PMIDs found)")
    articles = []

## 4. Test Different Search Strategies

In [None]:
# Test different search approaches
search_tests = [
    {
        'name': 'Broad biomedical search',
        'terms': None,  # Uses default broad search
        'days': 7,
        'max_results': 5
    },
    {
        'name': 'Specific neuroscience terms',
        'terms': ['hippocampus', 'memory', 'synaptic plasticity'],
        'days': 14,
        'max_results': 5
    },
    {
        'name': 'Cardiovascular research',
        'terms': ['cardiac', 'heart', 'cardiovascular'],
        'days': 7,
        'max_results': 5
    }
]

search_results = {}

for test in search_tests:
    print(f"\n🔍 Testing: {test['name']}")
    
    try:
        test_pmids = await searcher.search_recent_articles(
            query_terms=test['terms'],
            days_back=test['days'],
            max_results=test['max_results']
        )
        
        search_results[test['name']] = {
            'pmids': test_pmids,
            'count': len(test_pmids),
            'terms': test['terms']
        }
        
        print(f"   Found {len(test_pmids)} articles")
        
    except Exception as e:
        print(f"   ❌ Failed: {e}")
        search_results[test['name']] = {'pmids': [], 'count': 0, 'error': str(e)}

# Summary
print("\n📊 Search Results Summary:")
for name, result in search_results.items():
    print(f"   {name}: {result['count']} articles")

## 5. Data Quality Analysis

In [None]:
# Summarise how complete the fetched article records are
# (abstracts, DOIs, authors, MeSH terms, journals).
if not articles:
    print("⏭️  No articles available for quality analysis")
else:
    print("📊 Data Quality Analysis:")

    # One row of per-article completeness metrics; `df` is kept as a
    # notebook-level name so it can be inspected afterwards.
    df = pd.DataFrame([
        {
            'pmid': record.pmid,
            'title_length': len(record.title) if record.title else 0,
            'has_abstract': bool(record.abstract),
            'abstract_length': len(record.abstract) if record.abstract else 0,
            'author_count': len(record.authors) if record.authors else 0,
            'has_doi': bool(record.doi),
            'mesh_term_count': len(record.mesh_terms) if record.mesh_terms else 0,
            'journal': record.journal,
        }
        for record in articles
    ])

    print(f"\nTotal articles analyzed: {len(df)}")

    abstract_flags = df['has_abstract']
    print(f"Articles with abstracts: {abstract_flags.sum()} ({abstract_flags.mean()*100:.1f}%)")

    doi_flags = df['has_doi']
    print(f"Articles with DOI: {doi_flags.sum()} ({doi_flags.mean()*100:.1f}%)")

    print(f"Average abstract length: {df['abstract_length'].mean():.0f} characters")
    print(f"Average author count: {df['author_count'].mean():.1f}")
    print(f"Average MeSH terms: {df['mesh_term_count'].mean():.1f}")

    # Most frequent journals (head() keeps the top five by default).
    journal_counts = df['journal'].value_counts().head()
    print("\nTop journals:")
    for journal_name, n_articles in journal_counts.items():
        print(f"   {journal_name}: {n_articles}")

## 6. Save Test Data

In [None]:
# Save test results: write the fetched articles and a JSON summary of this
# test run under ../data/raw (created if missing).
output_dir = Path("../data/raw")
output_dir.mkdir(parents=True, exist_ok=True)

if articles:
    # Save articles
    searcher.save_articles(articles, output_dir / "test_pubmed_articles.json")
    print(f"💾 Saved {len(articles)} test articles")

    # Compute the quality metrics directly from `articles` instead of
    # relying on a `df` left behind by the analysis cell: that DataFrame may
    # be missing (cell not run) or stale (articles re-fetched since), which
    # would make the metrics disagree with `total_articles`.
    article_count = len(articles)  # non-zero: guarded by `if articles`
    abstract_lengths = [len(a.abstract) if a.abstract else 0 for a in articles]
    author_counts = [len(a.authors) if a.authors else 0 for a in articles]

    # Save search results summary
    summary = {
        'timestamp': pd.Timestamp.now().isoformat(),
        'total_articles': article_count,
        'search_results': search_results,
        'quality_metrics': {
            'articles_with_abstracts': sum(1 for a in articles if a.abstract),
            'articles_with_doi': sum(1 for a in articles if a.doi),
            'avg_abstract_length': sum(abstract_lengths) / article_count,
            'avg_author_count': sum(author_counts) / article_count
        }
    }

    # Explicit encoding so the file does not depend on the platform default.
    # default=str stringifies any value json cannot serialise natively.
    with open(output_dir / "pubmed_test_summary.json", 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2, default=str)

    print("💾 Saved test summary")
else:
    print("⏭️  No data to save")

## 7. Test Rate Limiting

In [None]:
# Test rate limiting with multiple requests
print("🕐 Testing rate limiting...")

import time

rate_test_results = []
start_time = time.time()

for i in range(3):  # Test 3 requests
    request_start = time.time()
    
    try:
        test_pmids = await searcher.search_recent_articles(
            query_terms=["test"],
            days_back=30,
            max_results=2
        )
        
        request_time = time.time() - request_start
        rate_test_results.append({
            'request': i+1,
            'time': request_time,
            'pmids_found': len(test_pmids),
            'success': True
        })
        
        print(f"   Request {i+1}: {request_time:.2f}s, {len(test_pmids)} PMIDs")
        
    except Exception as e:
        rate_test_results.append({
            'request': i+1,
            'error': str(e),
            'success': False
        })
        print(f"   Request {i+1}: Failed - {e}")

total_time = time.time() - start_time
print(f"\nTotal time for {len(rate_test_results)} requests: {total_time:.2f}s")
print(f"Average time per request: {total_time/len(rate_test_results):.2f}s")

successful_requests = [r for r in rate_test_results if r['success']]
print(f"Successful requests: {len(successful_requests)}/{len(rate_test_results)}")

## Next Steps

1. **Configure email**: Make sure you have a valid email in the configuration
2. **API key**: Consider getting an NCBI API key (it also covers PubMed E-utilities) for higher rate limits
3. **Search optimization**: Fine-tune search terms based on IFC research areas
4. **Error handling**: Test how the system handles network issues, rate limits, etc.

## Common Issues
- **Email required**: NCBI requires a valid email address
- **Rate limiting**: Too many requests will get blocked
- **Network timeouts**: Large requests may time out
- **XML parsing**: Malformed XML responses can cause errors