# PubMed Search Testing Notebook

This notebook tests the PubMed search and article retrieval functionality.

## Overview
- Test PubMed API integration
- Search for recent articles
- Retrieve article details
- Validate data quality

In [3]:
# Setup
import sys
import os
sys.path.append('../src')

import asyncio
import pandas as pd
from pathlib import Path
import json

In [4]:
# Import our modules - Fixed import paths
import sys
import os
from pathlib import Path

# Add src directory to path for imports
notebook_dir = Path().resolve()
src_dir = notebook_dir.parent / "src"
sys.path.insert(0, str(src_dir))

print(f"Notebook directory: {notebook_dir}")
print(f"Source directory: {src_dir}")
print(f"Source exists: {src_dir.exists()}")

# Now import our modules
from pubmed.searcher import PubMedSearcher
from utils.config import load_config
from utils.logger import setup_logger, get_logger

# Setup logging
setup_logger(level="INFO")
logger = get_logger(__name__)

print("✅ All imports successful!")

Notebook directory: /home/santi/Projects/UBMI-IFC-Podcast/notebooks
Source directory: /home/santi/Projects/UBMI-IFC-Podcast/src
Source exists: True
✅ All imports successful!


## 1. Configure PubMed Access

**Important**: You need to set your email in the config for PubMed API access.

In [5]:
# Load configuration
config = load_config()
print("PubMed configuration:")
print(f"Email: {config['pubmed']['email']}")
print(f"Base URL: {config['pubmed']['base_url']}")
print(f"Rate limit: {config['pubmed']['rate_limit_delay']}s")
print(f"Max articles per week: {config['pubmed']['max_articles_per_week']}")

if config['pubmed']['email'] == 'your-email@example.com':
    print("\n⚠️  WARNING: Please update your email in config/config.yaml or .env file")
    print("NCBI requires a valid email for API access")

PubMed configuration:
Email: santiago_gr@ciencias.unam.mx
Base URL: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/
Rate limit: 0.34s
Max articles per week: 1000


In [6]:
# Initialize searcher
searcher = PubMedSearcher(config)
print("PubMed searcher initialized")

[32m2025-09-18 21:11:27[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m__init__[0m:[36m64[0m - [1mPyMed backend enabled[0m


PubMed searcher initialized


## 2. Test Basic Search

Start with a simple search to test the API connection.

In [7]:
# Test basic search with neuroscience terms
import asyncio

async def test_basic_search():
    test_terms = ["neuroscience", "physiology"]
    print(f"Testing search with terms: {test_terms}")

    try:
        pmids = await searcher.search_recent_articles(
            query_terms=test_terms,
            days_back=7,
            max_results=10  # Small number for testing
        )
        
        print(f"✅ Search successful! Found {len(pmids)} articles")
        print(f"Sample PMIDs: {pmids[:5]}")
        return pmids
        
    except Exception as e:
        print(f"❌ Search failed: {e}")
        return []

# Run the async function
pmids = await test_basic_search()

[32m2025-09-18 21:11:27[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m_search_with_pymed[0m:[36m95[0m - [1mSearching PubMed with PyMed query: "neuroscience"[Abstract] OR "physiology"[Abstract][0m


Testing search with terms: ['neuroscience', 'physiology']


[32m2025-09-18 21:11:28[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m_search_with_pymed[0m:[36m105[0m - [1mFound 10 articles with PyMed[0m


✅ Search successful! Found 10 articles
Sample PMIDs: ['40966807', '40966771', '40966769\n31130835\n38977678\n33383465\n36800922\n27347565\n34806255\n35557839\n29607681\n33817338\n38063381\n39155691\n39010724\n32498641\n39958843\n37085963\n40134415\n30471068\n38770650\n27865153\n36573880\n39746340\n38535006\n37478044\n38796881\n28061367\n34149344\n37382113\n32523031\n29435706\n39252680\n39633895\n31104716\n40185793\n31046514\n38770651\n38669700\n35804833\n37990157\n27518905\n27845889\n30265656\n35091282\n29998214\n39789428\n33423031\n32866134\n38252192\n34503108\n38623649\n39935841\n37958188\n38666952\n27908154\n39984532\n29078042\n36641543\n28301734\n39901804\n39732110\n31514144\n32414878\n32658780\n34203573\n32027965\n38814770\n39761623\n39004932\n37581055\n31993885\n30207966\n37830300\n34460653\n39822290\n32834795\n37103982\n36059677\n36846979', '40966722', '40966720']


In [8]:
# Test with a simpler query to isolate the issue
import aiohttp

async def test_simple_search():
    """Test with a very simple query without date filters"""
    
    # Override the searcher method temporarily for testing
    original_search = searcher.search_recent_articles
    
    async def simple_search_override(query_terms=None, days_back=7, max_results=1000):
        """Simplified search without complex date filters"""
        
        # Simple query without date filters
        if query_terms:
            query = " OR ".join([f'"{term}"[Title/Abstract]' for term in query_terms])
        else:
            query = "neuroscience"
        
        searcher.logger.info(f"Simple search query: {query}")
        
        # Parameters for esearch
        params = {
            'db': 'pubmed',
            'term': query,
            'retmax': max_results,
            'retmode': 'xml',
            'tool': 'ubmi-ifc-podcast',
            'email': searcher.email,
            'sort': 'relevance'
        }
        
        url = f"{searcher.base_url}esearch.fcgi"
        
        async with aiohttp.ClientSession() as session:
            try:
                async with session.get(url, params=params) as response:
                    if response.status == 200:
                        xml_content = await response.text()
                        pmids = searcher._parse_search_results(xml_content)
                        searcher.logger.info(f"Found {len(pmids)} articles")
                        return pmids[:max_results]
                    else:
                        searcher.logger.error(f"Search failed with status {response.status}")
                        error_content = await response.text()
                        searcher.logger.error(f"Error content: {error_content}")
                        return []
            except Exception as e:
                searcher.logger.error(f"Error in simple search: {str(e)}")
                return []
    
    # Temporarily replace the method
    searcher.search_recent_articles = simple_search_override
    
    try:
        pmids = await searcher.search_recent_articles(
            query_terms=["neuroscience"],
            max_results=5
        )
        print(f"Simple search found {len(pmids)} PMIDs: {pmids}")
        return pmids
    finally:
        # Restore original method
        searcher.search_recent_articles = original_search

# Test simple search
simple_pmids = await test_simple_search()

[32m2025-09-18 21:11:28[0m | [1mINFO[0m | [36m__main__[0m:[36msimple_search_override[0m:[36m19[0m - [1mSimple search query: "neuroscience"[Title/Abstract][0m
[32m2025-09-18 21:11:29[0m | [1mINFO[0m | [36m__main__[0m:[36msimple_search_override[0m:[36m40[0m - [1mFound 5 articles[0m


Simple search found 5 PMIDs: ['30085354', '29723499', '30522733', '37736162', '34381347']


In [9]:
# Test PubMed API directly with a temporary valid email
import aiohttp

async def test_pubmed_api_direct():
    """Test PubMed API directly to diagnose issues"""
    
    # Use a simple test email for API testing
    test_email = "test@example.com"  # You should replace this with your actual email
    
    # Simple search query
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        'db': 'pubmed',
        'term': 'neuroscience[Title]',
        'retmax': 5,
        'retmode': 'xml',
        'tool': 'ifc-podcast-generator',
        'email': test_email
    }
    
    print(f"Testing direct PubMed API call...")
    print(f"URL: {base_url}")
    print(f"Params: {params}")
    
    async with aiohttp.ClientSession() as session:
        try:
            async with session.get(base_url, params=params) as response:
                print(f"Status: {response.status}")
                print(f"Headers: {dict(response.headers)}")
                
                if response.status == 200:
                    content = await response.text()
                    print(f"Response length: {len(content)}")
                    print(f"Response preview: {content[:500]}...")
                    
                    # Try to parse XML
                    from xml.etree import ElementTree as ET
                    try:
                        root = ET.fromstring(content)
                        id_list = root.find('.//IdList')
                        if id_list is not None:
                            pmids = [id_elem.text for id_elem in id_list.findall('Id')]
                            print(f"Found PMIDs: {pmids}")
                        else:
                            print("No IdList found in response")
                    except ET.ParseError as e:
                        print(f"XML parse error: {e}")
                        
                else:
                    error_content = await response.text()
                    print(f"Error response: {error_content}")
                    
        except Exception as e:
            print(f"Request failed: {e}")

# Run direct API test
await test_pubmed_api_direct()

Testing direct PubMed API call...
URL: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi
Params: {'db': 'pubmed', 'term': 'neuroscience[Title]', 'retmax': 5, 'retmode': 'xml', 'tool': 'ifc-podcast-generator', 'email': 'test@example.com'}
Status: 200
Headers: {'Date': 'Fri, 19 Sep 2025 03:11:28 GMT', 'Server': 'Finatra', 'Strict-Transport-Security': 'max-age=31536000; includeSubDomains; preload', 'Content-Security-Policy': 'upgrade-insecure-requests', 'Referrer-Policy': 'origin-when-cross-origin', 'NCBI-SID': '3C9138DA6EC2903E_C299SID', 'NCBI-PHID': '1D34A706699E40C500004503BD583483.1.1.m_1', 'Content-Type': 'text/xml; charset=UTF-8', 'Cache-Control': 'private', 'Content-Encoding': 'gzip', 'X-RateLimit-Limit': '3', 'X-RateLimit-Remaining': '0', 'Access-Control-Allow-Origin': '*', 'Access-Control-Expose-Headers': 'X-RateLimit-Limit,X-RateLimit-Remaining', 'Set-Cookie': 'ncbi_sid=3C9138DA6EC2903E_C299SID; domain=.nih.gov; path=/; expires=Sat, 19 Sep 2026 03:11:29 GMT', 'X-UA-Comp

## 3. Test Article Detail Retrieval

In [10]:
# Test fetching details for found articles
async def test_article_details():
    if pmids:
        print(f"Fetching details for {len(pmids)} articles...")
        
        try:
            articles = await searcher.fetch_article_details(pmids)
            print(f"✅ Retrieved details for {len(articles)} articles")
            
            if articles:
                sample = articles[0]
                print("\n📄 Sample article:")
                print(f"PMID: {sample.pmid}")
                print(f"Title: {sample.title}")
                print(f"Authors: {', '.join(sample.authors[:3]) if sample.authors else 'No authors'}")
                print(f"Journal: {sample.journal}")
                print(f"Publication Date: {sample.publication_date}")
                print(f"DOI: {sample.doi}")
                print(f"Abstract length: {len(sample.abstract) if sample.abstract else 0} characters")
                print(f"MeSH terms: {sample.mesh_terms[:5] if sample.mesh_terms else 'None'}")
                
                if sample.abstract:
                    print(f"\nAbstract preview: {sample.abstract[:200]}...")
            
            return articles
            
        except Exception as e:
            print(f"❌ Detail retrieval failed: {e}")
            return []
    else:
        print("⏭️  Skipping detail retrieval (no PMIDs found)")
        return []

# Run the async function
articles = await test_article_details()

Fetching details for 10 articles...


[32m2025-09-18 21:11:30[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36mfetch_article_details[0m:[36m195[0m - [1mRetrieved details for 87 articles[0m


✅ Retrieved details for 87 articles

📄 Sample article:
PMID: 40966807
Title: Single low-density polyethylene microplastics stress and drought co-exposure effects on lettuce (Lactuca sativa) physiology, growth, and root development.
Authors: Angelica Barone, Giorgio Impollonia, Michele Croci
Journal: The Science of the total environment
Publication Date: 2025-Sep-17
DOI: 10.1016/j.scitotenv.2025.180513
Abstract length: 1584 characters
MeSH terms: None

Abstract preview: Microplastics (MPs), plastic particles smaller than 5 mm, can be found agricultural soils and interact with other sources of stress, such as drought. In this study, low-density polyethylene (LDPE) MPs...


## 4. Test Different Search Strategies

In [11]:
# Test different search approaches
async def test_search_strategies():
    search_tests = [
        {
            'name': 'Broad biomedical search',
            'terms': None,  # Uses default broad search
            'days': 7,
            'max_results': 5
        },
        {
            'name': 'Specific neuroscience terms',
            'terms': ['hippocampus', 'memory', 'synaptic plasticity'],
            'days': 14,
            'max_results': 5
        },
        {
            'name': 'Cardiovascular research',
            'terms': ['cardiac', 'heart', 'cardiovascular'],
            'days': 7,
            'max_results': 5
        }
    ]

    search_results = {}

    for test in search_tests:
        print(f"\n🔍 Testing: {test['name']}")
        
        try:
            test_pmids = await searcher.search_recent_articles(
                query_terms=test['terms'],
                days_back=test['days'],
                max_results=test['max_results']
            )
            
            search_results[test['name']] = {
                'pmids': test_pmids,
                'count': len(test_pmids),
                'terms': test['terms']
            }
            
            print(f"   Found {len(test_pmids)} articles")
            
        except Exception as e:
            print(f"   ❌ Failed: {e}")
            search_results[test['name']] = {'pmids': [], 'count': 0, 'error': str(e)}

    # Summary
    print("\n📊 Search Results Summary:")
    for name, result in search_results.items():
        print(f"   {name}: {result['count']} articles")
    
    return search_results

# Run the async function
search_results = await test_search_strategies()

[32m2025-09-18 21:11:30[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m_search_with_pymed[0m:[36m95[0m - [1mSearching PubMed with PyMed query: (humans[MeSH Terms]) AND (english[Language])[0m



🔍 Testing: Broad biomedical search


[32m2025-09-18 21:11:32[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m_search_with_pymed[0m:[36m105[0m - [1mFound 5 articles with PyMed[0m
[32m2025-09-18 21:11:32[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m_search_with_pymed[0m:[36m95[0m - [1mSearching PubMed with PyMed query: "hippocampus"[Abstract] OR "memory"[Abstract] OR "synaptic plasticity"[Abstract][0m


   Found 5 articles

🔍 Testing: Specific neuroscience terms


[32m2025-09-18 21:11:32[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m_search_with_pymed[0m:[36m105[0m - [1mFound 5 articles with PyMed[0m
[32m2025-09-18 21:11:32[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m_search_with_pymed[0m:[36m95[0m - [1mSearching PubMed with PyMed query: "cardiac"[Abstract] OR "heart"[Abstract] OR "cardiovascular"[Abstract][0m


   Found 5 articles

🔍 Testing: Cardiovascular research


[32m2025-09-18 21:11:33[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m_search_with_pymed[0m:[36m105[0m - [1mFound 5 articles with PyMed[0m


   Found 5 articles

📊 Search Results Summary:
   Broad biomedical search: 5 articles
   Specific neuroscience terms: 5 articles
   Cardiovascular research: 5 articles


## 5. Data Quality Analysis

In [12]:
# Analyze data quality if we have articles
if articles:
    print("📊 Data Quality Analysis:")
    
    # Convert to DataFrame for analysis
    df_data = []
    for article in articles:
        df_data.append({
            'pmid': article.pmid,
            'title_length': len(article.title) if article.title else 0,
            'has_abstract': bool(article.abstract),
            'abstract_length': len(article.abstract) if article.abstract else 0,
            'author_count': len(article.authors) if article.authors else 0,
            'has_doi': bool(article.doi),
            'mesh_term_count': len(article.mesh_terms) if article.mesh_terms else 0,
            'journal': article.journal
        })
    
    df = pd.DataFrame(df_data)
    
    print(f"\nTotal articles analyzed: {len(df)}")
    print(f"Articles with abstracts: {df['has_abstract'].sum()} ({df['has_abstract'].mean()*100:.1f}%)")
    print(f"Articles with DOI: {df['has_doi'].sum()} ({df['has_doi'].mean()*100:.1f}%)")
    print(f"Average abstract length: {df['abstract_length'].mean():.0f} characters")
    print(f"Average author count: {df['author_count'].mean():.1f}")
    print(f"Average MeSH terms: {df['mesh_term_count'].mean():.1f}")
    
    # Top journals
    top_journals = df['journal'].value_counts().head()
    print(f"\nTop journals:")
    for journal, count in top_journals.items():
        print(f"   {journal}: {count}")
        
else:
    print("⏭️  No articles available for quality analysis")

📊 Data Quality Analysis:

Total articles analyzed: 87
Articles with abstracts: 85 (97.7%)
Articles with DOI: 87 (100.0%)
Average abstract length: 1489 characters
Average author count: 6.3
Average MeSH terms: 5.7

Top journals:
   International journal of neural systems: 19
   Frontiers in neuroscience: 5
   Reviews in the neurosciences: 5
   Brain : a journal of neurology: 5
   Medical & biological engineering & computing: 3


## 6. Save Test Data

In [13]:
# Save test results
output_dir = Path("../data/raw")
output_dir.mkdir(parents=True, exist_ok=True)

if articles:
    # Save articles
    searcher.save_articles(articles, output_dir / "test_pubmed_articles.json")
    print(f"💾 Saved {len(articles)} test articles")
    
    # Save search results summary
    summary = {
        'timestamp': pd.Timestamp.now().isoformat(),
        'total_articles': len(articles),
        'search_results': search_results,
        'quality_metrics': {
            'articles_with_abstracts': int(df['has_abstract'].sum()),
            'articles_with_doi': int(df['has_doi'].sum()),
            'avg_abstract_length': float(df['abstract_length'].mean()),
            'avg_author_count': float(df['author_count'].mean())
        } if 'df' in locals() else {}
    }
    
    with open(output_dir / "pubmed_test_summary.json", 'w') as f:
        json.dump(summary, f, indent=2, default=str)
    
    print("💾 Saved test summary")
else:
    print("⏭️  No data to save")

[32m2025-09-18 21:11:33[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36msave_articles[0m:[36m346[0m - [1mSaved 87 articles to ../data/raw/test_pubmed_articles.json[0m


💾 Saved 87 test articles
💾 Saved test summary


## 7. Test Rate Limiting

In [14]:
# Test rate limiting with multiple requests
async def test_rate_limiting():
    print("🕐 Testing rate limiting...")

    import time

    rate_test_results = []
    start_time = time.time()

    for i in range(3):  # Test 3 requests
        request_start = time.time()
        
        try:
            test_pmids = await searcher.search_recent_articles(
                query_terms=["test"],
                days_back=30,
                max_results=2
            )
            
            request_time = time.time() - request_start
            rate_test_results.append({
                'request': i+1,
                'time': request_time,
                'pmids_found': len(test_pmids),
                'success': True
            })
            
            print(f"   Request {i+1}: {request_time:.2f}s, {len(test_pmids)} PMIDs")
            
        except Exception as e:
            rate_test_results.append({
                'request': i+1,
                'error': str(e),
                'success': False
            })
            print(f"   Request {i+1}: Failed - {e}")

    total_time = time.time() - start_time
    print(f"\nTotal time for {len(rate_test_results)} requests: {total_time:.2f}s")
    print(f"Average time per request: {total_time/len(rate_test_results):.2f}s")

    successful_requests = [r for r in rate_test_results if r['success']]
    print(f"Successful requests: {len(successful_requests)}/{len(rate_test_results)}")
    
    return rate_test_results

# Run the async function
rate_test_results = await test_rate_limiting()

[32m2025-09-18 21:11:33[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m_search_with_pymed[0m:[36m95[0m - [1mSearching PubMed with PyMed query: "test"[Abstract][0m


🕐 Testing rate limiting...


[32m2025-09-18 21:11:34[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m_search_with_pymed[0m:[36m105[0m - [1mFound 2 articles with PyMed[0m
[32m2025-09-18 21:11:34[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m_search_with_pymed[0m:[36m95[0m - [1mSearching PubMed with PyMed query: "test"[Abstract][0m


   Request 1: 0.82s, 2 PMIDs


[32m2025-09-18 21:11:36[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m_search_with_pymed[0m:[36m105[0m - [1mFound 2 articles with PyMed[0m
[32m2025-09-18 21:11:36[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m_search_with_pymed[0m:[36m95[0m - [1mSearching PubMed with PyMed query: "test"[Abstract][0m


   Request 2: 2.18s, 2 PMIDs


[32m2025-09-18 21:11:37[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m_search_with_pymed[0m:[36m105[0m - [1mFound 2 articles with PyMed[0m


   Request 3: 0.78s, 2 PMIDs

Total time for 3 requests: 3.78s
Average time per request: 1.26s
Successful requests: 3/3


## Next Steps

1. **Configure email**: Make sure you have a valid email in the configuration
2. **API key**: Consider getting a PubMed API key for higher rate limits
3. **Search optimization**: Fine-tune search terms based on IFC research areas
4. **Error handling**: Test how the system handles network issues, rate limits, etc.

## Common Issues
- **Email required**: NCBI requires a valid email address
- **Rate limiting**: Too many requests will get blocked
- **Network timeouts**: Large requests may timeout
- **XML parsing**: Malformed XML responses can cause errors

## Test libraries

our current PubMedSearcher class:

Uses direct E-utilities API calls (esearch and efetch)
Handles XML parsing manually with ElementTree
Implements rate limiting and batch processing
Extracts comprehensive metadata (MeSH terms, DOI, authors, etc.)


PyMed (Most Promising)
Advantages:

Clean, Pythonic API
Better query syntax support
Automatic handling of pagination and rate limiting
Built-in article parsing


> Our current implementation works well - successfully fetching articles as shown in the notebook

- Direct control - Fine-grained control over error handling and parsing
- Async support - implementation is properly async
Proven functionality - 

In [15]:
# Restart your notebook kernel first (Kernel -> Restart Kernel)
# Then run the imports again
from pubmed.searcher import PubMedSearcher
from utils.config import load_config

config = load_config()

# Test with direct API (current method)
searcher_direct = PubMedSearcher(config, use_pymed=False)
pmids_direct = await searcher_direct.search_recent_articles(["neuroscience"], max_results=5)

# Test with PyMed (enhanced method)
searcher_pymed = PubMedSearcher(config, use_pymed=True)
pmids_pymed = await searcher_pymed.search_recent_articles(["neuroscience"], max_results=5)

print(f"Direct API: {len(pmids_direct)} results")
print(f"PyMed: {len(pmids_pymed)} results")

[32m2025-09-18 21:11:41[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m_search_with_direct_api[0m:[36m123[0m - [1mSearching PubMed with direct API query: "neuroscience"[Abstract][0m
[32m2025-09-18 21:11:42[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m_search_with_direct_api[0m:[36m147[0m - [1mFound 5 articles with direct API[0m
[32m2025-09-18 21:11:42[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m__init__[0m:[36m64[0m - [1mPyMed backend enabled[0m
[32m2025-09-18 21:11:42[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m_search_with_pymed[0m:[36m95[0m - [1mSearching PubMed with PyMed query: "neuroscience"[Abstract][0m
[32m2025-09-18 21:11:42[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m_search_with_pymed[0m:[36m105[0m - [1mFound 5 articles with PyMed[0m


Direct API: 5 results
PyMed: 5 results


## 8. Advanced Search Comparison

Let's test more sophisticated search queries to see where PyMed really shines.

In [16]:
# Advanced Search Comparison - PyMed vs Direct API
import asyncio
from datetime import datetime, timedelta

async def test_advanced_searches():
    """Compare advanced search capabilities between direct API and PyMed"""
    
    # Define advanced search scenarios
    advanced_searches = [
        {
            'name': 'Date Range + Author + Keywords',
            'description': 'Articles from last 2 years by specific research groups with neuroscience keywords',
            'direct_query': '("2022/01/01"[Date - Publication] : "2024/12/31"[Date - Publication]) AND (neuroscience[Title/Abstract] OR "neural network"[Title/Abstract]) AND (Garcia[Author] OR Rodriguez[Author])',
            'pymed_query': '(("2022/01/01"[Date - Publication] : "2024/12/31"[Date - Publication])) AND ((neuroscience[Title/Abstract] OR "neural network"[Title/Abstract])) AND (Garcia[Author] OR Rodriguez[Author])'
        },
        {
            'name': 'MeSH Terms + Journal Filter',
            'description': 'Specific MeSH terms in high-impact journals',
            'direct_query': '("Brain"[Mesh] OR "Neurons"[Mesh]) AND ("Nature"[Journal] OR "Science"[Journal] OR "Cell"[Journal])',
            'pymed_query': '(("Brain"[Mesh] OR "Neurons"[Mesh])) AND (("Nature"[Journal] OR "Science"[Journal] OR "Cell"[Journal]))'
        },
        {
            'name': 'Complex Boolean Logic',
            'description': 'Multiple conditions with nested Boolean operators',
            'direct_query': '(("machine learning"[Title/Abstract] OR "artificial intelligence"[Title/Abstract]) AND ("medical imaging"[Title/Abstract] OR "radiology"[Title/Abstract])) NOT "review"[Publication Type]',
            'pymed_query': '(("machine learning"[Title/Abstract] OR "artificial intelligence"[Title/Abstract]) AND ("medical imaging"[Title/Abstract] OR "radiology"[Title/Abstract])) NOT "review"[Publication Type]'
        },
        {
            'name': 'Recent High-Impact Research',
            'description': 'Recent articles with specific impact criteria',
            'direct_query': '("2024/01/01"[Date - Publication] : "2024/12/31"[Date - Publication]) AND ("breakthrough"[Title/Abstract] OR "novel"[Title/Abstract]) AND "humans"[MeSH Terms]',
            'pymed_query': '(("2024/01/01"[Date - Publication] : "2024/12/31"[Date - Publication])) AND (("breakthrough"[Title/Abstract] OR "novel"[Title/Abstract])) AND ("humans"[MeSH Terms])'
        }
    ]
    
    results_comparison = {}
    
    for search in advanced_searches:
        print(f"\n🔬 Testing: {search['name']}")
        print(f"   Description: {search['description']}")
        
        # Test with Direct API
        try:
            searcher_direct = PubMedSearcher(config, use_pymed=False)
            # Override the query building for direct API
            direct_pmids = await test_direct_advanced_query(searcher_direct, search['direct_query'])
            direct_count = len(direct_pmids)
        except Exception as e:
            print(f"   ❌ Direct API failed: {e}")
            direct_count = 0
            direct_pmids = []
        
        # Test with PyMed
        try:
            searcher_pymed = PubMedSearcher(config, use_pymed=True)
            pymed_pmids = await test_pymed_advanced_query(searcher_pymed, search['pymed_query'])
            pymed_count = len(pymed_pmids)
        except Exception as e:
            print(f"   ❌ PyMed failed: {e}")
            pymed_count = 0
            pymed_pmids = []
        
        results_comparison[search['name']] = {
            'direct_api': {'count': direct_count, 'pmids': direct_pmids[:5]},  # Store first 5 for comparison
            'pymed': {'count': pymed_count, 'pmids': pymed_pmids[:5]},
            'query': search['direct_query']
        }
        
        print(f"   📊 Results: Direct API: {direct_count}, PyMed: {pymed_count}")
        
        # Brief pause between searches
        await asyncio.sleep(1)
    
    return results_comparison

async def test_direct_advanced_query(searcher, query, max_results=20):
    """Test advanced query with direct API"""
    # Parameters for advanced search
    params = {
        'db': 'pubmed',
        'term': query,
        'retmax': max_results,
        'retmode': 'xml',
        'tool': 'ubmi-ifc-podcast',
        'email': searcher.email,
        'sort': 'relevance'
    }
    
    if searcher.api_key:
        params['api_key'] = searcher.api_key
    
    url = f"{searcher.base_url}esearch.fcgi"
    
    import aiohttp
    async with aiohttp.ClientSession() as session:
        try:
            async with session.get(url, params=params) as response:
                if response.status == 200:
                    xml_content = await response.text()
                    pmids = searcher._parse_search_results(xml_content)
                    return pmids[:max_results]
                else:
                    searcher.logger.error(f"Advanced search failed with status {response.status}")
                    return []
        except Exception as e:
            searcher.logger.error(f"Error in advanced search: {str(e)}")
            return []

async def test_pymed_advanced_query(searcher, query, max_results=20):
    """Test advanced query with PyMed"""
    try:
        # PyMed handles complex queries better
        results = searcher.pymed.query(query, max_results=max_results)
        pmids = [result.pubmed_id for result in results if result.pubmed_id]
        return pmids[:max_results]
    except Exception as e:
        searcher.logger.error(f"PyMed advanced search failed: {e}")
        return []

# Run the advanced search comparison
print("🚀 Starting Advanced Search Comparison...")
advanced_results = await test_advanced_searches()

🚀 Starting Advanced Search Comparison...

🔬 Testing: Date Range + Author + Keywords
   Description: Articles from last 2 years by specific research groups with neuroscience keywords


[32m2025-09-18 21:15:40[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m__init__[0m:[36m64[0m - [1mPyMed backend enabled[0m


   📊 Results: Direct API: 20, PyMed: 20

🔬 Testing: MeSH Terms + Journal Filter
   Description: Specific MeSH terms in high-impact journals


[32m2025-09-18 21:15:43[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m__init__[0m:[36m64[0m - [1mPyMed backend enabled[0m


   📊 Results: Direct API: 20, PyMed: 20

🔬 Testing: Complex Boolean Logic
   Description: Multiple conditions with nested Boolean operators


[32m2025-09-18 21:15:45[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m__init__[0m:[36m64[0m - [1mPyMed backend enabled[0m


   📊 Results: Direct API: 20, PyMed: 20

🔬 Testing: Recent High-Impact Research
   Description: Recent articles with specific impact criteria


[32m2025-09-18 21:15:49[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m__init__[0m:[36m64[0m - [1mPyMed backend enabled[0m


   📊 Results: Direct API: 20, PyMed: 20


### Detailed article comparison

In [17]:
# Analyze and display the advanced search results
print("\n📈 Advanced Search Results Analysis:")
print("=" * 60)

total_direct = 0
total_pymed = 0

for search_name, results in advanced_results.items():
    direct_count = results['direct_api']['count']
    pymed_count = results['pymed']['count']
    
    total_direct += direct_count
    total_pymed += pymed_count
    
    print(f"\n🔍 {search_name}:")
    print(f"   Direct API: {direct_count:>3} articles")
    print(f"   PyMed:      {pymed_count:>3} articles")
    
    # Show query complexity
    query_length = len(results['query'])
    print(f"   Query complexity: {query_length} characters")
    
    # Performance indicator
    if pymed_count > direct_count:
        print(f"   🏆 PyMed found {pymed_count - direct_count} more articles")
    elif direct_count > pymed_count:
        print(f"   🏆 Direct API found {direct_count - pymed_count} more articles")
    else:
        print(f"   🤝 Both methods found the same number")

print(f"\n📊 Overall Summary:")
print(f"   Total Direct API results: {total_direct}")
print(f"   Total PyMed results:      {total_pymed}")
print(f"   Difference: {abs(total_pymed - total_direct)} articles")

if total_pymed > total_direct:
    print(f"   🏆 PyMed performed better overall (+{total_pymed - total_direct} articles)")
elif total_direct > total_pymed:
    print(f"   🏆 Direct API performed better overall (+{total_direct - total_pymed} articles)")
else:
    print(f"   🤝 Both methods performed equally well")


📈 Advanced Search Results Analysis:

🔍 Date Range + Author + Keywords:
   Direct API:  20 articles
   PyMed:       20 articles
   Query complexity: 182 characters
   🤝 Both methods found the same number

🔍 MeSH Terms + Journal Filter:
   Direct API:  20 articles
   PyMed:       20 articles
   Query complexity: 99 characters
   🤝 Both methods found the same number

🔍 Complex Boolean Logic:
   Direct API:  20 articles
   PyMed:       20 articles
   Query complexity: 185 characters
   🤝 Both methods found the same number

🔍 Recent High-Impact Research:
   Direct API:  20 articles
   PyMed:       20 articles
   Query complexity: 158 characters
   🤝 Both methods found the same number

📊 Overall Summary:
   Total Direct API results: 80
   Total PyMed results:      80
   Difference: 0 articles
   🤝 Both methods performed equally well


In [18]:
# Test PyMed's unique advanced features
async def test_pymed_unique_features():
    """Test features that are easier/better with PyMed"""
    
    print("🌟 Testing PyMed's Unique Advanced Features:")
    
    # Feature 1: Complex date ranges with better syntax
    print("\n1️⃣ Complex Date Range Query:")
    date_query = '(("2023/01/01"[Date - Create] : "2024/12/31"[Date - Create])) AND ((neuroscience[Title/Abstract] OR "brain imaging"[Title/Abstract]))'
    
    try:
        searcher_pymed = PubMedSearcher(config, use_pymed=True)
        results = searcher_pymed.pymed.query(date_query, max_results=10)
        
        articles_info = []
        for article in results:
            if article.pubmed_id:
                articles_info.append({
                    'pmid': article.pubmed_id,
                    'title': article.title[:80] + "..." if article.title and len(article.title) > 80 else article.title,
                    'pub_date': str(article.publication_date) if article.publication_date else 'Unknown',
                    'journal': article.journal[:30] + "..." if article.journal and len(article.journal) > 30 else article.journal,
                    'authors': len(article.authors) if article.authors else 0
                })
        
        print(f"   Found {len(articles_info)} articles with complex date range")
        
        # Display sample results
        for i, article in enumerate(articles_info[:3]):
            print(f"   📄 Article {i+1}:")
            print(f"      PMID: {article['pmid']}")
            print(f"      Title: {article['title']}")
            print(f"      Journal: {article['journal']}")
            print(f"      Date: {article['pub_date']}")
            print(f"      Authors: {article['authors']}")
            
    except Exception as e:
        print(f"   ❌ PyMed advanced date query failed: {e}")
    
    # Feature 2: Author-focused search with better results
    print("\n2️⃣ Advanced Author & Topic Search:")
    author_topic_query = '((Garcia[Author] OR Rodriguez[Author] OR Martinez[Author])) AND ((neuroscience[Title/Abstract] OR "brain"[Title/Abstract])) AND ("2022/01/01"[Date - Publication] : "2024/12/31"[Date - Publication])'
    
    try:
        results = searcher_pymed.pymed.query(author_topic_query, max_results=15)
        
        author_articles = []
        for article in results:
            if article.pubmed_id and article.authors:
                # Extract relevant authors
                relevant_authors = [author for author in article.authors 
                                  if any(name in str(author) for name in ['Garcia', 'Rodriguez', 'Martinez'])]
                
                if relevant_authors:
                    author_articles.append({
                        'pmid': article.pubmed_id,
                        'title': article.title[:60] + "..." if article.title and len(article.title) > 60 else article.title,
                        'relevant_authors': relevant_authors[:2],  # First 2 relevant authors
                        'total_authors': len(article.authors),
                        'journal': article.journal
                    })
        
        print(f"   Found {len(author_articles)} articles by target authors")
        
        # Display results with author information
        for i, article in enumerate(author_articles[:3]):
            print(f"   👥 Article {i+1}:")
            print(f"      PMID: {article['pmid']}")
            print(f"      Title: {article['title']}")
            print(f"      Target Authors: {', '.join(map(str, article['relevant_authors']))}")
            print(f"      Total Authors: {article['total_authors']}")
            print(f"      Journal: {article['journal']}")
            
    except Exception as e:
        print(f"   ❌ PyMed author search failed: {e}")
    
    print("\n✨ PyMed Advanced Features Summary:")
    print("   • Better handling of complex Boolean queries")
    print("   • More intuitive date range syntax")
    print("   • Automatic handling of pagination")
    print("   • Built-in article object parsing")
    print("   • Better error handling for malformed queries")

# Run PyMed unique features test
await test_pymed_unique_features()

[32m2025-09-18 21:21:53[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m__init__[0m:[36m64[0m - [1mPyMed backend enabled[0m


🌟 Testing PyMed's Unique Advanced Features:

1️⃣ Complex Date Range Query:
   Found 10 articles with complex date range
   📄 Article 1:
      PMID: 39741190
29853555
33058762
35285522
32823064
33058767
28407470
31036888
32358930
25673764
28955599
31684259
30839624
29379121
35224847
31961223
36790969
36067379
37023203
37316719
29535413
37019883
28628101
31591577
32989293
16229641
17222020
20734031
29892069
17639353
15764662
26299612
33433612
24084410
27905576
30662753
34123511
24019106
18568041
27696676
35224188
26186260
33806065
28788920
35508706
31384010
35668147
36988556
37138086
37652007
36253568
31474560
29427611
32667302
26556561
30863275
37082944
28419111
31304459
24970081
      Title: Vibrational fiber photometry: label-free and reporter-free minimally invasive Ra...
      Journal: Nature methods
      Date: 2025-01-01
      Authors: 20
   📄 Article 2:
      PMID: 39741143
21692661
19829373
26322269
29663003
21483974
29874052
23614556
28859250
32015505
32903010
35373231
32328424

In [19]:
# Performance and usability comparison
def compare_implementations():
    """Compare the two implementations from a developer perspective"""
    
    print("🔧 Implementation Comparison:")
    print("=" * 50)
    
    comparison_table = {
        "Feature": [
            "Query Complexity",
            "Date Range Handling", 
            "Error Handling",
            "Result Parsing",
            "Rate Limiting",
            "Boolean Logic",
            "MeSH Term Support",
            "Author Search",
            "Async Support",
            "Code Simplicity"
        ],
        "Direct API": [
            "Manual query building",
            "Manual date formatting", 
            "Manual error checking",
            "Manual XML parsing",
            "Manual implementation",
            "Basic support",
            "Full support",
            "Manual formatting",
            "Full async support",
            "More complex code"
        ],
        "PyMed": [
            "Built-in query validation",
            "Intuitive date syntax",
            "Built-in error handling", 
            "Automatic parsing",
            "Built-in rate limiting",
            "Advanced Boolean support",
            "Full support",
            "Enhanced author queries",
            "Sync (can be wrapped)",
            "Cleaner, simpler code"
        ]
    }
    
    # Display comparison
    for i, feature in enumerate(comparison_table["Feature"]):
        print(f"\n📋 {feature}:")
        print(f"   Direct API: {comparison_table['Direct API'][i]}")
        print(f"   PyMed:      {comparison_table['PyMed'][i]}")

compare_implementations()

print("\n🎯 Recommendation:")
print("   • Use PyMed for: Complex queries, rapid prototyping, simpler code")
print("   • Use Direct API for: Maximum control, custom parsing, fully async workflows")
print("   • Hybrid approach: Use both based on query complexity and requirements")

🔧 Implementation Comparison:

📋 Query Complexity:
   Direct API: Manual query building
   PyMed:      Built-in query validation

📋 Date Range Handling:
   Direct API: Manual date formatting
   PyMed:      Intuitive date syntax

📋 Error Handling:
   Direct API: Manual error checking
   PyMed:      Built-in error handling

📋 Result Parsing:
   Direct API: Manual XML parsing
   PyMed:      Automatic parsing

📋 Rate Limiting:
   Direct API: Manual implementation
   PyMed:      Built-in rate limiting

📋 Boolean Logic:
   Direct API: Basic support
   PyMed:      Advanced Boolean support

📋 MeSH Term Support:
   Direct API: Full support
   PyMed:      Full support

📋 Author Search:
   Direct API: Manual formatting
   PyMed:      Enhanced author queries

📋 Async Support:
   Direct API: Full async support
   PyMed:      Sync (can be wrapped)

📋 Code Simplicity:
   Direct API: More complex code
   PyMed:      Cleaner, simpler code

🎯 Recommendation:
   • Use PyMed for: Complex queries, rapid pro

In [20]:
# Detailed Article Information Comparison
async def compare_article_details():
    """Compare the detailed information retrieved by both methods"""
    
    print("🔍 Comparing Article Detail Retrieval Methods:")
    print("=" * 60)
    
    # Use a simple search to get some PMIDs for testing
    test_query = ["neuroscience", "brain"]
    
    # Get PMIDs from both methods
    searcher_direct = PubMedSearcher(config, use_pymed=False)
    searcher_pymed = PubMedSearcher(config, use_pymed=True)
    
    # Search with both methods
    print("🔎 Getting PMIDs from both methods...")
    pmids_direct = await searcher_direct.search_recent_articles(test_query, max_results=5)
    pmids_pymed = await searcher_pymed.search_recent_articles(test_query, max_results=5)
    
    print(f"Direct API found: {len(pmids_direct)} PMIDs")
    print(f"PyMed found: {len(pmids_pymed)} PMIDs")
    
    # Use the first few PMIDs for detailed comparison
    test_pmids = pmids_direct[:3] if pmids_direct else pmids_pymed[:3]
    
    if not test_pmids:
        print("❌ No PMIDs found for comparison")
        return
    
    print(f"\n📋 Testing article details for PMIDs: {test_pmids}")
    
    # Method 1: Direct API with our custom parser
    print("\n" + "="*50)
    print("📄 METHOD 1: Direct API + Custom Parsing")
    print("="*50)
    
    try:
        articles_direct = await searcher_direct.fetch_article_details(test_pmids)
        print(f"✅ Retrieved {len(articles_direct)} articles via Direct API")
        
        for i, article in enumerate(articles_direct):
            print(f"\n🔬 Article {i+1} (PMID: {article.pmid}):")
            print(f"   Title: {article.title[:100]}..." if len(article.title) > 100 else f"   Title: {article.title}")
            print(f"   Authors ({len(article.authors)}): {', '.join(article.authors[:3])}{'...' if len(article.authors) > 3 else ''}")
            print(f"   Journal: {article.journal}")
            print(f"   Publication Date: {article.publication_date}")
            print(f"   DOI: {article.doi or 'Not available'}")
            print(f"   Abstract Length: {len(article.abstract) if article.abstract else 0} chars")
            print(f"   MeSH Terms ({len(article.mesh_terms) if article.mesh_terms else 0}): {article.mesh_terms[:3] if article.mesh_terms else 'None'}")
            print(f"   Keywords: {article.keywords or 'Not available'}")
            
    except Exception as e:
        print(f"❌ Direct API detail retrieval failed: {e}")
        articles_direct = []
    
    # Method 2: PyMed with built-in parsing
    print("\n" + "="*50)
    print("📄 METHOD 2: PyMed Built-in Parsing")
    print("="*50)
    
    try:
        # Use PyMed to get full article details directly
        query_for_pmids = " OR ".join([f"{pmid}[PMID]" for pmid in test_pmids])
        pymed_results = searcher_pymed.pymed.query(query_for_pmids, max_results=len(test_pmids))
        
        print(f"✅ Retrieved {len(list(pymed_results))} articles via PyMed")
        
        # Re-query to get fresh iterator
        pymed_results = searcher_pymed.pymed.query(query_for_pmids, max_results=len(test_pmids))
        
        for i, article in enumerate(pymed_results):
            print(f"\n🔬 Article {i+1} (PMID: {article.pubmed_id}):")
            print(f"   Title: {(article.title[:100] + '...') if article.title and len(article.title) > 100 else article.title}")
            
            # Handle authors differently for PyMed
            if article.authors:
                author_names = []
                for author in article.authors:
                    if hasattr(author, 'firstname') and hasattr(author, 'lastname'):
                        name = f"{author.firstname or ''} {author.lastname or ''}".strip()
                        author_names.append(name)
                    else:
                        author_names.append(str(author))
                print(f"   Authors ({len(author_names)}): {', '.join(author_names[:3])}{'...' if len(author_names) > 3 else ''}")
            else:
                print("   Authors: None")
            
            print(f"   Journal: {article.journal or 'Not available'}")
            print(f"   Publication Date: {article.publication_date or 'Not available'}")
            print(f"   DOI: {article.doi or 'Not available'}")
            print(f"   Abstract Length: {len(article.abstract) if article.abstract else 0} chars")
            
            # PyMed provides additional fields
            if hasattr(article, 'keywords') and article.keywords:
                clean_keywords = [k for k in article.keywords if k is not None]
                print(f"   Keywords ({len(clean_keywords)}): {clean_keywords[:3] if clean_keywords else 'None'}")
            else:
                print("   Keywords: Not available")
            
            if hasattr(article, 'mesh') and article.mesh:
                mesh_terms = list(article.mesh) if article.mesh else []
                print(f"   MeSH Terms ({len(mesh_terms)}): {mesh_terms[:3] if mesh_terms else 'None'}")
            else:
                print("   MeSH Terms: Not available")
                
            # PyMed specific additional fields
            if hasattr(article, 'publication_type'):
                print(f"   Publication Type: {article.publication_type or 'Not available'}")
            
            if hasattr(article, 'conclusions'):
                print(f"   Conclusions: {('Available' if article.conclusions else 'Not available')}")
                
    except Exception as e:
        print(f"❌ PyMed detail retrieval failed: {e}")
    
    return articles_direct

# Run the detailed comparison
articles_comparison = await compare_article_details()

[32m2025-09-18 21:22:41[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m__init__[0m:[36m64[0m - [1mPyMed backend enabled[0m
[32m2025-09-18 21:22:41[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m_search_with_direct_api[0m:[36m123[0m - [1mSearching PubMed with direct API query: "neuroscience"[Abstract] OR "brain"[Abstract][0m


🔍 Comparing Article Detail Retrieval Methods:
🔎 Getting PMIDs from both methods...


[32m2025-09-18 21:22:42[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m_search_with_direct_api[0m:[36m147[0m - [1mFound 5 articles with direct API[0m
[32m2025-09-18 21:22:42[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m_search_with_pymed[0m:[36m95[0m - [1mSearching PubMed with PyMed query: "neuroscience"[Abstract] OR "brain"[Abstract][0m
[32m2025-09-18 21:22:43[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m_search_with_pymed[0m:[36m105[0m - [1mFound 5 articles with PyMed[0m


Direct API found: 5 PMIDs
PyMed found: 5 PMIDs

📋 Testing article details for PMIDs: ['37938706', '39069198', '36599475']

📄 METHOD 1: Direct API + Custom Parsing


[32m2025-09-18 21:22:44[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36mfetch_article_details[0m:[36m195[0m - [1mRetrieved details for 3 articles[0m


✅ Retrieved 3 articles via Direct API

🔬 Article 1 (PMID: 37938706):
   Title: Functional neuroimaging as a catalyst for integrated neuroscience.
   Authors (3): Emily S Finn, Russell A Poldrack, James M Shine
   Journal: Nature
   Publication Date: 2023-Nov
   DOI: 10.1038/s41586-023-06670-9
   Abstract Length: 1169 chars
   MeSH Terms (7): ['Humans', 'Brain', 'Cognitive Neuroscience']
   Keywords: Not available

🔬 Article 2 (PMID: 39069198):
   Title: The golden age of integrative neuroscience? The brain joins the body in the latest renaissance of in...
   Authors (3): Thomas Ritz, André Schulz, Sahib Khalsa
   Journal: Biological psychology
   Publication Date: 2024-Oct
   DOI: 10.1016/j.biopsycho.2024.108851
   Abstract Length: 0 chars
   MeSH Terms (4): ['Humans', 'Interoception', 'Neurosciences']
   Keywords: Not available

🔬 Article 3 (PMID: 36599475):
   Title: Brain Trauma Imaging.
   Authors (2): Gérard N Bischof, Donna J Cross
   Journal: Journal of nuclear medicine : offici

In [21]:
# Detailed Field-by-Field Comparison
async def detailed_field_comparison():
    """Compare specific fields available from each method"""
    
    print("\n🔬 Field-by-Field Comparison:")
    print("=" * 60)
    
    # Get a single article for detailed inspection
    test_query = ["machine learning", "medical"]
    
    searcher_direct = PubMedSearcher(config, use_pymed=False)
    searcher_pymed = PubMedSearcher(config, use_pymed=True)
    
    # Get one PMID
    pmids = await searcher_direct.search_recent_articles(test_query, max_results=1)
    
    if not pmids:
        print("❌ No articles found for detailed comparison")
        return
    
    test_pmid = pmids[0]
    print(f"📄 Analyzing PMID: {test_pmid}")
    
    # Method 1: Direct API
    print(f"\n🔧 Direct API Method:")
    try:
        direct_articles = await searcher_direct.fetch_article_details([test_pmid])
        if direct_articles:
            article = direct_articles[0]
            direct_fields = {
                'PMID': article.pmid,
                'Title': len(article.title) if article.title else 0,
                'Abstract': len(article.abstract) if article.abstract else 0,
                'Authors': len(article.authors) if article.authors else 0,
                'Journal': bool(article.journal),
                'Publication Date': bool(article.publication_date),
                'DOI': bool(article.doi),
                'Keywords': len(article.keywords) if article.keywords else 0,
                'MeSH Terms': len(article.mesh_terms) if article.mesh_terms else 0,
                'Similarity Score': bool(article.similarity_score)
            }
            
            for field, value in direct_fields.items():
                print(f"   {field:15}: {value}")
        else:
            print("   ❌ No article retrieved")
            direct_fields = {}
            
    except Exception as e:
        print(f"   ❌ Error: {e}")
        direct_fields = {}
    
    # Method 2: PyMed
    print(f"\n🐍 PyMed Method:")
    try:
        query = f"{test_pmid}[PMID]"
        pymed_results = list(searcher_pymed.pymed.query(query, max_results=1))
        
        if pymed_results:
            article = pymed_results[0]
            
            # Count authors properly
            author_count = 0
            if article.authors:
                author_count = len(article.authors)
            
            # Count keywords properly
            keyword_count = 0
            if hasattr(article, 'keywords') and article.keywords:
                keyword_count = len([k for k in article.keywords if k is not None])
            
            # Count MeSH terms
            mesh_count = 0
            if hasattr(article, 'mesh') and article.mesh:
                mesh_count = len(list(article.mesh))
            
            pymed_fields = {
                'PMID': article.pubmed_id,
                'Title': len(article.title) if article.title else 0,
                'Abstract': len(article.abstract) if article.abstract else 0,
                'Authors': author_count,
                'Journal': bool(article.journal),
                'Publication Date': bool(article.publication_date),
                'DOI': bool(article.doi),
                'Keywords': keyword_count,
                'MeSH Terms': mesh_count,
                'Publication Type': bool(getattr(article, 'publication_type', None)),
                'Conclusions': bool(getattr(article, 'conclusions', None)),
                'Methods': bool(getattr(article, 'methods', None)),
                'Results': bool(getattr(article, 'results', None)),
                'Copyrights': bool(getattr(article, 'copyrights', None))
            }
            
            for field, value in pymed_fields.items():
                print(f"   {field:15}: {value}")
                
        else:
            print("   ❌ No article retrieved")
            pymed_fields = {}
            
    except Exception as e:
        print(f"   ❌ Error: {e}")
        pymed_fields = {}
    
    # Comparison summary
    if direct_fields and pymed_fields:
        print(f"\n📊 Comparison Summary:")
        print(f"   Direct API fields: {len(direct_fields)}")
        print(f"   PyMed fields: {len(pymed_fields)}")
        
        # Common fields comparison
        common_fields = set(direct_fields.keys()) & set(pymed_fields.keys())
        print(f"   Common fields: {len(common_fields)}")
        
        # PyMed exclusive fields
        pymed_exclusive = set(pymed_fields.keys()) - set(direct_fields.keys())
        if pymed_exclusive:
            print(f"   PyMed exclusive fields: {', '.join(pymed_exclusive)}")
        
        # Field-by-field comparison for common fields
        print(f"\n🔍 Field Value Comparison:")
        for field in sorted(common_fields):
            direct_val = direct_fields[field]
            pymed_val = pymed_fields[field]
            status = "✅" if direct_val == pymed_val else "⚠️"
            print(f"   {status} {field:15}: Direct={direct_val}, PyMed={pymed_val}")

# Run detailed field comparison
await detailed_field_comparison()

[32m2025-09-18 21:24:41[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m__init__[0m:[36m64[0m - [1mPyMed backend enabled[0m
[32m2025-09-18 21:24:41[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m_search_with_direct_api[0m:[36m123[0m - [1mSearching PubMed with direct API query: "machine learning"[Abstract] OR "medical"[Abstract][0m



🔬 Field-by-Field Comparison:


[32m2025-09-18 21:24:42[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m_search_with_direct_api[0m:[36m147[0m - [1mFound 1 articles with direct API[0m


📄 Analyzing PMID: 28212054

🔧 Direct API Method:


[32m2025-09-18 21:24:43[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36mfetch_article_details[0m:[36m195[0m - [1mRetrieved details for 1 articles[0m


   PMID           : 28212054
   Title          : 37
   Abstract       : 1360
   Authors        : 4
   Journal        : True
   Publication Date: True
   DOI            : True
   Keywords       : 0
   MeSH Terms     : 5
   Similarity Score: False

🐍 PyMed Method:
   PMID           : 28212054
14752178
18043385
18334393
20451814
17376650
8551980
23743802
18511683
17174012
18602482
22481907
23153689
11470218
18252317
5234703
20229900
17720704
   Title          : 37
   Abstract       : 1360
   Authors        : 4
   Journal        : True
   Publication Date: True
   DOI            : True
   Keywords       : 0
   MeSH Terms     : 0
   Publication Type: False
   Conclusions    : False
   Methods        : False
   Results        : False
   Copyrights     : False

📊 Comparison Summary:
   Direct API fields: 10
   PyMed fields: 14
   Common fields: 9
   PyMed exclusive fields: Publication Type, Results, Copyrights, Methods, Conclusions

🔍 Field Value Comparison:
   ✅ Abstract       : Direct=1360,

In [22]:
# Abstract Content Quality Comparison
async def compare_abstract_quality():
    """Compare the quality and completeness of abstracts retrieved"""
    
    print("\n📝 Abstract Content Quality Comparison:")
    print("=" * 60)
    
    # Get articles with substantial abstracts
    test_query = ["systematic review", "meta-analysis"]
    
    searcher_direct = PubMedSearcher(config, use_pymed=False)
    searcher_pymed = PubMedSearcher(config, use_pymed=True)
    
    # Get PMIDs
    pmids = await searcher_direct.search_recent_articles(test_query, max_results=3)
    
    if not pmids:
        print("❌ No articles found for abstract comparison")
        return
    
    print(f"📄 Comparing abstracts for PMIDs: {pmids}")
    
    for i, pmid in enumerate(pmids):
        print(f"\n{'='*20} ARTICLE {i+1} (PMID: {pmid}) {'='*20}")
        
        # Direct API abstract
        try:
            direct_articles = await searcher_direct.fetch_article_details([pmid])
            direct_abstract = direct_articles[0].abstract if direct_articles else ""
            
            print(f"🔧 Direct API Abstract ({len(direct_abstract)} chars):")
            if direct_abstract:
                # Show first 200 chars
                preview = direct_abstract[:200] + "..." if len(direct_abstract) > 200 else direct_abstract
                print(f"   {preview}")
                
                # Check for structured abstract markers
                structured_markers = ['BACKGROUND:', 'OBJECTIVE:', 'METHODS:', 'RESULTS:', 'CONCLUSION:', 
                                    'PURPOSE:', 'DESIGN:', 'SETTING:', 'PARTICIPANTS:']
                found_markers = [marker for marker in structured_markers if marker in direct_abstract.upper()]
                if found_markers:
                    print(f"   📋 Structured abstract markers found: {', '.join(found_markers)}")
            else:
                print("   ❌ No abstract retrieved")
                
        except Exception as e:
            print(f"   ❌ Direct API failed: {e}")
            direct_abstract = ""
        
        # PyMed abstract
        try:
            query = f"{pmid}[PMID]"
            pymed_results = list(searcher_pymed.pymed.query(query, max_results=1))
            pymed_abstract = pymed_results[0].abstract if pymed_results and pymed_results[0].abstract else ""
            
            print(f"\n🐍 PyMed Abstract ({len(pymed_abstract)} chars):")
            if pymed_abstract:
                # Show first 200 chars
                preview = pymed_abstract[:200] + "..." if len(pymed_abstract) > 200 else pymed_abstract
                print(f"   {preview}")
                
                # Check for structured abstract markers
                found_markers = [marker for marker in structured_markers if marker in pymed_abstract.upper()]
                if found_markers:
                    print(f"   📋 Structured abstract markers found: {', '.join(found_markers)}")
            else:
                print("   ❌ No abstract retrieved")
                
        except Exception as e:
            print(f"   ❌ PyMed failed: {e}")
            pymed_abstract = ""
        
        # Compare abstracts
        if direct_abstract and pymed_abstract:
            if direct_abstract == pymed_abstract:
                print(f"   ✅ Abstracts are identical")
            else:
                print(f"   ⚠️  Abstracts differ:")
                print(f"      Length difference: {len(pymed_abstract) - len(direct_abstract)} chars")
                
                # Check if one is a subset of the other
                if direct_abstract in pymed_abstract:
                    print(f"      Direct API abstract is subset of PyMed")
                elif pymed_abstract in direct_abstract:
                    print(f"      PyMed abstract is subset of Direct API")
                else:
                    print(f"      Abstracts have different content")
        
        print(f"   📊 Summary: Direct={len(direct_abstract)}, PyMed={len(pymed_abstract)}")

# Run abstract quality comparison
await compare_abstract_quality()

[32m2025-09-18 21:25:57[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m__init__[0m:[36m64[0m - [1mPyMed backend enabled[0m
[32m2025-09-18 21:25:57[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m_search_with_direct_api[0m:[36m123[0m - [1mSearching PubMed with direct API query: "systematic review"[Abstract] OR "meta-analysis"[Abstract][0m



📝 Abstract Content Quality Comparison:


[32m2025-09-18 21:25:58[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36m_search_with_direct_api[0m:[36m147[0m - [1mFound 3 articles with direct API[0m


📄 Comparing abstracts for PMIDs: ['26696565', '28396101', '36429915']



[32m2025-09-18 21:25:59[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36mfetch_article_details[0m:[36m195[0m - [1mRetrieved details for 1 articles[0m


🔧 Direct API Abstract (1238 chars):
   A systematic review and meta-analysis was performed to investigate the ability of simple measures of childhood obesity such as body mass index (BMI) to predict future obesity in adolescence and adulth...

🐍 PyMed Abstract (1238 chars):
   A systematic review and meta-analysis was performed to investigate the ability of simple measures of childhood obesity such as body mass index (BMI) to predict future obesity in adolescence and adulth...
   ✅ Abstracts are identical
   📊 Summary: Direct=1238, PyMed=1238



[32m2025-09-18 21:26:00[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36mfetch_article_details[0m:[36m195[0m - [1mRetrieved details for 1 articles[0m


🔧 Direct API Abstract (1646 chars):
   Androgenetic alopecia, or male pattern hair loss, is a hair loss disorder mediated by dihydrotestosterone, the potent form of testosterone. Currently, minoxidil and finasteride are Food and Drug Admin...

🐍 PyMed Abstract (1646 chars):
   Androgenetic alopecia, or male pattern hair loss, is a hair loss disorder mediated by dihydrotestosterone, the potent form of testosterone. Currently, minoxidil and finasteride are Food and Drug Admin...
   ⚠️  Abstracts differ:
      Length difference: 0 chars
      Abstracts have different content
   📊 Summary: Direct=1646, PyMed=1646



[32m2025-09-18 21:26:02[0m | [1mINFO[0m | [36mpubmed.searcher[0m:[36mfetch_article_details[0m:[36m195[0m - [1mRetrieved details for 1 articles[0m


🔧 Direct API Abstract (674 chars):
   (1) Objectives: Mindfulness-based interventions have been receiving more attention in research for children with attention deficit hyperactivity disorder (ADHD). This systematic review and meta-analys...
   📋 Structured abstract markers found: METHODS:, RESULTS:

🐍 PyMed Abstract (674 chars):
   (1) Objectives: Mindfulness-based interventions have been receiving more attention in research for children with attention deficit hyperactivity disorder (ADHD). This systematic review and meta-analys...
   📋 Structured abstract markers found: METHODS:, RESULTS:
   ✅ Abstracts are identical
   📊 Summary: Direct=674, PyMed=674


In [23]:
# Summary Comparison Table
def create_comparison_summary():
    """Create a comprehensive comparison summary"""
    
    print("\n📋 COMPREHENSIVE METHOD COMPARISON")
    print("=" * 80)
    
    comparison_data = {
        'Aspect': [
            'Search Speed',
            'Article Detail Retrieval',
            'Abstract Quality',
            'Author Information',
            'MeSH Terms',
            'Keywords',
            'DOI Retrieval',
            'Publication Types',
            'Date Handling', 
            'Error Handling',
            'Complex Queries',
            'Rate Limiting',
            'Code Complexity',
            'Documentation',
            'Maintenance'
        ],
        'Direct API': [
            'Fast (direct HTTP)',
            'Full XML parsing control',
            'Complete abstracts',
            'Manual parsing required',
            'Complete MeSH data',
            'Limited keyword extraction',
            'Full DOI support',
            'Manual extraction',
            'Manual date formatting',
            'Manual implementation',
            'Manual query building',
            'Manual implementation',
            'Higher complexity',
            'NCBI documentation',
            'More maintenance needed'
        ],
        'PyMed': [
            'Slightly slower (library overhead)',
            'Automatic parsing',
            'Complete abstracts',
            'Structured author objects',
            'Automatic MeSH extraction',
            'Built-in keyword handling',
            'Full DOI support',
            'Built-in extraction',
            'Automatic date parsing',
            'Built-in error handling',
            'Advanced query syntax',
            'Built-in rate limiting',
            'Lower complexity',
            'Library documentation',
            'Library maintained'
        ]
    }
    
    # Print comparison table
    for i, aspect in enumerate(comparison_data['Aspect']):
        print(f"\n🔍 {aspect}:")
        print(f"   Direct API: {comparison_data['Direct API'][i]}")
        print(f"   PyMed:      {comparison_data['PyMed'][i]}")
    
    print(f"\n🏆 RECOMMENDATIONS:")
    print(f"   • Use Direct API when: Maximum control, custom parsing, fully async workflows")
    print(f"   • Use PyMed when: Rapid development, complex queries, less maintenance")
    print(f"   • Hybrid approach: Use both based on specific needs")

create_comparison_summary()


📋 COMPREHENSIVE METHOD COMPARISON

🔍 Search Speed:
   Direct API: Fast (direct HTTP)
   PyMed:      Slightly slower (library overhead)

🔍 Article Detail Retrieval:
   Direct API: Full XML parsing control
   PyMed:      Automatic parsing

🔍 Abstract Quality:
   Direct API: Complete abstracts
   PyMed:      Complete abstracts

🔍 Author Information:
   Direct API: Manual parsing required
   PyMed:      Structured author objects

🔍 MeSH Terms:
   Direct API: Complete MeSH data
   PyMed:      Automatic MeSH extraction

🔍 Keywords:
   Direct API: Limited keyword extraction
   PyMed:      Built-in keyword handling

🔍 DOI Retrieval:
   Direct API: Full DOI support
   PyMed:      Full DOI support

🔍 Publication Types:
   Direct API: Manual extraction
   PyMed:      Built-in extraction

🔍 Date Handling:
   Direct API: Manual date formatting
   PyMed:      Automatic date parsing

🔍 Error Handling:
   Direct API: Manual implementation
   PyMed:      Built-in error handling

🔍 Complex Queries:
   