In [7]:
import sys
import os

# Add the src directory to Python path
project_root = os.path.dirname(os.path.abspath(''))
src_path = os.path.join(project_root, 'src')
if src_path not in sys.path:
    sys.path.insert(0, src_path)

print(f"Added to path: {src_path}")
print(f"Current working directory: {os.getcwd()}")

# Now import the pipeline
try:
    from pipeline.workflow import DatabaseExpansionPipeline
    print("‚úì Successfully imported DatabaseExpansionPipeline")
    
    # Test instantiation
    pipeline = DatabaseExpansionPipeline()
    print("‚úì Successfully created pipeline instance")
    
except Exception as e:
    print(f"‚úó Import error: {e}")
    print(f"Available in sys.path: {sys.path}")
    
    # Try to list what's actually available
    import os
    if os.path.exists(src_path):
        print(f"Contents of src directory:")
        for item in os.listdir(src_path):
            print(f"  - {item}")
        
        pipeline_path = os.path.join(src_path, 'pipeline')
        if os.path.exists(pipeline_path):
            print(f"Contents of pipeline directory:")
            for item in os.listdir(pipeline_path):
                print(f"  - {item}")
    else:
        print(f"src directory doesn't exist at: {src_path}")

Added to path: /home/santi/Projects/UBMI-IFC-Podcast/src
Current working directory: /home/santi/Projects/UBMI-IFC-Podcast/notebooks
‚úì Successfully imported DatabaseExpansionPipeline
‚úÖ Loaded en_core_web_sm
‚úÖ Loaded en_core_web_sm
‚úÖ Loaded es_core_news_sm
‚úì Successfully created pipeline instance
‚úÖ Loaded es_core_news_sm
‚úì Successfully created pipeline instance


## üîß Refactored Modular Structure

This notebook has been refactored to use a clean modular architecture. The code is now organized into:

- **`pdf_acquisition/`**: Download research papers (DirectDownloader, PyPaperBotWrapper)
- **`publication_management/`**: Handle BibTeX and database operations (BibTexManager, PublicationDatabase)
- **`text_extraction/`**: Extract text from PDF files (PDFTextExtractor)
- **`affiliation_mining/`**: Mine institutional affiliations (EnhancedAffiliationMiner, AffiliationClustering)  
- **`pubmed/`**: Search and retrieve PubMed articles (EnhancedPubmedSearcher)
- **`data_quality/`**: Keyword extraction and classification (KeywordExtractor, PublicationClassifier)
- **`pipeline/`**: Main workflow orchestration (DatabaseExpansionPipeline)

### Usage Examples

You can now use either:
1. **Individual modules** for specific tasks
2. **Pipeline class** for complete workflows

Below are examples of both approaches.

In [8]:
# Example 1: Using individual modules
print("üìñ Example 1: Using individual modules")

# Load publications using the database module
db_manager = PublicationDatabase()
publications = db_manager.load_publications('../data/raw/test_ifc_publications.json')
print(f"Loaded {len(publications)} publications using PublicationDatabase module")

# Create BibTeX using the bibtex module  
bibtex_manager = BibTexManager()
bibtex_file = bibtex_manager.create_bibtex_from_publications(
    publications, 
    '../data/processed/modular_example.bib'
)
print(f"Created BibTeX file: {bibtex_file}")

print("\n" + "="*60 + "\n")

# Example 2: Using the complete pipeline
print("üöÄ Example 2: Using the complete pipeline")
print("This would run the entire workflow:")
print("1. Load existing publications")
print("2. Mine affiliations from PDFs") 
print("3. Search PubMed with discovered affiliations")
print("4. Merge and deduplicate databases")
print("5. Generate final BibTeX and reports")

# Uncomment the following line to run the complete pipeline:
# final_db, report = pipeline.run_complete_pipeline_with_review(
#     initial_json_path='../data/raw/all_ifc_publications.json',
#     pdf_dir='../papers/downloaded',
#     output_dir='../data/processed'
# )

üìñ Example 1: Using individual modules


NameError: name 'PublicationDatabase' is not defined

---

## üìö Original Implementation (Legacy Code)

The code below represents the original notebook implementation that has been refactored into the modular structure above. This code is preserved for reference and comparison purposes.

**Note**: You can now replace the following lengthy code blocks with simple module imports and function calls as shown in the examples above.

# Building IFC Publications Database

This notebook implements multiple strategies to build a comprehensive database of Instituto de Fisiolog√≠a Celular publications:

1. **PDF Acquisition**: Sci-Hub integration + BibTeX export for Zotero
2. **Affiliation Mining**: Extract all variations of institute names from existing PDFs
3. **PubMed Search Strategy**: Use discovered affiliations to find more papers
4. **Database Expansion**: Automated workflow to grow the collection

In [None]:
import json
import requests
import re
import os
import time
from urllib.parse import quote
from pathlib import Path
import pandas as pd
from collections import Counter
import bibtexparser
from bibtexparser.bwriter import BibTexWriter
from bibtexparser.bibdatabase import BibDatabase
from pypdf import PdfReader
import warnings
warnings.filterwarnings('ignore')

## 1. Load Existing Publications Data

In [None]:
# Load your existing publications
with open('../data/raw/all_ifc_publications.json', 'r', encoding='utf-8') as f:
    publications = json.load(f)

print(f"Loaded {len(publications)} publications")
print("Sample publication:")
print(json.dumps(publications[0], indent=2, ensure_ascii=False))

Loaded 404 publications
Sample publication:
{
  "title": "An automatic representation of peptides for effective antimicrobial activity classification",
  "authors": "Beltran, J. A., Del Rio, G., & Brizuela, C. A.",
  "journal": "Computational and Structural Biotechnology Journal, 18, 455?463",
  "year": 2020,
  "doi": "10.1016/j.csbj.2020.02.002",
  "pubmed_id": "32180904",
  "ifc_url": "https://www.ifc.unam.mx/publicacion.php?ut=000607742800020",
  "abstract": "ABSTRACTAntimicrobial peptides (AMPs) are a promising alternative to small-molecules-based antibiotics. These peptides are part of most living organisms' innate defense system. In order to computationally identify new AMPs within the peptides these organisms produce, an automatic AMP/non-AMP classifier is required. In order to have an efficient classifier, a set of robust features that can capture what differentiates an AMP from another that is not, has to be selected. However, the number of candidate descriptors is large (in t

## 2. PDF Acquisition Strategy

### Option A: Sci-Hub Integration (own implementation)

> ‚ö†Ô∏è USE OPTION C

- Not tested for CAPTCHAs
- Use method B or C

In [None]:
def direct_doi_download(dois, output_dir='../papers/downloaded/direct'):
    """Directly download papers from Sci-Hub using DOIs"""
    os.makedirs(output_dir, exist_ok=True)
    
    # List of Sci-Hub mirrors to try
    mirrors = [
        "https://sci-hub.se/",
        "https://sci-hub.st/",
        "https://sci-hub.ru/",
        # Add more mirrors as needed
    ]
    
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    })
    
    success_count = 0
    
    for doi in dois:
        print(f"Downloading DOI: {doi}")
        
        for mirror in mirrors:
            try:
                url = f"{mirror}{doi}"
                response = session.get(url, timeout=10)
                
                if response.status_code == 200:
                    # Check if response is PDF
                    if 'application/pdf' in response.headers.get('content-type', ''):
                        # Save PDF
                        filename = f"{doi.replace('/', '_')}.pdf"
                        filepath = os.path.join(output_dir, filename)
                        
                        with open(filepath, 'wb') as f:
                            f.write(response.content)
                            
                        print(f"‚úÖ Downloaded to {filepath}")
                        success_count += 1
                        break  # Move to next DOI after successful download
                    else:
                        # Handle HTML response (Sci-Hub page)
                        # You'd need a more sophisticated parser to extract the PDF link from the HTML
                        pass
                        
            except Exception as e:
                print(f"Failed with {mirror}: {e}")
                continue
                
        time.sleep(2)  # Be respectful
    
    print(f"\nDownload summary: {success_count}/{len(dois)} papers downloaded")
    return success_count

# Usage example
print("\nüìÑ Trying direct download approach...")
direct_doi_download(sample_dois)

### Option B: BibTeX Export for Zotero

> ‚ö†Ô∏è USE OPTION C

> Use one of the multiple zotero -> sci-hub plugins

In [None]:
def create_bibtex_from_publications(publications, output_file='../data/processed/all_ifc_publications.bib'):
    """Convert JSON publications to BibTeX format for Zotero import"""
    
    db = BibDatabase()
    entries = []
    
    def format_authors_for_bibtex(author_string):
        """Convert author string to proper BibTeX format"""
        if not author_string:
            return "Unknown"
        
        # Split by commas and clean each author
        authors = [author.strip() for author in author_string.split(',')]
        
        # Group authors (assuming they come in pairs: LastName, FirstName)
        formatted_authors = []
        i = 0
        while i < len(authors):
            if i + 1 < len(authors):
                # Check if next item looks like a first name (short, no hyphens typically)
                next_item = authors[i + 1].strip()
                if (len(next_item) <= 3 or 
                    (len(next_item.split()) == 1 and '.' in next_item) or
                    re.match(r'^[A-Z]\.?$', next_item)):
                    # This is likely a first name/initial
                    last_name = authors[i].strip()
                    first_name = next_item
                    formatted_authors.append(f"{last_name}, {first_name}")
                    i += 2
                else:
                    # This is likely a full name or last name only
                    formatted_authors.append(authors[i].strip())
                    i += 1
            else:
                # Last author, no pair
                formatted_authors.append(authors[i].strip())
                i += 1
        
        # Join with " and " for BibTeX format
        return " and ".join(formatted_authors)
    
    for i, pub in enumerate(publications):
        # Create a unique citation key
        first_author = pub['authors'].split(',')[0].strip() if pub['authors'] else 'Unknown'
        first_author_clean = re.sub(r'[^a-zA-Z]', '', first_author)
        citation_key = f"{first_author_clean}{pub['year']}_ifc_{i}"
        
        # Format authors properly for BibTeX
        formatted_authors = format_authors_for_bibtex(pub['authors'])
        
        entry = {
            'ENTRYTYPE': 'article',
            'ID': citation_key,
            'title': pub['title'],
            'author': formatted_authors,  # Now properly formatted
            'journal': pub['journal'],
            'year': str(pub['year']),
            'abstract': pub.get('abstract', ''),
            'url': pub.get('ifc_url', ''),
            'note': 'Instituto de Fisiolog√≠a Celular, UNAM'
        }
        
        if pub.get('doi'):
            entry['doi'] = pub['doi']
            
        if pub.get('pubmed_id'):
            entry['pmid'] = pub['pubmed_id']
            
        entries.append(entry)
    
    db.entries = entries
    
    # Write to file
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    writer = BibTexWriter()
    
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(writer.write(db))
    
    print(f"üìö Created BibTeX file with {len(entries)} entries: {output_file}")
    print("Import this file into Zotero to download PDFs automatically")
    
    # Show sample formatted authors for verification
    print("\nüîç Sample author formatting:")
    for i, entry in enumerate(entries[:3]):
        print(f"{i+1}. Original: {publications[i]['authors']}")
        print(f"   BibTeX:   {entry['author']}")
    
    return output_file

# Test the improved function
print("üîß Testing improved BibTeX creation...")
bibtex_file = create_bibtex_from_publications(publications)
print(f"\nBibTeX file created at: {bibtex_file}")

# Let's also check the actual BibTeX content
print("\nüìÑ Sample BibTeX entries:")
with open(bibtex_file, 'r', encoding='utf-8') as f:
    content = f.read()
    # Show first entry
    first_entry_end = content.find('\n}\n') + 3
    print(content[:first_entry_end])

üîß Testing improved BibTeX creation...
üìö Created BibTeX file with 404 entries: ../data/processed/all_ifc_publications.bib
Import this file into Zotero to download PDFs automatically

üîç Sample author formatting:
1. Original: Beltran, J. A., Del Rio, G., & Brizuela, C. A.
   BibTeX:   Beltran and J. A. and Del Rio, G. and & Brizuela and C. A.
2. Original: Mart√≠nez?Gonz√°lez, K., Islas?Hern√°ndez, A., Mart√≠nez?Ezquerro, J. D., Berm√∫dez?Rattoni, F., & Garcia?delaTorre, P.
   BibTeX:   Mart√≠nez?Gonz√°lez, K. and Islas?Hern√°ndez, A. and Mart√≠nez?Ezquerro and J. D. and Berm√∫dez?Rattoni, F. and & Garcia?delaTorre, P.
3. Original: Yin, W., Cerda-Hern√°ndez, N., Castillo-Morales, A., Ruiz-Tejada-Segura, M. L., Monz√≥n-Sandoval, J., Moreno-Castilla, P., ? Guti√©rrez, H.
   BibTeX:   Yin, W. and Cerda-Hern√°ndez, N. and Castillo-Morales, A. and Ruiz-Tejada-Segura and M. L. and Monz√≥n-Sandoval, J. and Moreno-Castilla, P. and ? Guti√©rrez, H.

BibTeX file created at: ../data/processe

### Option C: PyPaperBot

- [Repo](https://github.com/ferru97/PyPaperBot)

- Download papers given a query
- Download papers given paper's DOIs
- Generate Bibtex of the downloaded paper
- Filter downloaded paper by year, journal and citations number

#### Key Features:

- Multiple Download Methods: DOI-based downloads (most reliable)
- Google Scholar queries
- BibTeX-only generation
- Flexible Modes: Download PDFs only, BibTeX only, or both
- IFC-Specific Queries: Pre-configured searches for the institute
- Deduplication: Automatic removal of duplicate downloads
- Rate Limiting: Respectful delays between requests

In [None]:
# pip install PyPaperBot undetected-chromedriver

Note: you may need to restart the kernel to use updated packages.


Chrome Installation Check

(required by undetected_chromedriver):

In [None]:
def check_chrome_installed():
    """Check if Chrome/Chromium is installed on the system"""
    chrome_paths = [
        "google-chrome",
        "chromium-browser",
        "chromium",
        "/usr/bin/google-chrome",
        "/usr/bin/chromium-browser"
    ]
    
    for path in chrome_paths:
        try:
            result = subprocess.run(["which", path], capture_output=True, text=True)
            if result.returncode == 0:
                print(f"‚úÖ Chrome/Chromium found at: {result.stdout.strip()}")
                return True
        except Exception:
            continue
    
    print("‚ùå Chrome/Chromium not found. Please install it for PyPaperBot to work properly.")
    print("   On Ubuntu/Debian: sudo apt install chromium-browser")
    print("   On Fedora: sudo dnf install chromium")
    return False

# Check if Chrome is installed
check_chrome_installed()

‚ùå Chrome/Chromium not found. Please install it for PyPaperBot to work properly.
   On Ubuntu/Debian: sudo apt install chromium-browser
   On Fedora: sudo dnf install chromium


False

In [None]:
import subprocess
import os
import json

def download_with_pypaperbot(dois=None, output_dir='../papers/downloaded', 
                           min_year=None, mode=2, use_doi_filename=True):
    """
    Download papers using PyPaperBot command-line interface with dependency checking
    
    Args:
        dois: List of DOIs or None
        output_dir: Where to save outputs
        min_year: Minimum publication year
        mode: 0=BibTeX only, 1=PDF only, 2=both
        use_doi_filename: Use DOI as filename instead of paper title
    """
    # Check for required dependencies
    try:
        import importlib
        if importlib.util.find_spec("undetected_chromedriver") is None:
            print("Installing missing dependency: undetected-chromedriver")
            subprocess.check_call([sys.executable, "-m", "pip", "install", "undetected-chromedriver"])
            print("Dependency installed successfully")
    except Exception as e:
        print(f"Warning: Could not verify/install dependencies: {e}")
    
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    
    # Base command
    cmd = ["python", "-m", "PyPaperBot"]
    
    # Add arguments based on parameters
    if dois:
        # For multiple DOIs, create a temporary DOI file
        doi_file = os.path.join(output_dir, "temp_dois.txt")
        with open(doi_file, 'w') as f:
            f.write('\n'.join(dois))
        cmd.extend(["--doi-file", doi_file])
    
    # Add output directory
    cmd.extend(["--dwn-dir", output_dir])
    
    # Add optional parameters
    if min_year:
        cmd.extend(["--min-year", str(min_year)])
    
    # Add mode (restrict parameter)
    cmd.extend(["--restrict", str(mode)])
    
    # Use DOI as filename if requested
    if use_doi_filename:
        cmd.append("--use-doi-as-filename")
    
    # Execute command
    print(f"Executing command: {' '.join(cmd)}")
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
        
        # Print output
        print("\nOutput:")
        print(result.stdout)
        
        if result.stderr:
            print("\nErrors:")
            print(result.stderr)
            
        return result.returncode == 0
    except Exception as e:
        print(f"Error executing PyPaperBot: {e}")
        return False

# Test with a small sample from the publications
print(f"Working with {len(publications)} publications")

# Extract DOIs for PyPaperBot
dois = [pub.get('doi') for pub in publications if pub.get('doi')]
print(f"Found {len(dois)} DOIs to process")

# Test with a small sample (2)
# print("\nüß™ Testing PyPaperBot with 2 sample publications...")
# sample_dois = dois[:2]
sample_dois = dois
# print(f"Sample DOIs: {sample_dois}")

# Run PyPaperBot in different modes
print("\nüì• Test 1: Downloading PDFs only")
pdf_success = download_with_pypaperbot(sample_dois, output_dir='../papers/downloaded/pdf_only', mode=1)

print("\nüìö Test 2: Generating BibTeX only")
# bibtex_success = download_with_pypaperbot(sample_dois, output_dir='../papers/downloaded/bibtex_only', mode=0)

print("\nüîç Test 3: Downloading both PDF and BibTeX")
# combined_success = download_with_pypaperbot(sample_dois, output_dir='../papers/downloaded/combined', mode=2)

print("\nüìä Download Summary:")
print(f"   Sample DOIs processed: {len(sample_dois)}")
print(f"   PDF download successful: {pdf_success}")
print(f"   BibTeX download successful: {bibtex_success}")
print(f"   Combined download successful: {combined_success}")

Working with 404 publications
Found 402 DOIs to process

üì• Test 1: Downloading PDFs only
Executing command: python -m PyPaperBot --doi-file ../papers/downloaded/pdf_only/temp_dois.txt --dwn-dir ../papers/downloaded/pdf_only --restrict 1 --use-doi-as-filename


KeyboardInterrupt: 

def run_complete_pipeline(initial_json_path, output_dir='../data/processed'):
    """Complete automated pipeline to expand publication database"""
    
    # ... existing code ...
    
    # Step 3b: Download PDFs for existing publications
    print("\nüì• Step 3b: Downloading PDFs using PyPaperBot")
    dois = [pub.get('doi') for pub in existing_pubs if pub.get('doi')]
    pdf_dir = os.path.join(output_dir, 'pdfs')
    download_success = download_with_pypaperbot(
        dois, 
        output_dir=pdf_dir,
        mode=1  # PDF only
    )
    print(f"   PDF download {'successful' if download_success else 'failed'}")
    print(f"   Check output directory: {pdf_dir}")
    
    # ... rest of existing pipeline ...

In [None]:
def bulk_download_with_pypaperbot(all_dois, output_dir, chunk_size=50):
    """Download papers in chunks to avoid overwhelming the system"""
    os.makedirs(output_dir, exist_ok=True)
    
    # Process in chunks
    success_count = 0
    total_chunks = (len(all_dois) + chunk_size - 1) // chunk_size
    
    for i in range(0, len(all_dois), chunk_size):
        chunk = all_dois[i:i+chunk_size]
        chunk_num = (i // chunk_size) + 1
        
        print(f"\nProcessing chunk {chunk_num}/{total_chunks} ({len(chunk)} DOIs)")
        chunk_dir = os.path.join(output_dir, f"batch_{chunk_num}")
        
        if download_with_pypaperbot(chunk, output_dir=chunk_dir, mode=1):
            success_count += 1
            
        # Add delay between chunks
        if chunk_num < total_chunks:
            print("Waiting before next batch...")
            time.sleep(30)  # 30 second delay between batches
    
    print(f"\nCompleted {success_count}/{total_chunks} batches successfully")
    return success_count == total_chunks

dois_test=['10.1016/j.cell.2025.03.050', '10.1523/JNEUROSCI.1234-24.2024', '10.1073/pnas.2420356122', '10.1364/ol.547539', '10.1016/j.neulet.2025.138361 ']

bulk_download_with_pypaperbot(dois_test, 'papers/mining/01_run1/')


Processing chunk 1/1 (5 DOIs)
Executing command: python -m PyPaperBot --doi-file papers/mining/01_run1/batch_1/temp_dois.txt --dwn-dir papers/mining/01_run1/batch_1 --restrict 1 --use-doi-as-filename

Output:
PyPaperBot v1.4.1
PyPaperBot is a Python tool for downloading scientific papers using Google Scholar, Crossref and SciHub.
        -Join the telegram channel to stay updated --> https://t.me/pypaperbotdatawizards <--
        -If you like this project, you can share a cup of coffee at --> https://www.paypal.com/paypalme/ferru97 <-- :)

Downloading papers from DOIs

Searching paper 1 of 5 with DOI 10.1016/j.cell.2025.03.050
Searching paper 2 of 5 with DOI 10.1523/JNEUROSCI.1234-24.2024
Searching paper 3 of 5 with DOI 10.1073/pnas.2420356122
Searching paper 4 of 5 with DOI 10.1364/ol.547539
Searching paper 5 of 5 with DOI 10.1016/j.neulet.2025.138361 
Searching for a sci-hub mirror
Trying with https://sci-hub.ee...

Using Sci-Hub mirror https://sci-hub.ee
Using Sci-DB mirror https:/

True

Testing PyPaperBot with Real IFC Publications

> NOTE: uses above `bulk_download_with_pypaperbot`

In [None]:
import json
import os
import random
import sys
import time

# Define test function to load DOIs from your real publication data
def test_pypaperbot_with_real_publications(json_path, sample_size=5, output_dir='../papers/test_downloads'):
    """
    Test PyPaperBot with a small sample of real publications
    
    Args:
        json_path: Path to the JSON file containing publications with DOIs
        sample_size: Number of publications to test (default: 5)
        output_dir: Directory to save downloaded PDFs
    """
    print(f"üß™ Testing PyPaperBot with {sample_size} real publications")
    
    # Load publications from JSON file
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            publications = json.load(f)
        print(f"‚úÖ Loaded {len(publications)} publications from {json_path}")
    except Exception as e:
        print(f"‚ùå Error loading publications: {e}")
        return False
    
    # Extract DOIs from publications
    dois = [pub.get('doi') for pub in publications if pub.get('doi')]
    print(f"üìä Found {len(dois)} publications with DOIs")
    
    if not dois:
        print("‚ùå No DOIs found in the publications data")
        return False
    
    # Select a random sample of DOIs
    if len(dois) > sample_size:
        sample_dois = random.sample(dois, sample_size)
    else:
        sample_dois = dois
        print(f"‚ö†Ô∏è Requested {sample_size} samples but only {len(dois)} DOIs available")
    
    print(f"\nüìù Selected {len(sample_dois)} DOIs for testing:")
    for i, doi in enumerate(sample_dois):
        print(f"   {i+1}. {doi}")
    
    # Download PDFs using PyPaperBot
    print(f"\nüì• Downloading PDFs to {output_dir}")
    
    # Create timestamp for this test run
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    test_dir = os.path.join(output_dir, f"test_run_{timestamp}")
    
    # Use the bulk download function with a single chunk
    success = bulk_download_with_pypaperbot(
        sample_dois, 
        output_dir=test_dir,
        chunk_size=len(sample_dois)  # Process all in one chunk
    )
    
    if success:
        print(f"\n‚úÖ Test successful! PDFs downloaded to {test_dir}")
    else:
        print(f"\n‚ùå Test failed. Check logs for errors.")
    
    return success

# Run the test with the real publications
ifc_publications_path = '/home/santi/Projects/UBMI-IFC-Podcast/data/raw/all_ifc_publications.json'
test_pypaperbot_with_real_publications(ifc_publications_path, sample_size=5)

üß™ Testing PyPaperBot with 5 real publications
‚úÖ Loaded 404 publications from /home/santi/Projects/UBMI-IFC-Podcast/data/raw/all_ifc_publications.json
üìä Found 402 publications with DOIs

üìù Selected 5 DOIs for testing:
   1. 10.3389/ffunb.2024.1378590
   2. 10.1371/journal.pone.0242749
   3. 10.1016/j.neuroscience.2020.08.025
   4. 10.1016/j.arcmed.2021.09.001
   5. 10.3390/ijms25126491

üì• Downloading PDFs to ../papers/test_downloads

Processing chunk 1/1 (5 DOIs)
Executing command: python -m PyPaperBot --doi-file ../papers/test_downloads/test_run_20250921_205321/batch_1/temp_dois.txt --dwn-dir ../papers/test_downloads/test_run_20250921_205321/batch_1 --restrict 1 --use-doi-as-filename

Output:
PyPaperBot v1.4.1
PyPaperBot is a Python tool for downloading scientific papers using Google Scholar, Crossref and SciHub.
        -Join the telegram channel to stay updated --> https://t.me/pypaperbotdatawizards <--
        -If you like this project, you can share a cup of coffee at

True

## Affiliation mining

> affiliation mining system:

- Extracts text from PDFs using PyMuPDF
- Uses both regex and NLP for affiliation detection
- Supports Spanish and English processing
- Groups similar affiliations automatically
- Generates PubMed search variations from discovered affiliations

> NOTE‚ö†Ô∏è

spaCy:

- Tokenizes the text into words, punctuation, etc.
- Part-of-speech tags each token
- Dependency parses to understand grammatical relationships
- Named Entity Recognition identifies spans as organizations, people, locations, etc.
Classification assigns labels like "ORG" (organization), "PERSON", "GPE" (geopolitical entity)

```python entity_recognition_process
doc = nlp(text)
for ent in doc.ents:
    if ent.label_ == "ORG":  # Organization entity
        print(ent.text)
```



#### spacy installation

```python
# Install Python packages
pip install -r requirements.txt

# Download spaCy language models
python -m spacy download en_core_web_sm
python -m spacy download es_core_news_sm

# Optional: Download larger, more accurate models
python -m spacy download en_core_web_md
python -m spacy download es_core_news_md
```

#### PDF text Extraaction

In [15]:
import os
import fitz  # PyMuPDF
import glob
from tqdm import tqdm

def extract_text_from_pdf(pdf_path):
    """Extract text from PDF using PyMuPDF (faster and more accurate than PyPDF)"""
    try:
        doc = fitz.open(pdf_path)
        text = ""
        
        # Extract text from each page
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            text += page.get_text()
        
        return text
    except Exception as e:
        print(f"Error extracting text from {os.path.basename(pdf_path)}: {e}")
        return ""
    finally:
        if 'doc' in locals():
            doc.close()

def batch_process_pdfs(pdf_dir, limit=None):
    """Process multiple PDFs and extract text"""
    pdf_files = glob.glob(os.path.join(pdf_dir, "**", "*.pdf"), recursive=True)
    
    if limit:
        pdf_files = pdf_files[:limit]
    
    print(f"Found {len(pdf_files)} PDF files to process")
    
    results = {}
    for pdf_path in tqdm(pdf_files, desc="Processing PDFs"):
        filename = os.path.basename(pdf_path)
        text = extract_text_from_pdf(pdf_path)
        if text:
            results[filename] = text
    
    print(f"Successfully extracted text from {len(results)} PDFs")
    return results

pipeline to connect the PDF processing with affiliation extraction

In [10]:
def mine_affiliations_from_pdfs(pdf_dir, output_json=None, limit=None):
    """
    Extract affiliations from PDFs and return structured data
    
    Args:
        pdf_dir: Directory containing PDFs to process
        output_json: Optional path to save results as JSON
        limit: Maximum number of PDFs to process
        
    Returns:
        Dictionary with affiliation data
    """
    # 1. Initialize the affiliation miner
    print("üîç Initializing affiliation miner...")
    miner = EnhancedAffiliationMiner()
    
    # 2. Extract text from PDFs
    print("\nüìÑ Extracting text from PDFs...")
    pdf_texts = batch_process_pdfs(pdf_dir, limit)
    
    # 3. Mine affiliations from each PDF
    print("\nüè¢ Mining affiliations from extracted text...")
    all_affiliations = set()
    pdf_affiliations = {}
    
    for filename, text in tqdm(pdf_texts.items(), desc="Mining affiliations"):
        # Process only the first few pages where affiliations typically appear
        first_pages_text = text[:20000]  # Adjust based on typical affiliation location
        affiliations = miner.extract_affiliations_advanced_nlp(first_pages_text)
        
        if affiliations:
            pdf_affiliations[filename] = list(affiliations)
            all_affiliations.update(affiliations)
    
    # 4. Cluster similar affiliations
    print(f"\nüß© Clustering {len(all_affiliations)} discovered affiliations...")
    clusters = miner.analyze_affiliations_with_clustering(list(all_affiliations))
    
    # 5. Generate PubMed search variations
    print("\nüîé Generating PubMed search variations...")
    pubmed_variations = generate_pubmed_search_variations(clusters)
    
    # 6. Compile results
    results = {
        'total_pdfs_processed': len(pdf_texts),
        'total_affiliations_found': len(all_affiliations),
        'affiliation_clusters': [
            {'representative': cluster[0], 'variations': cluster} 
            for cluster in clusters
        ],
        'pubmed_search_variations': pubmed_variations,
        'pdf_affiliations': pdf_affiliations
    }
    
    # 7. Save results if requested
    if output_json:
        import json
        os.makedirs(os.path.dirname(output_json), exist_ok=True)
        with open(output_json, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        print(f"\n‚úÖ Results saved to {output_json}")
    
    return results

def generate_pubmed_search_variations(affiliation_clusters):
    """Generate PubMed search variations from affiliation clusters"""
    search_variations = []
    
    # Process each cluster
    for cluster in affiliation_clusters:
        # Use the first (representative) affiliation from each cluster
        if cluster:
            rep_affiliation = cluster[0]
            
            # Clean and format for PubMed
            # Remove common punctuation and normalize spaces
            clean_aff = re.sub(r'[,.:]', '', rep_affiliation)
            clean_aff = re.sub(r'\s+', ' ', clean_aff).strip()
            
            # Add [Affiliation] tag for PubMed
            pubmed_variation = f"{clean_aff}[Affiliation]"
            search_variations.append(pubmed_variation)
    
    return search_variations

In [11]:
import json

def mine_affiliations_from_pdfs(pdf_dir, output_json=None, limit=None):
    """
    Extract affiliations from PDFs and return structured data
    
    Args:
        pdf_dir: Directory containing PDFs to process
        output_json: Optional path to save results as JSON
        limit: Maximum number of PDFs to process
        
    Returns:
        Dictionary with affiliation data
    """
    # 1. Initialize the affiliation miner
    print("üîç Initializing affiliation miner...")
    miner = EnhancedAffiliationMiner()
    
    # 2. Extract text from PDFs
    print("\nüìÑ Extracting text from PDFs...")
    pdf_texts = batch_process_pdfs(pdf_dir, limit)
    
    try:
        # 3. Mine affiliations from each PDF
        print("\nüè¢ Mining affiliations from extracted text...")
        all_affiliations = set()
        pdf_affiliations = {}
        
        for filename, text in tqdm(pdf_texts.items(), desc="Mining affiliations"):
            try:
                # Process only the first few pages where affiliations typically appear
                first_pages_text = text[:20000]  # Adjust based on typical affiliation location
                affiliations = miner.extract_affiliations_advanced_nlp(first_pages_text)
                
                if affiliations:
                    pdf_affiliations[filename] = list(affiliations)
                    all_affiliations.update(affiliations)
            except Exception as e:
                print(f"Error processing {filename}: {e}")
                continue
        
        # 4. Cluster similar affiliations
        print(f"\nüß© Clustering {len(all_affiliations)} discovered affiliations...")
        clusters = miner.analyze_affiliations_with_clustering(list(all_affiliations))
        
        # 5. Generate PubMed search variations
        print("\nüîé Generating PubMed search variations...")
        pubmed_variations = generate_pubmed_search_variations(clusters)
        
        # 6. Compile results
        results = {
            'total_pdfs_processed': len(pdf_texts),
            'total_affiliations_found': len(all_affiliations),
            'affiliation_clusters': [
                {'representative': cluster[0], 'variations': cluster} 
                for cluster in clusters
            ],
            'pubmed_search_variations': pubmed_variations,
            'pdf_affiliations': pdf_affiliations
        }
        
        # 7. Save results if requested
        if output_json:
            os.makedirs(os.path.dirname(output_json), exist_ok=True)
            with open(output_json, 'w', encoding='utf-8') as f:
                json.dump(results, f, indent=2, ensure_ascii=False)
            print(f"\n‚úÖ Results saved to {output_json}")
        
        return results

    except Exception as e:
        print(f"‚ùå Error in affiliation mining process: {e}")
        if output_json:
            # Save what we have so far as backup
            with open(output_json + '.partial', 'w', encoding='utf-8') as f:
                json.dump({
                    'error': str(e),
                    'partial_results': pdf_affiliations
                }, f, indent=2, ensure_ascii=False)
        return {'total_pdfs_processed': 0, 'total_affiliations_found': 0, 
                'affiliation_clusters': [], 'pubmed_search_variations': [], 
                'pdf_affiliations': {}, 'error': str(e)}

In [12]:
# Modify the build_search_queries method in PubmedSearcher class
def build_search_queries(self, affiliation_variations=None):
    """Build comprehensive search queries for different affiliation variations"""
    
    if affiliation_variations is None:
        # Default variations based on your institute
        affiliation_variations = [
            "Instituto de Fisiologia Celular[Affiliation]",
            "Institute of Cellular Physiology[Affiliation]",
            "IFC UNAM[Affiliation]",
            "Departamento de Neurobiologia UNAM[Affiliation]",
            "Universidad Nacional Autonoma Mexico Fisiologia[Affiliation]",
            "National Autonomous University Mexico Cellular Physiology[Affiliation]"
        ]
    
    # Filter out variations that are too generic or too long
    filtered_variations = []
    for var in affiliation_variations:
        # Remove the [Affiliation] suffix if present for checking
        check_var = var.replace("[Affiliation]", "").strip().lower()
        
        # Skip variations that are too short (likely noise) 
        # or don't contain key terms related to your institute
        if len(check_var) < 10:
            continue
        
        # Skip variations without key identifiers
        if not any(term in check_var for term in ["fisiol", "physiol", "mexico", "unam", "ifc", "cellular"]):
            continue
            
        filtered_variations.append(var)
    
    queries = []
    
    # Individual affiliation searches
    for aff in filtered_variations[:10]:  # Limit to top 10 to avoid excessive queries
        queries.append(aff)
        
    # Combined searches with time ranges
    recent_query = f"({' OR '.join(filtered_variations[:3])}) AND (2020:2024[pdat])"
    historical_query = f"({' OR '.join(filtered_variations[:3])}) AND (2010:2019[pdat])"
    
    queries.extend([recent_query, historical_query])
    
    return queries

complete pipeline function

In [13]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
import re
from collections import Counter, defaultdict
import langdetect
from langdetect import detect

class EnhancedAffiliationMiner:
    def __init__(self):
        """Initialize with advanced spaCy features"""
        self.nlp_models = {}
        self.matchers = {}
        self.load_nlp_models()
        self.setup_custom_matchers()
        
    def load_nlp_models(self):
        """Load spaCy models with error handling"""
        models_to_load = {
            'en': 'en_core_web_sm',
            'es': 'es_core_news_sm'
        }
        
        for lang, model_name in models_to_load.items():
            try:
                nlp = spacy.load(model_name)
                # Add custom pipeline components
                if not nlp.has_pipe('merge_entities'):
                    nlp.add_pipe('merge_entities')
                
                self.nlp_models[lang] = nlp
                print(f"‚úÖ Loaded {model_name}")
                
                # Setup matcher for this language
                self.matchers[lang] = Matcher(nlp.vocab)
                
            except OSError:
                print(f"‚ùå {model_name} not found. Install with:")
                print(f"   python -m spacy download {model_name}")
    
    def setup_custom_matchers(self):
        """Setup custom pattern matchers for institutional names"""
        
        # Patterns for Spanish institutions
        if 'es' in self.matchers:
            spanish_patterns = [
                # Instituto de X patterns
                [{"LOWER": "instituto"}, {"LOWER": "de"}, {"IS_TITLE": True, "OP": "+"}],
                
                # Universidad patterns
                [{"LOWER": "universidad"}, {"IS_TITLE": True, "OP": "+"}],
                [{"LOWER": "universidad"}, {"LOWER": "nacional"}, {"LOWER": "aut√≥noma"}, {"LOWER": "de"}, {"LOWER": "m√©xico"}],
                
                # Departamento patterns
                [{"LOWER": "departamento"}, {"LOWER": "de"}, {"IS_TITLE": True, "OP": "+"}],
                
                # IFC patterns
                [{"TEXT": {"REGEX": r"IFC-?UNAM"}}],
            ]
            
            for i, pattern in enumerate(spanish_patterns):
                self.matchers['es'].add(f"SPANISH_INSTITUTION_{i}", [pattern])
        
        # Patterns for English institutions
        if 'en' in self.matchers:
            english_patterns = [
                # University of X patterns
                [{"LOWER": "university"}, {"LOWER": "of"}, {"IS_TITLE": True, "OP": "+"}],
                
                # Institute of X patterns
                [{"LOWER": "institute"}, {"LOWER": "of"}, {"IS_TITLE": True, "OP": "+"}],
                
                # Department of X patterns
                [{"LOWER": "department"}, {"LOWER": "of"}, {"IS_TITLE": True, "OP": "+"}],
                
                # National Autonomous University of Mexico
                [{"LOWER": "national"}, {"LOWER": "autonomous"}, {"LOWER": "university"}, 
                 {"LOWER": "of"}, {"LOWER": "mexico"}],
            ]
            
            for i, pattern in enumerate(english_patterns):
                self.matchers['en'].add(f"ENGLISH_INSTITUTION_{i}", [pattern])
    
    def detect_language_advanced(self, text):
        """Advanced language detection"""
        try:
            # Use langdetect for primary detection
            detected = detect(text[:1000])  # Use first 1000 chars for speed
            
            # Validate with keyword analysis
            spanish_keywords = ['de', 'del', 'la', 'el', 'y', 'universidad', 'instituto']
            english_keywords = ['of', 'the', 'and', 'university', 'institute', 'department']
            
            text_lower = text.lower()
            spanish_count = sum(1 for kw in spanish_keywords if kw in text_lower)
            english_count = sum(1 for kw in english_keywords if kw in text_lower)
            
            # Override detection if keyword analysis is strong
            if spanish_count > english_count * 1.5:
                return 'es'
            elif english_count > spanish_count * 1.5:
                return 'en'
            else:
                return detected if detected in ['es', 'en'] else 'en'
                
        except:
            return 'en'  # Default to English
    
    def extract_affiliations_advanced_nlp(self, text):
        """Advanced NER + custom patterns for affiliation extraction"""
        language = self.detect_language_advanced(text)
        
        if language not in self.nlp_models:
            print(f"‚ö†Ô∏è No model available for language: {language}")
            return set()
        
        nlp = self.nlp_models[language]
        matcher = self.matchers[language]
        
        affiliations = set()
        
        # Process text in chunks to handle large documents
        max_length = 1000000
        text_chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]
        
        for chunk in text_chunks:
            try:
                doc = nlp(chunk)
                
                # Method 1: Standard NER for organizations
                for ent in doc.ents:
                    if ent.label_ == "ORG":
                        org_text = ent.text.strip()
                        if self.is_relevant_affiliation(org_text):
                            affiliations.add(org_text)
                
                # Method 2: Custom pattern matching
                matches = matcher(doc)
                for match_id, start, end in matches:
                    span = doc[start:end]
                    affiliation_text = span.text.strip()
                    if len(affiliation_text) > 5:
                        affiliations.add(affiliation_text)
                
                # Method 3: Context-based extraction
                # Look for sentences containing institutional indicators
                for sent in doc.sents:
                    sent_text = sent.text.strip()
                    if self.contains_institutional_indicators(sent_text, language):
                        # Extract the institutional part
                        extracted = self.extract_institutional_part(sent_text, language)
                        if extracted:
                            affiliations.add(extracted)
                            
            except Exception as e:
                print(f"‚ö†Ô∏è Error processing chunk: {e}")
                continue
        
        return affiliations
    
    def is_relevant_affiliation(self, org_text):
        """Check if organization text is relevant to our search"""
        relevant_keywords = [
            'instituto', 'institute', 'universidad', 'university',
            'departamento', 'department', 'unam', 'ifc', 'mexico',
            'fisiolog', 'physiolog', 'celular', 'cellular', 'neurobiolog'
        ]
        
        org_lower = org_text.lower()
        return (len(org_text) > 10 and 
                any(keyword in org_lower for keyword in relevant_keywords))
    
    def contains_institutional_indicators(self, text, language):
        """Check if text contains institutional indicators"""
        if language == 'es':
            indicators = [
                'instituto de', 'universidad', 'departamento de', 
                'centro de', 'facultad de', 'unam'
            ]
        else:
            indicators = [
                'institute of', 'university of', 'department of',
                'center of', 'faculty of', 'unam'
            ]
        
        text_lower = text.lower()
        return any(indicator in text_lower for indicator in indicators)
    
    def extract_institutional_part(self, sentence, language):
        """Extract the institutional part from a sentence"""
        # Use regex patterns to extract institutional names
        if language == 'es':
            patterns = [
                r'Instituto\s+de\s+[A-Za-z√Å√°√â√©√ç√≠√ì√≥√ö√∫√ë√±\s,]+?(?:,|\.|\s+UNAM)',
                r'Universidad\s+[A-Za-z√Å√°√â√©√ç√≠√ì√≥√ö√∫√ë√±\s,]+?(?:,|\.)',
                r'Departamento\s+de\s+[A-Za-z√Å√°√â√©√ç√≠√ì√≥√ö√∫√ë√±\s,]+?(?:,|\.)'
            ]
        else:
            patterns = [
                r'Institute\s+of\s+[A-Za-z\s,]+?(?:,|\.|\s+UNAM)',
                r'University\s+of\s+[A-Za-z\s,]+?(?:,|\.)',
                r'Department\s+of\s+[A-Za-z\s,]+?(?:,|\.)'
            ]
        
        for pattern in patterns:
            match = re.search(pattern, sentence, re.IGNORECASE)
            if match:
                return match.group().strip()
        
        return None
    
    def analyze_affiliations_with_clustering(self, affiliations_list):
        """Advanced analysis with similarity clustering"""
        from difflib import SequenceMatcher
        
        def similarity(a, b):
            return SequenceMatcher(None, a.lower(), b.lower()).ratio()
        
        # Group similar affiliations
        clusters = []
        processed = set()
        
        for affiliation in affiliations_list:
            if affiliation in processed:
                continue
                
            # Find similar affiliations
            cluster = [affiliation]
            processed.add(affiliation)
            
            for other in affiliations_list:
                if other not in processed and similarity(affiliation, other) > 0.7:
                    cluster.append(other)
                    processed.add(other)
            
            if len(cluster) >= 1:
                clusters.append(cluster)
        
        return clusters

# Usage example and demo
def demo_enhanced_mining():
    """Demonstrate enhanced affiliation mining"""
    miner = EnhancedAffiliationMiner()
    
    sample_text = """
    Instituto de Fisiolog√≠a Celular, Universidad Nacional Aut√≥noma de M√©xico, 
    Ciudad Universitaria, M√©xico, D.F. 04510, M√©xico
    
    Department of Cellular Physiology, National Autonomous University of Mexico,
    Mexico City, Mexico
    
    Departamento de Neurobiolog√≠a, Instituto de Fisiolog√≠a Celular, UNAM
    Centro de Investigaci√≥n y de Estudios Avanzados del IPN
    
    IFC-UNAM, Circuito Exterior s/n, Ciudad Universitaria
    """
    
    print("üß™ Testing enhanced affiliation extraction...")
    
    # Advanced NLP extraction
    affiliations = miner.extract_affiliations_advanced_nlp(sample_text)
    
    print(f"\nüß† Enhanced NLP extraction found {len(affiliations)} affiliations:")
    for affiliation in sorted(affiliations):
        print(f"   ‚Ä¢ {affiliation}")
    
    # Clustering analysis
    clusters = miner.analyze_affiliations_with_clustering(list(affiliations))
    print(f"\nüîó Found {len(clusters)} similarity clusters:")
    for i, cluster in enumerate(clusters):
        print(f"   Cluster {i+1}: {len(cluster)} variations")
        for variation in cluster:
            print(f"      - {variation}")
    
    return affiliations

# Run enhanced demo
if __name__ == "__main__":
    print("üöÄ Starting Enhanced Affiliation Mining Demo...")
    demo_results = demo_enhanced_mining()

üöÄ Starting Enhanced Affiliation Mining Demo...
‚úÖ Loaded en_core_web_sm
‚úÖ Loaded en_core_web_sm
‚úÖ Loaded es_core_news_sm
üß™ Testing enhanced affiliation extraction...
‚úÖ Loaded es_core_news_sm
üß™ Testing enhanced affiliation extraction...

üß† Enhanced NLP extraction found 6 affiliations:
   ‚Ä¢ Fisiolog√≠a Celular
   ‚Ä¢ IFC-UNAM
   ‚Ä¢ Instituto de Fisiolog√≠a Celular
   ‚Ä¢ Instituto de Fisiolog√≠a Celular,
   ‚Ä¢ National Autonomous University of Mexico
   ‚Ä¢ UNAM
    Centro de Investigaci√≥n

üîó Found 4 similarity clusters:
   Cluster 1: 3 variations
      - Fisiolog√≠a Celular
      - Instituto de Fisiolog√≠a Celular
      - Instituto de Fisiolog√≠a Celular,
   Cluster 2: 1 variations
      - IFC-UNAM
   Cluster 3: 1 variations
      - National Autonomous University of Mexico
   Cluster 4: 1 variations
      - UNAM
    Centro de Investigaci√≥n

üß† Enhanced NLP extraction found 6 affiliations:
   ‚Ä¢ Fisiolog√≠a Celular
   ‚Ä¢ IFC-UNAM
   ‚Ä¢ Instituto de Fisiol

In [None]:
def analyze_pdfs_and_search_pubmed(pdf_dir, output_dir='../data/processed/affiliations', 
                                 limit_pdfs=None, max_results_per_query=20):
    """
    Complete pipeline: Extract affiliations from PDFs and search PubMed
    
    Args:
        pdf_dir: Directory containing PDFs to process
        output_dir: Directory for saving outputs
        limit_pdfs: Maximum number of PDFs to process (None for all)
        max_results_per_query: Maximum results per PubMed query
    """
    os.makedirs(output_dir, exist_ok=True)
    
    # Step 1: Mine affiliations from PDFs
    print("üîé Step 1: Mining affiliations from PDFs...")
    affiliations_output = os.path.join(output_dir, "discovered_affiliations.json")
    affiliation_results = mine_affiliations_from_pdfs(
        pdf_dir=pdf_dir,
        output_json=affiliations_output,
        limit=limit_pdfs
    )
    
    # Extract PubMed search variations
    pubmed_variations = affiliation_results.get('pubmed_search_variations', [])
    if not pubmed_variations:
        print("‚ö†Ô∏è No valid PubMed search variations found. Using default variations.")
    else:
        print(f"üîç Found {len(pubmed_variations)} PubMed search variations")
        print("\nSample variations:")
        for i, var in enumerate(pubmed_variations[:5]):
            print(f"   {i+1}. {var}")
    
    # Step 2: Search PubMed with discovered affiliations
    print("\nüîç Step 2: Searching PubMed with discovered affiliations...")
    searcher = PubmedSearcher()
    articles = searcher.comprehensive_search(
        affiliation_variations=pubmed_variations,
        max_per_query=max_results_per_query
    )
    
    # Step 3: Save PubMed results
    print(f"\nüìä Found {len(articles)} articles from PubMed")
    pubmed_output = os.path.join(output_dir, "pubmed_results.json")
    with open(pubmed_output, 'w', encoding='utf-8') as f:
        json.dump(articles, f, indent=2, ensure_ascii=False)
    print(f"‚úÖ PubMed results saved to {pubmed_output}")
    
    # Step 4: Summary
    print("\nüìã Pipeline Summary:")
    print(f"   PDFs processed: {affiliation_results['total_pdfs_processed']}")
    print(f"   Unique affiliations found: {affiliation_results['total_affiliations_found']}")
    print(f"   Affiliation clusters: {len(affiliation_results['affiliation_clusters'])}")
    print(f"   PubMed search variations: {len(pubmed_variations)}")
    print(f"   PubMed articles found: {len(articles)}")
    
    return {
        'affiliation_results': affiliation_results,
        'pubmed_articles': articles
    }

### Mining all

In [16]:
# Install PyMuPDF if not already installed
!pip install pymupdf tqdm

# Test the affiliation extraction on the downloaded PDFs
import os

# Get the path to the recently downloaded PDFs
test_pdf_dir = '../papers/downloaded/zotero'  # Update with your actual path

# Test with a small number of PDFs first
print(f"Testing affiliation extraction on PDFs in {test_pdf_dir}")
test_results = mine_affiliations_from_pdfs(
    pdf_dir=test_pdf_dir,
    output_json='../data/processed/all_affiliations.json',
    limit=None  # Process up to 5 PDFs
)

# Display the discovered affiliations
print("\nüè¢ Discovered affiliations:")
for cluster in test_results['affiliation_clusters'][:5]:  # Show top 5 clusters
    print(f"\n‚Ä¢ Main variation: {cluster['representative']}")
    if len(cluster['variations']) > 1:
        print("  Other variations:")
        for var in cluster['variations'][1:]:
            print(f"  - {var}")

# Show how these could be used for PubMed search
print("\nüîç PubMed search variations:")
for i, var in enumerate(test_results['pubmed_search_variations'][:5]):
    print(f"   {i+1}. {var}")

Testing affiliation extraction on PDFs in ../papers/downloaded/zotero
üîç Initializing affiliation miner...
Testing affiliation extraction on PDFs in ../papers/downloaded/zotero
üîç Initializing affiliation miner...
‚úÖ Loaded en_core_web_sm
‚úÖ Loaded en_core_web_sm
‚úÖ Loaded es_core_news_sm

üìÑ Extracting text from PDFs...
Found 345 PDF files to process
‚úÖ Loaded es_core_news_sm

üìÑ Extracting text from PDFs...
Found 345 PDF files to process


Processing PDFs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 345/345 [00:37<00:00,  9.13it/s]
Processing PDFs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 345/345 [00:37<00:00,  9.13it/s]


Successfully extracted text from 345 PDFs

üè¢ Mining affiliations from extracted text...


Mining affiliations: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 345/345 [03:59<00:00,  1.44it/s]




üß© Clustering 1102 discovered affiliations...

üîé Generating PubMed search variations...

‚úÖ Results saved to ../data/processed/all_affiliations.json

üè¢ Discovered affiliations:

‚Ä¢ Main variation: University of Washington,
  Other variations:
  - University of Washington School of Medicine
  - University of Warwick
  - University of Michigan,
  - University of Concepcion,
  - University of Coahuila,
  - University of Manchester,
  - University of Aarhus,
  - University of Dhaka,
  - University of Michigan
  - University of Michigan.
  - University of Toronto
  - University of¬†Cape Town
  - University of Salento
  - University of Palermo,
  - University
of Toronto,
  - University of 
Massachusetts,
  - University of Minnesota
  - University of Illinois,
  - University of Rome,
  - University of Padua,
  - University of Bath
  - University of Manitoba
  - E.S.-B, University of Washington.y
  - University of Warwick,
  - University of Toronto,
  - University of Akron
  - Unive

Review affiliations before merging
Manual review step before searching PubMed



In [None]:
def review_and_select_affiliations(affiliation_clusters):
    """
    Interactive review of discovered affiliation clusters before PubMed search
    
    Args:
        affiliation_clusters: List of affiliation clusters discovered from PDFs
        
    Returns:
        List of approved affiliation variations for PubMed search
    """
    print("\nüìã AFFILIATION REVIEW\n")
    print("Review the following affiliation clusters discovered from PDFs:")
    print("Select which clusters to include in PubMed search\n")
    
    approved_variations = []
    
    # Filter clusters to only include relevant ones based on keywords
    relevant_keywords = [
        'instituto', 'institute', 'universidad', 'university',
        'departamento', 'department', 'unam', 'ifc', 'mexico',
        'fisiolog', 'physiolog', 'celular', 'cellular', 'neurobiolog'
    ]
    
    relevant_clusters = []
    for cluster in affiliation_clusters:
        # Check if any variation in the cluster contains a relevant keyword
        if any(
            any(keyword in variation.lower() for keyword in relevant_keywords)
            for variation in cluster['variations']
        ):
            relevant_clusters.append(cluster)

    print(f"Found {len(relevant_clusters)} potentially relevant clusters out of {len(affiliation_clusters)} total.")

    for i, cluster_data in enumerate(relevant_clusters):
        cluster = cluster_data['variations']
        representative = cluster_data['representative']
        
        print("-" * 40)
        print(f"\nCluster {i+1}/{len(relevant_clusters)}: Representative -> {representative}")
        
        if len(cluster) > 1:
            print("  Other variations found:")
            for j, variation in enumerate(cluster):
                if variation != representative:
                    print(f"    {j+1}. {variation}")
        
        # Ask for approval
        while True:
            try:
                choice = input(f"Include this cluster in PubMed search? (y/n/skip): ").lower().strip()
                if choice in ('y', 'yes', 'n', 'no', 's', 'skip'):
                    break
                print("Invalid input. Please enter 'y', 'n', or 's'.")
            except (EOFError, KeyboardInterrupt):
                print("\nReview interrupted. Exiting.")
                return approved_variations

        if choice in ('y', 'yes'):
            approved_variations.extend(cluster)
            print(f"‚úÖ Approved. Added {len(cluster)} variations.")
        elif choice in ('n', 'no'):
            print(f"‚ùå Cluster excluded.")
        else: # skip
            print("Skipping remaining clusters.")
            break
            
    print("-" * 40)
    print(f"\nüìä Review Complete: Approved {len(approved_variations)} total affiliation variations.")
    
    return approved_variations

# Extract the affiliation clusters from the results of the previous cell
affiliation_clusters = test_results.get('affiliation_clusters', [])

# Run the interactive review process
if affiliation_clusters:
    approved_affiliations = review_and_select_affiliations(affiliation_clusters)
    print("\nFinal list of approved affiliations for PubMed search:")
    for aff in approved_affiliations:
        print(f"- {aff}")
else:
    print("No affiliation clusters found in 'test_results'.")


üìã AFFILIATION REVIEW

Review the following affiliation clusters discovered from PDFs:
Select which clusters to include in PubMed search

Found 357 potentially relevant clusters out of 357 total.
----------------------------------------

Cluster 1/357: Representative -> University of Washington,
  Other variations found:
    2. University of Washington School of Medicine
    3. University of Warwick
    4. University of Michigan,
    5. University of Concepcion,
    6. University of Coahuila,
    7. University of Manchester,
    8. University of Aarhus,
    9. University of Dhaka,
    10. University of Michigan
    11. University of Michigan.
    12. University of Toronto
    13. University of¬†Cape Town
    14. University of Salento
    15. University of Palermo,
    16. University
of Toronto,
    17. University of 
Massachusetts,
    18. University of Minnesota
    19. University of Illinois,
    20. University of Rome,
    21. University of Padua,
    22. University of Bath
    2

we should updaate the PDF processing and PubMed search workflow:


In [21]:
def analyze_pdfs_and_search_pubmed_with_review(pdf_dir, output_dir='../data/processed/affiliations', 
                                             limit_pdfs=None, max_results_per_query=20):
    """
    Complete pipeline with manual review: Extract affiliations, review them, then search PubMed
    
    Args:
        pdf_dir: Directory containing PDFs to process
        output_dir: Directory for saving outputs
        limit_pdfs: Maximum number of PDFs to process (None for all)
        max_results_per_query: Maximum results per PubMed query
    """
    os.makedirs(output_dir, exist_ok=True)
    
    # Step 1: Mine affiliations from PDFs
    print("üîé Step 1: Mining affiliations from PDFs...")
    affiliations_output = os.path.join(output_dir, "discovered_affiliations.json")
    affiliation_results = mine_affiliations_from_pdfs(
        pdf_dir=pdf_dir,
        output_json=affiliations_output,
        limit=limit_pdfs
    )
    
    clusters = affiliation_results.get('affiliation_clusters', [])
    
    # Step 2: Manual review of affiliations (NEW)
    print("\nüîç Step 2: Reviewing discovered affiliations...")
    if not clusters:
        print("‚ö†Ô∏è No affiliation clusters found.")
        approved_variations = []
    else:
        # Extract clusters from results
        raw_clusters = [cluster['variations'] for cluster in clusters]
        approved_variations = review_and_select_affiliations(raw_clusters)
    
    # Step 3: Format approved variations for PubMed search
    if not approved_variations:
        print("‚ö†Ô∏è No affiliations approved. Using default affiliations for PubMed search.")
        pubmed_variations = None  # Will use defaults in PubmedSearcher
    else:
        pubmed_variations = []
        for variation in approved_variations:
            # Clean and format for PubMed
            clean_aff = re.sub(r'[,.:]', '', variation)
            clean_aff = re.sub(r'\s+', ' ', clean_aff).strip()
            pubmed_variation = f"{clean_aff}[Affiliation]"
            pubmed_variations.append(pubmed_variation)
    
        print(f"üîç Generated {len(pubmed_variations)} PubMed search variations")
        print("\nSample variations:")
        for i, var in enumerate(pubmed_variations[:5]):
            print(f"   {i+1}. {var}")
    
    # Step 4: Search PubMed with approved affiliations
    print("\nüîç Step 4: Searching PubMed with approved affiliations...")
    searcher = PubmedSearcher()
    articles = searcher.comprehensive_search(
        affiliation_variations=pubmed_variations,
        max_per_query=max_results_per_query
    )
    
    # Step 5: Save PubMed results
    print(f"\nüìä Found {len(articles)} articles from PubMed")
    pubmed_output = os.path.join(output_dir, "pubmed_results.json")
    with open(pubmed_output, 'w', encoding='utf-8') as f:
        json.dump(articles, f, indent=2, ensure_ascii=False)
    print(f"‚úÖ PubMed results saved to {pubmed_output}")
    
    # Step 6: Summary
    print("\nüìã Pipeline Summary:")
    print(f"   PDFs processed: {affiliation_results['total_pdfs_processed']}")
    print(f"   Unique affiliations found: {affiliation_results['total_affiliations_found']}")
    print(f"   Affiliation clusters reviewed: {len(clusters)}")
    print(f"   Approved variations: {len(approved_variations)}")
    print(f"   PubMed articles found: {len(articles)}")
    
    return {
        'affiliation_results': affiliation_results,
        'approved_variations': approved_variations,
        'pubmed_articles': articles
    }

üéØ Top Results Found

The system correctly identified target clusters:

"Instituto de Fisiolog√≠a Celular" (Score: 1767) - 39 variations ‚úÖ
"Universidad Nacional Aut√≥noma de M√©xico" (Score: 708) - 53 variations ‚úÖ
"Institute for Cellular Physiology" (Score: 544) - 10 variations ‚úÖ
Department-level affiliations with IFC connections ‚úÖ
"Cellular Physiology" standalone terms ‚úÖ

Intelligent Scoring: Uses pattern matching, keywords, and similarity algorithms
- added Customizable: Easily adjust thresholds and criteria
- Formatted for use in PubMed searches and workflows

üöÄ How to Use
Option 1: Use the pre-filtered results (Recommended):

```bash
# The filtered results are ready to use in:
# - data/processed/filtered_affiliations.json
# - data/processed/manual_review_affiliations.txt
```

Option 2: Interactive filtering

```bash
python3 scripts/filter_affiliations.py
```

Option 3: Command line

```bash
# Conservative (highest quality)
python3 scripts/filter_affiliations.py --score 10.0 --limit 20

# Liberal (more inclusive)  
python3 scripts/filter_affiliations.py --score 2.0 --limit 100
```

## 4. PubMed Search Strategy

In [22]:
class PubmedSearcher:
    def __init__(self):
        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        
    def build_search_queries(self, affiliation_variations=None):
        """Build comprehensive search queries for different affiliation variations"""
        
        if affiliation_variations is None:
            # Default variations based on your institute
            affiliation_variations = [
                "Instituto de Fisiologia Celular[Affiliation]",
                "Institute of Cellular Physiology[Affiliation]",
                "IFC UNAM[Affiliation]",
                "Departamento de Neurobiologia UNAM[Affiliation]",
                "Universidad Nacional Autonoma Mexico Fisiologia[Affiliation]",
                "National Autonomous University Mexico Cellular Physiology[Affiliation]"
            ]
        
        queries = []
        
        # Individual affiliation searches
        for aff in affiliation_variations:
            queries.append(aff)
            
        # Combined searches with time ranges
        recent_query = f"({' OR '.join(affiliation_variations[:3])}) AND (2020:2024[pdat])"
        historical_query = f"({' OR '.join(affiliation_variations[:3])}) AND (2010:2019[pdat])"
        
        queries.extend([recent_query, historical_query])
        
        return queries
    
    def search_pubmed(self, query, max_results=100):
        """Search PubMed with a given query"""
        
        # Step 1: Search
        search_url = f"{self.base_url}esearch.fcgi"
        search_params = {
            'db': 'pubmed',
            'term': query,
            'retmax': max_results,
            'retmode': 'json'
        }
        
        try:
            response = requests.get(search_url, params=search_params)
            search_data = response.json()
            
            pmids = search_data['esearchresult']['idlist']
            total_count = int(search_data['esearchresult']['count'])
            
            print(f"Found {total_count} results for query: {query[:50]}...")
            
            if not pmids:
                return []
            
            # Step 2: Fetch details
            time.sleep(0.5)  # Rate limiting
            
            fetch_url = f"{self.base_url}efetch.fcgi"
            fetch_params = {
                'db': 'pubmed',
                'id': ','.join(pmids),
                'retmode': 'xml'
            }
            
            fetch_response = requests.get(fetch_url, params=fetch_params)
            
            # Parse XML (simplified - you might want to use xml.etree.ElementTree)
            articles = self.parse_pubmed_xml(fetch_response.text)
            
            return articles
            
        except Exception as e:
            print(f"Error searching PubMed: {e}")
            return []
    
    def parse_pubmed_xml(self, xml_content):
        """Simple XML parsing for PubMed results (you might want to improve this)"""
        import xml.etree.ElementTree as ET
        
        articles = []
        
        try:
            root = ET.fromstring(xml_content)
            
            for article in root.findall('.//PubmedArticle'):
                try:
                    # Extract basic info
                    pmid = article.find('.//PMID').text
                    
                    title_elem = article.find('.//ArticleTitle')
                    title = title_elem.text if title_elem is not None else "No title"
                    
                    # Authors
                    authors = []
                    for author in article.findall('.//Author'):
                        lastname = author.find('.//LastName')
                        firstname = author.find('.//ForeName')
                        if lastname is not None:
                            author_name = lastname.text
                            if firstname is not None:
                                author_name += f", {firstname.text}"
                            authors.append(author_name)
                    
                    # Journal and year
                    journal_elem = article.find('.//Journal/Title')
                    journal = journal_elem.text if journal_elem is not None else "Unknown"
                    
                    year_elem = article.find('.//PubDate/Year')
                    year = int(year_elem.text) if year_elem is not None else None
                    
                    # Abstract
                    abstract_elem = article.find('.//Abstract/AbstractText')
                    abstract = abstract_elem.text if abstract_elem is not None else ""
                    
                    # DOI
                    doi_elem = article.find('.//ELocationID[@EIdType="doi"]')
                    doi = doi_elem.text if doi_elem is not None else None
                    
                    article_data = {
                        'pmid': pmid,
                        'title': title,
                        'authors': '; '.join(authors),
                        'journal': journal,
                        'year': year,
                        'abstract': abstract,
                        'doi': doi
                    }
                    
                    articles.append(article_data)
                    
                except Exception as e:
                    print(f"Error parsing article: {e}")
                    continue
                    
        except Exception as e:
            print(f"Error parsing XML: {e}")
            
        return articles
    
    def comprehensive_search(self, affiliation_variations=None, max_per_query=50):
        """
        Run comprehensive search with all query variations
        
        Args:
            affiliation_variations: Optional list of affiliation variations to use
            max_per_query: Maximum results per query
        
        Returns:
            List of articles found
        """
        queries = self.build_search_queries(affiliation_variations)
        all_articles = []
        seen_pmids = set()
        
        for i, query in enumerate(queries):
            print(f"\nüîç Running search {i+1}/{len(queries)}")
            articles = self.search_pubmed(query, max_per_query)
            
            # Deduplicate
            new_articles = []
            for article in articles:
                if article['pmid'] not in seen_pmids:
                    seen_pmids.add(article['pmid'])
                    new_articles.append(article)
            
            all_articles.extend(new_articles)
            print(f"Added {len(new_articles)} new articles (total: {len(all_articles)})")
            
            time.sleep(1)  # Be respectful to NCBI
            
        return all_articles
# Run comprehensive PubMed search
print("üîç Starting comprehensive PubMed search...")
searcher = PubmedSearcher()

# Test with a single query first
test_articles = searcher.search_pubmed("Instituto de Fisiologia Celular[Affiliation]", max_results=5)
print(f"\nüìä Test search found {len(test_articles)} articles")

if test_articles:
    print("\nSample result:")
    sample = test_articles[0]
    print(f"Title: {sample['title'][:100]}...")
    print(f"Authors: {sample['authors'][:100]}...")
    print(f"PMID: {sample['pmid']}")

new_articles = searcher.comprehensive_search(max_per_query=20)
print(f"\nüéâ Found {len(new_articles)} total unique articles from PubMed")

üîç Starting comprehensive PubMed search...
Found 2248 results for query: Instituto de Fisiologia Celular[Affiliation]...

üìä Test search found 5 articles

Sample result:
Title: Multistable bimodal perceptual coding within the ventral premotor cortex....
Authors: Andrade-Ortega, Bernardo; D√≠az, H√©ctor; Bayones, Lucas; Alvarez, Manuel; Zainos, Antonio; Rivera-Yos...
PMID: 40971437

üîç Running search 1/8
Found 2248 results for query: Instituto de Fisiologia Celular[Affiliation]...
Added 20 new articles (total: 20)

üîç Running search 2/8
Found 88 results for query: Institute of Cellular Physiology[Affiliation]...
Added 20 new articles (total: 40)

üîç Running search 3/8
Found 522 results for query: IFC UNAM[Affiliation]...
Added 12 new articles (total: 52)

üîç Running search 4/8
Found 9 results for query: Departamento de Neurobiologia UNAM[Affiliation]...
Added 9 new articles (total: 61)

üîç Running search 5/8
Found 3968 results for query: Universidad Nacional Autonoma Mexic

## Using Pre-Filtered Affiliations for PubMed Search

Instead of manually specifying affiliation variations, we'll now use the automatically filtered and scored affiliations from our filtering system.

In [None]:
# Load the pre-filtered affiliation results
import json

def load_filtered_affiliations(min_score=15.0):
    """
    Load pre-filtered affiliation clusters for PubMed searches.
    
    Args:
        min_score: Minimum relevance score to include (default: 15.0 for high quality)
        
    Returns:
        List of affiliation terms optimized for PubMed searches
    """
    
    filtered_file = 'data/processed/filtered_affiliations.json'
    
    print(f"üìÅ Loading filtered affiliations from {filtered_file}")
    
    with open(filtered_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    clusters = data['relevant_affiliation_clusters']
    
    # Filter by score and extract search terms
    affiliation_terms = []
    
    for cluster in clusters:
        if cluster['relevance_score'] >= min_score:
            
            # Add representative term
            representative = clean_affiliation_for_search(cluster['representative'])
            if representative:
                affiliation_terms.append(f'"{representative}"[Affiliation]')
            
            # Add top variations (limit to avoid too many terms)
            for variation in cluster['variations'][:3]:  # Top 3 variations per cluster
                cleaned = clean_affiliation_for_search(variation)
                if cleaned and len(cleaned) > 10:  # Only substantial terms
                    search_term = f'"{cleaned}"[Affiliation]'
                    if search_term not in affiliation_terms:  # Avoid duplicates
                        affiliation_terms.append(search_term)
    
    print(f"‚úÖ Extracted {len(affiliation_terms)} affiliation search terms from {len([c for c in clusters if c['relevance_score'] >= min_score])} high-scoring clusters")
    
    return affiliation_terms

def clean_affiliation_for_search(term):
    """Clean an affiliation term for PubMed search."""
    import re
    
    if not term:
        return ""
    
    # Remove common noise patterns
    cleaned = re.sub(r'[‚Ä¢\d]+\s*', '', term)  # Remove bullets and leading numbers
    cleaned = re.sub(r'[^\w\s\-,.]', ' ', cleaned)  # Remove special chars except basic punctuation
    cleaned = re.sub(r'\s+', ' ', cleaned)  # Normalize whitespace
    cleaned = cleaned.strip()
    
    # Remove very generic prefixes
    prefixes_to_remove = ['the ', 'a ', 'an ', 'at the ', 'from the ']
    for prefix in prefixes_to_remove:
        if cleaned.lower().startswith(prefix):
            cleaned = cleaned[len(prefix):]
    
    # Skip very short or generic terms
    if len(cleaned) < 8:
        return ""
    
    generic_terms = ['university', 'institute', 'department', 'school', 'college']
    if cleaned.lower() in generic_terms:
        return ""
    
    return cleaned

# Load the filtered affiliations
print("üîç LOADING PRE-FILTERED AFFILIATIONS FOR PUBMED SEARCH")
print("=" * 60)

# You can adjust the min_score based on your needs:
# - 20.0+: Only the highest quality, most relevant affiliations
# - 15.0+: High quality affiliations (recommended)
# - 10.0+: Moderate quality, more inclusive
# - 5.0+: Liberal, includes more possibilities

filtered_affiliations = load_filtered_affiliations(min_score=15.0)

print(f"\nüìã TOP 10 AFFILIATION SEARCH TERMS:")
for i, term in enumerate(filtered_affiliations[:10], 1):
    print(f"{i:2d}. {term}")

if len(filtered_affiliations) > 10:
    print(f"    ... and {len(filtered_affiliations) - 10} more terms")

print(f"\nüí° You can adjust min_score to get more or fewer terms:")
print(f"   - Current (15.0): {len(filtered_affiliations)} terms")
print(f"   - If 10.0: {len(load_filtered_affiliations(10.0))} terms") 
print(f"   - If 20.0: {len(load_filtered_affiliations(20.0))} terms")

### Enhanced PubMed Search with Filtered Affiliations

Now we'll modify the PubMed searcher to use our filtered affiliations instead of hardcoded ones:

In [None]:
# Enhanced PubMed Searcher using filtered affiliations
class EnhancedPubmedSearcher(PubmedSearcher):
    """Enhanced PubMed searcher that uses pre-filtered affiliation results."""
    
    def __init__(self):
        super().__init__()
        self.filtered_affiliations = None
    
    def load_affiliations(self, min_score=15.0):
        """Load filtered affiliations for searches."""
        self.filtered_affiliations = load_filtered_affiliations(min_score)
        return self.filtered_affiliations
    
    def build_smart_search_queries(self, max_terms_per_query=5):
        """
        Build intelligent search queries using filtered affiliations.
        
        Args:
            max_terms_per_query: Maximum affiliation terms per query to avoid overly complex searches
            
        Returns:
            List of optimized search queries
        """
        
        if not self.filtered_affiliations:
            print("‚ö†Ô∏è  No filtered affiliations loaded. Loading with default score...")
            self.load_affiliations()
        
        affiliations = self.filtered_affiliations
        queries = []
        
        # Strategy 1: High-priority terms (top scoring affiliations)
        high_priority = affiliations[:max_terms_per_query]
        if high_priority:
            priority_query = f"({' OR '.join(high_priority)})"
            queries.append(priority_query)
        
        # Strategy 2: Batch remaining terms to avoid overwhelming single queries
        remaining_terms = affiliations[max_terms_per_query:]
        
        for i in range(0, len(remaining_terms), max_terms_per_query):
            batch = remaining_terms[i:i + max_terms_per_query]
            if batch:
                batch_query = f"({' OR '.join(batch)})"
                queries.append(batch_query)
        
        # Strategy 3: Add time-filtered searches for high-priority terms
        if high_priority:
            recent_query = f"({' OR '.join(high_priority[:3])}) AND (2020:2024[pdat])"
            historical_query = f"({' OR '.join(high_priority[:3])}) AND (2010:2019[pdat])"
            queries.extend([recent_query, historical_query])
        
        print(f"‚úÖ Generated {len(queries)} optimized search queries")
        return queries
    
    def comprehensive_filtered_search(self, min_score=15.0, max_per_query=50, max_terms_per_query=5):
        """
        Run comprehensive search using filtered affiliations.
        
        Args:
            min_score: Minimum relevance score for affiliations to include
            max_per_query: Maximum results per individual query
            max_terms_per_query: Maximum affiliation terms per query
            
        Returns:
            Dictionary with search results and metadata
        """
        
        print("üîç COMPREHENSIVE PUBMED SEARCH WITH FILTERED AFFILIATIONS")
        print("=" * 65)
        
        # Load affiliations
        affiliations = self.load_affiliations(min_score)
        print(f"üìä Using {len(affiliations)} filtered affiliation terms (score >= {min_score})")
        
        # Build queries
        queries = self.build_smart_search_queries(max_terms_per_query)
        
        # Execute searches
        all_articles = []
        seen_pmids = set()
        query_results = []
        
        for i, query in enumerate(queries):
            print(f"\nüîç Query {i+1}/{len(queries)}: {query[:100]}{'...' if len(query) > 100 else ''}")
            
            try:
                articles = self.search_pubmed(query, max_per_query)
                
                # Deduplicate
                new_articles = []
                for article in articles:
                    if article['pmid'] not in seen_pmids:
                        seen_pmids.add(article['pmid'])
                        new_articles.append(article)
                
                all_articles.extend(new_articles)
                
                query_result = {
                    'query': query,
                    'total_found': len(articles),
                    'new_articles': len(new_articles),
                    'cumulative_total': len(all_articles)
                }
                query_results.append(query_result)
                
                print(f"   Found: {len(articles)} | New: {len(new_articles)} | Total: {len(all_articles)}")
                
                time.sleep(1)  # Rate limiting
                
            except Exception as e:
                print(f"   ‚ùå Error: {e}")
                continue
        
        # Summary
        print(f"\nüéâ SEARCH COMPLETE!")
        print(f"üìä Total unique articles found: {len(all_articles)}")
        print(f"üîç Queries executed: {len([r for r in query_results if r['total_found'] > 0])}/{len(queries)}")
        
        # Return comprehensive results
        return {
            'articles': all_articles,
            'search_metadata': {
                'total_articles': len(all_articles),
                'affiliations_used': len(affiliations),
                'min_score_threshold': min_score,
                'queries_executed': len(queries),
                'query_results': query_results
            }
        }

# Initialize the enhanced searcher
print("üöÄ INITIALIZING ENHANCED PUBMED SEARCHER")
print("=" * 50)

enhanced_searcher = EnhancedPubmedSearcher()

# Test with a small search first
print("\nüß™ TESTING WITH SMALL SEARCH...")
test_affiliations = enhanced_searcher.load_affiliations(min_score=20.0)  # Very high score for testing
print(f"Test will use {len(test_affiliations)} highest-scoring affiliations")

if test_affiliations:
    print(f"\nTop 3 test affiliations:")
    for i, aff in enumerate(test_affiliations[:3], 1):
        print(f"  {i}. {aff}")
else:
    print("‚ö†Ô∏è  No affiliations found with score >= 20.0, try lower score")

### Run the Enhanced Search

Choose your search strategy based on your needs:

In [None]:
# CHOOSE YOUR SEARCH STRATEGY
# Uncomment the option you want to use:

# Option 1: CONSERVATIVE - High precision, fewer results
# Uses only the highest-scoring affiliations (20.0+)
# Best for: High-quality, highly relevant results
# search_results = enhanced_searcher.comprehensive_filtered_search(
#     min_score=20.0,      # Only top-quality affiliations
#     max_per_query=30,    # Fewer results per query  
#     max_terms_per_query=3 # Simpler queries
# )

# Option 2: BALANCED - Good balance of precision and recall (RECOMMENDED)
# Uses high-scoring affiliations (15.0+)  
# Best for: Most use cases
search_results = enhanced_searcher.comprehensive_filtered_search(
    min_score=15.0,      # High-quality affiliations
    max_per_query=50,    # Moderate results per query
    max_terms_per_query=5 # Balanced query complexity
)

# Option 3: LIBERAL - Higher recall, more results
# Uses moderately-scoring affiliations (10.0+)
# Best for: Comprehensive literature review, exploratory research
# search_results = enhanced_searcher.comprehensive_filtered_search(
#     min_score=10.0,      # More inclusive
#     max_per_query=75,    # More results per query
#     max_terms_per_query=7 # More complex queries
# )

# Display results summary
print("\n" + "=" * 70)
print("üìä SEARCH RESULTS SUMMARY")
print("=" * 70)

articles = search_results['articles']
metadata = search_results['search_metadata']

print(f"üéØ Total Articles Found: {len(articles)}")
print(f"üîç Affiliations Used: {metadata['affiliations_used']} (score >= {metadata['min_score_threshold']})")
print(f"‚öôÔ∏è  Queries Executed: {metadata['queries_executed']}")

if articles:
    print(f"\nüìñ Sample Results:")
    for i, article in enumerate(articles[:3], 1):
        print(f"\n{i}. {article['title'][:100]}{'...' if len(article['title']) > 100 else ''}")
        print(f"   Authors: {article['authors'][:80]}{'...' if len(article['authors']) > 80 else ''}")
        print(f"   Journal: {article['journal']} ({article['year']})")
        print(f"   PMID: {article['pmid']}")
        
    if len(articles) > 3:
        print(f"\n   ... and {len(articles) - 3} more articles")

# Query performance breakdown
print(f"\nüîç Query Performance:")
successful_queries = [q for q in metadata['query_results'] if q['total_found'] > 0]
print(f"Successful queries: {len(successful_queries)}/{len(metadata['query_results'])}")

for i, query_result in enumerate(successful_queries[:5], 1):  # Show top 5 performing queries
    print(f"{i}. Found {query_result['total_found']} articles ({query_result['new_articles']} new)")
    print(f"   Query: {query_result['query'][:80]}{'...' if len(query_result['query']) > 80 else ''}")

print(f"\nüíæ Next Steps:")
print(f"1. Review the {len(articles)} articles found")
print(f"2. Save results to your database")
print(f"3. Run text extraction and analysis on promising articles")
print(f"4. Adjust search parameters if needed (min_score, max_per_query, etc.)")

# Save results for later use
if articles:
    print(f"\nüíæ Would you like to save these results?")
    # Uncomment to save:
    # import json
    # with open('data/processed/pubmed_filtered_search_results.json', 'w') as f:
    #     json.dump(search_results, f, indent=2)
    # print("‚úÖ Results saved to data/processed/pubmed_filtered_search_results.json")

## 5. Database Integration & Expansion

In [23]:
def merge_publication_databases(existing_pubs, new_pubs, output_file='../data/processed/expanded_ifc_publications.json'):
    """Merge existing publications with newly found ones, removing duplicates"""
    
    # Create lookup sets for deduplication
    existing_dois = {pub.get('doi') for pub in existing_pubs if pub.get('doi')}
    existing_pmids = {pub.get('pubmed_id') for pub in existing_pubs if pub.get('pubmed_id')}
    existing_titles = {pub.get('title', '').lower().strip() for pub in existing_pubs if pub.get('title')}
    
    merged_pubs = existing_pubs.copy()
    new_count = 0
    
    print(f"Processing {len(new_pubs)} potential new publications...")
    
    for pub in new_pubs:
        is_duplicate = False
        
        # Check for duplicates
        if pub.get('doi') and pub['doi'] in existing_dois:
            is_duplicate = True
        elif pub.get('pmid') and pub['pmid'] in existing_pmids:
            is_duplicate = True
        elif pub.get('title', '').lower().strip() in existing_titles:
            is_duplicate = True
            
        if not is_duplicate:
            # Convert PubMed format to your format
            converted_pub = {
                'title': pub.get('title', ''),
                'authors': pub.get('authors', ''),
                'journal': pub.get('journal', ''),
                'year': pub.get('year'),
                'doi': pub.get('doi'),
                'pubmed_id': pub.get('pmid'),
                'ifc_url': None,  # Not available from PubMed
                'abstract': pub.get('abstract', ''),
                'keywords': None,
                'embedding_text': pub.get('abstract', '') + " " + pub.get('title', ''),  # Text for embeddings
                'keywords_extracted': extract_keywords(pub.get('abstract', '') + " " + pub.get('title', '')),
                'metadata': {
                    'source': 'PubMed_search',
                    'has_full_text': False,  # Set to True when full text is available
                    'affiliation_matched': pub.get('affiliation_matched', 'Unknown')  # Store which affiliation matched                
            }
        }
            
            merged_pubs.append(converted_pub)
            new_count += 1
            
            # Update tracking sets (FIXED - removed erroneous import)
            if pub.get('doi'):
                existing_dois.add(pub['doi'])
            if pub.get('pmid'):
                existing_pmids.add(pub['pmid'])
            existing_titles.add(pub.get('title', '').lower().strip())
    
    
    # Save expanded database
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(merged_pubs, f, indent=2, ensure_ascii=False)
    
    print(f"\nüìä Database expansion complete:")
    print(f"   Original publications: {len(existing_pubs)}")
    print(f"   New publications added: {new_count}")
    print(f"   Total publications: {len(merged_pubs)}")
    print(f"   Saved to: {output_file}")
    
    return merged_pubs

# Demo with existing data
print("üìà Database expansion simulation")
print(f"Current database size: {len(publications)}")

# Create some demo "new" publications
demo_new_pubs = [
    {
        'pmid': '99999999',
        'title': 'Demo paper: Synaptic plasticity in hippocampal circuits',
        'authors': 'Demo Author, A.; Demo Author, B.',
        'journal': 'Demo Journal of Neuroscience',
        'year': 2023,
        'abstract': 'This is a demo abstract about hippocampal synaptic plasticity...',
        'doi': '10.1234/demo.2023.001'
    }
]

expanded_db = merge_publication_databases(publications, demo_new_pubs)


üìà Database expansion simulation
Current database size: 2
Processing 1 potential new publications...


NameError: name 'extract_keywords' is not defined

In [None]:
# Test merge with the full IFC database to evaluate effectiveness
print("üß™ Testing merge with full IFC database to evaluate method effectiveness\n")

# Load the full IFC database
try:
    with open('../data/raw/all_ifc_publications.json', 'r', encoding='utf-8') as f:
        full_ifc_db = json.load(f)
    print(f"‚úÖ Loaded full IFC database: {len(full_ifc_db)} publications")
except FileNotFoundError:
    print("‚ùå Could not find '../data/raw/all_ifc_publications.json'")
    print("   Please check the file path")
    full_ifc_db = []

if full_ifc_db and new_articles:
    print(f"üìä Comparison Analysis:")
    print(f"   Full IFC database: {len(full_ifc_db)} publications")
    print(f"   PubMed search found: {len(new_articles)} publications")
    
    # Test merge (in memory only)
    def test_merge_effectiveness(existing_pubs, new_pubs):
        """Test merge to evaluate method effectiveness without saving"""
        
        # Create lookup sets for deduplication (do this once, not in the loop)
        print("   Building lookup tables for faster matching...")
        existing_dois = {pub.get('doi') for pub in existing_pubs if pub.get('doi')}
        existing_pmids = {pub.get('pubmed_id') for pub in existing_pubs if pub.get('pubmed_id')}
        
        # Pre-process titles once for performance
        existing_titles = set()
        for pub in existing_pubs:
            if pub.get('title'):
                existing_titles.add(pub.get('title', '').lower().strip())
        
        # Analyze new publications
        new_count = 0
        duplicate_by_doi = 0
        duplicate_by_pmid = 0
        duplicate_by_title = 0
        truly_new = []
        
        # Process with progress indicator
        print("\n   Processing publications:")
        total_new = len(new_pubs)
        update_interval = max(1, min(100, total_new // 10))  # Show 10 updates or every item for small sets
        
        for i, pub in enumerate(new_pubs):
            # Show progress periodically
            if i % update_interval == 0 or i == total_new - 1:
                progress = (i + 1) / total_new * 100
                print(f"   Progress: {i+1}/{total_new} ({progress:.1f}%)", end="\r")
            
            is_duplicate = False
            duplicate_reason = ""
            
            # Check for duplicates with detailed tracking - check fastest methods first
            if pub.get('doi') and pub['doi'] in existing_dois:
                is_duplicate = True
                duplicate_reason = "DOI match"
                duplicate_by_doi += 1
            elif pub.get('pmid') and pub['pmid'] in existing_pmids:
                is_duplicate = True
                duplicate_reason = "PMID match"
                duplicate_by_pmid += 1
            elif pub.get('title'):
                title_lower = pub.get('title', '').lower().strip()
                if title_lower in existing_titles:
                    is_duplicate = True
                    duplicate_reason = "Title match"
                    duplicate_by_title += 1
                
            if not is_duplicate:
                # This is a new publication
                truly_new.append({
                    'title': pub.get('title', ''),
                    'authors': pub.get('authors', ''),
                    'journal': pub.get('journal', ''),
                    'year': pub.get('year'),
                    'doi': pub.get('doi'),
                    'pubmed_id': pub.get('pmid'),
                    'abstract': pub.get('abstract', ''),
                    'source': 'PubMed_search'
                })
                new_count += 1
        
        print("\n   Processing complete!                           ")  # Clear progress line
        
        return {
            'total_found': len(new_pubs),
            'truly_new': new_count,
            'truly_new_articles': truly_new,
            'duplicates': {
                'by_doi': duplicate_by_doi,
                'by_pmid': duplicate_by_pmid,
                'by_title': duplicate_by_title,
                'total': len(new_pubs) - new_count
            }
        }
    
    # Perform test merge
    print("\n   Starting merge effectiveness analysis...")
    merge_results = test_merge_effectiveness(full_ifc_db, new_articles)
    
    print(f"\nüéØ Method Effectiveness Analysis:")
    print(f"   üìà Publications found by PubMed: {merge_results['total_found']}")
    print(f"   ‚ú® Truly new publications: {merge_results['truly_new']}")
    print(f"   üîÑ Duplicates found: {merge_results['duplicates']['total']}")
    print(f"      - By DOI: {merge_results['duplicates']['by_doi']}")
    print(f"      - By PMID: {merge_results['duplicates']['by_pmid']}")
    print(f"      - By Title: {merge_results['duplicates']['by_title']}")
    
    effectiveness_rate = (merge_results['truly_new'] / merge_results['total_found']) * 100 if merge_results['total_found'] > 0 else 0
    print(f"   üìä Method effectiveness: {effectiveness_rate:.1f}% new content")
    
    # Show sample of new publications
    if merge_results['truly_new'] > 0:
        print(f"\nüìã Sample of new publications found:")
        sample_size = min(5, merge_results['truly_new'])
        for i, pub in enumerate(merge_results['truly_new_articles'][:sample_size]):
            print(f"   {i+1}. {pub['title']}")
            print(f"      Journal: {pub['journal']}, Year: {pub['year']}")
            print(f"      DOI: {pub['doi']}")
            print()
else:
    print("‚ö†Ô∏è Cannot perform analysis - missing data")

üß™ Testing merge with full IFC database to evaluate method effectiveness

‚úÖ Loaded full IFC database: 404 publications
üìä Comparison Analysis:
   Full IFC database: 404 publications
   PubMed search found: 114 publications

   Starting merge effectiveness analysis...
   Building lookup tables for faster matching...

   Processing publications:
   Progress: 114/114 (100.0%)
   Processing complete!                           

üéØ Method Effectiveness Analysis:
   üìà Publications found by PubMed: 114
   ‚ú® Truly new publications: 89
   üîÑ Duplicates found: 25
      - By DOI: 23
      - By PMID: 2
      - By Title: 0
   üìä Method effectiveness: 78.1% new content

üìã Sample of new publications found:
   1. Multistable bimodal perceptual coding within the ventral premotor cortex.
      Journal: Science advances, Year: 2025
      DOI: 10.1126/sciadv.adw5500

   2. Inhibition of the oncogenic channel Kv10.1 by the antipsychotic drug penfluridol.
      Journal: Frontiers in phar

## 6. Automated Pipeline

 Performance Optimizations for Large Datasets
For handling larger databases,  adding indexes:



In [None]:
def build_publication_indexes(publications):
    """Create index structures for faster lookup"""
    indexes = {
        'doi': {},
        'pmid': {},
        'title_lower': {},
        'year': {},
        'journal': {}
    }
    
    for i, pub in enumerate(publications):
        if pub.get('doi'):
            indexes['doi'][pub['doi']] = i
        if pub.get('pubmed_id'):
            indexes['pmid'][pub['pubmed_id']] = i
        if pub.get('title'):
            indexes['title_lower'][pub['title'].lower().strip()] = i
        if pub.get('year'):
            year = pub['year']
            if year not in indexes['year']:
                indexes['year'][year] = []
            indexes['year'][year].append(i)
        if pub.get('journal'):
            journal = pub['journal']
            if journal not in indexes['journal']:
                indexes['journal'][journal] = []
            indexes['journal'][journal].append(i)
    
    return indexes

adding keywords for better embeddings

In [None]:
def extract_keywords(text, max_keywords=10):
    """Extract key terms from text for better embedding search"""
    if not text:
        return []
        
    try:
        # Simple frequency-based extraction
        # In a real application, consider using more sophisticated methods
        words = re.findall(r'\b[a-zA-Z]{3,15}\b', text.lower())
        word_freq = Counter(words)
        
        # Filter out common words
        common_words = {'the', 'and', 'was', 'were', 'with', 'for', 'this', 'that'}
        for word in common_words:
            if word in word_freq:
                del word_freq[word]
                
        # Return top keywords
        return [word for word, _ in word_freq.most_common(max_keywords)]
    except Exception:
        return []

include text fields (embeddings)

In [None]:
# When preparing for ChromaDB
documents = []
metadatas = []
ids = []

for i, pub in enumerate(final_db):
    # Combine text fields for embedding
    doc_text = pub.get('title', '') + " " + pub.get('abstract', '')
    if pub.get('full_text'):
        doc_text += " " + pub.get('full_text')
    
    # Add to lists
    documents.append(doc_text)
    metadatas.append({
        'title': pub.get('title'),
        'year': pub.get('year'),
        'journal': pub.get('journal'),
        'authors': pub.get('authors'),
        'keywords': pub.get('keywords_extracted', [])
    })
    ids.append(f"pub_{i}")

data quality

In [None]:
def check_data_quality(publications):
    """Check database quality before embedding"""
    issues = {
        'missing_title': 0,
        'missing_abstract': 0,
        'missing_year': 0,
        'missing_authors': 0,
        'short_text': 0
    }
    
    for pub in publications:
        if not pub.get('title'):
            issues['missing_title'] += 1
        if not pub.get('abstract'):
            issues['missing_abstract'] += 1
        if not pub.get('year'):
            issues['missing_year'] += 1
        if not pub.get('authors'):
            issues['missing_authors'] += 1
        
        # Check if there's enough text to create meaningful embeddings
        text_len = len((pub.get('title', '') + " " + pub.get('abstract', '')).split())
        if text_len < 30:  # Arbitrary threshold
            issues['short_text'] += 1
    
    return issues

In [None]:
publication type classification to improve search capabilities

In [None]:
def classify_publication_type(title, abstract):
    """Simple rule-based classification of publication type"""
    text = (title + " " + abstract).lower()
    
    if any(kw in text for kw in ['review', 'overview', 'survey']):
        return 'Review'
    elif any(kw in text for kw in ['trial', 'randomized', 'placebo']):
        return 'Clinical Trial'
    elif any(kw in text for kw in ['case report', 'patient case']):
        return 'Case Report'
    elif any(kw in text for kw in ['method', 'technique', 'protocol']):
        return 'Methodology'
    else:
        return 'Research Article'

In [None]:
def run_complete_pipeline_with_review(initial_json_path, pdf_dir, output_dir='../data/processed'):
    """Complete automated pipeline with affiliation review step"""
    
    print("üöÄ Starting complete publication database expansion pipeline\n")
    
    # Step 1: Load existing data
    print("üìÇ Step 1: Loading existing publications")
    with open(initial_json_path, 'r', encoding='utf-8') as f:
        existing_pubs = json.load(f)
    print(f"   Loaded {len(existing_pubs)} existing publications")
    
    # Step 2: Mine affiliations from PDFs and review
    print("\n? Step 2: Mining and reviewing affiliations from PDFs")
    review_results = analyze_pdfs_and_search_pubmed_with_review(
        pdf_dir=pdf_dir,
        output_dir=os.path.join(output_dir, 'affiliations'),
        max_results_per_query=30
    )
    
    new_articles = review_results.get('pubmed_articles', [])
    print(f"   Found {len(new_articles)} potential new articles")
    
    # Step 3: Merge databases
    print("\nüîÑ Step 3: Merging and deduplicating databases")
    expanded_json_path = os.path.join(output_dir, 'expanded_ifc_publications.json')
    final_db = merge_publication_databases(existing_pubs, new_articles, expanded_json_path)

    # Step 5: Create final BibTeX
    print("\nüìö Step 5: Creating final BibTeX file")
    final_bibtex_path = os.path.join(output_dir, 'final_ifc_publications.bib')
    create_bibtex_from_publications(final_db, final_bibtex_path)
    
    # Step 6: Generate summary report
    print("\nüìä Step 6: Generating summary report")
    report = {
        'pipeline_date': time.strftime('%Y-%m-%d %H:%M:%S'),
        'original_count': len(existing_pubs),
        'pubmed_found': len(new_articles),
        'final_count': len(final_db),
        'new_additions': len(final_db) - len(existing_pubs),
        'files_created': {
            'expanded_json': expanded_json_path,
            # 'bibtex_original': bibtex_path, # optional
            'bibtex_final': final_bibtex_path
        },
        'year_distribution': {},
        'top_journals': {}
    }
    
    # Analyze year distribution
    years = [pub.get('year') for pub in final_db if pub.get('year')]
    year_counts = Counter(years)
    report['year_distribution'] = dict(year_counts.most_common(10))
    
    # Analyze top journals
    journals = [pub.get('journal') for pub in final_db if pub.get('journal')]
    journal_counts = Counter(journals)
    report['top_journals'] = dict(journal_counts.most_common(10))
    
    # Save report
    report_path = os.path.join(output_dir, 'pipeline_report.json')
    with open(report_path, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)
    
    print(f"\n‚úÖ Pipeline complete! Summary:")
    print(f"   üìä Original: {report['original_count']} publications")
    print(f"   üÜï Added: {report['new_additions']} new publications")
    print(f"   üìà Final: {report['final_count']} total publications")
    print(f"   üìÑ Report saved: {report_path}")
    
    return final_db, report

def extract_and_store_full_text(publications_with_dois, pdf_dir, output_dir):
    """Extract full text from PDFs and store it with publications data"""   
    # Track which publications have full text
    has_full_text = set()
    # Extract text from PDFs where available
    pdf_texts = batch_process_pdfs(pdf_dir)
    
    # Match PDFs to publications by DOI
    for pub in publications_with_dois:
        if pub.get('doi'):
            # Look for PDF with DOI in filename (PyPaperBot naming convention)
            doi_filename = pub['doi'].replace('/', '_') + '.pdf'
            if doi_filename in pdf_texts:
                # Store text with publication
                pub['full_text'] = pdf_texts[doi_filename]
                pub['metadata']['has_full_text'] = True # Will fail if 'metadata' doesn't exist
                has_full_text.add(pub['doi'])
    
    print(f"Added full text to {len(has_full_text)} publications")
    return publications_with_dois

# final_database, pipeline_report = run_complete_pipeline_with_review('../data/raw/test_ifc_publications.json')

# print("\nüéØ Pipeline ready! Uncomment the line above to run the complete workflow.")
# print("\nNext steps:")
# print("1. Run this pipeline to expand your database")
# print("2. Import the BibTeX files into Zotero to download PDFs")
# print("3. Use the expanded JSON database for your ChromaDB embeddings")
# print("4. Run affiliation mining on downloaded PDFs to find more variations")
# print("5. Iterate to continuously expand your database")

In [None]:
# Define paths
initial_json_path = '../data/raw/all_ifc_publications.json'  # Your existing publications
pdf_dir = '../papers/downloaded'  # Directory with PDFs for affiliation mining
output_dir = '../data/processed'  # Where to save results

# Run the complete pipeline
final_database, pipeline_report = run_complete_pipeline_with_review(
    initial_json_path=initial_json_path,
    pdf_dir=pdf_dir,
    output_dir=output_dir
)

# Print report summary
print(f"\nExpanded database from {pipeline_report['original_count']} to {pipeline_report['final_count']} publications")
print(f"Added {pipeline_report['new_additions']} new publications")

In [None]:
# Extract full text from PDFs and add to publications
enriched_database = extract_and_store_full_text(
    publications_with_dois=final_database,
    pdf_dir=pdf_dir,
    output_dir=output_dir
)

# Save the enriched database with full text
enriched_output_path = os.path.join(output_dir, 'ifc_publications_with_fulltext.json')
with open(enriched_output_path, 'w', encoding='utf-8') as f:
    json.dump(enriched_database, f, indent=2, ensure_ascii=False)
    
print(f"Saved enriched database with full text to: {enriched_output_path}")

In [None]:
# Check data quality
quality_issues = check_data_quality(enriched_database)
print("\nData quality report:")
for issue, count in quality_issues.items():
    percentage = (count / len(enriched_database)) * 100
    print(f"  ‚Ä¢ {issue}: {count} publications ({percentage:.1f}%)")

# Generate metadata about publication types for better search
publication_types = Counter()
for pub in enriched_database:
    pub_type = classify_publication_type(
        pub.get('title', ''), 
        pub.get('abstract', '')
    )
    publication_types[pub_type] += 1
    
    # Add the classification to the publication
    if 'metadata' not in pub:
        pub['metadata'] = {}
    pub['metadata']['publication_type'] = pub_type

print("\nPublication type distribution:")
for pub_type, count in publication_types.most_common():
    percentage = (count / len(enriched_database)) * 100
    print(f"  ‚Ä¢ {pub_type}: {count} publications ({percentage:.1f}%)")