# Building IFC Publications Database

This notebook implements multiple strategies to build a comprehensive database of Instituto de Fisiología Celular publications:

1. **PDF Acquisition**: Sci-Hub integration + BibTeX export for Zotero
2. **Affiliation Mining**: Extract all variations of institute names from existing PDFs
3. **PubMed Search Strategy**: Use discovered affiliations to find more papers
4. **Database Expansion**: Automated workflow to grow the collection

In [13]:
import json
import requests
import re
import os
import time
from urllib.parse import quote
from pathlib import Path
import pandas as pd
from collections import Counter
import bibtexparser
from bibtexparser.bwriter import BibTexWriter
from bibtexparser.bibdatabase import BibDatabase
from pypdf import PdfReader
import warnings
warnings.filterwarnings('ignore')

## 1. Load Existing Publications Data

In [14]:
# Load your existing publications
with open('../data/raw/test_ifc_publications.json', 'r', encoding='utf-8') as f:
    publications = json.load(f)

print(f"Loaded {len(publications)} publications")
print("Sample publication:")
print(json.dumps(publications[0], indent=2, ensure_ascii=False))

Loaded 2 publications
Sample publication:
{
  "title": "Neural mechanisms of memory formation in hippocampal circuits",
  "authors": "García-López, M., Rodríguez-Silva, A., Mendoza-Pérez, J.",
  "journal": "Journal of Neuroscience",
  "year": 2024,
  "doi": "10.1523/JNEUROSCI.1234-24.2024",
  "pubmed_id": "38123456",
  "ifc_url": "https://www.ifc.unam.mx/publicacion.php?ut=000123456789",
  "abstract": "We investigated the cellular and molecular mechanisms underlying memory formation in hippocampal circuits. Using electrophysiological recordings and optogenetic manipulations, we found that...",
  "keywords": null
}


## 2. PDF Acquisition Strategy

### Option A: Sci-Hub Integration (own implementation)

> ⚠️ USE OPTION C

- Not tested for CAPTCHAs
- Use method B or C

In [11]:
class PdfDownloader:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        
    def download_from_scihub(self, doi, output_dir='../papers/downloaded'):
        """Download PDF from Sci-Hub using DOI"""
        os.makedirs(output_dir, exist_ok=True)
        
        # Sci-Hub mirrors (these change frequently)
        scihub_urls = [
            'https://sci-hub.se/',
            'https://sci-hub.st/',
            'https://sci-hub.ru/',
        ]
        
        for base_url in scihub_urls:
            try:
                url = f"{base_url}{doi}"
                response = self.session.get(url, timeout=10)
                
                if response.status_code == 200:
                    # Look for PDF download link
                    if 'application/pdf' in response.headers.get('content-type', ''):
                        # Direct PDF
                        filename = f"{doi.replace('/', '_')}.pdf"
                        filepath = os.path.join(output_dir, filename)
                        
                        with open(filepath, 'wb') as f:
                            f.write(response.content)
                        
                        return filepath
                    
                time.sleep(1)  # Be respectful
                
            except Exception as e:
                print(f"Failed with {base_url}: {e}")
                continue
                
        return None
    
    def download_batch(self, publications, max_downloads=10):
        """Download multiple PDFs with rate limiting"""
        downloaded = []
        
        for i, pub in enumerate(publications[:max_downloads]):
            if 'doi' in pub and pub['doi']:
                print(f"Downloading {i+1}/{max_downloads}: {pub['title'][:50]}...")
                
                filepath = self.download_from_scihub(pub['doi'])
                if filepath:
                    downloaded.append({
                        'publication': pub,
                        'pdf_path': filepath
                    })
                    print(f"✅ Downloaded to {filepath}")
                else:
                    print(f"❌ Failed to download {pub['doi']}")
                    
                time.sleep(2)  # Rate limiting
                
        return downloaded

# Test with a small sample (2)
downloader = PdfDownloader()
downloaded_pdfs = downloader.download_batch(publications, max_downloads=2)

Downloading 1/2: Neural mechanisms of memory formation in hippocamp...
❌ Failed to download 10.1523/JNEUROSCI.1234-24.2024
Downloading 2/2: Cardiac physiology under metabolic stress conditio...
❌ Failed to download 10.1093/cvr/cvz098


### Option B: BibTeX Export for Zotero

> ⚠️ USE OPTION C

> Use one of the multiple zotero -> sci-hub plugins

In [12]:
def create_bibtex_from_publications(publications, output_file='../data/processed/ifc_publications.bib'):
    """Convert JSON publications to BibTeX format for Zotero import"""
    
    db = BibDatabase()
    entries = []
    
    def format_authors_for_bibtex(author_string):
        """Convert author string to proper BibTeX format"""
        if not author_string:
            return "Unknown"
        
        # Split by commas and clean each author
        authors = [author.strip() for author in author_string.split(',')]
        
        # Group authors (assuming they come in pairs: LastName, FirstName)
        formatted_authors = []
        i = 0
        while i < len(authors):
            if i + 1 < len(authors):
                # Check if next item looks like a first name (short, no hyphens typically)
                next_item = authors[i + 1].strip()
                if (len(next_item) <= 3 or 
                    (len(next_item.split()) == 1 and '.' in next_item) or
                    re.match(r'^[A-Z]\.?$', next_item)):
                    # This is likely a first name/initial
                    last_name = authors[i].strip()
                    first_name = next_item
                    formatted_authors.append(f"{last_name}, {first_name}")
                    i += 2
                else:
                    # This is likely a full name or last name only
                    formatted_authors.append(authors[i].strip())
                    i += 1
            else:
                # Last author, no pair
                formatted_authors.append(authors[i].strip())
                i += 1
        
        # Join with " and " for BibTeX format
        return " and ".join(formatted_authors)
    
    for i, pub in enumerate(publications):
        # Create a unique citation key
        first_author = pub['authors'].split(',')[0].strip() if pub['authors'] else 'Unknown'
        first_author_clean = re.sub(r'[^a-zA-Z]', '', first_author)
        citation_key = f"{first_author_clean}{pub['year']}_ifc_{i}"
        
        # Format authors properly for BibTeX
        formatted_authors = format_authors_for_bibtex(pub['authors'])
        
        entry = {
            'ENTRYTYPE': 'article',
            'ID': citation_key,
            'title': pub['title'],
            'author': formatted_authors,  # Now properly formatted
            'journal': pub['journal'],
            'year': str(pub['year']),
            'abstract': pub.get('abstract', ''),
            'url': pub.get('ifc_url', ''),
            'note': 'Instituto de Fisiología Celular, UNAM'
        }
        
        if pub.get('doi'):
            entry['doi'] = pub['doi']
            
        if pub.get('pubmed_id'):
            entry['pmid'] = pub['pubmed_id']
            
        entries.append(entry)
    
    db.entries = entries
    
    # Write to file
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    writer = BibTexWriter()
    
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(writer.write(db))
    
    print(f"📚 Created BibTeX file with {len(entries)} entries: {output_file}")
    print("Import this file into Zotero to download PDFs automatically")
    
    # Show sample formatted authors for verification
    print("\n🔍 Sample author formatting:")
    for i, entry in enumerate(entries[:3]):
        print(f"{i+1}. Original: {publications[i]['authors']}")
        print(f"   BibTeX:   {entry['author']}")
    
    return output_file

# Test the improved function
print("🔧 Testing improved BibTeX creation...")
bibtex_file = create_bibtex_from_publications(publications)
print(f"\nBibTeX file created at: {bibtex_file}")

# Let's also check the actual BibTeX content
print("\n📄 Sample BibTeX entries:")
with open(bibtex_file, 'r', encoding='utf-8') as f:
    content = f.read()
    # Show first entry
    first_entry_end = content.find('\n}\n') + 3
    print(content[:first_entry_end])

🔧 Testing improved BibTeX creation...
📚 Created BibTeX file with 2 entries: ../data/processed/ifc_publications.bib
Import this file into Zotero to download PDFs automatically

🔍 Sample author formatting:
1. Original: García-López, M., Rodríguez-Silva, A., Mendoza-Pérez, J.
   BibTeX:   García-López, M. and Rodríguez-Silva, A. and Mendoza-Pérez, J.
2. Original: Hernández-Campos, L., López-Martín, R.
   BibTeX:   Hernández-Campos, L. and López-Martín, R.

BibTeX file created at: ../data/processed/ifc_publications.bib

📄 Sample BibTeX entries:
@article{GarcaLpez2024_ifc_0,
 abstract = {We investigated the cellular and molecular mechanisms underlying memory formation in hippocampal circuits. Using electrophysiological recordings and optogenetic manipulations, we found that...},
 author = {García-López, M. and Rodríguez-Silva, A. and Mendoza-Pérez, J.},
 doi = {10.1523/JNEUROSCI.1234-24.2024},
 journal = {Journal of Neuroscience},
 note = {Instituto de Fisiología Celular, UNAM},
 pmid = {38

### Option C: PyPaperBot

- [Repo](https://github.com/ferru97/PyPaperBot)

- Download papers given a query
- Download papers given paper's DOIs
- Generate Bibtex of the downloaded paper
- Filter downloaded paper by year, journal and citations number

#### Key Features:

- Multiple Download Methods: DOI-based downloads (most reliable)
- Google Scholar queries
- BibTeX-only generation
- Flexible Modes: Download PDFs only, BibTeX only, or both
- IFC-Specific Queries: Pre-configured searches for the institute
- Deduplication: Automatic removal of duplicate downloads
- Rate Limiting: Respectful delays between requests

In [1]:
import subprocess
import tempfile
from pathlib import Path
import shutil
import PyPaperBot

In [2]:
class PyPaperBotDownloader:
    def __init__(self, download_dir='../papers/pypaper_downloads'):
        """Initialize PyPaperBot downloader with configuration"""
        self.download_dir = Path(download_dir)
        self.download_dir.mkdir(parents=True, exist_ok=True)
        self.python_cmd = 'python'  # Initialize with default
        
        # Check if PyPaperBot is installed
        self._check_installation()
        
    def _check_installation(self):
        """Check if PyPaperBot is installed"""
        try:
            result = subprocess.run([self.python_cmd, '-m', 'PyPaperBot', '-h'], 
                                  capture_output=True, text=True, timeout=10)
            if result.returncode == 0:
                print("✅ PyPaperBot installation verified")
                return True
        except (subprocess.TimeoutExpired, FileNotFoundError):
            pass
            
        # Try with 'py' command
        try:
            result = subprocess.run(['py', '-m', 'PyPaperBot', '-h'], 
                                  capture_output=True, text=True, timeout=10)
            if result.returncode == 0:
                self.python_cmd = 'py'
                print("✅ PyPaperBot installation verified (using 'py' command)")
                return True
        except (subprocess.TimeoutExpired, FileNotFoundError):
            pass
            
        print("❌ PyPaperBot not found. Install with: pip install PyPaperBot")
        return False
    
    def download_by_dois(self, publications, restrict_mode=None, max_downloads=None, 
                        scihub_mirror=None, use_doi_filename=True):
        """Download papers using their DOIs"""
        
        # Filter publications with DOIs
        pubs_with_dois = [pub for pub in publications if pub.get('doi')]
        
        if not pubs_with_dois:
            print("❌ No publications with DOIs found")
            return {'pdf_files': [], 'bibtex_files': [], 'publications': []}
        
        if max_downloads:
            pubs_with_dois = pubs_with_dois[:max_downloads]
        
        print(f"📥 Attempting to download {len(pubs_with_dois)} papers using DOIs...")
        
        # Create temporary DOI file
        with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as doi_file:
            for pub in pubs_with_dois:
                doi_file.write(f"{pub['doi']}\n")
            doi_file_path = doi_file.name
        
        try:
            # Build PyPaperBot command
            cmd = [
                self.python_cmd, '-m', 'PyPaperBot',
                '--doi-file', doi_file_path,
                '--dwn-dir', str(self.download_dir)
            ]
            
            # Add optional parameters
            if restrict_mode is not None:
                cmd.extend(['--restrict', str(restrict_mode)])
            
            if scihub_mirror:
                cmd.extend(['--scihub-mirror', scihub_mirror])
                
            if use_doi_filename:
                cmd.append('--use-doi-as-filename')
            
            print(f"🔧 Running command: {' '.join(cmd)}")
            
            # Execute PyPaperBot
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=600)
            
            print("📋 PyPaperBot output:")
            print(result.stdout)
            
            if result.stderr:
                print("⚠️ PyPaperBot warnings/errors:")
                print(result.stderr)
            
            # Check downloaded files
            downloaded_files = list(self.download_dir.glob('*.pdf'))
            downloaded_bibtex = list(self.download_dir.glob('*.bib'))
            
            print(f"✅ Downloaded {len(downloaded_files)} PDF files")
            print(f"✅ Generated {len(downloaded_bibtex)} BibTeX files")
            
            return {
                'pdf_files': downloaded_files,
                'bibtex_files': downloaded_bibtex,
                'publications': pubs_with_dois
            }
            
        except subprocess.TimeoutExpired:
            print("⏰ Download timed out after 10 minutes")
            return {'pdf_files': [], 'bibtex_files': [], 'publications': []}
        except Exception as e:
            print(f"❌ Error during download: {e}")
            return {'pdf_files': [], 'bibtex_files': [], 'publications': []}
        finally:
            # Clean up temporary file
            Path(doi_file_path).unlink(missing_ok=True)
    
    def download_by_query(self, query, scholar_pages=3, min_year=None, 
                         max_downloads=None, restrict_mode=None, 
                         skip_words=None, scihub_mirror=None):
        """Download papers using Google Scholar query"""
        
        cmd = [
            self.python_cmd, '-m', 'PyPaperBot',
            '--query', query,
            '--scholar-pages', str(scholar_pages),
            '--dwn-dir', str(self.download_dir)
        ]
        
        # Add optional parameters
        if min_year:
            cmd.extend(['--min-year', str(min_year)])
            
        if max_downloads:
            if isinstance(max_downloads, str) and 'year' in max_downloads.lower():
                cmd.extend(['--max-dwn-year', str(max_downloads)])
            else:
                cmd.extend(['--max-dwn-cites', str(max_downloads)])
        
        if restrict_mode is not None:
            cmd.extend(['--restrict', str(restrict_mode)])
            
        if skip_words:
            cmd.extend(['--skip-words', skip_words])
            
        if scihub_mirror:
            cmd.extend(['--scihub-mirror', scihub_mirror])
        
        print(f"🔍 Searching and downloading papers for query: '{query}'")
        print(f"🔧 Running command: {' '.join(cmd)}")
        
        try:
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=900)
            
            print("📋 PyPaperBot output:")
            print(result.stdout)
            
            if result.stderr:
                print("⚠️ PyPaperBot warnings/errors:")
                print(result.stderr)
            
            # Check results
            downloaded_files = list(self.download_dir.glob('*.pdf'))
            downloaded_bibtex = list(self.download_dir.glob('*.bib'))
            
            print(f"✅ Downloaded {len(downloaded_files)} PDF files")
            print(f"✅ Generated {len(downloaded_bibtex)} BibTeX files")
            
            return {
                'pdf_files': downloaded_files,
                'bibtex_files': downloaded_bibtex,
                'query': query
            }
            
        except subprocess.TimeoutExpired:
            print("⏰ Download timed out after 15 minutes")
            return {'pdf_files': [], 'bibtex_files': [], 'query': query}
        except Exception as e:
            print(f"❌ Error during download: {e}")
            return {'pdf_files': [], 'bibtex_files': [], 'query': query}
    
    def generate_bibtex_only(self, publications, output_file=None):
        """Generate BibTeX file without downloading PDFs"""
        
        pubs_with_dois = [pub for pub in publications if pub.get('doi')]
        
        if not pubs_with_dois:
            print("❌ No publications with DOIs found for BibTeX generation")
            return None
        
        # Create temporary DOI file
        with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as doi_file:
            for pub in pubs_with_dois:
                doi_file.write(f"{pub['doi']}\n")
            doi_file_path = doi_file.name
        
        try:
            cmd = [
                self.python_cmd, '-m', 'PyPaperBot',
                '--doi-file', doi_file_path,
                '--dwn-dir', str(self.download_dir),
                '--restrict', '0'  # Download only BibTeX
            ]
            
            print(f"📚 Generating BibTeX for {len(pubs_with_dois)} publications...")
            
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
            
            print("📋 PyPaperBot output:")
            print(result.stdout)
            
            if result.stderr:
                print("⚠️ PyPaperBot warnings/errors:")
                print(result.stderr)
            
            # Find generated BibTeX file
            bibtex_files = list(self.download_dir.glob('*.bib'))
            
            if bibtex_files and output_file:
                # Move to specified location
                output_path = Path(output_file)
                output_path.parent.mkdir(parents=True, exist_ok=True)
                shutil.move(str(bibtex_files[0]), str(output_path))
                print(f"📚 BibTeX file saved to: {output_path}")
                return output_path
            elif bibtex_files:
                print(f"📚 BibTeX file generated: {bibtex_files[0]}")
                return bibtex_files[0]
            else:
                print("❌ No BibTeX file was generated")
                return None
                
        except Exception as e:
            print(f"❌ Error generating BibTeX: {e}")
            return None
        finally:
            Path(doi_file_path).unlink(missing_ok=True)

# Test the fixed PyPaperBot integration
print("🧪 Testing PyPaperBot integration...")

# Test 1: Generate BibTeX only for existing publications
print("\n📚 Test 1: Generate BibTeX only")
downloader = PyPaperBotDownloader()
bibtex_result = downloader.generate_bibtex_only(
    publications, 
    output_file='../data/processed/pypaper_ifc_publications.bib'
)

print("\n✅ PyPaperBot integration fixed and ready!")

🧪 Testing PyPaperBot integration...

📚 Test 1: Generate BibTeX only
❌ PyPaperBot not found. Install with: pip install PyPaperBot


NameError: name 'publications' is not defined

> affiliation mining system:

- Extracts text from PDFs using PyMuPDF
- Uses both regex and NLP for affiliation detection
- Supports Spanish and English processing
- Groups similar affiliations automatically
- Generates PubMed search variations from discovered affiliations

> NOTE⚠️

spaCy:

- Tokenizes the text into words, punctuation, etc.
- Part-of-speech tags each token
- Dependency parses to understand grammatical relationships
- Named Entity Recognition identifies spans as organizations, people, locations, etc.
Classification assigns labels like "ORG" (organization), "PERSON", "GPE" (geopolitical entity)

```python entity_recognition_process
doc = nlp(text)
for ent in doc.ents:
    if ent.label_ == "ORG":  # Organization entity
        print(ent.text)
```



#### spacy installation

```python
# Install Python packages
pip install -r requirements.txt

# Download spaCy language models
python -m spacy download en_core_web_sm
python -m spacy download es_core_news_sm

# Optional: Download larger, more accurate models
python -m spacy download en_core_web_md
python -m spacy download es_core_news_md
```

In [4]:
import spacy


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/santi/Projects/UBMI-IFC-Podcast/venv/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/santi/Projects/UBMI-IFC-Podcast/venv/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  Fil

#### Enhanced Affiliation Mining

Test full scpaCy capabilities 

In [12]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
import re
from collections import Counter, defaultdict
import langdetect
from langdetect import detect

class EnhancedAffiliationMiner:
    def __init__(self):
        """Initialize with advanced spaCy features"""
        self.nlp_models = {}
        self.matchers = {}
        self.load_nlp_models()
        self.setup_custom_matchers()
        
    def load_nlp_models(self):
        """Load spaCy models with error handling"""
        models_to_load = {
            'en': 'en_core_web_sm',
            'es': 'es_core_news_sm'
        }
        
        for lang, model_name in models_to_load.items():
            try:
                nlp = spacy.load(model_name)
                # Add custom pipeline components
                if not nlp.has_pipe('merge_entities'):
                    nlp.add_pipe('merge_entities')
                
                self.nlp_models[lang] = nlp
                print(f"✅ Loaded {model_name}")
                
                # Setup matcher for this language
                self.matchers[lang] = Matcher(nlp.vocab)
                
            except OSError:
                print(f"❌ {model_name} not found. Install with:")
                print(f"   python -m spacy download {model_name}")
    
    def setup_custom_matchers(self):
        """Setup custom pattern matchers for institutional names"""
        
        # Patterns for Spanish institutions
        if 'es' in self.matchers:
            spanish_patterns = [
                # Instituto de X patterns
                [{"LOWER": "instituto"}, {"LOWER": "de"}, {"IS_TITLE": True, "OP": "+"}],
                
                # Universidad patterns
                [{"LOWER": "universidad"}, {"IS_TITLE": True, "OP": "+"}],
                [{"LOWER": "universidad"}, {"LOWER": "nacional"}, {"LOWER": "autónoma"}, {"LOWER": "de"}, {"LOWER": "méxico"}],
                
                # Departamento patterns
                [{"LOWER": "departamento"}, {"LOWER": "de"}, {"IS_TITLE": True, "OP": "+"}],
                
                # IFC patterns
                [{"TEXT": {"REGEX": r"IFC-?UNAM"}}],
            ]
            
            for i, pattern in enumerate(spanish_patterns):
                self.matchers['es'].add(f"SPANISH_INSTITUTION_{i}", [pattern])
        
        # Patterns for English institutions
        if 'en' in self.matchers:
            english_patterns = [
                # University of X patterns
                [{"LOWER": "university"}, {"LOWER": "of"}, {"IS_TITLE": True, "OP": "+"}],
                
                # Institute of X patterns
                [{"LOWER": "institute"}, {"LOWER": "of"}, {"IS_TITLE": True, "OP": "+"}],
                
                # Department of X patterns
                [{"LOWER": "department"}, {"LOWER": "of"}, {"IS_TITLE": True, "OP": "+"}],
                
                # National Autonomous University of Mexico
                [{"LOWER": "national"}, {"LOWER": "autonomous"}, {"LOWER": "university"}, 
                 {"LOWER": "of"}, {"LOWER": "mexico"}],
            ]
            
            for i, pattern in enumerate(english_patterns):
                self.matchers['en'].add(f"ENGLISH_INSTITUTION_{i}", [pattern])
    
    def detect_language_advanced(self, text):
        """Advanced language detection"""
        try:
            # Use langdetect for primary detection
            detected = detect(text[:1000])  # Use first 1000 chars for speed
            
            # Validate with keyword analysis
            spanish_keywords = ['de', 'del', 'la', 'el', 'y', 'universidad', 'instituto']
            english_keywords = ['of', 'the', 'and', 'university', 'institute', 'department']
            
            text_lower = text.lower()
            spanish_count = sum(1 for kw in spanish_keywords if kw in text_lower)
            english_count = sum(1 for kw in english_keywords if kw in text_lower)
            
            # Override detection if keyword analysis is strong
            if spanish_count > english_count * 1.5:
                return 'es'
            elif english_count > spanish_count * 1.5:
                return 'en'
            else:
                return detected if detected in ['es', 'en'] else 'en'
                
        except:
            return 'en'  # Default to English
    
    def extract_affiliations_advanced_nlp(self, text):
        """Advanced NER + custom patterns for affiliation extraction"""
        language = self.detect_language_advanced(text)
        
        if language not in self.nlp_models:
            print(f"⚠️ No model available for language: {language}")
            return set()
        
        nlp = self.nlp_models[language]
        matcher = self.matchers[language]
        
        affiliations = set()
        
        # Process text in chunks to handle large documents
        max_length = 1000000
        text_chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]
        
        for chunk in text_chunks:
            try:
                doc = nlp(chunk)
                
                # Method 1: Standard NER for organizations
                for ent in doc.ents:
                    if ent.label_ == "ORG":
                        org_text = ent.text.strip()
                        if self.is_relevant_affiliation(org_text):
                            affiliations.add(org_text)
                
                # Method 2: Custom pattern matching
                matches = matcher(doc)
                for match_id, start, end in matches:
                    span = doc[start:end]
                    affiliation_text = span.text.strip()
                    if len(affiliation_text) > 5:
                        affiliations.add(affiliation_text)
                
                # Method 3: Context-based extraction
                # Look for sentences containing institutional indicators
                for sent in doc.sents:
                    sent_text = sent.text.strip()
                    if self.contains_institutional_indicators(sent_text, language):
                        # Extract the institutional part
                        extracted = self.extract_institutional_part(sent_text, language)
                        if extracted:
                            affiliations.add(extracted)
                            
            except Exception as e:
                print(f"⚠️ Error processing chunk: {e}")
                continue
        
        return affiliations
    
    def is_relevant_affiliation(self, org_text):
        """Check if organization text is relevant to our search"""
        relevant_keywords = [
            'instituto', 'institute', 'universidad', 'university',
            'departamento', 'department', 'unam', 'ifc', 'mexico',
            'fisiolog', 'physiolog', 'celular', 'cellular', 'neurobiolog'
        ]
        
        org_lower = org_text.lower()
        return (len(org_text) > 10 and 
                any(keyword in org_lower for keyword in relevant_keywords))
    
    def contains_institutional_indicators(self, text, language):
        """Check if text contains institutional indicators"""
        if language == 'es':
            indicators = [
                'instituto de', 'universidad', 'departamento de', 
                'centro de', 'facultad de', 'unam'
            ]
        else:
            indicators = [
                'institute of', 'university of', 'department of',
                'center of', 'faculty of', 'unam'
            ]
        
        text_lower = text.lower()
        return any(indicator in text_lower for indicator in indicators)
    
    def extract_institutional_part(self, sentence, language):
        """Extract the institutional part from a sentence"""
        # Use regex patterns to extract institutional names
        if language == 'es':
            patterns = [
                r'Instituto\s+de\s+[A-Za-zÁáÉéÍíÓóÚúÑñ\s,]+?(?:,|\.|\s+UNAM)',
                r'Universidad\s+[A-Za-zÁáÉéÍíÓóÚúÑñ\s,]+?(?:,|\.)',
                r'Departamento\s+de\s+[A-Za-zÁáÉéÍíÓóÚúÑñ\s,]+?(?:,|\.)'
            ]
        else:
            patterns = [
                r'Institute\s+of\s+[A-Za-z\s,]+?(?:,|\.|\s+UNAM)',
                r'University\s+of\s+[A-Za-z\s,]+?(?:,|\.)',
                r'Department\s+of\s+[A-Za-z\s,]+?(?:,|\.)'
            ]
        
        for pattern in patterns:
            match = re.search(pattern, sentence, re.IGNORECASE)
            if match:
                return match.group().strip()
        
        return None
    
    def analyze_affiliations_with_clustering(self, affiliations_list):
        """Advanced analysis with similarity clustering"""
        from difflib import SequenceMatcher
        
        def similarity(a, b):
            return SequenceMatcher(None, a.lower(), b.lower()).ratio()
        
        # Group similar affiliations
        clusters = []
        processed = set()
        
        for affiliation in affiliations_list:
            if affiliation in processed:
                continue
                
            # Find similar affiliations
            cluster = [affiliation]
            processed.add(affiliation)
            
            for other in affiliations_list:
                if other not in processed and similarity(affiliation, other) > 0.7:
                    cluster.append(other)
                    processed.add(other)
            
            if len(cluster) >= 1:
                clusters.append(cluster)
        
        return clusters

# Usage example and demo
def demo_enhanced_mining():
    """Demonstrate enhanced affiliation mining"""
    miner = EnhancedAffiliationMiner()
    
    sample_text = """
    Instituto de Fisiología Celular, Universidad Nacional Autónoma de México, 
    Ciudad Universitaria, México, D.F. 04510, México
    
    Department of Cellular Physiology, National Autonomous University of Mexico,
    Mexico City, Mexico
    
    Departamento de Neurobiología, Instituto de Fisiología Celular, UNAM
    Centro de Investigación y de Estudios Avanzados del IPN
    
    IFC-UNAM, Circuito Exterior s/n, Ciudad Universitaria
    """
    
    print("🧪 Testing enhanced affiliation extraction...")
    
    # Advanced NLP extraction
    affiliations = miner.extract_affiliations_advanced_nlp(sample_text)
    
    print(f"\n🧠 Enhanced NLP extraction found {len(affiliations)} affiliations:")
    for affiliation in sorted(affiliations):
        print(f"   • {affiliation}")
    
    # Clustering analysis
    clusters = miner.analyze_affiliations_with_clustering(list(affiliations))
    print(f"\n🔗 Found {len(clusters)} similarity clusters:")
    for i, cluster in enumerate(clusters):
        print(f"   Cluster {i+1}: {len(cluster)} variations")
        for variation in cluster:
            print(f"      - {variation}")
    
    return affiliations

# Run enhanced demo
if __name__ == "__main__":
    print("🚀 Starting Enhanced Affiliation Mining Demo...")
    demo_results = demo_enhanced_mining()

🚀 Starting Enhanced Affiliation Mining Demo...
✅ Loaded en_core_web_sm
✅ Loaded es_core_news_sm
🧪 Testing enhanced affiliation extraction...

🧠 Enhanced NLP extraction found 6 affiliations:
   • Fisiología Celular
   • IFC-UNAM
   • Instituto de Fisiología Celular
   • Instituto de Fisiología Celular,
   • National Autonomous University of Mexico
   • UNAM
    Centro de Investigación

🔗 Found 4 similarity clusters:
   Cluster 1: 3 variations
      - Instituto de Fisiología Celular
      - Instituto de Fisiología Celular,
      - Fisiología Celular
   Cluster 2: 1 variations
      - IFC-UNAM
   Cluster 3: 1 variations
      - National Autonomous University of Mexico
   Cluster 4: 1 variations
      - UNAM
    Centro de Investigación


## 3. Affiliation Mining from Existing PDFs

This is the key step - we'll analyze existing papers to find all variations of how your institute is mentioned.

In [13]:
import spacy

def extract_affiliations_with_nlp(self, text):
    # Use a library like spaCy to identify organization entities
    nlp = spacy.load("es_core_news_md")  # Spanish model
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ == "ORG"]


## 4. PubMed Search Strategy

In [15]:
class PubmedSearcher:
    def __init__(self):
        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        
    def build_search_queries(self, affiliation_variations=None):
        """Build comprehensive search queries for different affiliation variations"""
        
        if affiliation_variations is None:
            # Default variations based on your institute
            affiliation_variations = [
                "Instituto de Fisiologia Celular[Affiliation]",
                "Institute of Cellular Physiology[Affiliation]",
                "IFC UNAM[Affiliation]",
                "Departamento de Neurobiologia UNAM[Affiliation]",
                "Universidad Nacional Autonoma Mexico Fisiologia[Affiliation]",
                "National Autonomous University Mexico Cellular Physiology[Affiliation]"
            ]
        
        queries = []
        
        # Individual affiliation searches
        for aff in affiliation_variations:
            queries.append(aff)
            
        # Combined searches with time ranges
        recent_query = f"({' OR '.join(affiliation_variations[:3])}) AND (2020:2024[pdat])"
        historical_query = f"({' OR '.join(affiliation_variations[:3])}) AND (2010:2019[pdat])"
        
        queries.extend([recent_query, historical_query])
        
        return queries
    
    def search_pubmed(self, query, max_results=100):
        """Search PubMed with a given query"""
        
        # Step 1: Search
        search_url = f"{self.base_url}esearch.fcgi"
        search_params = {
            'db': 'pubmed',
            'term': query,
            'retmax': max_results,
            'retmode': 'json'
        }
        
        try:
            response = requests.get(search_url, params=search_params)
            search_data = response.json()
            
            pmids = search_data['esearchresult']['idlist']
            total_count = int(search_data['esearchresult']['count'])
            
            print(f"Found {total_count} results for query: {query[:50]}...")
            
            if not pmids:
                return []
            
            # Step 2: Fetch details
            time.sleep(0.5)  # Rate limiting
            
            fetch_url = f"{self.base_url}efetch.fcgi"
            fetch_params = {
                'db': 'pubmed',
                'id': ','.join(pmids),
                'retmode': 'xml'
            }
            
            fetch_response = requests.get(fetch_url, params=fetch_params)
            
            # Parse XML (simplified - you might want to use xml.etree.ElementTree)
            articles = self.parse_pubmed_xml(fetch_response.text)
            
            return articles
            
        except Exception as e:
            print(f"Error searching PubMed: {e}")
            return []
    
    def parse_pubmed_xml(self, xml_content):
        """Simple XML parsing for PubMed results (you might want to improve this)"""
        import xml.etree.ElementTree as ET
        
        articles = []
        
        try:
            root = ET.fromstring(xml_content)
            
            for article in root.findall('.//PubmedArticle'):
                try:
                    # Extract basic info
                    pmid = article.find('.//PMID').text
                    
                    title_elem = article.find('.//ArticleTitle')
                    title = title_elem.text if title_elem is not None else "No title"
                    
                    # Authors
                    authors = []
                    for author in article.findall('.//Author'):
                        lastname = author.find('.//LastName')
                        firstname = author.find('.//ForeName')
                        if lastname is not None:
                            author_name = lastname.text
                            if firstname is not None:
                                author_name += f", {firstname.text}"
                            authors.append(author_name)
                    
                    # Journal and year
                    journal_elem = article.find('.//Journal/Title')
                    journal = journal_elem.text if journal_elem is not None else "Unknown"
                    
                    year_elem = article.find('.//PubDate/Year')
                    year = int(year_elem.text) if year_elem is not None else None
                    
                    # Abstract
                    abstract_elem = article.find('.//Abstract/AbstractText')
                    abstract = abstract_elem.text if abstract_elem is not None else ""
                    
                    # DOI
                    doi_elem = article.find('.//ELocationID[@EIdType="doi"]')
                    doi = doi_elem.text if doi_elem is not None else None
                    
                    article_data = {
                        'pmid': pmid,
                        'title': title,
                        'authors': '; '.join(authors),
                        'journal': journal,
                        'year': year,
                        'abstract': abstract,
                        'doi': doi
                    }
                    
                    articles.append(article_data)
                    
                except Exception as e:
                    print(f"Error parsing article: {e}")
                    continue
                    
        except Exception as e:
            print(f"Error parsing XML: {e}")
            
        return articles
    
    def comprehensive_search(self, max_per_query=50):
        """Run comprehensive search with all query variations"""
        queries = self.build_search_queries()
        all_articles = []
        seen_pmids = set()
        
        for i, query in enumerate(queries):
            print(f"\n🔍 Running search {i+1}/{len(queries)}")
            articles = self.search_pubmed(query, max_per_query)
            
            # Deduplicate
            new_articles = []
            for article in articles:
                if article['pmid'] not in seen_pmids:
                    seen_pmids.add(article['pmid'])
                    new_articles.append(article)
            
            all_articles.extend(new_articles)
            print(f"Added {len(new_articles)} new articles (total: {len(all_articles)})")
            
            time.sleep(1)  # Be respectful to NCBI
            
        return all_articles

# Run comprehensive PubMed search
print("🔍 Starting comprehensive PubMed search...")
searcher = PubmedSearcher()

# Test with a single query first
test_articles = searcher.search_pubmed("Instituto de Fisiologia Celular[Affiliation]", max_results=5)
print(f"\n📊 Test search found {len(test_articles)} articles")

if test_articles:
    print("\nSample result:")
    sample = test_articles[0]
    print(f"Title: {sample['title'][:100]}...")
    print(f"Authors: {sample['authors'][:100]}...")
    print(f"PMID: {sample['pmid']}")

# Uncomment to run full search
new_articles = searcher.comprehensive_search(max_per_query=20)
print(f"\n🎉 Found {len(new_articles)} total unique articles from PubMed")

🔍 Starting comprehensive PubMed search...
Error searching PubMed: name 'requests' is not defined

📊 Test search found 0 articles

🔍 Running search 1/8
Error searching PubMed: name 'requests' is not defined
Added 0 new articles (total: 0)


NameError: name 'time' is not defined

## 5. Database Integration & Expansion

In [None]:
def merge_publication_databases(existing_pubs, new_pubs, output_file='../data/processed/expanded_ifc_publications.json'):
    """Merge existing publications with newly found ones, removing duplicates"""
    
    # Create lookup sets for deduplication
    existing_dois = {pub.get('doi') for pub in existing_pubs if pub.get('doi')}
    existing_pmids = {pub.get('pubmed_id') for pub in existing_pubs if pub.get('pubmed_id')}
    existing_titles = {pub.get('title', '').lower().strip() for pub in existing_pubs}
    
    merged_pubs = existing_pubs.copy()
    new_count = 0
    
    for pub in new_pubs:
        is_duplicate = False
        
        # Check for duplicates
        if pub.get('doi') and pub['doi'] in existing_dois:
            is_duplicate = True
        elif pub.get('pmid') and pub['pmid'] in existing_pmids:
            is_duplicate = True
        elif pub.get('title', '').lower().strip() in existing_titles:
            is_duplicate = True
            
        if not is_duplicate:
            # Convert PubMed format to your format
            converted_pub = {
                'title': pub.get('title', ''),
                'authors': pub.get('authors', ''),
                'journal': pub.get('journal', ''),
                'year': pub.get('year'),
                'doi': pub.get('doi'),
                'pubmed_id': pub.get('pmid'),
                'ifc_url': None,  # Not available from PubMed
                'abstract': pub.get('abstract', ''),
                'keywords': None
            }
            
            merged_pubs.append(converted_pub)
            new_count += 1
            
            # Update tracking sets
            if pub.get('doi'):
                existing_dois.add(pub['doi'])
            if pub.get('pmid'):
                existing_pmids.add(pub['pmid'])
            existing_titles.add(pub.get('title', '').lower().strip())
    
    # Save expanded database
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(merged_pubs, f, indent=2, ensure_ascii=False)
    
    print(f"\n📊 Database expansion complete:")
    print(f"   Original publications: {len(existing_pubs)}")
    print(f"   New publications added: {new_count}")
    print(f"   Total publications: {len(merged_pubs)}")
    print(f"   Saved to: {output_file}")
    
    return merged_pubs

# Demo with existing data
print("📈 Database expansion simulation")
print(f"Current database size: {len(publications)}")

# Create some demo "new" publications
demo_new_pubs = [
    {
        'pmid': '99999999',
        'title': 'Demo paper: Synaptic plasticity in hippocampal circuits',
        'authors': 'Demo Author, A.; Demo Author, B.',
        'journal': 'Demo Journal of Neuroscience',
        'year': 2023,
        'abstract': 'This is a demo abstract about hippocampal synaptic plasticity...',
        'doi': '10.1234/demo.2023.001'
    }
]

expanded_db = merge_publication_databases(publications, demo_new_pubs)

# Generate updated BibTeX file
updated_bibtex = create_bibtex_from_publications(expanded_db, '../data/processed/expanded_ifc_publications.bib')
print(f"\n📚 Updated BibTeX file: {updated_bibtex}")

## 6. Automated Pipeline

In [None]:
def run_complete_pipeline(initial_json_path, output_dir='../data/processed'):
    """Complete automated pipeline to expand publication database"""
    
    print("🚀 Starting complete publication database expansion pipeline\n")
    
    # Step 1: Load existing data
    print("📂 Step 1: Loading existing publications")
    with open(initial_json_path, 'r', encoding='utf-8') as f:
        existing_pubs = json.load(f)
    print(f"   Loaded {len(existing_pubs)} existing publications")
    
    # Step 2: Create BibTeX for manual download
    print("\n📚 Step 2: Creating BibTeX file for Zotero")
    bibtex_path = os.path.join(output_dir, 'ifc_publications_for_zotero.bib')
    create_bibtex_from_publications(existing_pubs, bibtex_path)
    
    # Step 3: Search PubMed for additional papers
    print("\n🔍 Step 3: Searching PubMed for additional publications")
    searcher = PubmedSearcher()
    new_articles = searcher.comprehensive_search(max_per_query=30)
    print(f"   Found {len(new_articles)} potential new articles")
    
    # Step 4: Merge databases
    print("\n🔄 Step 4: Merging and deduplicating databases")
    expanded_json_path = os.path.join(output_dir, 'expanded_ifc_publications.json')
    final_db = merge_publication_databases(existing_pubs, new_articles, expanded_json_path)
    
    # Step 5: Create final BibTeX
    print("\n📚 Step 5: Creating final BibTeX file")
    final_bibtex_path = os.path.join(output_dir, 'final_ifc_publications.bib')
    create_bibtex_from_publications(final_db, final_bibtex_path)
    
    # Step 6: Generate summary report
    print("\n📊 Step 6: Generating summary report")
    report = {
        'pipeline_date': time.strftime('%Y-%m-%d %H:%M:%S'),
        'original_count': len(existing_pubs),
        'pubmed_found': len(new_articles),
        'final_count': len(final_db),
        'new_additions': len(final_db) - len(existing_pubs),
        'files_created': {
            'expanded_json': expanded_json_path,
            'bibtex_original': bibtex_path,
            'bibtex_final': final_bibtex_path
        },
        'year_distribution': {},
        'top_journals': {}
    }
    
    # Analyze year distribution
    years = [pub.get('year') for pub in final_db if pub.get('year')]
    year_counts = Counter(years)
    report['year_distribution'] = dict(year_counts.most_common(10))
    
    # Analyze top journals
    journals = [pub.get('journal') for pub in final_db if pub.get('journal')]
    journal_counts = Counter(journals)
    report['top_journals'] = dict(journal_counts.most_common(10))
    
    # Save report
    report_path = os.path.join(output_dir, 'pipeline_report.json')
    with open(report_path, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)
    
    print(f"\n✅ Pipeline complete! Summary:")
    print(f"   📊 Original: {report['original_count']} publications")
    print(f"   🆕 Added: {report['new_additions']} new publications")
    print(f"   📈 Final: {report['final_count']} total publications")
    print(f"   📄 Report saved: {report_path}")
    
    return final_db, report

# Run the complete pipeline (uncomment to execute)
# final_database, pipeline_report = run_complete_pipeline('../data/raw/test_ifc_publications.json')

print("\n🎯 Pipeline ready! Uncomment the line above to run the complete workflow.")
print("\nNext steps:")
print("1. Run this pipeline to expand your database")
print("2. Import the BibTeX files into Zotero to download PDFs")
print("3. Use the expanded JSON database for your ChromaDB embeddings")
print("4. Run affiliation mining on downloaded PDFs to find more variations")
print("5. Iterate to continuously expand your database")