In [1]:
# CHUNK 1: Setup and Basic Configuration

import nest_asyncio
nest_asyncio.apply()  # This allows running async code in Jupyter

import asyncio
import logging
import re
import json
from pathlib import Path
from typing import Any, Dict, List, Optional
from pprint import pprint

import aiohttp
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Create a simple placeholder for GrowthLabPublication
class SimpleGrowthLabPublication(dict):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        
    def generate_id(self):
        # Simple hash of title
        title = self.get("title", "")
        return f"pub_{hash(title) % 10000}"
        
    def generate_content_hash(self):
        # Simple hash of content
        content = json.dumps(self, sort_keys=True)
        return f"hash_{hash(content) % 10000}"
        
    @property
    def paper_id(self):
        return self.get("paper_id", "")
        
    @paper_id.setter
    def paper_id(self, value):
        self["paper_id"] = value
        
    @property
    def content_hash(self):
        return self.get("content_hash", "")
        
    @content_hash.setter
    def content_hash(self, value):
        self["content_hash"] = value
        
    @property
    def title(self):
        return self.get("title", "")
        
    @title.setter
    def title(self, value):
        self["title"] = value
        
    @property
    def authors(self):
        return self.get("authors", "")
        
    @authors.setter
    def authors(self, value):
        self["authors"] = value
        
    @property
    def abstract(self):
        return self.get("abstract", "")
        
    @abstract.setter
    def abstract(self, value):
        self["abstract"] = value
        
    @property
    def pub_url(self):
        return self.get("pub_url", "")
        
    @property
    def file_urls(self):
        return self.get("file_urls", [])

# Simplified retry function
async def retry_with_backoff(func, *args, max_retries=3, base_delay=1.0, max_delay=30.0, retry_on=Exception, **kwargs):
    """Retry a function with exponential backoff"""
    retries = 0
    while True:
        try:
            return await func(*args, **kwargs)
        except retry_on as e:
            if retries >= max_retries:
                raise
            delay = min(base_delay * (2 ** retries), max_delay)
            print(f"Retrying after error: {e}. Attempt {retries+1}/{max_retries}. Waiting {delay:.2f}s...")
            await asyncio.sleep(delay)
            retries += 1

# Create a simple scraper with just configuration
class SimpleGrowthLabScraper:
    """Simplified version of the Growth Lab scraper for testing"""
    
    def __init__(self):
        # Load default configuration
        self.config = {
            "base_url": "https://growthlab.hks.harvard.edu/publications",
            "scrape_delay": 1.0,  # Reduced for testing
            "concurrency_limit": 1,  # Reduced for testing
            "max_retries": 3,
            "retry_base_delay": 1.0,
            "retry_max_delay": 10.0,
        }
        
        self.base_url = self.config["base_url"]
        self.scrape_delay = self.config["scrape_delay"]
        self.concurrency_limit = self.config["concurrency_limit"]
        self.semaphore = asyncio.Semaphore(self.concurrency_limit)
        
        # Some sample year corrections (from original scraper)
        self.year_corrections = {
            "https://growthlab.hks.harvard.edu/publications/economic-complexity-brief": 2013,
            "https://growthlab.hks.harvard.edu/publications/colombia-atlas-economic-complexity-datlas": 2014,
        }
        
# Create our scraper instance
scraper = SimpleGrowthLabScraper()
print(f"Created scraper with base URL: {scraper.base_url}")
print(f"Scraper configuration: {scraper.config}")

Created scraper with base URL: https://growthlab.hks.harvard.edu/publications
Scraper configuration: {'base_url': 'https://growthlab.hks.harvard.edu/publications', 'scrape_delay': 1.0, 'concurrency_limit': 1, 'max_retries': 3, 'retry_base_delay': 1.0, 'retry_max_delay': 10.0}


In [2]:

async def get_max_page_num(url):
    """Get the maximum page number from pagination"""
    print(f"Fetching pagination information from: {url}")
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    }
    
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url) as response:
            if response.status != 200:
                print(f"Error fetching page: {response.status}")
                return 0
                
            html = await response.text()
            soup = BeautifulSoup(html, "html.parser")
            pagination = soup.find("ul", {"class": "pager"})
            
            if pagination:
                print("Found pagination element on the page")
                last_page_link = pagination.find("li", {"class": "pager-last"})
                if last_page_link and last_page_link.find("a"):
                    last_page_url = last_page_link.find("a").get("href")
                    print(f"Found last page URL: {last_page_url}")
                    
                    match = re.search(r"\d+", last_page_url)
                    if match:
                        max_page = int(match.group())
                        print(f"Extracted max page number: {max_page}")
                        return max_page
            
            print("Could not find pagination or determine max page")
            return 0

# Run the function
max_page = await get_max_page_num(scraper.base_url)
print(f"\nMaximum page number: {max_page}")

Fetching pagination information from: https://growthlab.hks.harvard.edu/publications
Found pagination element on the page
Found last page URL: https://growthlab.hks.harvard.edu/publications?page=21
Extracted max page number: 21

Maximum page number: 21


In [3]:
# CHUNK 3: Fetch and Parse a Publication

async def fetch_and_parse_publication():
    """Fetch the first page and parse a single publication"""
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    }
    
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(scraper.base_url) as response:
            if response.status != 200:
                print(f"Error fetching page: {response.status}")
                return None
            
            html = await response.text()
            soup = BeautifulSoup(html, "html.parser")
            
            # Find the first publication element
            pub_element = soup.find("div", {"class": "biblio-entry"})
            if not pub_element:
                print("No publication found")
                return None
            
            print("Found a publication element, parsing...")
            
            # Extract data using the same CSS selectors as in parse_publication
            title_element = pub_element.find("span", {"class": "biblio-title"})
            title = title_element.text.strip() if title_element else None
            
            title_link = title_element.find("a") if title_element else None
            pub_url = title_link.get("href") if title_link else None
            
            # Ensure URL is absolute
            if pub_url and not pub_url.startswith(("http://", "https://")):
                pub_url = f"{scraper.base_url.split('/publications')[0]}{pub_url}"
            
            authors_element = pub_element.find("span", {"class": "biblio-authors"})
            authors = authors_element.text.strip() if authors_element else None
            
            # Extract year
            year = None
            if authors_element:
                sibling_text = authors_element.next_sibling
                if sibling_text:
                    year_match = re.search(r"\b\d{4}\b", sibling_text)
                    if year_match:
                        year = int(year_match.group())
            
            # Apply year correction if available
            if pub_url in scraper.year_corrections:
                year = scraper.year_corrections[pub_url]
            
            abstract_element = pub_element.find("div", {"class": "biblio-abstract-display"})
            abstract = abstract_element.text.strip() if abstract_element else None
            
            # Get file URLs
            file_urls = []
            for file_elem in pub_element.find_all("span", {"class": "file"}):
                file_link = file_elem.find("a")
                if file_link and file_link.get("href"):
                    file_url = file_link["href"]
                    # Ensure URL is absolute
                    if not file_url.startswith(("http://", "https://")):
                        file_url = f"{scraper.base_url.split('/publications')[0]}{file_url}"
                    file_urls.append(file_url)
            
            # Create a publication object
            pub = SimpleGrowthLabPublication(
                title=title,
                authors=authors,
                year=year,
                abstract=abstract,
                pub_url=pub_url,
                file_urls=file_urls,
                source="GrowthLab",
            )
            
            # Generate stable ID and content hash
            pub.paper_id = pub.generate_id()
            pub.content_hash = pub.generate_content_hash()
            
            return pub

# Run the function
publication = await fetch_and_parse_publication()

if publication:
    print("\nSuccessfully parsed a publication:")
    print(f"Title: {publication['title']}")
    print(f"Authors: {publication['authors']}")
    print(f"Year: {publication['year']}")
    print(f"Abstract: {publication['abstract'][:100]}..." if publication['abstract'] else "No abstract")
    print(f"URL: {publication['pub_url']}")
    print(f"File URLs: {publication['file_urls']}")
    print(f"ID: {publication['paper_id']}")
    print(f"Content Hash: {publication['content_hash']}")

Found a publication element, parsing...

Successfully parsed a publication:
Title: Global Networks, Monetary Policy and Trade
Authors: Kalemli-Özcan, Ṣ., Soylu, C. & Yildirim, M.A.
Year: 2025
Abstract: We develop a novel framework to study the interaction between monetary policy and trade. Our New Key...
URL: https://growthlab.hks.harvard.edu/publications/global-networks-monetary-policy-and-trade
File URLs: []
ID: pub_6494
Content Hash: hash_8040


In [15]:
# CHUNK 4: Fetch Publications from a Random Page

import random

RANDOM_SEED = 42
random.seed(RANDOM_SEED)
print(f"Using random seed: {RANDOM_SEED}")

async def fetch_page(page_num=None):
    """Fetch a single page of publications
    
    Args:
        page_num: Page number to fetch, or None to choose a random page
    """
    # If max_page was found in Chunk 2, use it as the upper bound for random selection
    # Otherwise, just pick a number between 1 and 10 (reasonable guess)
    upper_bound = max_page if 'max_page' in globals() and max_page > 0 else 10
    
    if page_num is None:
        # Choose a random page between 0 and upper_bound
        page_num = random.randint(0, upper_bound)
    
    url = scraper.base_url if page_num == 0 else f"{scraper.base_url}?page={page_num}"
    print(f"Fetching publications from page {page_num}: {url}")
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    }
    
    publications = []
    
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url) as response:
            if response.status != 200:
                print(f"Error fetching page {page_num}: {response.status}")
                return []
            
            html = await response.text()
            soup = BeautifulSoup(html, "html.parser")
            pub_elements = soup.find_all("div", {"class": "biblio-entry"})
            
            print(f"Found {len(pub_elements)} publications on page {page_num}")
            
            # Pick a random subset of publications (3 maximum, or all if fewer than 3)
            sample_size = min(3, len(pub_elements))
            if sample_size > 0:
                selected_indices = random.sample(range(len(pub_elements)), sample_size)
                selected_pubs = [pub_elements[i] for i in selected_indices]
            else:
                selected_pubs = []
            
            print(f"Selected {len(selected_pubs)} random publications for analysis")
            
            # Process the selected publications
            for i, pub_element in enumerate(selected_pubs):
                # Extract data (same as before)
                title_element = pub_element.find("span", {"class": "biblio-title"})
                title = title_element.text.strip() if title_element else None
                
                title_link = title_element.find("a") if title_element else None
                pub_url = title_link.get("href") if title_link else None
                
                # Ensure URL is absolute
                if pub_url and not pub_url.startswith(("http://", "https://")):
                    pub_url = f"{scraper.base_url.split('/publications')[0]}{pub_url}"
                
                authors_element = pub_element.find("span", {"class": "biblio-authors"})
                authors = authors_element.text.strip() if authors_element else None
                
                # Extract year
                year = None
                if authors_element:
                    sibling_text = authors_element.next_sibling
                    if sibling_text:
                        year_match = re.search(r"\b\d{4}\b", sibling_text)
                        if year_match:
                            year = int(year_match.group())
                
                abstract_element = pub_element.find("div", {"class": "biblio-abstract-display"})
                abstract = abstract_element.text.strip() if abstract_element else None
                
                # Get file URLs
                file_urls = []
                for file_elem in pub_element.find_all("span", {"class": "file"}):
                    file_link = file_elem.find("a")
                    if file_link and file_link.get("href"):
                        file_url = file_link["href"]
                        # Ensure URL is absolute
                        if not file_url.startswith(("http://", "https://")):
                            file_url = f"{scraper.base_url.split('/publications')[0]}{file_url}"
                        file_urls.append(file_url)
                
                pub = SimpleGrowthLabPublication(
                    title=title,
                    authors=authors,
                    year=year,
                    abstract=abstract,
                    pub_url=pub_url,
                    file_urls=file_urls,
                    source="GrowthLab",
                )
                
                # Generate stable ID and content hash
                pub.paper_id = pub.generate_id()
                pub.content_hash = pub.generate_content_hash()
                
                publications.append(pub)
                
            # In the real scraper, this would be controlled by semaphore
            await asyncio.sleep(scraper.scrape_delay)
            
            return publications

# Run the function to fetch from a random page
random_page_publications = await fetch_page()

print("\nPublications fetched from random page:")
for i, pub in enumerate(random_page_publications):
    print(f"\nPublication {i+1}:")
    print(f"Title: {pub['title']}")
    print(f"Authors: {pub['authors']}")
    print(f"Abstract: {pub['abstract'][:50]}..." if pub['abstract'] else "No abstract")

# Also fetch a specific page (page 6) for comparison
specific_page_publications = await fetch_page(6)

print("\nPublications fetched from page 6:")
for i, pub in enumerate(specific_page_publications):
    print(f"\nPublication {i+1}:")
    print(f"Title: {pub['title']}")
    print(f"Authors: {pub['authors']}")
    print(f"Abstract: {pub['abstract'][:50]}..." if pub['abstract'] else "No abstract")

Using random seed: 42
Fetching publications from page 20: https://growthlab.hks.harvard.edu/publications?page=20
Found 20 publications on page 20
Selected 3 random publications for analysis

Publications fetched from random page:

Publication 1:
Title: Growth Diagnostic: Paraguay
Authors: Hausmann, R. & Klinger, B.
Abstract: Paraguay’s growth history is characterized by prol...

Publication 2:
Title: Uncertainty in the Search for New Exports
Authors: Klinger, B.
Abstract: This paper explores the role that uncertainty play...

Publication 3:
Title: Growth Collapses
Authors: Hausmann, R., Rodríguez, F. & Wagner, R.
Abstract: We study episodes where economic growth decelerate...
Fetching publications from page 6: https://growthlab.hks.harvard.edu/publications?page=6
Found 20 publications on page 6
Selected 3 random publications for analysis

Publications fetched from page 6:

Publication 1:
Title: Buscando virtudes en la lejanía: Recomendaciones de política para promover el crecimiento in

In [14]:
random_page_publications 

[{'title': 'Air Transportation and Regional Economic Development: A Case Study for the New Airport in South Albania',
  'authors': 'Gadgin Matha, S., Goldstein, P. & Lu, J.',
  'year': 2020,
  'abstract': 'Considering the case of the proposed airport in Vlora, South Albania, this report analyzes the channels through which a new greenfield airport can contribute to regional economic development. In December 2019, the Government of Albania opened a call for offers to build a new airport in the south of the country. While there is evidence indicating that the airport could be commercially viable, this does not provide a grounded perspective on the channels by which the airport could boost the regional economy. To evaluate how the new airport would interact with existing and potential economic activities, this report evaluates three of the most important channels of impact by which the airport could serve as a promoter: (1) economic activities directly related to or promoted by airports, (

In [16]:
# %%
# CHUNK 10: Implement Configurable CSS Selectors with Fallbacks

# Define a configuration class for selectors that includes primary and fallback options
class SelectorConfig:
    def __init__(self, primary=None, fallbacks=None, xpath=None, description=None):
        self.primary = primary
        self.fallbacks = fallbacks or []
        self.xpath = xpath  # XPath alternative
        self.description = description or "Element"
    
    def __str__(self):
        return f"{self.description} ({self.primary})"
    
    def __repr__(self):
        return f"SelectorConfig(primary='{self.primary}', fallbacks={self.fallbacks}, xpath='{self.xpath}')"

# Define the selector configuration with primary selectors, fallbacks, and XPath alternatives
SELECTOR_CONFIG = {
    "publication": {
        "container": SelectorConfig(
            primary="div.biblio-entry", 
            fallbacks=["div.publication-item", "article.publication"],
            xpath="//div[contains(@class, 'biblio') and contains(@class, 'entry')]",
            description="Publication container"
        ),
        "title": SelectorConfig(
            primary="span.biblio-title", 
            fallbacks=["h2.publication-title", "h3.title", "div.title"],
            xpath="//span[contains(@class, 'title')]",
            description="Publication title"
        ),
        "authors": SelectorConfig(
            primary="span.biblio-authors", 
            fallbacks=["div.authors", "p.author-list", "div.publication-authors"],
            xpath="//span[contains(@class, 'authors')]",
            description="Publication authors"
        ),
        "abstract": SelectorConfig(
            primary="div.biblio-abstract-display", 
            fallbacks=["div.abstract", "div.publication-abstract", "p.abstract"],
            xpath="//div[contains(@class, 'abstract')]",
            description="Publication abstract"
        ),
        "file": SelectorConfig(
            primary="span.file", 
            fallbacks=["div.download-links a", "div.publication-files a"],
            xpath="//span[contains(@class, 'file')]",
            description="Publication files"
        ),
        "year": SelectorConfig(
            primary=None,  # Extracted from text, not direct selector
            xpath=None,
            description="Publication year"
        )
    },
    "pagination": {
        "container": SelectorConfig(
            primary="ul.pager",
            fallbacks=["div.pagination", "nav.pagination"],
            xpath="//ul[contains(@class, 'pager')]",
            description="Pagination container"
        ),
        "last_page": SelectorConfig(
            primary="li.pager-last",
            fallbacks=["li.page-item:last-child", "a.page-link:last-child"],
            xpath="//li[contains(@class, 'pager-last')]",
            description="Last page link"
        )
    },
    "endnote": {
        "link": SelectorConfig(
            primary="li.biblio_tagged a",
            fallbacks=["a.endnote-link", "a[href*='endnote']", "a[href*='tagged']"],
            xpath="//li[contains(@class, 'biblio_tagged')]//a",
            description="Endnote link"
        )
    }
}

# Create a selector finder that tries multiple methods to find elements
class SelectorFinder:
    def __init__(self, soup=None):
        self.soup = soup
    
    def set_soup(self, soup):
        self.soup = soup
    
    def find(self, selector_config, soup=None, base_element=None):
        """Find an element using the configured selectors with fallbacks"""
        if soup is None and base_element is None:
            soup = self.soup
        
        search_element = base_element or soup
        if not search_element:
            logging.warning(f"No soup or base element provided to find {selector_config}")
            return None
        
        # Try primary selector
        if selector_config.primary:
            element = self._try_css_selector(search_element, selector_config.primary)
            if element:
                return element
        
        # Try fallback selectors
        for fallback in selector_config.fallbacks:
            element = self._try_css_selector(search_element, fallback)
            if element:
                logging.info(f"Using fallback selector {fallback} for {selector_config.description}")
                return element
        
        # Try XPath as last resort
        if selector_config.xpath and hasattr(search_element, 'select'):  # For lxml / html5lib parser
            try:
                elements = search_element.select(selector_config.xpath)
                if elements:
                    logging.info(f"Using XPath selector for {selector_config.description}")
                    return elements[0]
            except Exception as e:
                logging.warning(f"XPath selector failed: {e}")
        
        logging.warning(f"All selectors failed for {selector_config.description}")
        return None
    
    def find_all(self, selector_config, soup=None, base_element=None):
        """Find all elements using the configured selectors with fallbacks"""
        if soup is None and base_element is None:
            soup = self.soup
        
        search_element = base_element or soup
        if not search_element:
            return []
        
        # Try primary selector
        if selector_config.primary:
            elements = self._try_css_selector_all(search_element, selector_config.primary)
            if elements:
                return elements
        
        # Try fallback selectors
        for fallback in selector_config.fallbacks:
            elements = self._try_css_selector_all(search_element, fallback)
            if elements:
                logging.info(f"Using fallback selector {fallback} for {selector_config.description}")
                return elements
        
        # Try XPath as last resort
        if selector_config.xpath and hasattr(search_element, 'select'):
            try:
                elements = search_element.select(selector_config.xpath)
                if elements:
                    logging.info(f"Using XPath selector for {selector_config.description}")
                    return elements
            except Exception as e:
                logging.warning(f"XPath selector failed: {e}")
        
        return []
    
    def _try_css_selector(self, element, selector):
        """Try to find an element with a CSS selector, handling different formats"""
        try:
            if "." in selector:  # Class selector
                parts = selector.split(".")
                tag = parts[0] if parts[0] else None
                classes = parts[1].split()
                return element.find(tag, class_=classes)
            else:  # Tag selector
                return element.find(selector)
        except Exception as e:
            logging.debug(f"Selector {selector} failed: {e}")
            return None
    
    def _try_css_selector_all(self, element, selector):
        """Try to find all elements with a CSS selector, handling different formats"""
        try:
            if "." in selector:  # Class selector
                parts = selector.split(".")
                tag = parts[0] if parts[0] else None
                classes = parts[1].split()
                return element.find_all(tag, class_=classes)
            else:  # Tag selector
                return element.find_all(selector)
        except Exception as e:
            logging.debug(f"Selector {selector} failed: {e}")
            return []

# Initialize the selector finder
selector_finder = SelectorFinder()

# Test the configurable selector system
async def test_configurable_selectors():
    """Test the new configurable selector system"""
    print("\n📋 Testing configurable selectors system...")
    
    # Fetch a page
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    }
    
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(scraper.base_url) as response:
            if response.status != 200:
                print(f"Error fetching page: {response.status}")
                return
            
            html = await response.text()
            soup = BeautifulSoup(html, "html.parser")
            
            # Set the soup in our selector finder
            selector_finder.set_soup(soup)
            
            # Test finding containers
            print("Finding publication containers...")
            containers = selector_finder.find_all(
                SELECTOR_CONFIG["publication"]["container"]
            )
            print(f"Found {len(containers)} publication containers")
            
            if containers:
                # Test finding elements within a container
                print("\nTesting element selectors within first publication:")
                container = containers[0]
                
                # Find and print title
                title_element = selector_finder.find(
                    SELECTOR_CONFIG["publication"]["title"], 
                    base_element=container
                )
                title = title_element.text.strip() if title_element else "Title not found"
                print(f"- Title: {title}")
                
                # Find and print authors
                authors_element = selector_finder.find(
                    SELECTOR_CONFIG["publication"]["authors"], 
                    base_element=container
                )
                authors = authors_element.text.strip() if authors_element else "Authors not found"
                print(f"- Authors: {authors}")
                
                # Find and print abstract
                abstract_element = selector_finder.find(
                    SELECTOR_CONFIG["publication"]["abstract"], 
                    base_element=container
                )
                abstract = abstract_element.text.strip() if abstract_element else "Abstract not found"
                print(f"- Abstract: {abstract[:100]}..." if len(abstract) > 100 else f"- Abstract: {abstract}")
                
                # Find and print files
                file_elements = selector_finder.find_all(
                    SELECTOR_CONFIG["publication"]["file"], 
                    base_element=container
                )
                file_urls = []
                for file_elem in file_elements:
                    file_link = file_elem.find("a")
                    if file_link and file_link.get("href"):
                        file_url = file_link["href"]
                        # Ensure URL is absolute
                        if not file_url.startswith(("http://", "https://")):
                            file_url = f"{scraper.base_url.split('/publications')[0]}{file_url}"
                        file_urls.append(file_url)
                
                print(f"- File URLs: {file_urls}")
                
                # Test finding pagination
                print("\nTesting pagination selectors:")
                pagination = selector_finder.find(
                    SELECTOR_CONFIG["pagination"]["container"]
                )
                if pagination:
                    print("Found pagination container")
                    
                    # Find last page
                    last_page_element = selector_finder.find(
                        SELECTOR_CONFIG["pagination"]["last_page"], 
                        base_element=pagination
                    )
                    
                    if last_page_element and last_page_element.find("a"):
                        last_page_url = last_page_element.find("a").get("href")
                        print(f"Found last page URL: {last_page_url}")
                    else:
                        print("Last page element not found")
                else:
                    print("Pagination container not found")
    
# Run the test
await test_configurable_selectors()

# %%
# CHUNK 11: Implement Publication Parser with Configurable Selectors

class PublicationParser:
    """Class to parse publications using configurable selectors"""
    
    def __init__(self, selectors=None, base_url=None):
        self.selectors = selectors or SELECTOR_CONFIG
        self.base_url = base_url or "https://growthlab.hks.harvard.edu"
        self.finder = SelectorFinder()
    
    async def parse_page(self, html, url=None):
        """Parse a page of publications"""
        if not html:
            logging.warning("No HTML content provided to parse")
            return []
        
        soup = BeautifulSoup(html, "html.parser")
        self.finder.set_soup(soup)
        
        # Find all publication containers
        containers = self.finder.find_all(self.selectors["publication"]["container"])
        
        if not containers:
            logging.warning("No publication containers found on page")
            return []
        
        logging.info(f"Found {len(containers)} publication containers")
        
        # Parse each publication
        publications = []
        for container in containers:
            try:
                publication = await self.parse_publication(container)
                if publication:
                    publications.append(publication)
            except Exception as e:
                logging.error(f"Error parsing publication: {e}")
                continue
        
        return publications
    
    async def parse_publication(self, container):
        """Parse a single publication from a container element"""
        # Extract basic metadata using selectors
        title_element = self.finder.find(self.selectors["publication"]["title"], base_element=container)
        if not title_element:
            logging.warning("No title element found, skipping publication")
            return None
        
        # Get title
        title = title_element.text.strip()
        
        # Get URL from title link
        title_link = title_element.find("a")
        pub_url = title_link.get("href") if title_link else None
        
        # Ensure URL is absolute
        if pub_url and not pub_url.startswith(("http://", "https://")):
            pub_url = f"{self.base_url}{pub_url}"
        
        # Get authors
        authors_element = self.finder.find(self.selectors["publication"]["authors"], base_element=container)
        authors = authors_element.text.strip() if authors_element else None
        
        # Extract year - first try the year text
        year = None
        if authors_element:
            # Look for year in the text following the authors
            sibling_text = authors_element.next_sibling
            if sibling_text:
                year_match = re.search(r"\b(19|20)\d{2}\b", sibling_text)
                if year_match:
                    year = int(year_match.group())
        
        # Get abstract
        abstract_element = self.finder.find(self.selectors["publication"]["abstract"], base_element=container)
        abstract = abstract_element.text.strip() if abstract_element else None
        
        # Get file URLs
        file_elements = self.finder.find_all(self.selectors["publication"]["file"], base_element=container)
        file_urls = []
        for file_elem in file_elements:
            file_link = file_elem.find("a")
            if file_link and file_link.get("href"):
                file_url = file_link["href"]
                # Ensure URL is absolute
                if not file_url.startswith(("http://", "https://")):
                    file_url = f"{self.base_url}{file_url}"
                file_urls.append(file_url)
        
        # Create a publication object
        pub = SimpleGrowthLabPublication(
            title=title,
            authors=authors,
            year=year,
            abstract=abstract,
            pub_url=pub_url,
            file_urls=file_urls,
            source="GrowthLab",
        )
        
        # Generate stable ID and content hash
        pub.paper_id = pub.generate_id()
        pub.content_hash = pub.generate_content_hash()
        
        return pub
    
    async def get_max_page_num(self, html):
        """Get the maximum page number from pagination"""
        if not html:
            logging.warning("No HTML content provided to parse pagination")
            return 0
        
        soup = BeautifulSoup(html, "html.parser")
        self.finder.set_soup(soup)
        
        # Find pagination container
        pagination = self.finder.find(self.selectors["pagination"]["container"])
        
        if not pagination:
            logging.warning("No pagination container found")
            return 0
        
        # Find last page element
        last_page_element = self.finder.find(
            self.selectors["pagination"]["last_page"], 
            base_element=pagination
        )
        
        if not last_page_element or not last_page_element.find("a"):
            logging.warning("No last page link found")
            return 0
        
        # Get URL from last page link
        last_page_url = last_page_element.find("a").get("href")
        if not last_page_url:
            return 0
        
        # Extract page number from URL
        match = re.search(r"page=(\d+)", last_page_url)
        if match:
            return int(match.group(1))
        
        return 0

# Test the publication parser with configurable selectors
async def test_publication_parser():
    """Test the publication parser with configurable selectors"""
    print("\n🔍 Testing publication parser with configurable selectors...")
    
    # Create a parser with the current selector configuration
    parser = PublicationParser(SELECTOR_CONFIG, scraper.base_url.split('/publications')[0])
    
    # Fetch a page
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    }
    
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(scraper.base_url) as response:
            if response.status != 200:
                print(f"Error fetching page: {response.status}")
                return
            
            html = await response.text()
            
            # Get max page number
            max_page = await parser.get_max_page_num(html)
            print(f"Maximum page number: {max_page}")
            
            # Parse publications
            publications = await parser.parse_page(html)
            print(f"Parsed {len(publications)} publications")
            
            if publications:
                print("\nFirst 3 parsed publications:")
                for i, pub in enumerate(publications[:3]):
                    print(f"\n{i+1}. {pub['title']}")
                    print(f"   Authors: {pub['authors']}")
                    print(f"   Year: {pub['year']}")
                    print(f"   Abstract: {pub['abstract'][:100]}..." if pub['abstract'] else "   No abstract")
                    print(f"   URL: {pub['pub_url']}")
                    print(f"   Files: {pub['file_urls']}")
                    print(f"   ID: {pub['paper_id']}")

# Run the test
await test_publication_parser()

# %%
# CHUNK 12: Implement Selector Monitoring and Alert System

class SelectorMonitor:
    """Class to monitor selector performance and detect failures"""
    
    def __init__(self, selectors=None):
        self.selectors = selectors or SELECTOR_CONFIG
        self.stats = {
            "total_pages": 0,
            "total_publications": 0,
            "selector_success": {},
            "selector_failure": {},
            "alerts": []
        }
        
        # Initialize stats for each selector
        for section, config in self.selectors.items():
            for name, selector in config.items():
                key = f"{section}.{name}"
                self.stats["selector_success"][key] = 0
                self.stats["selector_failure"][key] = 0
    
    def record_success(self, section, name):
        """Record a successful selector use"""
        key = f"{section}.{name}"
        if key in self.stats["selector_success"]:
            self.stats["selector_success"][key] += 1
    
    def record_failure(self, section, name):
        """Record a failed selector use"""
        key = f"{section}.{name}"
        if key in self.stats["selector_failure"]:
            self.stats["selector_failure"][key] += 1
            
            # Check if failure rate is high enough to trigger alert
            total = self.stats["selector_success"][key] + self.stats["selector_failure"][key]
            if total >= 5:  # Only check after a minimum sample
                failure_rate = self.stats["selector_failure"][key] / total
                if failure_rate > 0.5:  # Alert if more than 50% failure
                    self.create_alert(section, name, failure_rate)
    
    def create_alert(self, section, name, failure_rate):
        """Create an alert for a failing selector"""
        selector = self.selectors[section][name]
        alert = {
            "timestamp": time.time(),
            "selector": f"{section}.{name}",
            "failure_rate": failure_rate,
            "primary": selector.primary,
            "fallbacks": selector.fallbacks,
            "message": f"Selector {section}.{name} is failing at a rate of {failure_rate:.2%}"
        }
        
        # Check if we already have an alert for this selector
        existing_alerts = [a for a in self.stats["alerts"] if a["selector"] == alert["selector"]]
        if not existing_alerts:
            self.stats["alerts"].append(alert)
            logging.warning(f"SELECTOR ALERT: {alert['message']}")
    
    def record_page_processed(self):
        """Record that a page was processed"""
        self.stats["total_pages"] += 1
    
    def record_publication_processed(self):
        """Record that a publication was processed"""
        self.stats["total_publications"] += 1
    
    def check_selector_health(self):
        """Check the health of all selectors"""
        print("\n🔍 Selector Health Check:")
        
        for section, config in self.selectors.items():
            print(f"\n{section.upper()} Selectors:")
            
            for name, selector in config.items():
                key = f"{section}.{name}"
                success = self.stats["selector_success"].get(key, 0)
                failure = self.stats["selector_failure"].get(key, 0)
                total = success + failure
                
                if total > 0:
                    success_rate = success / total
                    status = "✅ GOOD" if success_rate >= 0.9 else "⚠️ WARNING" if success_rate >= 0.5 else "❌ FAILING"
                    print(f"  - {key}: {status} ({success}/{total}, {success_rate:.1%})")
                else:
                    print(f"  - {key}: ⚪ NO DATA")
    
    def get_alerts(self):
        """Get current selector alerts"""
        return self.stats["alerts"]
    
    def print_alerts(self):
        """Print current selector alerts"""
        alerts = self.get_alerts()
        if alerts:
            print("\n⚠️ Current Selector Alerts:")
            for alert in alerts:
                print(f"  - {alert['selector']}: Failing at {alert['failure_rate']:.1%}")
                print(f"    Primary: {alert['primary']}")
                print(f"    Fallbacks: {alert['fallbacks']}")
        else:
            print("\n✅ No selector alerts detected")
    
    def generate_report(self):
        """Generate a full report of selector performance"""
        print("\n📊 Selector Performance Report")
        print(f"Pages processed: {self.stats['total_pages']}")
        print(f"Publications processed: {self.stats['total_publications']}")
        
        # Calculate overall selector success rate
        total_success = sum(self.stats["selector_success"].values())
        total_failure = sum(self.stats["selector_failure"].values())
        total_attempts = total_success + total_failure
        
        if total_attempts > 0:
            overall_rate = total_success / total_attempts
            print(f"Overall selector success rate: {overall_rate:.2%}")
        
        # Print selector-specific stats
        print("\nSelector Performance:")
        for key in sorted(self.stats["selector_success"].keys()):
            success = self.stats["selector_success"][key]
            failure = self.stats["selector_failure"][key]
            total = success + failure
            
            if total > 0:
                rate = success / total
                print(f"  - {key}: {rate:.2%} success ({success}/{total})")
        
        # Print alerts
        self.print_alerts()

# Create a simple demo for the selector monitor
async def test_selector_monitor():
    """Demonstrate the selector monitoring system"""
    print("\n📡 Testing selector monitoring system...")
    
    # Create a monitor
    monitor = SelectorMonitor(SELECTOR_CONFIG)
    
    # Simulate some selector successes and failures
    monitor.record_success("publication", "container")
    monitor.record_success("publication", "container")
    monitor.record_success("publication", "title")
    monitor.record_success("publication", "title")
    monitor.record_failure("publication", "abstract")
    
    # Simulate multiple failures to trigger alert
    monitor.record_failure("publication", "file")
    monitor.record_failure("publication", "file")
    monitor.record_failure("publication", "file")
    monitor.record_success("publication", "file")
    
    # Record page and publication processing
    monitor.record_page_processed()
    monitor.record_page_processed()
    monitor.record_publication_processed()
    monitor.record_publication_processed()
    monitor.record_publication_processed()
    
    # Check selector health
    monitor.check_selector_health()
    
    # Generate full report
    monitor.generate_report()
    
    return monitor

# Run the test
selector_monitor = await test_selector_monitor()

# Show how to integrate the monitor with the parser
print("\n🔄 Integration Example: Using the monitor with the parser")
print("""
# Example integration in production code:

async def parse_with_monitoring(html, parser, monitor):
    # Record that we're processing a page
    monitor.record_page_processed()
    
    # Parse the page
    try:
        publications = await parser.parse_page(html)
        # Record successful parsing
        monitor.record_success("publication", "container")
        
        # For each publication, record successful elements
        for pub in publications:
            monitor.record_publication_processed()
            if pub.get("title"):
                monitor.record_success("publication", "title")
            else:
                monitor.record_failure("publication", "title")
                
            # And so on for other elements...
            
        return publications
    except Exception as e:
        # Record failure
        monitor.record_failure("publication", "container")
        logging.error(f"Error parsing page: {e}")
        return []
""")

print("\n✅ Implementation complete!")

  line 1:
//span[contains(@class, 'file')]
^



📋 Testing configurable selectors system...
Finding publication containers...
Found 20 publication containers

Testing element selectors within first publication:
- Title: Global Networks, Monetary Policy and Trade
- Authors: Kalemli-Özcan, Ṣ., Soylu, C. & Yildirim, M.A.
- Abstract: We develop a novel framework to study the interaction between monetary policy and trade. Our New Key...
- File URLs: []

Testing pagination selectors:
Found pagination container
Found last page URL: https://growthlab.hks.harvard.edu/publications?page=21

🔍 Testing publication parser with configurable selectors...


INFO:root:Found 20 publication containers
  line 1:
//span[contains(@class, 'file')]
^
  line 1:
//span[contains(@class, 'file')]
^
  line 1:
//span[contains(@class, 'file')]
^


Maximum page number: 21
Parsed 20 publications

First 3 parsed publications:

1. Global Networks, Monetary Policy and Trade
   Authors: Kalemli-Özcan, Ṣ., Soylu, C. & Yildirim, M.A.
   Year: 2025
   Abstract: We develop a novel framework to study the interaction between monetary policy and trade. Our New Key...
   URL: https://growthlab.hks.harvard.edu/publications/global-networks-monetary-policy-and-trade
   Files: []
   ID: pub_6494

2. Industrial policy for competitiveness in the energy transition
   Authors: Ahuja, K. & Hausmann, R.
   Year: 2025
   Abstract: Green objectives have reshaped public policy worldwide since the signing in 2015 of the Paris Agreem...
   URL: https://growthlab.hks.harvard.edu/publications/industrial-policy-competitiveness-energy-transition
   Files: ['https://growthlab.hks.harvard.edu/sites/projects.iq.harvard.edu/files/bruegel_blueprint_34_0.pdf']
   ID: pub_4859

3. Public-Private Dialogs to Spur Export-led Growth: The Case of Productivity Taskforces in

In [17]:
# %%
# STEP 2: Create a simple selector configuration

# Define selectors in a configuration dictionary
SELECTORS = {
    "publication": {
        "container": "div.biblio-entry",
        "title": "span.biblio-title",
        "authors": "span.biblio-authors",
        "abstract": "div.biblio-abstract-display",
        "file": "span.file",
    },
    "pagination": {
        "container": "ul.pager",
        "last_page": "li.pager-last",
    },
    "endnote": {
        "link": "li.biblio_tagged a",
    }
}

# Helper function to parse CSS selectors into BeautifulSoup arguments
def parse_selector(selector):
    """Parse a CSS selector string into tag and attributes for BeautifulSoup"""
    if "." in selector:
        parts = selector.split(".")
        tag = parts[0] if parts[0] else None
        class_name = parts[1]
        return tag, {"class": class_name}
    else:
        return selector, {}

# Test our selector parser with a few examples
test_selectors = [
    "div.biblio-entry",
    "span.biblio-title",
    "li"
]

print("Testing selector parser:")
for selector in test_selectors:
    tag, attrs = parse_selector(selector)
    print(f"'{selector}' -> tag: '{tag}', attrs: {attrs}")

Testing selector parser:
'div.biblio-entry' -> tag: 'div', attrs: {'class': 'biblio-entry'}
'span.biblio-title' -> tag: 'span', attrs: {'class': 'biblio-title'}
'li' -> tag: 'li', attrs: {}


In [18]:
# %%
# STEP 3: Create a function that uses the selectors

async def find_with_selector(soup, selector_key, base_element=None):
    """Find an element using the configured selector"""
    # Get the correct selector string
    parts = selector_key.split(".")
    if len(parts) != 2:
        print(f"Invalid selector key format: {selector_key}")
        return None
        
    section, name = parts
    if section not in SELECTORS or name not in SELECTORS[section]:
        print(f"Selector not found: {selector_key}")
        return None
        
    selector = SELECTORS[section][name]
    print(f"Using selector: {selector} for {selector_key}")
    
    # Parse the selector
    tag, attrs = parse_selector(selector)
    
    # Use the base element or soup
    element = base_element if base_element else soup
    
    # Find the element
    return element.find(tag, attrs)

# Test the function with a simple example
async def test_find_with_selector():
    """Test finding elements with our selector function"""
    # Fetch a page
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    }
    
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(scraper.base_url) as response:
            if response.status != 200:
                print(f"Error fetching page: {response.status}")
                return
                
            html = await response.text()
            soup = BeautifulSoup(html, "html.parser")
            
            # Find a publication container
            container = await find_with_selector(soup, "publication.container")
            if not container:
                print("No publication container found")
                return
                
            print("Found publication container")
            
            # Find title within container
            title_element = await find_with_selector(soup, "publication.title", container)
            if title_element:
                print(f"Title: {title_element.text.strip()}")
            else:
                print("Title not found")
            
            # Find authors within container
            authors_element = await find_with_selector(soup, "publication.authors", container)
            if authors_element:
                print(f"Authors: {authors_element.text.strip()}")
            else:
                print("Authors not found")

# Run the test
await test_find_with_selector()

Using selector: div.biblio-entry for publication.container
Found publication container
Using selector: span.biblio-title for publication.title
Title: Global Networks, Monetary Policy and Trade
Using selector: span.biblio-authors for publication.authors
Authors: Kalemli-Özcan, Ṣ., Soylu, C. & Yildirim, M.A.


In [19]:
# %%
# STEP 4: Add fallback selectors for robustness

# Update our selector configuration to include fallbacks
SELECTORS_WITH_FALLBACKS = {
    "publication": {
        "container": {
            "primary": "div.biblio-entry",
            "fallbacks": ["div.publication-item", "article.publication"]
        },
        "title": {
            "primary": "span.biblio-title",
            "fallbacks": ["h2.publication-title", "h3.title", "div.title"]
        },
        "authors": {
            "primary": "span.biblio-authors",
            "fallbacks": ["div.authors", "p.author-list"]
        },
        "abstract": {
            "primary": "div.biblio-abstract-display",
            "fallbacks": ["div.abstract", "p.abstract"]
        },
        "file": {
            "primary": "span.file",
            "fallbacks": ["a.file-download", "div.file-links a"]
        },
    },
    "pagination": {
        "container": {
            "primary": "ul.pager",
            "fallbacks": ["div.pagination", "nav.pagination"]
        },
        "last_page": {
            "primary": "li.pager-last",
            "fallbacks": ["li.page-item:last-child", "a.page-link:last-child"]
        }
    },
    "endnote": {
        "link": {
            "primary": "li.biblio_tagged a",
            "fallbacks": ["a.endnote-link", "a[href*='endnote']"]
        }
    }
}

# Update our function to try fallbacks
async def find_with_fallbacks(soup, selector_key, base_element=None):
    """Find an element using the configured selector with fallbacks"""
    # Get the correct selector configuration
    parts = selector_key.split(".")
    if len(parts) != 2:
        print(f"Invalid selector key format: {selector_key}")
        return None
        
    section, name = parts
    if section not in SELECTORS_WITH_FALLBACKS or name not in SELECTORS_WITH_FALLBACKS[section]:
        print(f"Selector not found: {selector_key}")
        return None
        
    selector_config = SELECTORS_WITH_FALLBACKS[section][name]
    
    # Use the base element or soup
    element = base_element if base_element else soup
    
    # Try the primary selector first
    primary = selector_config["primary"]
    tag, attrs = parse_selector(primary)
    result = element.find(tag, attrs)
    
    if result:
        print(f"Found with primary selector: {primary}")
        return result
    
    # If primary fails, try fallbacks
    for fallback in selector_config["fallbacks"]:
        print(f"Trying fallback selector: {fallback}")
        tag, attrs = parse_selector(fallback)
        result = element.find(tag, attrs)
        if result:
            print(f"Found with fallback selector: {fallback}")
            return result
    
    print(f"All selectors failed for {selector_key}")
    return None

# Test the function with fallbacks
async def test_find_with_fallbacks():
    """Test finding elements with our fallback selector function"""
    # Fetch a page
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    }
    
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(scraper.base_url) as response:
            if response.status != 200:
                print(f"Error fetching page: {response.status}")
                return
                
            html = await response.text()
            soup = BeautifulSoup(html, "html.parser")
            
            # Find a publication container
            container = await find_with_fallbacks(soup, "publication.container")
            if not container:
                print("No publication container found")
                return
                
            print("Found publication container")
            
            # Find title within container
            title_element = await find_with_fallbacks(soup, "publication.title", container)
            if title_element:
                print(f"Title: {title_element.text.strip()}")
            else:
                print("Title not found")
            
            # Find authors within container
            authors_element = await find_with_fallbacks(soup, "publication.authors", container)
            if authors_element:
                print(f"Authors: {authors_element.text.strip()}")
            else:
                print("Authors not found")
            
            # Find abstract within container (with real and fake selectors to test fallbacks)
            abstract_element = await find_with_fallbacks(soup, "publication.abstract", container)
            if abstract_element:
                abstract = abstract_element.text.strip()
                print(f"Abstract: {abstract[:100]}..." if len(abstract) > 100 else abstract)
            else:
                print("Abstract not found")

# Run the fallback test
await test_find_with_fallbacks()

Found with primary selector: div.biblio-entry
Found publication container
Found with primary selector: span.biblio-title
Title: Global Networks, Monetary Policy and Trade
Found with primary selector: span.biblio-authors
Authors: Kalemli-Özcan, Ṣ., Soylu, C. & Yildirim, M.A.
Found with primary selector: div.biblio-abstract-display
Abstract: We develop a novel framework to study the interaction between monetary policy and trade. Our New Key...


In [20]:
# %%
# STEP 4: Enhanced Fallback Testing with Website Inspection Tools

# First, let's add a function to extract XPath from an element
def get_xpath(element):
    """Get the XPath of an element"""
    components = []
    child = element if element.name else element.parent
    
    for parent in child.parents:
        siblings = parent.find_all(child.name, recursive=False)
        if len(siblings) > 1:
            # If there are multiple siblings, use position
            components.append(f"{child.name}[{siblings.index(child)+1}]")
        else:
            components.append(child.name)
        child = parent
        
        # Stop at the <html> level
        if parent.name == "html":
            break
            
    components.reverse()
    return "//" + "/".join(components)

# Add a function to inspect an element's attributes and structure
def inspect_element(element, print_xpath=True):
    """Print detailed information about an element"""
    if not element:
        print("No element provided to inspect")
        return
        
    print("\n📋 Element Inspection:")
    print(f"Tag: <{element.name}>")
    
    # Print attributes
    if element.attrs:
        print("Attributes:")
        for key, value in element.attrs.items():
            print(f"  {key}: {value}")
    else:
        print("No attributes")
    
    # Print XPath
    if print_xpath:
        try:
            xpath = get_xpath(element)
            print(f"XPath: {xpath}")
        except Exception as e:
            print(f"Error getting XPath: {e}")
    
    # Print text content (truncated)
    if element.text:
        text = element.text.strip()
        if len(text) > 100:
            text = text[:97] + "..."
        print(f"Text content: \"{text}\"")
    else:
        print("No text content")
    
    # Print immediate children (summary)
    children = element.find_all(recursive=False)
    if children:
        print(f"Direct children: {len(children)}")
        child_tags = {}
        for child in children:
            if child.name:
                child_tags[child.name] = child_tags.get(child.name, 0) + 1
        print("  " + ", ".join(f"{tag} ({count})" for tag, count in child_tags.items()))
    else:
        print("No children")

# Updated fallback function that can deliberately try fallbacks
async def test_all_selectors(soup, selector_key, base_element=None, force_fallbacks=False):
    """Test all selectors (primary and fallbacks) for a given selector key"""
    # Get the selector configuration
    parts = selector_key.split(".")
    if len(parts) != 2:
        print(f"Invalid selector key format: {selector_key}")
        return None
        
    section, name = parts
    if section not in SELECTORS_WITH_FALLBACKS or name not in SELECTORS_WITH_FALLBACKS[section]:
        print(f"Selector not found: {selector_key}")
        return None
        
    selector_config = SELECTORS_WITH_FALLBACKS[section][name]
    
    # Use the base element or soup
    element = base_element if base_element else soup
    
    # Test the primary selector
    primary = selector_config["primary"]
    print(f"\n🔍 Testing selector {selector_key}")
    print(f"Primary selector: {primary}")
    
    tag, attrs = parse_selector(primary)
    primary_result = element.find(tag, attrs)
    
    if primary_result and not force_fallbacks:
        print("✅ Primary selector succeeded")
        inspect_element(primary_result)
        return primary_result
    else:
        if not primary_result:
            print("❌ Primary selector failed")
        else:
            print("⚠️ Primary selector succeeded but testing fallbacks anyway")
    
    # Test each fallback
    for i, fallback in enumerate(selector_config["fallbacks"]):
        print(f"\nTrying fallback {i+1}: {fallback}")
        tag, attrs = parse_selector(fallback)
        fallback_result = element.find(tag, attrs)
        
        if fallback_result:
            print(f"✅ Fallback {i+1} succeeded")
            inspect_element(fallback_result)
            return fallback_result
        else:
            print(f"❌ Fallback {i+1} failed")
    
    print("\n❌ All selectors failed")
    return None

# Add a function to get a sample of publications for testing
async def get_publication_samples(num_pages=2, pubs_per_page=3, random_delay=True):
    """Get a sample of publications from multiple pages for testing"""
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    }
    
    all_containers = []
    
    # Get publications from multiple pages
    for page in range(num_pages):
        page_url = scraper.base_url if page == 0 else f"{scraper.base_url}?page={page}"
        print(f"Fetching page {page}: {page_url}")
        
        async with aiohttp.ClientSession(headers=headers) as session:
            async with session.get(page_url) as response:
                if response.status != 200:
                    print(f"Error fetching page {page}: {response.status}")
                    continue
                
                html = await response.text()
                soup = BeautifulSoup(html, "html.parser")
                
                # Find publication containers
                tag, attrs = parse_selector(SELECTORS_WITH_FALLBACKS["publication"]["container"]["primary"])
                containers = soup.find_all(tag, attrs)
                
                if containers:
                    print(f"Found {len(containers)} publications on page {page}")
                    
                    # Select a sample from this page
                    import random
                    sample_size = min(pubs_per_page, len(containers))
                    if sample_size > 0:
                        # Use random sampling to get diverse examples
                        sampled_containers = random.sample(containers, sample_size)
                        all_containers.extend([(soup, container, page) for container in sampled_containers])
                else:
                    print(f"No publications found on page {page}")
        
        # Add a random delay between page requests
        if random_delay and page < num_pages - 1:
            delay = random.uniform(1, 3)
            print(f"Waiting {delay:.1f} seconds before next request...")
            await asyncio.sleep(delay)
    
    return all_containers

# Test function that thoroughly tests all selectors on multiple publications
async def test_selectors_on_real_publications():
    """Test all selectors on a diverse sample of real publications"""
    print("\n🧪 Testing selectors on real publications across multiple pages")
    
    # Get a sample of publications from different pages
    samples = await get_publication_samples(num_pages=3, pubs_per_page=2)
    
    if not samples:
        print("No publication samples found to test")
        return
    
    print(f"\nCollected {len(samples)} publication samples for testing")
    
    # Test all selectors on each publication
    results = {
        "publication.title": {"success": 0, "failure": 0},
        "publication.authors": {"success": 0, "failure": 0},
        "publication.abstract": {"success": 0, "failure": 0},
        "publication.file": {"success": 0, "failure": 0}
    }
    
    for i, (soup, container, page) in enumerate(samples):
        print(f"\n📄 Testing publication sample {i+1} from page {page}")
        
        # Test each selector
        for selector_key in results.keys():
            result = await test_all_selectors(soup, selector_key, container)
            
            if result:
                results[selector_key]["success"] += 1
            else:
                results[selector_key]["failure"] += 1
            
            # Add a small delay between tests
            await asyncio.sleep(0.5)
    
    # Print summary results
    print("\n📊 Selector Test Summary:")
    for selector_key, counts in results.items():
        total = counts["success"] + counts["failure"]
        success_rate = counts["success"] / total if total > 0 else 0
        status = "✅ GOOD" if success_rate == 1.0 else "⚠️ WARNING" if success_rate >= 0.5 else "❌ FAILING"
        
        print(f"{selector_key}: {status} - {success_rate*100:.1f}% success ({counts['success']}/{total})")

# Run the real-world testing
await test_selectors_on_real_publications()

# Now, let's add a tool to help you inspect the HTML directly
# %%
# HTML Inspector Tool

async def inspect_website_element(url=None, css_selector=None):
    """Fetch a website and inspect an element using CSS selector"""
    if not url:
        url = scraper.base_url
    
    print(f"Fetching: {url}")
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    }
    
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url) as response:
            if response.status != 200:
                print(f"Error fetching URL: {response.status}")
                return
            
            html = await response.text()
            soup = BeautifulSoup(html, "html.parser")
            
            if not css_selector:
                # If no selector provided, give page structure overview
                print("\n📄 Page Structure Overview:")
                
                # Count tags by type
                tag_counts = {}
                for tag in soup.find_all(True):  # Find all tags
                    tag_name = tag.name
                    tag_counts[tag_name] = tag_counts.get(tag_name, 0) + 1
                
                print("Tags found:")
                for tag, count in sorted(tag_counts.items(), key=lambda x: x[1], reverse=True)[:15]:
                    print(f"  {tag}: {count}")
                
                # Count classes
                class_counts = {}
                for tag in soup.find_all(True):
                    if "class" in tag.attrs:
                        for cls in tag.attrs["class"]:
                            class_counts[cls] = class_counts.get(cls, 0) + 1
                
                print("\nTop 15 classes:")
                for cls, count in sorted(class_counts.items(), key=lambda x: x[1], reverse=True)[:15]:
                    print(f"  {cls}: {count}")
                
                return
            
            # Find the element using the provided CSS selector
            try:
                if "." in css_selector:
                    parts = css_selector.split(".")
                    tag = parts[0] if parts[0] else None
                    cls = parts[1]
                    elements = soup.find_all(tag, class_=cls)
                else:
                    elements = soup.find_all(css_selector)
                
                if elements:
                    print(f"\nFound {len(elements)} elements matching '{css_selector}'")
                    
                    # Inspect the first element in detail
                    first_element = elements[0]
                    inspect_element(first_element)
                    
                    # If there are multiple elements, show a summary of others
                    if len(elements) > 1:
                        print(f"\nSummary of other {len(elements)-1} elements:")
                        for i, element in enumerate(elements[1:5]):  # Show up to 4 more
                            print(f"\nElement {i+2}:")
                            # Simplified inspection
                            print(f"Tag: <{element.name}>")
                            text = element.text.strip()
                            if text:
                                if len(text) > 50:
                                    text = text[:47] + "..."
                                print(f"Text: \"{text}\"")
                    
                    return first_element
                else:
                    print(f"No elements found matching '{css_selector}'")
                    return None
            except Exception as e:
                print(f"Error inspecting element with selector '{css_selector}': {e}")
                return None

# Let's provide some examples of how to use the inspector
print("""
# 🔍 Website Element Inspector Tool

Use this tool to inspect elements on the website directly. Examples:

1. Get page overview:
   await inspect_website_element()

2. Inspect publication containers:
   await inspect_website_element(css_selector="div.biblio-entry")

3. Inspect titles:
   await inspect_website_element(css_selector="span.biblio-title")

4. Inspect a specific page:
   await inspect_website_element(url="https://growthlab.hks.harvard.edu/publications?page=5")

5. Try the selector you're having trouble with:
   await inspect_website_element(css_selector="your-selector-here")
""")

# Let's try it with the publication container
element = await inspect_website_element(css_selector="div.biblio-entry")


🧪 Testing selectors on real publications across multiple pages
Fetching page 0: https://growthlab.hks.harvard.edu/publications
Found 20 publications on page 0
Waiting 1.2 seconds before next request...
Fetching page 1: https://growthlab.hks.harvard.edu/publications?page=1
Found 20 publications on page 1
Waiting 1.1 seconds before next request...
Fetching page 2: https://growthlab.hks.harvard.edu/publications?page=2
Found 20 publications on page 2

Collected 6 publication samples for testing

📄 Testing publication sample 1 from page 0

🔍 Testing selector publication.title
Primary selector: span.biblio-title
✅ Primary selector succeeded

📋 Element Inspection:
Tag: <span>
Attributes:
  class: ['biblio-title']
XPath: //body/div[2]/div[3]/div/div[2]/div/div[1]/div/section/div/div[2]/div[5]/span[2]
Text content: "De Facto Openness to Immigration"
Direct children: 1
  a (1)

🔍 Testing selector publication.authors
Primary selector: span.biblio-authors
✅ Primary selector succeeded

📋 Element I

In [21]:
# Improved selector configuration based on testing results

# Let's first investigate the publication that had missing files
async def investigate_missing_files():
    """Investigate the publication that had missing files"""
    print("\n🔍 Investigating publication with missing files")
    
    # Let's fetch all publications and check their file elements
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    }
    
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(scraper.base_url) as response:
            if response.status != 200:
                print(f"Error fetching page: {response.status}")
                return
                
            html = await response.text()
            soup = BeautifulSoup(html, "html.parser")
            
            # Get all publication containers
            containers = soup.find_all("div", class_="biblio-entry")
            print(f"Found {len(containers)} publications")
            
            # Check each publication for file elements
            publications_with_files = 0
            publications_without_files = []
            
            for i, container in enumerate(containers):
                # Get title for reference
                title_elem = container.find("span", class_="biblio-title")
                title = title_elem.text.strip() if title_elem else f"Publication {i+1}"
                
                # Look for file elements
                file_elems = container.find_all("span", class_="file")
                
                if file_elems:
                    publications_with_files += 1
                else:
                    publications_without_files.append((i+1, title))
                    # Let's look for any other link elements that might be files
                    links = container.find_all("a")
                    link_texts = [link.text.strip() for link in links if link.text.strip()]
                    print(f"\nPublication {i+1} without file elements: {title}")
                    print(f"Found {len(links)} links: {link_texts[:3]}")
            
            print(f"\nSummary: {publications_with_files}/{len(containers)} publications have file elements")
            if publications_without_files:
                print(f"{len(publications_without_files)} publications without file elements:")
                for i, (num, title) in enumerate(publications_without_files[:5]):
                    print(f"{i+1}. Publication {num}: {title}")
                
                # Let's check one of these publications more thoroughly
                if publications_without_files:
                    pub_num = publications_without_files[0][0] - 1  # Zero-indexed
                    problem_pub = containers[pub_num]
                    print("\nDetailed inspection of publication without files:")
                    
                    # Print the raw HTML to see the structure
                    html_snippet = str(problem_pub)[:500] + "..." if len(str(problem_pub)) > 500 else str(problem_pub)
                    print(f"\nHTML structure:\n{html_snippet}")
                    
                    # Look for any spans that might contain file info
                    spans = problem_pub.find_all("span")
                    print(f"\nFound {len(spans)} span elements:")
                    for i, span in enumerate(spans):
                        cls = span.get("class", ["no-class"])
                        text = span.text.strip()
                        print(f"{i+1}. <span class='{' '.join(cls)}'>{text[:30]}{'...' if len(text) > 30 else ''}</span>")

# Let's run the investigation
await investigate_missing_files()

# Now, create an improved selector configuration based on our findings
IMPROVED_SELECTORS = {
    "publication": {
        "container": {
            "primary": "div.biblio-entry",
            "fallbacks": ["div.node-biblio", "article.node-biblio"],
            "xpath": "//div[contains(@class, 'biblio-entry')]"
        },
        "title": {
            "primary": "span.biblio-title",
            "fallbacks": ["h2.title", "div.title", "a.biblio-title-link"],
            "xpath": "//span[contains(@class, 'biblio-title')]"
        },
        "authors": {
            "primary": "span.biblio-authors",
            "fallbacks": ["div.biblio-authors", "p.biblio-authors"],
            "xpath": "//span[contains(@class, 'biblio-authors')]"
        },
        "abstract": {
            "primary": "div.biblio-abstract-display",
            "fallbacks": ["div.abstract", "div.field-biblio-abstract"],
            "xpath": "//div[contains(@class, 'biblio-abstract')]"
        },
        "file": {
            "primary": "span.file",
            "fallbacks": [
                "a.biblio-download", 
                "a[href$='.pdf']",
                "a[href*='files']"
            ],
            "xpath": "//span[contains(@class, 'file')]"
        }
    },
    "pagination": {
        "container": {
            "primary": "ul.pager",
            "fallbacks": ["nav.pagination", "div.pagination"],
            "xpath": "//ul[contains(@class, 'pager')]"
        },
        "last_page": {
            "primary": "li.pager-last",
            "fallbacks": ["li.page-item:last-child", "a.page-link:last-child"],
            "xpath": "//li[contains(@class, 'pager-last')]"
        }
    },
    "endnote": {
        "link": {
            "primary": "li.biblio_tagged a",
            "fallbacks": ["a[href*='tagged=1']", "a[href*='endnote']", "a.endnote-link"],
            "xpath": "//a[contains(@href, 'tagged=1')]"
        }
    }
}

# Create a more robust implementation of parse_selector that handles complex CSS
def parse_advanced_selector(selector):
    """Parse a CSS selector into BeautifulSoup find arguments, handling more complex cases"""
    # Handle attribute selectors like a[href$='.pdf']
    if "[" in selector and "]" in selector:
        tag_part = selector.split("[")[0]
        attr_part = selector[selector.find("[")+1:selector.find("]")]
        
        # Process the tag part
        if "." in tag_part:
            tag, class_name = tag_part.split(".", 1)
            attrs = {"class": class_name}
        else:
            tag = tag_part
            attrs = {}
        
        # Process the attribute part
        if "=" in attr_part:
            attr_name, attr_value = attr_part.split("=", 1)
            
            # Handle special attribute selectors
            if attr_value.startswith("$") and attr_value.endswith("'"):
                # Ends with selector - we'll handle this specially
                return tag, attrs, attr_name, attr_value[2:-1], "ends_with"
            elif attr_value.startswith("*") and attr_value.endswith("'"):
                # Contains selector
                return tag, attrs, attr_name, attr_value[2:-1], "contains"
            else:
                # Regular equals
                attrs[attr_name] = attr_value.strip("'\"")
        
        return tag or None, attrs
    
    # Handle simple class selectors
    elif "." in selector:
        parts = selector.split(".")
        tag = parts[0] if parts[0] else None
        class_name = parts[1]
        return tag, {"class": class_name}
    
    # Handle just tag selectors
    else:
        return selector, {}

# Create an improved finder function that handles the special attribute selectors
async def find_with_improved_selectors(soup, selector_config, base_element=None):
    """Find elements using improved selector parsing"""
    element = base_element if base_element else soup
    
    # Try primary selector
    primary = selector_config["primary"]
    if primary:
        # Parse and apply the selector
        try:
            parsed = parse_advanced_selector(primary)
            
            # Handle special attribute selectors
            if len(parsed) > 2:
                tag, attrs, attr_name, attr_value, match_type = parsed
                
                # Find all matching elements and filter
                candidates = element.find_all(tag, attrs) if tag else element.find_all(True, attrs)
                
                if match_type == "ends_with":
                    result = next((el for el in candidates if attr_name in el.attrs and 
                                  el[attr_name].endswith(attr_value)), None)
                elif match_type == "contains":
                    result = next((el for el in candidates if attr_name in el.attrs and 
                                  attr_value in el[attr_name]), None)
                else:
                    result = None
            else:
                tag, attrs = parsed
                result = element.find(tag, attrs)
                
            if result:
                print(f"✅ Primary selector succeeded: {primary}")
                return result
            else:
                print(f"❌ Primary selector failed: {primary}")
        except Exception as e:
            print(f"Error with primary selector {primary}: {e}")
    
    # Try fallbacks
    for i, fallback in enumerate(selector_config["fallbacks"]):
        print(f"Trying fallback {i+1}: {fallback}")
        
        try:
            parsed = parse_advanced_selector(fallback)
            
            # Handle special attribute selectors
            if len(parsed) > 2:
                tag, attrs, attr_name, attr_value, match_type = parsed
                
                # Find all matching elements and filter
                candidates = element.find_all(tag, attrs) if tag else element.find_all(True, attrs)
                
                if match_type == "ends_with":
                    result = next((el for el in candidates if attr_name in el.attrs and 
                                  el[attr_name].endswith(attr_value)), None)
                elif match_type == "contains":
                    result = next((el for el in candidates if attr_name in el.attrs and 
                                  attr_value in el[attr_name]), None)
                else:
                    result = None
            else:
                tag, attrs = parsed
                result = element.find(tag, attrs)
            
            if result:
                print(f"✅ Fallback {i+1} succeeded: {fallback}")
                return result
        except Exception as e:
            print(f"Error with fallback selector {fallback}: {e}")
    
    # Try XPath as last resort
    if selector_config.get("xpath"):
        try:
            import lxml.html
            from lxml import etree
            
            # Convert BeautifulSoup object to lxml for XPath support
            dom = lxml.html.fromstring(str(element))
            xpath_result = dom.xpath(selector_config["xpath"])
            
            if xpath_result:
                print(f"✅ XPath selector succeeded: {selector_config['xpath']}")
                
                # Convert back to soup to maintain consistency
                result_html = etree.tostring(xpath_result[0])
                result_soup = BeautifulSoup(result_html, "html.parser")
                return result_soup.contents[0] if result_soup.contents else None
        except ImportError:
            print("lxml not available for XPath queries")
        except Exception as e:
            print(f"Error with XPath selector: {e}")
    
    print("❌ All selectors failed")
    return None

# Test our improved selectors
async def test_improved_selectors():
    """Test our improved selectors on a publication that was missing files"""
    print("\n🧪 Testing improved selectors")
    
    # Fetch a page
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    }
    
    async with aiohttp.ClientSession(headers=headers) as session:
        # Try the page where we had a file issue
        async with session.get(f"{scraper.base_url}?page=1") as response:
            if response.status != 200:
                print(f"Error fetching page: {response.status}")
                return
                
            html = await response.text()
            soup = BeautifulSoup(html, "html.parser")
            
            # Get all containers
            containers = soup.find_all("div", class_="biblio-entry")
            
            if containers:
                print(f"Found {len(containers)} containers")
                
                # Look for a container likely to be missing files
                test_container = None
                for container in containers:
                    if not container.find("span", class_="file"):
                        test_container = container
                        title_elem = container.find("span", class_="biblio-title")
                        title = title_elem.text.strip() if title_elem else "Unknown"
                        print(f"Testing with publication missing files: {title}")
                        break
                
                if not test_container:
                    print("No publication without files found, using first publication")
                    test_container = containers[0]
                
                # Test our selectors
                print("\nTesting title selector:")
                title_result = await find_with_improved_selectors(
                    soup, IMPROVED_SELECTORS["publication"]["title"], test_container
                )
                
                print("\nTesting authors selector:")
                authors_result = await find_with_improved_selectors(
                    soup, IMPROVED_SELECTORS["publication"]["authors"], test_container
                )
                
                print("\nTesting abstract selector:")
                abstract_result = await find_with_improved_selectors(
                    soup, IMPROVED_SELECTORS["publication"]["abstract"], test_container
                )
                
                print("\nTesting file selector with fallbacks:")
                file_result = await find_with_improved_selectors(
                    soup, IMPROVED_SELECTORS["publication"]["file"], test_container
                )
                
                # Print a summary of our results
                print("\n📊 Improved Selector Results:")
                print(f"Title: {'✅ Found' if title_result else '❌ Not found'}")
                print(f"Authors: {'✅ Found' if authors_result else '❌ Not found'}")
                print(f"Abstract: {'✅ Found' if abstract_result else '❌ Not found'}")
                print(f"File: {'✅ Found' if file_result else '❌ Not found'}")
                
                if file_result:
                    print(f"\nFile element found: {file_result.name}")
                    print(f"File text: {file_result.text.strip()}")
                    if file_result.name == "a":
                        print(f"File URL: {file_result.get('href')}")

# Run the improved selector test
await test_improved_selectors()



🔍 Investigating publication with missing files
Found 20 publications

Publication 1 without file elements: Global Networks, Monetary Policy and Trade
Found 3 links: ['Global Networks, Monetary Policy and Trade', "Publisher's Version", 'Abstract']

Publication 15 without file elements: Leaving Home: Cumulative Climate Shocks and Migration in Sub-Saharan Africa
Found 3 links: ['Leaving Home: Cumulative Climate Shocks and Migration in Sub-Saharan Africa', "Publisher's Version", 'Abstract']

Publication 18 without file elements: GLocal: A global development dataset of subnational administrative areas
Found 5 links: ['GLocal: A global development dataset of subnational administrative areas', "Publisher's Version", 'Abstract']

Summary: 17/20 publications have file elements
3 publications without file elements:
1. Publication 1: Global Networks, Monetary Policy and Trade
2. Publication 15: Leaving Home: Cumulative Climate Shocks and Migration in Sub-Saharan Africa
3. Publication 18: GLocal:

In [23]:
# %%
# Testing the newly identified selectors for file links

# Update our improved selector config with the new selectors
IMPROVED_SELECTORS["publication"]["file"]["fallbacks"] = [
    "a.biblio-download", 
    "#pub-cover-content-wrapper a",  # New selector from SelectorGadget
    ".Z3988+ a",                     # New selector (element after Z3988)
    "a[href$='.pdf']",
    "a[href*='files']"
]
IMPROVED_SELECTORS["publication"]["file"]["xpath"] = "//*[(@id = 'pub-cover-content-wrapper')]//a | //*[contains(concat(' ', @class, ' '), ' Z3988 ')]//following-sibling::a | //span[contains(@class, 'file')]"

async def test_specific_publication_file_selectors():
    """Test the newly identified file selectors on a specific publication"""
    print("\n🔍 Testing new file selectors on specific publication")
    
    # Fetch the specific publication page
    url = "https://growthlab.hks.harvard.edu/publications/global-networks-monetary-policy-and-trade"
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    }
    
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url) as response:
            if response.status != 200:
                print(f"Error fetching page: {response.status}")
                return
                
            html = await response.text()
            soup = BeautifulSoup(html, "html.parser")
            
            print(f"Testing selectors on publication page: {url}")
            
            # Test each file selector one by one
            print("\nTesting file selectors individually:")
            
            # 1. Test span.file (original)
            file_spans = soup.find_all("span", class_="file")
            print(f"1. span.file: {len(file_spans)} elements found")
            if file_spans:
                for span in file_spans[:2]:  # Show first 2
                    links = span.find_all("a")
                    for link in links:
                        print(f"   - {link.get('href', 'No href')} ({link.text.strip()})")
            
            # 2. Test #pub-cover-content-wrapper a
            wrapper = soup.find(id="pub-cover-content-wrapper")
            if wrapper:
                cover_links = wrapper.find_all("a")
                print(f"2. #pub-cover-content-wrapper a: {len(cover_links)} elements found")
                for link in cover_links[:2]:  # Show first 2
                    print(f"   - {link.get('href', 'No href')} ({link.text.strip()})")
            else:
                print("2. #pub-cover-content-wrapper a: No wrapper found")
            
            # 3. Test .Z3988+ a
            z3988_elements = soup.find_all(class_="Z3988")
            z3988_next_links = []
            for z3988 in z3988_elements:
                next_element = z3988.find_next_sibling("a")
                if next_element:
                    z3988_next_links.append(next_element)
            
            print(f"3. .Z3988+ a: {len(z3988_next_links)} elements found")
            for link in z3988_next_links[:2]:  # Show first 2
                print(f"   - {link.get('href', 'No href')} ({link.text.strip()})")
            
            # 4. Test a[href$='.pdf']
            pdf_links = soup.find_all("a", href=lambda h: h and h.endswith(".pdf"))
            print(f"4. a[href$='.pdf']: {len(pdf_links)} elements found")
            for link in pdf_links[:2]:  # Show first 2
                print(f"   - {link.get('href', 'No href')} ({link.text.strip()})")
            
            # 5. Test combined XPath
            try:
                import lxml.html
                dom = lxml.html.fromstring(html)
                xpath_elements = dom.xpath(IMPROVED_SELECTORS["publication"]["file"]["xpath"])
                print(f"5. Combined XPath: {len(xpath_elements)} elements found")
                for element in xpath_elements[:2]:  # Show first 2
                    text = element.text_content().strip()
                    href = element.get("href", "No href")
                    print(f"   - {href} ({text})")
            except ImportError:
                print("5. Combined XPath: lxml not available")
            except Exception as e:
                print(f"5. Combined XPath: Error - {e}")
            
            # Now let's test our improved finder function
            print("\nTesting with improved finder function:")
            file_result = await find_with_improved_selectors(
                soup, IMPROVED_SELECTORS["publication"]["file"]
            )
            
            if file_result:
                print(f"✅ Found file element using selectors")
                print(f"Element: <{file_result.name}>")
                if file_result.name == "a":
                    print(f"URL: {file_result.get('href')}")
                    print(f"Text: {file_result.text.strip()}")
                else:
                    links = file_result.find_all("a")
                    print(f"Contains {len(links)} links:")
                    for link in links:
                        print(f"- {link.get('href')} ({link.text.strip()})")
            else:
                print("❌ No file element found with any selector")

# Let's also check the publication listings to see if our new selectors work there
async def test_file_selectors_on_listings():
    """Test our updated file selectors on the publication listings"""
    print("\n🔍 Testing file selectors on publication listing page")
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    }
    
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(f"{scraper.base_url}?page=1") as response:
            if response.status != 200:
                print(f"Error fetching page: {response.status}")
                return
                
            html = await response.text()
            soup = BeautifulSoup(html, "html.parser")
            
            # Get all containers
            containers = soup.find_all("div", class_="biblio-entry")
            
            if not containers:
                print("No publications found on listing page")
                return
            
            print(f"Found {len(containers)} publications on listing")
            
            # Count publications with and without files using our improved selectors
            with_files = 0
            without_files = []
            
            for i, container in enumerate(containers[:10]):  # Test first 10
                title_elem = container.find("span", class_="biblio-title")
                title = title_elem.text.strip() if title_elem else f"Publication {i+1}"
                
                # Try our original file selector
                span_files = container.find_all("span", class_="file")
                
                # Try Z3988 selector
                z3988_elements = container.find_all(class_="Z3988")
                z3988_next_links = []
                for z3988 in z3988_elements:
                    next_element = z3988.find_next_sibling("a")
                    if next_element:
                        z3988_next_links.append(next_element)
                
                # Try PDF links
                pdf_links = container.find_all("a", href=lambda h: h and h.endswith(".pdf"))
                
                # Count total files found with either method
                total_files = len(span_files) + len(z3988_next_links) + len(pdf_links)
                
                if total_files > 0:
                    with_files += 1
                    print(f"{i+1}. {title}: ✅ Found {total_files} files")
                    print(f"   - span.file: {len(span_files)}")
                    print(f"   - .Z3988+ a: {len(z3988_next_links)}")
                    print(f"   - a[href$='.pdf']: {len(pdf_links)}")
                else:
                    without_files.append((i+1, title))
                    print(f"{i+1}. {title}: ❌ No files found with any selector")
            
            print(f"\nSummary: {with_files}/{len(containers[:10])} publications have files using our selectors")
            
            # For publications still without files, check if they have any links at all
            if without_files:
                print("\nInspecting publications without files:")
                for num, title in without_files:
                    container = containers[num-1]
                    links = container.find_all("a")
                    print(f"Publication {num}: {title}")
                    print(f"Total links: {len(links)}")
                    
                    # Show the first few links
                    for i, link in enumerate(links[:3]):
                        href = link.get("href", "No href")
                        text = link.text.strip()
                        print(f"- Link {i+1}: {href} ({text})")

# Run the tests
await test_specific_publication_file_selectors()
await test_file_selectors_on_listings()

# Create the final selector configuration with all our findings
FINAL_SELECTORS = {
    "publication": {
        "container": {
            "primary": "div.biblio-entry",
            "fallbacks": ["div.node-biblio", "article.node-biblio"],
            "xpath": "//div[contains(@class, 'biblio-entry')]"
        },
        "title": {
            "primary": "span.biblio-title",
            "fallbacks": ["h2.title", "a.biblio-title-link"],
            "xpath": "//span[contains(@class, 'biblio-title')]"
        },
        "authors": {
            "primary": "span.biblio-authors", 
            "fallbacks": ["div.biblio-authors", "span.field-biblio-authors"],
            "xpath": "//span[contains(@class, 'biblio-authors')]"
        },
        "abstract": {
            "primary": "div.biblio-abstract-display",
            "fallbacks": ["div.abstract", "div.field-biblio-abstract"],
            "xpath": "//div[contains(@class, 'biblio-abstract')]"
        },
        "file": {
            "primary": "span.file",
            "fallbacks": [
                ".Z3988+ a",                     # Element after Z3988
                "#pub-cover-content-wrapper a",  # Cover content links
                "a.biblio-download", 
                "a[href$='.pdf']",
                "a[href*='files']"
            ],
            "xpath": "//*[contains(@class, 'file')] | //*[contains(@class, 'Z3988')]/following-sibling::a | //*[@id='pub-cover-content-wrapper']//a | //a[contains(@href, '.pdf')]"
        }
    },
    "pagination": {
        "container": {
            "primary": "ul.pager",
            "fallbacks": ["nav.pagination", "div.pagination"],
            "xpath": "//ul[contains(@class, 'pager')]"
        },
        "last_page": {
            "primary": "li.pager-last",
            "fallbacks": ["li:last-child", "a:last-child"],
            "xpath": "//li[contains(@class, 'pager-last')]"
        }
    },
    "endnote": {
        "link": {
            "primary": "li.biblio_tagged a",
            "fallbacks": ["a[href*='tagged=1']", "a[href*='endnote']"],
            "xpath": "//a[contains(@href, 'tagged=1')]"
        }
    }
}



🔍 Testing new file selectors on specific publication
Testing selectors on publication page: https://growthlab.hks.harvard.edu/publications/global-networks-monetary-policy-and-trade

Testing file selectors individually:
1. span.file: 0 elements found
2. #pub-cover-content-wrapper a: 1 elements found
   - https://www.nber.org/system/files/working_papers/w33686/w33686.pdf (Publisher's Version)
3. .Z3988+ a: 0 elements found
4. a[href$='.pdf']: 1 elements found
   - https://www.nber.org/system/files/working_papers/w33686/w33686.pdf (Publisher's Version)
5. Combined XPath: 1 elements found
   - https://www.nber.org/system/files/working_papers/w33686/w33686.pdf (Publisher's Version)

Testing with improved finder function:
❌ Primary selector failed: span.file
Trying fallback 1: a.biblio-download
Trying fallback 2: #pub-cover-content-wrapper a
Trying fallback 3: .Z3988+ a
Trying fallback 4: a[href$='.pdf']
Trying fallback 5: a[href*='files']
✅ XPath selector succeeded: //*[(@id = 'pub-cover-c

In [25]:
# %%
# Find proper XPath alternatives for all critical elements

async def find_xpath_alternatives():
    """Find proper XPath alternatives for title, authors, and abstract"""
    print("\n🔍 Finding XPath alternatives for critical elements")
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    }
    
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(scraper.base_url) as response:
            if response.status != 200:
                print(f"Error fetching page: {response.status}")
                return
                
            html = await response.text()
            soup = BeautifulSoup(html, "html.parser")
            
            # Find the first publication container
            container = soup.find("div", class_="biblio-entry")
            if not container:
                print("No publication container found")
                return
            
            print("Found publication container, examining critical elements:")
            
            # Examine title element
            title_element = container.find("span", class_="biblio-title")
            if title_element:
                print("\n1. TITLE ELEMENT:")
                print(f"Tag: <{title_element.name}>")
                print(f"Classes: {title_element.get('class')}")
                try:
                    # Trying to get accurate XPath
                    from selenium.webdriver.common.by import By
                    # This is just for demonstration - we're not actually using Selenium here
                    xpath_options = [
                        "//span[@class='biblio-title']",
                        "//span[contains(@class, 'biblio-title')]",
                        "//span[contains(@class, 'title')]"
                    ]
                    print("Potential XPath expressions:")
                    for xpath in xpath_options:
                        print(f"  {xpath}")
                except ImportError:
                    print("Selenium not available for XPath suggestions")
                
                # Check for other similar elements
                all_titles = soup.find_all("span", class_="biblio-title")
                print(f"Found {len(all_titles)} title elements on page")
                print("First few title texts:")
                for title in all_titles[:3]:
                    print(f"  \"{title.text.strip()}\"")
            else:
                print("No title element found")
            
            # Examine authors element
            authors_element = container.find("span", class_="biblio-authors")
            if authors_element:
                print("\n2. AUTHORS ELEMENT:")
                print(f"Tag: <{authors_element.name}>")
                print(f"Classes: {authors_element.get('class')}")
                xpath_options = [
                    "//span[@class='biblio-authors']",
                    "//span[contains(@class, 'biblio-authors')]",
                    "//span[contains(@class, 'authors')]"
                ]
                print("Potential XPath expressions:")
                for xpath in xpath_options:
                    print(f"  {xpath}")
                
                # Check for other similar elements
                all_authors = soup.find_all("span", class_="biblio-authors")
                print(f"Found {len(all_authors)} author elements on page")
                print("First few author texts:")
                for author in all_authors[:3]:
                    print(f"  \"{author.text.strip()}\"")
            else:
                print("No authors element found")
            
            # Examine abstract element
            abstract_element = container.find("div", class_="biblio-abstract-display")
            if abstract_element:
                print("\n3. ABSTRACT ELEMENT:")
                print(f"Tag: <{abstract_element.name}>")
                print(f"Classes: {abstract_element.get('class')}")
                xpath_options = [
                    "//div[@class='biblio-abstract-display']",
                    "//div[contains(@class, 'biblio-abstract-display')]",
                    "//div[contains(@class, 'abstract')]"
                ]
                print("Potential XPath expressions:")
                for xpath in xpath_options:
                    print(f"  {xpath}")
                
                # Check for other similar elements
                all_abstracts = soup.find_all("div", class_="biblio-abstract-display")
                print(f"Found {len(all_abstracts)} abstract elements on page")
                print("First abstract text (truncated):")
                if all_abstracts:
                    abstract_text = all_abstracts[0].text.strip()
                    print(f"  \"{abstract_text[:100]}...\"")
            else:
                print("No abstract element found")

# Run the function to find XPath alternatives
await find_xpath_alternatives()

# %%
# Create a simple selector configuration dictionary

# This is a simplified version focused on the original task
SELECTORS = {
    "title": {
        "primary": "span.biblio-title",
        "fallbacks": ["h2.title", "div.title", "span.field-biblio-title"],
        "xpath": "//span[contains(@class, 'biblio-title')]"
    },
    "authors": {
        "primary": "span.biblio-authors", 
        "fallbacks": ["div.authors", "span.field-biblio-authors"],
        "xpath": "//span[contains(@class, 'biblio-authors')]"
    },
    "abstract": {
        "primary": "div.biblio-abstract-display",
        "fallbacks": ["div.abstract", "div.field-biblio-abstract"],
        "xpath": "//div[contains(@class, 'abstract')]"
    }
}

# Simple function to try a selector with fallbacks
def try_selector_with_fallbacks(element, selector_config):
    """Try a selector with fallbacks and return the first match"""
    # Try primary
    if selector_config["primary"]:
        if "." in selector_config["primary"]:
            tag, class_name = selector_config["primary"].split(".", 1)
            tag = tag if tag else None
            result = element.find(tag, class_=class_name)
            if result:
                return result, selector_config["primary"]
    
    # Try fallbacks
    for fallback in selector_config["fallbacks"]:
        if "." in fallback:
            tag, class_name = fallback.split(".", 1)
            tag = tag if tag else None
            result = element.find(tag, class_=class_name)
            if result:
                return result, fallback
    
    return None, None

# %%
# Create a comprehensive testing function that can check all pages

async def test_selectors_on_all_pages(max_pages=21, publications_per_page=3):
    """Test selectors on publications from all pages"""
    print(f"\n🧪 Testing selectors on publications from all {max_pages} pages")
    print(f"(Sampling {publications_per_page} publications per page)")
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    }
    
    # Track success/failure of selectors
    results = {
        "title": {"success": 0, "failure": 0, "fallback_used": {}},
        "authors": {"success": 0, "failure": 0, "fallback_used": {}},
        "abstract": {"success": 0, "failure": 0, "fallback_used": {}}
    }
    
    # Process each page
    for page in range(max_pages):
        # Construct page URL
        page_url = scraper.base_url if page == 0 else f"{scraper.base_url}?page={page}"
        print(f"\nFetching page {page}: {page_url}")
        
        try:
            async with aiohttp.ClientSession(headers=headers) as session:
                async with session.get(page_url) as response:
                    if response.status != 200:
                        print(f"Error fetching page {page}: {response.status}")
                        continue
                    
                    html = await response.text()
                    soup = BeautifulSoup(html, "html.parser")
                    
                    # Find all publication containers
                    containers = soup.find_all("div", class_="biblio-entry")
                    
                    if not containers:
                        print(f"No publications found on page {page}")
                        continue
                    
                    print(f"Found {len(containers)} publications on page {page}")
                    
                    # Select a sample of publications to test
                    import random
                    sample_size = min(publications_per_page, len(containers))
                    if sample_size > 0:
                        sample_indices = random.sample(range(len(containers)), sample_size)
                        sampled_containers = [containers[i] for i in sample_indices]
                    else:
                        sampled_containers = []
                    
                    # Test each sampled publication
                    for i, container in enumerate(sampled_containers):
                        print(f"\nTesting publication {i+1} from page {page}:")
                        
                        # Try to find title
                        title_element, title_selector = try_selector_with_fallbacks(container, SELECTORS["title"])
                        if title_element:
                            results["title"]["success"] += 1
                            results["title"]["fallback_used"][title_selector] = results["title"]["fallback_used"].get(title_selector, 0) + 1
                            print(f"✅ Title found using {title_selector}: \"{title_element.text.strip()[:50]}\"")
                        else:
                            results["title"]["failure"] += 1
                            print("❌ Title not found")
                        
                        # Try to find authors
                        authors_element, authors_selector = try_selector_with_fallbacks(container, SELECTORS["authors"])
                        if authors_element:
                            results["authors"]["success"] += 1
                            results["authors"]["fallback_used"][authors_selector] = results["authors"]["fallback_used"].get(authors_selector, 0) + 1
                            print(f"✅ Authors found using {authors_selector}: \"{authors_element.text.strip()}\"")
                        else:
                            results["authors"]["failure"] += 1
                            print("❌ Authors not found")
                        
                        # Try to find abstract
                        abstract_element, abstract_selector = try_selector_with_fallbacks(container, SELECTORS["abstract"])
                        if abstract_element:
                            results["abstract"]["success"] += 1
                            results["abstract"]["fallback_used"][abstract_selector] = results["abstract"]["fallback_used"].get(abstract_selector, 0) + 1
                            abstract_text = abstract_element.text.strip()
                            print(f"✅ Abstract found using {abstract_selector}: \"{abstract_text[:50]}...\"")
                        else:
                            results["abstract"]["failure"] += 1
                            print("❌ Abstract not found")
            
            # Add a short delay between pages
            await asyncio.sleep(1)
            
        except Exception as e:
            print(f"Error processing page {page}: {e}")
    
    # Print summary results
    print("\n📊 Selector Test Results:")
    for element_type, stats in results.items():
        total = stats["success"] + stats["failure"]
        if total > 0:
            success_rate = stats["success"] / total * 100
            status = "✅ GOOD" if success_rate == 100 else "⚠️ WARNING" if success_rate >= 80 else "❌ FAILING"
            print(f"{status} {element_type.upper()}: {success_rate:.1f}% success ({stats['success']}/{total})")
            
            # Show which selectors were used
            if stats["success"] > 0:
                print("  Selectors used:")
                for selector, count in sorted(stats["fallback_used"].items(), key=lambda x: x[1], reverse=True):
                    percentage = count / stats["success"] * 100
                    print(f"  - {selector}: {percentage:.1f}% ({count}/{stats['success']})")
    
    return results

# Run the test on all pages
test_results = await test_selectors_on_all_pages()



🔍 Finding XPath alternatives for critical elements
Found publication container, examining critical elements:

1. TITLE ELEMENT:
Tag: <span>
Classes: ['biblio-title']
Selenium not available for XPath suggestions
Found 20 title elements on page
First few title texts:
  "Global Networks, Monetary Policy and Trade"
  "Industrial policy for competitiveness in the energy transition"
  "Public-Private Dialogs to Spur Export-led Growth: The Case of Productivity Taskforces in Namibia"

2. AUTHORS ELEMENT:
Tag: <span>
Classes: ['biblio-authors']
Potential XPath expressions:
  //span[@class='biblio-authors']
  //span[contains(@class, 'biblio-authors')]
  //span[contains(@class, 'authors')]
Found 20 author elements on page
First few author texts:
  "Kalemli-Özcan, Ṣ., Soylu, C. & Yildirim, M.A."
  "Ahuja, K. & Hausmann, R."
  "Fortunato, A. & Santos, M.A."

3. ABSTRACT ELEMENT:
Tag: <div>
Classes: ['biblio-abstract-display', 'os-slider']
Potential XPath expressions:
  //div[@class='biblio-abstrac

In [26]:
# Testing the more specific selectors found with SelectorGadget

async def test_more_specific_selectors():
    """Test the more specific selectors for authors and citation"""
    print("\n🔍 Testing more specific selectors from SelectorGadget")
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    }
    
    base_url = "https://growthlab.hks.harvard.edu"
    
    async with aiohttp.ClientSession(headers=headers) as session:
        # Test on the main publications page
        async with session.get(f"{base_url}/publications") as response:
            html = await response.text()
            soup = BeautifulSoup(html, "html.parser")
            
            # Get the first publication container
            container = soup.find("div", class_="biblio-entry")
            if not container:
                print("No publication found")
                return
            
            print("\nTesting authors selector with SelectorGadget recommendation:")
            
            # Test the standard selector
            standard_authors = container.find("span", class_="biblio-authors")
            print(f"Standard selector (span.biblio-authors): {'✅ Found' if standard_authors else '❌ Not found'}")
            if standard_authors:
                print(f"Text: \"{standard_authors.text.strip()}\"")
            
            # Test the complex selector from SelectorGadget
            # .biblio-separator-bar+ .clearfix .biblio-authors
            separator = container.find(class_="biblio-separator-bar")
            complex_authors = None
            
            if separator:
                # Find next sibling with class clearfix
                sibling = separator.find_next_sibling(class_="clearfix")
                if sibling:
                    # Within this, find element with class biblio-authors
                    complex_authors = sibling.find(class_="biblio-authors")
            
            print(f"Complex selector (.biblio-separator-bar+ .clearfix .biblio-authors): {'✅ Found' if complex_authors else '❌ Not found'}")
            if complex_authors:
                print(f"Text: \"{complex_authors.text.strip()}\"")
            
            # Test XPath selector
            try:
                import lxml.html
                from lxml import etree
                
                dom = lxml.html.fromstring(str(container))
                xpath = "//*+[contains(concat( \" \", @class, \" \" ), concat( \" \", \"biblio-separator-bar\", \" \" ))]//*[contains(concat( \" \", @class, \" \" ), concat( \" \", \"clearfix\", \" \" ))]//*[contains(concat( \" \", @class, \" \" ), concat( \" \", \"biblio-authors\", \" \" ))]"
                xpath_results = dom.xpath(xpath)
                
                print(f"XPath selector: {'✅ Found' if xpath_results else '❌ Not found'}")
                if xpath_results:
                    # Convert to string to get text
                    text = xpath_results[0].text_content().strip()
                    print(f"Text: \"{text}\"")
            except ImportError:
                print("XPath selector: ❌ lxml not available")
            except Exception as e:
                print(f"XPath selector: ❌ Error - {e}")
        
        # Test on individual publication page
        async with session.get(f"{base_url}/publications/global-networks-monetary-policy-and-trade") as response:
            html = await response.text()
            soup = BeautifulSoup(html, "html.parser")
            
            print("\nTesting citation selector on individual publication page:")
            
            # Test for biblio-citation
            citation = soup.find(class_="biblio-citation")
            print(f"Citation selector (.biblio-citation): {'✅ Found' if citation else '❌ Not found'}")
            if citation:
                citation_text = citation.text.strip()
                print(f"Citation text: \"{citation_text}\"")
                
                # Demonstrate parsing author from citation
                if citation_text and "," in citation_text:
                    # Split by year (after author, before title)
                    year_match = re.search(r"\b(19|20)\d{2}\b", citation_text)
                    if year_match:
                        year_index = year_match.start()
                        author_part = citation_text[:year_index].strip()
                        # Remove trailing comma if present
                        if author_part.endswith("."):
                            author_part = author_part[:-1].strip()
                        print(f"Extracted author: \"{author_part}\"")
                        
                        # Extract year
                        year = int(year_match.group())
                        print(f"Extracted year: {year}")
                        
                        # Extract title (after year and dot)
                        title_start = year_match.end() + 1  # Skip the period after year
                        rest = citation_text[title_start:].strip()
                        if "." in rest:
                            title_end = rest.find(".")
                            title = rest[:title_end].strip()
                            print(f"Extracted title: \"{title}\"")

# Run the test
await test_more_specific_selectors()



🔍 Testing more specific selectors from SelectorGadget

Testing authors selector with SelectorGadget recommendation:
Standard selector (span.biblio-authors): ✅ Found
Text: "Kalemli-Özcan, Ṣ., Soylu, C. & Yildirim, M.A."
Complex selector (.biblio-separator-bar+ .clearfix .biblio-authors): ❌ Not found
XPath selector: ❌ Error - Invalid expression

Testing citation selector on individual publication page:
Citation selector (.biblio-citation): ✅ Found
Citation text: "Kalemli-Özcan, Ṣ., Soylu, C. & Yildirim, M.A., 2025. Global Networks, Monetary Policy and Trade. Copy at http://www.tinyurl.com/27pjk6pp"
Extracted author: "Kalemli-Özcan, Ṣ., Soylu, C. & Yildirim, M.A.,"
Extracted year: 2025
Extracted title: "Global Networks, Monetary Policy and Trade"


In [27]:
# %%
# Testing the more specific selectors found with SelectorGadget

async def test_more_specific_selectors():
    """Test the more specific selectors for authors and citation"""
    print("\n🔍 Testing more specific selectors from SelectorGadget")
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    }
    
    base_url = "https://growthlab.hks.harvard.edu"
    
    async with aiohttp.ClientSession(headers=headers) as session:
        # Test on the main publications page
        async with session.get(f"{base_url}/publications") as response:
            html = await response.text()
            soup = BeautifulSoup(html, "html.parser")
            
            # Get the first publication container
            container = soup.find("div", class_="biblio-entry")
            if not container:
                print("No publication found")
                return
            
            print("\nTesting authors selector with SelectorGadget recommendation:")
            
            # Test the standard selector
            standard_authors = container.find("span", class_="biblio-authors")
            print(f"Standard selector (span.biblio-authors): {'✅ Found' if standard_authors else '❌ Not found'}")
            if standard_authors:
                print(f"Text: \"{standard_authors.text.strip()}\"")
            
            # Test the complex selector from SelectorGadget
            # .biblio-separator-bar+ .clearfix .biblio-authors
            separator = container.find(class_="biblio-separator-bar")
            complex_authors = None
            
            if separator:
                # Find next sibling with class clearfix
                sibling = separator.find_next_sibling(class_="clearfix")
                if sibling:
                    # Within this, find element with class biblio-authors
                    complex_authors = sibling.find(class_="biblio-authors")
            
            print(f"Complex selector (.biblio-separator-bar+ .clearfix .biblio-authors): {'✅ Found' if complex_authors else '❌ Not found'}")
            if complex_authors:
                print(f"Text: \"{complex_authors.text.strip()}\"")
            
            # Test XPath selector
            try:
                import lxml.html
                from lxml import etree
                
                dom = lxml.html.fromstring(str(container))
                xpath = "//*+[contains(concat( \" \", @class, \" \" ), concat( \" \", \"biblio-separator-bar\", \" \" ))]//*[contains(concat( \" \", @class, \" \" ), concat( \" \", \"clearfix\", \" \" ))]//*[contains(concat( \" \", @class, \" \" ), concat( \" \", \"biblio-authors\", \" \" ))]"
                xpath_results = dom.xpath(xpath)
                
                print(f"XPath selector: {'✅ Found' if xpath_results else '❌ Not found'}")
                if xpath_results:
                    # Convert to string to get text
                    text = xpath_results[0].text_content().strip()
                    print(f"Text: \"{text}\"")
            except ImportError:
                print("XPath selector: ❌ lxml not available")
            except Exception as e:
                print(f"XPath selector: ❌ Error - {e}")
        
        # Test on individual publication page
        async with session.get(f"{base_url}/publications/global-networks-monetary-policy-and-trade") as response:
            html = await response.text()
            soup = BeautifulSoup(html, "html.parser")
            
            print("\nTesting citation selector on individual publication page:")
            
            # Test for biblio-citation
            citation = soup.find(class_="biblio-citation")
            print(f"Citation selector (.biblio-citation): {'✅ Found' if citation else '❌ Not found'}")
            if citation:
                citation_text = citation.text.strip()
                print(f"Citation text: \"{citation_text}\"")
                
                # Demonstrate parsing author from citation
                if citation_text and "," in citation_text:
                    # Split by year (after author, before title)
                    year_match = re.search(r"\b(19|20)\d{2}\b", citation_text)
                    if year_match:
                        year_index = year_match.start()
                        author_part = citation_text[:year_index].strip()
                        # Remove trailing comma if present
                        if author_part.endswith("."):
                            author_part = author_part[:-1].strip()
                        print(f"Extracted author: \"{author_part}\"")
                        
                        # Extract year
                        year = int(year_match.group())
                        print(f"Extracted year: {year}")
                        
                        # Extract title (after year and dot)
                        title_start = year_match.end() + 1  # Skip the period after year
                        rest = citation_text[title_start:].strip()
                        if "." in rest:
                            title_end = rest.find(".")
                            title = rest[:title_end].strip()
                            print(f"Extracted title: \"{title}\"")

# Run the test
await test_more_specific_selectors()

# %%
# Update our selector configuration with the new findings

# Updated selector configuration with SelectorGadget findings
UPDATED_SELECTORS = {
    "publication": {
        "container": "div.biblio-entry",
        "title": {
            "primary": "span.biblio-title",
            "fallbacks": ["h2.title", "div.title", "span.field-biblio-title", ".biblio-citation"],
            "xpath": "//span[contains(@class, 'biblio-title')]"
        },
        "authors": {
            "primary": "span.biblio-authors", 
            "fallbacks": [
                ".biblio-separator-bar+ .clearfix .biblio-authors",  # New selector from SelectorGadget
                "div.authors", 
                "span.field-biblio-authors",
                ".biblio-citation"  # We can extract authors from the citation
            ],
            "xpath": "//*[contains(@class, 'biblio-authors')] | //*[contains(@class, 'biblio-separator-bar')]/following-sibling::*[contains(@class, 'clearfix')]//*[contains(@class, 'biblio-authors')]"
        },
        "abstract": {
            "primary": "div.biblio-abstract-display",
            "fallbacks": ["div.abstract", "div.field-biblio-abstract", "div.field-name-field-abstract"],
            "xpath": "//div[contains(@class, 'abstract')]"
        },
        "file": {
            "primary": "span.file",
            "fallbacks": [".Z3988+ a", "#pub-cover-content-wrapper a", "a[href$='.pdf']"],
            "xpath": "//span[contains(@class, 'file')] | //*[contains(@class, 'Z3988')]/following-sibling::a | //*[@id='pub-cover-content-wrapper']//a"
        }
    },
    "pagination": {
        "container": "ul.pager",
        "last_page": "li.pager-last"
    }
}

# Improved helper function to handle more complex selectors
def improved_find_with_fallbacks(element, config):
    """Find an element using primary selector with fallbacks, supporting complex selectors
    
    Args:
        element: BeautifulSoup element to search within
        config: Selector configuration with primary and fallbacks
    
    Returns:
        BeautifulSoup element or None if not found
    """
    # Try primary selector
    result = try_complex_selector(element, config["primary"])
    if result:
        return result
    
    # Try fallbacks if provided
    if "fallbacks" in config:
        for fallback in config["fallbacks"]:
            # Special case for citation
            if fallback == ".biblio-citation":
                citation = element.find(class_="biblio-citation")
                if citation:
                    return citation
                    
            result = try_complex_selector(element, fallback)
            if result:
                return result
    
    # Try XPath as last resort
    if "xpath" in config:
        try:
            import lxml.html
            from lxml import etree
            
            # We need to parse the HTML of the element
            dom = lxml.html.fromstring(str(element))
            xpath_result = dom.xpath(config["xpath"])
            
            if xpath_result:
                # Convert back to BeautifulSoup for consistency
                result_html = etree.tostring(xpath_result[0])
                result_soup = BeautifulSoup(result_html, "html.parser")
                return result_soup.contents[0] if result_soup.contents else None
        except:
            pass
    
    return None

def try_complex_selector(element, selector):
    """Try a potentially complex CSS selector
    
    Handles:
    - Simple class selectors (div.class)
    - Sibling selectors (a+ b)
    - Descendant selectors (a b)
    """
    if not selector:
        return None
    
    try:
        # Handle adjacent sibling selector (e.g. .biblio-separator-bar+ .clearfix .biblio-authors)
        if "+ " in selector:
            parts = selector.split("+ ", 1)
            first_part = parts[0].strip()
            second_part = parts[1].strip()
            
            # Find the first element
            first_element = None
            if "." in first_part:
                tag, class_name = first_part.split(".", 1)
                tag = tag if tag else None
                first_element = element.find(tag, class_=class_name)
            else:
                first_element = element.find(first_part)
                
            if not first_element:
                return None
                
            # Find the next sibling element
            sibling = first_element.find_next_sibling()
            
            # If second part has more complex rules (like a descendant selector)
            if " " in second_part:
                sibling_parts = second_part.split(" ", 1)
                sibling_class = sibling_parts[0]
                descendant_selector = sibling_parts[1]
                
                # Check if sibling matches the first part of second_part
                if "." in sibling_class:
                    tag, class_name = sibling_class.split(".", 1)
                    tag = tag if tag else None
                    if (not tag or sibling.name == tag) and class_name in sibling.get("class", []):
                        # Now find the descendant
                        return try_complex_selector(sibling, descendant_selector)
                elif sibling.name == sibling_class:
                    # Now find the descendant
                    return try_complex_selector(sibling, descendant_selector)
                
                return None
            else:
                # Simple sibling check
                if "." in second_part:
                    tag, class_name = second_part.split(".", 1)
                    tag = tag if tag else None
                    if (not tag or sibling.name == tag) and class_name in sibling.get("class", []):
                        return sibling
                elif sibling.name == second_part:
                    return sibling
                
                return None
        
        # Handle descendant selector (e.g. .clearfix .biblio-authors)
        elif " " in selector and not selector.startswith("#"):
            parts = selector.split(" ", 1)
            parent_selector = parts[0].strip()
            child_selector = parts[1].strip()
            
            # Find the parent element
            parent = None
            if "." in parent_selector:
                tag, class_name = parent_selector.split(".", 1)
                tag = tag if tag else None
                parent = element.find(tag, class_=class_name)
            else:
                parent = element.find(parent_selector)
                
            if not parent:
                return None
                
            # Find the child within the parent
            return try_complex_selector(parent, child_selector)
            
        # Handle ID selector
        elif "#" in selector:
            parts = selector.split("#", 1)
            element_id = parts[1]
            
            if "." in parts[0]:
                tag, class_name = parts[0].split(".", 1)
                tag = tag if tag else None
                return element.find(tag, id=element_id, class_=class_name)
            else:
                tag = parts[0] if parts[0] else None
                return element.find(tag, id=element_id)
                
        # Handle simple class selector
        elif "." in selector:
            tag, class_name = selector.split(".", 1)
            tag = tag if tag else None
            return element.find(tag, class_=class_name)
            
        # Handle simple tag selector
        else:
            return element.find(selector)
    except Exception as e:
        print(f"Error with selector '{selector}': {e}")
        return None

# Improved publication parsing to handle complex selectors and citation extraction
def improved_parse_publication(pub_element, base_url):
    """Parse a publication using improved selector handling"""
    # Find title element
    title_element = improved_find_with_fallbacks(pub_element, UPDATED_SELECTORS["publication"]["title"])
    
    title = None
    if title_element:
        if "biblio-citation" in title_element.get("class", []):
            # Extract title from citation
            citation_text = title_element.text.strip()
            year_match = re.search(r"\b(19|20)\d{2}\b", citation_text)
            if year_match:
                title_start = year_match.end() + 1  # Skip the period after year
                rest = citation_text[title_start:].strip()
                if "." in rest:
                    title_end = rest.find(".")
                    title = rest[:title_end].strip()
        else:
            title = title_element.text.strip()
    
    # Get publication URL
    pub_url = None
    if title_element:
        title_link = title_element.find("a")
        if title_link and title_link.get("href"):
            pub_url = title_link["href"]
            if not pub_url.startswith(("http://", "https://")):
                pub_url = f"{base_url}{pub_url}"
    
    # Find authors element
    authors_element = improved_find_with_fallbacks(pub_element, UPDATED_SELECTORS["publication"]["authors"])
    
    authors = None
    if authors_element:
        if "biblio-citation" in authors_element.get("class", []):
            # Extract authors from citation
            citation_text = authors_element.text.strip()
            year_match = re.search(r"\b(19|20)\d{2}\b", citation_text)
            if year_match:
                year_index = year_match.start()
                authors = citation_text[:year_index].strip()
                # Remove trailing comma or period if present
                if authors.endswith(".") or authors.endswith(","):
                    authors = authors[:-1].strip()
        else:
            authors = authors_element.text.strip()
    
    # Extract year from text after authors or from citation
    year = None
    if authors_element:
        # Check if we have a citation
        if "biblio-citation" in authors_element.get("class", []):
            citation_text = authors_element.text.strip()
            year_match = re.search(r"\b(19|20)\d{2}\b", citation_text)
            if year_match:
                year = int(year_match.group())
        else:
            sibling_text = authors_element.next_sibling
            if sibling_text:
                year_match = re.search(r"\b(19|20)\d{2}\b", sibling_text)
                if year_match:
                    year = int(year_match.group())
    
    # Find abstract element
    abstract_element = improved_find_with_fallbacks(pub_element, UPDATED_SELECTORS["publication"]["abstract"])
    abstract = abstract_element.text.strip() if abstract_element else None
    
    # Find file elements (with specialized handling due to multiple potential formats)
    file_urls = []
    file_element = improved_find_with_fallbacks(pub_element, UPDATED_SELECTORS["publication"]["file"])
    
    if file_element:
        if file_element.name == "a" and file_element.get("href"):
            # Direct link
            file_url = file_element["href"]
            if not file_url.startswith(("http://", "https://")):
                file_url = f"{base_url}{file_url}"
            file_urls.append(file_url)
        else:
            # Container with links inside
            for link in file_element.find_all("a"):
                if link.get("href"):
                    file_url = link["href"]
                    if not file_url.startswith(("http://", "https://")):
                        file_url = f"{base_url}{file_url}"
                    file_urls.append(file_url)
    
    # Create publication object
    publication = {
        "title": title,
        "authors": authors,
        "year": year,
        "abstract": abstract,
        "pub_url": pub_url,
        "file_urls": file_urls,
        "source": "GrowthLab"
    }
    
    return publication

# Test our improved parser with new selectors
async def test_improved_parser():
    """Test our improved parser with the updated selectors"""
    print("\n🧪 Testing improved parser with updated selectors")
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    }
    
    base_url = "https://growthlab.hks.harvard.edu"
    
    async with aiohttp.ClientSession(headers=headers) as session:
        # Test on the main publications page
        async with session.get(f"{base_url}/publications") as response:
            html = await response.text()
            soup = BeautifulSoup(html, "html.parser")
            
            # Find publications with missing authors from previous tests
            found_problematic = False
            containers = soup.find_all("div", class_="biblio-entry")
            
            for i, container in enumerate(containers[:10]):  # Check first 10
                # Quick check if this has the standard authors element
                std_authors = container.find("span", class_="biblio-authors")
                
                if not std_authors:
                    found_problematic = True
                    print(f"\nFound publication {i+1} with missing standard authors element")
                    
                    # Try parsing with improved parser
                    publication = improved_parse_publication(container, base_url)
                    
                    print("Parsed with improved parser:")
                    print(f"Title: {publication['title']}")
                    print(f"Authors: {publication['authors']}")
                    print(f"Year: {publication['year']}")
                    print(f"Abstract: {publication['abstract'][:100]}..." if publication['abstract'] else "No abstract")
            
            if not found_problematic:
                # Just test the first publication
                print("\nNo problematic publications found, testing first publication")
                container = containers[0]
                
                # Try parsing with improved parser
                publication = improved_parse_publication(container, base_url)
                
                print("Parsed with improved parser:")
                print(f"Title: {publication['title']}")
                print(f"Authors: {publication['authors']}")
                print(f"Year: {publication['year']}")
                print(f"Abstract: {publication['abstract'][:100]}..." if publication['abstract'] else "No abstract")
                
        # Test on a known problematic publication
        async with session.get(f"{base_url}/publications/global-networks-monetary-policy-and-trade") as response:
            html = await response.text()
            soup = BeautifulSoup(html, "html.parser")
            
            print("\nTesting on individual publication page:")
            
            # For individual pages, we need to create a mock container
            mock_container = BeautifulSoup("<div></div>", "html.parser").div
            
            # Add citation if available
            citation = soup.find(class_="biblio-citation")
            if citation:
                mock_container.append(citation)
                
            # Add abstract if available
            abstract = soup.find(class_="field-name-field-abstract")
            if abstract:
                mock_container.append(abstract)
                
            # Add file links
            cover_wrapper = soup.find(id="pub-cover-content-wrapper")
            if cover_wrapper:
                mock_container.append(cover_wrapper)
            
            # Parse with improved parser
            publication = improved_parse_publication(mock_container, base_url)
            
            print("Parsed with improved parser:")
            print(f"Title: {publication['title']}")
            print(f"Authors: {publication['authors']}")
            print(f"Year: {publication['year']}")
            print(f"Abstract: {publication['abstract'][:100]}..." if publication['abstract'] else "No abstract")
            print(f"File URLs: {publication['file_urls']}")

# Run the test
await test_improved_parser()

# %%
# Final solution with the improved selectors



🔍 Testing more specific selectors from SelectorGadget

Testing authors selector with SelectorGadget recommendation:
Standard selector (span.biblio-authors): ✅ Found
Text: "Kalemli-Özcan, Ṣ., Soylu, C. & Yildirim, M.A."
Complex selector (.biblio-separator-bar+ .clearfix .biblio-authors): ❌ Not found
XPath selector: ❌ Error - Invalid expression

Testing citation selector on individual publication page:
Citation selector (.biblio-citation): ✅ Found
Citation text: "Kalemli-Özcan, Ṣ., Soylu, C. & Yildirim, M.A., 2025. Global Networks, Monetary Policy and Trade. Copy at http://www.tinyurl.com/27pjk6pp"
Extracted author: "Kalemli-Özcan, Ṣ., Soylu, C. & Yildirim, M.A.,"
Extracted year: 2025
Extracted title: "Global Networks, Monetary Policy and Trade"

🧪 Testing improved parser with updated selectors

No problematic publications found, testing first publication
Parsed with improved parser:
Title: Global Networks, Monetary Policy and Trade
Authors: Kalemli-Özcan, Ṣ., Soylu, C. & Yildirim, M.A.

In [31]:
# %%
# FINAL FOCUSED IMPLEMENTATION: SELECTORS WITH CITATION FALLBACK

# Refined selector configuration prioritizing direct selectors
SELECTOR_CONFIG = {
    "publication": {
        "container": "div.biblio-entry",
        "title": {
            "primary": "span.biblio-title",
            "fallbacks": ["h1.page-title", "h2.title", "div.title", "span.field-biblio-title"],
            "xpath": "//span[contains(@class, 'biblio-title')] | //h1[contains(@class, 'page-title')]"
        },
        "authors": {
            "primary": "span.biblio-authors", 
            "fallbacks": [
                "div.field-name-field-biblio-authors .field-item",   # Individual page
                "div.authors", 
                "span.field-biblio-authors"
            ],
            "xpath": "//span[contains(@class, 'biblio-authors')] | //div[contains(@class, 'field-name-field-biblio-authors')]//div[@class='field-item']"
        },
        "abstract": {
            "primary": "div.biblio-abstract-display",
            "fallbacks": [
                "div.field-name-field-abstract",  # Individual page
                "div.abstract", 
                "div.field-biblio-abstract"
            ],
            "xpath": "//div[contains(@class, 'biblio-abstract-display')] | //div[contains(@class, 'field-name-field-abstract')]"
        },
        "file": {
            "primary": "span.file",
            "fallbacks": [
                "#pub-cover-content-wrapper a",  # SelectorGadget discovery for individual pages
                ".Z3988+ a",                     # Element after Z3988
                "a[href$='.pdf']",               # Any PDF link
                "a[href*='files']"               # Links containing "files"
            ],
            "xpath": "//span[contains(@class, 'file')] | //*[@id='pub-cover-content-wrapper']//a | //*[contains(@class, 'Z3988')]/following-sibling::a | //a[contains(@href, '.pdf')]"
        },
        "citation": {
            "primary": ".biblio-citation",       # Only used as fallback when other selectors fail
            "fallbacks": [
                "div.field-name-field-citation",
                "div.citation"
            ],
            "xpath": "//div[contains(@class, 'biblio-citation')] | //div[contains(@class, 'field-name-field-citation')]"
        }
    },
    "pagination": {
        "container": "ul.pager",
        "last_page": "li.pager-last"
    }
}

# Helper function to find elements with fallbacks
def find_with_fallbacks(element, config):
    """Find an element using primary selector with fallbacks
    
    Args:
        element: BeautifulSoup element to search within
        config: Selector configuration with primary and fallbacks
    
    Returns:
        BeautifulSoup element or None if not found
    """
    # Try primary selector
    if "primary" in config and config["primary"]:
        try:
            results = element.select(config["primary"])
            if results:
                return results[0]
        except:
            # Fallback to simpler parsing for basic selectors
            if "." in config["primary"]:
                tag, class_name = config["primary"].split(".", 1)
                tag = tag if tag else None
                result = element.find(tag, class_=class_name)
                if result:
                    return result
    
    # Try fallbacks if provided
    if "fallbacks" in config:
        for fallback in config["fallbacks"]:
            try:
                # Try using CSS selector API first (handles complex selectors)
                results = element.select(fallback)
                if results:
                    return results[0]
            except:
                # Fallback to simpler parsing for basic selectors
                if "." in fallback:
                    tag, class_name = fallback.split(".", 1)
                    tag = tag if tag else None
                    result = element.find(tag, class_=class_name)
                    if result:
                        return result
    
    # Try XPath as last resort
    if "xpath" in config:
        try:
            import lxml.html
            from lxml import etree
            
            # We need to parse the HTML of the element
            dom = lxml.html.fromstring(str(element))
            xpath_result = dom.xpath(config["xpath"])
            
            if xpath_result:
                # Convert back to BeautifulSoup for consistency
                result_html = etree.tostring(xpath_result[0])
                result_soup = BeautifulSoup(result_html, "html.parser")
                return result_soup.contents[0] if result_soup.contents else None
        except:
            pass
    
    return None

# Helper function to find all elements matching a selector with fallbacks
def find_all_with_fallbacks(element, config):
    """Find all elements using primary selector with fallbacks
    
    Args:
        element: BeautifulSoup element to search within
        config: Selector configuration with primary and fallbacks
    
    Returns:
        List of BeautifulSoup elements
    """
    results = []
    
    # Try primary selector
    if "primary" in config and config["primary"]:
        try:
            found = element.select(config["primary"])
            if found:
                results.extend(found)
        except:
            # Fallback to simpler parsing
            if "." in config["primary"]:
                tag, class_name = config["primary"].split(".", 1)
                tag = tag if tag else None
                found = element.find_all(tag, class_=class_name)
                if found:
                    results.extend(found)
    
    # If nothing found and fallbacks are provided, try them
    if not results and "fallbacks" in config:
        for fallback in config["fallbacks"]:
            try:
                found = element.select(fallback)
                if found:
                    results.extend(found)
                    break  # Stop at first successful fallback
            except:
                # Fallback to simpler parsing
                if "." in fallback:
                    tag, class_name = fallback.split(".", 1)
                    tag = tag if tag else None
                    found = element.find_all(tag, class_=class_name)
                    if found:
                        results.extend(found)
                        break  # Stop at first successful fallback
    
    # If still nothing found and XPath is provided, try it as last resort
    if not results and "xpath" in config:
        try:
            import lxml.html
            from lxml import etree
            
            dom = lxml.html.fromstring(str(element))
            xpath_results = dom.xpath(config["xpath"])
            
            if xpath_results:
                for result in xpath_results:
                    result_html = etree.tostring(result)
                    result_soup = BeautifulSoup(result_html, "html.parser")
                    if result_soup.contents:
                        results.append(result_soup.contents[0])
        except:
            pass
    
    return results

# Helper function to extract components from citation when needed
def extract_from_citation(citation_text):
    """Extract author, year, and title from a citation string
    
    Note: Only used as fallback when direct selectors fail
    """
    result = {}
    
    if not citation_text:
        return result
    
    # Try to extract year
    year_match = re.search(r'(\d{4})\. ', citation_text)
    if year_match:
        year_pos = year_match.start()
        after_year = year_match.end()
        
        # Author is everything before the year
        result["authors"] = citation_text[:year_pos].strip()
        if result["authors"].endswith(','):
            result["authors"] = result["authors"][:-1]
        
        # Title is everything after the year until next period or end
        title_end = citation_text.find('.', after_year)
        if title_end > after_year:
            result["title"] = citation_text[after_year:title_end].strip()
        else:
            result["title"] = citation_text[after_year:].strip()
        
        # Extract year as integer
        try:
            result["year"] = int(year_match.group(1))
        except:
            pass
    
    return result

# Main parse_publication function focused on selectors first, citation as fallback
def parse_publication(element, base_url):
    """Parse a publication using configurable selectors with fallbacks
    
    This function prioritizes direct selectors and only falls back to citation
    extraction when direct selectors fail.
    
    Args:
        element: BeautifulSoup element containing the publication
        base_url: Base URL for resolving relative URLs
        
    Returns:
        dict: Publication data
    """
    # Initialize the publication object
    publication = {
        "title": None,
        "authors": None,
        "year": None,
        "abstract": None,
        "pub_url": None,
        "file_urls": [],
        "source": "GrowthLab"
    }
    
    # 1. Find title using direct selectors
    title_element = find_with_fallbacks(element, SELECTOR_CONFIG["publication"]["title"])
    if title_element:
        publication["title"] = title_element.text.strip()
        
        # Get URL from title link
        title_link = title_element.find("a")
        if title_link and title_link.get("href"):
            pub_url = title_link["href"]
            if not pub_url.startswith(("http://", "https://")):
                pub_url = f"{base_url}{pub_url}"
            publication["pub_url"] = pub_url
    
    # 2. Find authors using direct selectors
    authors_element = find_with_fallbacks(element, SELECTOR_CONFIG["publication"]["authors"])
    if authors_element:
        publication["authors"] = authors_element.text.strip()
    
    # 3. Extract year from text after authors
    if authors_element:
        sibling_text = authors_element.next_sibling
        if sibling_text:
            year_match = re.search(r"\b(19|20)\d{2}\b", sibling_text)
            if year_match:
                publication["year"] = int(year_match.group())
    
    # 4. Find abstract using direct selectors
    abstract_element = find_with_fallbacks(element, SELECTOR_CONFIG["publication"]["abstract"])
    if abstract_element:
        publication["abstract"] = abstract_element.text.strip()
    
    # 5. Find file URLs using direct selectors and multiple methods
    file_urls = set()  # Use a set to avoid duplicates
    
    # Try all file selectors
    for selector_type in ["primary", "fallbacks"]:
        if selector_type == "primary":
            selectors = [SELECTOR_CONFIG["publication"]["file"]["primary"]]
        else:
            selectors = SELECTOR_CONFIG["publication"]["file"].get("fallbacks", [])
            
        for selector in selectors:
            try:
                elements = element.select(selector)
                for elem in elements:
                    if elem.name == "a" and elem.get("href"):
                        # Direct link
                        file_url = elem["href"]
                        if not file_url.startswith(("http://", "https://")):
                            file_url = f"{base_url}{file_url}"
                        file_urls.add(file_url)
                    else:
                        # Container with links
                        for link in elem.find_all("a"):
                            if link.get("href"):
                                file_url = link["href"]
                                if not file_url.startswith(("http://", "https://")):
                                    file_url = f"{base_url}{file_url}"
                                file_urls.add(file_url)
            except:
                pass
    
    # Convert set back to list
    publication["file_urls"] = list(file_urls)
    
    # 6. FALLBACK: If critical fields are missing, try citation extraction
    if not publication["title"] or not publication["authors"] or not publication["year"]:
        citation_element = find_with_fallbacks(element, SELECTOR_CONFIG["publication"]["citation"])
        if citation_element:
            citation_text = citation_element.text.strip()
            citation_data = extract_from_citation(citation_text)
            
            # Only use citation data for missing fields
            if not publication["title"] and "title" in citation_data:
                publication["title"] = citation_data["title"]
            if not publication["authors"] and "authors" in citation_data:
                publication["authors"] = citation_data["authors"]
            if not publication["year"] and "year" in citation_data:
                publication["year"] = citation_data["year"]
    
    return publication

# Demo using the revised selector-focused parser
async def demo_selector_focused_parser():
    """Demonstrate the selector-focused parser with citation fallbacks"""
    print("\n🧪 Testing selector-focused parser")
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    }
    
    base_url = "https://growthlab.hks.harvard.edu"
    
    async with aiohttp.ClientSession(headers=headers) as session:
        # Test on the main publications page
        async with session.get(f"{base_url}/publications") as response:
            html = await response.text()
            soup = BeautifulSoup(html, "html.parser")
            
            # Get the first publication container
            container = soup.find("div", class_="biblio-entry")
            if not container:
                print("No publication found")
                return
            
            # Parse the publication with our selector-focused parser
            publication = parse_publication(container, base_url)
            
            print("\nParsed publication from listing page:")
            print(f"Title: {publication['title']}")
            print(f"Authors: {publication['authors']}")
            print(f"Year: {publication['year']}")
            print(f"Abstract: {publication['abstract'][:100]}..." if publication['abstract'] else "No abstract")
            print(f"URL: {publication['pub_url']}")
            print(f"File URLs: {publication['file_urls']}")
            
        # Test on the problematic publication page we identified
        test_url = "https://growthlab.hks.harvard.edu/publications/global-networks-monetary-policy-and-trade"
        async with session.get(test_url) as response:
            html = await response.text()
            soup = BeautifulSoup(html, "html.parser")
            
            # Parse the publication with our selector-focused parser
            publication = parse_publication(soup, base_url)
            
            print("\nParsed problematic publication page:")
            print(f"Title: {publication['title']}")
            print(f"Authors: {publication['authors']}")
            print(f"Year: {publication['year']}")
            print(f"Abstract: {publication['abstract'][:100]}..." if publication['abstract'] else "No abstract")
            print(f"URL: {publication['pub_url'] or test_url}")
            print(f"File URLs: {publication['file_urls']}")

# Run the demo
await demo_selector_focused_parser()



🧪 Testing selector-focused parser

Parsed publication from listing page:
Title: Global Networks, Monetary Policy and Trade
Authors: Kalemli-Özcan, Ṣ., Soylu, C. & Yildirim, M.A.
Year: 2025
Abstract: We develop a novel framework to study the interaction between monetary policy and trade. Our New Key...
URL: https://growthlab.hks.harvard.edu/publications/global-networks-monetary-policy-and-trade
File URLs: ['https://www.nber.org/system/files/working_papers/w33686/w33686.pdf']

Parsed problematic publication page:
Title: Global Networks, Monetary Policy and Trade
Authors: Kalemli-Özcan, Ṣ., Soylu, C. & Yildirim, M.A.
Year: 2025
No abstract
URL: https://growthlab.hks.harvard.edu/publications/global-networks-monetary-policy-and-trade
File URLs: ['https://www.nber.org/system/files/working_papers/w33686/w33686.pdf']


In [34]:
import asyncio
import logging
import re
import json
from pathlib import Path
from typing import Any, Dict, List, Optional
from pprint import pprint

import aiohttp
from bs4 import BeautifulSoup
import nest_asyncio
nest_asyncio.apply()  # This allows running async code in Jupyter

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Define the selector configuration with primary selectors, fallbacks, and XPath alternatives
SELECTOR_CONFIG = {
    "publication": {
        "container": {
            "primary": "div.biblio-entry", 
            "fallbacks": ["div.node-biblio", "article.publication", "div.publication-item"],
            "xpath": "//div[contains(@class, 'biblio-entry')]",
            "description": "Publication container"
        },
        "title": {
            "primary": "span.biblio-title", 
            "fallbacks": ["h1.page-title", "h2.publication-title", "h3.title", "div.title"],
            "xpath": "//span[contains(@class, 'biblio-title')] | //h1[contains(@class, 'page-title')]",
            "description": "Publication title"
        },
        "authors": {
            "primary": "span.biblio-authors", 
            "fallbacks": [
                "div.field-name-field-biblio-authors .field-item",
                "div.authors", 
                "p.author-list", 
                "div.publication-authors"
            ],
            "xpath": "//span[contains(@class, 'biblio-authors')] | //div[contains(@class, 'field-name-field-biblio-authors')]//div[@class='field-item']",
            "description": "Publication authors"
        },
        "abstract": {
            "primary": "div.biblio-abstract-display", 
            "fallbacks": [
                "div.field-name-field-abstract",
                "div.abstract", 
                "div.publication-abstract", 
                "p.abstract"
            ],
            "xpath": "//div[contains(@class, 'biblio-abstract-display')] | //div[contains(@class, 'field-name-field-abstract')]",
            "description": "Publication abstract"
        },
        "file": {
            "primary": "span.file", 
            "fallbacks": [
                "#pub-cover-content-wrapper a",  # SelectorGadget discovery
                ".Z3988+ a",                     # Element after Z3988
                "a.biblio-download", 
                "a[href$='.pdf']",
                "a[href*='files']"
            ],
            "xpath": "//span[contains(@class, 'file')] | //*[@id='pub-cover-content-wrapper']//a | //*[contains(@class, 'Z3988')]/following-sibling::a | //a[contains(@href, '.pdf')]",
            "description": "Publication files"
        },
        "citation": {
            "primary": ".biblio-citation",
            "fallbacks": ["div.field-name-field-citation", "div.citation"],
            "xpath": "//div[contains(@class, 'biblio-citation')] | //div[contains(@class, 'field-name-field-citation')]",
            "description": "Citation information"
        }
    },
    "pagination": {
        "container": {
            "primary": "ul.pager",
            "fallbacks": ["div.pagination", "nav.pagination"],
            "xpath": "//ul[contains(@class, 'pager')]",
            "description": "Pagination container"
        },
        "last_page": {
            "primary": "li.pager-last",
            "fallbacks": ["li.page-item:last-child", "a.page-link:last-child"],
            "xpath": "//li[contains(@class, 'pager-last')]",
            "description": "Last page link"
        }
    },
    "endnote": {
        "link": {
            "primary": "li.biblio_tagged a",
            "fallbacks": ["a[href*='tagged=1']", "a[href*='endnote']", "a.endnote-link"],
            "xpath": "//a[contains(@href, 'tagged=1')] | //a[contains(@href, 'endnote')]",
            "description": "Endnote link"
        }
    }
}

# Create a simple placeholder for GrowthLabPublication
class SimpleGrowthLabPublication(dict):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        
    def generate_id(self):
        # Simple hash based on title and authors
        title = self.get("title", "")
        authors = self.get("authors", "")
        return f"pub_{hash(title + authors) % 10000}"
        
    def generate_content_hash(self):
        # Simple hash of content
        content = json.dumps(self, sort_keys=True)
        return f"hash_{hash(content) % 10000}"
        
    @property
    def paper_id(self):
        return self.get("paper_id", "")
        
    @paper_id.setter
    def paper_id(self, value):
        self["paper_id"] = value
        
    @property
    def content_hash(self):
        return self.get("content_hash", "")
        
    @content_hash.setter
    def content_hash(self, value):
        self["content_hash"] = value
        
    @property
    def title(self):
        return self.get("title", "")
        
    @title.setter
    def title(self, value):
        self["title"] = value
        
    @property
    def authors(self):
        return self.get("authors", "")
        
    @authors.setter
    def authors(self, value):
        self["authors"] = value
        
    @property
    def abstract(self):
        return self.get("abstract", "")
        
    @abstract.setter
    def abstract(self, value):
        self["abstract"] = value
        
    @property
    def pub_url(self):
        return self.get("pub_url", "")
    
    @pub_url.setter
    def pub_url(self, value):
        self["pub_url"] = value
        
    @property
    def file_urls(self):
        return self.get("file_urls", [])
    
    @file_urls.setter
    def file_urls(self, value):
        self["file_urls"] = value
    
    @property
    def year(self):
        return self.get("year")
    
    @year.setter
    def year(self, value):
        self["year"] = value

class SelectorMonitor:
    """Class to monitor selector performance and detect failures"""
    
    def __init__(self, selectors=None):
        self.selectors = selectors or SELECTOR_CONFIG
        self.stats = {
            "total_pages": 0,
            "total_publications": 0,
            "selector_success": {},
            "selector_failure": {},
            "alerts": []
        }
        
        # Initialize stats for each selector
        for section, section_config in self.selectors.items():
            for name, selector_config in section_config.items():
                key = f"{section}.{name}"
                self.stats["selector_success"][key] = 0
                self.stats["selector_failure"][key] = 0
    
    def record_success(self, section, name, used_selector=None):
        """Record a successful selector use"""
        key = f"{section}.{name}"
        if key in self.stats["selector_success"]:
            self.stats["selector_success"][key] += 1
            
            # Record which selector was actually used
            if used_selector:
                if not hasattr(self, "selector_usage"):
                    self.selector_usage = {}
                
                if key not in self.selector_usage:
                    self.selector_usage[key] = {}
                
                self.selector_usage[key][used_selector] = self.selector_usage[key].get(used_selector, 0) + 1
    
    def record_failure(self, section, name):
        """Record a failed selector use"""
        key = f"{section}.{name}"
        if key in self.stats["selector_failure"]:
            self.stats["selector_failure"][key] += 1
            
            # Check if failure rate is high enough to trigger alert
            total = self.stats["selector_success"][key] + self.stats["selector_failure"][key]
            if total >= 5:  # Only check after a minimum sample
                failure_rate = self.stats["selector_failure"][key] / total
                if failure_rate > 0.5:  # Alert if more than 50% failure
                    self.create_alert(section, name, failure_rate)
    
    def create_alert(self, section, name, failure_rate):
        """Create an alert for a failing selector"""
        selector_config = self.selectors[section][name]
        alert = {
            "timestamp": None,  # Would use datetime in real implementation
            "selector": f"{section}.{name}",
            "failure_rate": failure_rate,
            "primary": selector_config["primary"],
            "fallbacks": selector_config["fallbacks"],
            "message": f"Selector {section}.{name} is failing at a rate of {failure_rate:.2%}"
        }
        
        # Check if we already have an alert for this selector
        existing_alerts = [a for a in self.stats["alerts"] if a["selector"] == alert["selector"]]
        if not existing_alerts:
            self.stats["alerts"].append(alert)
            logger.warning(f"SELECTOR ALERT: {alert['message']}")
    
    def record_page_processed(self):
        """Record that a page was processed"""
        self.stats["total_pages"] += 1
    
    def record_publication_processed(self):
        """Record that a publication was processed"""
        self.stats["total_publications"] += 1
    
    def check_selector_health(self):
        """Check the health of all selectors"""
        print("\n🔍 Selector Health Check:")
        
        for section, section_config in self.selectors.items():
            print(f"\n{section.upper()} Selectors:")
            
            for name, selector_config in section_config.items():
                key = f"{section}.{name}"
                success = self.stats["selector_success"].get(key, 0)
                failure = self.stats["selector_failure"].get(key, 0)
                total = success + failure
                
                if total > 0:
                    success_rate = success / total
                    status = "✅ GOOD" if success_rate >= 0.9 else "⚠️ WARNING" if success_rate >= 0.5 else "❌ FAILING"
                    print(f"  - {key}: {status} ({success}/{total}, {success_rate:.1%})")
                else:
                    print(f"  - {key}: ⚪ NO DATA")
    
    def print_selector_usage(self):
        """Print which selectors were actually used"""
        if not hasattr(self, "selector_usage"):
            print("\nNo selector usage data available")
            return
            
        print("\n📊 Selector Usage Statistics:")
        
        for key, usage in self.selector_usage.items():
            print(f"\n{key}:")
            total_uses = sum(usage.values())
            
            for selector, count in sorted(usage.items(), key=lambda x: x[1], reverse=True):
                percentage = count / total_uses * 100
                print(f"  - {selector}: {count} times ({percentage:.1f}%)")
    
    def generate_report(self):
        """Generate a full report of selector performance"""
        print("\n📊 Selector Performance Report")
        print(f"Pages processed: {self.stats['total_pages']}")
        print(f"Publications processed: {self.stats['total_publications']}")
        
        # Calculate overall selector success rate
        total_success = sum(self.stats["selector_success"].values())
        total_failure = sum(self.stats["selector_failure"].values())
        total_attempts = total_success + total_failure
        
        if total_attempts > 0:
            overall_rate = total_success / total_attempts
            print(f"Overall selector success rate: {overall_rate:.2%}")
        
        # Print selector-specific stats
        print("\nSelector Performance:")
        for key in sorted(self.stats["selector_success"].keys()):
            success = self.stats["selector_success"][key]
            failure = self.stats["selector_failure"][key]
            total = success + failure
            
            if total > 0:
                rate = success / total
                status = "✅ GOOD" if rate >= 0.9 else "⚠️ WARNING" if rate >= 0.5 else "❌ FAILING"
                print(f"  - {key}: {status} {rate:.2%} success ({success}/{total})")
        
        # Print selector usage statistics
        self.print_selector_usage()

# Simplified retry function
async def retry_with_backoff(func, *args, max_retries=3, base_delay=1.0, max_delay=30.0, retry_on=Exception, **kwargs):
    """Retry a function with exponential backoff"""
    retries = 0
    while True:
        try:
            return await func(*args, **kwargs)
        except retry_on as e:
            if retries >= max_retries:
                raise
            delay = min(base_delay * (2 ** retries), max_delay)
            logger.info(f"Retrying after error: {e}. Attempt {retries+1}/{max_retries}. Waiting {delay:.2f}s...")
            await asyncio.sleep(delay)
            retries += 1

# Selector utility functions
def find_with_selectors(element, selector_config, monitor=None, section=None, name=None):
    """Find an element using selectors with fallbacks
    
    Args:
        element: BeautifulSoup element to search within
        selector_config: Configuration with primary, fallbacks, xpath
        monitor: Optional SelectorMonitor to record success/failure
        section: Section name for monitoring
        name: Selector name for monitoring
    
    Returns:
        Tuple of (element, selector_used) or (None, None) if not found
    """
    # Try primary selector
    if selector_config["primary"]:
        try:
            results = element.select(selector_config["primary"])
            if results:
                if monitor and section and name:
                    monitor.record_success(section, name, selector_config["primary"])
                return results[0], selector_config["primary"]
        except Exception as e:
            logger.debug(f"Error with primary selector {selector_config['primary']}: {e}")
    
    # Try fallbacks
    for fallback in selector_config.get("fallbacks", []):
        try:
            results = element.select(fallback)
            if results:
                if monitor and section and name:
                    monitor.record_success(section, name, fallback)
                return results[0], fallback
        except Exception as e:
            logger.debug(f"Error with fallback selector {fallback}: {e}")
    
    # Try XPath as last resort
    if "xpath" in selector_config and selector_config["xpath"]:
        try:
            import lxml.html
            from lxml import etree
            
            # Parse the HTML of the element
            dom = lxml.html.fromstring(str(element))
            xpath_results = dom.xpath(selector_config["xpath"])
            
            if xpath_results:
                if monitor and section and name:
                    monitor.record_success(section, name, "xpath")
                
                # Convert back to BeautifulSoup for consistency
                result_html = etree.tostring(xpath_results[0])
                result_soup = BeautifulSoup(result_html, "html.parser")
                if result_soup.contents:
                    return result_soup.contents[0], "xpath"
        except ImportError:
            logger.debug("lxml not available for XPath queries")
        except Exception as e:
            logger.debug(f"Error with XPath: {e}")
    
    # Record failure if we got here
    if monitor and section and name:
        monitor.record_failure(section, name)
    
    return None, None

def find_all_with_selectors(element, selector_config, monitor=None, section=None, name=None):
    """Find all elements matching selectors with fallbacks
    
    Returns:
        Tuple of (elements, selector_used) or ([], None) if none found
    """
    # Try primary selector
    if selector_config["primary"]:
        try:
            results = element.select(selector_config["primary"])
            if results:
                if monitor and section and name:
                    monitor.record_success(section, name, selector_config["primary"])
                return results, selector_config["primary"]
        except Exception as e:
            logger.debug(f"Error with primary selector {selector_config['primary']}: {e}")
    
    # Try fallbacks
    for fallback in selector_config.get("fallbacks", []):
        try:
            results = element.select(fallback)
            if results:
                if monitor and section and name:
                    monitor.record_success(section, name, fallback)
                return results, fallback
        except Exception as e:
            logger.debug(f"Error with fallback selector {fallback}: {e}")
    
    # Try XPath
    if "xpath" in selector_config and selector_config["xpath"]:
        try:
            import lxml.html
            from lxml import etree
            
            dom = lxml.html.fromstring(str(element))
            xpath_results = dom.xpath(selector_config["xpath"])
            
            if xpath_results:
                if monitor and section and name:
                    monitor.record_success(section, name, "xpath")
                
                bs_results = []
                for result in xpath_results:
                    result_html = etree.tostring(result)
                    result_soup = BeautifulSoup(result_html, "html.parser")
                    if result_soup.contents:
                        bs_results.append(result_soup.contents[0])
                
                if bs_results:
                    return bs_results, "xpath"
        except ImportError:
            logger.debug("lxml not available for XPath queries")
        except Exception as e:
            logger.debug(f"Error with XPath: {e}")
    
    # Record failure
    if monitor and section and name:
        monitor.record_failure(section, name)
    
    return [], None

# Extract components from citation
def extract_from_citation(citation_text):
    """Extract author, year, and title from a citation string"""
    result = {}
    
    if not citation_text:
        return result
    
    # Try to extract year
    year_match = re.search(r'(\d{4})\. ', citation_text)
    if year_match:
        year_pos = year_match.start()
        after_year = year_match.end()
        
        # Author is everything before the year
        result["authors"] = citation_text[:year_pos].strip()
        if result["authors"].endswith(','):
            result["authors"] = result["authors"][:-1]
        
        # Title is everything after the year until next period or end
        title_end = citation_text.find('.', after_year)
        if title_end > after_year:
            result["title"] = citation_text[after_year:title_end].strip()
        else:
            result["title"] = citation_text[after_year:].strip()
        
        # Extract year as integer
        try:
            result["year"] = int(year_match.group(1))
        except:
            pass
    
    return result

# Function to parse a publication with our selector configuration
def parse_publication(element, base_url, monitor=None, year_corrections=None):
    """Parse a publication using configured selectors with fallbacks"""
    # Initialize the publication
    publication = SimpleGrowthLabPublication(
        title=None,
        authors=None,
        year=None,
        abstract=None,
        pub_url=None,
        file_urls=[],
        source="GrowthLab"
    )
    
    # Record that we're processing a publication
    if monitor:
        monitor.record_publication_processed()
    
    # 1. Find title using selectors
    title_element, title_selector = find_with_selectors(
        element, 
        SELECTOR_CONFIG["publication"]["title"],
        monitor, "publication", "title"
    )
    
    if title_element:
        publication.title = title_element.text.strip()
        
        # Get publication URL from title link
        title_link = title_element.find("a")
        if title_link and title_link.get("href"):
            pub_url = title_link["href"]
            # Ensure URL is absolute
            if not pub_url.startswith(("http://", "https://")):
                pub_url = f"{base_url}{pub_url}" if not pub_url.startswith("/") else f"{base_url.rstrip('/')}{pub_url}"
            publication.pub_url = pub_url
    
    # 2. Find authors
    authors_element, authors_selector = find_with_selectors(
        element, 
        SELECTOR_CONFIG["publication"]["authors"],
        monitor, "publication", "authors"
    )
    
    if authors_element:
        publication.authors = authors_element.text.strip()
    
    # 3. Extract year from text after authors
    if authors_element:
        sibling_text = authors_element.next_sibling
        if sibling_text:
            year_match = re.search(r"\b(19|20)\d{2}\b", str(sibling_text))
            if year_match:
                publication.year = int(year_match.group())
    
    # 4. Find abstract
    abstract_element, abstract_selector = find_with_selectors(
        element, 
        SELECTOR_CONFIG["publication"]["abstract"],
        monitor, "publication", "abstract"
    )
    
    if abstract_element:
        publication.abstract = abstract_element.text.strip()
    
    # 5. Find file URLs
    file_elements, file_selector = find_all_with_selectors(
        element, 
        SELECTOR_CONFIG["publication"]["file"],
        monitor, "publication", "file"
    )
    
    file_urls = set()  # Use set to avoid duplicates
    
    for elem in file_elements:
        if elem.name == "a" and elem.get("href"):
            # Direct link
            file_url = elem["href"]
            if not file_url.startswith(("http://", "https://")):
                file_url = f"{base_url}{file_url}" if not file_url.startswith("/") else f"{base_url.rstrip('/')}{file_url}"
            file_urls.add(file_url)
        else:
            # Container with links
            for link in elem.find_all("a"):
                if link.get("href"):
                    file_url = link["href"]
                    if not file_url.startswith(("http://", "https://")):
                        file_url = f"{base_url}{file_url}" if not file_url.startswith("/") else f"{base_url.rstrip('/')}{file_url}"
                    file_urls.add(file_url)
    
    publication.file_urls = list(file_urls)
    
    # 6. FALLBACK: If critical fields are still missing, try citation
    if not publication.title or not publication.authors or not publication.year:
        citation_element, citation_selector = find_with_selectors(
            element, 
            SELECTOR_CONFIG["publication"]["citation"],
            monitor, "publication", "citation"
        )
        
        if citation_element:
            citation_text = citation_element.text.strip()
            citation_data = extract_from_citation(citation_text)
            
            # Only use citation data for missing fields
            if not publication.title and "title" in citation_data:
                publication.title = citation_data["title"]
            if not publication.authors and "authors" in citation_data:
                publication.authors = citation_data["authors"]
            if not publication.year and "year" in citation_data:
                publication.year = citation_data["year"]
    
    # 7. Apply year corrections if needed
    if publication.pub_url and year_corrections and publication.pub_url in year_corrections:
        publication.year = year_corrections[publication.pub_url]
    
    # Generate stable ID and content hash
    publication.paper_id = publication.generate_id()
    publication.content_hash = publication.generate_content_hash()
    
    return publication

# Class for scraping Growth Lab website
class GrowthLabScraperWithSelectors:
    """Enhanced Growth Lab scraper with configurable selectors"""
    
    def __init__(self, config=None):
        """Initialize the scraper with configuration"""
        self.config = config or {
            "base_url": "https://growthlab.hks.harvard.edu/publications",
            "scrape_delay": 1.0,  # Reduced for testing
            "concurrency_limit": 2,  # Reduced for testing
            "max_retries": 3,
            "retry_base_delay": 1.0,
            "retry_max_delay": 10.0,
        }
        
        self.base_url = self.config["base_url"]
        self.scrape_delay = self.config["scrape_delay"]
        self.concurrency_limit = self.config["concurrency_limit"]
        self.semaphore = asyncio.Semaphore(self.concurrency_limit)
        self.monitor = SelectorMonitor(SELECTOR_CONFIG)
        
        # Sample year corrections
        self.year_corrections = {
            "https://growthlab.hks.harvard.edu/publications/sri-lanka-growth-diagnostic": 2018,
            "https://growthlab.hks.harvard.edu/publications/recommendations-trade-adjustment-assistance-sri-lanka": 2017,
            "https://growthlab.hks.harvard.edu/publications/immigration-policy-research": 2017,
            "https://growthlab.hks.harvard.edu/publications/sri-lanka%E2%80%99s-edible-oils-exports": 2016,
            "https://growthlab.hks.harvard.edu/publications/economic-complexity-brief": 2013,
            "https://growthlab.hks.harvard.edu/publications/journey-through-time-story-behind-%E2%80%98eight-decades-changes-occupational-tasks": 2024,
        }
    
    async def get_max_page_num(self, session):
        """Get the maximum page number from pagination"""
        self.monitor.record_page_processed() 
        
        async with session.get(self.base_url) as response:
            html = await response.text()
            soup = BeautifulSoup(html, "html.parser")
            
            # Find pagination container
            pagination, pagination_selector = find_with_selectors(
                soup, 
                SELECTOR_CONFIG["pagination"]["container"],
                self.monitor, "pagination", "container"
            )
            
            if not pagination:
                logger.warning("No pagination container found")
                return 0
            
            # Find last page link
            last_page, last_page_selector = find_with_selectors(
                pagination, 
                SELECTOR_CONFIG["pagination"]["last_page"],
                self.monitor, "pagination", "last_page"
            )
            
            if not last_page or not last_page.find("a"):
                logger.warning("No last page link found")
                return 0
            
            # Extract page number from URL
            last_page_url = last_page.find("a")["href"]
            match = re.search(r"page=(\d+)", last_page_url)
            
            if match:
                return int(match.group(1))
            else:
                return 0
    
    async def fetch_page(self, session, page_num):
        """Fetch a single page of publications"""
        self.monitor.record_page_processed()
        
        url = self.base_url if page_num == 0 else f"{self.base_url}?page={page_num}"
        logger.info(f"Fetching page {page_num}: {url}")
        
        # Use the semaphore to limit concurrency
        async with self.semaphore:
            async with session.get(url) as response:
                if response.status != 200:
                    logger.error(f"Error fetching page {page_num}: {response.status}")
                    return []
                
                html = await response.text()
                soup = BeautifulSoup(html, "html.parser")
                
                # Find all publication containers
                containers, container_selector = find_all_with_selectors(
                    soup, 
                    SELECTOR_CONFIG["publication"]["container"],
                    self.monitor, "publication", "container"
                )
                
                if not containers:
                    logger.warning(f"No publications found on page {page_num}")
                    return []
                
                logger.info(f"Found {len(containers)} publications on page {page_num}")
                
                # Parse each publication
                publications = []
                for container in containers:
                    publication = parse_publication(
                        container, 
                        self.base_url.split("/publications")[0],
                        self.monitor,
                        self.year_corrections
                    )
                    publications.append(publication)
                
                # Add a delay to prevent overwhelming the server
                await asyncio.sleep(self.scrape_delay)
                
                return publications
    
    async def extract_publications(self, max_pages=None):
        """Extract publications from all pages or a specified number of pages"""
        # Set up a robust session
        timeout = aiohttp.ClientTimeout(total=30, connect=10, sock_connect=10, sock_read=10)
        
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
        }
        
        async with aiohttp.ClientSession(headers=headers, timeout=timeout) as session:
            # Get the maximum page number
            max_page = await self.get_max_page_num(session)
            logger.info(f"Maximum page number: {max_page}")
            
            # Limit the number of pages if specified
            if max_pages is not None:
                max_page = min(max_page, max_pages - 1)  # -1 because we start from page 0
            
            # Create tasks for fetching pages
            tasks = [self.fetch_page(session, page) for page in range(max_page + 1)]
            
            # Process all pages and collect publications
            all_publications = []
            for future in asyncio.as_completed(tasks):
                try:
                    publications = await future
                    all_publications.extend(publications)
                except Exception as e:
                    logger.error(f"Error processing page: {e}")
            
            logger.info(f"Total publications extracted: {len(all_publications)}")
            return all_publications
    
    def generate_report(self):
        """Generate a report on the scraping results"""
        self.monitor.generate_report()

In [35]:
# Function to test the scraper across all pages
async def test_scraper_across_pages(max_pages=None):
    """Run the scraper and generate a report across multiple pages"""
    print(f"🧪 Testing scraper across {'all' if max_pages is None else max_pages} pages")
    
    # Create a scraper instance
    scraper = GrowthLabScraperWithSelectors()
    
    # Extract publications
    publications = await scraper.extract_publications(max_pages)
    
    # Count publications with missing fields
    missing_title = sum(1 for pub in publications if not pub.get("title"))
    missing_authors = sum(1 for pub in publications if not pub.get("authors"))
    missing_year = sum(1 for pub in publications if not pub.get("year"))
    missing_abstract = sum(1 for pub in publications if not pub.get("abstract"))
    missing_file_urls = sum(1 for pub in publications if not pub.get("file_urls") or len(pub.get("file_urls", [])) == 0)
    
    # Print statistics
    print(f"\n📊 Publication Statistics (Total: {len(publications)})")
    print(f"  - Missing titles: {missing_title} ({missing_title/len(publications)*100:.1f}%)")
    print(f"  - Missing authors: {missing_authors} ({missing_authors/len(publications)*100:.1f}%)")
    print(f"  - Missing year: {missing_year} ({missing_year/len(publications)*100:.1f}%)")
    print(f"  - Missing abstract: {missing_abstract} ({missing_abstract/len(publications)*100:.1f}%)")
    print(f"  - Missing file URLs: {missing_file_urls} ({missing_file_urls/len(publications)*100:.1f}%)")
    
    # Sample of successful publications
    print("\n✅ Sample of Complete Publications:")
    complete_pubs = [p for p in publications if p.get("title") and p.get("authors") and p.get("year") and p.get("abstract")]
    for i, pub in enumerate(complete_pubs[:3]):
        print(f"\n{i+1}. {pub.get('title', 'No title')}")
        print(f"   Authors: {pub.get('authors', 'No authors')}")
        print(f"   Year: {pub.get('year', 'No year')}")
        print(f"   Abstract: {pub.get('abstract', 'No abstract')[:100]}..." if pub.get('abstract') else "   No abstract")
        print(f"   URL: {pub.get('pub_url', 'No URL')}")
        print(f"   File URLs: {pub.get('file_urls', [])}")
    
    # Sample of publications with missing fields
    print("\n⚠️ Sample of Publications with Missing Fields:")
    incomplete_pubs = [p for p in publications if not p.get("title") or not p.get("authors") or not p.get("year") or not p.get("abstract")]
    for i, pub in enumerate(incomplete_pubs[:3]):
        print(f"\n{i+1}. {pub.get('title', 'No title')}")
        print(f"   Authors: {pub.get('authors', 'No authors')}")
        print(f"   Year: {pub.get('year', 'No year')}")
        print(f"   Abstract: {pub.get('abstract', 'No abstract')[:100]}..." if pub.get('abstract') else "   No abstract")
        print(f"   URL: {pub.get('pub_url', 'No URL')}")
        print(f"   Missing: " + ", ".join([
            "title" if not pub.get("title") else "",
            "authors" if not pub.get("authors") else "",
            "year" if not pub.get("year") else "", 
            "abstract" if not pub.get("abstract") else ""
        ]).replace(", ,", ",").strip(", "))
    
    # Generate selector monitoring report
    print("\n📋 Selector Performance Report")
    scraper.generate_report()
    
    return publications, scraper.monitor

# Test functionality to check a specific selector on all pages
async def test_specific_selector(selector_section, selector_name, max_pages=3):
    """Test a specific selector across multiple pages"""
    print(f"🧪 Testing {selector_section}.{selector_name} selector across {max_pages} pages")
    
    # Set up session
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    }
    
    selector_config = SELECTOR_CONFIG[selector_section][selector_name]
    base_url = "https://growthlab.hks.harvard.edu"
    publications_url = f"{base_url}/publications"
    
    monitor = SelectorMonitor(SELECTOR_CONFIG)
    results = {
        "success": 0,
        "failure": 0,
        "selectors_used": {}
    }
    
    async with aiohttp.ClientSession(headers=headers) as session:
        # First get max page number
        async with session.get(publications_url) as response:
            html = await response.text()
            soup = BeautifulSoup(html, "html.parser")
            
            pagination = soup.select("ul.pager")
            max_page = 0
            
            if pagination:
                last_page = pagination[0].select("li.pager-last a")
                if last_page:
                    last_page_url = last_page[0]["href"]
                    match = re.search(r"page=(\d+)", last_page_url)
                    if match:
                        max_page = int(match.group(1))
        
        # Limit to specified max_pages
        max_page = min(max_page, max_pages - 1)
        
        # Test on multiple pages
        for page in range(max_page + 1):
            page_url = publications_url if page == 0 else f"{publications_url}?page={page}"
            print(f"Testing page {page}: {page_url}")
            
            async with session.get(page_url) as response:
                html = await response.text()
                soup = BeautifulSoup(html, "html.parser")
                
                # Find containers
                containers = soup.select(SELECTOR_CONFIG["publication"]["container"]["primary"])
                
                if not containers:
                    print(f"No publication containers found on page {page}")
                    continue
                
                print(f"Found {len(containers)} containers on page {page}")
                
                # Test the specified selector on each container
                for i, container in enumerate(containers):
                    # Get title for context
                    title_elem = container.select("span.biblio-title")
                    title = title_elem[0].text.strip() if title_elem else f"Publication {i+1}"
                    
                    # Try to find the element using our selector with fallbacks
                    element, selector_used = find_with_selectors(
                        container, 
                        selector_config,
                        monitor, selector_section, selector_name
                    )
                    
                    if element:
                        results["success"] += 1
                        if selector_used:
                            results["selectors_used"][selector_used] = results["selectors_used"].get(selector_used, 0) + 1
                        print(f"✅ Found {selector_section}.{selector_name} for '{title[:30]}...' using {selector_used}")
                    else:
                        results["failure"] += 1
                        print(f"❌ Failed to find {selector_section}.{selector_name} for '{title[:30]}...'")
            
            # Add a small delay
            await asyncio.sleep(1)
    
    # Print results
    total = results["success"] + results["failure"]
    success_rate = results["success"] / total if total > 0 else 0
    
    print(f"\n📊 Results for {selector_section}.{selector_name}:")
    print(f"  - Success rate: {success_rate:.1%} ({results['success']}/{total})")
    
    if results["selectors_used"]:
        print("  - Selectors used:")
        for selector, count in sorted(results["selectors_used"].items(), key=lambda x: x[1], reverse=True):
            percentage = count / results["success"] * 100 if results["success"] > 0 else 0
            print(f"    - {selector}: {count} times ({percentage:.1f}%)")
    
    return results

# Test multiple pages and get comprehensive results
async def run_comprehensive_test(max_pages=5):
    """Run a comprehensive test of the scraper"""
    print(f"🔍 Running comprehensive test across {max_pages} pages")
    
    # Test main scraper
    publications, monitor = await test_scraper_across_pages(max_pages)
    
    # Test the most critical selectors specifically
    print("\n🧪 Testing critical selectors individually")
    
    title_results = await test_specific_selector("publication", "title", max_pages)
    authors_results = await test_specific_selector("publication", "authors", max_pages)
    file_results = await test_specific_selector("publication", "file", max_pages)
    
    # Print final assessment
    print("\n🏆 Final Assessment")
    print("Critical selectors performance:")
    print(f"- Title: {title_results['success']/(title_results['success']+title_results['failure'])*100:.1f}% success")
    print(f"- Authors: {authors_results['success']/(authors_results['success']+authors_results['failure'])*100:.1f}% success")
    print(f"- File URLs: {file_results['success']/(file_results['success']+file_results['failure'])*100:.1f}% success")
    
    return publications, monitor

# Run a demo on a few pages
async def run_demo():
    """Run a demo of the scraper on a few pages"""
    # Test a few pages for demonstration
    MAX_PAGES = 3  # Set the maximum number of pages to test
    publications, monitor = await run_comprehensive_test(MAX_PAGES)
    
    # Print a sample of publications
    print("\n📚 Sample Publications:")
    for i, pub in enumerate(publications[:5]):
        print(f"\n{i+1}. {pub.get('title', 'No title')}")
        print(f"   Authors: {pub.get('authors', 'No authors')}")
        print(f"   Year: {pub.get('year', 'No year')}")
        print(f"   Abstract: {pub.get('abstract', 'No abstract')[:70]}..." if pub.get('abstract') else "   No abstract")
        
    print("\n✅ Demo completed!")
    return publications, monitor

# Execute the demo
publications, monitor = await run_demo()


🔍 Running comprehensive test across 3 pages
🧪 Testing scraper across 3 pages


INFO:__main__:Maximum page number: 21
INFO:__main__:Fetching page 0: https://growthlab.hks.harvard.edu/publications
INFO:__main__:Fetching page 2: https://growthlab.hks.harvard.edu/publications?page=2
INFO:__main__:Fetching page 1: https://growthlab.hks.harvard.edu/publications?page=1
INFO:__main__:Found 20 publications on page 0
INFO:__main__:Found 20 publications on page 2
INFO:__main__:Found 20 publications on page 1
ERROR:__main__:Error processing page: can only concatenate str (not "NoneType") to str
INFO:__main__:Total publications extracted: 40



📊 Publication Statistics (Total: 40)
  - Missing titles: 0 (0.0%)
  - Missing authors: 0 (0.0%)
  - Missing year: 0 (0.0%)
  - Missing abstract: 0 (0.0%)
  - Missing file URLs: 0 (0.0%)

✅ Sample of Complete Publications:

1. Global Networks, Monetary Policy and Trade
   Authors: Kalemli-Özcan, Ṣ., Soylu, C. & Yildirim, M.A.
   Year: 2025
   Abstract: We develop a novel framework to study the interaction between monetary policy and trade. Our New Key...
   URL: https://growthlab.hks.harvard.edu/publications/global-networks-monetary-policy-and-trade
   File URLs: ['https://www.nber.org/system/files/working_papers/w33686/w33686.pdf']

2. Industrial policy for competitiveness in the energy transition
   Authors: Ahuja, K. & Hausmann, R.
   Year: 2025
   Abstract: Green objectives have reshaped public policy worldwide since the signing in 2015 of the Paris Agreem...
   URL: https://growthlab.hks.harvard.edu/publications/industrial-policy-competitiveness-energy-transition
   File URLs: ['h

Second section is more on endnote

In [30]:
# CHUNK 5: Get Endnote URL for a Publication

async def get_endnote_file_url(publication_url):
    """Find the Endnote file URL for a specific publication"""
    if not publication_url:
        print("No publication URL provided")
        return None
    
    print(f"Fetching publication page: {publication_url}")
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    }
    
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(publication_url) as response:
            if response.status != 200:
                print(f"Error fetching publication page: {response.status}")
                return None
            
            html = await response.text()
            soup = BeautifulSoup(html, "html.parser")
            endnote_link = soup.find("li", class_="biblio_tagged")
            
            if not endnote_link or not endnote_link.find("a"):
                print("No Endnote link found on the page")
                return None
            
            endnote_url = endnote_link.find("a")["href"]
            
            # Ensure URL is absolute
            if not endnote_url.startswith(("http://", "https://")):
                endnote_url = f"{scraper.base_url.split('/publications')[0]}{endnote_url}"
            
            print(f"Found Endnote URL: {endnote_url}")
            return endnote_url

# Run the function if we have a publication
if publication and publication['pub_url']:
    endnote_url = await get_endnote_file_url(publication['pub_url'])
else:
    print("Cannot get Endnote URL - no publication URL available")
    endnote_url = None

Fetching publication page: https://growthlab.hks.harvard.edu/publications/global-networks-monetary-policy-and-trade
Found Endnote URL: https://growthlab.hks.harvard.edu/publications/export/tagged/1942811


In [None]:
# CHUNK 6: Download and Parse Endnote Content

async def download_and_parse_endnote(endnote_url):
    """Download and parse the content of an Endnote file"""
    if not endnote_url:
        print("No Endnote URL provided")
        return None
    
    print(f"Downloading Endnote file from: {endnote_url}")
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        "Accept": "text/plain,application/octet-stream",
    }
    
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(endnote_url) as response:
            if response.status != 200:
                print(f"Error downloading Endnote file: {response.status}")
                return None
            
            content = await response.text()
            print(f"Downloaded Endnote file: {len(content)} bytes")
            
            # Print a sample of the content
            print("\nEndnote file content (first 200 chars):")
            print(content[:200] + "..." if len(content) > 200 else content)
            
            # Parse the content (similar to parse_endnote_content method)
            record = {}
            lines = content.split("\n")
            
            print("\nParsing Endnote content...")
            for line in lines[:10]:  # Just show first 10 lines
                if line.startswith("%"):
                    print(f"Processing line: {line}")
                    key = line[1]
                    value = line[3:].strip() if len(line) > 3 else ""
                    
                    if key == "A":  # Author
                        name_parts = value.split(", ")
                        if len(name_parts) == 2:
                            value = f"{name_parts[1]} {name_parts[0]}"
                        print(f"- Author: {value}")
                        record["author"] = record.get("author", []) + [value]
                    elif key == "T":  # Title
                        print(f"- Title: {value}")
                        record["title"] = value
                    elif key == "D":  # Date
                        print(f"- Date: {value}")
                        record["date"] = value
                    elif key == "X":  # Abstract
                        print(f"- Abstract: {value[:50]}..." if len(value) > 50 else f"- Abstract: {value}")
                        
                        # Check if value contains HTML
                        if "<" in value and ">" in value:
                            print("  Abstract contains HTML, parsing...")
                            soup = BeautifulSoup(value, "html.parser")
                            for tag in soup.find_all(["b", "strong"]):
                                tag.unwrap()
                            
                            abstract = "\n".join(
                                p.get_text(separator=" ", strip=True)
                                for p in soup.find_all("p")
                                if p.get_text(strip=True)
                            )
                            record["abstract"] = abstract.strip()
                        else:
                            print("  Abstract is plain text, no HTML parsing needed")
                            record["abstract"] = value
            
            # Join author list
            if "author" in record:
                record["author"] = ", ".join(record["author"])
            
            return record

# Run the function if we have an Endnote URL
if endnote_url:
    endnote_data = await download_and_parse_endnote(endnote_url)
    
    if endnote_data:
        print("\nParsed Endnote data:")
        for key, value in endnote_data.items():
            print(f"- {key}: {value[:100]}..." if len(value) > 100 else f"- {key}: {value}")
else:
    print("Cannot download Endnote - no URL available")

Step 1: Fetching publications page to find a publication URL...
Found publication URL: https://growthlab.hks.harvard.edu/publications/global-networks-monetary-policy-and-trade

Step 2: Fetching publication page to find Endnote link...
Found Endnote URL: https://growthlab.hks.harvard.edu/publications/export/tagged/1942811

Step 3: Fetching Endnote file...
Endnote file size: 1731 bytes

Raw Endnote Content (first 500 chars):
%0 Generic
%D 2025
%T Global Networks, Monetary Policy and Trade
%A Ṣebnem Kalemli-Özcan
%A Can Soylu
%A Muhammed A. Yildirim
%X We develop a novel framework to study the interaction between monetary policy and trade. Our New Keynesian open economy model incorporates international production networks, sectoral heterogeneity in price rigidities, and trade distortions. We decompose the general equilibrium response to trade shocks into distinct channels that account for demand shifts, policy ...

Step 4: Parsing Endnote content...

Processing Endnote lines:

Line: %0 Ge

In [8]:
# CHUNK 6: Download and Parse Endnote Content

async def download_and_parse_endnote(endnote_url):
    """Download and parse the content of an Endnote file"""
    if not endnote_url:
        print("No Endnote URL provided")
        return None
    
    print(f"Downloading Endnote file from: {endnote_url}")
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        "Accept": "text/plain,application/octet-stream",
    }
    
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(endnote_url) as response:
            if response.status != 200:
                print(f"Error downloading Endnote file: {response.status}")
                return None
            
            content = await response.text()
            print(f"Downloaded Endnote file: {len(content)} bytes")
            
            # Print a sample of the content
            print("\nEndnote file content (first 200 chars):")
            print(content[:200] + "..." if len(content) > 200 else content)
            
            # Parse the content (similar to parse_endnote_content method)
            record = {}
            lines = content.split("\n")
            
            print("\nParsing Endnote content...")
            for line in lines[:10]:  # Just show first 10 lines
                if line.startswith("%"):
                    print(f"Processing line: {line}")
                    key = line[1]
                    value = line[3:].strip() if len(line) > 3 else ""
                    
                    if key == "A":  # Author
                        name_parts = value.split(", ")
                        if len(name_parts) == 2:
                            value = f"{name_parts[1]} {name_parts[0]}"
                        print(f"- Author: {value}")
                        record["author"] = record.get("author", []) + [value]
                    elif key == "T":  # Title
                        print(f"- Title: {value}")
                        record["title"] = value
                    elif key == "D":  # Date
                        print(f"- Date: {value}")
                        record["date"] = value
                    elif key == "X":  # Abstract
                        print(f"- Abstract: {value[:50]}..." if len(value) > 50 else f"- Abstract: {value}")
                        
                        # Check if value contains HTML
                        if "<" in value and ">" in value:
                            print("  Abstract contains HTML, parsing...")
                            soup = BeautifulSoup(value, "html.parser")
                            for tag in soup.find_all(["b", "strong"]):
                                tag.unwrap()
                            
                            abstract = "\n".join(
                                p.get_text(separator=" ", strip=True)
                                for p in soup.find_all("p")
                                if p.get_text(strip=True)
                            )
                            record["abstract"] = abstract.strip()
                        else:
                            print("  Abstract is plain text, no HTML parsing needed")
                            record["abstract"] = value
            
            # Join author list
            if "author" in record:
                record["author"] = ", ".join(record["author"])
            
            return record

# Run the function if we have an Endnote URL
if endnote_url:
    endnote_data = await download_and_parse_endnote(endnote_url)
    
    if endnote_data:
        print("\nParsed Endnote data:")
        for key, value in endnote_data.items():
            print(f"- {key}: {value[:100]}..." if len(value) > 100 else f"- {key}: {value}")
else:
    print("Cannot download Endnote - no URL available")

Downloading Endnote file from: https://growthlab.hks.harvard.edu/publications/export/tagged/1942811
Downloaded Endnote file: 1731 bytes

Endnote file content (first 200 chars):
%0 Generic
%D 2025
%T Global Networks, Monetary Policy and Trade
%A Ṣebnem Kalemli-Özcan
%A Can Soylu
%A Muhammed A. Yildirim
%X We develop a novel framework to study the interaction between mon...

Parsing Endnote content...
Processing line: %0 Generic
Processing line: %D 2025
- Date: 2025
Processing line: %T Global Networks, Monetary Policy and Trade
- Title: Global Networks, Monetary Policy and Trade
Processing line: %A Ṣebnem Kalemli-Özcan
- Author: Ṣebnem Kalemli-Özcan
Processing line: %A Can Soylu
- Author: Can Soylu
Processing line: %A Muhammed A. Yildirim
- Author: Muhammed A. Yildirim
Processing line: %X We develop a novel framework to study the interaction between monetary policy and trade. Our New Keynesian open economy model incorporates international production networks, sectoral heterogeneity in pr

In [9]:
# CHUNK 7: Enrich a Publication with Endnote Data

async def enrich_publication_with_endnote(pub, endnote_data):
    """Update a publication with data from an Endnote file"""
    if not pub or not endnote_data:
        print("Missing publication or Endnote data")
        return pub
    
    print(f"Enriching publication: {pub['title']}")
    
    # Update publication with Endnote data if missing
    if not pub['title'] and "title" in endnote_data:
        pub['title'] = endnote_data["title"]
        print(f"- Updated title to: {pub['title']}")
    
    if not pub['authors'] and "author" in endnote_data:
        pub['authors'] = endnote_data["author"]
        print(f"- Updated authors to: {pub['authors']}")
    
    if not pub['abstract'] and "abstract" in endnote_data:
        pub['abstract'] = endnote_data["abstract"]
        print(f"- Updated abstract to: {pub['abstract'][:50]}..." if len(pub['abstract']) > 50 else f"- Updated abstract to: {pub['abstract']}")
    
    # Regenerate content hash after updates
    pub.content_hash = pub.generate_content_hash()
    print(f"- Updated content hash: {pub['content_hash']}")
    
    return pub

# Run the function if we have both publication and Endnote data
if publication and endnote_data:
    # Create a copy to show the changes
    original_pub = SimpleGrowthLabPublication(**publication)
    
    # Clear some fields to simulate missing data
    publication['abstract'] = None
    
    print("\nOriginal publication:")
    print(f"Title: {original_pub['title']}")
    print(f"Abstract: {original_pub['abstract']}")
    
    enriched_pub = await enrich_publication_with_endnote(publication, endnote_data)
    
    print("\nEnriched publication:")
    print(f"Title: {enriched_pub['title']}")
    print(f"Abstract: {enriched_pub['abstract'][:100]}..." if enriched_pub['abstract'] else "No abstract")
else:
    print("Cannot enrich publication - missing data")


Original publication:
Title: Global Networks, Monetary Policy and Trade
Abstract: We develop a novel framework to study the interaction between monetary policy and trade. Our New Keynesian open economy model incorporates international production networks, sectoral heterogeneity in price rigidities, and trade distortions. We decompose the general equilibrium response to trade shocks into distinct channels that account for demand shifts, policy effects, exchange rate adjustments, expectations, price stickiness, and input–output linkages. Tariffs act simultaneously as demand and supply shocks, leading to endogenous fragmentation through changes in trade and production network linkages. We show that the net impact of tariffs on domestic inflation, output, employment, and the dollar depends on the endogenous monetary policy response in both the tariff-imposing and tariff-exposed countries, within a global general equilibrium framework. Our quantitative exercise replicates the observed effe

In [10]:
# CHUNK 8: Testing Improved CSS Selectors Configuration

# Define selectors in a configuration dictionary
selectors = {
    "publication": {
        "container": "div.biblio-entry",
        "title": "span.biblio-title",
        "authors": "span.biblio-authors",
        "abstract": "div.biblio-abstract-display",
        "file": "span.file",
    },
    "endnote": {
        "link": "li.biblio_tagged a",
    }
}

# Helper function to parse CSS selectors
def get_selector_parts(selector):
    """Split a CSS selector into tag and attributes for BeautifulSoup"""
    if "." in selector:
        parts = selector.split(".")
        tag = parts[0] or None
        attrs = {"class": parts[1]} if len(parts) > 1 else {}
        return tag, attrs
    else:
        return selector, {}

# Test the selector parser
test_selectors = [
    "div.biblio-entry",
    "span.biblio-title",
    "li.biblio_tagged a"
]

print("Testing selector parser:")
for selector in test_selectors:
    tag, attrs = get_selector_parts(selector)
    print(f"{selector} -> tag: '{tag}', attrs: {attrs}")

# Fetch a publication using configurable selectors
async def fetch_with_configurable_selectors():
    """Fetch a publication using configurable selectors"""
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    }
    
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(scraper.base_url) as response:
            if response.status != 200:
                print(f"Error fetching page: {response.status}")
                return None
            
            html = await response.text()
            soup = BeautifulSoup(html, "html.parser")
            
            # Get container selector
            container_selector = selectors["publication"]["container"]
            tag, attrs = get_selector_parts(container_selector)
            pub_elements = soup.find_all(tag, attrs)
            
            if not pub_elements:
                print("No publications found")
                return None
            
            # Get first publication
            pub_element = pub_elements[0]
            
            # Extract using configurable selectors
            title_selector = selectors["publication"]["title"]
            tag, attrs = get_selector_parts(title_selector)
            title_element = pub_element.find(tag, attrs)
            title = title_element.text.strip() if title_element else None
            
            authors_selector = selectors["publication"]["authors"]
            tag, attrs = get_selector_parts(authors_selector)
            authors_element = pub_element.find(tag, attrs)
            authors = authors_element.text.strip() if authors_element else None
            
            abstract_selector = selectors["publication"]["abstract"]
            tag, attrs = get_selector_parts(abstract_selector)
            abstract_element = pub_element.find(tag, attrs)
            abstract = abstract_element.text.strip() if abstract_element else None
            
            return {
                "title": title,
                "authors": authors,
                "abstract": abstract
            }

# Run the function
config_pub = await fetch_with_configurable_selectors()

print("\nPublication fetched with configurable selectors:")
if config_pub:
    print(f"Title: {config_pub['title']}")
    print(f"Authors: {config_pub['authors']}")
    print(f"Abstract: {config_pub['abstract'][:100]}..." if config_pub['abstract'] else "No abstract")

Testing selector parser:
div.biblio-entry -> tag: 'div', attrs: {'class': 'biblio-entry'}
span.biblio-title -> tag: 'span', attrs: {'class': 'biblio-title'}
li.biblio_tagged a -> tag: 'li', attrs: {'class': 'biblio_tagged a'}

Publication fetched with configurable selectors:
Title: Global Networks, Monetary Policy and Trade
Authors: Kalemli-Özcan, Ṣ., Soylu, C. & Yildirim, M.A.
Abstract: We develop a novel framework to study the interaction between monetary policy and trade. Our New Key...


In [11]:
# CHUNK 9: Testing Improved Endnote Parser with Error Handling

# Sample Endnote content with different formatting issues
sample_endnote = """%0 Journal Article
%A Smith, John
%A Doe, Jane
%T Sample Article Title
%D 2023
%X <p>This is a <b>sample</b> abstract with some <strong>formatting</strong>.</p>
%J Journal of Sample Studies
"""

malformed_endnote = """%0 Journal Article
%ASmith, John
%T Missing space after %A
%X <p>Unclosed tag <b>problem</p>
%D Invalid date
"""

empty_endnote = ""

async def improved_parse_endnote(content):
    """Parse Endnote file content with robust error handling"""
    record = {}
    
    # Validate content format
    if not content or not content.strip():
        print("Empty Endnote content received")
        return record
    
    # Check for expected format
    if not any(line.startswith("%") for line in content.split("\n")):
        print("Unexpected Endnote format - missing % line markers")
        return record
    
    lines = content.split("\n")
    
    try:
        for line in lines:
            if line.startswith("%"):
                # Validate line format
                if len(line) < 3 or line[2] != ' ':
                    print(f"Malformed Endnote line: {line}")
                    continue
                
                key = line[1]
                value = line[3:].strip()
                
                if key == "A":  # Author
                    try:
                        name_parts = value.split(", ")
                        if len(name_parts) == 2:
                            value = f"{name_parts[1]} {name_parts[0]}"
                        record["author"] = record.get("author", []) + [value]
                    except Exception as e:
                        print(f"Error processing author: {e}")
                        
                elif key == "T":  # Title
                    record["title"] = value
                elif key == "D":  # Date
                    record["date"] = value
                elif key == "X":  # Abstract
                    try:
                        # Check if contains HTML
                        if "<" in value and ">" in value:
                            soup = BeautifulSoup(value, "html.parser")
                            # Remove formatting tags
                            for tag in soup.find_all(["b", "strong"]):
                                tag.unwrap()
                                
                            # Extract paragraphs
                            paragraphs = soup.find_all("p")
                            if paragraphs:
                                abstract = "\n".join(
                                    p.get_text(separator=" ", strip=True)
                                    for p in paragraphs
                                    if p.get_text(strip=True)
                                )
                            else:
                                # Fallback to full text
                                abstract = soup.get_text(separator=" ", strip=True)
                                
                            record["abstract"] = abstract.strip()
                        else:
                            # Plain text
                            record["abstract"] = value
                    except Exception as e:
                        print(f"Error processing abstract: {e}")
                        # Fallback to raw text
                        record["abstract"] = value
        
        # Join author list
        if "author" in record:
            record["author"] = ", ".join(record["author"])
            
        return record
    except Exception as e:
        print(f"Error parsing Endnote content: {e}")
        return record

# Test with valid Endnote content
print("Testing with valid Endnote content:")
valid_result = await improved_parse_endnote(sample_endnote)
print("\nParsed valid Endnote:")
for key, value in valid_result.items():
    print(f"- {key}: {value}")

# Test with malformed Endnote content
print("\nTesting with malformed Endnote content:")
malformed_result = await improved_parse_endnote(malformed_endnote)
print("\nParsed malformed Endnote:")
for key, value in malformed_result.items():
    print(f"- {key}: {value}")

# Test with empty Endnote content
print("\nTesting with empty Endnote content:")
empty_result = await improved_parse_endnote(empty_endnote)
print("\nParsed empty Endnote:")
print(empty_result)

Testing with valid Endnote content:

Parsed valid Endnote:
- author: John Smith, Jane Doe
- title: Sample Article Title
- date: 2023
- abstract: This is a sample abstract with some formatting .

Testing with malformed Endnote content:
Malformed Endnote line: %ASmith, John

Parsed malformed Endnote:
- title: Missing space after %A
- abstract: Unclosed tag problem
- date: Invalid date

Testing with empty Endnote content:
Empty Endnote content received

Parsed empty Endnote:
{}
