In [1]:
"""
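This notebook scrapes article metadata from the American Economic Association journals site
(https://www.aeaweb.org/journals), covering the American Economic Review and related AEA journals.
It writes the extracted article information to per-journal CSV files, a combined CSV, and a cleaned CSV.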

Dependencies:
    csv: For reading and writing CSV files.
    re: For regular expression operations, useful in text processing.
    selenium: For automating web browser interaction.
    pandas: For data manipulation and analysis, especially with table-like data (DataFrames).
"""

# Standard library imports for file and text operations
import csv  # Module for reading and writing CSV files
import re  # Module for regular expression operations

# Selenium imports for web scraping
from selenium import webdriver  # Main class for web browser automation
from selenium.webdriver.common.by import By  # Enum for types of locating strategies in Selenium
from selenium.webdriver.chrome.options import Options  # Class for managing options specific to Chrome WebDriver

# Import for data manipulation and analysis
import pandas as pd  # Popular data manipulation library for Python

## Scrape Information From the AEA Journals Website

In [2]:
# Matches a trailing "(JEL A10, B21)" citation (optionally followed by a period) at the very
# end of an abstract, together with the whitespace that separates it from the preceding text.
_TRAILING_JEL_PATTERN = re.compile(r'\s*\(JEL [A-Z][0-9]+(, [A-Z][0-9]+)*\)\.?$')


def _clean_abstract(section_text):
    """Drops the heading line and any trailing "(JEL ...)" citation from the abstract section text."""
    body = "\n".join(section_text.splitlines()[1:]).strip()  # First line is the section title.
    return _TRAILING_JEL_PATTERN.sub('', body)


def scrape_journal_info(article, driver):
    """
    Extracts journal name, issue, abstract, JEL codes, and article URL from an article's webpage.

    Opens the article's link in a new tab, scrapes the data, then closes that tab and
    switches back to the window that was active when the function was called.

    Parameters:
        article (WebElement): The element containing the link (first ``<a>``) to the article.
        driver (WebDriver): The instance used for browser automation.

    Returns:
        tuple: (journal_name, journal_issue, abstract, jel_codes, paper_link).
            journal_name and journal_issue are None when the page has no matching element,
            abstract is "" when no abstract section is found, and jel_codes is a
            (possibly empty) list of code strings.

    Raises:
        Exception: Anything raised while locating the article link or opening/switching to
            the new tab propagates to the caller, because no partial result exists yet.

    Notes:
        - Assumes a consistent HTML structure with specific CSS selectors.
        - Errors raised while reading the opened page are printed, not raised; whatever was
          extracted before the error is still returned.
    """
    original_window = driver.current_window_handle  # Tab to return to once scraping is done.

    paper_link = article.find_element(By.CSS_SELECTOR, "a").get_attribute('href')
    driver.execute_script("window.open(arguments[0]);", paper_link)  # Open the article in a new tab.
    driver.switch_to.window(driver.window_handles[-1])

    # Defaults returned for any field that cannot be extracted.
    journal_name, journal_issue, abstract, jel_codes = None, None, "", []

    try:
        # The first "div.journal" element holds the journal name, the second the issue.
        journal_elements = driver.find_elements(By.CSS_SELECTOR, "div.journal")
        journal_name = journal_elements[0].text if journal_elements else None
        journal_issue = journal_elements[1].text if len(journal_elements) > 1 else None

        # find_elements returns an empty list for a missing abstract, so a missing section
        # needs no exception handling; unexpected failures are reported instead of hidden.
        try:
            abstract_sections = driver.find_elements(By.CSS_SELECTOR, "section.article-information.abstract")
            if abstract_sections:
                abstract = _clean_abstract(abstract_sections[0].text)
        except Exception as e:
            print(f"Error extracting abstract from {paper_link}: {e}")

        # Collect JEL classification codes; list items without a code element are skipped.
        for jel in driver.find_elements(By.CSS_SELECTOR, "ul.jel-codes > li"):
            try:
                code_elements = jel.find_elements(By.CSS_SELECTOR, "strong.code")
                if code_elements:
                    jel_codes.append(code_elements[0].text)
            except Exception as e:
                print(f"Error extracting a JEL code from {paper_link}: {e}")
    except Exception as e:
        print(f"Error during scraping: {e}")  # Logs any errors encountered during scraping.
    finally:
        # Close only when the driver really is on a different tab; if the new tab never
        # opened, closing here would destroy the caller's tab.
        if driver.current_window_handle != original_window:
            driver.close()
        driver.switch_to.window(original_window)

    return journal_name, journal_issue, abstract, jel_codes, paper_link


In [3]:
# Number of "AuthorN" and of "JELN" columns written to every CSV row.
_MAX_LIST_COLUMNS = 20


def _parse_authors(authors_str):
    """Splits a byline such as "by A, B and C" into a list of individual author names."""
    byline = authors_str.strip()
    # Remove the literal "by " prefix. str.strip('by ') would instead strip the characters
    # 'b', 'y' and ' ' from both ends and truncate names such as "Roy" to "Ro".
    if byline.startswith('by '):
        byline = byline[len('by '):]
    # Split on commas (optionally followed by "and", i.e. an Oxford comma) or a bare "and".
    names = re.split(r'\s*,\s*(?:and\s+)?|\s+and\s+', byline)
    return [name.strip() for name in names if name.strip()]


def _pad_to_width(values, width):
    """Returns exactly ``width`` cells: ``values`` truncated, or right-padded with empty strings."""
    cells = list(values[:width])
    return cells + [''] * (width - len(cells))


def scrape_paper(journal_name):
    """
    Extracts and compiles article details from a specified AEA journal into a CSV file.

    Opens the AEA journals listing in Chrome, follows the journal's link and its "Issues"
    link, then visits every issue URL found there. For each article it writes one CSV row
    with the title, issue, journal name, abstract, link, up to 20 authors and up to 20 JEL codes.

    Parameters:
        journal_name (str): The journal's name, exactly as its link text appears on the
            AEA journals listing page.

    Returns:
        str: The filename of the CSV for this journal. The name is derived from
            ``journal_name`` alone, so it is returned even when scraping fails; in that
            case the file may be incomplete, or may not exist if the failure happened
            before the journal's issue list was reached.

    Notes:
        - Errors are printed rather than raised: a failing article is skipped, and a failure
          at the journal level stops that journal only.
    """
    # Computed up front so the return value is defined even if navigation fails early.
    csv_name = f"{journal_name.replace(': ', '-').replace(' ', '-')}.csv"

    browser_options = Options()
    driver = webdriver.Chrome(options=browser_options)

    try:
        # Inside the try block so the browser is always quit, even if the first page load fails.
        driver.get("https://www.aeaweb.org/journals")

        # Navigate to the specific journal's page and access its issues section.
        driver.find_element(By.LINK_TEXT, journal_name).click()
        driver.find_element(By.LINK_TEXT, "Issues").click()

        # Collect the issue URLs as plain strings before opening any tabs. Duplicates and
        # missing hrefs are dropped (order preserved) so no issue is scraped twice.
        issue_links = driver.find_elements(By.CSS_SELECTOR, "a[href*='issues/']")
        issue_urls = list(dict.fromkeys(
            href for href in (link.get_attribute('href') for link in issue_links) if href
        ))

        with open(csv_name, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(
                ["Title", "Issue", "Journal", "Abstract", "Link"]
                + [f"Author{i}" for i in range(1, _MAX_LIST_COLUMNS + 1)]
                + [f"JEL{i}" for i in range(1, _MAX_LIST_COLUMNS + 1)]
            )

            for url in issue_urls:
                driver.execute_script("window.open(arguments[0]);", url)  # Open the issue in a new tab.
                driver.switch_to.window(driver.window_handles[-1])
                try:
                    for article in driver.find_elements(By.CSS_SELECTOR, "article.journal-article"):
                        try:
                            title = article.find_element(By.CSS_SELECTOR, "h3.title").text or None
                            authors_str = article.find_element(By.CSS_SELECTOR, "div.article-item-authors").text
                            authors_list = _parse_authors(authors_str)
                            # A separate name keeps the journal_name argument intact for the
                            # error message below.
                            article_journal, issue, abstract, jel_codes, paper_link = scrape_journal_info(article, driver)
                            row = (
                                [title, issue, article_journal, abstract, paper_link]
                                + _pad_to_width(authors_list, _MAX_LIST_COLUMNS)
                                + _pad_to_width(jel_codes, _MAX_LIST_COLUMNS)
                            )
                            writer.writerow(row)
                        except Exception as e:
                            # Best effort: report the problem and move on to the next article.
                            print(f"Skipping an article in {url}: {e}")
                finally:
                    driver.close()  # Close the current tab with the issue.
                    driver.switch_to.window(driver.window_handles[0])  # Return to the main journal page tab.
    except Exception as e:
        print(f"Error processing {journal_name}: {e}")  # Log any encountered errors.
    finally:
        driver.quit()  # Close the browser once all processing is complete.

    return csv_name

In [7]:
# Journals to scrape; each name is used as link text when locating the journal's page.
journals = [
    "American Economic Review",
    "AER: Insights",
    "AEJ: Applied Economics",
    "AEJ: Economic Policy",
    "AEJ: Macroeconomics",
    "AEJ: Microeconomics",
]

# Scrape each journal in turn and remember the CSV filename that scrape_paper reports.
journals_csv = []
for journal_name in journals:
    journals_csv.append(scrape_paper(journal_name))

# Read every per-journal CSV first, then concatenate once. Growing a DataFrame with
# pd.concat inside the loop re-copies all accumulated rows on every iteration.
journal_frames = []
for csv_filename in journals_csv:
    try:
        journal_frames.append(pd.read_csv(csv_filename))
    except FileNotFoundError:
        # A journal whose scrape failed before its CSV was created is skipped, not fatal.
        print(f"Skipping {csv_filename}: file not found.")

# pd.concat raises on an empty list, so fall back to an empty frame when nothing was read.
combined_df = pd.concat(journal_frames, ignore_index=True) if journal_frames else pd.DataFrame()

# Export the combined DataFrame containing all journals' data into a single CSV file.
combined_df.to_csv('All-Journals.csv', index=False)

## Clean the Dataset

In [46]:
journals_raw = pd.read_csv('All-Journals.csv')

# Keep only rows that have a first JEL code.
journals_with_jel = journals_raw.dropna(subset=['JEL1'])

# Drop rows whose first JEL code starts with 'Y'. Boolean masks are negated with '~';
# astype(str) keeps the .str accessor usable even if pandas parsed JEL1 as a non-string
# dtype (for example when the column is entirely empty).
starts_with_y = journals_with_jel['JEL1'].astype(str).str.startswith('Y')
df = journals_with_jel.loc[~starts_with_y]

# Drop columns where all values are NaN (e.g. unused AuthorN / JELN columns).
df = df.dropna(axis=1, how='all')

# Save the cleaned DataFrame to a new CSV file, without the index.
df.to_csv('All-Journals-Cleaned.csv', index=False)