In [None]:
# Importing standard library modules for file and text operations
import csv
import re
import os

# Importing Selenium modules for web scraping
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

# Importing pandas for data manipulation and analysis
import pandas as pd


# Scrape Information From the AEA Website

In [None]:
def scrape_journal_info(article, driver):
    """
    Extracts and returns journal name, issue, abstract, JEL codes, and article URL from a journal article webpage.

    Opens the article's webpage in a new tab, scrapes the data, and then closes the tab after extraction.

    Parameters:
        article (WebElement): The element containing the link to the article.
        driver (WebDriver): The instance used for browser automation.

    Returns:
        tuple: Contains (journal_name, journal_issue, abstract, jel_codes, paper_link).
            journal_name and journal_issue are None when the corresponding element is
            missing, abstract is "" when no abstract section is found, and jel_codes is
            an empty list when no codes are listed.

    Notes:
        - Assumes a consistent HTML structure with specific CSS selectors.
        - Errors raised while reading the article page are printed and not raised;
          the affected fields keep their default values.
        - Locating the article link and opening the new tab happen before the
          protected section, so failures there propagate to the caller.
    """
    # Save the original browser window handle so we can always return to it
    original_window = driver.current_window_handle

    # Extract and open the article link in a new browser tab
    paper_link = article.find_element(By.CSS_SELECTOR, "a").get_attribute('href')
    driver.execute_script("window.open(arguments[0]);", paper_link)
    driver.switch_to.window(driver.window_handles[-1])

    # Defaults returned when a piece of information is missing
    journal_name, journal_issue, abstract, jel_codes = None, None, "", []

    try:
        # The first "div.journal" element holds the name, the second the issue
        journal_elements = driver.find_elements(By.CSS_SELECTOR, "div.journal")
        journal_name = journal_elements[0].text if journal_elements else None
        journal_issue = journal_elements[1].text if len(journal_elements) > 1 else None

        # Extract the abstract, stripping out JEL codes if present
        try:
            # find_elements returns an empty list instead of raising when the
            # page has no abstract, so a missing abstract is not treated as an error
            abstract_sections = driver.find_elements(
                By.CSS_SELECTOR, "section.article-information.abstract"
            )
            if abstract_sections:
                abstract_lines = abstract_sections[0].text.splitlines()[1:]  # Skip the section title
                abstract = "\n".join(abstract_lines).strip()
                # Remove the trailing JEL code reference, then the whitespace
                # that preceded it so the abstract does not end with a blank
                abstract = re.sub(r'\(JEL [A-Z][0-9]+(, [A-Z][0-9]+)*\)\.?$', '', abstract).rstrip()
        except Exception as e:
            # Best effort: keep whatever abstract text was obtained so far
            print(f"Error extracting abstract from {paper_link}: {e}")

        # Extract JEL codes, one per list item
        for jel_item in driver.find_elements(By.CSS_SELECTOR, "ul.jel-codes > li"):
            try:
                code_elements = jel_item.find_elements(By.CSS_SELECTOR, "strong.code")
                if code_elements:
                    jel_codes.append(code_elements[0].text)
            except Exception as e:
                # Best effort: skip this item and keep the codes collected so far
                print(f"Error extracting JEL code from {paper_link}: {e}")
    except Exception as e:
        # Log any errors encountered during the entire extraction process
        print(f"Error during scraping: {e}")
    finally:
        # Close the article tab, but never the original window: if no new tab
        # was actually opened we are still on the original window here
        if driver.current_window_handle != original_window:
            driver.close()
        driver.switch_to.window(original_window)

    # Return the extracted information
    return journal_name, journal_issue, abstract, jel_codes, paper_link


In [None]:
def _parse_authors(authors_text):
    """Split an issue-page author line into a list of clean author names."""
    pieces = (
        authors_text.replace(' and ', ', ')
        .removeprefix('by ')
        .replace('ⓡ ', ', ')
        .split(', ')
    )
    # The replacements above can leave a stray space (before 'ⓡ') or comma
    # (from ", and") attached to a name; trim those and drop empty pieces
    cleaned = (piece.strip(' ,') for piece in pieces)
    return [name for name in cleaned if name]


def _pad_cells(values, width):
    """Return exactly `width` cells: `values` truncated or right-padded with ''."""
    cells = list(values[:width])
    return cells + [''] * (width - len(cells))


def scrape_paper(journal_name):
    """
    Extracts and compiles article details from a specified AEA journal into a CSV file.

    Iterates through all available issues of the journal on the AEA website, extracting
    details for each article including the title, issue, journal name, abstract,
    authors, JEL codes, and the article link, then saves these details into a CSV file
    under the "Raw" directory (created if it does not exist).

    Parameters:
        journal_name (str): The name of the journal to be scraped, as shown in the
            link text on the AEA journals page.

    Returns:
        str or None: The filename of the generated CSV containing the article details,
            or None if scraping failed before the filename was determined. If an error
            occurs later, the filename is still returned and the file may be partial
            or missing.

    Notes:
        - Errors are printed, not raised; the browser is always closed on exit.
        - Articles without an author line are skipped.
        - At most 20 authors and 20 JEL codes are stored per article.
    """
    # Number of Author*/JEL* columns written per article
    max_cells = 20

    # Configure Selenium browser options
    browser_options = Options()

    # Headless mode is currently disabled; uncomment to run without a visible window
    # browser_options.add_argument("--headless")

    # Initialize the Chrome WebDriver with the specified options
    driver = webdriver.Chrome(options=browser_options)

    # Initialize variable to hold the CSV filename
    csv_name = None

    try:
        # Navigate to the AEA journals main page (inside the try so that the
        # browser is still shut down if the navigation fails)
        driver.get("https://www.aeaweb.org/journals")

        # Click the journal_name link via JavaScript
        element = driver.find_element(By.LINK_TEXT, journal_name)
        driver.execute_script("arguments[0].click();", element)
        # Navigate to the issues section of the journal
        driver.find_element(By.LINK_TEXT, "Issues").click()

        # Gather all issue URLs from the page; dict.fromkeys drops repeated
        # links while keeping page order, so no issue is scraped twice
        issue_links = driver.find_elements(By.CSS_SELECTOR, "a[href*='issues/']")
        hrefs = (link.get_attribute('href') for link in issue_links)
        issue_urls = list(dict.fromkeys(href for href in hrefs if href))

        # Set the filename for the CSV file to be created
        csv_name = os.path.join("Raw", f"{journal_name.replace(': ', '-').replace(' ', '-')}.csv")
        os.makedirs(os.path.dirname(csv_name), exist_ok=True)
        # Open the CSV file for writing
        with open(csv_name, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            # Write the CSV header row
            writer.writerow(
                ["Title", "Issue", "Journal", "Abstract", "Link"]
                + [f"Author{i}" for i in range(1, max_cells + 1)]
                + [f"JEL{i}" for i in range(1, max_cells + 1)]
            )

            # Iterate through all issue URLs
            for url in issue_urls:
                # Open each issue in a new browser tab and switch to it
                driver.execute_script("window.open(arguments[0]);", url)
                driver.switch_to.window(driver.window_handles[-1])
                try:
                    # Extract details for each article in the issue
                    for article in driver.find_elements(By.CSS_SELECTOR, "article.journal-article"):
                        try:
                            # Entries without an author line are skipped; checking
                            # with find_elements avoids raising for this expected case
                            author_elements = article.find_elements(
                                By.CSS_SELECTOR, "div.article-item-authors"
                            )
                            if not author_elements:
                                continue

                            title = article.find_element(By.CSS_SELECTOR, "h3.title").text or None
                            authors_list = _parse_authors(author_elements[0].text)
                            # Use distinct local names so the journal_name
                            # parameter is not overwritten by the scraped value
                            scraped_journal, issue, abstract, jel_codes, paper_link = scrape_journal_info(article, driver)
                            # Prepare the data row for the CSV file
                            row = (
                                [title, issue, scraped_journal, abstract, paper_link]
                                + _pad_cells(authors_list, max_cells)
                                + _pad_cells(jel_codes, max_cells)
                            )
                            # Write the article details to the CSV file
                            writer.writerow(row)
                        except Exception as e:
                            # Best effort: report the failure and continue with the next article
                            print(f"Skipping an article in {url}: {type(e).__name__}")
                finally:
                    # Close the issue tab and switch back to the first window
                    driver.close()
                    driver.switch_to.window(driver.window_handles[0])
    except Exception as e:
        # Log any exceptions encountered during the process
        print(f"Error processing {journal_name}: {e}")
    finally:
        # Quit the WebDriver and close all browser windows
        driver.quit()

    # Return the name of the created CSV file
    return csv_name


In [None]:
# List of journal names to be scraped (must match the link text on the AEA journals page)
journals = [
    "American Economic Review",
    "AER: Insights",
    "AEJ: Applied Economics",
    "AEJ: Economic Policy",
    "AEJ: Macroeconomics",
    "AEJ: Microeconomics",
]

# Filenames of the CSVs created for each journal (None where scraping failed early)
journals_csv = []

# Scrape every journal and remember where its CSV was written
for journal_name in journals:
    csv_name = scrape_paper(journal_name)
    journals_csv.append(csv_name)

# Read every CSV that was actually produced; scrape_paper returns None (or a
# path that was never created) when it fails, and one failed journal should
# not abort the merge of the others
journal_frames = []
for csv_filename in journals_csv:
    if csv_filename is None or not os.path.exists(csv_filename):
        print(f"Skipping missing journal CSV: {csv_filename}")
        continue
    journal_frames.append(pd.read_csv(csv_filename))

# Concatenate once instead of growing a DataFrame inside the loop, which
# re-copies all previous rows on every iteration
combined_df = pd.concat(journal_frames, ignore_index=True) if journal_frames else pd.DataFrame()

# Save the combined DataFrame to a new CSV file, creating the folder if needed
os.makedirs('Derived', exist_ok=True)
combined_df.to_csv(os.path.join('Derived', 'All-Journals.csv'), index=False)


# Clean the Dataset

In [None]:
# Read the combined journals CSV into a DataFrame
df = pd.read_csv(os.path.join('Derived', 'All-Journals.csv'))

# Drop rows where the 'JEL1' column has missing values (articles without JEL codes)
df = df.dropna(subset=['JEL1'])

# Remove rows where the 'JEL1' column starts with 'Y'
# (presumably the miscellaneous JEL category - confirm).
# '~' is the supported operator for inverting a boolean mask.
df = df[~df['JEL1'].str.startswith('Y')]

# Drop columns where all values are NaN (e.g. unused Author*/JEL* columns)
df = df.dropna(axis=1, how='all')

# Save the cleaned DataFrame to a new CSV file
df.to_csv(os.path.join('Derived', 'All-Journals-Cleaned.csv'), index=False)