In [None]:
import csv
import re
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import pandas as pd


In [None]:
def scrape_journal_info(article, driver):
    """
    Extracts information from a journal article's webpage.

    Opens the article's link in a new browser tab, reads the journal name,
    issue, abstract, and JEL codes from that page, then closes the tab and
    returns focus to the window that was active when the function was called.

    Parameters:
        article (WebElement): The element containing the article's link
            (the first ``<a>`` inside it is followed).
        driver (WebDriver): Selenium WebDriver for browser automation.

    Returns:
        tuple: ``(journal_name, journal_issue, abstract, jel_codes)``.
        ``journal_name`` and ``journal_issue`` are ``None`` and ``abstract``
        is ``""`` / ``jel_codes`` is ``[]`` when the corresponding element
        is not found on the page.

    Raises:
        Exception: Whatever Selenium raises if the article element has no
            link or the tab cannot be opened; errors while reading the
            article page itself are printed and swallowed.
    """
    original_window = driver.current_window_handle  # Window to return to afterwards
    handles_before = set(driver.window_handles)

    paper_link = article.find_element(By.CSS_SELECTOR, "a").get_attribute('href')
    driver.execute_script("window.open(arguments[0]);", paper_link)

    journal_name, journal_issue, abstract, jel_codes = None, None, "", []

    # The order of window_handles is not guaranteed, so identify the new tab
    # by comparing against the handles that existed before window.open().
    new_handles = [handle for handle in driver.window_handles if handle not in handles_before]
    if not new_handles:
        # Nothing was opened: closing the current window here would destroy
        # the caller's page, so bail out with the empty defaults instead.
        print(f"Error during scraping: no new tab opened for {paper_link}")
        return journal_name, journal_issue, abstract, jel_codes
    driver.switch_to.window(new_handles[-1])

    try:
        # First "div.journal" holds the journal name, the second the issue
        journal_elements = driver.find_elements(By.CSS_SELECTOR, "div.journal")
        if journal_elements:
            journal_name = journal_elements[0].text
        if len(journal_elements) > 1:
            journal_issue = journal_elements[1].text

        # Abstract is optional: keep the empty default when the section is absent
        try:
            abstract_section = driver.find_element(By.CSS_SELECTOR, "section.article-information.abstract")
            abstract_lines = abstract_section.text.splitlines()[1:]  # Drop the section's title line
            abstract = "\n".join(abstract_lines).strip()
            # Remove a trailing "(JEL A10, B20)" tag, then the whitespace it leaves behind
            abstract = re.sub(r'\(JEL [A-Z][0-9]+(, [A-Z][0-9]+)*\)\.?$', '', abstract).strip()
        except Exception:
            pass  # Best effort: proceed without an abstract

        # JEL classification codes, one per list item
        for jel in driver.find_elements(By.CSS_SELECTOR, "ul.jel-codes > li"):
            try:
                jel_codes.append(jel.find_element(By.CSS_SELECTOR, "strong.code").text)
            except Exception:
                pass  # Best effort: skip list items without a code element
    except Exception as e:
        print(f"Error during scraping: {e}")  # Log general scraping errors
    finally:
        driver.close()  # Close the article tab
        driver.switch_to.window(original_window)  # Return to the caller's window

    return journal_name, journal_issue, abstract, jel_codes

In [None]:
def _parse_authors(authors_text):
    """Split an author byline such as "By A, B, and C" into a list of names."""
    if authors_text.lower().startswith('by '):
        authors_text = authors_text[3:]
    # Accept "A and B", "A, B and C" and the Oxford-comma form "A, B, and C"
    return re.split(r',?\s+and\s+|,\s+', authors_text)


def scrape_paper(journal_name):
    """
    Scrapes articles from a specified journal listed on the AEA website.

    Navigates to the journal's webpage, iterates through all available issues,
    and extracts details for each article. Details include the title, issue,
    journal name, abstract, authors, and JEL codes. This information is then
    saved to a CSV file named after the journal, with 20 author columns and
    20 JEL columns (extra values are truncated, missing ones left blank).

    Errors are printed rather than raised, and the browser is always closed.

    Parameters:
        journal_name (str): The name of the journal to scrape; it must match
            the journal's link text on the AEA journals page.

    Returns:
        str: The name of the CSV file for this journal. If navigation failed
        before any data could be written, the file may not exist.
    """
    # Computed up front so the return value is defined even when scraping fails
    csv_name = f"{journal_name.replace(': ', '-').replace(' ', '-')}.csv"

    browser_options = Options()
    driver = webdriver.Chrome(options=browser_options)

    try:
        driver.get("https://www.aeaweb.org/journals")
        driver.find_element(By.LINK_TEXT, journal_name).click()
        driver.find_element(By.LINK_TEXT, "Issues").click()

        issue_links = driver.find_elements(By.CSS_SELECTOR, "a[href*='issues/']")
        issue_urls = [link.get_attribute('href') for link in issue_links]
        listing_window = driver.current_window_handle  # Tab to come back to after each issue

        with open(csv_name, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            header_row = ["Title", "Issue", "Journal", "Abstract"] + \
                         [f"Author{i}" for i in range(1, 21)] + [f"JEL{i}" for i in range(1, 21)]
            writer.writerow(header_row)

            for url in issue_urls:
                handles_before = set(driver.window_handles)
                driver.execute_script("window.open(arguments[0]);", url)
                # Handle order is not guaranteed; find the tab that was just created
                new_handles = [handle for handle in driver.window_handles if handle not in handles_before]
                if not new_handles:
                    print(f"Could not open issue page {url}; skipping")
                    continue
                driver.switch_to.window(new_handles[-1])

                try:
                    article_links = driver.find_elements(By.CSS_SELECTOR, "article.journal-article")
                    for article in article_links:
                        # Reset every field per article so a failure never
                        # reuses values scraped from the previous article.
                        title, authors_list = None, []
                        journal_details, cleaned_abstract, jel_codes = None, "", []

                        try:
                            title = article.find_element(By.CSS_SELECTOR, "h3.title").text
                        except Exception:
                            pass  # Best effort: leave the title blank

                        try:
                            authors_str = article.find_element(By.CSS_SELECTOR, "div.article-item-authors").text
                            authors_list = _parse_authors(authors_str)
                        except Exception:
                            pass  # Best effort: leave the authors blank

                        try:
                            _, journal_details, cleaned_abstract, jel_codes = scrape_journal_info(article, driver)
                        except Exception as e:
                            print(f"Could not scrape details for {title!r}: {e}")

                        # Pad/truncate authors and JEL codes to exactly 20 cells each
                        row = [title, journal_details, journal_name, cleaned_abstract] + \
                              authors_list[:20] + [''] * (20 - len(authors_list)) + \
                              jel_codes[:20] + [''] * (20 - len(jel_codes))
                        writer.writerow(row)

                finally:
                    driver.close()  # Close the issue tab
                    driver.switch_to.window(listing_window)

    except Exception as e:
        print(f"An error occurred while processing {journal_name}: {str(e)}")
    finally:
        driver.quit()

    return csv_name

In [None]:
# Journals to scrape; scrape_paper looks each one up by its link text
journals = [
    "American Economic Review",
    "AER: Insights",
    "AEJ: Applied Economics",
    "AEJ: Economic Policy",
    "AEJ: Macroeconomics",
    "AEJ: Microeconomics",
]

# Scrape each journal and remember the CSV file it reports
journals_csv = []

for journal_name in journals:
    csv_name = scrape_paper(journal_name)
    journals_csv.append(csv_name)

# Load every per-journal file that actually exists; a journal whose scrape
# failed early may not have produced one, and should not abort the merge.
journal_frames = []

for csv_filename in journals_csv:
    try:
        journal_frames.append(pd.read_csv(csv_filename))
    except FileNotFoundError:
        print(f"Skipping {csv_filename}: file not found")

# Concatenate once instead of growing a DataFrame inside the loop
combined_df = pd.concat(journal_frames, ignore_index=True) if journal_frames else pd.DataFrame()

combined_df.to_csv('All-Journals.csv', index=False)

In [None]:
def _join_non_null(frame):
    """Return one '; '-joined string per row of *frame*, ignoring missing cells."""
    return [
        '; '.join(str(value) for value in row if pd.notna(value))
        for row in frame.itertuples(index=False, name=None)
    ]


def _melt_numbered(frame, id_vars, value_vars, id_name, value_name):
    """Melt numbered columns (e.g. Author1..N) into rows, keeping the number as an int ID."""
    long_frame = pd.melt(frame, id_vars=id_vars, value_vars=value_vars,
                         var_name=id_name, value_name=value_name)
    long_frame = long_frame.dropna(subset=[value_name])
    # "Author7" -> 7; expand=False yields a Series rather than a one-column frame
    numbers = long_frame[id_name].str.extract(r'(\d+)', expand=False).astype(int)
    return long_frame.assign(**{id_name: numbers})


def clean_and_transform_aej_data(input_filename):
    """
    Transforms AEJ article data from wide to long format and cleans it.

    Specific operations:
    - Removes entries without an abstract.
    - Assigns sequential Paper_IDs (starting at 1) to the remaining entries.
    - Consolidates author and JEL columns into semicolon-separated strings
      ('Authors Combined' and 'JEL Combined').
    - Reshapes data to long format: one output row per
      (paper, author, JEL code) combination. Papers with no author or no
      JEL code therefore produce no rows.
    - Sorts by Paper_ID, Author_ID, JEL_ID and writes a new CSV whose name is
      the original filename suffixed with '-Cleaned'.

    Parameters:
        input_filename (str): Path to the source CSV file. It must contain
            the columns Title, Issue, Journal, Abstract, Author1..Author20
            and JEL1..JEL20.

    Output:
        None, but writes transformed data to '{original_filename}-Cleaned.csv'.
    """
    # Build the output name without ever colliding with the input path
    if input_filename.endswith('.csv'):
        output_filename = f"{input_filename[:-len('.csv')]}-Cleaned.csv"
    else:
        output_filename = f"{input_filename}-Cleaned.csv"

    author_cols = [f'Author{i}' for i in range(1, 21)]
    jel_cols = [f'JEL{i}' for i in range(1, 21)]

    # Read, drop papers without an abstract, and add the derived columns
    df = pd.read_csv(input_filename).dropna(subset=['Abstract'])
    df = df.assign(**{
        'Paper_ID': range(1, len(df) + 1),
        'Authors Combined': _join_non_null(df[author_cols]),
        'JEL Combined': _join_non_null(df[jel_cols]),
    })

    paper_cols = ['Paper_ID', 'Title', 'Issue', 'Journal', 'Abstract', 'Authors Combined', 'JEL Combined']

    # Step 1: one row per author. The JEL columns ride along as id_vars so
    # they are not mistaken for author values.
    long_df_authors = _melt_numbered(df, paper_cols + jel_cols, author_cols, 'Author_ID', 'Author')

    # Step 2: one row per (author, JEL code)
    long_df_jel = _melt_numbered(long_df_authors, paper_cols + ['Author_ID', 'Author'],
                                 jel_cols, 'JEL_ID', 'JEL Code')

    # Sort and save cleaned data
    long_df_jel = long_df_jel.sort_values(by=['Paper_ID', 'Author_ID', 'JEL_ID'])
    long_df_jel.to_csv(output_filename, index=False)

# Clean the combined CSV that the scraping cell wrote
combined_csv_path = 'All-Journals.csv'
clean_and_transform_aej_data(combined_csv_path)