In [15]:
from bs4 import BeautifulSoup
import requests
import re
from urllib.parse import urljoin, urlparse
import pandas as pd
import numpy as np
import gender_guesser.detector as gender

In [16]:
# THIS CODE WORKS FOR COHORTS FROM 2023-2024
# Website of job market candidates that we want to scrape
# Note the year, as it will be used to save the CSV output later:
# 2022
url = "https://web.archive.org/web/20220201032755/https://economics.yale.edu/graduate/graduate-placement"

# Get HTML content of the website.
# A timeout is required: without one, requests waits indefinitely on a stalled server.
page = requests.get(url, timeout=30)

# Fail fast on 4xx/5xx responses instead of silently parsing an error page,
# which would otherwise surface later as an empty result with no explanation.
page.raise_for_status()

soup = BeautifulSoup(page.text, "html.parser")

# Create a list to store the data
data = []

In [18]:
# Build one row per candidate from the "node-teaser__*" markup.
#
# Every field is looked up inside the candidate's own container rather than
# collected into separate page-wide lists and zipped by position. Page-wide
# lists only line up when every candidate has every field; a single missing CV
# link or website makes the lists differ in length and the DataFrame
# constructor fail. Missing fields are recorded as None instead.

HEADING_CLASS = "node-teaser__heading"
LABEL_CLASS = 'node-teaser__metadata-label'
VALUE_CLASS = 'node-teaser__metadata-value'


def _candidate_container(heading):
    """Return the outermost ancestor of `heading` that contains no other heading.

    Walks up the tree and stops just before the first ancestor that holds more
    than one candidate heading. If even the direct parent holds several
    headings, `heading` itself is returned and no other field can be located
    for that candidate.
    NOTE(review): assumes each candidate's fields share a wrapper element with
    the heading -- confirm against the page markup.
    """
    container = heading
    for ancestor in heading.parents:
        if len(ancestor.find_all("div", class_=HEADING_CLASS)) > 1:
            break
        container = ancestor
    return container


def _first_href(container, css_class):
    """Return the href of the <a> inside the first div with `css_class`, or None."""
    wrapper = container.find("div", class_=css_class)
    anchor = wrapper.find("a") if wrapper is not None else None
    return anchor.get("href") if anchor is not None else None


def _metadata_value(container, *label_texts):
    """Return the stripped text of the value div that follows a matching label.

    A label matches when its text contains any of `label_texts`. The first
    matching label that has a following value div wins; None is returned when
    there is no such label.
    """
    for label in container.find_all("div", class_=LABEL_CLASS):
        label_text = label.get_text()
        if any(wanted in label_text for wanted in label_texts):
            value = label.find_next_sibling('div', class_=VALUE_CLASS)
            if value is not None:
                return value.get_text(strip=True)
    return None


# Find all header items with class "node-teaser__heading" (one per candidate)
headers = soup.find_all("div", class_=HEADING_CLASS)
if not headers:
    # Make a selector mismatch visible instead of silently producing an empty frame.
    print("No 'node-teaser__heading' elements found; the page may use different markup than this scraper expects.")

records = []
for header in headers:
    container = _candidate_container(header)
    records.append({
        "name": header.get_text(strip=True),
        "school": 'Yale',
        # Keep in sync with the year noted next to `url`.
        "year": '2022',
        "cv_link": _first_href(container, "node-teaser__cv-link"),
        "personal_website": _first_href(container, "node-teaser__website-link"),
        "field": _metadata_value(container, 'Fields of Interest'),
        "committee_members": _metadata_value(container, 'References'),
        # 'Job Market Paper' and 'Dissertation Title' are technically different,
        # but both are stored in the same column.
        "dissertation": _metadata_value(container, 'Job Market Paper', 'Dissertation Title'),
    })

# Passing `columns` keeps the expected schema even when no candidates were found.
df = pd.DataFrame(records, columns=[
    "name", "school", "year", "cv_link", "personal_website",
    "field", "committee_members", "dissertation",
])

print(df)

[]
Empty DataFrame
Columns: [name, school, year, cv_link, personal_website, field, committee_members, dissertation]
Index: []


In [13]:
# Create the gender_guesser detector once; guess_gender below reuses it for every name
d = gender.Detector()
def guess_gender(name):
    """Guess a gender label from the first whitespace-separated token of `name`.

    Parameters
    ----------
    name : str
        Full name, first name first. Any value that is not a string (None,
        NaN, ...) or that contains no non-whitespace characters is treated as
        missing.

    Returns
    -------
    str
        'unknown' for a missing name; otherwise whatever the module-level
        detector `d` returns from `get_gender` for the first token.
    """
    # Missing cells in a pandas column arrive as None/NaN, which have no .split().
    if not isinstance(name, str):
        return 'unknown'
    name_parts = name.split()
    # A whitespace-only string is truthy but splits into an empty list.
    if not name_parts:
        return 'unknown'
    return d.get_gender(name_parts[0])

# Add a gender_guess column by running guess_gender over every candidate name
df['gender_guess'] = df['name'].apply(guess_gender)

# Select the columns in display order, with gender_guess placed right after name
df = df.loc[:, [
    'name',
    'gender_guess',
    'school',
    'year',
    'cv_link',
    'personal_website',
    'field',
    'committee_members',
    'dissertation',
]]
print(df)

                   name   gender_guess school  year  \
0     Francesco Beraldi           male   Yale  2024   
1        Carlo Cusumano           male   Yale  2024   
2         Daniel Graves           male   Yale  2024   
3        Timothy Hersey           male   Yale  2024   
4            Jack Liang           male   Yale  2024   
5     Ferdinand Pieroth           male   Yale  2024   
6   Matthew Schwartzman           male   Yale  2024   
7           Xiangyu Shi        unknown   Yale  2024   
8        Samuel Solomon           male   Yale  2024   
9         Lindsey Uniat  mostly_female   Yale  2024   
10     Stephan Waizmann           male   Yale  2024   

                                              cv_link  \
0   /sites/default/files/cv/FrancescoBeraldi_CV_0.pdf   
1   /sites/default/files/cv/Cusumano_Carlo_CV_FINA...   
2   /sites/default/files/cv/Graves_Daniel_CV_Final...   
3     /sites/default/files/cv/Hersey_Timothy_CV_1.pdf   
4                /sites/default/files/cv/LIANG_CV.pdf 

In [14]:
# Save the data to a CSV file named after the cohort year stored in the data,
# so the file name cannot drift from the `year` column.
if df.empty:
    # An empty scrape would otherwise overwrite a previously saved, good CSV.
    raise ValueError("No candidates were scraped; refusing to write an empty CSV.")

cohort_years = df["year"].astype(str).unique()
if len(cohort_years) != 1:
    raise ValueError(
        "Expected exactly one cohort year in df['year'], found: {}".format(list(cohort_years))
    )

output_path = "yale_econ_jm_candidates_{}.csv".format(cohort_years[0])
df.to_csv(output_path, index=False)

# Check the first few rows
print(df.head())

                name gender_guess school  year  \
0  Francesco Beraldi         male   Yale  2024   
1     Carlo Cusumano         male   Yale  2024   
2      Daniel Graves         male   Yale  2024   
3     Timothy Hersey         male   Yale  2024   
4         Jack Liang         male   Yale  2024   

                                             cv_link  \
0  /sites/default/files/cv/FrancescoBeraldi_CV_0.pdf   
1  /sites/default/files/cv/Cusumano_Carlo_CV_FINA...   
2  /sites/default/files/cv/Graves_Daniel_CV_Final...   
3    /sites/default/files/cv/Hersey_Timothy_CV_1.pdf   
4               /sites/default/files/cv/LIANG_CV.pdf   

                               personal_website  \
0              http://www.francescoberaldi.com/   
1                https://www.carlocusumano.org/   
2  https://sites.google.com/view/daniel-graves/   
3                https://www.timothyhersey.com/   
4      https://sites.google.com/view/jackliang/   

                                               field  \