In [62]:
from bs4 import BeautifulSoup
import requests
import re
from urllib.parse import urljoin, urlparse
import pandas as pd
import numpy as np

# Placement page listing the job market candidates that we want to scrape
url = "https://economics.yale.edu/phd-program/placement"

# Download the page. requests has no default timeout, so without one a
# stalled connection would block this script indefinitely.
page = requests.get(url, timeout=30)

# Stop on an HTTP error (4xx/5xx) instead of going on to parse an error page
# as if it were the candidate listing.
page.raise_for_status()

# Parse the HTML with Python's built-in parser
soup = BeautifulSoup(page.text, "html.parser")

# Create a list to store the data
# NOTE(review): nothing in the code visible here reads `data` - confirm
# whether it is still needed.
data = []

# Find all div tags with class "node-teaser__heading"; their text is used as
# the candidate name when the DataFrame is built below
headers = soup.find_all("div", class_="node-teaser__heading")

# Find all div tags with class "node-teaser__cv-link"
cv_links = soup.find_all("div", class_="node-teaser__cv-link")

# Find all div tags with class "node-teaser__website-link"
website_links = soup.find_all("div", class_="node-teaser__website-link")

# The href of each link is read from the <a> tag inside these divs when the
# DataFrame is built below, so no separate extraction loop is needed here.

# Fields, references, and dissertation titles share the same markup: a
# "node-teaser__metadata-label" div whose value sits in a sibling
# "node-teaser__metadata-value" div. Only the label text tells them apart.
metadata_labels = soup.find_all("div", class_="node-teaser__metadata-label")

# One list per kind of metadata; each one collects the matching value divs
fields_of_interest = []
references = []
dissertations = []

# Pairs of (texts that identify a label, list that receives its value div).
# 'Job Market Paper' and 'Dissertation Title' are technically different, but
# we only want the text, so both feed the same list.
label_targets = (
    (("Fields of Interest",), fields_of_interest),
    (("References",), references),
    (("Job Market Paper", "Dissertation Title"), dissertations),
)

for label in metadata_labels:
    label_text = label.get_text()
    for keywords, target in label_targets:
        if any(keyword in label_text for keyword in keywords):
            # The value is the next sibling div with the metadata-value class
            target.append(
                label.find_next_sibling("div", class_="node-teaser__metadata-value")
            )

# Build one row per candidate.
#
# The element lists above were collected independently of each other, so
# pairing them up by position is only correct when every candidate has every
# field: a single missing CV link or metadata entry would either make the
# lists unequal in length or shift values onto the wrong person. Instead,
# each element is attached to the candidate heading that precedes it in the
# document, and fields a candidate lacks are left as missing values.
# NOTE(review): this assumes each candidate's heading appears before that
# candidate's links and metadata in the HTML - confirm against the page. If
# an element has no heading before it, a ValueError is raised below.


def _element_text(element):
    """Return the whitespace-stripped text of *element*."""
    return element.get_text(strip=True)


def _element_href(element):
    """Return the href of the first <a> inside *element*, or None if absent."""
    anchor = element.find("a")
    return anchor.get("href") if anchor is not None else None


# Column name -> (elements collected for that column, how to read a value)
column_sources = {
    "cv_link": (cv_links, _element_href),
    "personal_website": (website_links, _element_href),
    "field": (fields_of_interest, _element_text),
    "committee_members": (references, _element_text),
    "dissertation": (dissertations, _element_text),
}

# Headings are keyed by object identity: find_previous returns the same tag
# objects that are held in `headers`.
row_of_heading = {id(header): row for row, header in enumerate(headers)}
records = [{"name": _element_text(header)} for header in headers]

for column, (elements, extract) in column_sources.items():
    for element in elements:
        if element is None:
            # A metadata label that had no value div next to it
            continue
        heading = element.find_previous("div", class_="node-teaser__heading")
        row = row_of_heading.get(id(heading))
        if row is None:
            raise ValueError(
                f"Found a '{column}' element with no candidate heading before it; "
                "the page layout may have changed."
            )
        # Keep the first value if a candidate has several for one column
        records[row].setdefault(column, extract(element))

# Passing the column names keeps the column order fixed and creates every
# column even when no candidate has a value for it.
df = pd.DataFrame(records, columns=["name", *column_sources])

print(df)

# Save the data to a CSV file
df.to_csv("yale_econ_phd_candidates.csv", index=False)

                         name  \
0       Pedro Casavilca Silva   
1   Fernando Pereira Cordeiro   
2          Alvaro Cox Lescano   
3                 Hanxiao Cui   
4               Mirco Dinelli   
5                     Tan Gan   
6         Daniel Giraldo Paez   
7            Rodrigo Guerrero   
8                Nghiem Huynh   
9               Sid Kankanala   
10                 Jaewon Lee   
11                 Ryungha Oh   
12           Bernardo Ribeiro   
13              Hiroki Saruya   
14                Jihoon Sung   
15             Anthony Tokman   
16                 Allen Vong   
17              Siu Yuat Wong   
18                  Wei Xiang   
19                 Qianyao Ye   
20                 Yingkai Li   

                                              cv_link  \
0   https://pedrocasavilcasilva.github.io/MyWebsit...   
1   https://fpcordeiro.github.io/files/Cordeiro_Fe...   
2   https://www.dropbox.com/scl/fi/pjrnv57sfan8hms...   
3   https://hanxcui.github.io/files/Hanxiao_C