In [92]:
from bs4 import BeautifulSoup
import requests
import re
from urllib.parse import urljoin, urlparse
import pandas as pd
import numpy as np

In [93]:
# Placement-outcomes page of the Yale economics PhD program that we scrape
url = "https://economics.yale.edu/phd-program/placement/outcomes"

# Download the page.
# - The timeout keeps this cell from hanging indefinitely on a stalled connection
#   (requests waits forever by default).
# - raise_for_status() stops the notebook on an HTTP error (4xx/5xx) instead of
#   silently parsing an error page further down.
page = requests.get(url, timeout=30)
page.raise_for_status()

# Parse the HTML so the placement tables can be searched in later cells
soup = BeautifulSoup(page.text, "html.parser")

# Show only the beginning of the document as a sanity check; printing the
# whole page floods the notebook output.
print(soup.prettify()[:1000])

<!DOCTYPE html>
<html dir="ltr" lang="en" prefix="content: http://purl.org/rss/1.0/modules/content/  dc: http://purl.org/dc/terms/  foaf: http://xmlns.com/foaf/0.1/  og: http://ogp.me/ns#  rdfs: http://www.w3.org/2000/01/rdf-schema#  schema: http://schema.org/  sioc: http://rdfs.org/sioc/ns#  sioct: http://rdfs.org/sioc/types#  skos: http://www.w3.org/2004/02/skos/core#  xsd: http://www.w3.org/2001/XMLSchema# ">
 <head>
  <meta charset="utf-8"/>
  <script type="text/javascript">
   (window.NREUM||(NREUM={})).init={privacy:{cookies_enabled:true},ajax:{deny_list:["bam.nr-data.net"]},distributed_tracing:{enabled:true}};(window.NREUM||(NREUM={})).loader_config={agentID:"1588800701",accountID:"3876684",trustKey:"66686",xpid:"Vw4AV1BbDBABUVZbAwUEVFUD",licenseKey:"NRJS-a588e19175fb60800e5",applicationID:"1579443212"};;/*! For license information please see nr-loader-spa-1.274.0.min.js.LICENSE.txt */
  </script>
  <style>
   /* @see https://github.com/aFarkas/lazysizes#broken-image-symbol */.j

In [94]:
# Accumulator for the scraped rows (one dict per table row)
data = []

# NOTE(review): `names` and `placements` are not used by any cell visible in
# this notebook; they are kept only in case something else still reads them.
names = []
placements = []

# Every <table> element on the page
tables = soup.find_all("table")

# Year label for each table, by position: [2024, 2023, ..., 1998].
# This assumes the page lists exactly one table per year, newest first.
years = list(range(2024, 1997, -1))

# The year labels are positional, so a change in the number of tables on the
# page breaks the mapping. Fail early with a clear message when there are more
# tables than labels (the parsing loop would otherwise hit an IndexError), and
# warn when there are fewer.
if len(tables) > len(years):
    raise ValueError(
        f"Found {len(tables)} tables but only {len(years)} year labels; "
        "update `years` to match the page."
    )
if len(tables) < len(years):
    print(
        f"Warning: found {len(tables)} tables but {len(years)} year labels; "
        "check that the year assignment is still correct."
    )

print(len(tables))

27


In [95]:
# Build one record per table row: {'name', 'placement', 'year'}.
#
# Start from an empty list so that re-running this cell rebuilds the records
# instead of appending a second copy of every row to the existing list.
data = []

# The year of a table is taken from its position on the page, so every table
# needs a matching entry in `years`.
if len(tables) > len(years):
    raise ValueError(
        f"Found {len(tables)} tables but only {len(years)} year labels; "
        "update `years` to match the page."
    )

for table, current_year in zip(tables, years):
    for row in table.find_all('tr'):
        tds = row.find_all('td')

        # Rows with fewer than two cells carry no name/placement pair; skip them
        if len(tds) < 2:
            continue

        # The candidate's name sits in a <strong> tag inside the first cell;
        # record None when that tag is absent so the gap can be found later.
        strong_tag = tds[0].find('strong')
        name = strong_tag.get_text().strip() if strong_tag is not None else None

        # Placement text comes from the second cell. A bs4 Tag is always
        # truthy, so a truthiness check on the cell can never select a None
        # fallback; an empty cell simply yields an empty string.
        placement = tds[1].get_text().strip()

        data.append({
            'name': name,
            'placement': placement,
            'year': current_year
        })

In [96]:
# Convert the scraped records to a DataFrame.
# Naming the columns explicitly keeps the expected schema even when the scrape
# returned no rows, so later cells that index df['name'] / df['placement'] /
# df['year'] do not fail with a KeyError on an empty frame.
df = pd.DataFrame(data, columns=['name', 'placement', 'year'])

In [97]:
# Locate rows that have at least one missing value
empty_cells = df.isna()
has_missing = empty_cells.any(axis="columns")
empty_rows = df.loc[has_missing]

# Ideally no row would be flagged; the recorded run reports 6
print("\nRows with empty cells:")
print(len(empty_rows))
print(empty_rows)

# Show the neighbours of one flagged row (positions 287-289; the end of the
# slice is exclusive) to see which part of the page produced the gap
print(df.iloc[287:290])


Rows with empty cells:
6
     name                    placement  year
36   None  Stellation Care, co-founder  2023
188  None                               2013
223  None                               2011
241  None      Northwestern University  2011
242  None                               2010
288  None                               2007
                  name               placement  year
287         Zhao, Ying     CUNY Queens College  2008
288               None                          2007
289  Bachmann, Rudiger  University of Michigan  2007


In [98]:
# Manually fill in names that the scraper could not extract.
#
# Every fix is restricted to rows whose name is actually missing, so a row
# that already has a name is never overwritten, even if its placement happens
# to match one of the conditions below.
name_missing = df['name'].isna()

# The row placed at 'Stellation Care, co-founder' has no name
df.loc[
    name_missing & (df['placement'] == 'Stellation Care, co-founder'),
    'name',
] = 'Zhu, Diana'

# The 2011 Northwestern University row with no name. Several rows can share
# that placement, so the row is additionally identified by the name on the
# row directly above it ('Souza, Priscila').
df.loc[
    name_missing
    & (df['placement'] == 'Northwestern University')
    & (df['year'] == 2011)
    & (df['name'].shift(1) == 'Souza, Priscila'),
    'name',
] = 'Torgovitsky, Alexander'

# The remaining flagged rows (188, 223, 242 and 288 in the recorded run) are
# blank rows between years and are left as they are.

In [99]:
# Derived columns: academic flag, school and department

# A placement counts as academic (1) when its text contains one of these
# words (case-sensitive); rows with a missing placement get 0.
academic_pattern = r'University|College|School'
is_academic = df['placement'].str.contains(academic_pattern, na=False)
df['academic'] = is_academic.astype(int)

# Constant identifiers of the program these placements belong to
df['school'] = 'Yale'
df['department'] = 'Economics'

In [100]:
# Write the cleaned placements to CSV, leaving out the DataFrame index
output_csv = "yale_econ_jm_placements.csv"
df.to_csv(output_csv, index=False)