In [1]:
import json
import pandas as pd

import logging

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
validation_file = "peterson_url_validation_results.json"
with open(validation_file) as f:
    results = json.load(f)

results = pd.DataFrame(results)
logger.info(f"Loaded {len(results)} validation results")

# Filter URLs to scrape
matched = results[results["Overall_Match"]]
name_matched = results[results["Name_Match"] & ~results["Overall_Match"]]
tmp = results[~results["Name_Match"] & ~results["Overall_Match"]]
accepted = tmp[tmp["Location_Match"] & (tmp["Name_Similarity"] > 60)]
rejected = tmp[~tmp["Location_Match"]]
salvage = rejected[rejected["Name_Similarity"] > 90]

to_scrape = pd.concat([matched, name_matched, accepted, salvage]).drop_duplicates(
    subset=["Scraped_Name", "Scraped_Location", "URL"]
)

to_scrape_uni = dict(zip(to_scrape["University"], to_scrape["URL"]))

INFO:__main__:Loaded 1707 validation results


In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json
import time

# ─── SETUP ────────────────────────────────────────────────────────────────
options = Options()
options.add_argument("--headless")
options.add_argument("--window-size=1920,1080")
driver = webdriver.Chrome(options=options)


def scrape_majors_json(name, url):
    driver.get(url)
    # close cookie banner if present
    try:
        close_btn = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button[aria-label='Close']"))
        )
        driver.execute_script("arguments[0].click();", close_btn)
    except:
        pass

    # click Majors & Degrees tab
    WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.LINK_TEXT, "Majors & Degrees"))
    ).click()

    # wait for the majors container
    container = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "div.table-responsive.read-more"))
    )
    time.sleep(0.5)

    # expand all majors via 'See More'
    try:
        toggle = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "a.see-link.more"))
        )
        driver.execute_script("arguments[0].scrollIntoView(true); arguments[0].click();", toggle)
        # wait until rows increase
        WebDriverWait(driver, 5).until(
            lambda d: len(container.find_elements(By.TAG_NAME, "tr")) > 1
        )
    except Exception:
        pass

    # build JSON structure
    result = {"university_name": name, "majors_and_degrees": []}
    current_category = None
    for tr in container.find_elements(By.TAG_NAME, "tr"):
        title = tr.find_element(By.TAG_NAME, "th").text.strip()
        tds = tr.find_elements(By.TAG_NAME, "td")
        # detect category header rows by 'Associate' and 'Bachelors' labels
        if len(tds) >= 2 and \
           tds[0].text.strip().lower() == "associate" and \
           tds[1].text.strip().lower() == "bachelors":
            current_category = title
            result["majors_and_degrees"].append({"category": current_category, "programs": []})
        # program rows
        elif len(tds) >= 2:
            offers_assoc = 'check-teal' in tds[0].get_attribute("innerHTML")
            offers_bach = 'check-teal' in tds[1].get_attribute("innerHTML")
            program = {"name": title, "offers_associate": offers_assoc, "offers_bachelors": offers_bach}
            if not result["majors_and_degrees"]:
                # if no category yet, create one with None
                result["majors_and_degrees"].append({"category": None, "programs": [program]})
            else:
                result["majors_and_degrees"][-1]["programs"].append(program)
    return result


# --- Main execution ---
# provide your university URLs here
universities = to_scrape_uni 

output = []
error_log = []

universities = to_scrape_uni

total_unis = len(universities)
for idx, (name, url) in enumerate(universities.items(), 1):
    print(f"Scraping majors for {name} ({idx}/{total_unis})...")
    try:
        output.append(scrape_majors_json(name, url))
    except Exception as e:
        print(f"Error scraping {name} at index {idx}: {e}")
        error_log.append({"Index": idx, "University Name": name})
        continue  # Skip to the next university

    if idx % 10 == 0 or idx == total_unis:
        # save progress every 10 universities or at the end
        with open("university_majors_full.json", "w", encoding="utf-8") as f:
            json.dump(output, f, indent=2)
        print(f"Progress saved after {idx} universities.")
        # Save error log as Excel file
        if error_log:
            df_errors = pd.DataFrame(error_log)
            df_errors.to_excel("scrape_errors_full.xlsx", index=False)

driver.quit()


Scraping majors for Abilene Christian University (1/1526)...
Scraping majors for Abraham Baldwin Agricultural College (2/1526)...
Scraping majors for Academy of Art University (3/1526)...
Scraping majors for Adams State University (4/1526)...
Scraping majors for Adelphi University (5/1526)...
Scraping majors for Adrian College (6/1526)...
Scraping majors for AdventHealth University (7/1526)...
Scraping majors for Agnes Scott College (8/1526)...
Scraping majors for Endicott College (9/1526)...
Scraping majors for Mount St. Joseph University (10/1526)...
Progress saved after 10 universities.
Scraping majors for Erskine College (11/1526)...
Scraping majors for Eureka College (12/1526)...
Scraping majors for Evangel University (13/1526)...
Scraping majors for Everglades University (14/1526)...
Scraping majors for Fairfield University (15/1526)...
Scraping majors for Fairleigh Dickinson University (16/1526)...
Scraping majors for Fairmont State University (17/1526)...
Scraping majors for Fa

In [24]:
# Re-run failed universities
options = Options()
options.add_argument("--headless")
options.add_argument("--window-size=1920,1080")
driver = webdriver.Chrome(options=options)

error_unis = {key: universities[key] for key in universities.keys() if key in df_errors['University Name'].values}

# rerun those with error

universities = error_unis

# output = []
# reset error log
error_log = []

total_unis = len(universities)
for idx, (name, url) in enumerate(universities.items(), 1):
    print(f"Scraping majors for {name} ({idx}/{total_unis})...")
    try:
        output.append(scrape_majors_json(name, url))
    except Exception as e:
        print(f"Error scraping {name} at index {idx}: {e}")
        error_log.append({"Index": idx, "University Name": name})
        continue  # Skip to the next university

    if idx % 10 == 0 or idx == total_unis:
        # save progress every 10 universities or at the end
        with open("university_majors_2.json", "w", encoding="utf-8") as f:
            json.dump(output, f, indent=2)
        print(f"Progress saved after {idx} universities.")
        # Save error log as Excel file
        if error_log:
            df_errors = pd.DataFrame(error_log)
            df_errors.to_excel("scrape_errors_2.xlsx", index=False)
driver.quit()


Scraping majors for Dharma Realm Buddhist University (1/1)...
Error scraping Dharma Realm Buddhist University at index 1: Message: 
Stacktrace:
	GetHandleVerifier [0x0x7ff72653fe75+79173]
	GetHandleVerifier [0x0x7ff72653fed0+79264]
	(No symbol) [0x0x7ff7262f9e5a]
	(No symbol) [0x0x7ff726350586]
	(No symbol) [0x0x7ff72635083c]
	(No symbol) [0x0x7ff7263a4247]
	(No symbol) [0x0x7ff7263789af]
	(No symbol) [0x0x7ff7263a100d]
	(No symbol) [0x0x7ff726378743]
	(No symbol) [0x0x7ff7263414c1]
	(No symbol) [0x0x7ff726342253]
	GetHandleVerifier [0x0x7ff72680a2ad+3004797]
	GetHandleVerifier [0x0x7ff7268046fd+2981325]
	GetHandleVerifier [0x0x7ff726823350+3107360]
	GetHandleVerifier [0x0x7ff72655a9fe+188622]
	GetHandleVerifier [0x0x7ff72656228f+219487]
	GetHandleVerifier [0x0x7ff726548dc4+115860]
	GetHandleVerifier [0x0x7ff726548f79+116297]
	GetHandleVerifier [0x0x7ff72652f528+11256]
	BaseThreadInitThunk [0x0x7ff9941b7374+20]
	RtlUserThreadStart [0x0x7ff995e1cc91+33]

