In [54]:
# ONLY DO THIS ONCE!
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time

# Fetch the rendered schedule page with headless Chrome and cache it on disk
# so the parsing cell can be re-run without hitting the site again.
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)
try:
    driver.get("https://fbschedules.com/2025-penn-state-football-schedule/")
    time.sleep(5)  # Wait for JS table to render
    page_source = driver.page_source
finally:
    # Always shut the browser down, even if the page load fails; otherwise a
    # headless Chrome process is left running in the background.
    driver.quit()

with open("psu_2025.html", "w", encoding="utf-8") as f:
    f.write(page_source)

In [55]:
import re
import pandas as pd
from bs4 import BeautifulSoup

# Load the cached page from disk
with open("psu_2025.html", "r", encoding="utf-8") as f:
    html = f.read()

soup = BeautifulSoup(html, "html.parser")
table = soup.find("table", class_="cfb-sch")
if table is None:
    # Fail with an actionable message instead of an AttributeError on None.
    raise RuntimeError(
        'No <table class="cfb-sch"> found in psu_2025.html; '
        "increase the Selenium wait time and re-fetch the page."
    )

# Keep only the rows whose CSS class is exactly six digits.
rows = table.find_all("tr", class_=re.compile(r"^\d{6}$"))

# The order of columns based on fbschedules.com is likely:
# Date | Opponent | Location | Result | Record | TV | Time
# NOTE(review): this positional mapping is a guess; confirm against the page.
headers = ["Date", "Opponent", "Location", "Result", "Record", "TV", "Time"]

games = []
for row in rows:
    cols = row.find_all("td")
    # Map cells to headers by position; rows with fewer cells than headers
    # are padded with empty strings, extra cells are ignored.
    game = {
        header: cols[i].get_text(strip=True) if i < len(cols) else ""
        for i, header in enumerate(headers)
    }
    games.append(game)

def split_opponent_venue(raw):
    """Break a fused opponent/venue cell into its components.

    The input is a single string such as
    ``"at5Ohio State BuckeyesOhio Stadium, Columbus, OH"``.

    Returns a 4-tuple ``(location_type, opponent, venue, city_state)``:

    * ``location_type`` -- ``"at"`` or ``"vs"`` when the text starts with
      one of those prefixes, otherwise ``""``.
    * ``opponent`` -- text before the first point where a lowercase letter
      is immediately followed by an uppercase letter (the whole cleaned
      text when no such point exists).
    * ``venue`` -- text after that point, up to the first comma.
    * ``city_state`` -- everything after that first comma.
    """
    prefixed = re.match(r'^(at|vs)\s*(\d+)?\s*(.*)', raw)
    if prefixed is None:
        location_type = ""
        # No "at"/"vs" prefix: only a leading rank number may need dropping,
        # e.g. "17Indiana Hoosiers" -> "Indiana Hoosiers".
        remainder = re.sub(r'^\d+\s*', '', raw)
    else:
        # The optional rank (group 2) is discarded on purpose.
        location_type, remainder = prefixed.group(1), prefixed.group(3)

    # Drop parenthetical notes such as "(HC)".
    remainder = re.sub(r'\([^)]*\)', '', remainder).strip()

    # The team name and the venue are glued together without a separator, so
    # cut at the first lowercase letter directly followed by an uppercase one.
    boundary = re.search(r'([a-z])([A-Z])', remainder)
    if boundary is None:
        opponent, venue_city = remainder, ""
    else:
        cut = boundary.start(2)
        opponent = remainder[:cut].strip()
        venue_city = remainder[cut:].strip()

    # Venue is everything before the first comma; the rest is city/state.
    venue, _, city_state = venue_city.partition(",")
    return location_type, opponent, venue.strip(), city_state.strip()

# Build one record per schedule row, splitting the fused opponent/venue and
# time/TV cells into separate fields, then load the records into a DataFrame.
games = []
for row in rows:
    cols = [td.get_text(strip=True) for td in row.find_all("td")]
    if len(cols) < 5:
        # Not enough cells to hold date, opponent, time/TV and result.
        continue

    date = cols[0]
    raw = cols[2]
    time_tv = cols[3]
    result = cols[4]

    # BYE week
    if raw == "OFF":
        games.append({
            "Date": date,
            "Opponent": "BYE",
            "Location_Type": "",
            "Venue": "",
            "City_State": "",
            "Time": "",
            "TV": "",
            "Result": result
        })
        continue

    location_type, opponent, venue, city_state = split_opponent_venue(raw)

    # Parse time and TV: everything up to and including the first "ET"/"CT"
    # is the kickoff time, whatever follows is the TV network.
    # The local is named `kickoff` rather than `time` so it does not shadow
    # the `time` module imported by the fetch cell.
    tv_match = re.match(r'^(.+?[EC]T)(.*)$', time_tv)
    if tv_match:
        kickoff = tv_match.group(1).strip()
        tv = tv_match.group(2).strip()
    else:
        # No recognizable time zone: keep the raw text as the time.
        kickoff, tv = time_tv, ''

    games.append({
        "Date": date,
        "Opponent": opponent,
        "Location_Type": location_type,
        "Venue": venue,
        "City_State": city_state,
        "Time": kickoff,
        "TV": tv,
        "Result": result
    })

df = pd.DataFrame(games)
print(df)


               Date                 Opponent Location_Type              Venue  \
0   SaturdayAug. 30         Nevada Wolf Pack                   Beaver Stadium   
1    SaturdaySep. 6             FIU Panthers                   Beaver Stadium   
2   SaturdaySep. 13       Villanova Wildcats                   Beaver Stadium   
3   SaturdaySep. 20                      BYE                                    
4   SaturdaySep. 27             Oregon Ducks                   Beaver Stadium   
5    SaturdayOct. 4              UCLA Bruins            at  Rose Bowl Stadium   
6   SaturdayOct. 11    Northwestern Wildcats                   Beaver Stadium   
7   SaturdayOct. 18            Iowa Hawkeyes            at    Kinnick Stadium   
8   SaturdayOct. 25                      BYE                                    
9    SaturdayNov. 1      Ohio State Buckeyes            at       Ohio Stadium   
10   SaturdayNov. 8         Indiana Hoosiers                   Beaver Stadium   
11  SaturdayNov. 15  Michiga