# Air Force Academy Roster Scraper

This notebook scrapes the Air Force Academy hockey roster page, extracts player names, positions, and other details, and assembles the data into a pandas DataFrame.

In [1]:
# Import Required Libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Headers for HTTP requests
headers = {"User-Agent": "Mozilla/5.0"}

In [4]:
# Fetch and Parse the Webpage

# First try the NCAA league team listing to obtain the Air Force link
league_url = "https://www.eliteprospects.com/league/ncaa"
resp = requests.get(league_url, headers=headers)
try:
    resp.raise_for_status()
    soup = BeautifulSoup(resp.content, "html.parser")
    # look for the Air Force Academy link
    team_link = None
    for a in soup.select("a.TextLink_link__RhSiC"):
        if a.text.strip().lower() == "air force academy":
            team_link = a.get('href')
            break
    if team_link:
        roster_url = "https://www.eliteprospects.com" + team_link.rstrip("/") + "/roster"
    else:
        # fallback to a hardcoded id or search
        roster_url = "https://www.eliteprospects.com/team/2453/air-force-academy/roster"
    print("Using roster URL:", roster_url)
    resp = requests.get(roster_url, headers=headers)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.content, "html.parser")
    print("Page fetched and parsed")
except Exception as e:
    print("Error fetching roster page:", e)
    # leave soup undefined, subsequent cells may fail


Using roster URL: https://www.eliteprospects.com/team/2453/air-force-academy/roster
Error fetching roster page: 404 Client Error: Not Found for url: https://www.eliteprospects.com/team/2453/air-force-academy/roster


In [3]:
# If the direct URL fails, perform a search for the team page
search_url = "https://www.eliteprospects.com/search?q=Air+Force+Academy"
resp2 = requests.get(search_url, headers=headers)
resp2.raise_for_status()
soup2 = BeautifulSoup(resp2.content, "html.parser")

team_url = None
for a in soup2.select("a[href*='/team/']"):
    href = a.get('href')
    text = a.get_text(strip=True)
    if 'air' in text.lower() and 'force' in text.lower():
        team_url = 'https://www.eliteprospects.com' + href
        break

print("Search found team URL:", team_url)

# if team_url is found, append /roster if not present
if team_url and 'roster' not in team_url:
    team_url = team_url.rstrip('/') + '/roster'
    print("Roster URL constructed:", team_url)

# try fetching again if we have team_url
if team_url:
    resp = requests.get(team_url, headers=headers)
    if resp.status_code == 200:
        soup = BeautifulSoup(resp.content, "html.parser")
        print("Page fetched and parsed via search")
    else:
        print("Failed to fetch roster at", team_url, "status", resp.status_code)
else:
    print("Could not determine team URL")

HTTPError: 404 Client Error: Not Found for url: https://www.eliteprospects.com/search?q=Air+Force+Academy

In [None]:
# Extract Player Information
players = []
for a in soup.select("a.TextLink_link__RhSiC"):
    name = a.get_text(strip=True)
    href = a.get("href")
    players.append({"name": name, "url": href})

print(f"Found {len(players)} player link elements")

In [None]:
# Filter Players by Position
for p in players:
    m = re.search(r"\((G|D|F)\)", p["name"])
    p["position"] = m.group(1) if m else ""

print("Positions extracted")

In [None]:
# Create a DataFrame and Display Results
df = pd.DataFrame(players)
df.head(10)

In [None]:
# Validate Data Collection
print("Total players scraped:", len(df))
print(df["position"].value_counts())