In [1]:
import nflreadpy as nfl
import pandas as pd

# Load rosters using nflreadpy's default season selection, as a pandas frame.
rosters = nfl.load_rosters().to_pandas()

# Narrow to quarterbacks, then collect every non-null pfr_id as a plain list.
qbs = rosters.loc[rosters["position"].eq("QB")]
active_qb_ids = (
    qbs["pfr_id"]
    .dropna()
    .tolist()
)

In [2]:
# Peek at the first collected id to confirm the list is populated.
active_qb_ids[0]

'RivePh00'

In [81]:
# Rosters for seasons 2010 through 2025, narrowed to quarterbacks.
full_rosters = nfl.load_rosters(range(2010, 2026)).to_pandas()
qbs_since_2010 = full_rosters.loc[full_rosters["position"].eq("QB")]

# Collapse to one row per gsis_id, taking the first available value of each
# descriptive column within the group.
qb_roster_unique = qbs_since_2010.groupby("gsis_id", as_index=False).agg(
    pfr_id=("pfr_id", "first"),
    full_name=("full_name", "first"),
    position=("position", "first"),
)

# Player stats for seasons 2010 through 2024 at the "reg" summary level.
all_stats = nfl.load_player_stats(range(2010, 2025), "reg").to_pandas()

# Keep only stat rows whose player_id matches a quarterback's gsis_id.
qb_stats = all_stats.merge(
    qb_roster_unique[["pfr_id", "gsis_id"]],
    how="inner",
    left_on="player_id",
    right_on="gsis_id",
)

# Total passing yards per (player_id, pfr_id) pair across all loaded seasons.
qb_totals = qb_stats.groupby(["player_id", "pfr_id"], as_index=False).agg(
    passing_yards=("passing_yards", "sum"),
)

# Quarterbacks with at least 2000 total passing yards, and their unique pfr ids.
valid_qbs = qb_totals.loc[qb_totals["passing_yards"].ge(2000)]
valid_players = (
    valid_qbs["pfr_id"]
    .dropna()
    .unique()
    .tolist()
)

In [82]:
# Attach roster details to each qualifying quarterback, then drop the join keys
# and the yardage total so only pfr_id, full_name and position remain.
final_qbs = (
    valid_qbs
    .merge(qb_roster_unique, how="left", left_on="pfr_id", right_on="pfr_id")
    .drop(columns=["player_id", "gsis_id", "passing_yards"])
)

In [102]:
# Number of quarterbacks that passed the yardage filter (and will be scraped).
len(valid_players)

122

In [104]:
# Spot-check: look up a single known quarterback by full name.
final_qbs.loc[final_qbs["full_name"].eq("Philip Rivers")]

Unnamed: 0,pfr_id,full_name,position
11,RivePh00,Philip Rivers,QB


In [3]:
import requests
from bs4 import BeautifulSoup
import time

In [72]:
from io import StringIO

BASE_URL = "https://www.pro-football-reference.com/players"

def scrape_qb(pfr_slug, timeout=30):
    """Fetch a player's season-by-season passing table from Pro-Football-Reference.

    Parameters
    ----------
    pfr_slug : str
        Player id as it appears in the page URL (e.g. "RivePh00"). Its first
        character selects the directory under ``BASE_URL``.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up, so a stalled
        connection cannot hang the whole scrape.

    Returns
    -------
    pandas.DataFrame or None
        One row per table row whose ``Season`` cell is exactly four digits,
        restricted to a fixed set of columns (missing ones are added empty).
        Every column except ``Team``, ``Pos`` and ``Awards`` is converted to a
        numeric dtype, with unparseable cells becoming NaN.
        ``None`` when the page does not exist (HTTP 404) or contains no table
        with id ``"passing"``.

    Raises
    ------
    requests.HTTPError
        For any non-404 error status (e.g. 429 rate limiting, 5xx). Without
        this check an error page would be parsed, no table would be found, and
        the player would be skipped silently.
    """
    url = f"{BASE_URL}/{pfr_slug[0]}/{pfr_slug}.htm"

    res = requests.get(url, timeout=timeout)
    if res.status_code == 404:
        # Unknown player page: same "nothing to scrape" result as a missing table.
        return None
    res.raise_for_status()

    soup = BeautifulSoup(res.text, "html.parser")
    table = soup.find("table", {"id": "passing"})
    if table is None:
        return None

    df = pd.read_html(StringIO(str(table)))[0]

    # Keep only rows whose Season is a bare four-digit year; any other row
    # (presumably repeated headers or summary lines) is dropped.
    is_season_row = df["Season"].astype(str).str.match(r"^\d{4}$", na=False)
    cleaned = df[is_season_row].copy()

    wanted_cols = [
        "Season", "Age", "Team", "Pos", "G", "GS", "Cmp", "Att", "Cmp%",
        "Yds", "TD", "Int", "Y/A", "Rate", "QBR", "AV", "Awards"
    ]

    # Guarantee a stable schema even when the page lacks some columns.
    for col in wanted_cols:
        if col not in cleaned.columns:
            cleaned[col] = None

    final = cleaned[wanted_cols].copy()

    # Assign whole columns (not ``.loc[:, cols] = ...``): pandas >= 2 writes
    # ``.loc`` assignments in place, which would leave these columns as
    # ``object`` dtype instead of adopting the converted numeric dtype.
    numeric_cols = final.columns.difference(["Team", "Pos", "Awards"])
    final[numeric_cols] = final[numeric_cols].apply(pd.to_numeric, errors="coerce")

    return final

In [17]:
# Distinct (pfr_id, full_name, position) combinations for the quarterbacks in `qbs`.
qb_meta = qbs.loc[:, ["pfr_id", "full_name", "position"]].drop_duplicates()
qb_meta

Unnamed: 0,pfr_id,full_name,position
1,RivePh00,Philip Rivers,QB
2,RodgAa00,Aaron Rodgers,QB
7,FlacJo00,Joe Flacco,QB
9,JohnJo05,Josh Johnson,QB
10,StafMa00,Matthew Stafford,QB
...,...,...,...
3033,MilrJa00,Jalen Milroe,QB
3035,WardCa00,Cam Ward,QB
3045,DartJa00,Jaxson Dart,QB
3052,GabrDi00,Dillon Gabriel,QB


In [91]:
import sqlite3

In [None]:
# Open the SQLite database file and create the cursor used by the cells that follow.
DB_PATH = "/data/football_wordle.db"
conn = sqlite3.connect(DB_PATH)
cur = conn.cursor()

In [95]:
# Empty both tables so the load cells start from a clean slate.
# passing_seasons is cleared first because its rows reference players(id).
# On a fresh database neither table exists yet, and an unconditional DELETE
# would raise "no such table" — so only delete from the tables that are present.
existing_tables = {
    name
    for (name,) in cur.execute("SELECT name FROM sqlite_master WHERE type = 'table'")
}
for table_name in ("passing_seasons", "players"):
    if table_name in existing_tables:
        # table_name comes from the fixed tuple above, never from user input.
        cur.execute(f"DELETE FROM {table_name};")
conn.commit()

In [96]:
# Schema for the player lookup table. pfr_id is UNIQUE, which is what lets the
# loader use INSERT OR IGNORE without creating duplicate players.
create_players_sql = """
CREATE TABLE IF NOT EXISTS players (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    pfr_id TEXT UNIQUE,
    name TEXT,
    position TEXT
)
"""
cur.execute(create_players_sql)

<sqlite3.Cursor at 0x2c804e5c0>

In [97]:
# Bulk-insert one (pfr_id, name, position) tuple per quarterback; rows whose
# pfr_id already exists are skipped by INSERT OR IGNORE.
player_rows = final_qbs[["pfr_id", "full_name", "position"]].itertuples(
    index=False, name=None
)
cur.executemany(
        """
        INSERT OR IGNORE INTO players (pfr_id, name, position)
        VALUES (?, ?, ?)
        """,
        player_rows,
)

conn.commit()

In [100]:
# Schema for per-season passing lines; each row points at its owner through
# player_id, a foreign key to players(id).
create_passing_seasons_sql = """
CREATE TABLE IF NOT EXISTS passing_seasons (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    player_id INTEGER,
    season INTEGER,
    age INTEGER,
    team TEXT,
    games INTEGER,
    games_started INTEGER,
    completions INTEGER,
    attempts INTEGER,
    completion_pct REAL,
    yards INTEGER,
    touchdowns INTEGER,
    interceptions INTEGER,
    yards_per_attempt REAL,
    passer_rating REAL,
    qbr REAL,
    av REAL,
    awards TEXT,
    FOREIGN KEY(player_id) REFERENCES players(id)
)
"""
cur.execute(create_passing_seasons_sql)

<sqlite3.Cursor at 0x2c804e5c0>

In [105]:
# Seconds to pause between page requests so the scrape does not hammer the site.
REQUEST_DELAY_SECONDS = 7

# Scraped columns, in the same order as the INSERT column list below
# (player_id is prepended to each row).
SEASON_COLUMNS = [
    "Season", "Age", "Team", "G", "GS",
    "Cmp", "Att", "Cmp%", "Yds",
    "TD", "Int", "Y/A",
    "Rate", "QBR", "AV", "Awards",
]

for position, pfr_id in enumerate(valid_players):
    # Throttle before every request after the first. Sleeping here, rather than
    # at the end of the loop body, means the skip paths below are throttled too:
    # a run of players without a table can no longer fire requests back to back.
    if position:
        time.sleep(REQUEST_DELAY_SECONDS)

    seasons = scrape_qb(pfr_id)
    if seasons is None or seasons.empty:
        print(f"No passing seasons found for {pfr_id}; skipping")
        continue

    player_row = cur.execute(
        "SELECT id FROM players WHERE pfr_id = ?",
        (pfr_id,)
    ).fetchone()
    if player_row is None:
        # fetchone() returns None for an unknown pfr_id; indexing it would raise TypeError.
        print(f"No players row for {pfr_id}; skipping")
        continue
    player_id = player_row[0]

    rows = [
        (player_id, *season)
        for season in seasons[SEASON_COLUMNS].itertuples(index=False, name=None)
    ]

    # Replace any rows left over from an earlier (possibly interrupted) run so
    # that re-running this cell never duplicates a player's seasons.
    cur.execute("DELETE FROM passing_seasons WHERE player_id = ?", (player_id,))
    cur.executemany(
        """
        INSERT INTO passing_seasons (
            player_id, season, age, team, games, games_started,
            completions, attempts, completion_pct, yards,
            touchdowns, interceptions, yards_per_attempt,
            passer_rating, qbr, av, awards
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """,
        rows
    )

    # Commit per player so progress survives an interruption mid-scrape.
    conn.commit()
    print(f"Inserted {len(rows)} seasons for {pfr_id}")


Inserted 17 seasons for HassMa00
Inserted 18 seasons for MannPe00
Inserted 23 seasons for BradTo00
Inserted 20 seasons for BreeDr00
Inserted 17 seasons for McCoJo01
Inserted 9 seasons for GarrDa00
Inserted 14 seasons for PalmCa00
Inserted 9 seasons for GrosRe00
Inserted 16 seasons for SchaMa00
Inserted 16 seasons for MannEl00
Inserted 18 seasons for RoetBe00
Inserted 22 seasons for RivePh00
Inserted 16 seasons for SmitAl03
Inserted 21 seasons for RodgAa00
Inserted 9 seasons for CampJa00
Inserted 11 seasons for OrtoKy00
Inserted 13 seasons for AndeDe00
Inserted 16 seasons for CassMa00
Inserted 17 seasons for FitzRy00
Inserted 6 seasons for YounVi00
Inserted 12 seasons for CutlJa00
Inserted 12 seasons for ClemKe00
Inserted 9 seasons for JackTa00
Inserted 7 seasons for WhitCh02
Inserted 6 seasons for KolbKe00
Inserted 7 seasons for StanDr00
Inserted 15 seasons for RyanMa00
Inserted 20 seasons for FlacJo00
Inserted 13 seasons for HennCh01
Inserted 9 seasons for FlynMa00
Inserted 17 seasons

In [None]:
def get_random_qb_career(db_path="/data/football_wordle.db"):
    """Pick one random player that has passing seasons and return their career.

    Parameters
    ----------
    db_path : str, optional
        Path of the SQLite database to read. Defaults to the notebook's
        database file, so existing zero-argument calls behave as before.

    Returns
    -------
    tuple
        ``(name, seasons)`` where ``name`` is the player's name and ``seasons``
        is a DataFrame of that player's ``passing_seasons`` rows ordered by
        season.

    Raises
    ------
    IndexError
        If no player in the database has any passing seasons.
    """
    conn = sqlite3.connect(db_path)
    try:
        # DISTINCT collapses the join to one row per player before the random
        # ordering, so each player is equally likely regardless of career length.
        candidates = pd.read_sql("""
        SELECT DISTINCT p.id, p.name
        FROM players p
        JOIN passing_seasons ps
        ON p.id = ps.player_id
        ORDER BY RANDOM()
        LIMIT 1;
    """, conn)

        if candidates.empty:
            raise IndexError("no player with passing seasons found in the database")
        player = candidates.iloc[0]

        seasons = pd.read_sql("""
        SELECT *
        FROM passing_seasons
        WHERE player_id = ?
        ORDER BY season
    """, conn, params=(int(player["id"]),))
    finally:
        # Always release the connection, even when a query fails.
        conn.close()

    return player['name'], seasons


In [107]:
# Draw one random career from the database and show which player was picked.
name, df = get_random_qb_career()
name

'Sam Howell'

In [108]:
# Display the season rows returned for the player drawn above.
df

Unnamed: 0,id,player_id,season,age,team,games,games_started,completions,attempts,completion_pct,yards,touchdowns,interceptions,yards_per_attempt,passer_rating,qbr,av,awards
0,1605,371,2022,22,WAS,1,1,11,19,57.9,169,1,1,8.9,83.0,48.3,1.0,
1,1606,371,2023,23,WAS,17,17,388,612,63.4,3946,21,21,6.4,78.9,43.2,11.0,
2,1607,371,2024,24,SEA,2,0,5,14,35.7,24,0,1,1.7,14.6,2.9,0.0,


In [109]:
def qb_career_to_payload():
    """Build a plain-dict payload for one randomly chosen quarterback career.

    Returns
    -------
    dict
        ``{"player_name": <name>, "seasons": [<one dict per season row>]}``.
        Missing values are emitted as ``None`` rather than NaN.
    """
    name, df = get_random_qb_career()

    # SQL NULLs come back from pandas as NaN in numeric columns. NaN is not
    # valid in strict JSON, so normalise every missing value to None.
    # NOTE(review): assumes this payload is serialized as JSON — confirm with the consumer.
    seasons = [
        {key: (None if pd.isna(value) else value) for key, value in record.items()}
        for record in df.to_dict(orient="records")
    ]

    return {
        "player_name": name,
        "seasons": seasons
    }

In [110]:
# Example call: build and display the payload for a randomly drawn quarterback.
qb_career_to_payload()

{'player_name': 'Russell Wilson',
 'seasons': [{'id': 1152,
   'player_id': 309,
   'season': 2012,
   'age': 24,
   'team': 'SEA',
   'games': 16,
   'games_started': 16,
   'completions': 252,
   'attempts': 393,
   'completion_pct': 64.1,
   'yards': 3118,
   'touchdowns': 26,
   'interceptions': 10,
   'yards_per_attempt': 7.9,
   'passer_rating': 100.0,
   'qbr': 72.7,
   'av': 16.0,
   'awards': 'PB,AP ORoY-3'},
  {'id': 1153,
   'player_id': 309,
   'season': 2013,
   'age': 25,
   'team': 'SEA',
   'games': 16,
   'games_started': 16,
   'completions': 257,
   'attempts': 407,
   'completion_pct': 63.1,
   'yards': 3357,
   'touchdowns': 26,
   'interceptions': 9,
   'yards_per_attempt': 8.2,
   'passer_rating': 101.2,
   'qbr': 67.2,
   'av': 17.0,
   'awards': 'PB,AP OPoY-4'},
  {'id': 1154,
   'player_id': 309,
   'season': 2014,
   'age': 26,
   'team': 'SEA',
   'games': 16,
   'games_started': 16,
   'completions': 285,
   'attempts': 452,
   'completion_pct': 63.1,
   'y