In [1]:
import pathlib, time, duckdb, pandas as pd, requests
from random import randint


START, END  = 2000, 2024
RAW_REG_DIR = pathlib.Path("data/raw/regular")
RAW_PO_DIR  = pathlib.Path("data/raw/playoffs")
DB_FILE     = pathlib.Path("data/processed/nba.duckdb")


SLEEP = 1.0       # seconds between requests (politeness)


In [2]:
def html_url(year: int, playoffs: bool) -> str:
    return (
        f"https://www.basketball-reference.com/playoffs/NBA_{year}_advanced.html"
        if playoffs
        else f"https://www.basketball-reference.com/leagues/NBA_{year}_advanced.html"
    )

def fetch_advanced(year: int, playoffs: bool) -> pd.DataFrame | None:
    url = html_url(year, playoffs)
    print("↳", url)
    try:
        # pandas uses urllib under the hood—this has already succeeded for you
        df = pd.read_html(url, match="Advanced")[0]
    except ValueError:
        print(f"   ⚠️  advanced table missing for {year} ({'PO' if playoffs else 'RS'})")
        return None

    # drop the extra header rows that repeat "Rk"
    df = df[df["Rk"] != "Rk"].copy()
    df["Season"]   = year
    df["Playoffs"] = playoffs
    time.sleep(SLEEP)
    return df


In [3]:
for yr in range(START, END + 1):
    for po in (False, True):
        df = fetch_advanced(yr, po)
        if df is None:       # skip missing seasons
            continue
        target_dir = RAW_PO_DIR if po else RAW_REG_DIR
        target_dir.mkdir(parents=True, exist_ok=True)
        df.to_csv(target_dir / f"{yr}.csv", index=False)
        print(f"   ✓ saved {yr} {'PO' if po else 'RS'} ({len(df)} rows)")
        time.sleep(SLEEP)
print("✅ All CSVs saved")



↳ https://www.basketball-reference.com/leagues/NBA_2000_advanced.html
   ✓ saved 2000 RS (497 rows)
↳ https://www.basketball-reference.com/playoffs/NBA_2000_advanced.html
   ✓ saved 2000 PO (181 rows)
↳ https://www.basketball-reference.com/leagues/NBA_2001_advanced.html
   ✓ saved 2001 RS (538 rows)
↳ https://www.basketball-reference.com/playoffs/NBA_2001_advanced.html
   ✓ saved 2001 PO (190 rows)
↳ https://www.basketball-reference.com/leagues/NBA_2002_advanced.html
   ✓ saved 2002 RS (501 rows)
↳ https://www.basketball-reference.com/playoffs/NBA_2002_advanced.html
   ✓ saved 2002 PO (186 rows)
↳ https://www.basketball-reference.com/leagues/NBA_2003_advanced.html
   ✓ saved 2003 RS (484 rows)
↳ https://www.basketball-reference.com/playoffs/NBA_2003_advanced.html
   ✓ saved 2003 PO (187 rows)
↳ https://www.basketball-reference.com/leagues/NBA_2004_advanced.html
   ✓ saved 2004 RS (586 rows)
↳ https://www.basketball-reference.com/playoffs/NBA_2004_advanced.html
   ✓ saved 2004 PO (189 r

In [4]:
con = duckdb.connect(DB_FILE)

con.sql("DROP TABLE IF EXISTS adv_regular;")
con.sql("DROP TABLE IF EXISTS adv_playoffs;")

con.sql("""CREATE TABLE adv_regular AS
           SELECT * FROM read_csv_auto('data/raw/regular/*.csv');""")

con.sql("""CREATE TABLE adv_playoffs AS
           SELECT * FROM read_csv_auto('data/raw/playoffs/*.csv');""")

reg_rows = con.sql("SELECT COUNT(*) AS n FROM adv_regular").fetchone()[0]
po_rows  = con.sql("SELECT COUNT(*) AS n FROM adv_playoffs").fetchone()[0]
print(f"Rows loaded → {reg_rows} regular | {po_rows} playoff")

con.close()


Rows loaded → 15183 regular | 5082 playoff


In [5]:
con = duckdb.connect(DB_FILE)
con.sql("""
CREATE OR REPLACE VIEW v_player_career AS
WITH reg AS (
  SELECT Player,
         SUM(WS)       AS reg_ws,
         SUM(MP)       AS reg_mp
  FROM adv_regular
  GROUP BY 1
),
po AS (
  SELECT Player,
         SUM(WS) AS po_ws,
         SUM(MP) AS po_mp
  FROM adv_playoffs
  GROUP BY 1
)
SELECT
  r.Player,
  (po_ws / NULLIF(po_mp,0))*48  AS po_ws48,
  (reg_ws/ NULLIF(reg_mp,0))*48 AS reg_ws48,
  (po_ws / NULLIF(po_mp,0))*48 - (reg_ws/NULLIF(reg_mp,0))*48 AS ws48_diff
FROM reg r JOIN po USING(Player);
""")
print("View created ➜ v_player_career")
con.close()



View created ➜ v_player_career


In [6]:
con = duckdb.connect(DB_FILE)

top15 = con.sql("""SELECT Player, ROUND(ws48_diff,3) AS diff
                   FROM v_player_career
                   ORDER BY diff DESC
                   LIMIT 15""").df()
bottom15 = con.sql("""SELECT Player, ROUND(ws48_diff,3) AS diff
                      FROM v_player_career
                      ORDER BY diff ASC
                      LIMIT 15""").df()

display(top15)
display(bottom15)

con.close()


Unnamed: 0,Player,diff
0,Jordan McRae,2.348
1,Vassilis Spanoulis,1.618
2,Malcolm Delaney,1.577
3,Anthony Roberson,1.161
4,Kevin Knox,1.096
5,Sundiata Gaines,0.647
6,Blake Ahearn,0.621
7,Dalano Banton,0.572
8,Terrel Harris,0.569
9,Elliot Perry,0.513


Unnamed: 0,Player,diff
0,Harry Giles,-1.275
1,John Wallace,-1.251
2,Tracy Murray,-1.031
3,Cassius Winston,-0.995
4,Stephen Graham,-0.986
5,David Duke Jr.,-0.983
6,Phil Pressey,-0.975
7,D.J. Strawberry,-0.924
8,Wayne Simien,-0.761
9,Kyle O'Quinn,-0.728
