In [43]:
# Diamond Dashboard (MLB) — Notebook Utilities
# ------------------------------------------------
# This notebook collects MLB standings, schedules, box scores, and highlights.
# Outputs are written to ./out as JSON/CSV for easy reuse.
#
# How to run locally:
# 1) Python 3.11+ recommended
# 2) pip install -r requirements.txt  (bs4, lxml, requests, pandas, python-dateutil)
# 3) Set your OpenWeather API key in the environment:
#       export OWM_API_KEY="insert-your-key-here"
#    (For GitHub, DO NOT commit real keys. Use env vars or .env that isn't checked in.)
# 4) Run cells in order. Outputs go to ./out
#
# Notes:
# - This code avoids hard-coding secrets. If no env var is found, it uses the placeholder string.
# - Timezones: ESPN dates are UTC; display conversions use America/New_York.

from __future__ import annotations
from pathlib import Path
from datetime import datetime, timezone
from zoneinfo import ZoneInfo
import os, json, time, re, glob, math, random
import requests
import pandas as pd
from bs4 import BeautifulSoup

# --- Paths
# All generated JSON files are written under ./out (relative to the working
# directory). The directory is created on first run; exist_ok avoids an error
# when the cell is re-run.
OUT_DIR = Path("out")
OUT_DIR.mkdir(exist_ok=True)

# --- Time & tz
# US Eastern time zone, used by now_et() and for formatting game dates.
ET = ZoneInfo("America/New_York")

def utc_stamp() -> str:
    """Return the current UTC time as a compact, filename-safe stamp.

    Format is ``YYYYMMDDTHHMMSSZ`` (e.g. ``20251014T071234Z``), which sorts
    lexicographically in chronological order.
    """
    now_utc = datetime.now(tz=timezone.utc)
    return f"{now_utc:%Y%m%dT%H%M%SZ}"

def now_et() -> datetime:
    """Return the current time as an aware datetime in the ET zone."""
    return datetime.now(tz=ET)

# --- Logging
def log(msg: str) -> None:
    """Print ``msg`` prefixed with the current UTC time in square brackets.

    The prefix looks like ``[2025-10-14T07:12:34Z]``.
    """
    stamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    print(f"[{stamp}] {msg}")

# --- HTTP helpers
# Default request headers: a desktop-browser User-Agent string. fetch() copies
# this dict and layers any caller-supplied headers on top of it.
DEFAULT_UA = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

def fetch(url: str, timeout: int = 20, headers: dict | None = None) -> requests.Response | None:
    """GET ``url`` with the default User-Agent and up to three attempts.

    Args:
        url: Address to request.
        timeout: Per-attempt timeout in seconds, passed to ``requests.get``.
        headers: Optional extra headers; they override the defaults on clash.

    Returns:
        The response when the request succeeds or the server answers 404
        (returned as-is so the caller can decide what to do), or ``None`` when
        all three attempts raise a ``requests.RequestException``.
    """
    merged_headers = {**DEFAULT_UA, **(headers or {})}
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            resp = requests.get(url, headers=merged_headers, timeout=timeout)
            if resp.status_code != 404:
                # Any other 4xx/5xx becomes an exception and triggers a retry.
                resp.raise_for_status()
            return resp
        except requests.RequestException:
            out_of_attempts = attempt == max_attempts - 1
            if out_of_attempts:
                return None
            # Linear back-off: 0.4s after the first failure, 1.0s after the second.
            time.sleep(0.4 + attempt * 0.6)
    return None

def latest_out_file(prefix: str) -> str | None:
    """Return the newest ``{prefix}_*.json`` path under OUT_DIR, or None.

    "Newest" means the lexicographically greatest path, which matches
    chronological order for the timestamped names produced by save_json().
    """
    pattern = str(OUT_DIR / f"{prefix}_*.json")
    candidates = glob.glob(pattern)
    if not candidates:
        return None
    return max(candidates)

def save_json(prefix: str, obj) -> Path:
    """Serialize ``obj`` to ``OUT_DIR/{prefix}_{utc stamp}.json``.

    The file is written as UTF-8 with a two-space indent and non-ASCII
    characters kept verbatim. A "Wrote ..." line is logged afterwards.

    Returns:
        Path of the file that was written.
    """
    target = OUT_DIR / f"{prefix}_{utc_stamp()}.json"
    with target.open("w", encoding="utf-8") as handle:
        json.dump(obj, handle, ensure_ascii=False, indent=2)
    log(f"Wrote {target}")
    return target

def iso_to_dt(s: str) -> datetime | None:
    """Parse ISO timestamps that may end with 'Z' into aware UTC datetimes."""
    if not s:
        return None
    try:
        return datetime.fromisoformat(s.replace("Z", "+00:00"))
    except Exception:
        return None

# --- OpenWeather
# API key is read from the OWM_API_KEY environment variable. When the variable
# is unset the placeholder string below is used; fetch_openweather_at_game_start()
# compares against that exact placeholder to decide whether weather is enabled.
OWM_API_KEY = os.getenv("OWM_API_KEY", "INSERT_YOUR_OPENWEATHER_API_KEY")  # placeholder only; never commit a real key


In [40]:
# Standings (rich) — League/Division with team rows + stats merged
# Scrapes https://www.espn.com/mlb/standings (no interaction needed).
# Output: out/standings_rich_*.json

def _clean_text(s):
    return re.sub(r"\s+", " ", (s or "").strip())

def _abbr_from_logo_img(img):
    if not img: return ""
    for attr in ("alt", "title"):
        v = img.get(attr)
        if v and len(v) <= 4:
            return v.strip().upper()
    return ""

def _team_code_from_href(href):
    if not href: return ""
    m = re.search(r"/mlb/team/_/name/([^/]+)/", href)
    return m.group(1) if m else ""

# Maps the short marker scraped next to a team name to a readable description.
# parse_standings_rich() looks markers up in lower case and passes any marker
# that is not listed here through unchanged.
# NOTE(review): the descriptions presumably mirror the legend on ESPN's
# standings page — confirm against the live page.
SEED_FLAG_MAP = {
    "x": "clinched a postseason berth",
    "y": "clinched division",
    "z": "best league record",
    "e": "eliminated",
    "*": "wild card",
}

def parse_standings_rich(html: str):
    """Parse ESPN's MLB standings page into ``{league: {division: [team rows]}}``.

    Each league block on the page holds two parallel tables: a fixed left table
    (division header rows and team-name rows) and a scrolling right table (one
    stats row per team). Team rows and stats rows are paired by position.

    Args:
        html: Raw HTML of the standings page.

    Returns:
        A dict keyed by league title, then division name, whose values are lists
        of row dicts with ``team``, ``abbr``, ``seed_flag``, the stat columns
        (all as text) and ``team_code`` / ``team_url``. A team that has no
        matching stats row gets a row without the stat columns. League blocks
        whose expected tables are missing contribute an empty dict.
    """
    # Keys for the stat cells, by column position in the right-hand table.
    # NOTE(review): assumes ESPN's column order stays W, L, PCT, GB, HOME, AWAY,
    # RS, RA, DIFF, STRK, L10, POFF — confirm if ESPN changes the layout.
    stat_keys = ("w", "l", "pct", "gb", "home", "away", "rs", "ra",
                 "diff", "strk", "l10", "poff")

    soup = BeautifulSoup(html, "lxml")
    out = {}
    league_blocks = soup.select("div.ResponsiveTable.ResponsiveTable--fixed-left.standings-subgroups")

    for league_block in league_blocks:
        league_title_el = league_block.select_one(".Table__Title")
        league = _clean_text(league_title_el.get_text()) if league_title_el else "Unknown League"
        out.setdefault(league, {})

        left_table = league_block.select_one("table.Table--fixed-left")
        right_scroller_tbody = league_block.select_one(".Table__Scroller table tbody")
        left_tbody = left_table.select_one("tbody") if left_table is not None else None
        if left_tbody is None or right_scroller_tbody is None:
            # If ESPN changes markup (including a left table without <tbody>),
            # skip the block instead of raising; keeps the scraper resilient.
            continue

        current_division = None

        # Collect stats rows, skipping division sub-header rows and empty rows.
        stat_rows = []
        for rtr in right_scroller_tbody.select("tr"):
            classes = " ".join(rtr.get("class", []))
            if "subgroup-headers" in classes:
                continue
            if rtr.find_all("td"):
                stat_rows.append(rtr)

        stat_idx = 0  # next unused stats row; advances once per team row

        for ltr in left_tbody.select("tr"):
            classes = " ".join(ltr.get("class", []))
            if "subgroup-headers" in classes:
                # Division header: remember it for the team rows that follow.
                span = ltr.select_one("span")
                current_division = _clean_text(span.get_text()) if span else _clean_text(ltr.get_text(" ", strip=True))
                out[league].setdefault(current_division, [])
                continue

            tds = ltr.find_all("td")
            if not tds:
                continue

            team_cell = tds[0]
            seed_span = team_cell.select_one(".team-link span:not(.TeamLink__Logo)")
            seed_flag_raw = _clean_text(seed_span.get_text()).replace("--", "").strip() if seed_span else ""
            # Translate known markers ('x' -> 'clinched a postseason berth');
            # unknown markers are kept as scraped.
            seed_flag = SEED_FLAG_MAP.get(seed_flag_raw.lower(), seed_flag_raw)

            name_a = team_cell.select_one(".team-link .hide-mobile a")
            team_name = _clean_text(name_a.get_text()) if name_a else ""
            team_href = name_a.get("href") if name_a else ""
            team_code = _team_code_from_href(team_href)

            # Abbreviation: logo image attributes first, then the mobile <abbr>.
            logo_img = team_cell.select_one(".TeamLink__Logo img")
            abbr = _abbr_from_logo_img(logo_img)
            if not abbr:
                abbr_a = team_cell.select_one(".team-link .show-mobile a abbr")
                if abbr_a and abbr_a.text:
                    abbr = abbr_a.text.strip().upper()

            division_rows = out[league].setdefault(current_division or "Unknown Division", [])

            if stat_idx >= len(stat_rows):
                # More team rows than stats rows: keep the team, without stats.
                division_rows.append({
                    "team": team_name, "abbr": abbr, "seed_flag": seed_flag,
                    "team_code": team_code, "team_url": team_href
                })
                continue

            s_tds = stat_rows[stat_idx].find_all("td")
            stat_idx += 1

            row = {"team": team_name, "abbr": abbr, "seed_flag": seed_flag}
            for i, key in enumerate(stat_keys):
                # Missing trailing cells become empty strings.
                row[key] = _clean_text(s_tds[i].get_text(" ", strip=True)) if i < len(s_tds) else ""
            row["team_code"] = team_code
            row["team_url"] = team_href
            division_rows.append(row)

    return out

# --- Run once
# Fetch and parse the standings page. Only write a file when something was
# parsed: an empty standings file would become the "latest" one picked up by
# latest_out_file("standings_rich") and make later cells process zero teams.
r = fetch("https://www.espn.com/mlb/standings", timeout=20)
html = r.text if r and r.ok else ""
data = parse_standings_rich(html) if html else {}
if data:
    save_json("standings_rich", data)
else:
    log("Standings fetch/parse returned no data; no standings_rich file written.")


[20251014T082417Z] Wrote out/standings_rich_20251014T082417Z.json


PosixPath('out/standings_rich_20251014T082417Z.json')

In [42]:
# Today's schedule (scores if live/final) + weather at first pitch
# Output: out/schedule_*.json
# Weather provider: OpenWeather (use OWM_API_KEY env var; placeholder left here intentionally)

def fetch_scoreboard_json_for_today() -> dict:
    """Fetch ESPN's MLB scoreboard JSON for today's date in Eastern time.

    Candidate URLs are tried in order; the first one that answers with an OK
    status and a JSON object wins. Non-OK answers (including 404) and bodies
    that are not valid JSON move on to the next candidate.

    Returns:
        The decoded scoreboard dict, or ``{"events": []}`` when no candidate
        produced usable JSON.
    """
    today_et = now_et().strftime("%Y%m%d")
    urls = [
        # Same "/apis/site/v2/" path the team-schedule and boxscore helpers in
        # this notebook use. NOTE(review): confirm which path ESPN serves; the
        # two original URLs are kept below as fallbacks.
        f"https://site.api.espn.com/apis/site/v2/sports/baseball/mlb/scoreboard?dates={today_et}&lang=en&region=us",
        f"https://site.api.espn.com/apis/v2/sports/baseball/mlb/scoreboard?dates={today_et}&lang=en&region=us",
        f"https://site.web.api.espn.com/apis/v2/sports/baseball/mlb/scoreboard?dates={today_et}",
    ]
    for url in urls:
        r = fetch(url, timeout=20)
        # A requests.Response is falsy unless .ok, so a separate
        # "r and r.status_code == 404" branch could never run; one check covers
        # None, 404 and every other failure.
        if r is None or not r.ok:
            continue
        try:
            payload = r.json()
        except ValueError:
            # OK status but not JSON (e.g. an HTML error page): try the next URL.
            continue
        if isinstance(payload, dict):
            return payload
    return {"events": []}

def _score_as_int(competitor):
    """Return a competitor's score as an int, or None if absent/unparseable."""
    if not competitor:
        return None
    raw = competitor.get("score")
    if isinstance(raw, dict):
        # Some payloads wrap the score as {"value": ...}.
        raw = raw.get("value")
    if raw is None:
        return None
    try:
        return int(raw)
    except (TypeError, ValueError, OverflowError):
        return None


def parse_games_from_scoreboard(sb_json: dict) -> list[dict]:
    """Flatten a scoreboard payload into one summary dict per game.

    Only the first competition of each event is used; events without
    competitions are skipped. Missing or null fields never raise: they show up
    as ``None`` (or ``""`` for ``status``).

    Args:
        sb_json: Scoreboard dict with an ``events`` list (may be missing/null).

    Returns:
        List of dicts with ``game_id``, ``start_utc`` (raw date string),
        ``status`` (lower-cased state, e.g. pre/in/post), ``home_abbr``,
        ``away_abbr``, ``home_score``, ``away_score`` and
        ``links.boxscore`` (ESPN game URL, or None without a game id).
    """
    out = []
    for ev in (sb_json or {}).get("events") or []:
        comps = ev.get("competitions") or []
        if not comps:
            continue
        comp = comps[0]
        gid = comp.get("id") or ev.get("id")

        status_type = (comp.get("status") or {}).get("type") or {}
        # "state" may be present but null; treat that like a missing state.
        status = (status_type.get("state") or "").lower()

        sides = comp.get("competitors") or []
        home = next((c for c in sides if c.get("homeAway") == "home"), None)
        away = next((c for c in sides if c.get("homeAway") == "away"), None)

        out.append({
            "game_id": gid,
            "start_utc": comp.get("date"),
            "status": status,
            "home_abbr": ((home or {}).get("team") or {}).get("abbreviation"),
            "away_abbr": ((away or {}).get("team") or {}).get("abbreviation"),
            # Live/final scores if present, else None.
            "home_score": _score_as_int(home),
            "away_score": _score_as_int(away),
            "links": {
                "boxscore": f"https://www.espn.com/mlb/game/_/gameId/{gid}" if gid else None
            }
        })
    return out

# Simple stadium location map (add more as needed). Real app: keep a proper stadium DB.
# Keys are home-team abbreviations as they appear in the scoreboard data;
# values are (latitude, longitude) tuples. Games whose home team is not listed
# get no weather lookup, so with the map empty every game's weather is None.
STADIUM_COORDS = {
    # "BAL": (39.2839, -76.6219),  # Oriole Park at Camden Yards
}

def fetch_openweather_at_game_start(lat: float, lon: float, start_iso_utc: str) -> dict | None:
    """Return the OpenWeather forecast slot closest to a game's start time.

    Uses the 5-day / 3-hour forecast endpoint in imperial units. Slots are
    matched to the start time on a UTC basis: each slot's ``dt`` (Unix seconds)
    is preferred, with ``dt_txt`` (parsed as UTC) as a fallback.

    Args:
        lat: Stadium latitude.
        lon: Stadium longitude.
        start_iso_utc: Game start as an ISO timestamp; a value without an
            offset is assumed to be UTC.

    Returns:
        ``{"note": ...}`` when no API key is configured; otherwise a dict with
        ``desc``, ``temp_f``, ``humidity``, ``wind_mph`` and ``slot_dt_txt``;
        or ``None`` when the start time cannot be parsed, the request fails,
        or the response holds no usable forecast slot.
    """
    if OWM_API_KEY == "INSERT_YOUR_OPENWEATHER_API_KEY":
        return {"note": "Set OWM_API_KEY env var to enable weather."}

    # Validate the target time first so a bad timestamp costs no HTTP request.
    target = iso_to_dt(start_iso_utc)
    if not target:
        return None
    if target.tzinfo is None:
        target = target.replace(tzinfo=timezone.utc)
    target_epoch = target.timestamp()

    r = fetch(f"https://api.openweathermap.org/data/2.5/forecast?lat={lat}&lon={lon}&appid={OWM_API_KEY}&units=imperial", timeout=20)
    if not r or not r.ok:
        return None
    try:
        js = r.json()
    except ValueError:
        return None
    slots = (js.get("list") or []) if isinstance(js, dict) else []

    best, best_delta = None, None
    for slot in slots:
        if not isinstance(slot, dict):
            continue
        slot_epoch = slot.get("dt")
        if isinstance(slot_epoch, bool) or not isinstance(slot_epoch, (int, float)):
            # No usable Unix time: fall back to the text form, read as UTC.
            try:
                slot_epoch = (
                    datetime.strptime(slot.get("dt_txt"), "%Y-%m-%d %H:%M:%S")
                    .replace(tzinfo=timezone.utc)
                    .timestamp()
                )
            except (TypeError, ValueError):
                continue
        delta = abs(target_epoch - slot_epoch)
        if best is None or delta < best_delta:
            best, best_delta = slot, delta

    if best is None:
        return None

    w = (best.get("weather") or [{}])[0]
    main = best.get("main") or {}
    wind = best.get("wind") or {}
    return {
        "desc": w.get("description"),
        "temp_f": main.get("temp"),
        "humidity": main.get("humidity"),
        "wind_mph": wind.get("speed"),
        "slot_dt_txt": best.get("dt_txt")
    }

def run_today_schedule_and_weather():
    """Build today's schedule with per-game weather and save it as JSON.

    Writes ``schedule_*.json`` containing a timestamp, the parsed games, and a
    ``weather_at_start`` mapping of game id to forecast. The forecast is
    ``None`` for games whose home team has no entry in STADIUM_COORDS or that
    lack a start time. With no games today the file simply holds empty
    collections.
    """
    games = parse_games_from_scoreboard(fetch_scoreboard_json_for_today())

    weather_by_game = {}
    result = {"timestamp": utc_stamp(), "games": games, "weather_at_start": weather_by_game}

    for game in games:
        coords = STADIUM_COORDS.get(game["home_abbr"])
        has_start = game.get("start_utc")
        weather_by_game[game["game_id"]] = (
            fetch_openweather_at_game_start(coords[0], coords[1], game["start_utc"])
            if coords and has_start
            else None
        )

    save_json("schedule", result)

# --- Run once
# Fetches today's scoreboard and writes out/schedule_*.json.
run_today_schedule_and_weather()


[20251014T095013Z] Wrote out/last10_boxscores_20251014T095013Z.json
✅ Last 10 with box scores saved for 30 teams -> out/last10_boxscores_20251014T095013Z.json


In [None]:
# Last 10 completed games per team (post + regular) with R/H/E and MLB highlight URL (strict)
# Output: out/last10_boxscores_*.json

# Reuse standings for team list: load the most recent standings_rich file and
# flatten its divisions into one list of teams per league.
standings_path = latest_out_file("standings_rich")
if not standings_path:
    raise RuntimeError("Run the standings cell first.")
STANDINGS = json.loads(Path(standings_path).read_text(encoding="utf-8"))

# {league: [{"team": name, "abbr": UPPER, "code": lower}, ...]}
teams_by_league = {
    league_name: [
        {
            "team": entry.get("team", ""),
            "abbr": (entry.get("abbr") or "").upper(),
            "code": (entry.get("team_code") or "").lower(),
        }
        for division_teams in divisions.values()
        for entry in division_teams
    ]
    for league_name, divisions in STANDINGS.items()
}

def fetch_team_schedule_json(team_code: str, season_year: int, seasontype: int) -> dict | None:
    """Fetch a team's schedule JSON from ESPN, trying two API hosts in order.

    Args:
        team_code: Team code used in the ESPN URL path.
        season_year: Season year for the ``season`` query parameter.
        seasontype: Value for the ``seasontype`` query parameter (callers in
            this notebook pass 2 for regular season and 3 for postseason).

    Returns:
        Decoded JSON from the first host that answers OK, else ``None``.
    """
    query = f"season={season_year}&seasontype={seasontype}"
    hosts = ("site.api.espn.com", "site.web.api.espn.com")
    for host in hosts:
        resp = fetch(
            f"https://{host}/apis/site/v2/sports/baseball/mlb/teams/{team_code}/schedule?{query}",
            timeout=20,
        )
        if resp and resp.ok:
            return resp.json()
    return None

def coerce_score(val):
    """Convert a score in any of several shapes to an int.

    Accepts ints, floats (truncated toward zero), numeric strings (including
    decimal forms such as ``"3.0"``), and dicts carrying the score under
    ``"value"``.

    Returns:
        The score as an int, or ``None`` for ``None``, non-numeric text,
        NaN/infinite values, and unsupported types.
    """
    if val is None:
        return None
    if isinstance(val, dict):
        return coerce_score(val.get("value"))
    if isinstance(val, str):
        text = val.strip()
        try:
            return int(text)
        except ValueError:
            pass
        try:
            val = float(text)  # e.g. "3.0"; handled by the numeric branch below
        except ValueError:
            return None
    if isinstance(val, (int, float)):
        try:
            return int(val)
        except (ValueError, OverflowError):
            # int(nan) raises ValueError, int(inf) raises OverflowError.
            return None
    return None

def completed_games_from(js: dict) -> list[dict]:
    """Extract completed games from a schedule payload, oldest first.

    Events are read from ``events`` (or ``items``). An event with no
    ``competitions`` but with its own ``status`` and ``competitors`` is treated
    as a single competition. A competition is kept only when its status type is
    marked completed, its date parses, and both a home and an away side exist.

    Returns:
        List of dicts with ``when`` (parsed datetime), ``id``, ``home`` /
        ``away`` (each ``{"abbr": ...}``) and ``start_utc`` (raw date string),
        sorted by ``when``.
    """
    if not js:
        return []

    def side_of(competitors, which):
        # First competitor flagged as the requested side, or None.
        return next(filter(lambda c: c.get("homeAway") == which, competitors), None)

    finished = []
    for event in js.get("events") or js.get("items") or []:
        competitions = event.get("competitions") or []
        if not competitions and "status" in event and "competitors" in event:
            competitions = [event]
        for comp in competitions:
            status_type = (comp.get("status") or {}).get("type") or {}
            if not status_type.get("completed"):
                continue
            raw_date = comp.get("date") or event.get("date")
            when = iso_to_dt(raw_date)
            if not when:
                continue
            competitors = comp.get("competitors") or []
            home_side = side_of(competitors, "home")
            away_side = side_of(competitors, "away")
            if not home_side or not away_side:
                continue
            finished.append({
                "when": when,
                "id": comp.get("id") or event.get("id"),
                "home": {"abbr": (home_side.get("team") or {}).get("abbreviation")},
                "away": {"abbr": (away_side.get("team") or {}).get("abbreviation")},
                "start_utc": raw_date,
            })
    return sorted(finished, key=lambda game: game["when"])

# --- Box score totals (R/H/E), using ESPN JSON + HTML fallback
def fetch_boxscore_json(game_id: str):
    """Fetch ESPN's boxscore JSON for a game, trying two API hosts in order.

    Returns:
        Decoded JSON from the first host that answers OK, else ``None``.
    """
    hosts = ("site.api.espn.com", "site.web.api.espn.com")
    for host in hosts:
        resp = fetch(
            f"https://{host}/apis/site/v2/sports/baseball/mlb/boxscore?event={game_id}",
            timeout=15,
        )
        if resp and resp.ok:
            return resp.json()
    return None

def fetch_summary_json(game_id: str):
    """Fetch ESPN's game-summary JSON, trying several path/host combinations.

    The ``/apis/site/v2/`` path (the one the schedule and boxscore helpers in
    this notebook use) is tried first on both hosts, then the original
    ``/apis/v2/`` path. NOTE(review): confirm which path ESPN actually serves
    for summaries; the fallback keeps the previous behavior reachable.

    Returns:
        Decoded JSON from the first URL that answers OK with a JSON body, else
        ``None``.
    """
    api_paths = ("apis/site/v2", "apis/v2")
    hosts = ("site.api.espn.com", "site.web.api.espn.com")
    for api_path in api_paths:
        for host in hosts:
            url = f"https://{host}/{api_path}/sports/baseball/mlb/summary?event={game_id}"
            r = fetch(url, timeout=12)
            if not (r and r.ok):
                continue
            try:
                return r.json()
            except ValueError:
                # OK status with a non-JSON body: try the next candidate.
                continue
    return None

def fetch_boxscore_html(game_id: str) -> str | None:
    """Download the ESPN game page HTML for ``game_id``.

    Returns:
        The page text when the request answers OK, else ``None``.
    """
    resp = fetch(f"https://www.espn.com/mlb/game/_/gameId/{game_id}", timeout=15)
    if resp and resp.ok:
        return resp.text
    return None

def _team_nodes_for_totals(js):
    bx = js.get("boxscore")
    if isinstance(bx, dict) and isinstance(bx.get("teams"), list):
        return bx["teams"]
    if isinstance(js.get("teams"), list):
        return js["teams"]
    hdr = js.get("header") or {}
    comps = (hdr.get("competitions") or [{}])[0].get("competitors") or []
    nodes = []
    for c in comps:
        if c.get("statistics"):
            nodes.append({"team": c.get("team"), "statistics": c.get("statistics")})
    return nodes

def _pick_stat(stats, keys):
    for st in stats or []:
        name = (st.get("name") or st.get("displayName") or "").lower()
        abbr = (st.get("abbreviation") or "").lower()
        if name in keys or abbr in keys:
            v = st.get("displayValue") if "displayValue" in st else st.get("value")
            try: return int(str(v).strip())
            except: return None
    return None

def _totals_from_json_nodes(nodes, home_abbr, away_abbr):
    out = {"home": {"R": None, "H": None, "E": None}, "away": {"R": None, "H": None, "E": None}}
    for node in nodes or []:
        t = node.get("team") or {}
        ab = (t.get("abbreviation") or t.get("displayAbbreviation") or "").upper()
        stats = node.get("statistics") or []
        triple = {
            "R": _pick_stat(stats, {"r","runs"}),
            "H": _pick_stat(stats, {"h","hits"}),
            "E": _pick_stat(stats, {"e","errors","err"}),
        }
        if ab and ab == (home_abbr or "").upper(): out["home"] = triple
        if ab and ab == (away_abbr or "").upper(): out["away"] = triple
    return out

def _totals_from_html(game_id: str):
    """Scrape R/H/E totals from the ESPN game page (HTML fallback).

    Uses the first ``table.Table`` that has both a head and a body, whose head
    cells include R, H and E, and whose body has at least two rows. The first
    body row is read as the away team and the second as the home team; the
    R/H/E cells are located by the position of those labels in the head.

    Returns:
        ``{"home": {"R", "H", "E"}, "away": {...}}`` with ints (or ``None`` for
        a missing/non-numeric cell), or ``None`` when the page cannot be
        fetched or no suitable table exists.
    """
    page = fetch_boxscore_html(game_id)
    if not page:
        return None

    stat_labels = ("R", "H", "E")

    def cell_int(row, col):
        # First (optionally negative) integer in the row's col-th <td>, if any.
        cells = row.find_all("td")
        if col >= len(cells):
            return None
        found = re.search(r"-?\d+", cells[col].get_text(strip=True))
        return int(found.group(0)) if found else None

    document = BeautifulSoup(page, "lxml")
    for table in document.select("table.Table"):
        head = table.find("thead")
        body = table.find("tbody")
        if not head or not body:
            continue
        labels = [cell.get_text(strip=True).upper() for cell in head.find_all(["th", "td"])]
        if any(label not in labels for label in stat_labels):
            continue
        column_of = {label: labels.index(label) for label in stat_labels}
        body_rows = body.find_all("tr")
        if len(body_rows) < 2:
            continue
        away_row, home_row = body_rows[0], body_rows[1]
        return {
            "home": {label: cell_int(home_row, column_of[label]) for label in stat_labels},
            "away": {label: cell_int(away_row, column_of[label]) for label in stat_labels},
        }
    return None

def get_totals_for_game(game_id: str, home_abbr: str, away_abbr: str):
    """Return R/H/E totals for both sides of a game.

    JSON sources are consulted first (boxscore, then summary). If any of the
    six values is still missing, the HTML game page is scraped and used to
    fill only the gaps.

    Returns:
        ``{"home": {"abbr", "R", "H", "E"}, "away": {"abbr", "R", "H", "E"}}``.
    """
    payload = fetch_boxscore_json(game_id) or fetch_summary_json(game_id) or {}
    totals = _totals_from_json_nodes(_team_nodes_for_totals(payload), home_abbr, away_abbr)

    sides = ("home", "away")
    stat_labels = ("R", "H", "E")
    values = [totals[side][label] for side in sides for label in stat_labels]
    if None in values:
        scraped = _totals_from_html(game_id)
        if scraped:
            for side in sides:
                for label in stat_labels:
                    # Never overwrite a value the JSON already supplied.
                    if totals[side][label] is None and scraped[side][label] is not None:
                        totals[side][label] = scraped[side][label]

    return {
        "home": {"abbr": home_abbr, **totals["home"]},
        "away": {"abbr": away_abbr, **totals["away"]},
    }

# --- Strict MLB highlights (channel filter)
def resolve_mlb_highlights_url(home_abbr: str, away_abbr: str, start_iso_utc: str | None) -> str | None:
    """
    We build a precise search string and filter to YouTube's MLB channel only.
    This returns a channel search URL with filters applied (safer for TOS).
    """
    # e.g. "MLB Highlights Dodgers vs Brewers 2025-10-13"
    date_part = ""
    dt = iso_to_dt(start_iso_utc)
    if dt:
        date_part = dt.astimezone(ET).strftime("%Y-%m-%d")
    q = f"MLB Highlights {away_abbr} vs {home_abbr} {date_part}".strip()
    # Channel=MLB filter baked into query; user clicks to open exact filtered search
    return f"https://www.youtube.com/results?search_query={requests.utils.quote(q)}&sp=EgIQA1AB"  # Uploaded today/short-ish; adjust as needed

def summarize_last10_for_team(all_completed: list[dict], team_abbr: str):
    """Summarize a team's ten most recent completed games, oldest first.

    Walks ``all_completed`` from newest to oldest, keeps games in which
    ``team_abbr`` is the home or away side (compared case-insensitively),
    looks up R/H/E totals and a highlights search URL for each, and stops
    after ten games.

    Args:
        all_completed: Games sorted oldest-to-newest, each with ``id``,
            ``home``/``away`` (``{"abbr": ...}``), ``start_utc`` and,
            optionally, an already-parsed ``when`` datetime.
        team_abbr: Abbreviation of the team of interest.

    Returns:
        Up to ten dicts with ``game_id``, ``date_et`` (``YYYY-MM-DD`` in
        Eastern time, or ``None`` if the start time is unknown), ``home``,
        ``away`` and ``mlb_highlights_search``, in chronological order.
    """
    wanted = (team_abbr or "").upper()
    out = []
    for g in reversed(all_completed):  # newest first
        home_abbr, away_abbr = (g["home"]["abbr"] or ""), (g["away"]["abbr"] or "")
        if wanted not in (home_abbr.upper(), away_abbr.upper()):
            continue
        gid = g["id"]
        totals = get_totals_for_game(gid, home_abbr, away_abbr)
        ytv = resolve_mlb_highlights_url(home_abbr, away_abbr, g.get("start_utc"))

        # Prefer the datetime parsed upstream; re-parse only if it is absent.
        when = g.get("when")
        if not isinstance(when, datetime):
            when = iso_to_dt(g.get("start_utc"))
        date_et = when.astimezone(ET).strftime("%Y-%m-%d") if when else None

        out.append({
            "game_id": gid,
            "date_et": date_et,
            "home": totals["home"],
            "away": totals["away"],
            "mlb_highlights_search": ytv
        })
        if len(out) == 10:
            break
    return list(reversed(out))

def build_last10_all_teams_with_boxscores(pause_sec: float = 0.5, season_year: int | None = None):
    """Compile last-10 box scores for every team and save them as JSON.

    For each team in ``teams_by_league`` that has a team code, fetches the
    postseason and regular-season schedules, merges their completed games in
    chronological order, and summarizes the ten most recent. The result is
    written to ``last10_boxscores_*.json`` and a summary line is logged.

    Args:
        pause_sec: Seconds to sleep after each team, to go easy on ESPN.
        season_year: Season to query. Defaults to the current year in Eastern
            time; pass an explicit year when running before a new season has
            any completed games.
    """
    if season_year is None:
        season_year = now_et().year
    leagues_out = {}
    total = 0

    for league, teams in teams_by_league.items():
        leagues_out.setdefault(league, {})
        for t in teams:
            code, abbr, name = t["code"], t["abbr"], t["team"]
            if not code:
                # Without a team code there is no schedule URL to query.
                continue
            js_post = fetch_team_schedule_json(code, season_year, 3)  # postseason
            js_reg  = fetch_team_schedule_json(code, season_year, 2)  # regular
            # Each list is sorted on its own; re-sort the concatenation so the
            # two season types interleave correctly.
            all_completed = (completed_games_from(js_post) + completed_games_from(js_reg))
            all_completed.sort(key=lambda g: g["when"])

            last10 = summarize_last10_for_team(all_completed, abbr)
            leagues_out[league][name] = {"abbr": abbr, "team_code": code, "last10": last10}
            total += 1
            time.sleep(pause_sec)

    payload = {"timestamp": utc_stamp(), "leagues": leagues_out}
    save_json("last10_boxscores", payload)
    log(f"Last-10 box scores compiled for {total} teams.")

# --- Run once
# Makes several HTTP requests per team (schedules plus per-game totals) and
# sleeps between teams, then writes out/last10_boxscores_*.json.
build_last10_all_teams_with_boxscores()
