# Sharks Moneyline Odds
This notebook analyzes how often the San Jose Sharks have been favored over the past few seasons, using data from Moneyline.com: https://moneyline.com/nhl/teams/san-jose-sharks/2026/schedule/past-games

In [1]:
!pip install pandas requests lxml

You should consider upgrading via the '/Users/Stephan/Desktop/Python/sharks-odds/.venv/bin/python3 -m pip install --upgrade pip' command.[0m[33m
[0m

In [2]:
import pandas as pd
import requests

### Scrape Moneyline Table

In [13]:
import re
import time
from typing import Iterable, Optional

import pandas as pd
import requests


def _pick_table_with_moneyline(tables: list[pd.DataFrame]) -> pd.DataFrame:
    """
    From all tables on the page, pick the one that contains a Moneyline column.
    """
    for t in tables:
        cols = [str(c).strip().lower() for c in t.columns]
        if any("moneyline" in c for c in cols):
            return t
    raise ValueError("No table found with a 'moneyline' column.")


def _parse_moneyline(value: object) -> Optional[int]:
    """
    Parse values like 'w-120', 'l+250' -> -120, +250
    """
    if value is None or (isinstance(value, float) and pd.isna(value)):
        return None
    m = re.search(r"([+-]\d+)", str(value))
    return int(m.group(1)) if m else None


def scrape_sharks_past_games(season_year_url: int) -> pd.DataFrame:
    """
    Scrape San Jose Sharks past games from Moneyline.
    season_year_url = year used in URL (e.g. 2026).

    Returns the page's moneyline table with stripped, lower-cased column
    names plus four added columns:
      - moneyline_num:   signed odds parsed from the 'moneyline' text
      - sharks_favored:  True when moneyline_num is negative, False when it
                         is not, pd.NA when no odds could be parsed
      - season_year_url: the year passed in
      - source_url:      the URL that was fetched

    Raises:
        requests.HTTPError: if the response status is an error
            (via raise_for_status).
        ValueError: if no table / column containing 'moneyline' is found.
    """
    # Function-scope import so this block does not depend on a file-level
    # `io` import.
    from io import StringIO

    url = f"https://moneyline.com/nhl/teams/san-jose-sharks/{season_year_url}/schedule/past-games"

    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; sharks-odds-research/1.0)",
        "Accept-Language": "en-US,en;q=0.9",
    }

    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()

    # pandas deprecates passing literal HTML text to read_html; a file-like
    # wrapper is the supported way to hand it an in-memory document.
    tables = pd.read_html(StringIO(resp.text))
    df = _pick_table_with_moneyline(tables).copy()

    # Normalize column names
    df.columns = [str(c).strip().lower() for c in df.columns]

    # Ensure column is named exactly 'moneyline'
    if "moneyline" not in df.columns:
        ml_cols = [c for c in df.columns if "moneyline" in c]
        if not ml_cols:
            raise ValueError("Moneyline column not found after normalization.")
        df.rename(columns={ml_cols[0]: "moneyline"}, inplace=True)

    df["moneyline_num"] = df["moneyline"].apply(_parse_moneyline)

    # Favorite = negative moneyline.
    # A missing value may arrive here as None or as NaN (pandas can coerce the
    # column to float), so test with pd.isna, which covers both; an
    # `is not None` check would label NaN rows as "not favored".
    df["sharks_favored"] = df["moneyline_num"].apply(
        lambda x: pd.NA if pd.isna(x) else bool(x < 0)
    )

    df["season_year_url"] = season_year_url
    df["source_url"] = url

    return df

def scrape_many(years: Iterable[int], sleep_seconds: float = 1.5) -> pd.DataFrame:
    """
    Scrape several seasons and stack the results into one DataFrame.

    Each year is attempted in turn. A year that raises is reported on stdout
    and skipped, so one bad season does not abort the run. The function
    pauses `sleep_seconds` after every attempt, successful or not.

    Returns the row-wise concatenation of all scraped seasons with a fresh
    index, or an empty DataFrame when nothing was scraped.
    """
    collected = []
    for season_year in years:
        try:
            games = scrape_sharks_past_games(season_year)
            collected.append(games)
            print(f"✅ {season_year}: scraped {len(games)} games")
        except Exception as err:
            print(f"❌ {season_year}: {err}")
        time.sleep(sleep_seconds)

    if not collected:
        return pd.DataFrame()
    return pd.concat(collected, ignore_index=True)

# ---- RUN IT HERE ----
# Scrape every listed season and stack the results into `df`, which the
# following cells read and extend.

years = [2026, 2025, 2024]   # URL years to scrape (2026 = the 2025-26 season); adjust as needed
df = scrape_many(years)

# Preview the first rows of the combined table
df.head()


  tables = pd.read_html(resp.text)


✅ 2026: scraped 50 games


  tables = pd.read_html(resp.text)


✅ 2025: scraped 82 games


  tables = pd.read_html(resp.text)


✅ 2024: scraped 82 games


Unnamed: 0,date,opponent,score,moneyline,spread,total (o/u),moneyline_num,sharks_favored,season_year_url,source_url
0,Today,vsNYR,W 3 - 1,w-120,w-1.5,u6.5,-120,True,2026,https://moneyline.com/nhl/teams/san-jose-shark...
1,"Wed, Jan 21",@TB,L 1 - 4,l+250,l+1.5,u6.5,250,False,2026,https://moneyline.com/nhl/teams/san-jose-shark...
2,"Mon, Jan 19",@FLA,W 4 - 1,w+157,w+1.5,u6.5,157,False,2026,https://moneyline.com/nhl/teams/san-jose-shark...
3,"Sat, Jan 17",@DET,L 2 - 4,l+141,l+1.5,u6.5,141,False,2026,https://moneyline.com/nhl/teams/san-jose-shark...
4,"Fri, Jan 16",@WAS,W 3 - 2,w+148,w+1.5,u6.5,148,False,2026,https://moneyline.com/nhl/teams/san-jose-shark...


In [19]:
import numpy as np
import pandas as pd
import re

# ---------- 1) Parse date into a real datetime ----------

def season_start_year(season_year_url: int) -> int:
    """
    Moneyline URL year 2026 corresponds to season 2025-26.
    So start year = 2025 for 2026.
    """
    return season_year_url - 1

def parse_moneyline_date(date_str: str, season_year_url: int) -> pd.Timestamp:
    """
    Convert Moneyline date strings (e.g., 'Wed, Jan 21', sometimes with an apostrophe year)
    into a proper pandas Timestamp with an inferred year based on the season.

    Strategy:
    - Strip weekday prefix (e.g., 'Wed, ')
    - Try to find an explicit year (e.g., "Jan 2 '25" or "Jan 2, 2025")
    - If no year is present, infer year:
        Months Jul-Dec -> start year (season_year_url - 1)
        Months Jan-Jun -> end year (season_year_url)
    """
    if pd.isna(date_str):
        return pd.NaT

    s = str(date_str).strip()

    # Remove weekday prefix like "Wed," or "Wed"
    s = re.sub(r"^[A-Za-z]{3},?\s+", "", s)

    # Normalize apostrophe year like "'25" -> "2025"
    # Examples: "Dec 31 '25" or "Dec 31, 25" (we handle both)
    m_apos = re.search(r"'\s*(\d{2})\b", s)
    if m_apos:
        yy = int(m_apos.group(1))
        yyyy = 2000 + yy
        s = re.sub(r"'\s*\d{2}\b", str(yyyy), s)

    # If there's a trailing ", 25" (without apostrophe), convert to 2025
    m_2digit = re.search(r",\s*(\d{2})\b", s)
    if m_2digit and not re.search(r"\d{4}", s):
        yy = int(m_2digit.group(1))
        yyyy = 2000 + yy
        s = re.sub(r",\s*\d{2}\b", f", {yyyy}", s)

    # If we now have a 4-digit year, parse directly
    if re.search(r"\b\d{4}\b", s):
        dt = pd.to_datetime(s, errors="coerce")
        return dt

    # Otherwise infer year from month + season boundary
    # Parse without year first
    dt_no_year = pd.to_datetime(s, format="%b %d", errors="coerce")
    if pd.isna(dt_no_year):
        # some pages might use full month name like "January 2"
        dt_no_year = pd.to_datetime(s, format="%B %d", errors="coerce")
    if pd.isna(dt_no_year):
        return pd.NaT

    month = int(dt_no_year.month)
    start = season_start_year(season_year_url)
    year = start if month >= 7 else season_year_url

    return pd.Timestamp(year=year, month=month, day=int(dt_no_year.day))

# Locate the column holding the raw date text. The exact name 'date' is
# preferred; otherwise the first column whose name contains "date" is used.
date_col = "date"
if date_col not in df.columns:
    # attempt to find something date-like (case-insensitive substring match)
    candidates = [c for c in df.columns if "date" in c.lower()]
    if not candidates:
        raise ValueError(f"Couldn't find a date column. Columns are: {list(df.columns)}")
    date_col = candidates[0]

# Parse row by row: each row's own season_year_url is passed along with its
# date text, since the parser needs it to infer a missing year.
df["game_date"] = df.apply(lambda r: parse_moneyline_date(r[date_col], int(r["season_year_url"])), axis=1)

# ---------- 2) Implied win probability from moneyline_num ----------

def implied_prob_from_moneyline(ml: float) -> float:
    """
    Convert American odds to the implied win probability (vig left in).

      - Favorite (ml < 0):  p = |ml| / (|ml| + 100)
      - Underdog (ml > 0):  p = 100 / (ml + 100)

    Missing (None / NaN) or zero odds give NaN.
    """
    is_missing = ml is None or pd.isna(ml)
    if is_missing or ml == 0:
        return np.nan

    odds = float(ml)
    if odds >= 0:
        return 100.0 / (odds + 100.0)

    stake = -odds
    return stake / (stake + 100.0)

# Implied win probability for every game, derived from the numeric moneyline
df["implied_win_prob"] = df["moneyline_num"].apply(implied_prob_from_moneyline)

# Optional: nice season label for external tools (URL year 2026 -> "2025-26")
df["season"] = df["season_year_url"].apply(lambda y: f"{int(y)-1}-{str(int(y))[-2:]}")

# Sort oldest season first, then by game date, so the per-season and global
# game counters assigned below follow playing order.
# NOTE(review): rows whose date failed to parse (NaT) presumably sort last
# within their season under pandas' default NaN placement — confirm.
df = df.sort_values(
    by=["season_year_url", "game_date"],
    ascending=[True, True]
).reset_index(drop=True)


# Create game no. index per season (1-based position within each season)
df["game_no"] = (
    df.groupby("season_year_url")
      .cumcount()
      .add(1)
)

# create global game index for one continuous x-axis across seasons
df["game_no_global"] = range(1, len(df) + 1)


# ---------- 3) Output to CSV ----------

# Put the clean, analysis-friendly columns first (keep the rest too)
preferred_first = ["season", "season_year_url",  "game_no", "game_no_global", "game_date", date_col, "moneyline", "moneyline_num", "implied_win_prob", "sharks_favored", "source_url"]
# Only preferred names that are actually present, in the preferred order
existing_first = [c for c in preferred_first if c in df.columns]
# Everything else keeps its current relative order
remaining = [c for c in df.columns if c not in existing_first]
df_out = df[existing_first + remaining].copy()

# Written relative to the current working directory, without the index column
out_path = "sharks_moneyline_clean.csv"
df_out.to_csv(out_path, index=False)
print(f"Saved CSV -> {out_path}")

# quick sanity checks: count rows where date parsing or the probability
# calculation produced a missing value
print("Date parse nulls:", df_out["game_date"].isna().sum())
print("Implied prob nulls:", df_out["implied_win_prob"].isna().sum())
df_out.head()

Saved CSV -> sharks_moneyline_clean.csv
Date parse nulls: 1
Implied prob nulls: 0


Unnamed: 0,season,season_year_url,game_no,game_no_global,game_date,date,moneyline,moneyline_num,implied_win_prob,sharks_favored,source_url,opponent,score,spread,total (o/u)
0,2023-24,2024,1,1,2023-10-13,"Fri, Oct 13, 23'",l+200,200,0.333333,False,https://moneyline.com/nhl/teams/san-jose-shark...,vsVEG,L 1 - 4,l+1.5,u6.5
1,2023-24,2024,2,2,2023-10-15,"Sun, Oct 15, 23'",l+236,236,0.297619,False,https://moneyline.com/nhl/teams/san-jose-shark...,vsCOL,L 1 - 2 SO,w+1.5,u6.5
2,2023-24,2024,3,3,2023-10-18,"Wed, Oct 18, 23'",l+251,251,0.2849,False,https://moneyline.com/nhl/teams/san-jose-shark...,vsCAR,L 3 - 6,l+1.5,o6
3,2023-24,2024,4,4,2023-10-20,"Fri, Oct 20, 23'",l+220,220,0.3125,False,https://moneyline.com/nhl/teams/san-jose-shark...,vsBOS,L 1 - 3,l+1.5,u6
4,2023-24,2024,5,5,2023-10-22,"Sun, Oct 22, 23'",l+200,200,0.333333,False,https://moneyline.com/nhl/teams/san-jose-shark...,@NAS,L 1 - 5,l+1.5,P6


### Implied Probability Bar Chart

In [71]:
import altair as alt
import matplotlib.pyplot as plt

# Keep only rows that can be drawn (a global game number and a probability),
# and force the favored flag to plain booleans for the legend mapping below.
plot_df = df_out.dropna(subset=["game_no_global", "implied_win_prob"]).copy()
plot_df["sharks_favored"] = plot_df["sharks_favored"].astype(bool)

# Legend text for the bar colors
plot_df["bar_legend"] = plot_df["sharks_favored"].map(
    {True: "Favored", False: "Not Favored"}
)

plot_df["line_legend"] = "10-game rolling avg"

# Compute subtitle dynamically (so it stays correct)
fav_ct = int(plot_df["sharks_favored"].sum())
total_ct = int(plot_df["sharks_favored"].shape[0])
season_start = plot_df["game_date"].min().date()
season_end = plot_df["game_date"].max().date()
# Earliest season label in the plotted data. Labels look like "2023-24", so
# the smallest string is the earliest season. Using it keeps the subtitle
# right when the list of scraped years changes.
first_season = plot_df["season"].min()

title = "San Jose Sharks: Rising from the Depths"
subtitle = f"The Sharks have been favored in only {fav_ct} games ({fav_ct/total_ct:.1%}) since the start of the {first_season} season."
footer = "Source: Moneyline.com | Graphic: @steodosescu"

# Title and subtitle share one centered title block
combined_title = alt.TitleParams(
    text=title,
    subtitle=subtitle,
    anchor="middle",
    color="black",
    fontSize=20,
    subtitleColor="gray",
    subtitleFontSize=14,
)

# create 50% prob threshold line
threshold = pd.DataFrame([{"threshold": .50}])

# create season dividers
# One divider sits half a game past the last game of every season except the
# most recent one. Deriving the positions from the per-season maximum of
# game_no_global keeps them right when the scraped seasons change.
season_last_game = (
    df_out.groupby("season_year_url")["game_no_global"].max().sort_values()
)
season_dividers = pd.DataFrame({
    "x": (season_last_game.iloc[:-1] + 0.5).tolist()
})

# create season rules (dashed vertical lines at the divider positions)
season_rules = (
    alt.Chart(season_dividers)
    .mark_rule(
        color="gray",
        strokeDash=[4, 4],
        opacity=0.6
    )
    .encode(
        x=alt.X("x:Q")
    )
)

# Lift Altair's row limit on the embedded dataset
alt.data_transformers.disable_max_rows()

# Bar layer: one bar per game, height = implied win probability,
# color = whether the Sharks were favored.
bar_x = alt.X(
    "game_no_global:Q",
    title="Game No.",
    scale=alt.Scale(domain=[0, 220]),
    axis=alt.Axis(labelFlush=False, tickMinStep=20, grid=False),
)
bar_y = alt.Y(
    "implied_win_prob:Q",
    title="Market Implied Win Probability",
    axis=alt.Axis(format=".0%", grid=False),
    scale=alt.Scale(domain=[0, 1]),
)
bar_color = alt.Color(
    "bar_legend:N",
    scale=alt.Scale(
        domain=["Favored", "Not Favored"],
        range=["#006D75", "#B0B0B0"],
    ),
    legend=alt.Legend(title=""),
)
bar_tooltip = [
    alt.Tooltip("season:N", title="Season"),
    alt.Tooltip("game_no:Q", title="Game No. (season)"),
    alt.Tooltip("game_date:T", title="Date"),
    alt.Tooltip("moneyline_num:Q", title="Moneyline"),
    alt.Tooltip("implied_win_prob:Q", title="Implied Prob", format=".1%"),
]

bar = (
    alt.Chart(plot_df)
    .mark_bar(size=4)
    .encode(x=bar_x, y=bar_y, color=bar_color, tooltip=bar_tooltip)
    .properties(title=combined_title, width=900, height=320)
)


# Line layer: rolling mean of the implied probability over the current game
# and the nine before it (frame [-9, 0]), drawn on the same axes as the bars.
line_x = alt.X(
    "game_no_global:Q",
    scale=alt.Scale(domain=[0, 220]),
    axis=alt.Axis(title="", grid=False),
)
line_y = alt.Y(
    "rolling_mean:Q",
    axis=alt.Axis(format=".0%", grid=False),
    scale=alt.Scale(domain=[0, 1]),
)
# The stroke channel carries a constant label so the line gets a legend entry.
line_stroke = alt.Stroke(
    "line_label:N",
    scale=alt.Scale(
        domain=["10-game rolling avg"],
        range=["#EA7200"],
    ),
    legend=alt.Legend(title=""),
)

line = (
    alt.Chart(plot_df)
    .transform_window(
        rolling_mean="mean(implied_win_prob)",
        frame=[-9, 0],
    )
    # The label is computed inside the chart so the field exists in Vega
    .transform_calculate(line_label='"10-game rolling avg"')
    .mark_line(strokeWidth=2)
    .encode(x=line_x, y=line_y, stroke=line_stroke)
)

# Footer via text mark: a one-row chart whose only job is to print the
# source/credit line in its own 25px-high strip
footer_text = (
    alt.Chart(pd.DataFrame({"text": [footer]}))
    .mark_text(align="left", baseline="top", dy=5, fontSize=12, color="gray")
    .encode(text="text:N")
    .properties(width=900, height=25)
)

# Add 50% win prob threshold line (horizontal rule at y = threshold)
rule = alt.Chart(threshold).mark_rule().encode(
    y='threshold:Q'
)

# Create logo layer (team logo image loaded from a remote URL)
logo_url = "https://a.espncdn.com/combiner/i?img=/i/teamlogos/nhl/500/sj.png"

# Single dummy row so exactly one image mark is drawn
logo = (
    alt.Chart(pd.DataFrame({"dummy": [1]}))
    .mark_image(
        url=logo_url,
        width=60,
        height=60
    )
    .encode(
        # Pixel-based positioning
        x=alt.value(20),     # from left edge
        y=alt.value(-25)     # NEGATIVE moves it ABOVE the plot area
    )
)



## Combine bar and line charts
# Layer order: bars, rolling-average line, 50% rule, season dividers, logo
combined_chart = (bar + line + rule + season_rules + logo).properties(width=900)

# IMPORTANT: configure BEFORE display
# The footer strip is stacked under the layered chart, then the shared
# background / view / axis styling is applied to the whole figure.
chart = alt.vconcat(combined_chart, footer_text, spacing=5).configure(
    background="floralwhite",
    view=alt.ViewConfig(fill="floralwhite",
                        stroke=None), # <-- removes faint border around plot area
    axis=alt.AxisConfig(domainColor="black")
)

# Save high-resolution image. scale_factor=3 increases image size and resolution
chart.save(
    "sharks_implied_prob.png",
    scale_factor=3
)

# Last expression in the cell: displays the chart in the notebook
chart


### Summarize

In [None]:
# Per-season tally over games with a known favored flag: number of games,
# number in which the Sharks were favored, and mean implied win probability.
summary = (
    df_out.loc[df_out["sharks_favored"].notna()]
    .groupby("season", as_index=False)
    .agg(
        games=("sharks_favored", "count"),
        favored_games=("sharks_favored", "sum"),
        avg_win_prob=("implied_win_prob", "mean"),
    )
)

# Share of games in which the Sharks were favored
summary["favored_pct"] = summary["favored_games"].div(summary["games"])

summary.head()

Unnamed: 0,season,games,favored_games,avg_win_prob,favored_pct
0,2023-24,82,1,0.317044,0.012195
1,2024-25,82,2,0.338443,0.02439
2,2025-26,50,3,0.387873,0.06


### Plotting games favored by season

In [None]:
import altair as alt
import pandas as pd

title = "No Dice in San Jose"
subtitle = "Average implied win probability since '23-24. Note: Vig is not removed."
footer = "Source: Moneyline.com | Graphic: @steodosescu"

# Title and subtitle share one centered title block
combined_title = alt.TitleParams(
    text=title,
    subtitle=subtitle,
    anchor="middle",
    color="black",
    fontSize=20,
    subtitleColor="gray",
    subtitleFontSize=14,
)

# 1) Build stacked counts in long format
# Wide form first: one row per season with favored / not-favored counts.
stack_df = summary.assign(
    not_favored_games=summary["games"] - summary["favored_games"]
)

# Long form: one row per (season, status) with its game count.
stack_long = pd.melt(
    stack_df,
    id_vars=["season", "games", "avg_win_prob"],
    value_vars=["favored_games", "not_favored_games"],
    var_name="status",
    value_name="count",
)

# Replace the column names with their legend labels
status_labels = {
    "favored_games": "Favored",
    "not_favored_games": "Not Favored",
}
stack_long["status"] = stack_long["status"].map(status_labels)

# Ensure stack order: Favored on bottom, Not Favored on top
status_order = ["Favored", "Not Favored"]

# 2) Base stacked bar chart: one bar per season, split into favored and
# not-favored game counts
bars = (
    alt.Chart(stack_long)
    .mark_bar()
    .encode(
        x=alt.X(
            "season:N",
            title="Avg Win Prob by Season",
            sort=None,  # no explicit sort requested for the season axis
            axis=alt.Axis(
                labelAngle=0,
                labelAlign="center",
                labelPadding=6
            )
        ),
        y=alt.Y("count:Q", title="Game No."),
        color=alt.Color(
            "status:N",
            scale=alt.Scale(
                domain=status_order,
                range=["#006D75", "#B0B0B0"]
            ),
            legend=alt.Legend(title="")
        ),
        # Stack segments in ascending status order
        order=alt.Order("status:N", sort="ascending")
    )
    .properties(title=combined_title, width=520, height=320)
)

# 3) Label at the top of each bar: avg implied probability
# Built from the wide per-season frame so there is one label per season.
labels = (
    alt.Chart(stack_df)
    .mark_text(dy=-8, fontSize=12, fontWeight="bold", color="black")
    .encode(
        x=alt.X("season:N", sort=None),
        y=alt.Y("games:Q"),  # place at total height of the bar
        text=alt.Text("avg_win_prob:Q", format=".1%")
    )
)

# Layer the labels over the bars, then apply figure-wide styling
chart = (bars + labels).configure(
    background="floralwhite",
    view=alt.ViewConfig(
        fill="floralwhite",
        stroke=None   # <-- removes faint border around plot area
    ),
    axis=alt.AxisConfig(
        domainColor="black",
        grid=False
    ),
    legend=alt.LegendConfig(
        orient="top",
        direction="horizontal"
    )
)

# Save high-resolution image. scale_factor=3 increases image size and resolution
chart.save(
    "sharks_prob_by_season.png",
    scale_factor=3
)

# Last expression in the cell: displays the chart in the notebook
chart