# Dodgers Jumbo Jack
> This notebook processes game logs from Baseball Reference and calculates the share of games in which Dodgers pitchers had at least seven strikeouts. This triggers a promotion at Jack in the Box.

---

In [1]:
import os
import requests
import time
import pandas as pd
import jupyter_black
from tqdm.notebook import tqdm
from datetime import datetime
import altair as alt
import altair_stiles as altstiles

In [2]:
# Notebook-wide setup: load jupyter_black, relax the pandas display limits,
# then register and enable the "stiles" Altair theme.
jupyter_black.load()
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)  # None = never truncate cell text
alt.themes.register("stiles", altstiles.theme)
alt.themes.enable("stiles")

ThemeRegistry.enable('stiles')

---

In [3]:
# Baseball Reference team game-log URL for LAD; `{year}` is filled in per season.
base_url = "https://www.baseball-reference.com/teams/tgl.cgi?team=LAD&t=p&year={year}"

# Seasons to fetch: 2000 through the current calendar year
# (range() excludes its end point, hence the +1).
first_season, latest_season = 2000, datetime.now().year
years = range(first_season, latest_season + 1)

In [4]:
# Download each season's game log, keep only the columns the analysis needs,
# and collect the per-season tables in `frames`.
frames = []
wanted_cols = ["date", "home_away", "so", "year"]
for yr in tqdm(years):
    url = base_url.format(year=yr)
    try:
        src = pd.read_html(url)[0]  # first table parsed from the page
        # Assumes a multi-level header: discard the top level, lower-case the rest.
        src.columns = src.columns.droplevel(0).str.lower()
        # Give the unnamed fourth column a usable name and tag every row with its season.
        df = src.rename(columns={"unnamed: 3_level_1": "home_away"}).assign(year=yr)
        df_slim = df[wanted_cols]
        frames.append(df_slim)
    except Exception as e:
        # Best effort: report the season that failed and move on to the next one.
        print(f"Failed on {yr}: {e}")
    # Pause between requests, whether or not the fetch succeeded.
    time.sleep(5)

  0%|          | 0/26 [00:00<?, ?it/s]

In [5]:
# Stack the seasons into one frame, then drop rows where "so" holds the literal
# "SO" (presumably header rows repeated inside the table) and rows with no date.
# The integer cast is done inside the chain with .assign(): assigning a column
# onto the filtered result of .query() can raise SettingWithCopyWarning on
# pandas versions without copy-on-write.
all_years = (
    pd.concat(frames, ignore_index=True)
    .query('so != "SO" and ~date.isnull()')
    .assign(so=lambda d: d["so"].astype(int))
)

In [6]:
# Minimum strikeouts in a game for it to count toward the promotion.
JUMBO_JACK_MIN_STRIKEOUTS = 7

# Flag qualifying games. Rebinding through .assign() (rather than setting a column
# on the filtered frame in place) keeps the cell idempotent and avoids a possible
# SettingWithCopyWarning on pandas versions without copy-on-write.
all_years = all_years.assign(
    jumbo_jack=lambda d: d["so"] >= JUMBO_JACK_MIN_STRIKEOUTS
)

In [7]:
# Histograms of strikeouts per game, faceted into one small panel per season.
strikeouts_x = alt.X(
    "so:Q",
    axis=alt.Axis(format=".0f", values=[1, 7, 15, 21]),
    bin=alt.Bin(maxbins=20, nice=False),
    title="Strikeouts",
)
games_y = alt.Y("count()", title="Games")

(
    alt.Chart(all_years)
    .mark_bar(binSpacing=0)
    .encode(x=strikeouts_x, y=games_y)
    .properties(width=200, height=100)
    .facet("year:O", columns=6)
)

In [8]:
# The mean of a boolean column is the fraction of True values, so the
# per-season mean of `jumbo_jack` is the share of qualifying games.
jumbo_rate = all_years.groupby("year")["jumbo_jack"].mean()
season_share = (
    jumbo_rate.mul(100)  # fraction -> percent
    .round(1)
    .rename("pct_jumbo_jack")
    .reset_index()
)
season_share

Unnamed: 0,year,pct_jumbo_jack
0,2000,50.6
1,2001,64.2
2,2002,55.6
3,2003,69.1
4,2004,47.5
5,2005,43.8
6,2006,45.7
7,2007,59.3
8,2008,58.6
9,2009,66.7


In [9]:
# Same share, computed the long way: count games and qualifying games per
# season, then divide.
season_summary = (
    all_years.groupby("year")
    .agg(
        # "size" counts every row in the group, nulls included
        n_games=pd.NamedAgg(column="jumbo_jack", aggfunc="size"),
        # summing booleans counts the True values
        n_true=pd.NamedAgg(column="jumbo_jack", aggfunc="sum"),
    )
    .reset_index()
    .assign(
        pct_jumbo_jack=lambda d: d["n_true"].div(d["n_games"]).mul(100).round(1)
    )
)

In [10]:
# Notes on the game counts shown below:
# - The 2020 season was shortened by COVID.
# - The 2011 season had only 161 games because a late-season rainout was never rescheduled.
season_summary

Unnamed: 0,year,n_games,n_true,pct_jumbo_jack
0,2000,162,82,50.6
1,2001,162,104,64.2
2,2002,162,90,55.6
3,2003,162,112,69.1
4,2004,162,77,47.5
5,2005,162,71,43.8
6,2006,162,74,45.7
7,2007,162,96,59.3
8,2008,162,95,58.6
9,2009,162,108,66.7
