# LA Dodgers Standings, 1958-2023
> This notebook downloads historic standing tables from [Baseball Reference](https://www.baseball-reference.com/teams/LAD/2024-schedule-scores.shtml) and outputs them to CSV, JSON and Parquet formats for later analysis and visualization.

---

#### Import Python tools and Jupyter config

In [1]:
import pandas as pd
import jupyter_black
from time import sleep
from tqdm.notebook import tqdm

In [2]:
jupyter_black.load()
pd.options.display.max_columns = 100
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = None

---

## Fetch

#### List comprehension of historic urls

In [3]:
urls = [
    f"https://www.baseball-reference.com/teams/LAD/{year}-schedule-scores.shtml"
    for year in range(1958, 2024)
]

#### Loop through urls, fetch standings table, store in list of dataframes

In [4]:
dfs = []

for url in tqdm(urls):
    year = url.split("/")[5].replace("-schedule-scores.shtml", "")
    src_df = (
        pd.read_html(url)[0]
        .query("Tm !='Tm' and Inn != 'Game Preview, and Matchups'")
        .drop(["Unnamed: 2", "Streak", "Orig. Scheduled"], axis=1)
        .rename(columns={"Unnamed: 4": "home_away"})
        .assign(season=year)
    )
    dfs.append(src_df)
    sleep(4)

  0%|          | 0/66 [00:00<?, ?it/s]

#### Concatenate into one historic dataframe

In [5]:
src = pd.concat(dfs)

---

## Process

#### Clean columns

In [6]:
src.columns = src.columns.str.lower().str.replace("/", "_").str.replace("-", "-")

In [7]:
src.columns = [
    "gm",
    "date",
    "tm",
    "home_away",
    "opp",
    "result",
    "r",
    "ra",
    "inn",
    "record",
    "rank",
    "gb",
    "win",
    "loss",
    "save",
    "time",
    "day_night",
    "attendance",
    "cli",
    "year",
]

#### Split, format date

In [8]:
src[["weekday", "date"]] = src["date"].str.split(", ", expand=True)

In [9]:
src["date"] = src["date"].str.replace(" (1)", "").str.replace(" (2)", "")

In [10]:
src["game_date"] = pd.to_datetime(src["date"] + ", " + src["year"], format="%b %d, %Y")

#### Clean home-away column

In [11]:
src.loc[src.home_away == "@", "home_away"] = "away"
src.loc[src.home_away.isna(), "home_away"] = "home"

#### Format "games back" as a number (positive = lead; negative = behind)

In [12]:
src["gb"] = (
    src["gb"].str.replace("up ", "up").str.replace("up", "+").str.replace("Tied", "0")
)

In [13]:
src["gb"] = src["gb"].apply(
    lambda x: float(x) if x.startswith("+") else -float(x) if float(x) != 0 else 0
)

#### The *number* of games

In [14]:
src["gm"] = src["gm"].astype(int)

#### Just the columns we need, in a clean dataframe

In [15]:
df = src[
    [
        "gm",
        "game_date",
        "home_away",
        "opp",
        "result",
        "r",
        "ra",
        "record",
        "rank",
        "gb",
        "time",
        "day_night",
        "attendance",
        "year",
    ]
].copy()

---

#### Limit dataframe to latest game number this season

In [16]:
game_number = df.query("game_date == game_date.max()")["gm"].iloc[0]

In [17]:
limit_df = df.query(f"gm <= {game_number}").copy()

In [18]:
past = (
    alt.Chart(df.query("year != '2024'"))
    .mark_line(size=1)
    .encode(
        x=alt.X(
            "gm",
            title="Game number in season",
            axis=alt.Axis(values=[20, 40, 60, 80, 100, 120, 140, 160]),
        ),
        y=alt.Y("gb:Q", title=""),
        color=alt.Color("year", scale={"range": ["#e2e2e2"]}, legend=None),
    )
    .properties(
        width=900,
        height=400,
        title="LA Dodgers historical standings: Games back by game in the season: 1958-2024",
    )
)

current = (
    alt.Chart(df.query("year == '2024'"))
    .mark_line(size=2, color="#005A9C")
    .encode(
        x="gm",
        y="gb:Q",
    )
)

past + current

NameError: name 'alt' is not defined

In [None]:
alt.Chart(limit_df.query(f"gm == {game_number}")).mark_bar().encode(
    x=alt.Y(
        "year:O",
        axis=alt.Axis(
            values=["1960", "1970", "1980", "1990", "2000", "2010", "2020", "2024"],
            title="",
        ),
    ),
    y=alt.Y("gb:Q", title=""),
    color=alt.condition(
        alt.datum.gb > 0,
        alt.value("#005A9C"),  # The positive color
        alt.value("#EF3E42"),  # The negative color
    ),
).properties(
    width=900,
    title=f"LA Dodgers historical standings: Games back by game {game_number} of the season: 1958-2024",
)

#### Runs scrored to this point

In [None]:
limit_df["r"] = limit_df["r"].astype(int)

In [None]:
runs_so_far = (
    limit_df.groupby("year")["r"]
    .sum()
    .reset_index(name="runs_to_date")
    .sort_values("year", ascending=False)
)

In [None]:
base = (
    alt.Chart(runs_so_far)
    .encode(
        x=alt.X("runs_to_date", title=f"Runs by game no. {game_number}"),
        y=alt.Y("year:O", title="").sort("-x"),
        color=alt.condition(
            alt.datum.year == "2024",
            alt.value("steelblue"),  # The positive color
            alt.value("#e3e3e3"),  # The negative color
        ),
        text=alt.Text("runs_to_date", title=""),
    )
    .properties(
        height=1100,
        width=300,
        title=f"Dodgers historical offense: Total runs through game {game_number}, 1958-2024",
    )
)

base.mark_bar(color="#005A9C") + base.mark_text(align="left", dx=2, color="#000")

---

## Exports

#### CSV format

In [19]:
df.to_csv("data/processed/dodgers_standings_1958_2023.csv", index=False)

#### JSON

In [20]:
df.to_json(
    "data/processed/dodgers_standings_1958_2023.json", indent=4, orient="records"
)

#### Parquet

In [21]:
df.to_parquet("data/processed/dodgers_standings_1958_2023.parquet", index=False)