# Dodgers Data Bot
> This notebook is a sketchpad for data collected in this project. Nothing to see here! 

---

In [2]:
import os
import pandas as pd
import jupyter_black
import altair as alt
import altair_stiles as altstiles
from IPython.display import Image

In [3]:
# Load the jupyter_black extension for this session.
jupyter_black.load()

# Raise pandas' display limits so wide and long frames are not truncated.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

# Lift Altair's row limit, then register and activate the "stiles" theme.
alt.data_transformers.disable_max_rows()
alt.themes.register("stiles", altstiles.theme)
alt.themes.enable("stiles")

ThemeRegistry.enable('stiles')

In [4]:
# AWS profile name from the environment; None when the variable is unset.
profile_name = os.getenv("AWS_PERSONAL_PROFILE")

---

## Fetch

#### Read wins, losses

In [5]:
# Remote parquet file holding the wins/losses game log.
wins_losses_url = (
    "https://stilesdata.com/dodgers/data/standings/dodgers_wins_losses_current.parquet"
)
df = pd.read_parquet(wins_losses_url)

In [6]:
# Game number ("gm") of the row carrying the most recent game_date.
latest_game_mask = df["game_date"] == df["game_date"].max()
game_number = df.loc[latest_game_mask, "gm"].iloc[0]
game_number

51

In [7]:
# Binned histogram of runs scored per game, colored by result (W blue, L red).
runs_axis = alt.X("r:Q", bin=alt.Bin(maxbins=20), title="Runs Scored")
games_axis = alt.Y("count()", title="Number of games")
result_color = alt.Color(
    "result:N",
    scale=alt.Scale(domain=["W", "L"], range=["#005A9C", "#EF3E42"]),
    title="",
)

histogram = (
    alt.Chart(df)
    .mark_bar()
    .encode(runs_axis, games_axis, result_color)
    .properties(
        width=900, height=200, title="Distribution of Runs Scored in Wins and Losses"
    )
    .configure_legend(orient="top")
)

histogram.display()

---

## Summary tests

In [26]:
# Remote parquet file with the 1958-present standings, loaded for the summary checks below.
standings_history_url = (
    "https://stilesdata.com/dodgers/data/standings/dodgers_standings_1958_present.parquet"
)
standings_test = pd.read_parquet(standings_history_url)

In [37]:
def to_ordinal(value):
    """Return ``value`` as an English ordinal string (1 -> '1st', 2 -> '2nd', 11 -> '11th').

    ``value`` is coerced with ``int()``, so numeric strings are accepted as well
    as integers. Raises ``ValueError``/``TypeError`` if it cannot be coerced.
    """
    number = int(value)
    # The teens (11th, 12th, 13th, ...) take "th" regardless of their last digit.
    if 10 <= number % 100 <= 20:
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(number % 10, "th")
    return f"{number}{suffix}"


# `to_ordinal` was not defined or imported anywhere in this notebook, so this cell
# raised NameError on a fresh kernel. Missing ranks are passed through untouched.
standings_test["rank_ordinal"] = standings_test["rank"].map(
    to_ordinal, na_action="ignore"
)

In [38]:
# Spot-check the first converted rank.
standings_test["rank_ordinal"].iat[0]

'1st'

In [28]:
# NOTE(review): this cell's source is only a dangling line-continuation backslash.
# The code that produced the saved output below is no longer in the notebook — TODO restore it.
\

Unnamed: 0,gm,game_date,home_away,opp,result,r,ra,record,rank,gb,time,time_minutes,day_night,attendance,year,wins,losses,win_pct,game_day
0,51,2024-05-21,home,ARI,L,3,7,33-18,1,8.0,2:30:00,150,N,46180,2024,33,18,0.65,Tuesday
1,50,2024-05-20,home,ARI,W,6,4,33-17,1,8.0,2:39:00,159,N,37634,2024,33,17,0.66,Monday
2,49,2024-05-19,home,CIN,W,3,2,32-17,1,7.5,2:54:00,174,D,52656,2024,32,17,0.65,Sunday
3,48,2024-05-18,home,CIN,W,4,0,31-17,1,7.5,2:07:00,127,N,49239,2024,31,17,0.65,Saturday
4,47,2024-05-17,home,CIN,W,7,3,30-17,1,7.0,2:19:00,139,N,46832,2024,30,17,0.64,Friday


4     1
6     1
10    1
13    6
15    3
16    4
17    2
Name: context_value, dtype: object

---

## Comparing seasons to this point

#### Limit dataframe to latest game number this season

In [6]:
# Rows from every season whose game number is at or below the current game number.
limit_df = df.loc[df["gm"] <= game_number].copy()

In [7]:
# Games ahead/back by game number: every earlier season in gray, 2024 in blue.

# Earlier seasons only. `year` is filtered as the string '2024' everywhere else in
# this notebook, so the exclusion has to use the same string literal; the previous
# integer comparison (`year != 2024`) was inconsistent with those filters.
past = (
    alt.Chart(df.query("year != '2024'"))
    .mark_line(size=0.8)
    .encode(
        x=alt.X(
            "gm",
            title="Game number in season",
            axis=alt.Axis(values=[20, 40, 60, 80, 100, 120, 140, 160]),
            scale=alt.Scale(domain=[0, 162]),
        ),
        y=alt.Y("gb:Q", title="Games ahead/back by game in the season: 1958-2024"),
        # One line per season; the single-color range is meant to draw them all in gray.
        color=alt.Color("year:O", scale={"range": ["#bbbbbb"]}, legend=None),
    )
    .properties(
        width=800,
        height=400,
        title="LA Dodgers historical standings",
    )
)

# Current season, drawn thicker and in blue.
current = (
    alt.Chart(df.query("year == '2024'"))
    .mark_line(size=2, color="#005A9C")
    .encode(
        # Same x domain as `past` (was [0, 160], which did not match).
        x=alt.X("gm", scale=alt.Scale(domain=[0, 162])),
        y="gb:Q",
    )
)

# Horizontal reference rule at zero games back.
hline = (
    alt.Chart(pd.DataFrame({"y": [0]}))
    .mark_rule(color="black", strokeWidth=0.5)
    .encode(y="y")
)

# "Leading" label sitting just above the zero rule; dx shifts it horizontally.
text = (
    alt.Chart(pd.DataFrame({"y": [0], "text": ["Leading ↑"]}))
    .mark_text(
        color="#666666",
        align="center",
        baseline="bottom",
        dy=0,
        dx=370,
        fontSize=12,
        fontWeight="bold",
    )
    .encode(y="y:Q", text="text:N")
)

# Gray label identifying the historical (1958-2023) lines.
anno_text = (
    alt.Chart(pd.DataFrame({"y": [20], "text": ["1958-2023"]}))
    .mark_text(
        color="#bbbbbb",
        align="center",
        baseline="bottom",
        dy=20,
        dx=20,
        fontSize=12,
        fontWeight="bold",
    )
    .encode(y="y:Q", text="text:N")
)

# Row used to anchor the "2024" label.
# NOTE(review): tail(1) takes the last row as stored; it is the latest game only if
# df is ordered by ascending game number — TODO confirm.
last_point_df = df.query("year == '2024'").tail(1).copy()
last_point_df["annotation"] = "2024"

# Text label for the current-season line, colored to match it.
current_text_annotation = (
    alt.Chart(last_point_df)
    .mark_text(
        align="left",
        baseline="middle",
        dx=15,
        dy=-30,
        fontSize=12,
        fontWeight="bold",
        color="#005A9C",
    )
    .encode(x=alt.X("gm:Q"), y=alt.Y("gb:Q"), text="annotation:N")
)

# Layer everything; later layers are listed after earlier ones.
chart = past + hline + current + text + anno_text + current_text_annotation

# Show the chart
chart

In [8]:
# Write the layered standings chart to a PNG file under ../visuals.
chart.save("../visuals/standings.png")

In [9]:
# Season labels kept as strings; not referenced by any other cell in this notebook.
# NOTE(review): presumably championship seasons — confirm before using.
titles = [str(season) for season in (1959, 1963, 1965, 1981, 1988, 2020)]

In [10]:
# Winning percentage by game number: every earlier season in gray, 2024 in blue.

# Earlier seasons only (year is compared as a string).
pct_past = (
    alt.Chart(df.query("year != '2024'"))
    .mark_line(size=0.8)
    .encode(
        x=alt.X(
            "gm",
            title="Game number in season",
            axis=alt.Axis(values=[20, 40, 60, 80, 100, 120, 140, 160]),
            scale=alt.Scale(domain=[0, 162]),
        ),
        y=alt.Y(
            "win_pct:Q", title="Winning percentage by game in the season: 1958-2024"
        ),
        # One line per season; the single-color range is meant to draw them all in gray.
        color=alt.Color("year:O", scale={"range": ["#bbbbbb"]}, legend=None),
    )
    .properties(
        width=800,
        height=400,
        title="LA Dodgers historical performance",
    )
)

# Current season, drawn thicker and in blue.
pct_current = (
    alt.Chart(df.query("year == '2024'"))
    .mark_line(size=2, color="#005A9C")
    .encode(
        # Same x domain as `pct_past` (was [0, 160], which did not match).
        x=alt.X("gm", scale=alt.Scale(domain=[0, 162])),
        y=alt.Y("win_pct:Q", axis=alt.Axis(format="%")),
    )
)

# Horizontal reference rule at a .500 winning percentage.
pct_hline = (
    alt.Chart(pd.DataFrame({"y": [0.5]}))
    .mark_rule(color="black", strokeWidth=0.5)
    .encode(y="y")
)

# "Winning" label placed just above the .500 rule; dx shifts it horizontally.
pct_text = (
    alt.Chart(pd.DataFrame({"y": [0.51], "text": ["Winning ↑"]}))
    .mark_text(
        color="#666666",
        align="center",
        baseline="bottom",
        dy=0,
        dx=370,
        fontSize=11,
        fontWeight="bold",
    )
    .encode(y="y:Q", text="text:N")
)

# Gray label identifying the historical (1958-2023) lines.
pct_anno_text = (
    alt.Chart(pd.DataFrame({"y": [0.4], "text": ["1958-2023"]}))
    .mark_text(
        color="#bbbbbb",
        align="center",
        baseline="bottom",
        dy=20,
        dx=20,
        fontSize=12,
        fontWeight="bold",
    )
    .encode(y="y:Q", text="text:N")
)

# Row used to anchor the "2024" label.
# NOTE(review): tail(1) takes the last row as stored; it is the latest game only if
# df is ordered by ascending game number — TODO confirm.
last_point_df = df.query("year == '2024'").tail(1).copy()
last_point_df["annotation"] = "2024"

# Text label for the current-season line. It is positioned on win_pct, the measure
# this chart plots; it was previously bound to gb, copied from the standings chart,
# which put a games-back value on the winning-percentage axis.
# NOTE(review): dx/dy were tuned while y was bound to gb and may need re-tuning.
pct_current_text_annotation = (
    alt.Chart(last_point_df)
    .mark_text(
        align="left",
        baseline="middle",
        dx=120,
        dy=-60,
        fontSize=12,
        fontWeight="bold",
        color="#005A9C",
    )
    .encode(x=alt.X("gm:Q"), y=alt.Y("win_pct:Q"), text="annotation:N")
)

# Layer everything, including the current-season label.
pct_chart = (
    pct_past
    + pct_hline
    + pct_current
    + pct_text
    + pct_anno_text
    + pct_current_text_annotation
)

# Show the chart
pct_chart

In [None]:
# Bar chart of games ahead/back at the current game number, one bar per season.
# Bars with gb > 0 are blue; all others are light gray.
alt.Chart(limit_df.query(f"gm == {game_number}")).mark_bar().encode(
    # The x channel is built with alt.X (it was previously constructed with alt.Y).
    x=alt.X(
        "year:O",
        axis=alt.Axis(
            values=[1960, 1970, 1980, 1990, 2000, 2010, 2024],
            title="",
        ),
    ),
    y=alt.Y("gb:Q", title=""),
    color=alt.condition(
        alt.datum.gb > 0,
        alt.value("#005A9C"),
        alt.value("#e9e9e9"),
    ),
).properties(
    width=650,
    height=200,
    title=f"LA Dodgers historical standings: Games back by game {game_number} of the season: 1958-2024",
)

---

#### Wins vs. Losses

In [54]:
# 2024 rows only, reduced to the columns needed for the win/loss chart.
wl_columns = ["gm", "game_date", "result", "r", "ra"]
wl_df = df.loc[df["year"] == "2024", wl_columns].copy()

In [55]:
# Keep only the part of `result` before the first "-".
result_parts = wl_df["result"].str.split("-", expand=True)
wl_df["result"] = result_parts[0]

In [57]:
# Run differential per game: runs scored minus runs allowed.
wl_df["run_diff"] = wl_df["r"].sub(wl_df["ra"])

In [67]:
# Rebuild the 2024 win/loss frame so this cell runs on its own:
# select columns, keep the part of `result` before the first "-", add run differential.
wl_df = df.query("year == '2024'")[["gm", "game_date", "result", "r", "ra"]].copy()
wl_df["result"] = wl_df["result"].str.split("-", expand=True)[0]
wl_df["run_diff"] = wl_df["r"] - wl_df["ra"]

# One bar per game date; bar height is the run differential.
# Positive differentials are blue, everything else light gray.
win_loss_chart = (
    alt.Chart(wl_df)
    .mark_bar()
    .encode(
        # The x channel is built with alt.X (it was previously constructed with alt.Y).
        x=alt.X(
            "game_date:T",
            axis=alt.Axis(
                format="%B %-d",
                tickCount=6,
                title="",
            ),
        ),
        y=alt.Y("run_diff:Q", title=""),
        color=alt.condition(
            alt.datum.run_diff > 0,
            alt.value("#005A9C"),
            alt.value("#e9e9e9"),
        ),
    )
)

# Display a sized, titled copy; `win_loss_chart` itself is left without these properties.
win_loss_chart.properties(
    width=900,
    height=100,
    title="LA Dodgers 2024: Wins/losses and run differential",
)

---

## Scoring

#### Group by season and sum runs, runs against

In [12]:
# Per-season totals: runs scored, runs allowed, and number of game rows.
# NOTE(review): despite the "_limit" name this aggregates `df`, not `limit_df` — confirm intent.
runs_season_limit = (
    df.groupby("year")
    .agg(
        runs=("r", "sum"),
        runs_against=("ra", "sum"),
        games=("gm", "size"),
    )
    .reset_index()
)

#### Runs and runs against per game

In [13]:
# Runs scored per game for each season, rounded to two decimals.
runs_per_game = runs_season_limit["runs"].div(runs_season_limit["games"])
runs_season_limit["runs_per_game"] = runs_per_game.round(2)

In [14]:
# Runs allowed per game for each season, rounded to two decimals.
runs_against_per_game = runs_season_limit["runs_against"].div(
    runs_season_limit["games"]
)
runs_season_limit["runs_against_per_game"] = runs_against_per_game.round(2)

#### Difference

In [15]:
# Per-game scoring margin: rounded runs per game minus rounded runs allowed per game.
runs_season_limit["runs_per_game_diff"] = runs_season_limit["runs_per_game"].sub(
    runs_season_limit["runs_against_per_game"]
)

---

#### Runs scored to this point

In [16]:
# Cast runs scored to integers before summing them per season.
runs_as_int = limit_df["r"].astype(int)
limit_df["r"] = runs_as_int

In [17]:
# Total runs per season through the current game number, newest season first.
runs_so_far = (
    limit_df.groupby("year", as_index=False)["r"]
    .sum()
    .rename(columns={"r": "runs_to_date"})
    .sort_values("year", ascending=False)
)

In [18]:
# Runs to date for the latest season (the row whose year equals the maximum year).
latest_season_mask = runs_so_far["year"] == runs_so_far["year"].max()
runs_this_season = int(runs_so_far.loc[latest_season_mask, "runs_to_date"].iloc[0])

In [19]:
# Horizontal bars of runs scored through the current game number, one per season,
# with a vertical rule and label marking this season's total.

# Shared title, defined once instead of repeating the f-string.
runs_chart_title = (
    f"Dodgers historical offense: Total runs through game {game_number}, 1958-2024"
)

# Encodings shared by the bar layer and the value-label layer.
base = (
    alt.Chart(runs_so_far)
    .encode(
        x=alt.X(
            "runs_to_date",
            title=f"Runs by game no. {game_number}",
            axis=alt.Axis(tickCount=6),
        ),
        y=alt.Y("year:O", title="").sort("x"),
        # NOTE(review): this color condition is part of `base`, so it applies to both
        # the bar and the text layers — confirm the mark-level colors below still matter.
        color=alt.condition(
            alt.datum.year == "2024",
            alt.value("steelblue"),
            alt.value("#e3e3e3"),
        ),
        text=alt.Text("runs_to_date", title=""),
    )
    .properties(
        height=1100,
        width=650,
        title=runs_chart_title,
    )
)

# (A bars + labels expression used to be built here and immediately discarded;
# the same layers are assembled in `final_chart` below.)

# Vertical rule at this season's run total.
vertical_line = (
    alt.Chart(pd.DataFrame({"x": [runs_this_season]}))
    .mark_rule(color="black", size=0.5)
    .encode(
        x="x:Q",
    )
)

# Label for the rule, anchored at the latest season's row and shifted with dx/dy.
text_annotation = (
    alt.Chart(pd.DataFrame({"x": [runs_this_season], "y": [runs_so_far["year"].max()]}))
    .mark_text(
        text=[f"Runs this season: {runs_this_season}"],
        align="left",
        dx=5,  # Adjust text position horizontally
        dy=-1005,  # Adjust text position vertically
    )
    .encode(
        x="x:Q",
        y=alt.Y("y:O", axis=alt.Axis(title="")),
    )
)

# Bars, value labels, rule and rule label layered into the final chart.
final_chart = (
    base.mark_bar(color="#005A9C")
    + base.mark_text(align="left", dx=2, color="#000")
    + vertical_line
    + text_annotation
).properties(
    height=1100,
    width=650,
    title=runs_chart_title,
)

In [20]:
# Write the layered runs chart to a PNG file under ../visuals.
final_chart.save("../visuals/runs.png")

In [21]:
# !jupyter nbconvert --to script --no-prompt --output ../03_viz_standings 03_viz_standings.ipynb