# Dodgers Data Bot
> This notebook is a sketchpad for data collected in this project. Nothing to see here! 

---

In [1]:
import os
import requests
import time
import pandas as pd
import jupyter_black
import altair as alt
from IPython.display import Image
from tqdm.notebook import tqdm

In [2]:
# Notebook-wide setup: format cells with black, widen pandas display limits,
# and lift Altair's row cap so full-history frames can be charted.
jupyter_black.load()
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [3]:
# AWS profile name from the environment; None when the variable is unset.
profile_name = os.getenv("AWS_PERSONAL_PROFILE")

---

## Fetch

#### Read wins, losses

In [4]:
# Load the wins/losses table from a remote parquet file into `df`, the frame
# that the charting cells below query by `year`, `gm`, `gb`, `win_pct`, `r`, `ra`.
df = pd.read_parquet(
    "https://stilesdata.com/dodgers/data/standings/dodgers_wins_losses_current.parquet"
)

In [5]:
# Game number ("gm") of the row carrying the most recent game_date.
latest_game_mask = df["game_date"] == df["game_date"].max()
game_number = df.loc[latest_game_mask, "gm"].iloc[0]
game_number

16

In [6]:
# Binned histogram of runs scored, coloured by game result.
# NOTE(review): a later cell splits `result` on "-", so values other than plain
# "W"/"L" may exist and would fall outside this colour domain — confirm.
result_colors = alt.Scale(domain=["W", "L"], range=["#005A9C", "#EF3E42"])

histogram = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X("r:Q", bin=alt.Bin(maxbins=20), title="Runs Scored"),
        y=alt.Y("count()", title="Number of games"),
        color=alt.Color("result:N", scale=result_colors, title=""),
    )
    .properties(
        width=400,
        height=200,
        title="Distribution of Runs Scored in Wins and Losses",
    )
    .configure_legend(orient="top")
)

histogram.display()

---

---

#### Game logs

In [9]:
# Read the game-log table; extract_links="body" turns every body cell into a
# (text, href) tuple so the box score link can be recovered.
logs = pd.read_html(
    "https://www.baseball-reference.com/teams/tgl.cgi?team=LAD&t=b&year=2024#rowsum_desc",
    extract_links="body",
)[0]

# The "Date" column holds the (text, link) tuples
date_links = logs["Date"]

# Split the tuples into two columns; rows without a link are dropped
date_link_df = pd.DataFrame(date_links.tolist(), columns=["date", "link"]).dropna(
    subset="link"
)

# Merge back onto the original frame. Assignment aligns on the index, so rows
# dropped above are left as NaN in `logs`.
logs["date"] = date_link_df["date"]
# Strip any leading "/" from the href before joining so the result never
# contains a "//" after the host (hrefs are presumably root-relative — confirm).
logs["link"] = "https://www.baseball-reference.com/" + date_link_df[
    "link"
].str.lstrip("/")

In [10]:
# Box score URLs for every game-log row that has a link.
urls = logs["link"].dropna().tolist()

In [11]:
# Download the first table on each box score page (read with its first row as
# the header) and collect one frame per game, tagged with the game date.
boxscores = []

for url in tqdm(urls):
    # The last 15 characters of the URL are "<YYYYMMDD><d>.shtml". Slice out the
    # eight date digits instead of stripping a literal "0.shtml", which only
    # worked when <d> was "0" (presumably <d> is non-zero for doubleheader
    # games — verify against the site).
    date = pd.to_datetime(url[-15:-7], format="%Y%m%d").strftime("%Y-%m-%d")
    box_df = (
        pd.read_html(url, header=0)[0]
        .drop(2)  # drop the row labelled 2 (third row of the table)
        .drop("Unnamed: 0", axis=1)
        .rename(columns={"Unnamed: 1": "team"})
    ).assign(date=date)
    boxscores.append(box_df)
    # Pause between requests to avoid hammering the site
    time.sleep(3)

  0%|          | 0/117 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Opponent line scores: rows that are neither the Dodgers nor a row whose team
# cell contains "Winning". Only innings 1-9 are kept; "X" becomes 0.
opp_boxes_df = (
    pd.concat(boxscores)
    .query('team != "Los Angeles Dodgers" and ~team.str.contains("Winning")')
    .reset_index(drop=True)[["date", *(str(inning) for inning in range(1, 10))]]
    .replace("X", 0)
    .assign(team="Opponent")
)
opp_boxes_df

In [None]:
# Dodgers line scores: innings 1-9 only, with "X" replaced by 0.
lad_boxes_df = (
    pd.concat(boxscores)
    .query('team == "Los Angeles Dodgers"')
    .reset_index(drop=True)[["date", *(str(inning) for inning in range(1, 10))]]
    .replace("X", 0)
    .assign(team="Dodgers")
)
lad_boxes_df

In [None]:
# Stack Dodgers and opponent line scores into one frame. The index is not
# reset, so index labels from the two inputs repeat.
team_box = pd.concat([lad_boxes_df, opp_boxes_df])

In [None]:
# Cast the nine inning columns to integers.
inning_cols = [str(inning) for inning in range(1, 10)]
team_box[inning_cols] = team_box[inning_cols].astype(int)

In [None]:
# Reshape to long format: one row per (date, team, inning)
melted = pd.melt(
    team_box,
    id_vars=["date", "team"],
    value_vars=list("123456789"),
    var_name="inning",
    value_name="runs",
)

# Inning labels are strings; make them integers so they sort numerically
melted["inning"] = melted["inning"].astype(int)

# Total runs per team per inning
runs_by_team_inning = melted.groupby(["team", "inning"])["runs"].sum()
agg_runs = runs_by_team_inning.reset_index()

In [None]:
# Heatmap with one row per team and one column per inning, shaded by total runs
runs_shade = alt.Color(
    "runs:Q",
    title="Total Runs",
    scale=alt.Scale(scheme="greys"),
    legend=None,
)

heatmap = (
    alt.Chart(agg_runs)
    .mark_rect()
    .encode(
        alt.X("inning:O", title=""),
        alt.Y("team:N", title=""),
        runs_shade,
        tooltip=["team", "inning", "runs"],
    )
    .properties(width=500, height=50, title="Aggregate runs scored by inning")
)

heatmap

# Implement this heatmap ^ 

# BAR CODE CHART FOR EACH AT BAT FOR EACH STAR PLAYER WITH OUT VS. NOT OUT or OUT VS. HIT

# TEAM BAR CODE CHART WITH HITS OR HRs OR OUT VS. POSITIVE OUTCOME

#### Batting from MLB

In [15]:
# Browser-like request headers sent with the MLB stats requests.
headers = {
    "accept": "*/*",
    "accept-language": "en-US,en;q=0.9,es;q=0.8",
    "origin": "https://www.mlb.com",
    "priority": "u=1, i",
    "referer": "https://www.mlb.com/",
    "sec-ch-ua": '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"macOS"',
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "cross-site",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
}

# Season hitting stats for teamId=119, sorted by home runs (top 25).
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here rather than as a confusing
# failure when the body is parsed later.
batting_response = requests.get(
    "https://bdfed.stitch.mlbinfra.com/bdfed/stats/player?&env=prod&season=2024&sportId=1&stats=season&group=hitting&gameType=R&limit=25&offset=0&sortStat=homeRuns&order=desc&playerPool=ALL&teamId=119",
    headers=headers,
    timeout=30,
)
batting_response.raise_for_status()

In [18]:
# One row per player from the "stats" list in the JSON payload.
batting_stats = batting_response.json()["stats"]
batting_df = pd.DataFrame(batting_stats)

In [22]:
# Preview the first five hitters.
batting_df.head(5)

Unnamed: 0,year,playerId,playerName,type,rank,playerFullName,playerFirstName,playerLastName,playerUseName,playerInitLastName,teamId,teamAbbrev,teamName,teamShortName,leagueName,leagueId,positionAbbrev,position,primaryPositionAbbrev,plateAppearances,totalBases,leftOnBase,sacBunts,sacFlies,babip,extraBaseHits,hitByPitch,gidp,gidpOpp,numberOfPitches,pitchesPerPlateAppearance,walksPerPlateAppearance,strikeoutsPerPlateAppearance,homeRunsPerPlateAppearance,walksPerStrikeout,iso,reachedOnError,walkOffs,flyOuts,totalSwings,swingAndMisses,ballsInPlay,popOuts,lineOuts,groundOuts,flyHits,popHits,lineHits,groundHits,gamesPlayed,airOuts,runs,doubles,triples,homeRuns,strikeOuts,baseOnBalls,intentionalWalks,hits,avg,atBats,obp,slg,ops,caughtStealing,stolenBases,stolenBasePercentage,groundIntoDoublePlay,rbi,groundOutsToAirouts,catchersInterference,atBatsPerHomeRun
0,2024,606192,Teoscar Hernández,player,1,Teoscar Hernández,Teoscar,Hernández,Teoscar,T Hernández,119,LAD,Los Angeles Dodgers,Dodgers,NL,104,RF,Outfielder,RF,279,126,142,0,0,0.325,29,3,3,34,1100,3.943,0.082,0.287,0.054,0.288,0.238,5,0,26,575,191,172,9,16,55,21,0,21,24,66,51,38,13,1,15,80,23,1,66,0.262,252,0.331,0.5,0.831,2,4,.667,3,47,1.08,1,16.8
1,2024,660271,Shohei Ohtani,player,1,Shohei Ohtani,Shohei,Ohtani,Shohei,S Ohtani,119,LAD,Los Angeles Dodgers,Dodgers,NL,104,DH,Designated Hitter,TWP,287,145,116,0,2,0.356,34,1,5,49,1135,3.955,0.098,0.213,0.052,0.459,0.26,4,1,34,547,150,195,8,17,57,17,0,40,22,63,59,46,17,2,15,61,28,1,79,0.311,254,0.379,0.571,0.95,1,14,.933,5,41,0.97,2,16.93
2,2024,605141,Mookie Betts,player,3,Mookie Betts,Markus,Betts,Mookie,M Betts,119,LAD,Los Angeles Dodgers,Dodgers,NL,104,SS,Shortstop,SS,302,130,73,0,0,0.321,27,0,3,32,1232,4.079,0.149,0.096,0.033,1.552,0.195,0,0,63,459,62,228,16,29,40,17,0,43,20,65,108,48,14,3,10,29,45,0,80,0.311,257,0.414,0.506,0.92,1,9,.900,3,35,0.37,0,25.7
3,2024,571970,Max Muncy,player,4,Max Muncy,Maxwell,Muncy,Max,M Muncy,119,LAD,Los Angeles Dodgers,Dodgers,NL,104,3B,Third Base,3B,167,66,97,0,5,0.259,17,2,1,25,704,4.216,0.126,0.299,0.054,0.42,0.252,0,0,28,292,94,94,13,5,17,13,0,12,6,40,46,24,8,0,9,50,21,2,31,0.223,139,0.323,0.475,0.798,0,0,.---,1,28,0.37,0,15.44
4,2024,669257,Will Smith,player,4,Will Smith,William,Smith,Will,W Smith,119,LAD,Los Angeles Dodgers,Dodgers,NL,104,C,Catcher,C,238,104,102,0,4,0.311,25,3,3,43,944,3.966,0.092,0.155,0.038,0.595,0.206,1,0,46,455,85,176,11,16,42,17,0,29,15,55,73,33,16,0,9,37,22,3,61,0.292,209,0.361,0.498,0.859,1,0,.000,3,38,0.58,0,23.22


In [19]:
# Season pitching stats for teamId=119, sorted by strikeouts (top 25).
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here rather than when the body is
# parsed later.
pitching_response = requests.get(
    "https://bdfed.stitch.mlbinfra.com/bdfed/stats/player?&env=prod&season=2024&sportId=1&stats=season&group=pitching&gameType=R&limit=25&offset=0&sortStat=strikeouts&order=desc&teamId=119",
    headers=headers,
    timeout=30,
)
pitching_response.raise_for_status()

In [20]:
# One row per player from the "stats" list in the JSON payload.
pitching_stats = pitching_response.json()["stats"]
pitching_df = pd.DataFrame(pitching_stats)

---

## Comparing seasons to this point

#### Limit dataframe to latest game number this season

In [None]:
# Keep only games up to and including the current game number, for every season.
limit_df = df.loc[df["gm"] <= game_number].copy()

In [None]:
# Layered standings chart: every earlier season in grey, the 2024 season in
# Dodger blue, a zero reference line and three text annotations.

# Earlier seasons. The filter compares `year` with the string '2024', as every
# other year filter in this notebook does; comparing with the integer 2024
# would not exclude the 2024 rows from this grey layer.
past = (
    alt.Chart(df.query("year != '2024'"))
    .mark_line(size=0.8)
    .encode(
        x=alt.X(
            "gm",
            title="Game number in season",
            axis=alt.Axis(values=[20, 40, 60, 80, 100, 120, 140, 160]),
            scale=alt.Scale(domain=[0, 162]),
        ),
        y=alt.Y("gb:Q", title="Games ahead/back by game in the season: 1958-2024"),
        color=alt.Color("year:O", scale={"range": ["#bbbbbb"]}, legend=None),
    )
    .properties(
        width=800,
        height=400,
        title="LA Dodgers historical standings",
    )
)

# Current season, drawn thicker on top of the grey lines
current = (
    alt.Chart(df.query("year == '2024'"))
    .mark_line(size=2, color="#005A9C")
    .encode(
        # Same x-domain as the `past` layer so the layers share one scale
        x=alt.X("gm", scale=alt.Scale(domain=[0, 162])),
        y="gb:Q",
    )
)

# Horizontal rule at y = 0
hline = (
    alt.Chart(pd.DataFrame({"y": [0]}))
    .mark_rule(color="black", strokeWidth=0.5)
    .encode(y="y")
)

# Text annotation sitting on the horizontal line, pushed right with dx
text = (
    alt.Chart(pd.DataFrame({"y": [0], "text": ["Leading ↑"]}))
    .mark_text(
        color="#666666",
        align="center",
        baseline="bottom",
        dy=-0,
        dx=370,
        fontSize=12,
        fontWeight="bold",
    )
    .encode(y="y:Q", text="text:N")
)

# Grey label for the historical lines
anno_text = (
    alt.Chart(pd.DataFrame({"y": [20], "text": ["1958-2023"]}))
    .mark_text(
        color="#bbbbbb",
        align="center",
        baseline="bottom",
        dy=20,
        dx=20,
        fontSize=12,
        fontWeight="bold",
    )
    .encode(y="y:Q", text="text:N")
)

# Last row of the 2024 season, used to anchor the "2024" label
last_point_df = df.query("year == '2024'").tail(1).copy()
last_point_df["annotation"] = "2024"

# Text annotation for the "current" line
current_text_annotation = (
    alt.Chart(last_point_df)
    .mark_text(
        align="left",
        baseline="middle",
        dx=15,
        dy=-30,
        fontSize=12,
        fontWeight="bold",
        color="#005A9C",  # Match the line color
    )
    .encode(x=alt.X("gm:Q"), y=alt.Y("gb:Q"), text="annotation:N")
)

# Combine everything, including the text annotations
chart = past + hline + current + text + anno_text + current_text_annotation

# Show the chart
chart

In [None]:
# Write the layered standings chart to a PNG; the path is relative to the
# notebook's working directory.
chart.save("../visuals/standings.png")

In [None]:
# NOTE(review): presumably seasons to highlight (championship years?); no cell
# visible in this notebook reads `titles` — confirm before relying on or removing it.
titles = ["1959", "1963", "1965", "1981", "1988", "2020"]

In [None]:
# Layered winning-percentage chart: earlier seasons in grey, 2024 in Dodger
# blue, a .500 reference line and three text annotations.

# Earlier seasons
pct_past = (
    alt.Chart(df.query("year != '2024'"))
    .mark_line(size=0.8)
    .encode(
        x=alt.X(
            "gm",
            title="Game number in season",
            axis=alt.Axis(values=[20, 40, 60, 80, 100, 120, 140, 160]),
            scale=alt.Scale(domain=[0, 162]),
        ),
        y=alt.Y(
            "win_pct:Q", title="Winning percentage by game in the season: 1958-2024"
        ),
        color=alt.Color("year:O", scale={"range": ["#bbbbbb"]}, legend=None),
    )
    .properties(
        width=800,
        height=400,
        title="LA Dodgers historical performance",
    )
)

# Current season, drawn thicker on top of the grey lines
pct_current = (
    alt.Chart(df.query("year == '2024'"))
    .mark_line(size=2, color="#005A9C")
    .encode(
        # Same x-domain as the `pct_past` layer so the layers share one scale
        x=alt.X("gm", scale=alt.Scale(domain=[0, 162])),
        y=alt.Y("win_pct:Q", axis=alt.Axis(format="%")),
    )
)

# Horizontal rule at a .500 winning percentage
pct_hline = (
    alt.Chart(pd.DataFrame({"y": [0.5]}))
    .mark_rule(color="black", strokeWidth=0.5)
    .encode(y="y")
)

# Text annotation just above the horizontal line
pct_text = (
    alt.Chart(pd.DataFrame({"y": [0.51], "text": ["Winning ↑"]}))
    .mark_text(
        color="#666666",
        align="center",
        baseline="bottom",
        dy=-0,
        dx=370,
        fontSize=11,
        fontWeight="bold",
    )
    .encode(y="y:Q", text="text:N")
)

# Grey label for the historical lines
pct_anno_text = (
    alt.Chart(pd.DataFrame({"y": [0.4], "text": ["1958-2023"]}))
    .mark_text(
        color="#bbbbbb",
        align="center",
        baseline="bottom",
        dy=20,
        dx=20,
        fontSize=12,
        fontWeight="bold",
    )
    .encode(y="y:Q", text="text:N")
)

# Last row of the 2024 season, used to anchor the "2024" label
last_point_df = df.query("year == '2024'").tail(1).copy()
last_point_df["annotation"] = "2024"

# Text annotation for the "current" line. It is positioned by `win_pct`, the
# field this chart plots; anchoring it on `gb` (games back) put a value from a
# different scale onto the percentage axis.
# NOTE(review): dx/dy were tuned while the label was anchored on `gb` and may
# need re-tuning by eye.
pct_current_text_annotation = (
    alt.Chart(last_point_df)
    .mark_text(
        align="left",
        baseline="middle",
        dx=120,
        dy=-60,
        fontSize=12,
        fontWeight="bold",
        color="#005A9C",
    )
    .encode(x=alt.X("gm:Q"), y=alt.Y("win_pct:Q"), text="annotation:N")
)

# Combine everything, including the text annotations
pct_chart = (
    pct_past
    + pct_hline
    + pct_current
    + pct_text
    + pct_anno_text
    + pct_current_text_annotation
)

# Show the chart
pct_chart

In [None]:
# One bar per season showing `gb` at the current game number; seasons with
# gb > 0 are drawn in Dodger blue, the rest in light grey.
alt.Chart(limit_df.query(f"gm == {game_number}")).mark_bar().encode(
    # The x channel uses alt.X (it was previously built with the alt.Y class)
    x=alt.X(
        "year:O",
        axis=alt.Axis(
            values=[1960, 1970, 1980, 1990, 2000, 2010, 2024],
            title="",
        ),
    ),
    y=alt.Y("gb:Q", title=""),
    color=alt.condition(
        alt.datum.gb > 0,
        alt.value("#005A9C"),
        alt.value("#e9e9e9"),
    ),
).properties(
    width=650,
    height=200,
    title=f"LA Dodgers historical standings: Games back by game {game_number} of the season: 1958-2024",
)

---

#### Wins vs. Losses

In [None]:
# 2024 rows only, restricted to the columns needed for the win/loss chart.
wl_columns = ["gm", "game_date", "result", "r", "ra"]
wl_df = df.loc[df["year"] == "2024", wl_columns].copy()

In [None]:
# Keep only the part of `result` before the first "-".
result_parts = wl_df["result"].str.split("-", expand=True)
wl_df["result"] = result_parts[0]

In [None]:
# Run differential per game: runs scored minus runs allowed.
wl_df["run_diff"] = wl_df["r"].sub(wl_df["ra"])

In [None]:
# Rebuild the 2024 win/loss frame: keep the part of `result` before the first
# "-" and add the per-game run differential.
wl_df = df.query("year == '2024'")[["gm", "game_date", "result", "r", "ra"]].copy()
wl_df["result"] = wl_df["result"].str.split("-", expand=True)[0]
wl_df["run_diff"] = wl_df["r"] - wl_df["ra"]

# One bar per game: height is the run differential, blue when positive and
# light grey otherwise.
win_loss_chart = (
    alt.Chart(wl_df)
    .mark_bar()
    .encode(
        # The x channel uses alt.X (it was previously built with the alt.Y class)
        x=alt.X(
            "game_date:T",
            axis=alt.Axis(
                format="%B %-d",
                tickCount=6,
                title="",
            ),
        ),
        y=alt.Y("run_diff:Q", title=""),
        color=alt.condition(
            alt.datum.run_diff > 0,
            alt.value("#005A9C"),
            alt.value("#e9e9e9"),
        ),
    )
)

# Display a sized, titled copy; `win_loss_chart` itself is left unchanged.
win_loss_chart.properties(
    width=900,
    height=100,
    title="LA Dodgers 2024: Wins/losses and run differential",
)

---

## Scoring

#### Group by season and sum runs, runs against

In [None]:
# Per-season totals: runs scored, runs allowed and number of game rows.
runs_season_limit = (
    df.groupby("year")
    .agg(
        runs=("r", "sum"),
        runs_against=("ra", "sum"),
        games=("gm", "size"),
    )
    .reset_index()
)

#### Runs and runs against per game

In [None]:
# Average runs scored per game, rounded to two decimals.
runs_season_limit["runs_per_game"] = (
    runs_season_limit["runs"].div(runs_season_limit["games"]).round(2)
)

In [None]:
# Average runs allowed per game, rounded to two decimals.
runs_season_limit["runs_against_per_game"] = (
    runs_season_limit["runs_against"].div(runs_season_limit["games"]).round(2)
)

#### Difference

In [None]:
# Per-game scoring margin: runs scored per game minus runs allowed per game.
runs_season_limit["runs_per_game_diff"] = runs_season_limit["runs_per_game"].sub(
    runs_season_limit["runs_against_per_game"]
)

---

#### Runs scored to this point

In [None]:
# Make sure runs scored is an integer column before summing.
limit_df = limit_df.astype({"r": int})

In [None]:
# Runs scored through the current game number, one row per season, newest first.
runs_so_far = (
    limit_df.groupby("year", as_index=False)
    .agg(runs_to_date=("r", "sum"))
    .sort_values("year", ascending=False)
)

In [None]:
# Runs to date for the most recent season, as a plain Python int.
latest_season_mask = runs_so_far["year"] == runs_so_far["year"].max()
runs_this_season = int(runs_so_far.loc[latest_season_mask, "runs_to_date"].iloc[0])

In [None]:
# Shared encodings for the bar and label layers: one row per season, ordered
# by the x value, with the 2024 row highlighted by the colour condition.
base = (
    alt.Chart(runs_so_far)
    .encode(
        x=alt.X(
            "runs_to_date",
            title=f"Runs by game no. {game_number}",
            axis=alt.Axis(tickCount=6),
        ),
        y=alt.Y("year:O", title="").sort("x"),
        color=alt.condition(
            alt.datum.year == "2024",
            alt.value("steelblue"),
            alt.value("#e3e3e3"),
        ),
        text=alt.Text("runs_to_date", title=""),
    )
    .properties(
        height=1100,
        width=650,
        title=f"Dodgers historical offense: Total runs through game {game_number}, 1958-2024",
    )
)

# The bar and label layers are built only once, inside `final_chart` below.
# The cell previously also built the same layered chart here and discarded it.

# Vertical rule at this season's run total
vertical_line = (
    alt.Chart(pd.DataFrame({"x": [runs_this_season]}))
    .mark_rule(color="black", size=0.5)
    .encode(
        x="x:Q",
    )
)

# Label for the vertical rule, anchored to the latest season's row and then
# shifted with a fixed pixel offset.
text_annotation = (
    alt.Chart(pd.DataFrame({"x": [runs_this_season], "y": [runs_so_far["year"].max()]}))
    .mark_text(
        text=[f"Runs this season: {runs_this_season}"],
        align="left",
        dx=5,  # Horizontal offset from the rule
        dy=-1005,  # Vertical offset in pixels
    )
    .encode(
        x="x:Q",
        y=alt.Y("y:O", axis=alt.Axis(title="")),
    )
)

# Combine the base layers with the vertical rule and its label
final_chart = (
    base.mark_bar(color="#005A9C")
    + base.mark_text(align="left", dx=2, color="#000")
    + vertical_line
    + text_annotation
).properties(
    height=1100,
    width=650,
    title=f"Dodgers historical offense: Total runs through game {game_number}, 1958-2024",
)

In [None]:
# Write the layered runs chart to a PNG; the path is relative to the
# notebook's working directory.
final_chart.save("../visuals/runs.png")

In [None]:
# !jupyter nbconvert --to script --no-prompt --output ../03_viz_standings 03_viz_standings.ipynb