# LA Dodgers toplines
> This notebook extracts key statistics from the project's processed tables for display in a dashboard.

---

#### Import Python tools and Jupyter config

In [1]:
import os
import boto3
import pandas as pd
import jupyter_black
import altair as alt
from io import BytesIO

In [2]:
# Enable Black auto-formatting for code cells.
jupyter_black.load()

# Widen pandas' display limits so large tables are shown without truncation.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

---

## Read

#### Standings

In [32]:
# Download the full standings history once, then split it by season.
# (Previously the same parquet file was fetched twice, once per filter.)
standings_all = pd.read_parquet(
    "https://stilesdata.com/dodgers/data/standings/dodgers_standings_1958_present.parquet"
)
standings = standings_all.query("year == '2024'")
standings_past = standings_all.query("year != '2024'")

# Row(s) from the most recent game date of the current season; copied so that
# columns can be added later without touching `standings`.
standings_now = standings.query("game_date == game_date.max()").copy()

In [33]:
# Spell out the one-letter game result ("L"/"W") as a word for the narrative summary.
for result_code, result_label in (("L", "loss"), ("W", "win")):
    standings_now.loc[standings_now["result"] == result_code, "result_clean"] = (
        result_label
    )

In [34]:
# Team batting totals, one row per season.
batting = pd.read_parquet(
    path="https://stilesdata.com/dodgers/data/batting/dodgers_team_batting_1958_present.parquet"
)

In [35]:
# Split the batting table into the current season and every other season.
is_current_season = batting["season"] == "2024"
batting_past = batting[~is_current_season].copy()
batting_now = batting[is_current_season].copy()

In [38]:
# Preview only the first rows: with display.max_rows raised to 1000, a bare
# `batting` would render the entire table into the notebook output.
batting.head(10)

Unnamed: 0,name,age,g,pa,ab,r,h,2b,3b,hr,rbi,sb,cs,bb,so,ba,obp,slg,ops,ops_plus,tb,gdp,hbp,sh,sf,ibb,season
0,Team Totals,30.6,33,1315,1142,177,310,72,3,38,171,22,6,143,279,0.271,0.354,0.44,0.793,125,502,22,11,1,16,4,2024
1,Team Totals,31.0,162,6333,5524,906,1422,303,20,249,877,105,25,644,1359,0.257,0.34,0.455,0.795,113,2512,98,85,5,69,29,2023
2,Team Totals,29.7,162,6247,5526,847,1418,325,31,212,812,98,18,607,1374,0.257,0.333,0.442,0.775,115,2441,85,56,3,53,22,2022
3,Team Totals,29.3,162,6239,5445,830,1330,247,24,237,799,65,17,613,1408,0.244,0.33,0.429,0.759,101,2336,96,104,32,45,36,2021
4,Team Totals,28.0,60,2316,2042,349,523,97,6,118,327,29,8,228,471,0.256,0.338,0.483,0.821,119,986,46,30,3,12,7,2020
5,Team Totals,27.9,162,6282,5493,886,1414,302,20,279,861,57,10,607,1356,0.257,0.338,0.472,0.81,111,2593,100,81,55,45,47,2019
6,Team Totals,28.1,163,6358,5572,804,1394,296,33,235,756,75,24,647,1436,0.25,0.333,0.442,0.774,109,2461,119,61,39,39,47,2018
7,Team Totals,27.9,162,6191,5408,770,1347,312,20,221,730,77,28,649,1380,0.249,0.334,0.437,0.771,104,2362,119,64,31,38,41,2017
8,Team Totals,28.9,162,6164,5518,725,1376,272,21,189,680,45,26,525,1321,0.249,0.319,0.409,0.728,95,2257,120,58,30,32,31,2016
9,Team Totals,29.7,162,6090,5385,667,1346,263,26,187,638,59,34,563,1258,0.25,0.326,0.413,0.739,106,2222,135,60,49,30,31,2015


---

## Key statistics

#### 1. Current season record (Wins-Losses)
> Provides an immediate understanding of the team's overall performance for the season.

In [8]:
# Take the first row by position. The filtered frame keeps the index labels of
# the source table, so `.loc[0]` only works when the latest game happens to
# carry label 0; `.iloc[0]` matches how the summary cell reads this frame.
games = standings_now["gm"].iloc[0]
wins = standings_now["wins"].iloc[0]
losses = standings_now["losses"].iloc[0]

In [9]:
# Positional access: does not depend on the latest game carrying index label 0.
record = standings_now["record"].iloc[0]

#### 2. Win percentage
> Allows for normalization of success to compare across different seasons or different numbers of games played.

In [10]:
# Convert fractions to whole percentages by rounding rather than truncating:
# int(0.57 * 100) is 56 because 0.57 * 100 evaluates to 56.99999999999999.
win_pct = int(round(float(standings_now["win_pct"].iloc[0]) * 100))

# Average winning percentage at the same game number across the first ten
# matching rows of past seasons.
win_pct_decade_thispoint = int(
    round(
        float(standings_past.query(f"gm == {games}").head(10)["win_pct"].mean()) * 100
    )
)

#### 3. Run differential
> A positive run differential generally correlates with a stronger team performance and is predictive of future success.

In [11]:
# Season totals of the per-game run columns: "r" (scored) and "ra" (allowed).
runs = standings.loc[:, "r"].sum()
runs_against = standings.loc[:, "ra"].sum()

In [12]:
# Run differential: positive when the team has scored more runs than it allowed.
run_diff = runs - runs_against

#### 4. Home runs and home runs per game
> Reflects the team's power-hitting capabilities, significant for scoring strategies.

In [13]:
# Home runs per game for each past season, rounded to two decimals.
past_home_runs = batting_past["hr"].astype(int)
past_games_played = batting_past["g"].astype(int)
batting_past["hr_game"] = past_home_runs.div(past_games_played).round(2)

In [14]:
# Current-season home run total and per-game rate.
home_runs = int(batting_now["hr"].sum())
home_runs_game = (home_runs / games).round(2)

# Last season's per-game rate, for comparison.
home_runs_game_last = batting_past.loc[
    batting_past["season"] == "2023", "hr_game"
].iloc[0]

In [15]:
# Games and home runs summed over the first ten rows of past seasons.
games_decade = batting_past["g"].iloc[:10].astype(int).sum()
home_runs_decade = batting_past["hr"].iloc[:10].astype(int).sum()

In [16]:
# Pooled home runs per game over those ten past seasons (total HR / total games),
# rounded to two decimals.
home_runs_game_decade = (home_runs_decade / games_decade).round(2)

#### 5. Earned run average (ERA)
> A key measure of pitching staff effectiveness, with a lower ERA indicating better performance.

#### 6. Batting average
> Summarizes players' strength in getting on base — and hopefully scoring runs.

In [17]:
# Team batting average from the first row of the current-season batting table.
batting_average = batting_now["ba"].iloc[0]

In [18]:
# Mean batting average over the first ten rows of past seasons, formatted with
# three decimals and no leading zero (e.g. ".250"). A fixed-width format keeps
# trailing zeros, which str() on the rounded float dropped (".25").
batting_average_decade = format(
    batting_past.head(10)["ba"].astype(float).mean(), ".3f"
).lstrip("0")

#### 7. Stolen bases
> Stolen bases can significantly impact game dynamics and indicate the team's strategic play.

In [19]:
# Current-season stolen bases (first row of the current-season table) and per-game rate.
stolen_bases = int(batting_now["sb"].iat[0])
stolen_bases_game = (stolen_bases / games).round(2)

In [20]:
# Pooled stolen bases per game over the first ten rows of past seasons.
stolen_decade = batting_past["sb"].iloc[:10].astype(int).sum()
games_decade = batting_past["g"].iloc[:10].astype(int).sum()
stolen_bases_decade_game = (stolen_decade / games_decade).round(2)

#### 8. Fielding percentage
> Indicates the team's defensive capabilities, with a higher percentage reflecting better performance.

#### 9. Recent trend (last 10 games)
> Provides insight into the team's current form and momentum, which is essential for assessing changes in performance.

In [21]:
# Results from the first ten rows of the current-season standings,
# tallied into wins and losses.
last_10 = standings["result"].head(10)
win_count_trend = last_10.loc[last_10.eq("W")].count()
loss_count_trend = last_10.loc[last_10.eq("L")].count()

In [22]:
# One-line text version of the last-10 tally.
win_loss_trend = f"Recent trend: {win_count_trend} wins, {loss_count_trend} losses"

#### 10. Summary
> Creates a single file of topline statistics, including a narrative summary of the current standings.

In [23]:
# Narrative summary of the season so far and the most recent game.
# A win is "over" an opponent while a loss is "to" one; the sentence previously
# always used "to", which read wrong after a win ("a ... win to the ...").
last_result = standings_now["result_clean"].iloc[0]
result_preposition = "over" if last_result == "win" else "to"
summary = (
    f"The Dodgers have played {games} games this season compiling a {record} record — "
    f"a winning percentage of {win_pct}%. "
    f"The team's last game was a {standings_now['r'].iloc[0]}-{standings_now['ra'].iloc[0]} "
    f"{standings_now['home_away'].iloc[0]} {last_result} {result_preposition} the "
    f"{standings_now['opp'].iloc[0]} in front of "
    f"{'{:,}'.format(standings_now['attendance'].iloc[0])} fans. "
    f"The team has won {win_count_trend} of its last 10 games."
)

In [24]:
# (stat, value, category) triples, in the order they appear in the output file.
summary_rows = [
    ("wins", wins, "standings"),
    ("losses", losses, "standings"),
    ("record", record, "standings"),
    ("win_pct", f"{win_pct}%", "standings"),
    ("win_pct_decade_thispoint", f"{win_pct_decade_thispoint}%", "standings"),
    ("runs", runs, "standings"),
    ("runs_against", runs_against, "standings"),
    ("run_differential", run_diff, "standings"),
    ("home_runs", home_runs, "batting"),
    ("home_runs_game", home_runs_game, "batting"),
    ("home_runs_game_last", home_runs_game_last, "batting"),
    ("home_runs_game_decade", home_runs_game_decade, "batting"),
    ("stolen_bases", stolen_bases, "batting"),
    ("stolen_bases_game", stolen_bases_game, "batting"),
    ("stolen_bases_decade_game", stolen_bases_decade_game, "batting"),
    ("batting_average", batting_average, "batting"),
    ("batting_average_decade", batting_average_decade, "batting"),
    ("summary", summary, "standings"),
]

# Expand each triple into the record layout used for the summary table.
summary_data = [
    {"stat": stat_name, "value": stat_value, "category": stat_category}
    for stat_name, stat_value, stat_category in summary_rows
]

In [25]:
# One row per statistic, with columns "stat", "value" and "category".
summary_df = pd.DataFrame(summary_data)

In [26]:
# Display the assembled summary table for a visual check before exporting.
summary_df

Unnamed: 0,stat,value,category
0,wins,17,standings
1,losses,11,standings
2,record,17-11,standings
3,win_pct,61%,standings
4,win_pct_decade_thispoint,57%,standings
5,runs,153,standings
6,runs_against,115,standings
7,run_differential,38,standings
8,home_runs,34,batting
9,home_runs_game,1.21,batting


In [27]:
# Write the summary locally in both formats. The JSON export previously reused
# the ".csv" path, overwriting the CSV file written on the line above it.
summary_df.to_csv("../data/standings/season_summary_latest.csv", index=False)
summary_df.to_json(
    "../data/standings/season_summary_latest.json", indent=4, orient="records"
)

#### S3

In [28]:
def save_to_s3(df, base_path, s3_bucket, formats=("csv", "json")):
    """
    Save a Pandas DataFrame in the specified formats and upload each one to an S3 bucket.

    Every requested format is validated before any AWS session is created, so an
    unsupported format fails fast instead of uploading an empty object.

    :param df: DataFrame to save.
    :param base_path: Base object key without format extension; ".<format>" is appended.
    :param s3_bucket: S3 bucket name.
    :param formats: Iterable of formats to save -- 'csv', 'json'.
    :raises ValueError: If any requested format is not 'csv' or 'json'.
    """
    content_types = {"csv": "text/csv", "json": "application/json"}

    # Reject unknown formats up front. Previously an unknown format left
    # `content_type` unbound (or stale from the prior iteration) and an empty
    # buffer was sent to S3.
    formats = list(formats)
    unsupported = [fmt for fmt in formats if fmt not in content_types]
    if unsupported:
        raise ValueError(
            f"Unsupported format(s) {unsupported}; expected one of {sorted(content_types)}"
        )

    # Create session using environment variables directly.
    # NOTE(review): both explicit keys and a named profile are passed here --
    # confirm which credential source is intended to take effect.
    session = boto3.Session(
        aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
        aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
        region_name="us-west-1",
        profile_name="haekeo",
    )
    s3_resource = session.resource("s3")

    for fmt in formats:
        file_path = f"{base_path}.{fmt}"
        buffer = BytesIO()
        if fmt == "csv":
            df.to_csv(buffer, index=False)
        else:
            # JSON is written as newline-delimited records (lines=True).
            df.to_json(buffer, orient="records", lines=True)

        # Rewind so the upload reads the buffer from the start.
        buffer.seek(0)
        s3_resource.Bucket(s3_bucket).put_object(
            Key=file_path, Body=buffer, ContentType=content_types[fmt]
        )
        print(f"Uploaded {fmt} to {s3_bucket}/{file_path}")


# Upload the summary table to S3 in the default formats (CSV and JSON).
save_to_s3(
    df=summary_df,
    base_path="dodgers/data/standings/season_summary_latest",
    s3_bucket="stilesdata.com",
)

Uploaded csv to stilesdata.com/dodgers/data/standings/season_summary_latest.csv
Uploaded json to stilesdata.com/dodgers/data/standings/season_summary_latest.json


In [31]:
# Export this notebook as a plain Python script (cell prompts stripped) into ../scripts.
# NOTE(review): the output script is numbered 06 while the notebook is numbered 07 -- confirm this is intentional.
!jupyter nbconvert --to script --no-prompt --output ../scripts/06-create-toplines-summary 07-create-toplines-summary.ipynb

[NbConvertApp] Converting notebook 07-create-toplines-summary.ipynb to script
[NbConvertApp] Writing 8094 bytes to ../scripts/06-create-toplines-summary.py
