# LA Dodgers toplines
> This notebook extracts key statistics from the project's processed tables for display in a dashboard.

---

In [53]:
import os
import pandas as pd
import boto3
from io import BytesIO
from datetime import datetime, timezone, timedelta

In [None]:
# Raise pandas' display limits so wide or long tables are shown in full in output cells.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

In [54]:
def get_pacific_time(now=None):
    """Return a "Last updated at ..." stamp expressed in US Pacific time.

    Daylight saving time is decided by the US rule in force since 2007
    (second Sunday of March, 2 a.m. local, to first Sunday of November,
    2 a.m. local) rather than by calendar month, so early-March and
    early-November timestamps get the correct offset.

    Args:
        now: Optional datetime to format instead of the current moment
            (mainly for testing). A naive value is interpreted as UTC.

    Returns:
        str: e.g. "Last updated at 9:05 AM PT, May 14, 2024".
    """
    if now is None:
        utc_time = datetime.now(timezone.utc)
    elif now.tzinfo is None:
        utc_time = now.replace(tzinfo=timezone.utc)
    else:
        utc_time = now.astimezone(timezone.utc)

    def nth_sunday(month, nth, utc_hour):
        # The nth Sunday of `month` in the timestamp's year, at `utc_hour` UTC.
        first = datetime(utc_time.year, month, 1, utc_hour, tzinfo=timezone.utc)
        days_to_sunday = (6 - first.weekday()) % 7  # Monday == 0 ... Sunday == 6
        return first + timedelta(days=days_to_sunday + 7 * (nth - 1))

    dst_start = nth_sunday(3, 2, 10)  # 2 a.m. PST (UTC-8) is 10:00 UTC
    dst_end = nth_sunday(11, 1, 9)    # 2 a.m. PDT (UTC-7) is 09:00 UTC
    offset_hours = -7 if dst_start <= utc_time < dst_end else -8
    pacific_time = utc_time.astimezone(timezone(timedelta(hours=offset_hours)))

    # "%-I" / "%-d" are glibc-only strftime extensions; drop the zero padding by hand
    # so the same text is produced on every platform.
    clock = pacific_time.strftime("%I:%M %p").lstrip("0")
    month_name = pacific_time.strftime("%B")
    return f"Last updated at {clock} PT, {month_name} {pacific_time.day}, {pacific_time.year}"

In [None]:
# Capture the formatted "Last updated at ..." stamp once; it is reused
# as the "last_updated" row when the summary table is assembled.
update_time = get_pacific_time()

In [32]:
# Three-letter team codes -> full club names (used to expand the `opp` column of the standings).
mlb_teams = dict(
    ARI="Arizona Diamondbacks",
    ATL="Atlanta Braves",
    BAL="Baltimore Orioles",
    BOS="Boston Red Sox",
    CHC="Chicago Cubs",
    CHW="Chicago White Sox",
    CIN="Cincinnati Reds",
    CLE="Cleveland Guardians",
    COL="Colorado Rockies",
    DET="Detroit Tigers",
    HOU="Houston Astros",
    KCR="Kansas City Royals",
    LAA="Los Angeles Angels",
    LAD="Los Angeles Dodgers",
    MIA="Miami Marlins",
    MIL="Milwaukee Brewers",
    MIN="Minnesota Twins",
    NYM="New York Mets",
    NYY="New York Yankees",
    OAK="Oakland Athletics",
    PHI="Philadelphia Phillies",
    PIT="Pittsburgh Pirates",
    SDP="San Diego Padres",
    SFG="San Francisco Giants",
    SEA="Seattle Mariners",
    STL="St. Louis Cardinals",
    TBR="Tampa Bay Rays",
    TEX="Texas Rangers",
    TOR="Toronto Blue Jays",
    WSN="Washington Nationals",
)

In [3]:
def read_parquet_s3(url, sort_by=None):
    """Load a Parquet file from `url` into a DataFrame.

    When `sort_by` names a column that exists in the file, the rows are
    returned in descending order of that column; otherwise the frame is
    returned as read. (The batting file holds annual totals, so it has no
    game-date column to sort on.)
    """
    frame = pd.read_parquet(url)
    if not sort_by or sort_by not in frame.columns:
        return frame
    return frame.sort_values(sort_by, ascending=False)

In [None]:
# Locations of the processed Parquet tables, grouped by subject.

# Standings (game-by-game)
standings_url = "https://stilesdata.com/dodgers/data/standings/dodgers_standings_1958_present.parquet"

# Batting (totals and league ranks)
batting_url = "https://stilesdata.com/dodgers/data/batting/dodgers_team_batting_1958_present.parquet"
batting_ranks_url = "https://stilesdata.com/dodgers/data/batting/dodgers_team_batting_ranks_1958_present.parquet"

# Pitching (totals and league ranks)
pitching_url = "https://stilesdata.com/dodgers/data/pitching/dodgers_pitching_totals_current.parquet"
pitching_ranks_url = "https://stilesdata.com/dodgers/data/pitching/dodgers_pitching_ranks_current.parquet"

In [93]:
# Count this season's home games directly from the standings file, so this cell
# does not rely on `standings`, which is only created by a later cell (and would
# raise NameError on a fresh kernel). This costs one extra read of the file.
home_games_count = len(
    read_parquet_s3(standings_url).query("year == '2024' and home_away == 'home'")
)

In [94]:
home_games_count

23

In [65]:
# Load the data

# Season labels are compared as strings, matching the `year` / `season` columns.
current_season = "2024"
last_season = str(int(current_season) - 1)

# Standings
# Read the multi-season file once (newest game first) and split it into the
# current season and everything earlier, instead of downloading it twice.
standings_all = read_parquet_s3(standings_url, sort_by='game_date')
# .copy() because columns are added below; assigning onto a query() slice can
# trigger pandas' SettingWithCopyWarning.
standings = standings_all.query(f"year == '{current_season}'").copy()
# Keep only the part of `result` before any "-wo" suffix (presumably a walk-off marker — confirm).
standings['result'] = standings['result'].str.split('-wo', expand=True)[0]
standings['opp_name'] = standings['opp'].map(mlb_teams)
# Any result other than "W"/"L" is left as NaN.
standings['result_clean'] = standings['result'].map({"L": "loss", "W": "win"})
standings_past = standings_all.query(f"year != '{current_season}'")
# Row(s) for the most recent game date of the current season.
standings_now = standings.query("game_date == game_date.max()").copy()
game_number = standings_now['gm'].iloc[0]
# Newest earlier-season row at the same game number.
standings_last = standings_past.query(f"gm == {game_number}").head(1).reset_index(drop=True).copy()
# Last season's games up to and including the same game number.
standings_last_season = standings_past.query(f"gm <= {game_number} and year=='{last_season}'").reset_index(drop=True).copy()
# `standings` is newest-first, so row 0 holds the latest rank / games-back values.
standings_division_rank = standings['rank'].iloc[0]
standings_division_rank_games_back = standings['gb'].iloc[0]
mean_attendance = standings.query('home_away == "home"')['attendance'].mean()
formatted_mean_attendance = f"{mean_attendance:,.0f}"

# Batting
batting = read_parquet_s3(batting_url)
batting_past = batting.query(f"season != '{current_season}'").copy()
batting_now = batting.query(f"season == '{current_season}'").copy()
batting_ranks = read_parquet_s3(batting_ranks_url, sort_by='game_date').query(f"season == '{current_season}'")

# Pitching
pitching = read_parquet_s3(pitching_url)
pitching_ranks = read_parquet_s3(pitching_ranks_url)

In [34]:
def current_season_stats(standings_now, standings_past, pitching, pitching_ranks, standings_last):
    """Collect the headline standings and pitching numbers.

    Every value is read from the first row of the frame it comes from.

    Args:
        standings_now: Frame whose first row is the latest game of the
            current season (columns gm, wins, losses, record, win_pct).
        standings_past: Earlier seasons' game rows (columns gm, win_pct);
            the first ten rows matching the current game number are averaged,
            so the frame is expected to be ordered newest first.
        pitching: Pitching totals (columns era, so, bb, hr).
        pitching_ranks: Ranks for the same pitching columns.
        standings_last: Frame whose first row is the comparison game from a
            previous season (columns wins, losses, record, win_pct).

    Returns:
        tuple: (games, wins, losses, record, win_pct, win_pct_decade_thispoint,
        era, era_rank, strikeouts, strikeouts_rank, walks, walks_rank,
        wins_last, losses_last, record_last, win_pct_last,
        home_runs_allowed, home_runs_allowed_rank). The three win-percentage
        values are whole-number percents (int).

    Raises:
        IndexError: if any input frame is empty.
        ValueError: if no past row matches the current game number (NaN mean).
    """
    def as_percent(fraction):
        # Round instead of truncating: 0.58 * 100 is 57.99999999999999 in binary
        # floating point, so int() alone would report 57%.
        return int(round(float(fraction) * 100))

    games = standings_now["gm"].iloc[0]
    wins = standings_now["wins"].iloc[0]
    wins_last = standings_last["wins"].iloc[0]
    losses = standings_now["losses"].iloc[0]
    losses_last = standings_last["losses"].iloc[0]
    record = standings_now["record"].iloc[0]
    record_last = standings_last["record"].iloc[0]
    win_pct = as_percent(standings_now["win_pct"].iloc[0])
    win_pct_last = as_percent(standings_last["win_pct"].iloc[0])
    # Mean winning percentage at this game number over the first ten matching past rows.
    win_pct_decade_thispoint = as_percent(
        standings_past.query(f"gm == {games}").head(10)["win_pct"].mean()
    )
    era = pitching['era'].iloc[0]
    era_rank = pitching_ranks['era'].iloc[0]
    strikeouts = pitching['so'].iloc[0]
    strikeouts_rank = pitching_ranks['so'].iloc[0]
    walks = pitching['bb'].iloc[0]
    walks_rank = pitching_ranks['bb'].iloc[0]
    home_runs_allowed = pitching['hr'].iloc[0]
    home_runs_allowed_rank = pitching_ranks['hr'].iloc[0]

    return games, wins, losses, record, win_pct, win_pct_decade_thispoint, era, era_rank, strikeouts, strikeouts_rank, walks, walks_rank, wins_last, losses_last, record_last, win_pct_last, home_runs_allowed, home_runs_allowed_rank

In [35]:
def run_differential(standings, batting_ranks, standings_last_season=None):
    """Total runs scored/allowed this season and over last season's comparison games.

    Args:
        standings: Current-season game rows with `r` (runs) and `ra`
            (runs against) columns.
        batting_ranks: Frame whose first row holds the runs rank in `r`.
        standings_last_season: Last season's comparison game rows with the
            same `r` / `ra` columns. When omitted, the notebook-level
            variable of the same name is used, as the function originally did.

    Returns:
        tuple: (runs, runs_last, runs_rank, runs_against, runs_against_last,
        run_diff, run_diff_last).

    Raises:
        NameError: if `standings_last_season` is neither passed nor defined
            at notebook level.
    """
    if standings_last_season is None:
        # Backward compatibility for two-argument callers: fall back to the
        # notebook-level frame instead of silently depending on it.
        try:
            standings_last_season = globals()["standings_last_season"]
        except KeyError:
            raise NameError("name 'standings_last_season' is not defined") from None

    runs = standings["r"].sum()
    runs_last = standings_last_season['r'].sum()
    runs_rank = batting_ranks['r'].iloc[0]
    runs_against = standings["ra"].sum()
    runs_against_last = standings_last_season['ra'].sum()
    run_diff = runs - runs_against
    run_diff_last = runs_last - runs_against_last
    return runs, runs_last, runs_rank, runs_against, runs_against_last, run_diff, run_diff_last

In [36]:
def home_run_stats(batting_now, batting_past, batting_ranks, last_season="2023"):
    """Home-run totals and per-game rates for this season, last season and the past decade.

    Args:
        batting_now: Current-season batting totals (columns g, hr).
        batting_past: Earlier seasons' totals (columns season, g, hr). The
            first ten rows are treated as "the last decade", so the frame is
            assumed to be ordered newest season first — TODO confirm against
            the source file. This frame is not modified.
        batting_ranks: Frame whose first row holds the home-run rank in `hr`.
        last_season: Value of the `season` column identifying last season.

    Returns:
        tuple: (home_runs, home_runs_game, home_runs_game_last,
        home_runs_game_decade, home_runs_rank); rates are rounded to 2 decimals.

    Raises:
        IndexError: if `batting_now` is empty or no row matches `last_season`.
    """
    games = int(batting_now["g"].iloc[0])
    home_runs = int(batting_now["hr"].sum())
    home_runs_rank = batting_ranks['hr'].iloc[0]
    home_runs_game = round(home_runs / games, 2)

    past_home_runs = batting_past["hr"].astype(int)
    past_games = batting_past["g"].astype(int)
    # Round the quotient (not the games column), and keep the result in a local
    # Series rather than writing an "hr_game" column into the caller's frame.
    past_rate = (past_home_runs / past_games).round(2)
    home_runs_game_last = past_rate[batting_past["season"] == last_season].iloc[0]

    games_decade = past_games.head(10).sum()
    home_runs_decade = past_home_runs.head(10).sum()
    home_runs_game_decade = round(home_runs_decade / games_decade, 2)
    return home_runs, home_runs_game, home_runs_game_last, home_runs_game_decade, home_runs_rank

In [None]:
def batting_and_stolen_base_stats(batting_now, batting_past, games, batting_ranks):
    """Batting average and stolen-base numbers for this season with historical context.

    Args:
        batting_now: Current-season batting totals (columns ba, sb).
        batting_past: Earlier seasons' totals (columns ba, sb, g). The first
            row is used as "last season" and the first ten rows as "the last
            decade", so the frame is assumed to be ordered newest season
            first — TODO confirm against the source file.
        games: Number of games played so far this season.
        batting_ranks: Frame whose first row holds the stolen-base rank in `sb`.

    Returns:
        tuple: (batting_average, batting_average_decade, stolen_bases,
        stolen_bases_rank, stolen_bases_game, stolen_bases_last_rate).
        `batting_average_decade` is a string with three decimals and no
        leading zero (e.g. ".250").
    """
    batting_average = batting_now["ba"].iloc[0]
    decade_mean = batting_past.head(10)["ba"].astype(float).mean()
    # Fixed three-decimal formatting keeps trailing zeros (".250", not ".25");
    # only the leading zero is dropped.
    batting_average_decade = f"{decade_mean:.3f}".lstrip("0")
    stolen_bases = int(batting_now["sb"].iloc[0])
    stolen_bases_rank = batting_ranks['sb'].iloc[0]
    stolen_bases_game = round(stolen_bases / games, 2)
    last_season_row = batting_past.head(1)
    stolen_bases_last_rate = round(
        last_season_row["sb"].astype(int).sum() / last_season_row["g"].astype(int).sum(), 2
    )
    return batting_average, batting_average_decade, stolen_bases, stolen_bases_rank, stolen_bases_game, stolen_bases_last_rate

In [None]:
# Get the update date
def get_pacific_date(now=None):
    """Return the calendar date in US Pacific time, e.g. "May 14, 2024".

    Daylight saving time follows the US rule in force since 2007 (second
    Sunday of March to first Sunday of November, switching at 2 a.m. local)
    instead of a month-based guess, so dates around midnight in March and
    November are not shifted by an hour into the wrong day.

    Args:
        now: Optional datetime to format instead of the current moment
            (mainly for testing). A naive value is interpreted as UTC.

    Returns:
        str: "<Month name> <day>, <year>" with no zero padding on the day.
    """
    if now is None:
        utc_time = datetime.now(timezone.utc)
    elif now.tzinfo is None:
        utc_time = now.replace(tzinfo=timezone.utc)
    else:
        utc_time = now.astimezone(timezone.utc)

    def sunday_on_or_after(month, day, utc_hour):
        # First Sunday falling on or after the given day of `month`, at `utc_hour` UTC.
        start = datetime(utc_time.year, month, day, utc_hour, tzinfo=timezone.utc)
        return start + timedelta(days=(6 - start.weekday()) % 7)  # Sunday == 6

    dst_start = sunday_on_or_after(3, 8, 10)  # second Sunday of March, 2 a.m. PST == 10:00 UTC
    dst_end = sunday_on_or_after(11, 1, 9)    # first Sunday of November, 2 a.m. PDT == 09:00 UTC
    in_daylight_time = dst_start <= utc_time < dst_end
    pacific_offset = timedelta(hours=-7) if in_daylight_time else timedelta(hours=-8)
    pacific_time = utc_time.astimezone(timezone(pacific_offset))

    # "%-d" is a glibc-only strftime extension; use the integer day for portability.
    return f"{pacific_time.strftime('%B')} {pacific_time.day}, {pacific_time.year}"

# Store the update date
update_date = get_pacific_date()

In [101]:
update_date

'May 14, 2024'

In [46]:
def generate_summary(standings_now, wins, losses, win_pct, games=None, record=None, win_count_trend=None):
    """Build the HTML sentence summarizing the season and the latest game.

    Args:
        standings_now: Frame whose first row is the latest game (columns r,
            ra, home_away, result_clean, opp_name, attendance).
        wins: Unused; kept so existing calls keep working.
        losses: Unused; kept so existing calls keep working.
        win_pct: Winning percentage as a whole-number percent.
        games: Games played this season.
        record: Win-loss record string, e.g. "28-15".
        win_count_trend: Wins in the last 10 games.

        `games`, `record` and `win_count_trend` default to the notebook-level
        variables of the same names, which is where the function originally
        read them from.

    Returns:
        str: Summary text containing <span class='highlight'> markup.

    Raises:
        NameError: if one of the three optional values is neither passed nor
            defined at notebook level.
    """
    def notebook_value(name, value):
        # Prefer the explicit argument; otherwise fall back to the notebook-level variable.
        if value is not None:
            return value
        try:
            return globals()[name]
        except KeyError:
            raise NameError(f"name '{name}' is not defined") from None

    games = notebook_value("games", games)
    record = notebook_value("record", record)
    win_count_trend = notebook_value("win_count_trend", win_count_trend)

    last_game = standings_now.iloc[0]
    # A team wins "over" an opponent and loses "to" one.
    preposition = "over" if last_game['result_clean'] == "win" else "to"
    # ",.0f" gives thousands separators for int and float attendance alike (no trailing ".0").
    attendance = f"{last_game['attendance']:,.0f}"
    summary = (
        f"The Dodgers have played <span class='highlight'>{games}</span> games this season, compiling a {record} record and "
        f"a winning percentage of <span class='highlight'>{win_pct}</span>%. The team's latest game was a "
        f"{last_game['r']}-{last_game['ra']} {last_game['home_away']} {last_game['result_clean']} "
        f"{preposition} the {last_game['opp_name']} in front of {attendance} fans. "
        f"They've won <span class='highlight'>{win_count_trend} of the last 10 games</span>."
    )
    return summary

In [47]:
def recent_trend(standings):
    """Tally wins and losses over the first ten rows of `standings`.

    The caller is expected to pass the frame ordered newest game first, so
    the first ten rows are the ten most recent games.

    Returns:
        tuple: (win count, loss count, "Recent trend: N wins, M losses").
    """
    recent_results = standings['result'].iloc[:10]
    win_count_trend = (recent_results == "W").sum()
    loss_count_trend = (recent_results == "L").sum()
    trend_text = f"Recent trend: {win_count_trend} wins, {loss_count_trend} losses"
    return win_count_trend, loss_count_trend, trend_text

In [48]:
# Standings and pitching headline numbers.
(
    games, wins, losses, record, win_pct, win_pct_decade_thispoint,
    era, era_rank, strikeouts, strikeouts_rank, walks, walks_rank,
    wins_last, losses_last, record_last, win_pct_last,
    home_runs_allowed, home_runs_allowed_rank,
) = current_season_stats(standings_now, standings_past, pitching, pitching_ranks, standings_last)

# Runs scored / allowed and the resulting differential.
(
    runs, runs_last, runs_rank,
    runs_against, runs_against_last,
    run_diff, run_diff_last,
) = run_differential(standings, batting_ranks)

# Home runs.
(
    home_runs, home_runs_game, home_runs_game_last,
    home_runs_game_decade, home_runs_rank,
) = home_run_stats(batting_now, batting_past, batting_ranks)

# Batting average and stolen bases.
(
    batting_average, batting_average_decade,
    stolen_bases, stolen_bases_rank,
    stolen_bases_game, stolen_bases_last_rate,
) = batting_and_stolen_base_stats(batting_now, batting_past, games, batting_ranks)

# Wins and losses over the ten most recent games.
(
    win_count_trend, loss_count_trend, win_loss_trend,
) = recent_trend(standings.iloc[:10])

In [49]:
# Build the HTML summary sentence. generate_summary only reads the first row of
# the frame it is given, and `standings` is ordered newest game first, so the
# full-season table works here.
summary = generate_summary(standings, wins, losses, win_pct)

In [86]:
# One record per dashboard statistic. Every record carries the same six keys:
# stat_label (display name), stat (identifier), value, category, and a
# context_value / context_value_label pair giving a comparison figure.
# The ERA and Walks records previously listed "context_value_label" twice
# ("Rank", then "League rank"); only the last value took effect, so the dead
# first entry has been removed.
summary_data = [
    {"stat_label": "Wins", "stat": "wins", "value": wins, "category": "standings", "context_value": wins_last, "context_value_label": "This point last season"},
    {"stat_label": "Losses", "stat": "losses", "value": losses, "category": "standings", "context_value": losses_last, "context_value_label": "This point last season"},
    {"stat_label": "Record", "stat": "record", "value": record, "category": "standings", "context_value": record_last, "context_value_label": "This point last season"},
    {"stat_label": "Win percentage", "stat": "win_pct", "value": f"{win_pct}%", "category": "standings", "context_value": f"{win_pct_last}%", "context_value_label": "This point last season"},
    {"stat_label": "Runs", "stat": "runs", "value": runs, "category": "standings", "context_value": runs_rank, "context_value_label": "League rank"},
    {"stat_label": "Runs against", "stat": "runs_against", "value": runs_against, "category": "standings", "context_value": runs_against_last, "context_value_label": "This point last season"},
    {"stat_label": "Run differential", "stat": "run_differential", "value": run_diff, "category": "standings", "context_value": run_diff_last, "context_value_label": "This point last season"},
    {"stat_label": "Home runs", "stat": "home_runs", "value": home_runs, "category": "batting", "context_value": home_runs_rank, "context_value_label": "League rank"},
    {"stat_label": "Home runs/game", "stat": "home_runs_game", "value": home_runs_game, "category": "batting", "context_value": home_runs_game_decade, "context_value_label": "Last decade average"},
    {"stat_label": "Stolen bases", "stat": "stolen_bases", "value": stolen_bases, "category": "batting", "context_value": stolen_bases_rank, "context_value_label": "League rank"},
    {"stat_label": "Stolen bases per game", "stat": "stolen_bases_game", "value": stolen_bases_game, "category": "batting", "context_value": stolen_bases_last_rate, "context_value_label": "Rate all last season"},
    {"stat_label": "Batting average", "stat": "batting_average", "value": batting_average, "category": "batting", "context_value": batting_average_decade, "context_value_label": "Last decade average"},
    {"stat_label": "ERA", "stat": "era", "value": era, "category": "pitching", "context_value": era_rank, "context_value_label": "League rank"},
    {"stat_label": "Strikeouts", "stat": "strikeouts", "value": strikeouts, "category": "pitching", "context_value": strikeouts_rank, "context_value_label": "League rank"},
    {"stat_label": "Walks", "stat": "walks", "value": walks, "category": "pitching", "context_value": walks_rank, "context_value_label": "League rank"},
    {"stat_label": "Home runs allowed", "stat": "home_runs_allowed", "value": home_runs_allowed, "category": "pitching", "context_value": home_runs_allowed_rank, "context_value_label": "League rank"},
    {"stat_label": "Games up/back", "stat": "games_up_back", "value": standings_division_rank_games_back, "category": "standings", "context_value": standings_division_rank, "context_value_label": 'Division rank'},
    {"stat_label": "Attendance", "stat": "mean_attendance", "value": formatted_mean_attendance, "category": "standings", "context_value": '', "context_value_label": 'Home average'},
    {"stat_label": "Last updated", "stat": "last_updated", "value": update_time, "category": "summary", "context_value": "", "context_value_label": ''},
    {"stat_label": "Team summary", "stat": "summary", "value": summary, "category": "summary", "context_value": "", "context_value_label": ''},
]

In [87]:
# Turn the list of records into a table (one row per statistic); the bare
# name on the last line makes the notebook render it.
summary_df = pd.DataFrame(summary_data)
summary_df

Unnamed: 0,stat_label,stat,value,category,context_value,context_value_label
0,Wins,wins,28,standings,27,This point last season
1,Losses,losses,15,standings,16,This point last season
2,Record,record,28-15,standings,27-16,This point last season
3,Win percentage,win_pct,65%,standings,63%,This point last season
4,Runs,runs,226,standings,1,League rank
5,Runs against,runs_against,150,standings,177,This point last season
6,Run differential,run_differential,76,standings,53,This point last season
7,Home runs,home_runs,58,batting,1,League rank
8,Home runs/game,home_runs_game,1.35,batting,1.36,Last decade average
9,Stolen bases,stolen_bases,27,batting,6,League rank


In [88]:
len(summary_df)

20