# LA Dodgers Standings, 1958-present
> This notebook downloads the team's current standings table from [Baseball Reference](https://www.baseball-reference.com/teams/LAD/2024-schedule-scores.shtml) and combines it with historic records for later analysis and visualization.

---

#### Import Python tools and Jupyter config

In [24]:
import os
import numpy as np
import pandas as pd
import jupyter_black
from time import sleep
from tqdm.notebook import tqdm

In [25]:
jupyter_black.load()
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100
pd.options.display.max_colwidth = None

In [26]:
profile_name = os.environ.get("AWS_PERSONAL_PROFILE")

In [27]:
today = pd.Timestamp("today").strftime("%Y-%m-%d")

---

## Fetch

#### Import historic game-by-game results, 1958-2023

In [28]:
historic_df = pd.read_parquet(
    "../data/standings/archive/dodgers_standings_1958_2023.parquet"
)

#### Define some variables we need for the request

In [29]:
year = 2024
url = f"https://www.baseball-reference.com/teams/LAD/{year}-schedule-scores.shtml"

#### Get the current year's table

In [30]:
src = (
    pd.read_html(url)[0]
    .query("Tm !='Tm' and Inn != 'Game Preview, and Matchups'")
    .drop(["Unnamed: 2", "Streak", "Orig. Scheduled"], axis=1)
    .rename(columns={"Unnamed: 4": "home_away"})
    .assign(season=year)
)

---

## Process

#### Clean columns

In [31]:
src.columns = src.columns.str.lower().str.replace("/", "_").str.replace("-", "-")

In [32]:
src.columns = [
    "gm",
    "date",
    "tm",
    "home_away",
    "opp",
    "result",
    "r",
    "ra",
    "inn",
    "record",
    "rank",
    "gb",
    "win",
    "loss",
    "save",
    "time",
    "day_night",
    "attendance",
    "cli",
    "year",
]

#### Convert date types where needed

In [33]:
src["gm"] = src["gm"].astype(int)
src["year"] = src["year"].astype(str)

#### Split, format date

In [34]:
src[["weekday", "date"]] = src["date"].str.split(", ", expand=True)

In [35]:
src["date"] = src["date"].str.replace(" (1)", "").str.replace(" (2)", "")

In [36]:
src["game_date"] = pd.to_datetime(src["date"] + ", " + src["year"], format="%b %d, %Y")

#### Clean home-away column

In [37]:
src.loc[src.home_away == "@", "home_away"] = "away"
src.loc[src.home_away.isna(), "home_away"] = "home"

#### Games back figures as a number

In [38]:
src["gb"] = (
    src["gb"].str.replace("up ", "up").str.replace("up", "+").str.replace("Tied", "0")
)

In [39]:
src["gb"] = (
    src["gb"]
    .apply(
        lambda x: float(x) if x.startswith("+") else -float(x) if float(x) != 0 else 0
    )
    .astype(float)
)

#### Just the columns we need

In [40]:
src_df = src[
    [
        "gm",
        "game_date",
        "home_away",
        "opp",
        "result",
        "r",
        "ra",
        "record",
        "rank",
        "gb",
        "time",
        "day_night",
        "attendance",
        "year",
    ]
].copy()

----

## Concatenate

#### Historic and current dataframes combined into one

In [41]:
df = pd.concat([src_df, historic_df]).sort_values("game_date", ascending=False)

In [42]:
df["r"] = df["r"].fillna(np.nan).astype(float)
df["ra"] = df["ra"].fillna(np.nan).astype(float)
df["attendance"] = df["attendance"].fillna(np.nan).astype(float)

In [43]:
df.head()

Unnamed: 0,gm,game_date,home_away,opp,result,r,ra,record,rank,gb,time,day_night,attendance,year,time_minutes
17,17,2024-04-13,home,SDP,W,5.0,2.0,11-6,1,3.0,2:36,N,44582.0,2024,
16,16,2024-04-12,home,SDP,L,7.0,8.0,10-6,1,2.0,3:14,N,49606.0,2024,
15,15,2024-04-10,away,MIN,L,2.0,3.0,10-5,1,3.0,2:31,D,18640.0,2024,
14,14,2024-04-09,away,MIN,W,6.0,3.0,10-4,1,4.0,2:23,N,17024.0,2024,
13,13,2024-04-08,away,MIN,W,4.0,2.0,9-4,1,3.0,2:15,N,15177.0,2024,


In [44]:
df.tail()

Unnamed: 0,gm,game_date,home_away,opp,result,r,ra,record,rank,gb,time,day_night,attendance,year,time_minutes
4,5,1958-04-19,home,SFG,L,4.0,11.0,2-3,5,-2.5,2:37:00,D,41303.0,1958,157.0
3,4,1958-04-18,home,SFG,W,6.0,5.0,2-2,3,-1.5,3:00:00,D,78672.0,1958,180.0
2,3,1958-04-17,away,SFG,L,4.0,7.0,1-2,6,-1.5,2:50:00,D,12520.0,1958,170.0
1,2,1958-04-16,away,SFG,W,13.0,1.0,1-1,4,-0.5,3:03:00,N,22735.0,1958,183.0
0,1,1958-04-15,away,SFG,L,0.0,8.0,0-1,5,-1.0,2:29:00,D,23448.0,1958,149.0


---

## Exports

#### CSV format

In [45]:
# df.to_csv("data/processed/dodgers_standings_1958_present.csv", index=False)

#### JSON

In [46]:
# df.to_json(
#     "data/processed/dodgers_standings_1958_present.json", indent=4, orient="records"
# )

#### Parquet

In [47]:
# df.to_parquet("data/processed/dodgers_standings_1958_present.parquet", index=False)

#### S3

In [48]:
# !aws s3 cp data/processed/dodgers_standings_1958_present.json s3://stilesdata.com/dodgers/dodgers_standings_1958_present.json --profile {profile_name}