# Dodgers game by game results: 1960-2022, via [Baseball Reference](https://www.baseball-reference.com/teams/LAD/2022-schedule-scores.shtml)

In [1]:
%load_ext lab_black

In [2]:
import time

import altair as alt
import altair_stiles as altstiles
import geopandas as gpd
import numpy as np
import pandas as pd

In [3]:
# Register the custom "stiles" Altair theme, then make it the active theme.
theme_name = "stiles"
alt.themes.register(theme_name, altstiles.theme)
alt.themes.enable(theme_name)

ThemeRegistry.enable('grid')

In [4]:
# Let pandas display up to 1000 columns and rows before truncating output.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)
# Lift Altair's row cap on chart data.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

---

## Read data 

#### Loop through year URLs, read standings table, place into a list of dataframes and append a column with the season

In [5]:
tables = []

# Seconds to wait between page requests.
# NOTE(review): Sports Reference publishes a request-rate limit for automated
# traffic and temporarily blocks clients that exceed it (believed to be about
# 20 requests per minute) — confirm the current limit before lowering this.
REQUEST_DELAY_SECONDS = 4

for year in range(1960, 2023):
    # Keep the first table parsed from the season's schedule page and tag every
    # row with its season so the years can be told apart once combined.
    season = (
        pd.read_html(
            f"https://www.baseball-reference.com/teams/LAD/{year}-schedule-scores.shtml"
        )[0]
    ).assign(season=year)
    tables.append(season)
    # Pause so 63 back-to-back requests don't hammer the site.
    time.sleep(REQUEST_DELAY_SECONDS)

#### Make a big ol' dataframe from all the seasons

In [6]:
# Stack the per-season tables into one frame. Every parsed table comes with its
# own default row labels, so renumber the rows to keep the combined index unique.
table = pd.concat(tables, ignore_index=True)

#### The column names and values are messy and/or use codes, so it takes several steps to clean up the dataframe.

In [7]:
# Drop the rows whose "Gm#" cell just repeats the "Gm#" header text.
is_data_row = table["Gm#"].ne("Gm#")
src = table.loc[is_data_row].copy()

# Drop the rows whose "Inn" cell holds the "Game Preview, and Matchups" text.
is_not_preview = src["Inn"].ne("Game Preview, and Matchups")
src_df = src.loc[is_not_preview].copy()

In [8]:
# Normalise the scraped headers: lowercase everything, then apply each
# substitution in order (each one works on the result of the previous one).
clean_columns = src_df.columns.str.lower()
for old_text, new_text in [
    ("w/l", "win_loss"),
    ("w-l", "record"),
    ("d/n", "day_night"),
    ("#", ""),
    ("unnamed: 4", "home_away"),
]:
    clean_columns = clean_columns.str.replace(old_text, new_text)
src_df.columns = clean_columns

#### Dates

In [9]:
# Break the raw date text on ", " into a "day" column and a "nudate" column.
date_parts = src_df["date"].str.split(", ", expand=True)
src_df[["day", "nudate"]] = date_parts

In [10]:
# Peel a trailing " (n)" marker off the date text into its own column. The
# pattern is treated as a regex, so a raw string is used to escape the literal
# "(" instead of relying on the invalid "\(" escape in a normal string literal.
src_df[["nudate", "gm_of_dh"]] = src_df["nudate"].str.split(r" \(", expand=True)

In [11]:
# Strip the closing ")" left behind after splitting on " (". With regex=False
# the pattern is matched literally, so it must be the bare ")" character — an
# escaped "\)" would look for a backslash that is never there.
src_df["gm_of_dh"] = src_df["gm_of_dh"].str.replace(")", "", regex=False)

In [12]:
# Store the season as text so it can be concatenated onto the date string.
src_df["season"] = src_df["season"].astype("str")

In [13]:
# Join the "nudate" text with the season year, then parse the result as a date.
date_text = src_df["nudate"] + ", " + src_df["season"]
src_df["date"] = pd.to_datetime(date_text)

#### Clean up the column names

In [14]:
# Swap the terse scraped headers for readable names.
# NOTE(review): the "d_n" entry looks redundant, since the header cleanup
# already rewrites "d/n" to "day_night" — confirm before removing it.
readable_names = {
    "gm": "game",
    "tm": "team",
    "opp": "opponent",
    "r": "runs",
    "ra": "runs_against",
    "inn": "ex_inn",
    "d_n": "day_night",
}
src_df = src_df.rename(columns=readable_names)

#### How many games back?

In [15]:
# Rewrite the "gb" text step by step: "up " -> "up", "Tied" -> "0", and then
# "up" -> "+", so a lead ends up written with a leading "+".
games_back_text = src_df["gb"]
for old_text, new_text in [("up ", "up"), ("Tied", "0"), ("up", "+")]:
    games_back_text = games_back_text.str.replace(old_text, new_text)
src_df["games_back"] = games_back_text

In [16]:
def games_back_cat(row):
    """Classify one cleaned games-back string as "up", "tied" or "down".

    Args:
        row: A single games-back value (a string, despite the parameter name).

    Returns:
        "tied" when the value is exactly "0", "up" when it contains a "+",
        and "down" for anything else.
    """
    if row == "0":
        return "tied"
    return "up" if "+" in row else "down"

In [17]:
# Label every game as "up", "tied" or "down" from its games-back text.
src_df["gb_status"] = src_df["games_back"].map(games_back_cat)

In [18]:
# Convert the cleaned games-back text to floating-point numbers.
src_df["games_back"] = src_df["games_back"].astype("float64")

In [19]:
# Flip the sign of games_back on the rows labelled "down".
is_behind = src_df["gb_status"].eq("down")
src_df.loc[is_behind, "games_back"] = -src_df.loc[is_behind, "games_back"]

#### Home vs. Away?

In [20]:
def home_away_clean(row):
    """Translate the scraped home/away marker into a readable label.

    Args:
        row: A single marker value (a scalar, despite the parameter name).

    Returns:
        "Away" when the marker is exactly "@", otherwise "Home".
    """
    return "Away" if row == "@" else "Home"

In [21]:
# Replace the raw "@"/blank markers with "Away"/"Home" labels.
src_df["home_away"] = src_df["home_away"].map(home_away_clean)

#### Win vs. Loss?

In [22]:
# Keep everything after the first character of win_loss as a separate note.
src_df["win_loss_note"] = src_df["win_loss"].str.slice(start=1)

In [23]:
# Reduce win_loss to just its first character.
src_df["win_loss"] = src_df["win_loss"].str.slice(stop=1)

#### Calculate the streak column

In [24]:
# The streak size is the number of characters in the streak text.
streak_length = src_df["streak"].str.len()
src_df["streak_num"] = streak_length

In [25]:
# Streak text containing "-" gets a negative streak number.
has_minus = src_df["streak"].str.contains("-")
src_df.loc[has_minus, "streak_num"] = -src_df.loc[has_minus, "streak_num"]

#### Drop columns we don't need

In [26]:
# Columns carried into the final dataset, in output order; everything else on
# src_df is left behind.
keep_columns = [
    "season",
    "game",
    "date",
    "home_away",
    "opponent",
    "win_loss",
    "runs",
    "runs_against",
    "ex_inn",
    "record",
    "rank",
    "win",
    "loss",
    "save",
    "time",
    "day_night",
    "attendance",
    "day",
    "gm_of_dh",
    "games_back",
    "win_loss_note",
    "streak_num",
]
df = src_df.loc[:, keep_columns].copy()

---

## Export

In [27]:
# Write the cleaned game log to disk without the row index. The path has no
# placeholders, so a plain string literal is used rather than an f-string.
df.to_csv("data/processed/dodgers_game_by_game_1960_2022.csv", index=False)