# MLB game-by-game results: 2015-2022, via [Baseball Reference](https://www.baseball-reference.com/teams/LAD/2022-schedule-scores.shtml)

In [1]:
%load_ext lab_black

In [2]:
import time

import pandas as pd
import geopandas as gpd
import altair as alt
import altair_stiles as altstiles
import numpy as np
import requests
from bs4 import BeautifulSoup

In [3]:
# Register the theme function from altair_stiles under the name "stiles".
alt.themes.register("stiles", altstiles.theme)
# NOTE(review): the theme enabled here is "grid", not the "stiles" name
# registered on the line above — confirm which theme is intended.
alt.themes.enable("grid")

ThemeRegistry.enable('grid')

In [4]:
# Raise pandas' display limits to 1000 columns / 1000 rows and turn off
# Altair's max-rows check.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

---

## Read data 

#### Get a list of teams

In [5]:
# Baseball Reference MLB standings page; fetched and parsed for team links.
url = "https://www.baseball-reference.com/leagues/MLB-standings.shtml"

In [6]:
# Fetch the standings page. The timeout keeps the cell from hanging on a
# stalled connection, and raise_for_status() stops on an HTTP error instead
# of parsing an error page as if it were the standings.
r = requests.get(url, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")
# Every body row of every table on the page.
rows = soup.select("table > tbody > tr")

In [7]:
# Collect a team code from the first link of each table row that has text.
teams = []

for row in rows[1:]:
    cells = row.find_all("td")
    has_text = any(td.text.strip() for td in cells)
    anchor = row.find("a")
    # Skip rows with no cell text, and rows with no link at all: subscripting
    # the missing link (None) would raise TypeError.
    if not has_text or anchor is None:
        continue
    # Presumably hrefs look like "/teams/NYY/2022.shtml"; strip the prefix and
    # suffix so only the team code is left.
    teams.append(anchor["href"].replace("/teams/", "").replace("/2022.shtml", ""))

In [8]:
# Baseball Reference team codes, one line per division. This assignment
# replaces whatever list the scraping loop produced.
teams = (
    ["NYY", "TOR", "TBR", "BAL", "BOS"]  # AL East
    + ["CLE", "CHW", "MIN", "KCR", "DET"]  # AL Central
    + ["HOU", "SEA", "TEX", "LAA", "OAK"]  # AL West
    + ["NYM", "ATL", "PHI", "MIA", "WSN"]  # NL East
    + ["STL", "MIL", "CHC", "CIN", "PIT"]  # NL Central
    + ["LAD", "SDP", "SFG", "ARI", "COL"]  # NL West
)

In [9]:
len(teams)

30

---

#### Loop through teams and seasons, read each schedule-and-scores table, collect the dataframes in a list and add a column with the season

In [10]:
# Download the first table on every team/season schedule page (30 teams x 8
# seasons = 240 requests) and tag each frame with its season.
#
# The site throttles clients that request pages too quickly, so pause between
# downloads instead of firing every request back to back.
# NOTE(review): the delay assumes a limit of roughly 20 requests per minute —
# confirm against the site's current bot-traffic policy.
REQUEST_DELAY_SECONDS = 3.1

tables = []

for t in teams:
    for year in range(2015, 2023):
        season = (
            pd.read_html(
                f"https://www.baseball-reference.com/teams/{t}/{year}-schedule-scores.shtml"
            )[0]
        ).assign(season=year)
        tables.append(season)
        time.sleep(REQUEST_DELAY_SECONDS)

#### Make a big ol' dataframe from all the seasons

In [11]:
# Stack every team-season table into one frame. The index is not reset, so
# each table's own row labels repeat in the combined frame.
table = pd.concat(tables)

#### The column names and values are messy and/or use codes, so it takes several steps to clean up the dataframe.

In [12]:
# Drop rows whose "Gm#" cell is the literal text "Gm#" (presumably header rows
# repeated inside the table), then drop rows whose "Inn" cell holds the
# game-preview placeholder text.
src = table.loc[table["Gm#"].ne("Gm#")].copy()
src_df = src.loc[src["Inn"].ne("Game Preview, and Matchups")].copy()

In [13]:
# Lower-case the column names, then apply the substitutions in order.
column_fixes = {
    "w/l": "win_loss",
    "w-l": "record",
    "d/n": "day_night",
    "#": "",
    "unnamed: 4": "home_away",
}

cleaned_names = src_df.columns.str.lower()
for old_text, new_text in column_fixes.items():
    cleaned_names = cleaned_names.str.replace(old_text, new_text)

src_df.columns = cleaned_names

#### Dates

In [14]:
# Split the date text on ", ": the first piece goes to "day", the second to "nudate".
date_parts = src_df["date"].str.split(", ", expand=True)
src_df[["day", "nudate"]] = date_parts

In [15]:
# Split a " (..." suffix off the date text into "gm_of_dh" (presumably the
# game number within a doubleheader). The pattern is a regex, so it is written
# as a raw string: "\(" is not a valid Python string escape and triggers a
# warning on current Python versions.
src_df[["nudate", "gm_of_dh"]] = src_df["nudate"].str.split(r" \(", expand=True)

In [16]:
# Remove the closing ")" from "gm_of_dh". With regex=False the pattern is
# matched literally, so it must be the bare ")" character; the previous
# pattern "\)" (backslash + paren) never matched and left the ")" in place.
src_df["gm_of_dh"] = src_df["gm_of_dh"].str.replace(")", "", regex=False)

In [17]:
# Store the season as text (it is later concatenated into a date string).
src_df["season"] = src_df["season"].astype(str)

In [18]:
# Append the season to the "nudate" text and parse the result as a datetime.
date_text = src_df["nudate"] + ", " + src_df["season"]
src_df["date"] = pd.to_datetime(date_text)

#### Clean up the column names

In [19]:
# Spell out the abbreviated column names.
readable_names = {
    "gm": "game",
    "tm": "team",
    "opp": "opponent",
    "r": "runs",
    "ra": "runs_against",
    "inn": "ex_inn",
    "d_n": "day_night",
}
src_df = src_df.rename(columns=readable_names)

#### How many games back?

In [20]:
# Rewrite the "gb" text into a numeric-looking form: "up N" becomes "+N" and
# "Tied" becomes "0".
games_back_text = src_df["gb"].str.replace("up ", "up")
games_back_text = games_back_text.str.replace("Tied", "0")
src_df["games_back"] = games_back_text.str.replace("up", "+")

In [21]:
def games_back_cat(row):
    """Classify one cleaned games-back string.

    Returns "up" when the text contains "+", "tied" when it is exactly "0",
    and "down" for anything else.
    """
    if "+" in row:
        return "up"
    return "tied" if row == "0" else "down"

In [22]:
# Label every row "up", "tied" or "down" from its games-back text.
src_df["gb_status"] = src_df["games_back"].map(games_back_cat)

In [23]:
# Convert the cleaned games-back text to floats.
src_df["games_back"] = src_df["games_back"].astype(float)

In [24]:
# Rows classified as "down" get a negative games_back value.
src_df.loc[src_df["gb_status"] == "down", "games_back"] *= -1

#### Home vs. Away?

In [25]:
def home_away_clean(row):
    """Return "Away" when the value is "@", and "Home" for any other value."""
    return "Away" if row == "@" else "Home"

In [26]:
# Replace the raw marker ("@" or anything else) with "Away" / "Home".
src_df["home_away"] = src_df["home_away"].map(home_away_clean)

#### Win vs. Loss?

In [27]:
# Keep everything after the first character of "win_loss" as a note. This has
# to run before "win_loss" itself is truncated.
src_df["win_loss_note"] = src_df["win_loss"].str.slice(start=1)

In [28]:
# Reduce "win_loss" to its first character.
src_df["win_loss"] = src_df["win_loss"].str.slice(stop=1)

#### Calculate the streak column

In [29]:
# Streak length is the number of characters in the "streak" text (presumably
# a run of "+" or "-" symbols — confirm against the source table).
src_df["streak_num"] = src_df["streak"].str.len()

In [30]:
# Rows whose streak text contains "-" get a negative streak length.
src_df.loc[src_df["streak"].str.contains("-"), "streak_num"] *= -1

In [31]:
src_df.columns

Index(['game', 'date', 'unnamed: 2', 'team', 'home_away', 'opponent',
       'win_loss', 'runs', 'runs_against', 'ex_inn', 'record', 'rank', 'gb',
       'win', 'loss', 'save', 'time', 'day_night', 'attendance', 'cli',
       'streak', 'orig. scheduled', 'season', 'day', 'nudate', 'gm_of_dh',
       'games_back', 'gb_status', 'win_loss_note', 'streak_num'],
      dtype='object')

#### Drop columns we don't need

In [32]:
# Columns carried forward for analysis, in output order.
keep_columns = [
    "season",
    "game",
    "date",
    "home_away",
    "team",
    "opponent",
    "win_loss",
    "runs",
    "runs_against",
    "ex_inn",
    "record",
    "rank",
    "win",
    "loss",
    "save",
    "time",
    "day_night",
    "attendance",
    "day",
    "gm_of_dh",
    "games_back",
    "win_loss_note",
    "streak_num",
]
df = src_df.loc[:, keep_columns].copy()

---

#### Divisions

In [33]:
# Division membership, listed per division and then flattened into a
# team-code -> division lookup.
division_members = {
    "AL East": ("NYY", "TOR", "TBR", "BAL", "BOS"),
    "AL Central": ("CLE", "CHW", "MIN", "KCR", "DET"),
    "AL West": ("HOU", "SEA", "TEX", "LAA", "OAK"),
    "NL East": ("NYM", "ATL", "PHI", "MIA", "WSN"),
    "NL Central": ("STL", "MIL", "CHC", "CIN", "PIT"),
    "NL West": ("LAD", "SDP", "SFG", "ARI", "COL"),
}
divisions = {
    team: division
    for division, members in division_members.items()
    for team in members
}

In [34]:
# Look up each team's division; codes missing from the dict map to NaN.
df["team_division"] = df["team"].map(divisions)

In [36]:
# Split the "W-L" record text on every "-" and store the two integer parts as
# cumulative wins and losses.
record_parts = df["record"].str.split("-", expand=True).astype(int)
df[["cum_wins", "cum_losses"]] = record_parts

In [37]:
# Keep only the 2022 rows ("season" holds text, so compare with a string).
this_year = df.loc[df["season"].eq("2022")].copy()

In [38]:
this_year.head()

Unnamed: 0,season,game,date,home_away,team,opponent,win_loss,runs,runs_against,ex_inn,record,rank,win,loss,save,time,day_night,attendance,day,gm_of_dh,games_back,win_loss_note,streak_num,team_division,cum_wins,cum_losses
0,2022,1,2022-04-08,Home,NYY,BOS,W,6,5,11.0,1-0,1,King,Crawford,,3:56,D,46097,Friday,,0.0,-wo,1,AL East,1,0
1,2022,2,2022-04-09,Home,NYY,BOS,W,4,2,,2-0,1,Luetge,Pivetta,Chapman,2:58,D,46882,Saturday,,0.0,,2,AL East,2,0
2,2022,3,2022-04-10,Home,NYY,BOS,L,3,4,,2-1,2,Crawford,Schmidt,Diekman,3:40,N,40108,Sunday,,-1.0,,-1,AL East,2,1
3,2022,4,2022-04-11,Home,NYY,TOR,L,0,3,,2-2,3,Manoah,Taillon,Romano,3:03,N,26211,Monday,,-1.0,,-2,AL East,2,2
4,2022,5,2022-04-12,Home,NYY,TOR,W,4,0,,3-2,2,Holmes,Kikuchi,,3:07,N,25068,Tuesday,,-1.0,,1,AL East,3,2


#### Charts

In [67]:
# NOTE(review): `colors` is defined here but not referenced by the chart
# below — confirm whether it was meant to feed the color scale.
colors = ["red", "steelblue", "chartreuse", "#F4D03F", "#D35400"]

# One step-line of cumulative wins per team, faceted by division (3 per row),
# with an independent color scale for each facet.
division_facet = alt.Facet(
    "team_division", columns=3, title="", header=alt.Header(labelFontSize=14)
)

wins_by_division = (
    alt.Chart(this_year)
    .mark_line(interpolate="step-after", size=2)
    .encode(
        x="date",
        y=alt.Y("cum_wins", title=" "),
        color=alt.Color("team", title=""),
        facet=division_facet,
    )
    .properties(width=200, height=400)
    .configure_legend(orient="top", symbolType="stroke")
    .resolve_scale(color="independent")
)

wins_by_division

---

## Export

In [40]:
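# NOTE(review): the filename below looks like a leftover from a Dodgers-only,
# 1960-2022 notebook; this frame covers all 30 teams for 2015-2022.
# df.to_csv(f"data/processed/dodgers_game_by_game_1960_2022.csv", index=False)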