# Download historic MLB schedules

#### Load python tools

In [1]:
%load_ext lab_black

In [2]:
import requests
import pandas as pd
import altair as alt
import altair_stiles as altstiles
import zipfile
import glob
import os
import numpy as np

In [3]:
alt.themes.register("stiles", altstiles.theme)
alt.themes.enable("stiles")

ThemeRegistry.enable('grid')

In [4]:
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000

---

## Get data from Retrosheet [schedules](https://www.retrosheet.org/schedule/)

In [59]:
src = pd.read_csv("/Users/stiles/github/retrosheet/info.csv", dtype={"value": str})

#### Which team?

In [60]:
src["team"] = src["game_id"].str[:3]

#### Parse the dates

In [61]:
src["datestring"] = src["game_id"].str[-9:]

In [62]:
src["day"] = src["game_id"].str[9:-1].astype(str)

In [63]:
src["month"] = src["game_id"].str[7:-3]

In [64]:
src["year"] = src["game_id"].str[3:-5]

In [65]:
src["game_no"] = src["game_id"].str[-1:]

In [66]:
src["date"] = pd.to_datetime(src["year"] + src["month"] + src["day"])

In [67]:
src["decade"] = src["year"].str[:3] + str("0s")

In [68]:
df = src[src["year"] > "1980"].copy()

---

#### Just the stuff we need

In [69]:
variables = ["timeofgame", "daynight", "temp", "precip", "sky"]

In [70]:
weather = df[df["var"].isin(variables)]

In [71]:
weather.head()

Unnamed: 0,game_id,var,value,team,datestring,day,month,year,game_no,date,decade
2644304,BAL198104100,daynight,day,BAL,198104100,10,4,1981,0,1981-04-10,1980s
2644316,BAL198104100,temp,65,BAL,198104100,10,4,1981,0,1981-04-10,1980s
2644320,BAL198104100,precip,unknown,BAL,198104100,10,4,1981,0,1981-04-10,1980s
2644321,BAL198104100,sky,sunny,BAL,198104100,10,4,1981,0,1981-04-10,1980s
2644322,BAL198104100,timeofgame,166,BAL,198104100,10,4,1981,0,1981-04-10,1980s


In [72]:
day_games = weather[(weather["value"] == "day")]["game_id"].to_list()

In [73]:
temp = weather[(weather["game_id"].isin(day_games)) & (weather["var"] == "temp")].copy()

In [74]:
temp["value"] = temp["value"].fillna(np.nan).astype(int)

In [75]:
temp = temp[temp["value"] > 0].copy()

In [78]:
temp[temp['team'] = 'TEX']

SyntaxError: invalid syntax (2672952314.py, line 1)

ERROR:root:Cannot parse: 1:18: temp[temp['team'] = 'TEX']
Traceback (most recent call last):
  File "/Users/stiles/.local/share/virtualenvs/grid-notebooks-Amv9DX-2/lib/python3.8/site-packages/lab_black.py", line 218, in format_cell
    formatted_code = _format_code(cell)
  File "/Users/stiles/.local/share/virtualenvs/grid-notebooks-Amv9DX-2/lib/python3.8/site-packages/lab_black.py", line 29, in _format_code
    return format_str(src_contents=code, mode=FileMode())
  File "src/black/__init__.py", line 1154, in format_str
  File "src/black/__init__.py", line 1164, in _format_str_once
  File "src/black/parsing.py", line 128, in lib2to3_parse
black.parsing.InvalidInput: Cannot parse: 1:18: temp[temp['team'] = 'TEX']


---

In [26]:
temp["temp_category"] = pd.cut(
    temp.value,
    bins=[15, 32, 45, 55, 65, 75, 85, 95, 120],
    labels=[
        "freezing",
        "very cold",
        "cold",
        "cool",
        "comfortable",
        "warm",
        "hot",
        "sweltering",
    ],
)

In [45]:
tex = temp[(temp["team"] == "TEX")]

In [46]:
temp_categories = (
    tex.groupby(["decade", "temp_category"])
    .agg({"game_id": "count"})
    .reset_index()
    .rename(columns={"game_id": "count"})
)

In [47]:
temp_categories_pivot = temp_categories.pivot(
    index="decade", columns="temp_category"
).reset_index()

In [48]:
temp_categories_pivot.columns = [
    "_".join(col).strip() for col in temp_categories_pivot.columns.values
]

In [49]:
temp_categories_pivot

Unnamed: 0,decade_,count_freezing,count_very cold,count_cold,count_cool,count_comfortable,count_warm,count_hot,count_sweltering
0,1980s,0,0,1,0,4,1,4,3
1,1990s,0,0,1,4,17,31,26,2
2,2000s,0,0,3,5,32,60,47,3
3,2010s,0,1,2,10,28,44,82,14
4,2020s,0,0,0,1,36,0,0,0


In [50]:
cols = [
    "count_freezing",
    "count_very cold",
    "count_cold",
    "count_cool",
    "count_comfortable",
    "count_warm",
    "count_hot",
    "count_sweltering",
]

In [51]:
temp_categories_pivot["total"] = temp_categories_pivot[cols].sum(axis=1)

In [52]:
temp_categories_pivot

Unnamed: 0,decade_,count_freezing,count_very cold,count_cold,count_cool,count_comfortable,count_warm,count_hot,count_sweltering,total
0,1980s,0,0,1,0,4,1,4,3,13
1,1990s,0,0,1,4,17,31,26,2,81
2,2000s,0,0,3,5,32,60,47,3,150
3,2010s,0,1,2,10,28,44,82,14,181
4,2020s,0,0,0,1,36,0,0,0,37


In [53]:
temp_categories_pivot["share_sweltering"] = (
    (temp_categories_pivot["count_sweltering"] / temp_categories_pivot["total"]) * 100
).round(1)

In [54]:
temp_categories_pivot["share_hot"] = (
    (temp_categories_pivot["count_hot"] / temp_categories_pivot["total"]) * 100
).round(1)

In [55]:
temp_categories_pivot["share_warm"] = (
    (temp_categories_pivot["count_warm"] / temp_categories_pivot["total"]) * 100
).round(1)

In [56]:
temp_categories_pivot

Unnamed: 0,decade_,count_freezing,count_very cold,count_cold,count_cool,count_comfortable,count_warm,count_hot,count_sweltering,total,share_sweltering,share_hot,share_warm
0,1980s,0,0,1,0,4,1,4,3,13,23.1,30.8,7.7
1,1990s,0,0,1,4,17,31,26,2,81,2.5,32.1,38.3
2,2000s,0,0,3,5,32,60,47,3,150,2.0,31.3,40.0
3,2010s,0,1,2,10,28,44,82,14,181,7.7,45.3,24.3
4,2020s,0,0,0,1,36,0,0,0,37,0.0,0.0,0.0


---