# Daily temperature averages in select MLB cities

#### Load python tools

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import altair as alt
import altair_stiles as altstiles
import glob
from pathlib import Path
import numpy as np

In [3]:
alt.themes.register("stiles", altstiles.theme)
alt.themes.enable("grid")

ThemeRegistry.enable('grid')

In [4]:
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

---

## Process weatherspark data

#### Make a list of all our decade-level temperature data files

In [5]:
all_files = []

for path in Path("/Users/stiles/data/climate/mlb/").rglob("*Summary*.csv"):
    all_files.append(str(path.parent) + "/" + str(path.name))

#### Read each file into a list of dataframes

In [6]:
dfs_list = []
for file in all_files:
    dfs = pd.read_csv(file, on_bad_lines="skip")
    dfs_list.append(dfs)

#### One big dataframe with all cities and decades

In [7]:
src = pd.concat(dfs_list)

#### Clean up the messy column headers

In [8]:
src.columns = (
    src.columns.str.lower()
    .str.strip()
    .str.replace("/", "_", regex=False)
    .str.replace(" (°f)", "", regex=False)
    .str.replace(" (%)", "_pct", regex=False)
    .str.replace(" (in)", "_inch", regex=False)
    .str.replace(" (hr)", "_hour", regex=False)
    .str.replace(" (mph)", "_mph", regex=False)
    .str.replace(" (string)", "", regex=False)
    .str.replace(" ", "_", regex=False)
    .str.replace("temperature_sampled_", "", regex=False)
    .str.strip()
)

#### Place names

In [9]:
src.rename(columns={"time_zone_id": "place"}, inplace=True)

In [10]:
src[["continent", "place"]] = src["place"].str.split("/", expand=True)

#### Call signs for weather stations

In [11]:
call_signs = {
    "KEWR": "New York",
    "KATL": "Atlanta",
    "KMDW": "Chicago",
    "KSTL": "St. Louis",
    "KBOS": "Boston",
    "KDFW": "Arlington",
    "KHOU": "Houston",
    "KDCA": "Washington, DC",
    "KIAD": "Washington, DC",
    "KMSP": "Minneapolis",
    "KMKE": "Milwaukee",
    "KPHX": "Phoenix",
    "KCLE": "Cleveland",
    "KOAK": "Oakland",
    "KMIA": "Miami",
    "KLAX": "Los Angeles",
    "CYYZ": "Toronto",
    "KPIE": "St. Petersburg",
    "KTPA": "St. Petersburg",
    "KSEA": "Seattle",
    "KSFO": "San Francisco",
    "KSAN": "San Diego",
    "KPIT": "Pittsburgh",
    "KPHL": "Philadelphia",
    "KMCI": "Kansas City",
    "KMKC": "Kansas City",
    "KDET": "Detroit",
    "KBWI": "Baltimore",
    "KCVG": "Cincinnati",
    "KDEN": "Denver",
    "KDNR": "Denver",
}

In [12]:
src["city"] = src["call_sign"].map(call_signs)

#### Make a simple dataframe with just the temp columns we need

In [13]:
src_slim = src[["date", "city", "low", "high", "precipitation_inch"]].copy()

#### Process dates

In [14]:
src_slim["date"] = pd.to_datetime(src_slim["date"]).dt.strftime("%Y-%m-%d")
src_slim["year"] = pd.to_datetime(src_slim["date"]).dt.strftime("%Y")
src_slim["month"] = pd.to_datetime(src_slim["date"]).dt.strftime("%m")
src_slim["day"] = pd.to_datetime(src_slim["date"]).dt.strftime("%d")

In [15]:
offseason = ["01", "02", "03", "11", "12"]

In [16]:
src_slim_season = src_slim[~src_slim["month"].isin(offseason)].copy()

#### Decade categories

In [17]:
src_slim_season["decade"] = src_slim_season["year"].str[:3] + "0s"

#### Get the mean temp

In [18]:
src_slim_season["mean"] = (src_slim_season["high"] + src_slim_season["low"]) / 2

#### Make a copy for analysis

In [19]:
df = (
    src_slim_season[
        (src_slim_season["decade"] != "2020s") & (src_slim_season["year"] >= "1960")
    ]
    .sort_values("date", ascending=False)
    .copy()
)

In [20]:
df = df.fillna(0)

In [21]:
df["high_category"] = pd.cut(
    df["high"],
    bins=[15, 55, 65, 75, 85, 95, 120],
    labels=[
        "3. cold",
        "4. cool",
        "5. comfortable",
        "6. warm",
        "7. hot",
        "8. sweltering",
    ],
)

In [22]:
len(df.city.unique())

27

---

## Aggregate

#### Group by decade and count the number of days in each category

In [23]:
decades_categories_mlb = (
    df.groupby(["decade", "high_category"])
    .agg({"mean": "count"})
    .reset_index()
    .rename(columns={"mean": "count"})
)

#### What's the percentage of days in each category?

In [24]:
decades_categories_mlb["decade_share"] = (
    (
        decades_categories_mlb["count"]
        / decades_categories_mlb.groupby("decade")["count"].transform("sum")
    )
).round(4)

In [25]:
decades_categories_mlb.head()

Unnamed: 0,decade,high_category,count,decade_share
0,1960s,3. cold,3438,0.061
1,1960s,4. cool,8707,0.1546
2,1960s,5. comfortable,13167,0.2338
3,1960s,6. warm,18113,0.3216
4,1960s,7. hot,11066,0.1965


In [26]:
bars = (
    alt.Chart()
    .mark_bar(size=30)
    .encode(
        x=alt.X("decade:O", title=""),
        y=alt.Y(
            "decade_share:Q",
            title="",
            axis=alt.Axis(format="%", tickCount=1, grid=False),
        ),
    )
    .properties(
        width=80,
        height=200,
        title="Days in each season by high temp category and decade",
    )
)

lines = (
    alt.Chart()
    .mark_line(color="#005f66", size=2)
    .encode(
        x=alt.X("decade:O", title=""),
        y=alt.Y(
            "decade_share",
            title="",
            axis=alt.Axis(format="%", labels=False, offset=-40),
        ),
    )
    .properties(
        width=80,
        height=200,
        title="Days in each season by high temp category and decade",
    )
)

text = bars.mark_text(align="left", baseline="middle", dx=-11, dy=11).encode(
    text=alt.Text("decade_share", format=".0%")
)

alt.layer(
    (bars + lines + text),
    data=decades_categories_mlb[
        decades_categories_mlb["decade"].str.contains("1960s|2010s")
    ],
).facet(
    facet=alt.Facet("high_category:N", title=" ", header=alt.Header(labelFontSize=13)),
    columns=6,
    title="Share of season days in each high temp category, by decade",
)

In [27]:
alt.Chart(decades_categories_mlb).mark_bar().encode(
    x=alt.X(
        "count",
        title="",
        axis=alt.Axis(format="%", tickCount=3),
        stack="normalize",
        sort=[
            "3. cold",
            "4. cool",
            "5. comfortable",
            "6. warm",
            "7. hot",
            "8. sweltering",
        ],
    ),
    y=alt.Y("decade", title=""),
    color=alt.Color(
        "high_category",
        title="Share of season in each category",
        scale=alt.Scale(
            domain=[
                "3. cold",
                "4. cool",
                "5. comfortable",
                "6. warm",
                "7. hot",
                "8. sweltering",
            ],
            range=[
                # "#2166ac",
                # "#4393c3",
                "#92c5de",
                "#d1e5f0",
                "#fddbc7",
                "#f4a582",
                "#d6604d",
                "#b2182b",
            ],
        ),
    ),
).properties(
    width=650,
    height=125,
    title="High temp ranges from April-October for all MLB cities, by decade",
).configure_legend(
    orient="top"
)

#### Group by decades and category counts for each decade

In [28]:
decades_categories = (
    df.groupby(["decade", "high_category", "city"])
    .agg({"mean": "count"})
    .reset_index()
    .rename(columns={"mean": "count"})
)

In [29]:
our_cities = ["Arlington", "Atlanta", "Denver", "Houston", "St. Louis", "Miami"]

In [55]:
decades_categories[decades_categories["city"] == "Arlington"]

Unnamed: 0,decade,high_category,city,count
0,1960s,3. cold,Arlington,5
27,1960s,4. cool,Arlington,53
54,1960s,5. comfortable,Arlington,220
81,1960s,6. warm,Arlington,672
108,1960s,7. hot,Arlington,851
135,1960s,8. sweltering,Arlington,339
162,1970s,3. cold,Arlington,10
189,1970s,4. cool,Arlington,66
216,1970s,5. comfortable,Arlington,219
243,1970s,6. warm,Arlington,642


In [30]:
alt.Chart(
    decades_categories[decades_categories["city"].isin(our_cities)]
).mark_bar().encode(
    x=alt.X(
        "count",
        title="",
        axis=alt.Axis(format="%", tickCount=3),
        stack="normalize",
        sort=[
            "3. cold",
            "4. cool",
            "5. comfortable",
            "6. warm",
            "7. hot",
            "8. sweltering",
        ],
    ),
    y=alt.Y("decade", title=""),
    color=alt.Color(
        "high_category",
        title="Share of season in each category",
        scale=alt.Scale(
            domain=[
                "3. cold",
                "4. cool",
                "5. comfortable",
                "6. warm",
                "7. hot",
                "8. sweltering",
            ],
            range=[
                # "#2166ac",
                # "#4393c3",
                "#92c5de",
                "#d1e5f0",
                "#fddbc7",
                "#f4a582",
                "#d6604d",
                "#b2182b",
            ],
        ),
    ),
    facet=alt.Facet("city", columns=3, title=""),
).properties(
    width=200,
    height=125,
    title="High temp ranges from April-October in select MLB cities, by decade",
).configure_legend(
    orient="top"
)

In [54]:
alt.Chart(decades_categories).mark_bar().encode(
    x=alt.X(
        "count",
        title="",
        axis=alt.Axis(format="%", tickCount=3),
        stack="normalize",
        sort=[
            "3. cold",
            "4. cool",
            "5. comfortable",
            "6. warm",
            "7. hot",
            "8. sweltering",
        ],
    ),
    y=alt.Y("decade", title=""),
    color=alt.Color(
        "high_category",
        title="Share of season in each category",
        legend=None,
        scale=alt.Scale(
            domain=[
                "3. cold",
                "4. cool",
                "5. comfortable",
                "6. warm",
                "7. hot",
                "8. sweltering",
            ],
            range=[
                # "#2166ac",
                # "#4393c3",
                "#92c5de",
                "#d1e5f0",
                "#fddbc7",
                "#f4a582",
                "#d6604d",
                "#b2182b",
            ],
        ),
    ),
    facet=alt.Facet("city", columns=7, title=""),
).properties(width=70, height=70, title=" ",).configure_legend(orient="top")

---

In [32]:
df.head()

Unnamed: 0,date,city,low,high,precipitation_inch,year,month,day,decade,mean,high_category
3589,2019-10-31,Kansas City,21.92,41.0,0.02,2019,10,31,2010s,31.46,3. cold
3584,2019-10-31,Toronto,42.8,50.0,0.0,2019,10,31,2010s,46.4,3. cold
3587,2019-10-31,Atlanta,39.02,75.92,0.2,2019,10,31,2010s,57.47,6. warm
3577,2019-10-31,Boston,59.0,71.06,0.07,2019,10,31,2010s,65.03,5. comfortable
3584,2019-10-31,Chicago,28.04,39.02,0.17,2019,10,31,2010s,33.53,3. cold


In [33]:
means = (
    df[df["decade"].str.contains("1960s|2010s")]
    .groupby(["decade", "city"])
    .agg({"mean": "mean"})
    .round(1)
    .reset_index()
    .rename(columns={"mean": "mean"})
)

In [34]:
means_pivot = means.pivot_table(
    index="city", columns="decade", values="mean"
).reset_index()

In [35]:
means_pivot["diff"] = means_pivot["2010s"] - means_pivot["1960s"]

In [36]:
means_pivot.sort_values("diff", ascending=False)

In [44]:
decades_categories_mlb.pivot_table(
    index="decade", columns="high_category", values="decade_share"
).reset_index()

high_category,decade,3. cold,4. cool,5. comfortable,6. warm,7. hot,8. sweltering
0,1960s,0.061,0.1546,0.2338,0.3216,0.1965,0.0326
1,1970s,0.059,0.1563,0.2378,0.3131,0.1971,0.0366
2,1980s,0.0573,0.1445,0.2352,0.3029,0.2122,0.0478
3,1990s,0.0542,0.1472,0.2405,0.3005,0.2158,0.0418
4,2000s,0.0574,0.1469,0.2336,0.3109,0.2096,0.0416
5,2010s,0.0494,0.1266,0.2199,0.3139,0.2415,0.0487


---