# MLB attendance

#### Python tools

In [1]:
from pathlib import Path

import altair as alt
import numpy as np
import pandas as pd

#### Create list of years

In [2]:
# Seasons to pull: 1976 through 2022 inclusive (range's stop value is exclusive).
first_season, last_season = 1976, 2022
years = list(range(first_season, last_season + 1))

#### Loop through years, grab attendance pages, append to list of dataframes

In [3]:
# One dataframe per season: read the first HTML table on that season's page
# and tag every row with the season it came from.
records = [
    pd.read_html(
        f'https://www.baseball-reference.com/leagues/majors/{season}-misc.shtml'
    )[0].assign(year=season)
    for season in years
]

#### Combine every year into one big dataframe

In [4]:
# Stack the per-season tables into one frame.
# Each season's table brings its own row labels starting from 0, so
# ignore_index=True renumbers the rows and keeps the combined index unique.
src = pd.concat(records, ignore_index=True)

#### Clean up dataframe

In [5]:
# Lower-case every column label so the cells below can use lower-case names.
src.columns = list(map(str.lower, src.columns))

In [6]:
# Keep just the four columns this analysis uses; .copy() detaches the result from src.
keep_cols = ['tm', 'attendance', 'attend/g', 'year']
df = src.loc[:, keep_cols].copy()

In [7]:
# Give the abbreviated source columns readable names.
df = df.rename(columns={'tm': 'team', 'attend/g': 'avg_attend'})

#### Standardize team names

In [8]:
# Any team name containing "Rays" is relabeled with a single canonical name.
is_rays = df["team"].str.contains("Rays")
df.loc[is_rays, "team"] = "Tampa Bay Rays"

In [9]:
# Any team name containing "Angels" is relabeled with a single canonical name.
is_angels = df["team"].str.contains("Angels")
df.loc[is_angels, "team"] = "Los Angeles Angels"

In [10]:
# Any team name containing "Marlins" is relabeled with a single canonical name.
is_marlins = df["team"].str.contains("Marlins")
df.loc[is_marlins, "team"] = "Miami Marlins"

In [11]:
# Any team name containing "Indians" is relabeled as the Cleveland Guardians.
is_indians = df["team"].str.contains("Indians")
df.loc[is_indians, "team"] = "Cleveland Guardians"

In [12]:
# Drop every row whose team name mentions the Expos.
is_expos = df['team'].str.contains('Expos')
df = df.loc[~is_expos].copy()

#### Thanks, COVID: blank out 2020 per-game attendance

In [13]:
# Mark per-game attendance for the 2020 season as missing.
season_2020 = df["year"].eq(2020)
df.loc[season_2020, "avg_attend"] = np.nan

---

In [14]:
# Rank teams within each season by per-game attendance; tied teams all get
# the highest rank of their tie (method='max').
# NOTE(review): rank() sorts ascending by default, so rank 1 is the lowest
# attendance that season — confirm that is the intended direction.
yearly_attend = df.groupby(["year"])["avg_attend"]
df["rank"] = yearly_attend.rank(method='max')

In [15]:
# Rows for the Oakland Athletics only.
oakland = df.loc[df['team'].eq('Oakland Athletics')]

In [16]:
# Largest rank value Oakland reached in any season, rounded for display.
oakland_top_rank = oakland['rank'].max()
oakland_top_rank.round()

23.0

In [17]:
# Reshape to long format: one row per team, year and measure ('rank' or 'avg_attend').
df_melt = pd.melt(
    df,
    id_vars=['team', 'year'],
    value_vars=['rank', 'avg_attend'],
)

#### How does Oakland compare to other teams? 

In [18]:
# Per-game attendance rows only, split into Oakland and everyone else.
attend_long = df_melt[df_melt['variable'] == 'avg_attend']
is_oakland = attend_long['team'] == 'Oakland Athletics'

# Every other team: thin lines, all mapped to the same light color, no legend.
others_line = (
    alt.Chart(attend_long[~is_oakland])
    .mark_line(size=1)
    .encode(
        x=alt.X('year:O', title=' '),
        y=alt.Y('value', axis=alt.Axis(tickCount=5, title='Average attendance')),
        color=alt.Color('team', scale=alt.Scale(range=['#e9e9e9']), legend=None),
    )
)

# Oakland: one thicker line in its own color, layered on top of the others.
oakland_line = (
    alt.Chart(attend_long[is_oakland])
    .mark_line(color='#006b5e', size=2)
    .encode(
        x='year:O',
        y=alt.Y('value', axis=alt.Axis(tickCount=5, title='Average attendance')),
    )
)

alt.layer(others_line, oakland_line).properties(width=600, height=400).configure_legend(orient='top')

In [19]:
# Yearly mean of per-game attendance: every team except Oakland, then Oakland alone.
from_oakland = df['team'] == 'Oakland Athletics'
all_teams_mean_attend = df[~from_oakland].groupby('year')[['avg_attend']].mean().reset_index()
oakland_mean_attend = df[from_oakland].groupby('year')[['avg_attend']].mean().reset_index()

In [20]:
# Yearly mean for all teams other than Oakland, drawn as a light line.
# NOTE(review): "year<2020" also drops 2021 and 2022, not only 2020 — confirm that is intended.
league_before_2020 = all_teams_mean_attend.query("year<2020")
line = (
    alt.Chart(league_before_2020)
    .mark_line(color='#e9e9e9', size=2)
    .encode(
        x='year:O',
        y=alt.Y('avg_attend', axis=alt.Axis(tickCount=5, title='Average attendance')),
    )
)

In [21]:
# Oakland's yearly mean, drawn in its own color.
# NOTE(review): "year<2020" also drops 2021 and 2022, not only 2020 — confirm that is intended.
oakland_before_2020 = oakland_mean_attend.query("year<2020")
as_line = (
    alt.Chart(oakland_before_2020)
    .mark_line(color='#006b5e')
    .encode(
        x='year:O',
        y=alt.Y('avg_attend', axis=alt.Axis(tickCount=5, title='Average attendance')),
    )
)

In [22]:
# Overlay the Oakland line on the league line, then size and title the combined chart.
alt.layer(line, as_line).properties(
    width=600,
    height=350,
    title='Oakland average attendance vs. rest of league',
).configure_legend(orient='top')

---

#### Chart the per-game attendance for each team as a heatmap

In [25]:
# Heatmap: one cell per team and season after 1980, shaded by per-game attendance.
after_1980 = df[df['year'] > 1980]

# Hover details shown for each cell.
hover = [
    alt.Tooltip("year", title="Year"),
    alt.Tooltip("team", title="Team"),
    alt.Tooltip("avg_attend", title="Average", format=','),
]
shade = alt.Color('avg_attend', title="Per-game attendance", scale=alt.Scale(scheme="goldgreen"))

alt.Chart(after_1980).mark_rect().encode(
    x=alt.X('year:O', title=' ', axis=alt.Axis(tickCount=10)),
    y=alt.Y('team:O', title=' '),
    tooltip=hover,
    color=shade,
).properties(width=800, height=500).configure_legend(orient='top')

----

## Export

In [None]:
# Write the cleaned per-team, per-season table to disk without the row index.
# to_csv does not create missing folders, so make sure the target directory
# exists first; otherwise a fresh checkout fails on this final cell.
out_path = Path('data/processed/mlb_attendance_1976_2022.csv')
out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path, index=False)