## Import needed libraries

In [206]:
import requests
import csv
from datetime import timedelta, datetime
import pandas as pd
from scipy.stats import ttest_ind

## Set up some initial values

In [248]:
# Season to analyse; it selects the date window that is queried from the API.
season = 2021
if season == 2021:
    start = datetime.strptime("2021-4-01", "%Y-%m-%d")
    end = datetime.strptime("2021-10-03", "%Y-%m-%d")
elif season == 2022:
    start = datetime.strptime("2022-04-07", "%Y-%m-%d")
    # No fixed end date: the window runs up to the moment this cell is executed.
    end = datetime.today()
else:
    # Without this branch an unsupported season left start/end undefined on a
    # fresh kernel (NameError below) or silently reused values from an earlier
    # run of this cell.
    raise ValueError(f"No date window configured for season {season}")

# One 'YYYY-MM-DD' string per calendar day from start to end, both inclusive.
date_generated = [
    (start + timedelta(days=day_offset)).strftime('%Y-%m-%d')
    for day_offset in range((end - start).days + 1)
]

## Initialize the CSV File

In [None]:
# Start a fresh attendance.csv that contains only the header row ('w' mode
# truncates any existing file).
# newline='' is required by the csv module: without it the writer's own line
# endings get translated again on some platforms, producing blank rows.
with open('attendance.csv', 'w', newline='') as csvfile:
    attendance_writer = csv.writer(csvfile)
    attendance_writer.writerow(['Date', 'Away', 'Home', 'Attendance', 'Type'])


## Get the data from the MLB API

In [249]:
# Seconds to wait for a single HTTP response, so one stalled connection cannot
# hang the whole download.
REQUEST_TIMEOUT = 30

# The CSV is opened once for the whole download instead of once per row.
# Append mode keeps whatever the file already contains; newline='' is what the
# csv module requires to avoid doubled line endings on some platforms.
with open('attendance.csv', 'a', newline='') as csvfile:
    attendance_writer = csv.writer(csvfile)
    for the_game_date in date_generated:
        url = f'http://statsapi.mlb.com/api/v1/attendance?leagueId=103,104&date={the_game_date}'
        r = requests.get(url, timeout=REQUEST_TIMEOUT).json()
        # A response without `records` is treated as "nothing to do" for this date.
        for i in r.get('records') or []:
            gamepk = None
            try:
                # Only the game referenced by `attendanceHighGame` is looked up
                # for each record; records without one are skipped quietly.
                high_game = i.get('attendanceHighGame')
                if not high_game or high_game.get('gamePk') is None:
                    continue
                gamepk = high_game.get('gamePk')
                game_url = f'http://statsapi.mlb.com/api/v1.1/game/{gamepk}/feed/live'
                game_response = requests.get(game_url, timeout=REQUEST_TIMEOUT).json()
                game_data = game_response.get('gameData')
                game_type = game_data.get('game').get('type')
                attendance = game_data.get('gameInfo').get('attendance')
                game_date = game_data.get('datetime').get('officialDate')
                away_team = game_data.get('teams').get('away').get('name')
                home_team = game_data.get('teams').get('home').get('name')
            except (requests.RequestException, ValueError, AttributeError) as exc:
                # Still best effort (one bad game must not abort the run), but the
                # failure is reported instead of being swallowed by a bare except,
                # and Ctrl-C / unexpected errors are no longer hidden.
                print(f"{the_game_date} game {gamepk} skipped: {exc!r}")
            else:
                attendance_writer.writerow([game_date, away_team, home_team, attendance, game_type])
        print(f"{the_game_date} attendance complete")
        


2021-10-03 attendance complete


## Read the attendance CSV into a DataFrame so we can run statistics on it

In [250]:
# Load every row collected in attendance.csv into a single DataFrame.
df = pd.read_csv(filepath_or_buffer='attendance.csv')

## Create some filter columns based on the data

In [251]:
# Boolean flags: Dodgers as the home team, as the away team, and in either role.
df['DodgerHomeGame'] = df['Home'].eq('Los Angeles Dodgers')
df['DodgerAwayGame'] = df['Away'].eq('Los Angeles Dodgers')
df['DodgerGame'] = df['DodgerAwayGame'] | df['DodgerHomeGame']

## Get some statistics for Dodgers Home Games

In [252]:
# Attendance of the games flagged as Dodgers home games, then its
# standard deviation, mean and number of values.
dodgers_home_games_attendance = df.loc[df['DodgerHomeGame'], 'Attendance']
dodgers_home_games_attendance.agg(['std', 'mean', 'count'])

std      15894.560949
mean     34625.839506
count       81.000000
Name: Attendance, dtype: float64

## Get some statistics for Non-Dodgers Games

In [253]:
# Attendance of every game in which the Dodgers did not take part, then its
# standard deviation, mean and number of values.
non_dodgers_games_attendance = df.loc[~df['DodgerGame'], 'Attendance']
non_dodgers_games_attendance.agg(['std', 'mean', 'count'])

std      10122.776283
mean     18122.411951
count     2209.000000
Name: Attendance, dtype: float64

## Get some statistics for Dodgers Away Games

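There are only 80 away games in the data instead of the expected 81. The missing game appears to be one half of the doubleheader between the Chicago Cubs and the Los Angeles Dodgers played in Chicago on May 4, 2021. A likely cause is that the download loop only looks up the single `attendanceHighGame` of each attendance record, so when two games are played on the same date only one of them is written to the CSV.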

In [254]:
# Attendance of the games flagged as Dodgers away games; the summary reuses
# the selection instead of filtering the frame a second time.
dodgers_away_game_attendance = df.loc[df['DodgerAwayGame'], 'Attendance']
dodgers_away_game_attendance.agg(['std', 'mean', 'count'])

std      12127.878185
mean     25349.187500
count       80.000000
Name: Attendance, dtype: float64

In [255]:
# Two-sample t-test without assuming equal variances (equal_var=False):
# Dodgers away games versus games the Dodgers did not play in.
# Missing attendance values are dropped first: ttest_ind propagates NaN by
# default, whereas the std/mean/count summaries ignore it, so a single missing
# value would otherwise turn both results into NaN.
tstat, pvalue = ttest_ind(
    dodgers_away_game_attendance.dropna(),
    non_dodgers_games_attendance.dropna(),
    equal_var=False,
)

In [256]:
# Display the test statistic and the p-value as this cell's output.
(tstat, pvalue)

(5.263734785156167, 1.0861695626801569e-06)

In [257]:
# Boolean flags: Pirates as the home team, as the away team, and in either role.
df['PirateHomeGame'] = df['Home'].eq('Pittsburgh Pirates')
df['PirateAwayGame'] = df['Away'].eq('Pittsburgh Pirates')
df['PirateGame'] = df['PirateAwayGame'] | df['PirateHomeGame']

In [258]:
# Attendance of the games flagged as Pirates home games, then its
# standard deviation, mean and number of values.
pirates_home_games_attendance = df.loc[df['PirateHomeGame'], 'Attendance']
pirates_home_games_attendance.agg(['std', 'mean', 'count'])

std       5522.559070
mean     10597.822785
count       79.000000
Name: Attendance, dtype: float64

In [259]:
# Attendance of every game in which the Pirates did not take part, then its
# standard deviation, mean and number of values.
non_pirates_games_attendance = df.loc[~df['PirateGame'], 'Attendance']
non_pirates_games_attendance.agg(['std', 'mean', 'count'])

std      10952.399185
mean     19158.052893
count     2212.000000
Name: Attendance, dtype: float64

In [260]:
# Attendance of the games flagged as Pirates away games; the summary reuses
# the selection instead of filtering the frame a second time.
pirates_away_game_attendance = df.loc[df['PirateAwayGame'], 'Attendance']
pirates_away_game_attendance.agg(['std', 'mean', 'count'])

std      10905.138068
mean     20888.544304
count       79.000000
Name: Attendance, dtype: float64

In [261]:
# Two-sample t-test without assuming equal variances (equal_var=False):
# Pirates away games versus games the Pirates did not play in.
# Missing attendance values are dropped first: ttest_ind propagates NaN by
# default, whereas the std/mean/count summaries ignore it, so a single missing
# value would otherwise turn both results into NaN.
pirate_tstat, pirate_pvalue = ttest_ind(
    pirates_away_game_attendance.dropna(),
    non_pirates_games_attendance.dropna(),
    equal_var=False,
)

In [262]:
# Display the test statistic and the p-value as this cell's output.
(pirate_tstat, pirate_pvalue)

(1.3856923163276749, 0.1695213562480913)