In [1]:
import statsapi
import pandas as pd
import numpy as np

<h3>Schedule API</h3>

In [68]:
# Fetch the schedule between the two dates (MM/DD/YYYY strings) from the API
sched = statsapi.schedule(start_date='01/01/2023',end_date='12/31/2024')
# Tabulate the returned entries; later cells read game ids from this frame
sched_df = pd.DataFrame(sched)
# Persist the schedule without the DataFrame index column
sched_df.to_csv('./test_output/schedule.csv', index=False)

<h3>Boxscore API</h3>

In [67]:
def get_game_boxscore_stats(game_id):
    """Fetch one game's boxscore and split it into batter and pitcher tables.

    Args:
        game_id: Game identifier, passed unchanged to ``statsapi.boxscore_data``.

    Returns:
        A ``(batter_stats_df, pitcher_stats_df)`` tuple, both derived from the
        same boxscore payload.
    """
    game_boxscore = statsapi.boxscore_data(game_id)
    return (
        get_boxscore_game_batters_stats(game_boxscore),
        get_boxscore_game_pitchers_stats(game_boxscore),
    )


def get_boxscore_game_pitchers_stats(boxscore_obj):
    """Build one pitcher table covering both teams of a game.

    Args:
        boxscore_obj: Boxscore payload providing 'homePitchers',
            'awayPitchers' and 'teamInfo'.

    Returns:
        DataFrame with the home team's pitchers followed by the away team's,
        each row tagged with its ``team_id`` and the index renumbered from 0.
    """
    team_ids = get_boxscore_teams(boxscore_obj)

    home_df = get_boxscore_team_pitcher_stats(boxscore_obj['homePitchers'])
    away_df = get_boxscore_team_pitcher_stats(boxscore_obj['awayPitchers'])

    home_df['team_id'] = team_ids['home_team_id']
    away_df['team_id'] = team_ids['away_team_id']

    # Put the away columns in the home frame's column order before stacking
    home_columns = list(home_df.columns)
    if home_columns != list(away_df.columns):
        away_df = away_df[home_columns]

    return pd.concat([home_df, away_df], ignore_index=True)


def get_boxscore_team_pitcher_stats(pitchers_list):
    """Convert one team's boxscore pitcher entries into a tidy DataFrame.

    Args:
        pitchers_list: List of per-pitcher dicts, e.g. the 'homePitchers' or
            'awayPitchers' value of a boxscore payload.

    Returns:
        DataFrame with one row per pitcher. Abbreviated stat keys are renamed
        to descriptive snake_case names, rows whose ``person_id`` is 0 are
        removed, and the 'namefield', 'name' and 'note' columns are dropped.
    """
    pitchers_column_mapper = {
        'ip': 'innings_pitched',
        'h': 'hits',
        'r': 'runs',
        'er': 'earned_runs',
        'bb': 'walks',
        'k': 'strikeouts',
        'hr': 'home_runs',
        'era': 'earned_run_average',
        'p': 'pitches',
        's': 'strikes',
        'personId': 'person_id'
    }

    pitcher_stats_df = pd.DataFrame(pitchers_list)
    pitcher_stats_df = pitcher_stats_df.rename(columns=pitchers_column_mapper)
    # Rows with person_id 0 are not real players (the batter extraction filters
    # them the same way); without this the placeholder row leaks into the output
    pitcher_stats_df = pitcher_stats_df.query('person_id != 0')
    pitcher_stats_df = pitcher_stats_df.drop(columns=['namefield', 'name', 'note'])
    return pitcher_stats_df


def get_boxscore_game_batters_stats(boxscore_obj):
    """Build one batter table covering both teams of a game.

    Args:
        boxscore_obj: Boxscore payload providing 'homeBatters', 'awayBatters'
            and 'teamInfo'.

    Returns:
        DataFrame with the home team's batters followed by the away team's,
        each row tagged with its ``team_id`` and the index renumbered from 0.
    """
    team_ids = get_boxscore_teams(boxscore_obj)

    home_df = get_boxscore_team_batter_stats(boxscore_obj['homeBatters'])
    away_df = get_boxscore_team_batter_stats(boxscore_obj['awayBatters'])

    home_df['team_id'] = team_ids['home_team_id']
    away_df['team_id'] = team_ids['away_team_id']

    # Put the away columns in the home frame's column order before stacking
    home_columns = list(home_df.columns)
    if home_columns != list(away_df.columns):
        away_df = away_df[home_columns]

    return pd.concat([home_df, away_df], ignore_index=True)


def get_boxscore_team_batter_stats(batters_list):
    """Convert one team's boxscore batter entries into a tidy DataFrame.

    Args:
        batters_list: List of per-batter dicts, e.g. the 'homeBatters' or
            'awayBatters' value of a boxscore payload.

    Returns:
        DataFrame with abbreviated stat keys renamed to descriptive snake_case
        names, rows whose ``person_id`` is 0 removed, and the 'namefield',
        'note', 'name' and 'position' columns dropped.
    """
    rename_map = {
        'personId': 'person_id',
        'ab': 'at_bats',
        'r': 'runs',
        'h': 'hits',
        'hr': 'home_runs',
        'rbi': 'runs_batted_in',
        'sb': 'stolen_bases',
        'bb': 'walks',
        'k': 'strikeouts',
        'lob': 'left_on_base',
        'avg': 'batting_average',
        'ops': 'onbase_plus_slugging',
        'obp': 'onbase_percentage',
        'slg': 'slugging_percentage',
        'battingOrder': 'batting_order'
    }
    display_only_columns = ['namefield', 'note', 'name', 'position']

    return (
        pd.DataFrame(batters_list)
        .rename(columns=rename_map)
        .query('person_id != 0')  # discard rows that do not belong to a player
        .drop(columns=display_only_columns)
    )


def get_boxscore_teams(boxscore_obj):
    """Extract the away and home team ids from a boxscore payload.

    Args:
        boxscore_obj: Boxscore payload whose 'teamInfo' entry holds an 'away'
            and a 'home' dict, each with an 'id'.

    Returns:
        Dict with the keys 'away_team_id' and 'home_team_id'.
    """
    team_info = boxscore_obj['teamInfo']
    away_id = team_info['away']['id']
    home_id = team_info['home']['id']
    return {'away_team_id': away_id, 'home_team_id': home_id}


In [70]:
# Fetch a single game's boxscore and list its top-level keys to see what the payload contains
boxscore = statsapi.boxscore_data(719496)
boxscore.keys()

dict_keys(['gameId', 'teamInfo', 'playerInfo', 'away', 'home', 'awayBatters', 'homeBatters', 'awayBattingTotals', 'homeBattingTotals', 'awayBattingNotes', 'homeBattingNotes', 'awayPitchers', 'homePitchers', 'awayPitchingTotals', 'homePitchingTotals', 'gameBoxInfo'])

In [76]:
# Inspect the game notes: a list of label/value entries (weather, venue, umpires, ...)
boxscore['gameBoxInfo']

[{'label': 'WP', 'value': 'Quinlivan, J 2; Morice.'},
 {'label': 'Groundouts-flyouts',
  'value': 'Quinlivan, J 0-1; Harrington 2-3; Jones 1-2; Morice 0-1; Walsh 1-2; Mosqueda 0-2; Feltman 1-1; Broadway 0-0; Shugart 2-0; Olds 1-0; Gomez, R 1-1; Cellucci 2-0.'},
 {'label': 'Batters faced',
  'value': 'Quinlivan, J 7; Harrington 10; Jones 4; Morice 4; Walsh 5; Mosqueda 3; Feltman 3; Broadway 3; Shugart 5; Olds 5; Gomez, R 3; Cellucci 4.'},
 {'label': 'Inherited runners-scored', 'value': 'Harrington 3-2.'},
 {'label': 'Umpires',
  'value': 'HP: Ben Fernandez. 1B: James Jean. 2B: Justin Juska. 3B: Trevor Mathews. '},
 {'label': 'Weather', 'value': '85 degrees, Sunny.'},
 {'label': 'Wind', 'value': '3 mph, L To R.'},
 {'label': 'First pitch', 'value': '1:06 PM.'},
 {'label': 'T', 'value': '2:08.'},
 {'label': 'Att', 'value': '5,922.'},
 {'label': 'Venue', 'value': 'JetBlue Park.'},
 {'label': 'February 24, 2023'}]

In [66]:
# Gather the per-game frames in lists and concatenate once after the loop;
# concatenating inside the loop re-copies every accumulated row on each pass.
batter_frames = []
pitcher_frames = []

# TODO: Make this incremental
# Only the first five games of the schedule are processed for now
for game_id in list(sched_df['game_id'])[0:5]:
    batter_stats_df, pitcher_stats_df = get_game_boxscore_stats(game_id)
    # Tag every row with the game it came from
    batter_stats_df['game_id'] = game_id
    pitcher_stats_df['game_id'] = game_id
    batter_frames.append(batter_stats_df)
    pitcher_frames.append(pitcher_stats_df)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# games were processed
all_batter_stats_df = pd.concat(batter_frames, ignore_index=True) if batter_frames else pd.DataFrame()
all_pitcher_stats_df = pd.concat(pitcher_frames, ignore_index=True) if pitcher_frames else pd.DataFrame()

all_batter_stats_df.to_csv('./test_output/boxscore_player_batter_stats.csv', index=False)
all_pitcher_stats_df.to_csv('./test_output/boxscore_player_pitcher_stats.csv', index=False)

<h3>Team API</h3>

In [77]:
def extract_field_from_json(json_obj, field_name):
    """Return ``json_obj[field_name]``, or ``None`` when the field is absent."""
    if field_name in json_obj:
        return json_obj[field_name]
    return None


def parse_json_field(json_obj, json_keys):
    """Pick the values of ``json_keys`` out of a dict-valued cell.

    Args:
        json_obj: The cell value; a dict, or a missing value (NaN/None).
        json_keys: Keys to extract, in output order.

    Returns:
        ``pd.Series`` with one element per key. Keys absent from the dict,
        and every key of a missing cell, yield ``None``.
    """
    # A missing cell has nothing to extract: one None per requested key
    if pd.isna(json_obj):
        return pd.Series([None] * len(json_keys))

    return pd.Series([extract_field_from_json(json_obj, key) for key in json_keys])


# Retrieve team data from API
# Pull every team record from the API and tabulate it
teams = statsapi.get('teams', params={'ver': 'v1'})['teams']
teams_df = pd.DataFrame(teams)

# These columns hold nested dictionaries carrying an 'id' and a 'name'.
# Each one is flattened into '<field>_id' / '<field>_name' columns (lower-cased
# field name) and the nested column is then removed.
dict_fields = ['venue', 'league', 'division', 'sport', 'springLeague', 'springVenue']
for field in dict_fields:
    id_field_name = f'{field.lower()}_id'
    name_field_name = f'{field.lower()}_name'
    flattened = teams_df[field].apply(parse_json_field, json_keys=['id', 'name'])
    teams_df[[id_field_name, name_field_name]] = flattened
    teams_df = teams_df.drop(columns=field)

# Keep MLB teams only (sport id 1 is the MLB)
is_mlb_team = teams_df['sport_id'] == 1
mlb_teams_df = teams_df[is_mlb_team]

mlb_teams_df.to_csv('./test_output/teams.csv', index=False)

<h3>Team Rosters API</h3>

In [78]:
# Gather one frame per team and concatenate once after the loop; concatenating
# inside the loop re-copies every accumulated row on each pass.
team_roster_frames = []

for team_id in mlb_teams_df['id'].unique():
    team_roster = statsapi.get('team_roster', params={'ver': 'v1', 'teamId': team_id})['roster']
    team_roster_df = pd.DataFrame(team_roster)
    # Flatten the nested person / position / status dictionaries into plain columns
    team_roster_df[['person_id', 'person_name']] = team_roster_df['person'].apply(parse_json_field, json_keys=['id', 'fullName'])
    team_roster_df[['position_code', 'position_name', 'position_type', 'position_abbr']] = team_roster_df['position'].apply(parse_json_field, json_keys=['code', 'name', 'type', 'abbreviation'])
    team_roster_df[['status_code', 'status_description']] = team_roster_df['status'].apply(parse_json_field, json_keys=['code', 'description'])
    # Remove the nested source columns and expose the parent team id as team_id
    team_roster_df = (
        team_roster_df
        .drop(columns=['person', 'position', 'status'])
        .rename(columns={'parentTeamId': 'team_id'})
    )
    team_roster_frames.append(team_roster_df)

# pd.concat raises on an empty list, so fall back to an empty frame when there
# were no teams to process
all_team_rosters_df = pd.concat(team_roster_frames, ignore_index=True) if team_roster_frames else pd.DataFrame()

all_team_rosters_df.to_csv('./test_output/team_rosters.csv', index=False)

In [85]:
# Check which columns the combined roster table ended up with
all_team_rosters_df.columns

Index(['jerseyNumber', 'team_id', 'person_id', 'person_name', 'position_code',
       'position_name', 'position_type', 'position_abbr', 'status_code',
       'status_description'],
      dtype='object')

<h3> Player Stats </h3>

In [109]:
# Keep just the player id and position type of every rostered player
person_info = all_team_rosters_df[['person_id', 'position_type']]
# NOTE(review): the result of this call is neither assigned nor the last
# expression of the cell, so it is discarded — confirm whether it is still needed
statsapi.player_stat_data(543305)

def process_stat_list_entry(stat_list_entry):
    """Flatten one entry of a player's stat list into a single-level dict.

    Args:
        stat_list_entry: Dict with a 'stats' dict and, optionally, a 'season'.

    Returns:
        A new dict holding the entry's stats plus a 'season' key (``None`` when
        the entry has no season). When the stats contain a nested 'position'
        dict it is replaced by 'position_name', 'position_type' and
        'position_abbreviation'. Returns ``None`` when the entry has no stats
        or the stats are empty.
    """
    raw_stats = stat_list_entry.get('stats')
    if not raw_stats:
        return None

    # Work on a copy: mutating the caller's dict would strip 'position' from
    # it and make a second call on the same entry fail
    stats = dict(raw_stats)
    stats['season'] = stat_list_entry.get('season')

    # Only flatten the position when the entry has one; indexing it
    # unconditionally raises KeyError for stats without a 'position' key
    position = stats.pop('position', None)
    if position is not None:
        for position_field in ['name', 'type', 'abbreviation']:
            stats[f'position_{position_field}'] = position[position_field]

    return stats
        


player_id = 543305
player_stats = statsapi.player_stat_data(player_id)
stats = player_stats['stats']

# The loop previously had no body, which made the whole cell fail to compile.
# Print each raw stat entry, which is what the output saved under this cell shows.
for stat in stats:
    print(stat)




# Fielding
# gamesStarted
# assists
# putOuts
# errors
# fielding
# position - json with code, name, type, abbreviation
# rangeFactorPerGame
# rangeFactorPer9Inn
# innings
# games
# doublePlays
# triplePlays
# throwingErrors

# Hitting
# gamesPlayed
# groundOuts
# airOuts
# runs
# doubles
# triples
# homeRuns
# strikeOuts
# baseOnBalls
# intentionalWalks
# hits
# hitByPitch
# avg
# atBats
# obp
# slg
# ops
# caughtStealing
# stolenBases
# stolenBasePercentage
# groundIntoDoublePlay
# numberOfPitches
# plateAppearances
# totalBases
# rbi
# leftOnBase
# sacBunts
# sacFlies
# babip
# groundOutsToAirouts
# catchersInterference
# atBatsPerHomeRun


#  Pitcher - only get pitching stats
#  Hitter - only get hitting stats
#  Otherwise - get hitting and fielding stats


{'type': 'season', 'group': 'fielding', 'season': '2024', 'stats': {'gamesPlayed': 7, 'gamesStarted': 6, 'assists': 0, 'putOuts': 11, 'errors': 0, 'chances': 11, 'fielding': '1.000', 'position': {'code': '9', 'name': 'Outfielder', 'type': 'Outfielder', 'abbreviation': 'RF'}, 'rangeFactorPerGame': '1.57', 'rangeFactorPer9Inn': '1.90', 'innings': '52.0', 'games': 7, 'doublePlays': 0, 'triplePlays': 0, 'throwingErrors': 0}}
{'type': 'season', 'group': 'fielding', 'season': '2024', 'stats': {'gamesPlayed': 4, 'gamesStarted': 4, 'assists': 0, 'putOuts': 0, 'errors': 0, 'chances': 0, 'fielding': '.000', 'position': {'code': '10', 'name': 'Designated Hitter', 'type': 'Hitter', 'abbreviation': 'DH'}, 'rangeFactorPerGame': '0.00', 'rangeFactorPer9Inn': '-.--', 'innings': '0.0', 'games': 4, 'doublePlays': 0, 'triplePlays': 0, 'throwingErrors': 0}}
{'type': 'season', 'group': 'hitting', 'season': '2024', 'stats': {'gamesPlayed': 11, 'groundOuts': 11, 'airOuts': 5, 'runs': 4, 'doubles': 0, 'triple