<div class="alert alert-danger">
    <h4 style="font-weight: bold; font-size: 28px;">Basketball Reference API</h4>
    <p style="font-size: 20px;">Data Gathering</p>
</div>

<a name="BR"></a>

# Setup

In [135]:
import re
import time
from datetime import datetime, timedelta

import pandas as pd

In [136]:
from basketball_reference_web_scraper import client
from basketball_reference_web_scraper.data import League, Location, Outcome, OutputType
from basketball_reference_web_scraper.data import OutputWriteOption, Position, PeriodType, Team

# Team Box Scores

In [None]:
# preview: team box scores for a single day (4 Nov 2000), shown as a DataFrame
pd.DataFrame(data=client.team_box_scores(year=2000, month=11, day=4))

In [None]:
start_time = time.time()

# accumulates every record returned for every day in the range
team_bs = []
# days whose request raised; kept so they can be re-fetched instead of silently lost
team_bs_failed_dates = []

# define start and end dates (both inclusive)
start_date = datetime(1999, 11, 2) # start of 2000 season
end_date = datetime(2023, 12, 31)

# visit each calendar day in the range, one request per day
for day_offset in range((end_date - start_date).days + 1):
    current_date = start_date + timedelta(days=day_offset)

    print(current_date)

    try:
        # fetch data for current day
        day_data = client.team_box_scores(day=current_date.day,
                                          month=current_date.month,
                                          year=current_date.year)
    except Exception as e:
        # one failed request must not abort a scrape that spans thousands of days;
        # same handling as the play-by-play cells, plus a record of what to retry
        print(f"Error occurred: {e}")
        team_bs_failed_dates.append(current_date)
        day_data = []

    # check if day_data is not empty
    if day_data:
        # add current date to each dictionary in day_data
        for record in day_data:
            record['date'] = current_date

        # append the day's data (list of dictionaries) to team_bs
        team_bs.extend(day_data)

    # add time delay between requests
    time.sleep(30)

# convert list of dictionaries to a DataFrame
team_bs_df = pd.DataFrame(team_bs)

end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")
print(f"Days that failed and need a retry: {len(team_bs_failed_dates)}")

In [117]:
# rebuild the frame from whatever is currently in team_bs, then show its last rows
team_bs_df = pd.DataFrame(data=team_bs)
team_bs_df.tail(5)

Unnamed: 0,team,outcome,minutes_played,made_field_goals,attempted_field_goals,made_three_point_field_goals,attempted_three_point_field_goals,made_free_throws,attempted_free_throws,offensive_rebounds,defensive_rebounds,assists,steals,blocks,turnovers,personal_fouls,points,date
839,Team.PORTLAND_TRAIL_BLAZERS,Outcome.WIN,240,37,75,9,18,25,31,10,36,25,11,6,20,27,108,1999-12-30
840,Team.SAN_ANTONIO_SPURS,Outcome.WIN,240,33,73,3,11,29,33,13,33,18,4,6,16,16,98,1999-12-30
841,Team.VANCOUVER_GRIZZLIES,Outcome.LOSS,240,35,79,5,11,13,16,12,23,25,7,3,13,26,88,1999-12-30
842,Team.NEW_YORK_KNICKS,Outcome.WIN,240,31,71,6,11,21,33,9,21,12,14,7,8,23,89,1999-12-30
843,Team.WASHINGTON_WIZARDS,Outcome.LOSS,240,32,67,8,16,14,25,13,34,17,7,7,18,26,86,1999-12-30


In [86]:
# make 'date' the first column: pop() removes it and hands it back for re-insertion at position 0
team_bs_df.insert(0, 'date', team_bs_df.pop('date'))

In [None]:
# remove prefixes from categorical columns
columns_to_process = ['team', 'outcome']

for col in columns_to_process:
    # Straight from the scraper these cells appear to hold Enum members (they display as
    # e.g. 'Team.NEW_YORK_KNICKS'); after a CSV round-trip they are plain strings.
    # Normalise to text first so the substring test below cannot raise TypeError.
    as_text = team_bs_df[col].map(lambda x: str(x) if pd.notnull(x) else x)
    # keep the part after the 'Class.' prefix; values without a dot pass through unchanged
    team_bs_df[col] = as_text.map(lambda x: x.split('.')[1] if pd.notnull(x) and '.' in x else x)
    # NOTE(review): str.title() capitalises letters that follow digits, so
    # 'PHILADELPHIA_76ERS' becomes 'Philadelphia 76Ers' -- confirm that is acceptable downstream
    team_bs_df[col] = team_bs_df[col].str.replace('_', ' ').str.title()

In [None]:
# show the first five rows of the cleaned team box scores
team_bs_df.head(5)

In [114]:
# persist team box scores without the positional index
team_bs_df.to_csv(path_or_buf='../data/original/br_team_box_scores_2000_2024.csv', index=False)

# Player Box Scores

In [None]:
# preview: player box scores for a single day (11 Jan 2024), shown as a DataFrame
pd.DataFrame(data=client.player_box_scores(year=2024, month=1, day=11))

In [None]:
start_time = time.time()

# accumulates every record returned for every day in the range
player_bs = []
# days whose request raised; kept so they can be re-fetched instead of silently lost
player_bs_failed_dates = []

# define start and end dates (both inclusive)
start_date = datetime(1999, 11, 2) # start of 2000 season
end_date = datetime(2023, 12, 31)

# visit each calendar day in the range, one request per day
for day_offset in range((end_date - start_date).days + 1):
    current_date = start_date + timedelta(days=day_offset)

    print(current_date)

    try:
        # fetch data for current day
        day_data = client.player_box_scores(day=current_date.day,
                                            month=current_date.month,
                                            year=current_date.year)
    except Exception as e:
        # one failed request must not abort a scrape that spans thousands of days;
        # same handling as the play-by-play cells, plus a record of what to retry
        print(f"Error occurred: {e}")
        player_bs_failed_dates.append(current_date)
        day_data = []

    # check if day_data is not empty
    if day_data:
        # add current date to each dictionary in day_data
        for record in day_data:
            record['date'] = current_date

        # append the day's data (list of dictionaries) to player_bs
        player_bs.extend(day_data)

    # add time delay between requests
    time.sleep(30)

# convert list of dictionaries to a DataFrame
player_bs_df = pd.DataFrame(player_bs)

end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")
print(f"Days that failed and need a retry: {len(player_bs_failed_dates)}")

In [122]:
# show the first five rows of the scraped player box scores
player_bs_df.head(5)

Unnamed: 0,slug,name,team,location,opponent,outcome,seconds_played,made_field_goals,attempted_field_goals,made_three_point_field_goals,...,attempted_free_throws,offensive_rebounds,defensive_rebounds,assists,steals,blocks,turnovers,personal_fouls,game_score,date
0,cassesa01,Sam Cassell,Team.MILWAUKEE_BUCKS,Location.AWAY,Team.HOUSTON_ROCKETS,Outcome.WIN,2250,15,20,0,...,7,1,2,11,1,0,4,5,30.2,1999-11-02
1,vanexni01,Nick Van Exel,Team.DENVER_NUGGETS,Location.HOME,Team.PHOENIX_SUNS,Outcome.WIN,2733,12,21,5,...,7,2,7,9,0,0,3,1,29.7,1999-11-02
2,piercpa01,Paul Pierce,Team.BOSTON_CELTICS,Location.AWAY,Team.TORONTO_RAPTORS,Outcome.WIN,2273,12,19,3,...,6,1,7,5,2,0,2,3,25.4,1999-11-02
3,odomla01,Lamar Odom,Team.LOS_ANGELES_CLIPPERS,Location.HOME,Team.SEATTLE_SUPERSONICS,Outcome.LOSS,2607,10,18,2,...,15,2,10,3,2,2,3,1,25.1,1999-11-02
4,hillgr01,Grant Hill,Team.DETROIT_PISTONS,Location.AWAY,Team.MIAMI_HEAT,Outcome.LOSS,3134,16,35,1,...,11,3,6,5,0,1,4,5,23.8,1999-11-02


In [None]:
# persist player box scores without the positional index
player_bs_df.to_csv(path_or_buf='../data/original/br_player_box_scores_2000_2024.csv', index=False)

# Season Schedule

In [139]:
# preview: full schedule of the season ending in 2001, shown as a DataFrame
pd.DataFrame(data=client.season_schedule(season_end_year=2001))

In [None]:
start_time = time.time()

# one dict per scheduled game, gathered across every requested season
season_schedule = []

# the range is written as dates, but only the year component is used below
start_date = datetime(1999, 11, 2) # start of 2000 season
end_date = datetime(2023, 12, 31)
# NOTE(review): because start_date falls in 1999, the first request is season_end_year=1999;
# the inline comment and the output file names suggest 2000 was intended -- confirm

# one request per year, from start_date.year through end_date.year inclusive
for season_end_year in range(start_date.year, end_date.year + 1):

    print(season_end_year)

    # fetch the schedule for this season
    year_data = client.season_schedule(season_end_year=season_end_year)

    # skip seasons that returned nothing
    if year_data:
        # tag each game with the season it was requested for
        for record in year_data:
            record['season'] = season_end_year

        # append the season's games (list of dictionaries) to season_schedule
        season_schedule.extend(year_data)

    # add time delay between requests
    time.sleep(30)

# convert list of dictionaries to a DataFrame
season_schedule_df = pd.DataFrame(season_schedule)

end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")

In [132]:
# reload the raw schedule from disk
# NOTE(review): no cell visible in this notebook writes this file -- confirm where it is produced
season_schedule_df = pd.read_csv(filepath_or_buffer='../data/original/br_season_schedule_2000_2024.csv')

In [133]:
# parse 'start_time' as timezone-aware UTC timestamps
# (assumes the scraper reports tip-off in UTC, as the original UTC comparison implied -- TODO confirm)
season_schedule_df['start_time'] = pd.to_datetime(season_schedule_df['start_time'], utc=True)

# An evening tip-off in the US falls on the *next* calendar day in UTC, which is why the
# raw dates looked one day late. Subtracting a whole day from every row is not idempotent
# (re-running the cell shifts again), corrupts the timestamp itself, and mis-dates afternoon
# games that never crossed midnight UTC. Converting to US Eastern time before taking the
# calendar date fixes all three, and leaves 'start_time' as the true UTC instant.
season_schedule_df['date'] = (
    season_schedule_df['start_time']
    .dt.tz_convert('America/New_York')
    .dt.date
)

In [None]:
# reorder columns: season and calendar date first, then tip-off time, teams and scores
schedule_column_order = [
    'season', 'date', 'start_time',
    'home_team', 'away_team',
    'home_team_score', 'away_team_score',
]
season_schedule_df = season_schedule_df[schedule_column_order]

In [33]:
# remove prefixes from categorical columns
columns_to_process = ['home_team', 'away_team']

for col in columns_to_process:
    # Values are 'Team.X' strings when loaded from CSV, but may still be Enum members if the
    # frame comes straight from the scraper; normalise to text so the substring test below
    # cannot raise TypeError on a non-string value.
    as_text = season_schedule_df[col].map(lambda x: str(x) if pd.notnull(x) else x)
    # keep the part after the 'Team.' prefix; values without a dot pass through unchanged
    season_schedule_df[col] = as_text.map(lambda x: x.split('.')[1] if pd.notnull(x) and '.' in x else x)
    # NOTE(review): str.title() capitalises letters that follow digits, so
    # 'PHILADELPHIA_76ERS' becomes 'Philadelphia 76Ers' -- confirm that is acceptable downstream
    season_schedule_df[col] = season_schedule_df[col].str.replace('_', ' ').str.title()

In [134]:
# persist the cleaned schedule under a new name, without the positional index
season_schedule_df.to_csv(path_or_buf='../data/original/br_season_schedule_2000_2024_2.csv', index=False)

# Play by Play

In [None]:
# preview: play-by-play of Boston's home game on 5 Nov 1999, shown as a DataFrame
pd.DataFrame(data=client.play_by_play(home_team=Team.BOSTON_CELTICS, day=5, month=11, year=1999))

In [None]:
# load the raw schedule and keep only the games Boston hosted
season_schedule_df = pd.read_csv('../data/original/br_season_schedule_2000_2024.csv')
season_schedule_df['start_time'] = pd.to_datetime(season_schedule_df['start_time'])
# the raw file still carries the 'Team.' prefix, hence the literal below
is_celtics_home = season_schedule_df['home_team'].eq('Team.BOSTON_CELTICS')
celtics_home_games = season_schedule_df.loc[is_celtics_home]
celtics_home_games.head(5)

In [None]:
start_time = time.time()

# accumulates every play of every Boston home game
play_by_play = []

for current_date in celtics_home_games['start_time']:

    print(current_date)

    # Schedule timestamps are timezone-aware (assumed UTC -- TODO confirm against the scraper docs),
    # so an evening tip-off in Boston already belongs to the next calendar day. The play-by-play
    # lookup is keyed by calendar day, so request Boston's local day instead.
    # Naive timestamps are left as they are.
    if current_date.tzinfo is not None:
        game_day = current_date.tz_convert('America/New_York')
    else:
        game_day = current_date

    try:
        # attempt to fetch data for the game's local calendar day
        day_data = client.play_by_play(home_team=Team.BOSTON_CELTICS,
                                       day=game_day.day,
                                       month=game_day.month,
                                       year=game_day.year)

        # if the API call is successful and day_data is not empty
        if day_data:
            # tag each play with the scheduled start timestamp (unchanged from the schedule)
            for record in day_data:
                record['date'] = current_date

            # append the game's plays (list of dictionaries) to play_by_play
            play_by_play.extend(day_data)

    except Exception as e:
        # best-effort scrape: report the failure and move on to the next game
        print(f"Error occurred: {e}")


    # add time delay between requests
    time.sleep(30)

# convert list of dictionaries to a DataFrame
play_by_play_df = pd.DataFrame(play_by_play)

end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")

In [72]:
# persist Boston's home-game play-by-play without the positional index
play_by_play_df.to_csv(path_or_buf='../data/original/play_by_play_2000_2024_Boston_Celtics.csv', index=False)

# Play by Play (every day)

In [None]:
start_time = time.time()

# This cell's own accumulator. Previously the loop appended to `play_by_play` (the list from
# the schedule-driven cell) while an untouched empty list was turned into the DataFrame,
# so the saved result was always empty.
play_by_play_by_day = []

# define start and end dates (both inclusive)
start_date = datetime(2022, 1, 1)
end_date = datetime(2023, 12, 31)

# loop over each day in the range
current_date = start_date
while current_date <= end_date:

    print(current_date)

    try:
        # attempt to fetch a Boston home game for the current day
        day_data = client.play_by_play(home_team=Team.BOSTON_CELTICS,
                                       day=current_date.day,
                                       month=current_date.month,
                                       year=current_date.year)

        # if the API call is successful and day_data is not empty
        if day_data:
            # add current date to each dictionary in day_data
            for record in day_data:
                record['date'] = current_date

            # append the day's data (list of dictionaries) to this cell's list
            play_by_play_by_day.extend(day_data)

    except Exception as e:
        # days without a Boston home game are expected to fail; report and continue
        print(f"Error occurred: {e}")

    # add time delay between requests
    time.sleep(30)

    # move to next day
    current_date += timedelta(days=1)

# convert list of dictionaries to a DataFrame
play_by_play_df = pd.DataFrame(play_by_play_by_day)

end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")

In [75]:
# persist the day-by-day play-by-play without the positional index
play_by_play_df.to_csv(path_or_buf='../data/original/play_by_play_2022_2024_Boston_Celtics_by_day.csv', index=False)

# Standings

In [None]:
# preview: final standings of the season ending in 2019, shown as a DataFrame
pd.DataFrame(data=client.standings(season_end_year=2019))

In [None]:
start_time = time.time()

# one dict per team per season
standings = []

# the range is written as dates, but only the year component is used below
start_date = datetime(2000, 1, 1)
end_date = datetime(2023, 12, 31)

# one request per year, from start_date.year through end_date.year inclusive
for season_end_year in range(start_date.year, end_date.year + 1):

    print(season_end_year)

    # fetch the standings for this season
    year_data = client.standings(season_end_year=season_end_year)

    # skip seasons that returned nothing
    if year_data:
        # tag each row with the season it was requested for
        for record in year_data:
            record['season'] = season_end_year

        # append the season's rows (list of dictionaries) to standings
        standings.extend(year_data)

    # add time delay between requests
    time.sleep(30)

# convert list of dictionaries to a DataFrame
standings_df = pd.DataFrame(standings)

end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")

In [27]:
# reorder columns: season first, then the grouping levels, then the record
standings_column_order = ['season', 'conference', 'division', 'team', 'wins', 'losses']
standings_df = standings_df[standings_column_order]

In [28]:
# remove prefixes from categorical columns
columns_to_process = ['team', 'division', 'conference']

for col in columns_to_process:
    # Straight from the scraper these cells are presumably Enum members rather than strings
    # (TODO confirm); after a CSV round-trip they are plain text. Normalise to text first so
    # the substring test below cannot raise TypeError on a non-string value.
    as_text = standings_df[col].map(lambda x: str(x) if pd.notnull(x) else x)
    # keep the part after the 'Class.' prefix; values without a dot pass through unchanged
    standings_df[col] = as_text.map(lambda x: x.split('.')[1] if pd.notnull(x) and '.' in x else x)
    # NOTE(review): str.title() capitalises letters that follow digits, so
    # 'PHILADELPHIA_76ERS' becomes 'Philadelphia 76Ers' -- confirm that is acceptable downstream
    standings_df[col] = standings_df[col].str.replace('_', ' ').str.title()

In [47]:
# persist the standings without the positional index
standings_df.to_csv(path_or_buf='../data/original/br_standings_2000_2024.csv', index=False)

# Player Season Totals (Basic Stats)

In [None]:
# preview: basic per-player totals for the season ending in 2018, shown as a DataFrame
pd.DataFrame(data=client.players_season_totals(season_end_year=2018))

In [None]:
start_time = time.time()

# one dict per player row per season
players_season_totals = []

# the range is written as dates, but only the year component is used below
start_date = datetime(2000, 1, 1)
end_date = datetime(2023, 12, 31)

# one request per year, from start_date.year through end_date.year inclusive
for season_end_year in range(start_date.year, end_date.year + 1):

    print(season_end_year)

    # fetch the basic totals for this season
    year_data = client.players_season_totals(season_end_year=season_end_year)

    # skip seasons that returned nothing
    if year_data:
        # tag each row with the season it was requested for
        for record in year_data:
            record['season'] = season_end_year

        # append the season's rows (list of dictionaries) to players_season_totals
        players_season_totals.extend(year_data)

    # add time delay between requests
    time.sleep(30)

# convert list of dictionaries to a DataFrame
players_season_totals_df = pd.DataFrame(players_season_totals)

end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")

In [56]:
# make 'season' the first column: pop() removes it and hands it back for re-insertion at position 0
players_season_totals_df.insert(0, 'season', players_season_totals_df.pop('season'))

In [57]:
# remove prefixes from categorical columns
columns_to_process = ['team']

for col in columns_to_process:
    # Straight from the scraper these cells are presumably Enum members rather than strings
    # (TODO confirm); after a CSV round-trip they are plain text. Normalise to text first so
    # the substring test below cannot raise TypeError on a non-string value.
    as_text = players_season_totals_df[col].map(lambda x: str(x) if pd.notnull(x) else x)
    # keep the part after the 'Team.' prefix; values without a dot pass through unchanged
    players_season_totals_df[col] = as_text.map(lambda x: x.split('.')[1] if pd.notnull(x) and '.' in x else x)
    # NOTE(review): str.title() capitalises letters that follow digits, so
    # 'PHILADELPHIA_76ERS' becomes 'Philadelphia 76Ers' -- confirm that is acceptable downstream
    players_season_totals_df[col] = players_season_totals_df[col].str.replace('_', ' ').str.title()

In [58]:
# extract strings from 'positions' column

def extract_position(text):
    '''
    Return the first position label found in `text`, title-cased.

    `text` may be the string form of a positions list, such as
    "[<Position.POWER_FORWARD: 'POWER FORWARD'>]" (what a CSV round-trip
    produces), or the in-memory list of Position members itself; anything
    else is converted with str() before searching.

    Examples: 'CENTER' -> 'Center', 'POWER FORWARD' -> 'Power Forward'.
    Only the first position is returned when several are listed.
    Returns None when no position is found (e.g. NaN or an empty list).
    '''
    # str() makes non-string input searchable: a list of Enum members renders each
    # member with its repr, which is the "<Position.X: 'LABEL'>" form the pattern expects
    match = re.search(r"Position\.\w+: '([\w\s]+)'", str(text))
    if match:
        return match.group(1).title()
    return None


# replace each raw 'positions' value with its readable label
players_season_totals_df['positions'] = players_season_totals_df['positions'].map(extract_position)

In [None]:
# show the first five rows of the cleaned basic totals
players_season_totals_df.head(5)

In [49]:
# persist the basic season totals without the positional index
players_season_totals_df.to_csv(path_or_buf='../data/original/br_players_season_totals_basic_2000_2024.csv', index=False)

# Player Season Totals (Advanced Stats)

In [None]:
# preview: advanced per-player totals for the season ending in 2018, shown as a DataFrame
pd.DataFrame(data=client.players_advanced_season_totals(season_end_year=2018))

In [None]:
start_time = time.time()

# one dict per player row per season
players_season_totals_adv = []

# the range is written as dates, but only the year component is used below
start_date = datetime(2000, 1, 1)
end_date = datetime(2023, 12, 31)

# one request per year, from start_date.year through end_date.year inclusive
for season_end_year in range(start_date.year, end_date.year + 1):

    print(season_end_year)

    # fetch the advanced totals for this season
    year_data = client.players_advanced_season_totals(season_end_year=season_end_year)

    # skip seasons that returned nothing
    if year_data:
        # tag each row with the season it was requested for
        for record in year_data:
            record['season'] = season_end_year

        # append the season's rows (list of dictionaries) to players_season_totals_adv
        players_season_totals_adv.extend(year_data)

    # add time delay between requests
    time.sleep(30)

# convert list of dictionaries to a DataFrame
players_season_totals_adv_df = pd.DataFrame(players_season_totals_adv)

end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")

In [62]:
# make 'season' the first column: pop() removes it and hands it back for re-insertion at position 0
players_season_totals_adv_df.insert(0, 'season', players_season_totals_adv_df.pop('season'))

In [64]:
# remove prefixes from categorical columns
columns_to_process = ['team']

for col in columns_to_process:
    # Straight from the scraper these cells are presumably Enum members rather than strings
    # (TODO confirm); after a CSV round-trip they are plain text. Normalise to text first so
    # the substring test below cannot raise TypeError on a non-string value.
    as_text = players_season_totals_adv_df[col].map(lambda x: str(x) if pd.notnull(x) else x)
    # keep the part after the 'Team.' prefix; values without a dot pass through unchanged
    players_season_totals_adv_df[col] = as_text.map(lambda x: x.split('.')[1] if pd.notnull(x) and '.' in x else x)
    # NOTE(review): str.title() capitalises letters that follow digits, so
    # 'PHILADELPHIA_76ERS' becomes 'Philadelphia 76Ers' -- confirm that is acceptable downstream
    players_season_totals_adv_df[col] = players_season_totals_adv_df[col].str.replace('_', ' ').str.title()

In [65]:
# extract strings from 'positions' column
# (same helper as in the basic-stats section, repeated so this section runs on its own)

def extract_position(text):
    '''
    Return the first position label found in `text`, title-cased.

    `text` may be the string form of a positions list, such as
    "[<Position.POWER_FORWARD: 'POWER FORWARD'>]" (what a CSV round-trip
    produces), or the in-memory list of Position members itself; anything
    else is converted with str() before searching.

    Examples: 'CENTER' -> 'Center', 'POWER FORWARD' -> 'Power Forward'.
    Only the first position is returned when several are listed.
    Returns None when no position is found (e.g. NaN or an empty list).
    '''
    # str() makes non-string input searchable: a list of Enum members renders each
    # member with its repr, which is the "<Position.X: 'LABEL'>" form the pattern expects
    match = re.search(r"Position\.\w+: '([\w\s]+)'", str(text))
    if match:
        return match.group(1).title()
    return None


# replace each raw 'positions' value with its readable label
players_season_totals_adv_df['positions'] = players_season_totals_adv_df['positions'].map(extract_position)

In [None]:
# show the first five rows of the cleaned advanced totals
players_season_totals_adv_df.head(5)

In [51]:
# persist the advanced season totals without the positional index
players_season_totals_adv_df.to_csv(path_or_buf='../data/original/br_players_season_totals_advanced_2000_2024.csv', index=False)