## Data Scraping and Collecting

In [1]:
# Import libraries and settings
import os
import time
import random
import unicodedata
import pandas as pd
# import seaborn as sns
# import matplotlib as plt
%config InlineBackend.figure_format ='retina'
from bs4 import BeautifulSoup
import requests
from requests.exceptions import HTTPError
from variables import team_dictionary, month_dictionary, month_list
from pprint import pprint

## Capture box scores for each NBA game played in the 22-23 and 23-24 seasons
We will look at the schedule of every game on BBref and save the box score urls to get the stats of every player

##### Function - Links for Calendar Month

In [2]:
# first fetch the month links
def get_month_links(start_url):
  base_url = 'https://www.basketball-reference.com'
  month_link_list = []
  try:
    response = requests.get(start_url)
    response.raise_for_status()
  except requests.exceptions.HTTPError as err:
    if response.status_code == 429:
      print("Rate limit exceeded. Please try again later.")
    else:
      print(f"HTTP error occurred: {err}")
    return None, None
  
  soup = BeautifulSoup(response.text, 'html.parser')
  season = soup.find('h1').text
  season = season.strip().split(' ')
  season = season[0]
  body = soup.find('body')
  div_elements = body.find_all('div', class_='filter')
  for div in div_elements:
    a_tags = div.find_all('a', href=True)
    for a_tag in a_tags:
      link_text = a_tag.text.strip().lower()
      if link_text in month_list:
        month_link_list.append((link_text, f"{base_url}{a_tag['href']}"))
  return month_link_list, season

##### Function - Links for each Box Score and Date

In [9]:
def get_box_score_links(month_link_list):
  base_url = 'https://www.basketball-reference.com'
  page_to_check_dict = {'Month': [], 'Url': [], 'Index': []}
  box_link_array = []
  all_dates = []


  for month, page in month_link_list:
    page_link_list = []
    page_date_list = []
    try:
      response = requests.get(page)
      response.raise_for_status()
      soup = BeautifulSoup(response.text, 'html.parser')
      table = soup.find_all('tbody')
      box_scores = table[0].find_all('a', href=True)
      for i in box_scores:
        if i.text.strip() == 'Box Score':
          page_link_list.append(f"{base_url}{i['href']}")
        if ',' in i.text.strip():
            date = i.text.strip()
            date = date.split(', ')
            year = date[2]
            date = date[1].split(' ')
            day = f'0{date[1]}' if len(date[1]) == 1 else date[1]

            mon = month_dictionary[date[0]]
            date = f'{year}{mon}{day}'
            page_date_list.append(date)
      if len(page_link_list) == 0 or len(box_scores)/len(page_link_list) != 4:
          page_to_check_dict['Url'].append(page)
          page_to_check_dict['Month'].append(month)
          page_to_check_dict['Index'].append(len(page_link_list))
      else:
          page_to_check_dict['Url'].append(page)
          page_to_check_dict['Month'].append(month)
          page_to_check_dict['Index'].append(None)
      box_link_array.append(page_link_list)
      all_dates.append(page_date_list)
      time.sleep(10)
    except HTTPError as err:
      if response.status_code == 429:
        print("Rate limit exceeded. Please try again later.")
      else:
        print(f"HTTP error occurred: {err}")
      return None, None
  return box_link_array, all_dates

##### Function - Extract player stats

In [2]:
# iterate through the box links and dates and extract game data for each player
# from https://medium.com/@HeeebsInc/using-machine-learning-to-predict-daily-fantasy-basketball-scores-part-i-811de3c54a98
def extract_player_data(box_links, all_dates, season):
  df_columns = ['Date', 'Name', 'Team', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA',
               '3P%','FT', 'FTA', 'FT%', 'ORB','DRB', 'TRB', 'AST', 'STL', 'BLK', 
               'TOV', 'PF', 'PTS', 'GmSc', '+-' ]
  stat_df = pd.DataFrame(columns = df_columns)
  error_df = pd.DataFrame(columns = ['URL', 'Error'])
  for i, (l, d) in enumerate(zip(box_links, all_dates)):
    print(f'Processing batch {i+1}/{len(box_links)}')
    for link, date in zip(l, d):
      print(f'{link}\n{date}')
      print(f'Currently Scraping {link}')
      
      try:
        response = requests.get(link)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')

        tables = soup.find_all('table', id=lambda x: x and x.endswith('-game-basic'))
        for table in tables:
          caption = table.find('caption')
          team_name = caption.text.split(' Basic and Advanced Stats Table')[0].strip()
          rows = table.find('tbody').find_all('tr')
          for row in rows:
            if row.find('th').text in ['Team Totals', 'Reserves']:
              continue
            player_name = normalize_name(row.find('th').text.strip())
            dnp = row.find('td', {'data-stat': 'reason'})

            stats = [date, player_name, team_name]

            if dnp and 'Did Not Play' in dnp.text:
              stats += ['DNP'] * (len(df_columns) - 3)
            else:
              for td in row.find_all('td'):
                stat = td.text.strip()
                stats.append(stat if stat else 0)
  
            if len(stats) == len(df_columns):
              new_row = pd.DataFrame([stats], columns=df_columns)
              stat_df = pd.concat([stat_df, new_row], ignore_index=True)
            else:
              print(f'Skipping incomplete data for {player_name}')

        print(f'Finished Scraping: {link}')

      except HTTPError as http_err:
        if response.status_code == 429:
          print(f'Rate limit exceeded for {link}, Status code: {response.status_code}')
          print(f"Retry-After: {response.headers.get('Retry-After')}")
          time.sleep(int(response.get('Retry-After'), 60))
        else:
          print(f'HTTP error occurred: {http_err}')
        error = {'URL': link, 'Error': f'HTTPError: {http_err}'}
        new_row = pd.DataFrame([error])
        error_df = pd.concat([error_df, new_row], ignore_index=True)

      except Exception as e:
        print(f"Error: {e}")
        print(f"Exception Type: {type(e).__name__}")
        error_message = str(e) if str(e) else "No error message provided."
        print(f'Error Scraping: {link}')
        error = {'URL': link, 'Error': error_message}
        new_row = pd.DataFrame([error])
        error_df = pd.concat([error_df, new_row], ignore_index=True)
      
      delay = random.uniform(3, 7)
      print(f"Delaying for {delay:.2f} seconds")
      time.sleep(delay)

  stat_df.to_csv(f'Season({season}).csv', lineterminator='\n', index=False)
  error_df.to_csv(f'Errors_Season({season}).csv', lineterminator='\n', index=False)
  
  message = f'Saved game stats for the {season} season to a csv'
  print(message)

      
def normalize_name(name):
    normalized = unicodedata.normalize('NFKD', name)
    without_diacritics = ''.join(c for c in normalized if not unicodedata.combining(c))
    return without_diacritics.lower()

### 2022-23 Season Data Collection

In [21]:
# get list of months along with urls for each
season_22_23 = 'https://www.basketball-reference.com/leagues/NBA_2023_games.html'
month_link_list, season_22_23 = get_month_links(season_22_23)

print(f'Season: {season_22_23}')
pprint(month_link_list)

Season: 2022-23
[('october',
  'https://www.basketball-reference.com/leagues/NBA_2023_games-october.html'),
 ('november',
  'https://www.basketball-reference.com/leagues/NBA_2023_games-november.html'),
 ('december',
  'https://www.basketball-reference.com/leagues/NBA_2023_games-december.html'),
 ('january',
  'https://www.basketball-reference.com/leagues/NBA_2023_games-january.html'),
 ('february',
  'https://www.basketball-reference.com/leagues/NBA_2023_games-february.html'),
 ('march',
  'https://www.basketball-reference.com/leagues/NBA_2023_games-march.html'),
 ('april',
  'https://www.basketball-reference.com/leagues/NBA_2023_games-april.html'),
 ('may',
  'https://www.basketball-reference.com/leagues/NBA_2023_games-may.html'),
 ('june',
  'https://www.basketball-reference.com/leagues/NBA_2023_games-june.html')]


In [9]:
# get box score link for each calendar month
box_scores, dates = get_box_score_links(month_link_list)

In [34]:
pprint(box_scores)
pprint(dates)

[['https://www.basketball-reference.com/boxscores/202210180BOS.html',
  'https://www.basketball-reference.com/boxscores/202210180GSW.html',
  'https://www.basketball-reference.com/boxscores/202210190DET.html',
  'https://www.basketball-reference.com/boxscores/202210190IND.html',
  'https://www.basketball-reference.com/boxscores/202210190ATL.html',
  'https://www.basketball-reference.com/boxscores/202210190BRK.html',
  'https://www.basketball-reference.com/boxscores/202210190MEM.html',
  'https://www.basketball-reference.com/boxscores/202210190MIA.html',
  'https://www.basketball-reference.com/boxscores/202210190TOR.html',
  'https://www.basketball-reference.com/boxscores/202210190MIN.html',
  'https://www.basketball-reference.com/boxscores/202210190SAS.html',
  'https://www.basketball-reference.com/boxscores/202210190UTA.html',
  'https://www.basketball-reference.com/boxscores/202210190PHO.html',
  'https://www.basketball-reference.com/boxscores/202210190SAC.html',
  'https://www.baske

In [3]:
extract_player_data(box_scores, dates, season_22_23)

NameError: name 'box_scores' is not defined

### 2023-24 Season Data Collection

In [24]:
season_23_24 = 'https://www.basketball-reference.com/leagues/NBA_2024_games.html'
month_link_list, season_23_24 = get_month_links(season_23_24)

In [114]:
print(f'Season: {season_23_24}')
pprint(month_link_list)

Season: 2023-24
[('october',
  'https://www.basketball-reference.com/leagues/NBA_2024_games-october.html'),
 ('november',
  'https://www.basketball-reference.com/leagues/NBA_2024_games-november.html'),
 ('december',
  'https://www.basketball-reference.com/leagues/NBA_2024_games-december.html'),
 ('january',
  'https://www.basketball-reference.com/leagues/NBA_2024_games-january.html'),
 ('february',
  'https://www.basketball-reference.com/leagues/NBA_2024_games-february.html'),
 ('march',
  'https://www.basketball-reference.com/leagues/NBA_2024_games-march.html'),
 ('april',
  'https://www.basketball-reference.com/leagues/NBA_2024_games-april.html'),
 ('may',
  'https://www.basketball-reference.com/leagues/NBA_2024_games-may.html'),
 ('june',
  'https://www.basketball-reference.com/leagues/NBA_2024_games-june.html')]


In [25]:
box_scores, dates = get_box_score_links(month_link_list)

In [116]:
pprint(box_scores)
pprint(dates)

[['https://www.basketball-reference.com/boxscores/202310240DEN.html',
  'https://www.basketball-reference.com/boxscores/202310240GSW.html',
  'https://www.basketball-reference.com/boxscores/202310250ORL.html',
  'https://www.basketball-reference.com/boxscores/202310250NYK.html',
  'https://www.basketball-reference.com/boxscores/202310250IND.html',
  'https://www.basketball-reference.com/boxscores/202310250CHO.html',
  'https://www.basketball-reference.com/boxscores/202310250MIA.html',
  'https://www.basketball-reference.com/boxscores/202310250TOR.html',
  'https://www.basketball-reference.com/boxscores/202310250BRK.html',
  'https://www.basketball-reference.com/boxscores/202310250MEM.html',
  'https://www.basketball-reference.com/boxscores/202310250CHI.html',
  'https://www.basketball-reference.com/boxscores/202310250UTA.html',
  'https://www.basketball-reference.com/boxscores/202310250SAS.html',
  'https://www.basketball-reference.com/boxscores/202310250LAC.html',
  'https://www.baske

In [None]:
extract_player_data(box_scores, dates, season_23_24)

# Add additional columns to csv files
- ~~Game Type: Season, Play-In, Playoffs, Finals~~
- ~~Fantasy stats for 22-23 and 23-24 seasons~~
- ~~BBref Game Link for each player~~
- ~~"Opponent" column for each player~~
- ~~"Home" column that has the values '1' if player was on home team or '0' if player on on away team~~

In [95]:
player_22_23_df = pd.read_csv('../data/processed/Season(2022-23).csv')
player_23_24_df = pd.read_csv('../data/processed/Season(2023-24).csv')

In [24]:
player_22_23_df.tail()

Unnamed: 0,Date,Name,Team,MP,FG,FGA,FG%,3P,3PA,3P%,...,BLK,TOV,PF,PTS,GmSc,+-,TeamAbbr,GameLink,Opponent,Home
33601,20230612,vlatko cancar,Denver Nuggets,DNP,DNP,DNP,DNP,DNP,DNP,DNP,...,DNP,DNP,DNP,DNP,DNP,DNP,DEN,https://www.basketball-reference.com/boxscores...,Miami Heat,1
33602,20230612,reggie jackson,Denver Nuggets,DNP,DNP,DNP,DNP,DNP,DNP,DNP,...,DNP,DNP,DNP,DNP,DNP,DNP,DEN,https://www.basketball-reference.com/boxscores...,Miami Heat,1
33603,20230612,zeke nnaji,Denver Nuggets,DNP,DNP,DNP,DNP,DNP,DNP,DNP,...,DNP,DNP,DNP,DNP,DNP,DNP,DEN,https://www.basketball-reference.com/boxscores...,Miami Heat,1
33604,20230612,ish smith,Denver Nuggets,DNP,DNP,DNP,DNP,DNP,DNP,DNP,...,DNP,DNP,DNP,DNP,DNP,DNP,DEN,https://www.basketball-reference.com/boxscores...,Miami Heat,1
33605,20230612,peyton watson,Denver Nuggets,DNP,DNP,DNP,DNP,DNP,DNP,DNP,...,DNP,DNP,DNP,DNP,DNP,DNP,DEN,https://www.basketball-reference.com/boxscores...,Miami Heat,1


In [29]:
# Add game type columns for 2022-23
def determin_game_type(date):
  date = str(date)
  if '20231024' <= date <= '20240414':
    return 'Season'
  elif '20240416' <= date <= '20240419':
    return 'Play-In'
  elif '20240420' <= date <= '20240530':
    return 'Playoffs'
  elif '20240606' <= date <= '20240617':
    return 'Finals'
  else:
    return 'Unknown'
  
player_23_24_df['GameType'] = player_23_24_df['Date'].apply(determin_game_type)
player_23_24_df.to_csv('../data/processed/fantasy_stats_2023_24_with_game_type.csv', index=False)


#### Download CSV files for each unique date for 22-23 and 23-24 seasons

In [11]:
dates_2022_23 = player_22_23_df['Date'].unique()
dates_2023_24 = player_23_24_df['Date'].unique()
def reformat_date(date):
  date_str = str(date)
  return f"{date_str[:4]}-{date_str[4:6]}-{date_str[6:]}"

formatted_dates_2022_23 = [reformat_date(date) for date in dates_2022_23]
formatted_dates_2023_24 = [reformat_date(date) for date in dates_2023_24]

In [20]:
# 2022 - 23
print(len(formatted_dates_2022_23))
print(len([f for f in os.listdir('../data/raw/2022-23') if os.path.isfile(os.path.join('../data/raw/2022-23', f))]))

212
212


In [22]:
# 2023 - 24
print(len(formatted_dates_2023_24))
print(len([f for f in os.listdir('../data/raw/2023-24') if os.path.isfile(os.path.join('../data/raw/2023-24', f))]))

208
208


#### Join player data with fantasy stats data

In [103]:
player_23_24_df.shape

(34126, 29)

In [106]:
player_23_24_df = pd.read_csv('../data/processed/Season(2023-24).csv')

# Ensure the player names are lowercase in the player data
player_23_24_df['Name'] = player_23_24_df['Name'].str.lower()

# Initialize an empty DataFrame to store the combined data
combined_23_24_df = pd.DataFrame()

# Path to the folder containing fantasy stats CSVs
fantasy_stats_path = '../data/raw/2023-24/'

# Iterate over each fantasy stats CSV file
for filename in sorted(os.listdir(fantasy_stats_path)):
    if filename.endswith(".csv"):
        # Extract and reformat the date from the filename
        date = filename.split('.')[0].replace('-', '')
        date_int = int(date)

        # Load the fantasy stats CSV
        fantasy_stats_df = pd.read_csv(os.path.join(fantasy_stats_path, filename))

        # Ensure the player names are lowercase in the fantasy stats
        fantasy_stats_df['player'] = fantasy_stats_df['player'].str.lower()

        # Check if the necessary columns are present
        if 'id' in fantasy_stats_df.columns and 'fpts_fanduel' in fantasy_stats_df.columns:
            # Filter to the relevant columns only
            fantasy_stats_df = fantasy_stats_df[['player', 'id', 'fpts_fanduel']]

            # Add a 'Date' column to the fantasy stats DataFrame
            fantasy_stats_df['Date'] = date_int

            # Filter player data for the current date
            player_data_for_date = player_23_24_df[player_23_24_df['Date'] == date_int]

            # Merge the player data with the fantasy stats
            merged_df = pd.merge(
                player_data_for_date,
                fantasy_stats_df,
                left_on=['Date', 'Name'],
                right_on=['Date', 'player'],
                how='left'
            )

            # Drop the 'player' column from the merged dataframe
            merged_df = merged_df.drop(columns=['player'])

            # Append the merged data to the combined DataFrame
            combined_23_24_df = pd.concat([combined_23_24_df, merged_df], ignore_index=True)
        else:
            # Handle case where necessary columns are missing
            print(f"Filling missing data for file {filename}.")

            # Filter player data for the current date
            player_data_for_date = player_23_24_df[player_23_24_df['Date'] == date_int]

            # Fill 'id' with -1 and 'fpts_fanduel' with 0
            player_data_for_date['id'] = -1
            player_data_for_date['fpts_fanduel'] = 0

            # Append the modified data to the combined DataFrame
            combined_23_24_df = pd.concat([combined_23_24_df, player_data_for_date], ignore_index=True)

# Ensure all necessary rows are kept and clean up the data
combined_23_24_df = combined_23_24_df.drop_duplicates()

# Fill NaN values in 'id' and 'fpts_fanduel' columns for DNP players
combined_23_24_df['id'].fillna(-1, inplace=True)  # Assign -1 to indicate missing IDs
combined_23_24_df['fpts_fanduel'].fillna(0, inplace=True)  # Assign 0 to DNP players

# Convert 'id' field to int
combined_23_24_df['id'] = combined_23_24_df['id'].astype(int)

# Save the combined data to a new CSV file
combined_23_24_df.to_csv('../data/processed/Season(2023-24)_fantasy_stats.csv', index=False)

Filling missing data for file 2023-12-09.csv.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_data_for_date['id'] = -1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_data_for_date['fpts_fanduel'] = 0
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_23_2

In [105]:
print(player_23_24_df.shape)
print(combined_23_24_df.shape)

(34126, 29)
(34098, 31)


In [73]:
# combined_22_23_df['id'] = combined_22_23_df['id'].astype(int)
# combined_22_23_df.to_csv('../data/processed/Season(2022-23)_fantasy_stats.csv', index=False)


In [93]:

grouped_by_id = combined_22_23_df.groupby('id')['Name'].unique().reset_index()

multiple_players_per_id = grouped_by_id[grouped_by_id['Name'].apply(len) > 1]
valid_ids = multiple_players_per_id[multiple_players_per_id['id'] != -1]

# Output ids with more than one player
if not valid_ids.empty:
    print("IDs with multiple players assigned:")
    print(valid_ids)
else:
    print("Each 'id' value has only one player assigned, except for DNPs with 'id' -1.")


Each 'id' value has only one player assigned, except for DNPs with 'id' -1.


# Rate Limit Error

In [41]:

start_url = 'https://www.basketball-reference.com/leagues/NBA_2023_games.html'

# Function to make a request with rate limit handling
def get_response_with_backoff(url):
    try:
        response = requests.get(url)
        if response.status_code == 429:
            retry_after = response.headers.get('Retry-After')
            if retry_after:
                retry_after = int(retry_after)
                print(f"Rate limit exceeded. Retry after {retry_after} seconds.")
                for remaining in range(retry_after, 0, -1):
                    print(f'Retrying in {remaining} seconds...', end='\r')
                    time.sleep(1)
                print('Retrying now...')
            else:
                retry_after = 60
                print("Rate limit exceeded. Retrying after a default 60 seconds.")
                for remaining in range(retry_after, 0, -1):
                    print(f'Retrying in {remaining} seconds...', end='\r')
                    time.sleep(1)
                print('Retrying now...')
            return get_response_with_backoff(url)
        elif response.status_code == 200:
            return response
        else:
            print(f"Received unexpected status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Call the function
response = get_response_with_backoff(start_url)
if response:
    print(f"Response Status Code: {response.status_code}")
else:
    print("Failed to fetch the URL.")


Response Status Code: 200
