In [66]:
# Import libraries and settings
import os
import time
import random
import unicodedata
import pandas as pd
import seaborn as sns
import matplotlib as plt
%config InlineBackend.figure_format ='retina'
from bs4 import BeautifulSoup
import requests
from requests.exceptions import HTTPError
from variables import team_dictionary, month_dictionary, month_list
from pprint import pprint

### Capture box scores for each NBA game played in the 22-23 and 23-24 seasons
We will walk the full game schedule on Basketball Reference (BBref) and save each box score URL so that we can collect the stats of every player.

##### Function - Links for Calendar Month

In [5]:
# first fetch the month links
def get_month_links(start_url):
  """Collect the per-month schedule links from a season schedule page.

  Args:
    start_url: URL of the season schedule page to download.

  Returns:
    A ``(month_link_list, season)`` tuple. ``month_link_list`` holds one
    ``(month_name, absolute_url)`` tuple for every link found inside a
    ``<div class="filter">`` whose lower-cased text is in ``month_list``.
    ``season`` is the first whitespace-separated word of the page's ``<h1>``.
    ``(None, None)`` is returned when the request fails with an HTTP error
    or when the page has no usable ``<h1>``.
  """
  base_url = 'https://www.basketball-reference.com'
  response = None
  try:
    response = requests.get(start_url)
    response.raise_for_status()
  except requests.exceptions.HTTPError as err:
    # Compare against None explicitly: a requests Response is falsy for
    # 4xx/5xx statuses, so `err.response or response` would pick wrongly.
    failed = getattr(err, 'response', None)
    if failed is None:
      failed = response
    if getattr(failed, 'status_code', None) == 429:
      print("Rate limit exceeded. Please try again later.")
    else:
      print(f"HTTP error occurred: {err}")
    return None, None

  soup = BeautifulSoup(response.text, 'html.parser')

  # split() with no argument tolerates newlines/tabs between heading words.
  heading = soup.find('h1')
  heading_words = heading.text.split() if heading is not None else []
  if not heading_words:
    print(f"Could not read the season from {start_url}")
    return None, None
  season = heading_words[0]

  # Fall back to the whole document when the parser produced no <body>.
  container = soup.find('body')
  if container is None:
    container = soup

  month_link_list = []
  for div in container.find_all('div', class_='filter'):
    for a_tag in div.find_all('a', href=True):
      link_text = a_tag.text.strip().lower()
      if link_text in month_list:
        month_link_list.append((link_text, f"{base_url}{a_tag['href']}"))
  return month_link_list, season

##### Function - Links for each Box Score and Date

In [6]:
def _schedule_date_to_yyyymmdd(link_text):
  """Turn '<prefix>, <month> <day>, <year>' link text into 'YYYYMMDD'.

  The month token is translated through ``month_dictionary`` and the day is
  left-padded to two digits.
  """
  parts = link_text.split(', ')
  year = parts[2]
  month_and_day = parts[1].split(' ')
  month_number = month_dictionary[month_and_day[0]]
  return f'{year}{month_number}{month_and_day[1].zfill(2)}'


def get_box_score_links(month_link_list):
  """Collect box-score URLs and game dates from each monthly schedule page.

  Args:
    month_link_list: iterable of ``(month_name, page_url)`` tuples, as
      returned by ``get_month_links``.

  Returns:
    A ``(box_link_array, all_dates)`` tuple of parallel lists with one inner
    list per month page: the absolute URLs of every 'Box Score' link, and
    the 'YYYYMMDD' string for every link whose text contains a comma.
    Returns ``(None, None)`` as soon as any page fails with an HTTP error.

  Notes:
    Sleeps 10 seconds after each page. ``extract_player_data`` pairs links
    with dates by position, so a warning is printed when the two counts
    differ for a page.
  """
  base_url = 'https://www.basketball-reference.com'
  box_link_array = []
  all_dates = []

  for month, page in month_link_list:
    response = None
    try:
      response = requests.get(page)
      response.raise_for_status()
    except HTTPError as err:
      # Compare against None explicitly: a requests Response is falsy for
      # 4xx/5xx statuses.
      failed = getattr(err, 'response', None)
      if failed is None:
        failed = response
      if getattr(failed, 'status_code', None) == 429:
        print("Rate limit exceeded. Please try again later.")
      else:
        print(f"HTTP error occurred: {err}")
      return None, None

    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('tbody')
    if tables:
      anchors = tables[0].find_all('a', href=True)
    else:
      # A page without a <tbody> contributes empty lists instead of crashing.
      print(f'No schedule table found for {month}: {page}')
      anchors = []

    page_link_list = []
    page_date_list = []
    for anchor in anchors:
      text = anchor.text.strip()
      if text == 'Box Score':
        page_link_list.append(f"{base_url}{anchor['href']}")
      if ',' in text:
        page_date_list.append(_schedule_date_to_yyyymmdd(text))

    if len(page_link_list) != len(page_date_list):
      print(f'Warning: {month} has {len(page_link_list)} box scores but '
            f'{len(page_date_list)} dates: {page}')

    box_link_array.append(page_link_list)
    all_dates.append(page_date_list)
    time.sleep(10)
  return box_link_array, all_dates

##### Function - Extract player stats

In [111]:
def _retry_after_seconds(header_value, default=60):
  """Return the Retry-After header as whole seconds, or ``default``.

  ``default`` is used when the header is missing or is not a plain integer
  (the header may also be sent as an HTTP date).
  """
  try:
    return max(0, int(header_value))
  except (TypeError, ValueError):
    return default


# iterate through the box links and dates and extract game data for each player
# from https://medium.com/@HeeebsInc/using-machine-learning-to-predict-daily-fantasy-basketball-scores-part-i-811de3c54a98
def extract_player_data(box_links, all_dates, season):
  """Scrape per-player basic box-score stats and save them as CSV files.

  Args:
    box_links: list of lists of box-score URLs (one inner list per batch).
    all_dates: list of lists of date strings, parallel to ``box_links``;
      links and dates are paired by position.
    season: label interpolated into the output file names.

  Returns:
    None. Writes ``Season({season}).csv`` with one row per player per game
    and ``Errors_Season({season}).csv`` with one row per URL that failed,
    both in the current working directory.

  Notes:
    Rows whose 'reason' cell contains 'Did Not Play' are stored with 'DNP'
    in every stat column; empty stat cells are stored as 0; rows that do not
    yield exactly one value per column are skipped. A failure on one URL is
    recorded and scraping continues with the next URL. After a 429 the
    function sleeps for the Retry-After value (60 seconds when absent or
    unparsable), and it sleeps a random 3-7 seconds after every URL.
  """
  df_columns = ['Date', 'Name', 'Team', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA',
               '3P%','FT', 'FTA', 'FT%', 'ORB','DRB', 'TRB', 'AST', 'STL', 'BLK',
               'TOV', 'PF', 'PTS', 'GmSc', '+-' ]
  # Rows are accumulated in plain lists and turned into DataFrames once at
  # the end; growing a DataFrame with pd.concat per row is quadratic and
  # warns on concatenation with an empty frame in recent pandas.
  stat_rows = []
  error_rows = []

  for i, (links, dates) in enumerate(zip(box_links, all_dates)):
    print(f'Processing batch {i+1}/{len(box_links)}')
    for link, date in zip(links, dates):
      print(f'{link}\n{date}')
      print(f'Currently Scraping {link}')

      response = None
      try:
        response = requests.get(link)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')

        tables = soup.find_all('table', id=lambda x: x and x.endswith('-game-basic'))
        for table in tables:
          caption = table.find('caption')
          team_name = caption.text.split(' Basic and Advanced Stats Table')[0].strip()
          for row in table.find('tbody').find_all('tr'):
            header = row.find('th')
            if header is None:
              continue
            header_text = header.text.strip()
            # Separator and summary rows carry no player.
            if header_text in ('Team Totals', 'Reserves'):
              continue
            player_name = normalize_name(header_text)
            dnp = row.find('td', {'data-stat': 'reason'})

            stats = [date, player_name, team_name]
            if dnp and 'Did Not Play' in dnp.text:
              stats += ['DNP'] * (len(df_columns) - 3)
            else:
              for td in row.find_all('td'):
                stat = td.text.strip()
                stats.append(stat if stat else 0)

            if len(stats) == len(df_columns):
              stat_rows.append(stats)
            else:
              print(f'Skipping incomplete data for {player_name}')

        print(f'Finished Scraping: {link}')

      except HTTPError as http_err:
        # Compare against None explicitly: a requests Response is falsy for
        # 4xx/5xx statuses.
        failed = getattr(http_err, 'response', None)
        if failed is None:
          failed = response
        status_code = getattr(failed, 'status_code', None)
        if status_code == 429:
          retry_after = failed.headers.get('Retry-After')
          print(f'Rate limit exceeded for {link}, Status code: {status_code}')
          print(f'Retry-After: {retry_after}')
          time.sleep(_retry_after_seconds(retry_after))
        else:
          print(f'HTTP error occurred: {http_err}')
        error_rows.append({'URL': link, 'Error': f'HTTPError: {http_err}'})

      except Exception as e:
        print(f"Error: {e}")
        print(f"Exception Type: {type(e).__name__}")
        error_message = str(e) if str(e) else "No error message provided."
        print(f'Error Scraping: {link}')
        error_rows.append({'URL': link, 'Error': error_message})

      delay = random.uniform(3, 7)
      print(f"Delaying for {delay:.2f} seconds")
      time.sleep(delay)

  stat_df = pd.DataFrame(stat_rows, columns=df_columns)
  error_df = pd.DataFrame(error_rows, columns=['URL', 'Error'])
  stat_df.to_csv(f'Season({season}).csv', lineterminator='\n', index=False)
  error_df.to_csv(f'Errors_Season({season}).csv', lineterminator='\n', index=False)

  message = f'Saved game stats for the {season} season to a csv'
  print(message)

      
def normalize_name(name):
    """Return ``name`` lower-cased with its diacritical marks removed.

    The text is decomposed with Unicode NFKD so that accented letters split
    into a base character plus combining marks; the combining marks are then
    dropped and the remainder is lower-cased.
    """
    decomposed = unicodedata.normalize('NFKD', name)
    kept = []
    for ch in decomposed:
        if unicodedata.combining(ch):
            continue
        kept.append(ch)
    return ''.join(kept).lower()

In [8]:
def get_team_tables(url):
    """Fetch a box-score page and return its basic stats tables keyed by team.

    Tables are selected by ``id`` (any id ending in '-game-basic'), and the
    team name is the part of each table's caption that precedes
    ' Basic and Advanced Stats Table'. Tables without a caption are ignored.
    Each team name is printed as it is found, followed by a summary listing.

    Returns:
        dict mapping team name to the matching table element (empty when no
        table qualifies).
    """
    caption_suffix = " Basic and Advanced Stats Table"

    def _is_basic_box_score(table_id):
        # Tables without an id must not match.
        return bool(table_id) and table_id.endswith('-game-basic')

    page = requests.get(url)
    parsed = BeautifulSoup(page.text, 'html.parser')

    team_tables = {}
    for candidate in parsed.find_all('table', id=_is_basic_box_score):
        heading = candidate.find('caption')
        if not heading:
            continue
        team_name = heading.text.split(caption_suffix)[0].strip()
        print(team_name)
        team_tables[team_name] = candidate

    if not team_tables:
        print("No tables found for the teams.")
        return team_tables

    print("\nFound tables for the teams:")
    for team in team_tables:
        print(f"{team}")
    return team_tables

# Smoke-test get_team_tables on a single box-score page. This performs a live
# HTTP request; the captured output below lists the two teams that were found.
url = "https://www.basketball-reference.com/boxscores/202310240DEN.html"
team_tables = get_team_tables(url)

Los Angeles Lakers
Denver Nuggets

Found tables for the teams:
Los Angeles Lakers
Denver Nuggets


### 2022-23 Season Data Collection

##### Links for each calendar month

In [9]:
# Get the list of (month, url) pairs and the season label for 2022-23.
# The schedule URL gets its own name so that `season_22_23` only ever holds
# the season label returned by get_month_links, never the URL.
season_22_23_url = 'https://www.basketball-reference.com/leagues/NBA_2023_games.html'
month_link_list, season_22_23 = get_month_links(season_22_23_url)

In [None]:
# Sanity check before scraping: show the parsed season label and the
# (month, url) pairs. Both are None if get_month_links hit an HTTP error.
print(f'Season: {season_22_23}')
pprint(month_link_list)

##### Box score links and dates

In [11]:
# Get the box score links and the matching dates for each calendar month.
# get_box_score_links sleeps 10 seconds after every month page, so this cell
# is slow; it returns (None, None) if any page fails with an HTTP error.
box_scores, dates = get_box_score_links(month_link_list)

In [None]:
# Inspect the scraped results: one inner list of URLs / dates per month page.
pprint(box_scores)
pprint(dates)

##### Extract player stats for 22-23 season

In [112]:
# TODO extract player stats from each box score for 22-23 season BUG fix dataframe error
# Long-running: one HTTP request per game with a random 3-7 second pause after
# each. Results are written to Season(<season>).csv and failures to
# Errors_Season(<season>).csv in the working directory; nothing is returned.
extract_player_data(box_scores, dates, season_22_23)

Processing batch 1/9
https://www.basketball-reference.com/boxscores/202210180BOS.html
20221018
Currently Scraping https://www.basketball-reference.com/boxscores/202210180BOS.html
Finished Scraping: https://www.basketball-reference.com/boxscores/202210180BOS.html
Delaying for 6.30 seconds
https://www.basketball-reference.com/boxscores/202210180GSW.html
20221018
Currently Scraping https://www.basketball-reference.com/boxscores/202210180GSW.html
Finished Scraping: https://www.basketball-reference.com/boxscores/202210180GSW.html
Delaying for 4.52 seconds
https://www.basketball-reference.com/boxscores/202210190DET.html
20221019
Currently Scraping https://www.basketball-reference.com/boxscores/202210190DET.html
Finished Scraping: https://www.basketball-reference.com/boxscores/202210190DET.html
Delaying for 3.19 seconds
https://www.basketball-reference.com/boxscores/202210190IND.html
20221019
Currently Scraping https://www.basketball-reference.com/boxscores/202210190IND.html
Finished Scraping

### 2023-24 Season Data Collection

In [None]:
# TODO get 2023-24 season data

### Rate Limit Error

In [96]:

start_url = 'https://www.basketball-reference.com/leagues/NBA_2023_games.html'

def _countdown(seconds):
    """Sleep ``seconds`` whole seconds, printing an in-place countdown."""
    for remaining in range(seconds, 0, -1):
        print(f'Retrying in {remaining} seconds...', end='\r')
        time.sleep(1)
    print('Retrying now...')


# Function to make a request with rate limit handling
def get_response_with_backoff(url, max_retries=5, default_wait=60):
    """GET ``url``, waiting and retrying while the server answers 429.

    Args:
        url: address to request.
        max_retries: how many times a 429 answer is retried before giving up.
        default_wait: seconds to wait when the 429 answer carries no
            Retry-After header, or one that is not a plain integer.

    Returns:
        The response when the status code is 200. ``None`` when the request
        raises, when any other status code is received, or when the server
        is still answering 429 after ``max_retries`` retries.
    """
    retries_used = 0
    # Loop instead of recursing so a long run of 429s cannot exhaust the stack.
    while True:
        try:
            response = requests.get(url)
        except Exception as e:
            print(f"An error occurred: {e}")
            return None

        if response.status_code == 200:
            return response
        if response.status_code != 429:
            print(f"Received unexpected status code: {response.status_code}")
            return None

        if retries_used >= max_retries:
            print(f"Rate limit still exceeded after {max_retries} retries. Giving up.")
            return None
        retries_used += 1

        retry_after = response.headers.get('Retry-After')
        try:
            wait_seconds = max(0, int(retry_after))
        except (TypeError, ValueError):
            # Header missing or sent as an HTTP date: use the default wait.
            wait_seconds = default_wait
            print(f"Rate limit exceeded. Retrying after a default {default_wait} seconds.")
        else:
            print(f"Rate limit exceeded. Retry after {wait_seconds} seconds.")
        _countdown(wait_seconds)

# Try the backoff helper on the season schedule page (live HTTP request).
# get_response_with_backoff returns None on failure, which the else branch reports.
response = get_response_with_backoff(start_url)
if response:
    print(f"Response Status Code: {response.status_code}")
else:
    print("Failed to fetch the URL.")


Response Status Code: 200
