In [16]:
# Import libraries and settings
import os
import time
import pandas as pd
import seaborn as sns
import matplotlib as plt
%config InlineBackend.figure_format ='retina'
from bs4 import BeautifulSoup
import requests
from variables import team_dictionary, month_dictionary, month_list
from pprint import pprint

### Capture box scores for each NBA game played in the 22-23 and 23-24 seasons
We will walk the schedule of every game on Basketball Reference (BBref) and save each box score URL so that we can collect the stats of every player.

In [35]:
# first fetch the month links
def get_month_links(start_url):
  """Collect the per-month schedule links from a season schedule page.

  Downloads ``start_url``, takes the season label from the first
  space-separated token of the page's ``<h1>`` heading, and gathers every
  link inside a ``div.filter`` element whose lower-cased text appears in
  ``month_list``.

  Args:
    start_url: URL of a season schedule page.

  Returns:
    ``(month_link_list, season)`` where ``month_link_list`` is a list of
    ``(month_name, absolute_url)`` tuples and ``season`` is the heading's
    first token. ``(None, None)`` is returned when the request fails or the
    page has no ``<h1>`` heading.
  """
  base_url = 'https://www.basketball-reference.com'
  try:
    # A timeout keeps a stalled connection from hanging the notebook cell
    response = requests.get(start_url, timeout=30)
    response.raise_for_status()
  except requests.exceptions.HTTPError as err:
    status_code = getattr(err.response, 'status_code', None)
    if status_code == 429:
      print("Rate limit exceeded. Please try again later.")
    else:
      print(f"HTTP error occurred: {err}")
    return None, None
  except requests.exceptions.RequestException as err:
    # Connection failures and timeouts are not HTTPError instances; report
    # them through the same (None, None) result instead of a traceback
    print(f"Request failed: {err}")
    return None, None

  soup = BeautifulSoup(response.text, 'html.parser')
  heading = soup.find('h1')
  if heading is None:
    print(f"No season heading found at {start_url}")
    return None, None
  season = heading.text.strip().split(' ')[0]

  month_link_list = []
  # Search from the document root so a page without <body> yields an empty
  # list rather than an AttributeError
  for div in soup.find_all('div', class_='filter'):
    for a_tag in div.find_all('a', href=True):
      link_text = a_tag.text.strip().lower()
      if link_text in month_list:
        month_link_list.append((link_text, f"{base_url}{a_tag['href']}"))
  return month_link_list, season
  
# get the links for each box score
def _parse_schedule_date(text):
  """Convert schedule date text such as 'Tue, Oct 18, 2022' to 'YYYYMMDD'.

  Returns None when ``text`` does not split into the three ', '-separated
  parts of a schedule date. The month token is translated through
  ``month_dictionary``; an unknown month raises KeyError.
  """
  parts = text.split(', ')
  if len(parts) != 3:
    return None
  month_day = parts[1].split(' ')
  if len(month_day) != 2:
    return None
  year = parts[2]
  mon = month_dictionary[month_day[0]]
  day = month_day[1].zfill(2)  # single-digit days are left-padded with 0
  return f'{year}{mon}{day}'


def get_box_score_links(month_link_list, delay=10):
  """Collect box score URLs and game dates from each monthly schedule page.

  Every ``<tr>`` of the first ``<tbody>`` on a page is treated as one game.
  A game is recorded only when its row holds both a 'Box Score' link and a
  parsable date link, so the link list and the date list of a page always
  have the same length and line up index by index.
  # NOTE(review): assumes one game per <tr> on the schedule page - confirm.

  Args:
    month_link_list: iterable of ``(month_name, page_url)`` tuples.
    delay: seconds to sleep between two page requests.

  Returns:
    ``(box_link_array, all_dates)``: two lists with one inner list per
    month page, holding absolute box score URLs and date strings
    respectively. ``(None, None)`` is returned as soon as a page request
    fails.
  """
  base_url = 'https://www.basketball-reference.com'
  box_link_array = []
  all_dates = []
  pages = list(month_link_list)

  for page_number, (month, page) in enumerate(pages):
    try:
      response = requests.get(page, timeout=30)
      response.raise_for_status()
    except requests.exceptions.HTTPError as err:
      status_code = getattr(err.response, 'status_code', None)
      if status_code == 429:
        print("Rate limit exceeded. Please try again later.")
      else:
        print(f"HTTP error occurred: {err}")
      return None, None
    except requests.exceptions.RequestException as err:
      # Timeouts / connection errors are reported like HTTP errors
      print(f"Request failed: {err}")
      return None, None

    soup = BeautifulSoup(response.text, 'html.parser')
    schedule_body = soup.find('tbody')
    rows = schedule_body.find_all('tr') if schedule_body is not None else []

    page_link_list = []
    page_date_list = []
    for row in rows:
      box_href = None
      game_date = None
      for anchor in row.find_all('a', href=True):
        anchor_text = anchor.text.strip()
        if anchor_text == 'Box Score':
          box_href = anchor['href']
        elif game_date is None:
          game_date = _parse_schedule_date(anchor_text)
      if box_href is None:
        # No box score link in this row (header row or game without one)
        continue
      if game_date is None:
        print(f'Skipping {box_href}: no game date found in its row')
        continue
      page_link_list.append(f'{base_url}{box_href}')
      page_date_list.append(game_date)

    if not page_link_list:
      print(f'No box scores found for {month}: {page}')
    box_link_array.append(page_link_list)
    all_dates.append(page_date_list)

    # Pause between requests; nothing follows the last page, so skip it there
    if page_number < len(pages) - 1:
      time.sleep(delay)
  return box_link_array, all_dates


# iterate through the box links and dates and extract game data for each player
# from https://medium.com/@HeeebsInc/using-machine-learning-to-predict-daily-fantasy-basketball-scores-part-i-811de3c54a98
# Stat names for the cells that follow the minutes-played cell, in table order
_BOX_SCORE_STAT_COLUMNS = ['FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA',
                           'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK',
                           'TOV', 'PF', 'PTS', '+-']


def _team_name_from_table(table):
  """Return the text before the first '(' on the table's first non-empty line."""
  caption = table.find('caption')
  raw_text = caption.text if caption is not None else table.text
  first_line = next((line for line in raw_text.split('\n') if line.strip()), '')
  return first_line.split('(')[0].strip()


def _format_minutes(raw_minutes):
  """Rewrite 'MM:SS' as 'MM.SS'; values without a colon are returned as text."""
  raw_minutes = str(raw_minutes)
  minutes, colon, seconds = raw_minutes.partition(':')
  return f'{minutes}.{seconds}' if colon else raw_minutes


def _find_basic_tables(soup):
  """Return the two full-game basic box score tables, one per team."""
  # NOTE(review): assumes these tables carry ids of the form
  # 'box-<TEAM>-game-basic' - confirm against a live box score page.
  by_id = soup.find_all(
      'table',
      id=lambda value: bool(value)
      and value.startswith('box-')
      and value.endswith('-game-basic'))
  if len(by_id) == 2:
    return by_id
  # Fall back to fixed positions among the 'sortable stats_table' tables
  # (first table and table index 9) when the id lookup does not find two
  by_position = soup.find_all('table', {'class': "sortable stats_table"})
  return [by_position[0], by_position[9]]


def _player_records(table, date):
  """Build one stat dict per player row of a single team's box score table."""
  team = team_dictionary[_team_name_from_table(table)]
  records = []
  for row in table.find('tbody').find_all('tr'):
    name_cells = row.find_all('th')
    cells = [cell.text.strip() for cell in row.find_all('td')]
    # Rows with fewer than 20 stat cells (e.g. players who did not play,
    # separator rows) carry no stat line
    if not name_cells or len(cells) < 20:
      continue
    cells = [0 if value == '' else value for value in cells]
    record = {'Date': date, 'Name': name_cells[0].text, 'Team': team,
              'MP': _format_minutes(cells[0])}
    # NOTE(review): cells 1-19 are mapped positionally - confirm the live
    # table still lists its columns in this order.
    record.update(zip(_BOX_SCORE_STAT_COLUMNS, cells[1:20]))
    records.append(record)
  return records


# iterate through the box links and dates and extract game data for each player
def extract_player_data(box_links, all_dates, season, delay=10):
  """Scrape every box score and save one row per player per game to CSV.

  Stats are written to ``Season(<season>).csv`` after each month and once
  more at the end. Games that fail for any reason are listed in
  ``Errors_Season(<season>).csv`` and skipped. A game contributes rows only
  when both team tables parse, so the stats file never holds half a game.

  Args:
    box_links: list of per-month lists of box score URLs.
    all_dates: list of per-month lists of date strings, parallel to
      ``box_links``.
    season: label used in the output file names and messages.
    delay: seconds to sleep between two box score requests.

  Returns:
    pandas.DataFrame with every scraped player row (the same data that is
    written to the stats CSV).
  """
  df_columns = ['Date', 'Name', 'Team', 'MP'] + _BOX_SCORE_STAT_COLUMNS
  stats_path = f'Season({season}).csv'
  errors_path = f'Errors_Season({season}).csv'
  message = f'Saved game stats for the {season} season to a csv'

  # Rows are accumulated in plain lists and turned into a DataFrame only when
  # saving; growing a DataFrame with concat per row is quadratic
  records = []
  errors = []
  requests_made = 0

  for month_links, month_dates in zip(box_links, all_dates):
    for link, date in zip(month_links, month_dates):
      if requests_made:
        time.sleep(delay)
      requests_made += 1
      print(f'Currently Scraping {link}')
      try:
        response = requests.get(link, timeout=30)
        # Without this an error page (e.g. 429) would be parsed as a box score
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        game_records = []
        for table in _find_basic_tables(soup):
          game_records.extend(_player_records(table, date))
        records.extend(game_records)
        print(f'Finished Scraping: {link}')
      except Exception as e:
        # Best effort by design: log the failing game and keep going
        print(f'Error Scrapping: {link}')
        errors.append({'URL': link, 'Error': str(e)})
        pd.DataFrame(errors, columns=['URL', 'Error']).to_csv(
            errors_path, lineterminator='\n', index=False)

    # Checkpoint after every month so an interrupted run keeps its progress
    pd.DataFrame(records, columns=df_columns).to_csv(
        stats_path, lineterminator='\n', index=False)
    print(message)

  stat_df = pd.DataFrame(records, columns=df_columns)
  stat_df.to_csv(stats_path, lineterminator='\n', index=False)
  print(message)
  return stat_df



### 2022-23 Season Data Collection

In [5]:
# get list of months along with urls for each
# The schedule URL gets its own name so season_22_23 only ever holds the
# season label returned by get_month_links
season_22_23_url = 'https://www.basketball-reference.com/leagues/NBA_2023_games.html'
month_link_list, season_22_23 = get_month_links(season_22_23_url)
# get_month_links reports a failed request as (None, None); stop here so the
# cells below do not run against missing data
if month_link_list is None:
  raise RuntimeError(f'Could not load the schedule page: {season_22_23_url}')

In [29]:
# Sanity check: show the season label and the (month, url) pairs that
# get_month_links returned before crawling any further
print(f'Season: {season_22_23}')
pprint(month_link_list)

Season: 2022-23
[('october',
  'https://www.basketball-reference.com/leagues/NBA_2023_games-october.html'),
 ('november',
  'https://www.basketball-reference.com/leagues/NBA_2023_games-november.html'),
 ('december',
  'https://www.basketball-reference.com/leagues/NBA_2023_games-december.html'),
 ('january',
  'https://www.basketball-reference.com/leagues/NBA_2023_games-january.html'),
 ('february',
  'https://www.basketball-reference.com/leagues/NBA_2023_games-february.html'),
 ('march',
  'https://www.basketball-reference.com/leagues/NBA_2023_games-march.html'),
 ('april',
  'https://www.basketball-reference.com/leagues/NBA_2023_games-april.html'),
 ('may',
  'https://www.basketball-reference.com/leagues/NBA_2023_games-may.html'),
 ('june',
  'https://www.basketball-reference.com/leagues/NBA_2023_games-june.html')]


In [24]:
# Get the box score links and game dates for each calendar month.
# box_scores and dates are parallel lists with one inner list per month page.
# The function pauses between page requests, so this cell takes a while, and
# it returns (None, None) when a page request fails with an HTTP error.
box_scores, dates = get_box_score_links(month_link_list)

In [30]:
# Inspect the scraped results: first the box score URLs, then the game dates
# (both are nested lists grouped by month page, so the output is long)
pprint(box_scores)
pprint(dates)

[['https://www.basketball-reference.com/boxscores/202210180BOS.html',
  'https://www.basketball-reference.com/boxscores/202210180GSW.html',
  'https://www.basketball-reference.com/boxscores/202210190DET.html',
  'https://www.basketball-reference.com/boxscores/202210190IND.html',
  'https://www.basketball-reference.com/boxscores/202210190ATL.html',
  'https://www.basketball-reference.com/boxscores/202210190BRK.html',
  'https://www.basketball-reference.com/boxscores/202210190MEM.html',
  'https://www.basketball-reference.com/boxscores/202210190MIA.html',
  'https://www.basketball-reference.com/boxscores/202210190TOR.html',
  'https://www.basketball-reference.com/boxscores/202210190MIN.html',
  'https://www.basketball-reference.com/boxscores/202210190SAS.html',
  'https://www.basketball-reference.com/boxscores/202210190UTA.html',
  'https://www.basketball-reference.com/boxscores/202210190PHO.html',
  'https://www.basketball-reference.com/boxscores/202210190SAC.html',
  'https://www.baske

##### Extract player stats for 22-23 season

In [None]:
# TODO: extract player stats from each box score for the 22-23 season.
# BUG: fix dataframe error.
# Results are written to Season(<season>).csv; games that fail are listed in
# Errors_Season(<season>).csv.
extract_player_data(box_scores, dates, season_22_23)

### 2023-24 Season Data Collection

In [None]:
# TODO get 2023-24 season data

#### MISC

In [40]:

# Season schedule page used below to try out the rate-limit-aware request helper
start_url = 'https://www.basketball-reference.com/leagues/NBA_2023_games.html'

# Function to make a request with rate limit handling
def get_response_with_backoff(url, max_retries=5, default_wait=60, timeout=30):
    """GET ``url``, waiting and retrying while the server answers 429.

    After a 429 response the function sleeps for the number of seconds given
    in the ``Retry-After`` header. When the header is missing, or is not a
    plain integer (the header may also carry an HTTP date), it sleeps for
    ``default_wait`` seconds instead. Retries are bounded so a persistent
    rate limit cannot loop or recurse forever.

    Args:
        url: address to request.
        max_retries: how many times to retry after the first 429 response.
        default_wait: seconds to sleep when ``Retry-After`` is unusable.
        timeout: seconds passed to ``requests.get`` as its timeout.

    Returns:
        The response object when the status code is 200. ``None`` when the
        request raises a ``requests`` exception, the status code is neither
        200 nor 429, or the rate limit is still in effect after
        ``max_retries`` retries.
    """
    for attempt in range(max_retries + 1):
        try:
            response = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException as e:
            print(f"An error occurred: {e}")
            return None

        if response.status_code == 200:
            return response
        if response.status_code != 429:
            print(f"Received unexpected status code: {response.status_code}")
            return None
        if attempt == max_retries:
            # Out of retries: do not sleep again, just report the failure
            break

        retry_after = response.headers.get('Retry-After')
        try:
            wait_seconds = max(0, int(retry_after))
        except (TypeError, ValueError):
            # Header absent, empty, or not an integer number of seconds
            wait_seconds = None

        if wait_seconds is not None:
            print(f"Rate limit exceeded. Retry after {wait_seconds} seconds.")
        else:
            wait_seconds = default_wait
            print(f"Rate limit exceeded. Retrying after a default {default_wait} seconds.")
        time.sleep(wait_seconds)

    print(f"Rate limit still exceeded after {max_retries} retries. Giving up.")
    return None

# Fetch the schedule page through the backoff helper and report the outcome
response = get_response_with_backoff(start_url)
if not response:
    # The helper hands back a falsy value when it could not get the page
    print("Failed to fetch the URL.")
else:
    # Further processing of a successful response would go here
    print(f"Response Status Code: {response.status_code}")


Response Status Code: 200
