In [14]:
import pandas as pd
import os
import time
import random
import requests
from bs4 import BeautifulSoup

In [15]:
# README: NBA Player Data Filter Code

# Overview
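# This code defines the `scrape_list_names` function, which processes NBA player data from a CSV file.
# It reads the CSV, keeps the rows whose 'MP' value is greater than 8, and returns the
# 'Player-additional' values of those rows as a Python list.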

def scrape_list_names(players_path, min_minutes=8):
    """
    Return the 'Player-additional' values of players whose 'MP' exceeds a threshold.

    Args:
        players_path (str): The file path to the CSV file containing player data.
                           The CSV must include the following columns:
                           - 'MP': Numeric minutes-played value compared against
                             ``min_minutes``.
                           - 'Player-additional': The per-player value that is returned
                             (the scraping loop in this notebook uses it as the player
                             identifier in the game-log URL).
        min_minutes (float, optional): Exclusive lower bound applied to 'MP'.
                           Defaults to 8, the value that used to be hard-coded.

    Output:
        list: A Python list of 'Player-additional' values, in file order, for every
              row where 'MP' > ``min_minutes``. Rows with a missing 'MP' never satisfy
              the comparison and are therefore excluded.

    Raises:
        KeyError: If 'MP' or 'Player-additional' is not a column of the CSV.
    """
    player_data = pd.read_csv(players_path)

    # Fail with a message that names the missing column(s) instead of a bare KeyError.
    required_columns = ("MP", "Player-additional")
    missing_columns = [col for col in required_columns if col not in player_data.columns]
    if missing_columns:
        raise KeyError(
            f"{players_path} is missing required column(s): {missing_columns}"
        )

    plays_enough = player_data["MP"] > min_minutes
    return player_data.loc[plays_enough, "Player-additional"].tolist()

In [16]:
def scrape_gamelog(player_id, url, year, save_directory=None, timeout=30):
    """
    Scrapes the 'pgl_basic' game log table from a page and saves it as an Excel file.

    Args:
        player_id (str): The unique identifier for the player. Written into the
            'Player_ID' column and used in the output file name.
        url (str): The URL of the player's Basketball Reference game log page.
        year (int): The season year. Only used to build the default output directory
            and the output file name; the page that is fetched is determined by ``url``.
        save_directory (str, optional): Directory to write the workbook into. When
            omitted, the original hard-coded
            ``...\\Linear_Regression_Data\\Game Logs\\Game_Logs_{year}`` path is used.
        timeout (float, optional): Seconds to wait for the HTTP response before
            ``requests`` raises. Defaults to 30.

    Output:
        - Saves an Excel workbook (sheet 'Game Log') with the player's game log data.
        - The file is named '{player_id}_gamelog_{year}.xlsx'; the directory is created
          if it does not exist.

    Returns:
        None. When the HTTP status is not 200 a message is printed and nothing is saved.

    Raises:
        ValueError: If the page has no table with id 'pgl_basic', or that table lacks a
            <thead>/<tbody>.
        requests.RequestException: If the request fails or times out.
    """
    # A timeout keeps one stalled connection from hanging the whole scraping loop.
    response = requests.get(url, timeout=timeout)

    # Check for response status
    if response.status_code != 200:
        print(f"Failed to retrieve data for {player_id}. Status code: {response.status_code}")
        return None  # Exit function early

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', {'id': 'pgl_basic'})
    if table is None:
        # Without this guard the failure is an opaque "'NoneType' has no attribute 'find'".
        raise ValueError(f"No 'pgl_basic' table found for {player_id} at {url}")

    table_head = table.find('thead')
    table_body = table.find('tbody')
    if table_head is None or table_body is None:
        raise ValueError(f"'pgl_basic' table for {player_id} at {url} has no thead/tbody")

    # The first header cell is skipped; its per-row counterpart (the row's <th>) is
    # stored under the 'Date' label that is prepended below.
    headers = [th.text for th in table_head.find_all('th')][1:]

    # Pull the data from the table; rows without a <th scope="row"> are skipped.
    data = []
    for row in table_body.find_all('tr'):
        if row.find('th', {'scope': 'row'}) is None:
            continue
        game_date = row.find('th').text
        cells_data = [cell.text for cell in row.find_all('td')]
        data.append([player_id, game_date] + cells_data)

    headers = ['Player_ID', 'Date'] + headers
    df = pd.DataFrame(data, columns=headers)

    # Remove rows where the player did not play (G column is empty).
    # .copy() makes the filtered frame independent so the positional assignment below
    # does not write through a slice of the unfiltered frame.
    df = df[df['G'] != ''].copy()

    # Modify the 7th column based on its content: '@' marks an away game.
    df.iloc[:, 6] = df.iloc[:, 6].apply(lambda x: 'away' if '@' in x else 'home')

    # NOTE(review): the rename and drop below are label-based. If the scraped header row
    # repeats a label (or already contains 'Date'), every column carrying that label is
    # renamed/dropped, not just the one at the given position -- confirm against a live page.
    # Rename the label found at position 8 to 'Court'
    df = df.rename(columns={df.columns[8]: 'Court'})
    # Remove the second column
    df = df.drop(df.columns[1], axis=1)

    # Define the directory where the file is saved. The raw f-string yields the same
    # path as before without relying on invalid escape sequences such as "\G".
    if save_directory is None:
        save_directory = rf"C:\Github_Repos\MATH2015-Linear-Regression-Model\Linear_Regression_Data\Game Logs\Game_Logs_{year}"
    # Ensure the directory exists
    os.makedirs(save_directory, exist_ok=True)
    # Create the full file path
    file_name = os.path.join(save_directory, f'{player_id}_gamelog_{year}.xlsx')
    # Save to Excel with player ID in the file name
    with pd.ExcelWriter(file_name) as writer:
        df.to_excel(writer, index=False, sheet_name='Game Log')

In [None]:

# CSV with player averages from the entire season
player_basic_stats = r"C:\Github_Repos\MATH2015-Linear-Regression-Model\Linear_Regression_Data\2021_Averages\21_22_Player_Basic_Stats.csv"
ids = scrape_list_names(player_basic_stats)  # get the IDs of all players who play over 8 minutes a game
error_names = []  # (player_id, error message) pairs for every player whose scrape raised
# NOTE(review): the averages file is named "21_22" while the game logs are requested
# for 2021 -- confirm both refer to the same season on Basketball Reference.
year = 2021

# Actual scraping: one request per player. This cell is slow (5-15 s pause per player)
# and hits the live site, so skip it when the game logs already exist on disk.
for player_id in ids:
    try:
        url = f'https://www.basketball-reference.com/players/{player_id[0]}/{player_id}/gamelog/{year}'
        print(url)
        scrape_gamelog(player_id, url, year)
    except Exception as e:
        # Capture any error that occurs and move to the next player
        error_names.append((player_id, str(e)))
        print(f"An error occurred with {player_id}: {e}")
    # Wait a random 5-15 seconds between requests. This sits outside the try block so
    # the pause also happens after a failure; otherwise a run of errors would fire
    # requests back-to-back.
    time.sleep(random.uniform(5, 15))


https://www.basketball-reference.com/players/e/embiijo01/gamelog/2021
https://www.basketball-reference.com/players/j/jamesle01/gamelog/2021
https://www.basketball-reference.com/players/a/antetgi01/gamelog/2021
https://www.basketball-reference.com/players/d/duranke01/gamelog/2021
https://www.basketball-reference.com/players/d/doncilu01/gamelog/2021
https://www.basketball-reference.com/players/y/youngtr01/gamelog/2021
https://www.basketball-reference.com/players/d/derozde01/gamelog/2021
https://www.basketball-reference.com/players/i/irvinky01/gamelog/2021
https://www.basketball-reference.com/players/m/moranja01/gamelog/2021
https://www.basketball-reference.com/players/j/jokicni01/gamelog/2021
https://www.basketball-reference.com/players/t/tatumja01/gamelog/2021
https://www.basketball-reference.com/players/b/bookede01/gamelog/2021
https://www.basketball-reference.com/players/m/mitchdo01/gamelog/2021
https://www.basketball-reference.com/players/c/curryst01/gamelog/2021
https://www.basketba

: 