In [43]:
import pandas as pd
import os
from bs4 import BeautifulSoup
import requests
import numpy as np
import time
import random

In [44]:
def scrape_gamelog(player_id, url, year):
    
    #Go to website and open basketball reference page
    response = requests.get(url)

    # Check for response status
    if response.status_code != 200:
        print(f"Failed to retrieve data for {player_id}. Status code: {response.status_code}")
        return None  # Exit function early
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', {'id': 'pgl_basic'})
    headers = [th.text for th in table.find('thead').find_all('th')][1:]
    rows = table.find('tbody').find_all('tr')
    data = []
    # Pull the data from the table
    for row in rows:
        if row.find('th', {'scope': 'row'}) is not None:
            game_date = row.find('th').text
            cells = row.find_all('td')
            cells_data = [cell.text for cell in cells]
            data.append([player_id, game_date] + cells_data)
    headers = ['Player_ID', 'Date'] + headers
    df = pd.DataFrame(data, columns=headers)
    # Remove rows where the player did not play (G column is empty)
    df = df[df['G'] != '']
    # Modify the 7th column based on its content
    df.iloc[:, 6] = df.iloc[:, 6].apply(lambda x: 'away' if '@' in x else 'home')
    # Rename columns
    df.rename(columns={df.columns[8]: 'Court'}, inplace=True)
    # Remove the second column
    df.drop(df.columns[1], axis=1, inplace=True)
    # Define the directory where you want to save the file
    save_directory = f"C:\\Github_Repos\MATH2015-Linear-Regression-Model\\Linear Regression Data\\Game Logs"
    # Ensure the directory exists
    os.makedirs(save_directory, exist_ok=True)
    # Create the full file path
    file_name = os.path.join(save_directory, f'{player_id}_gamelog_{year}.xlsx')
    # Save to Excel with player ID in the file name
    with pd.ExcelWriter(file_name) as writer:
        df.to_excel(writer, index=False, sheet_name='Game Log')

In [45]:
def scrape_list_names(players_path):
    player_data = pd.read_csv(players_path)
    filtered_players = player_data[player_data["MP"] > 8]
    player_list = filtered_players['Player-additional'].tolist()
    return player_list

In [46]:

#opponent_shooting_header_str = (
#'Rk\tTeam\tG\tMP\tFG%\tDist.\t \t2P\tRange_0-3\tRange_3-10\tRange_10-16\tRange_16-3P\t3P\t \t2P\tRange_0-3_Made\tRange_3-10_Made\tRange_10-16_Made\tRange_16-3P_Made\t3P_Made\t \t2P_Made\t3P_Made\t '
#    '\t%FGA\tMd.\t \t%FGA_Made\tMd_Made\t \t%3PA\t3P%')



def update_header(csv_path, header_string):
    """
    Updates the header of the opponent shooting CSV file with the provided header string.

    Args:
        csv_path (str): Path to the opponent shooting CSV file.
        header_string (str): Tab-separated string of column names.
    """
    # Convert the string to a list of headers
    new_header = [col.strip() for col in header_string.split('\t')]

    # Read the CSV without a header
    try:
        data = pd.read_csv(csv_path, header=None, encoding='ISO-8859-1')
    except FileNotFoundError:
        print(f"File not found: {csv_path}")
        return
    except pd.errors.ParserError as e:
        print(f"Error parsing the CSV file: {e}")
        return

    # Assign the new header to the DataFrame
    data.columns = new_header

    # Save the updated CSV
    data.to_csv(csv_path, index=False)
    print(f"Updated CSV file saved with new header: {csv_path}")


### Player Features	Location  
True Shooting %	Advanced Stats -  "TS%"  
Free throw attempt rate	Advanced Stats -  "FTr"  
Box Plus/Minus	Advanced Stats - "BPM"  
Average Points	Basic Stats - "PTS"  
Usage Rate	Advanced Stats - "USG%"  

### Team Stats	Location  
Pace	Team Advanced - "Pace"  
Team Offensive Rating	Team Advanced - "ORtg"  
Opponent FG%	Opponent Team Stats - "FG%"  
Opponent Shots at Rim	Shooting Opponent - "0-3"  
Opponent Shots 3-10ft	Shooting Opponent - "3-10"  
Expected FG% (opponent)	Team Advanced = "eFG%_2"  

### Other	Location  
Home/Away	Scraped Game Logs  

### Output	Location  
Player Points Scored	Scraped Game Logs  




In [None]:
abbreviation_map = {
    'Atlanta Hawks': 'ATL', 'Boston Celtics': 'BOS', 'Brooklyn Nets': 'BKN', 'Chicago Bulls': 'CHI',
    'Charlotte Hornets': 'CHA', 'Cleveland Cavaliers': 'CLE', 'Dallas Mavericks': 'DAL',
    'Denver Nuggets': 'DEN', 'Detroit Pistons': 'DET', 'Golden State Warriors': 'GSW',
    'Houston Rockets': 'HOU', 'Indiana Pacers': 'IND', 'Los Angeles Clippers': 'LAC',
    'Los Angeles Lakers': 'LAL', 'Memphis Grizzlies': 'MEM', 'Miami Heat': 'MIA',
    'Milwaukee Bucks': 'MIL', 'Minnesota Timberwolves': 'MIN', 'New Orleans Pelicans': 'NOP',
    'New York Knicks': 'NYK', 'Oklahoma City Thunder': 'OKC', 'Orlando Magic': 'ORL',
    'Philadelphia 76ers': 'PHI', 'Phoenix Suns': 'PHX', 'Portland Trail Blazers': 'POR',
    'Sacramento Kings': 'SAC', 'San Antonio Spurs': 'SAS', 'Toronto Raptors': 'TOR',
    'Utah Jazz': 'UTA', 'Washington Wizards': 'WAS'
}



def final_file_creation(fgame_log, fplayer_basic, fplayer_advanced, fteam_basic, fteam_advanced, fopponent_basic, fopponent_shooting):
    # Specify the encoding for all CSV files
    curr_player_gamelog = pd.read_excel(fgame_log)
    player_basic = pd.read_csv(fplayer_basic, encoding='ISO-8859-1')
    player_advanced = pd.read_csv(fplayer_advanced, encoding='ISO-8859-1')

    player_id = curr_player_gamelog.iloc[0]["Player_ID"]
    team_basic = pd.read_csv(fteam_basic, encoding='ISO-8859-1')
    team_basic['Team'] = team_basic['Team'].str.replace('*', '', regex=False)
    team_basic['Team'] = team_basic['Team'].replace(abbreviation_map)
    
    team_advanced = pd.read_csv(fteam_advanced, encoding='ISO-8859-1')
    team_advanced['Team'] = team_advanced['Team'].str.replace('*', '', regex=False)
    team_advanced['Team'] = team_advanced['Team'].replace(abbreviation_map)
    
    opponent_basic = pd.read_csv(fopponent_basic, encoding='ISO-8859-1')
    opponent_basic['Team'] = opponent_basic['Team'].str.replace('*', '', regex=False)
    opponent_basic['Team'] = opponent_basic['Team'].replace(abbreviation_map)
    
    opponent_shooting = pd.read_csv(fopponent_shooting, encoding='ISO-8859-1')
    opponent_shooting['Team'] = opponent_shooting['Team'].str.replace('*', '', regex=False)
    opponent_shooting['Team'] = opponent_shooting['Team'].replace(abbreviation_map)

    # Rename column if needed
    if 'Player-additional' in player_basic.columns:
        player_basic.rename(columns={'Player-additional': 'Player_ID'}, inplace=True)

    if 'Player-additional' in player_advanced.columns:
        player_advanced.rename(columns={'Player-additional': 'Player_ID'}, inplace=True)

    # FEATURE EXTRACTION
    curr_player_features = curr_player_gamelog[["PTS","Player_ID","Tm","Court","Opp"]]
    correct_abr = {"PHO":"PHX","CHO":"CHI","BRK":"BKN"}
    for index, row in curr_player_features.iterrows():
        if row["Opp"] in correct_abr:
            curr_player_features.at[index, "Opp"] = correct_abr[row["Opp"]]

    #switch the home vs away to binary - HOME = 1, AWAY = 0
    binary_court = {"home": 1, "away": 0}
    curr_player_features["Court"] = curr_player_features["Court"].map(binary_court)


    player_advanced_features = player_advanced[["Player_ID", "TS%", "FTr", "USG%", "BPM"]]
    player_basic_features = player_basic[["Player_ID", "PTS"]]
    

    team_advaced_features = team_advanced[["Team", "ORtg", "eFG%_2", "Pace"]]
    opponent_features = opponent_basic[["Team", "FG%"]]
    opponent_shooting_features = opponent_shooting[["Team", "Range_0-3", "Range_3-10"]]

    # COMBINE THE DATA FRAMES INTO A FINAL
    merged_df = pd.merge(curr_player_features, player_advanced_features, on="Player_ID", how="left")
    merged_df = pd.merge(merged_df, player_basic_features, on="Player_ID", how="left")
    merged_df.rename(columns = {"PTS_x" : "final_PTS"}, inplace = True)
    merged_df.rename(columns = {"PTS_y" : "avg_PTS"}, inplace = True)
    
    merged_df["ORtg"] = None
    merged_df["eFG%_2"] = None
    merged_df["Pace"] = None
    merged_df["Opponent_FG%"] = None
    merged_df["Range_0-3"] = None
    merged_df["Range_3-10"] = None

    for index, row in merged_df.iterrows():
        team_data = team_advaced_features[team_advaced_features["Team"] == row["Tm"]]
        if not team_data.empty:
            merged_df.at[index, "ORtg"] = team_data.iloc[0]["ORtg"]
            merged_df.at[index, "eFG%_2"] = team_data.iloc[0]["eFG%_2"]
            merged_df.at[index, "Pace"] = team_data.iloc[0]["Pace"]

        opp_data = opponent_features[opponent_features["Team"] == row["Opp"]]
        if not opp_data.empty:
            merged_df.at[index, "Opponent_FG%"] = opp_data.iloc[0]["FG%"]

       
        opp_shooting = opponent_shooting_features[opponent_shooting_features["Team"] == row["Opp"]]
        if not opp_shooting.empty:  
            merged_df.at[index, "Range_0-3"] = opp_shooting.iloc[0]["Range_0-3"]
            merged_df.at[index, "Range_3-10"] = opp_shooting.iloc[0]["Range_3-10"]
        
        team_data = team_advanced[team_advanced["Team"] == row["Tm"]]
    
    #Remove the non numerica columns now that all data has been organized
    #with_names = merged_df #if you want to see pre word removal
    #print(with_names)
    non_numeric_columns = ["Player_ID","Tm", "Opp", "Court"]
    merged_df = merged_df.drop(columns=non_numeric_columns)
    output_dir = r"C:\Github_Repos\MATH2015-Linear-Regression-Model\Linear Regression Data\Final Datasets"
    os.makedirs(output_dir, exist_ok=True)

    # Save the file
    merged_df.to_csv(os.path.join(output_dir, f"Final_Dataset_{player_id}.csv"), index=False)
    
    #print(opponent_shooting_features)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  curr_player_features["Court"] = curr_player_features["Court"].map(binary_court)


In [None]:
# File paths in correct order for final file creation
test_curr_player = r"C:\Github_Repos\MATH2015-Linear-Regression-Model\Linear Regression Data\Game Logs\brunsja01_gamelog_2023.xlsx"
player_basic_stats = r"C:\Github_Repos\MATH2015-Linear-Regression-Model\Linear Regression Data\23_24_Player_Basic_Stats.csv"
player_advanced_stats = r"C:\Github_Repos\MATH2015-Linear-Regression-Model\Linear Regression Data\23_24_Player_Advanced_Stats.csv"
team_basic_stats = r"C:\Github_Repos\MATH2015-Linear-Regression-Model\Linear Regression Data\23_24_Team_Basic_Stats.csv"
team_advanced_stats = r"C:\Github_Repos\MATH2015-Linear-Regression-Model\Linear Regression Data\23_24_Team_Advanced_Stats.csv"
opponent_basic_stats = r"C:\Github_Repos\MATH2015-Linear-Regression-Model\Linear Regression Data\23_24_Opponent_Basic_Stats.csv"
shooting_opponent_stats = r"C:\Github_Repos\MATH2015-Linear-Regression-Model\Linear Regression Data\23_24_Shooting_Opponent_Stats.csv"

# Call the function
final_file_creation(
    test_curr_player,
    player_basic_stats,
    player_advanced_stats,
    team_basic_stats,
    team_advanced_stats,
    opponent_basic_stats,
    shooting_opponent_stats
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  curr_player_features["Court"] = curr_player_features["Court"].map(binary_court)


In [None]:
def scrape_list_names(players_path):
    player_data = pd.read_csv(players_path)
    filtered_players = player_data[player_data["MP"] > 8]
    player_list = filtered_players['Player-additional'].tolist()
    return player_list

# Run Scraper

In [None]:



ids = scrape_list_names(player_basic_stats) #get the IDs of all players who play over 8 minutes a game
ids_not_scraped = ids[33:] #so far I have 32 IDs

weird_names = []
error_names = []
year = 2023
#actual scraping, turn off for now

# for player_id in ids_not_scraped:
#         try:
#             curr_game_log = f"C:\\Github_Repos\\MATH2015-Linear-Regression-Model\\Linear Regression Data\\Game Logs\\{player_id}_gamelog_{year}.xlsx"
#             url = f'https://www.basketball-reference.com/players/{player_id[0]}/{player_id}/gamelog/{year}'
#             print(url)
#             scrape_gamelog(player_id, url, year)
#             time.sleep(random.uniform(5, 15))  # Wait for 2 seconds between request
#         except Exception as e:
#             # Capture any error that occurs and move to the next player
#             error_names.append((player_id, str(e)))
#             print(f"An error occurred with {player_id}: {e}")


https://www.basketball-reference.com/players/m/murrade01/gamelog/2023
https://www.basketball-reference.com/players/t/thomaca02/gamelog/2023
https://www.basketball-reference.com/players/k/kuzmaky01/gamelog/2023
https://www.basketball-reference.com/players/t/townska01/gamelog/2023
https://www.basketball-reference.com/players/s/siakapa01/gamelog/2023
https://www.basketball-reference.com/players/s/siakapa01/gamelog/2023
https://www.basketball-reference.com/players/s/siakapa01/gamelog/2023
https://www.basketball-reference.com/players/w/wembavi01/gamelog/2023
An error occurred with wembavi01: 'NoneType' object has no attribute 'find'
https://www.basketball-reference.com/players/m/murraja01/gamelog/2023
https://www.basketball-reference.com/players/s/sengual01/gamelog/2023
https://www.basketball-reference.com/players/b/bridgmi02/gamelog/2023
An error occurred with bridgmi02: 'NoneType' object has no attribute 'find'
https://www.basketball-reference.com/players/g/grantje01/gamelog/2023
https://