### Next Steps
 - Create new dataset with same format from 2022 season as testing data once the model is trained  
 - Perform statistical analysis of results, then revisit model if the results are poor

### Code Improvements
 - Automate directory creation
 - Automate file creation for season averages
 - For dataset creation, consider adding a list of headers as an input so that we can customize the features being selected

### Player Features	Location  
True Shooting %	Advanced Stats -  "TS%"  
Free throw attempt rate	Advanced Stats -  "FTr"  
Box Plus/Minus	Advanced Stats - "BPM"  
Average Points	Basic Stats - "PTS"  
Usage Rate	Advanced Stats - "USG%"  

### Team Stats	Location  
Pace	Team Advanced - "Pace"  
Team Offensive Rating	Team Advanced - "ORtg"  
Opponent FG%	Opponent Team Stats - "FG%"  
Opponent Shots at Rim	Shooting Opponent - "0-3"  
Opponent Shots 3-10ft	Shooting Opponent - "3-10"  
Effective FG% (opponent)	Team Advanced - "eFG%_2"  

### Other	Location  
Home/Away	Scraped Game Logs  

### Output	Location  
Player Points Scored	Scraped Game Logs  




In [274]:
# Column labels for the player shooting table. They are assigned by position
# after the file's own header rows are skipped, so the order must match the
# order of the columns in the CSV.
# NOTE(review): these three constants are defined a second time in a later
# cell of this file; keep the two copies in sync.
player_shooting_header = [
    # player identity and playing time
    'Rk', 'Player', 'Pos', 'Age', 'Team', 'G', 'GS', 'MP',
    # overall shooting
    'FG%', 'Dist.',
    # "FGA ..." columns, one per distance bucket
    'FGA 2P', 'FGA 0-3', 'FGA 3-10', 'FGA 10-16', 'FGA 16-3P', 'FGA 3P',
    # "FG% ..." columns, one per distance bucket
    'FG% 2P', 'FG% 0-3', 'FG% 3-10', 'FG% 10-16', 'FG% 16-3P', 'FG% 3P',
    # assisted shots and dunks
    '% Assisted 2P', '% Assisted 3P', 'FGA Dunk', '# Dunks Attempted',
    # corner threes and the remaining trailing columns
    'FGA Corner 3', 'FG% Corner 3', 'Att.', '#_2',
    'Awards', 'Player_ID',
]

# Column labels for the opponent shooting table, also assigned by position.
# Every label carries an "o" prefix so it cannot collide with a player or
# team column of the same name.
opponent_shooting_header = [
    'oRk', 'oTeam', 'oG', 'oMP',
    'oFG%', 'oAvg Distance',
    'oFGA 2P', 'oFGA 0-3', 'oFGA 3-10', 'oFGA 10-16', 'oFGA 16-3P', 'oFGA 3P',
    'oFG% 2P', 'oFG% 0-3', 'oFG% 3-10', 'oFG% 10-16', 'oFG% 16-3P', 'oFG% 3P',
    'o% Assisted 2P', 'o% Assisted 3P',
    'oFGA Dunk', 'oMade Dunks',
    'oFGA Layup', 'oMade Layup',
    'oFGA Corner 3', 'oFG% Corner 3',
]

# Full team name -> three-letter abbreviation.
abbreviation_map = {
    'Atlanta Hawks': 'ATL',
    'Boston Celtics': 'BOS',
    'Brooklyn Nets': 'BKN',
    'Chicago Bulls': 'CHI',
    'Charlotte Hornets': 'CHA',
    'Cleveland Cavaliers': 'CLE',
    'Dallas Mavericks': 'DAL',
    'Denver Nuggets': 'DEN',
    'Detroit Pistons': 'DET',
    'Golden State Warriors': 'GSW',
    'Houston Rockets': 'HOU',
    'Indiana Pacers': 'IND',
    'Los Angeles Clippers': 'LAC',
    'Los Angeles Lakers': 'LAL',
    'Memphis Grizzlies': 'MEM',
    'Miami Heat': 'MIA',
    'Milwaukee Bucks': 'MIL',
    'Minnesota Timberwolves': 'MIN',
    'New Orleans Pelicans': 'NOP',
    'New York Knicks': 'NYK',
    'Oklahoma City Thunder': 'OKC',
    'Orlando Magic': 'ORL',
    'Philadelphia 76ers': 'PHI',
    'Phoenix Suns': 'PHX',
    'Portland Trail Blazers': 'POR',
    'Sacramento Kings': 'SAC',
    'San Antonio Spurs': 'SAS',
    'Toronto Raptors': 'TOR',
    'Utah Jazz': 'UTA',
    'Washington Wizards': 'WAS',
}


In [275]:
import pandas as pd
import os
import numpy as np


In [276]:

# Preferably all of these files live in the same folder; keep the same naming so that the file paths can be built automatically.
def averages_file_fixup(
    year,
    fplayer_basic,
    fplayer_advanced,
    fplayer_shooting,
    fteam_basic,
    fteam_advanced,
    fopponent_basic,
    fopponent_shooting,
):
    """Load the season-average CSV files and tidy them for feature lookup.

    The two shooting files are read without their first two rows (one junk
    row and the file's own header) and relabelled by position from the
    module-level ``player_shooting_header`` / ``opponent_shooting_header``
    lists. In the four team-level tables the team column has every ``*``
    removed and is then passed through ``abbreviation_map``; names that are
    not keys of the map are left as they are. A ``Player-additional`` column
    in either player table is renamed to ``Player_ID``.

    Args:
        year: Season year; returned unchanged.
        fplayer_basic: Path to the player basic averages CSV.
        fplayer_advanced: Path to the player advanced averages CSV.
        fplayer_shooting: Path to the player shooting averages CSV.
        fteam_basic: Path to the team basic averages CSV.
        fteam_advanced: Path to the team advanced averages CSV.
        fopponent_basic: Path to the opponent basic averages CSV.
        fopponent_shooting: Path to the opponent shooting averages CSV.

    Returns:
        The tuple ``(year, player_basic, player_advanced, player_shooting,
        team_basic, team_advanced, opponent_basic, opponent_shooting)``.

    Raises:
        ValueError: If a shooting table does not end up with exactly as many
            columns as its header list has labels.
    """
    # Shooting tables: discard the two leading rows and any all-blank column,
    # then label whatever is left by position.
    player_shooting = (
        pd.read_csv(fplayer_shooting, skiprows=2, header=None)
        .dropna(axis=1, how='all')
    )
    player_shooting.columns = player_shooting_header

    opponent_shooting = (
        pd.read_csv(fopponent_shooting, skiprows=2, header=None)
        .dropna(axis=1, how='all')
    )
    # Keep only columns whose label is not NaN, then renumber the rows.
    has_label = ~opponent_shooting.columns.isna()
    opponent_shooting = opponent_shooting.loc[:, has_label].reset_index(drop=True)
    opponent_shooting.columns = opponent_shooting_header

    # These files are read with their own header row.
    player_basic = pd.read_csv(fplayer_basic)
    player_advanced = pd.read_csv(fplayer_advanced)
    team_basic = pd.read_csv(fteam_basic)
    team_advanced = pd.read_csv(fteam_advanced)
    opponent_basic = pd.read_csv(fopponent_basic)

    # Normalise the team column of every team-level table: drop the '*'
    # marker and swap full names for abbreviations.
    team_tables = (
        (team_basic, 'Team'),
        (team_advanced, 'Team'),
        (opponent_basic, 'Team'),
        (opponent_shooting, 'oTeam'),
    )
    for table, team_col in team_tables:
        without_marker = table[team_col].str.replace('*', '', regex=False)
        table[team_col] = without_marker.replace(abbreviation_map)

    # rename() ignores labels that are absent, so no membership check is needed.
    for table in (player_basic, player_advanced):
        table.rename(columns={'Player-additional': 'Player_ID'}, inplace=True)

    return (
        year,
        player_basic,
        player_advanced,
        player_shooting,
        team_basic,
        team_advanced,
        opponent_basic,
        opponent_shooting,
    )


    

In [277]:
def fix_game_log_excel(fgame_log):
    """Load one player's game log and normalise it for feature extraction.

    Steps applied:
      * the abbreviations ``PHO``, ``CHO`` and ``BRK`` in ``Tm`` and ``Opp``
        are replaced with ``PHX``, ``CHA`` and ``BKN``, the spellings used by
        the season-average tables;
      * ``Court`` is mapped to 1 for ``"home"`` and 0 for ``"away"`` (any
        other value becomes NaN);
      * ``MP`` ("MM:SS" text) is replaced by ``MP_decimal``, minutes as a
        float. Values that cannot be parsed as "MM:SS" become 0.

    Args:
        fgame_log: Anything ``pd.read_excel`` accepts (typically a path), or
            an already-loaded DataFrame. A DataFrame is copied, never
            modified in place.

    Returns:
        ``(game_log, player_id)`` where ``player_id`` is the ``Player_ID``
        value of the first row.

    Raises:
        KeyError: If a required column is missing.
        IndexError: If the game log has no rows.
    """
    if isinstance(fgame_log, pd.DataFrame):
        game_log = fgame_log.copy()
    else:
        game_log = pd.read_excel(fgame_log)

    # Charlotte is "CHO" in the game logs and "CHA" in the averages tables;
    # mapping it to "CHI" would attach Chicago's numbers to Charlotte games.
    correct_abr = {"PHO": "PHX", "CHO": "CHA", "BRK": "BKN"}
    for team_col in ("Opp", "Tm"):
        game_log[team_col] = game_log[team_col].replace(correct_abr)

    # Home/away as binary data: HOME = 1, AWAY = 0.
    binary_court = {"home": 1, "away": 0}
    game_log["Court"] = game_log["Court"].map(binary_court)

    # Minutes played as a decimal number of minutes.
    mp_split = game_log["MP"].astype(str).str.split(":", expand=True)
    # If no value contains ':' the split yields a single column; make sure
    # both the minutes (0) and seconds (1) columns exist before reading them.
    mp_split = mp_split.reindex(columns=[0, 1])
    minutes = pd.to_numeric(mp_split[0], errors="coerce")
    seconds = pd.to_numeric(mp_split[1], errors="coerce")
    game_log["MP_decimal"] = (minutes + seconds / 60).fillna(0)
    game_log = game_log.drop(columns=["MP"])

    player_id = game_log.iloc[0]["Player_ID"]
    return game_log, player_id


In [278]:
def _fill_features_from_lookup(merged_df, lookup, lookup_key, row_key, features):
    """Copy ``features`` from the first matching ``lookup`` row into ``merged_df``.

    Every name in ``features`` is first (re)created in ``merged_df`` as a
    column of ``None``. Then, for each row of ``merged_df``, the first row of
    ``lookup`` whose ``lookup_key`` equals that row's ``row_key`` supplies the
    values. Rows without a match, and features that ``lookup`` does not have,
    keep ``None``. ``merged_df`` is modified in place.
    """
    for col in features:
        merged_df[col] = None

    # Which features exist does not depend on the row, so decide it once.
    available = [feature for feature in features if feature in lookup.columns]

    for index, key in merged_df[row_key].items():
        matches = lookup[lookup[lookup_key] == key]
        if matches.empty:
            continue
        first_match = matches.iloc[0]
        for feature in available:
            merged_df.at[index, feature] = first_match[feature]


def extract_features(player_id, L_game_log_features, player_basic, player_advanced, player_shooting, team_basic, team_advanced, opponent_basic, opponent_shooting,
    #Game Log
    L_curr_player_features,
    #Player Stats
    L_player_advanced_features,
    L_player_basic_features,
    L_player_shooting_features,
    #Team Features
    L_team_basic_features,
    L_team_advanced_features,
    #Opponent Features
    L_opponent_features,
    L_opponent_shooting_features,
    ):
    """Build one per-game feature table for a single player.

    The selected game-log columns are left-merged with the player's advanced
    and basic season averages on ``Player_ID``. Season-level columns are then
    looked up for every game: player shooting by ``Player_ID``, team tables
    by the game's ``Tm`` and opponent tables by the game's ``Opp``. Finally
    the ``Player_ID``, ``Tm`` and ``Opp`` columns are dropped.

    When both the game log and the basic averages contribute a ``PTS``
    column, the game value is renamed ``final_PTS`` and the season average
    ``avg_PTS``.

    Args:
        player_id: Not used by this function; kept for call compatibility.
        L_game_log_features: Despite the name, this is the game-log
            DataFrame itself. It must provide ``Player_ID``, ``Tm`` and
            ``Opp`` among the selected columns.
        player_basic, player_advanced, player_shooting: Player season tables
            with a ``Player_ID`` column.
        team_basic, team_advanced, opponent_basic: Team season tables with a
            ``Team`` column.
        opponent_shooting: Opponent shooting table with an ``oTeam`` column.
        L_curr_player_features: Column names to take from the game log.
        L_player_advanced_features: Columns to take from ``player_advanced``
            (must include ``Player_ID``).
        L_player_basic_features: Columns to take from ``player_basic`` (must
            include ``Player_ID``).
        L_player_shooting_features: Columns to look up in ``player_shooting``.
        L_team_basic_features: Columns to look up in ``team_basic``.
        L_team_advanced_features: Columns to look up in ``team_advanced``.
        L_opponent_features: Columns to look up in ``opponent_basic``.
        L_opponent_shooting_features: Columns to look up in
            ``opponent_shooting``.

    Returns:
        A new DataFrame with one row per merged game row. Looked-up columns
        hold ``None`` where no matching season row was found.
    """
    # Game log and the two player tables that are merged rather than looked up.
    game_log_features = L_game_log_features[L_curr_player_features]
    player_advanced_features = player_advanced[L_player_advanced_features]
    player_basic_features = player_basic[L_player_basic_features]

    # NOTE(review): if an averages table holds several rows for the same
    # Player_ID, these left merges repeat each game row once per match.
    merged_df = pd.merge(game_log_features, player_advanced_features, on="Player_ID", how="left")
    merged_df = pd.merge(merged_df, player_basic_features, on="Player_ID", how="left")
    merged_df.rename(columns={"PTS_x": "final_PTS", "PTS_y": "avg_PTS"}, inplace=True)

    # (season table, key column in that table, key column in merged_df, features)
    # The order below fixes the column order of the result.
    lookups = (
        (player_shooting, "Player_ID", "Player_ID", L_player_shooting_features),
        (team_advanced, "Team", "Tm", L_team_advanced_features),
        (team_basic, "Team", "Tm", L_team_basic_features),
        (opponent_shooting, "oTeam", "Opp", L_opponent_shooting_features),
        (opponent_basic, "Team", "Opp", L_opponent_features),
    )
    for lookup, lookup_key, row_key, features in lookups:
        _fill_features_from_lookup(merged_df, lookup, lookup_key, row_key, features)

    # The key columns were only needed for the lookups above.
    non_numeric_columns = ["Player_ID", "Tm", "Opp"]
    return merged_df.drop(columns=non_numeric_columns)




## Clean up final datasets
 - Remove duplicated rows
 - Remove rows with empty cells

In [279]:


import os
import pandas as pd

def clean_and_save_datasets(folder_path, output_path):
    """Clean every CSV dataset in a folder and save the results elsewhere.

    Each ``.csv`` file in ``folder_path`` is loaded. A dataset in which at
    least one column is entirely empty is reported and skipped. Otherwise
    every row containing a missing value is dropped and the result is
    written to ``output_path`` as ``testing_data_<n>.csv``, where ``<n>``
    counts the saved datasets from 0.

    Files are processed in sorted name order so that the numbering is the
    same on every run and every platform.

    Parameters:
        folder_path (str): Path to the folder containing raw datasets.
        output_path (str): Path to save cleaned datasets; created if it does
            not exist.

    Returns:
        None
    """
    # Ensure output directory exists
    os.makedirs(output_path, exist_ok=True)

    count = 0
    # os.listdir gives names in arbitrary order; sort them so that a given
    # input file always receives the same output number.
    for game_log in sorted(os.listdir(folder_path)):
        final_dataset_path = os.path.join(folder_path, game_log)
        print(final_dataset_path)

        # Process only CSV files
        if not final_dataset_path.endswith('.csv'):
            continue

        clean_up = pd.read_csv(final_dataset_path)

        # A column with no values at all means a feature lookup failed for
        # the whole dataset, so the dataset is not usable.
        missing_columns = clean_up.columns[clean_up.isnull().all()]
        if len(missing_columns) > 0:
            print(f"The following columns are completely missing in {game_log}: {list(missing_columns)}")
            print(f"The dataset {game_log} will not be saved.")
            continue

        # Remove rows with missing values and save the cleaned dataset
        cleaned_file_path = os.path.join(output_path, f"testing_data_{count}.csv")
        clean_up.dropna().to_csv(cleaned_file_path, index=False)
        count += 1
                



## Code Running

### below is year selection and file paths

In [None]:
## YEAR SELECTION #################################################################################################################################

year = 2023
run = 2

# All inputs and outputs live under this folder. The raw string keeps the
# Windows backslashes literal; in a normal string, sequences such as "\G" or
# "\M" are invalid escapes that newer Python versions warn about.
data_root = r"C:\Github_Repos\MATH2015-Linear-Regression-Model\Linear_Regression_Data"

# Every season-average file is named "<yy>_<yy+1>_<Table>_Stats.csv" inside
# "<year>_Averages", e.g. "2023_Averages\23_24_Player_Basic_Stats.csv".
averages_prefix = f"{data_root}\\{year}_Averages\\{year % 1000}_{year % 1000 + 1}_"

#Player
pb = f"{averages_prefix}Player_Basic_Stats.csv"
pa = f"{averages_prefix}Player_Advanced_Stats.csv"
ps = f"{averages_prefix}Player_Shooting_Stats.csv"
#Team
tb = f"{averages_prefix}Team_Basic_Stats.csv"
ta = f"{averages_prefix}Team_Advanced_Stats.csv"
#Opponent
ob = f"{averages_prefix}Opponent_Basic_Stats.csv"
osh = f"{averages_prefix}Opponent_Shooting_Stats.csv"

#Folder paths
game_logs_path = f"{data_root}\\Game Logs\\Game_Logs_{year}"
clean_csv_path = f"{data_root}\\Season Datasets\\Run_{run}\\Final_Datasets_{year}"

########################################################################################################################################
## FEATURES

# Columns taken from each game log. Player_ID, Tm and Opp are the keys used
# to look up the season averages.
L_game_log_features = ["PTS", "Player_ID", "Tm", "Court", "Opp"]

#Player Features (Player_ID is the merge key and must stay in both lists)
L_player_basic_features = ["Player_ID", "PTS", "FGA", "MP"]
L_player_advanced_features = ["Player_ID", "TS%", "FTr", "USG%", "PER"]
L_player_shooting_features = []

#Team Features
L_team_advanced_features = ["ORtg", "eFG%_2", "Pace"]
L_team_basic_features = []  # no team basic features selected for now

#Opponent Features
L_opponent_features = ["FG%"]
# Each column is listed once ('oFG% 3P' used to appear twice; the repeat had
# no effect on the resulting dataset).
L_opponent_shooting_features = [
    'oFG%', 'oFG% 3P', 'oAvg Distance',
    'oFGA 2P', 'oFGA 0-3', 'oFGA 3-10', 'oFGA 10-16', 'oFGA 16-3P', 'oFGA 3P',
    'oFG% 2P', 'oFG% 0-3', 'oFG% 3-10', 'oFG% 10-16', 'oFG% 16-3P',
]


### Debugging Cell

In [281]:
# #Fix up the averages and create dataframes, this only needs to be done once
# year, player_basic, player_advanced,player_shooting, team_basic, team_advanced, opponent_basic, opponent_shooting = averages_file_fixup(year, pb,pa,ps,tb,ta,ob,osh)
# test_gl, id = fix_game_log_excel(r"C:\Github_Repos\MATH2015-Linear-Regression-Model\Linear_Regression_Data\Game Logs\Game_Logs_2021\bookede01_gamelog_2021.xlsx")
# print(test_gl)
# clean_csv = extract_features(id, test_gl, player_basic, player_advanced, player_shooting, team_basic, team_advanced, opponent_basic, opponent_shooting,
#         #Game Log
#         L_game_log_features,
#         #Player Stats
#         L_player_advanced_features,
#         L_player_basic_features,
#         L_player_shooting_features,
#         #Team Features
#         L_team_basic_features,
#         L_team_advanced_features,
#         #Opponent Features
#         L_opponent_features,
#         L_opponent_shooting_features
#         )



### Full Folder Runs

In [282]:

##########################################################################################################################################
## CODE 

# Build the season-average tables once; every game log below reuses them.
year, player_basic, player_advanced, player_shooting, team_basic, team_advanced, opponent_basic, opponent_shooting = averages_file_fixup(year, pb, pa, ps, tb, ta, ob, osh)

# File extensions that pandas.read_excel can open.
workbook_suffixes = (".xls", ".xlsx", ".xlsm", ".xlsb", ".odf", ".ods", ".odt")

# Process the game logs in sorted order so that runs are reproducible.
for game_log in sorted(os.listdir(game_logs_path)):
    # Skip anything that is not a workbook, as well as the "~$..." lock
    # files Excel creates next to an open workbook.
    if game_log.startswith("~$") or not game_log.lower().endswith(workbook_suffixes):
        continue

    # First fix up the game log. `player_id` (rather than `id`) keeps the
    # builtin id() usable in the rest of the notebook.
    gamelog_file_path = os.path.join(game_logs_path, game_log)
    test_gl, player_id = fix_game_log_excel(gamelog_file_path)

    # Extract the features and combine them into one table
    clean_csv = extract_features(player_id, test_gl, player_basic, player_advanced, player_shooting, team_basic, team_advanced, opponent_basic, opponent_shooting,
        #Game Log
        L_game_log_features,
        #Player Stats
        L_player_advanced_features,
        L_player_basic_features,
        L_player_shooting_features,
        #Team Features
        L_team_basic_features,
        L_team_advanced_features,
        #Opponent Features
        L_opponent_features,
        L_opponent_shooting_features
        )

    # A column with no values at all means a lookup failed for every game of
    # this player, so the dataset is not saved.
    missing_columns = clean_csv.columns[clean_csv.isnull().all()]
    if len(missing_columns) > 0:
        print(f"The following columns are completely missing in {game_log}: {list(missing_columns)}")
        print(f"The dataset {game_log} will not be saved.")
        continue

    # Remove rows with missing values
    clean_csv = clean_csv.dropna()

    # Save the cleaned dataset, one file per player
    os.makedirs(clean_csv_path, exist_ok=True)
    cleaned_file_path = os.path.join(clean_csv_path, f"testing_data_{player_id}.csv")
    clean_csv.to_csv(cleaned_file_path, index=False)


##########################################################################################################################################



In [283]:
# NOTE(review): this cell defines the same three constants as an earlier cell
# of this file; keep the two copies in sync.

# Column labels for the player shooting table. They are assigned by position
# after the file's own header rows are skipped, so the order must match the
# order of the columns in the CSV.
player_shooting_header = [
    # player identity and playing time
    'Rk', 'Player', 'Pos', 'Age', 'Team', 'G', 'GS', 'MP',
    # overall shooting
    'FG%', 'Dist.',
    # "FGA ..." columns, one per distance bucket
    'FGA 2P', 'FGA 0-3', 'FGA 3-10', 'FGA 10-16', 'FGA 16-3P', 'FGA 3P',
    # "FG% ..." columns, one per distance bucket
    'FG% 2P', 'FG% 0-3', 'FG% 3-10', 'FG% 10-16', 'FG% 16-3P', 'FG% 3P',
    # assisted shots and dunks
    '% Assisted 2P', '% Assisted 3P', 'FGA Dunk', '# Dunks Attempted',
    # corner threes and the remaining trailing columns
    'FGA Corner 3', 'FG% Corner 3', 'Att.', '#_2',
    'Awards', 'Player_ID',
]

# Column labels for the opponent shooting table, also assigned by position.
# Every label carries an "o" prefix so it cannot collide with a player or
# team column of the same name.
opponent_shooting_header = [
    'oRk', 'oTeam', 'oG', 'oMP',
    'oFG%', 'oAvg Distance',
    'oFGA 2P', 'oFGA 0-3', 'oFGA 3-10', 'oFGA 10-16', 'oFGA 16-3P', 'oFGA 3P',
    'oFG% 2P', 'oFG% 0-3', 'oFG% 3-10', 'oFG% 10-16', 'oFG% 16-3P', 'oFG% 3P',
    'o% Assisted 2P', 'o% Assisted 3P',
    'oFGA Dunk', 'oMade Dunks',
    'oFGA Layup', 'oMade Layup',
    'oFGA Corner 3', 'oFG% Corner 3',
]

# Full team name -> three-letter abbreviation.
abbreviation_map = {
    'Atlanta Hawks': 'ATL',
    'Boston Celtics': 'BOS',
    'Brooklyn Nets': 'BKN',
    'Chicago Bulls': 'CHI',
    'Charlotte Hornets': 'CHA',
    'Cleveland Cavaliers': 'CLE',
    'Dallas Mavericks': 'DAL',
    'Denver Nuggets': 'DEN',
    'Detroit Pistons': 'DET',
    'Golden State Warriors': 'GSW',
    'Houston Rockets': 'HOU',
    'Indiana Pacers': 'IND',
    'Los Angeles Clippers': 'LAC',
    'Los Angeles Lakers': 'LAL',
    'Memphis Grizzlies': 'MEM',
    'Miami Heat': 'MIA',
    'Milwaukee Bucks': 'MIL',
    'Minnesota Timberwolves': 'MIN',
    'New Orleans Pelicans': 'NOP',
    'New York Knicks': 'NYK',
    'Oklahoma City Thunder': 'OKC',
    'Orlando Magic': 'ORL',
    'Philadelphia 76ers': 'PHI',
    'Phoenix Suns': 'PHX',
    'Portland Trail Blazers': 'POR',
    'Sacramento Kings': 'SAC',
    'San Antonio Spurs': 'SAS',
    'Toronto Raptors': 'TOR',
    'Utah Jazz': 'UTA',
    'Washington Wizards': 'WAS',
}