## PLAYER FANTASY POINTS PROJECTIONS

### Importing necessary libraries

In [1]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
import numpy as np
import warnings
from sklearn.exceptions import DataConversionWarning
from sklearn.ensemble import RandomForestRegressor
from nba_api.stats.static import players
from nba_api.stats.endpoints import playergamelog

### Defining functions to extract required information

In [2]:
# Seed NumPy's global random number generator with a fixed value so that
# code drawing from it produces the same sequence on every run.
np.random.seed(123)

# Ignore every DataConversionWarning for the rest of the session.
# NOTE(review): presumably these come from passing targets as single-column
# 2-D arrays to the regressors below — confirm before relying on the filter.
warnings.filterwarnings("ignore", category=DataConversionWarning)

# Function to retrieve the ID of a player by their name
def set_player_id(player_name):
    """
    Look up a player's ID from their full name.

    Parameters:
    player_name (str): Name passed to ``players.find_players_by_full_name``.

    Returns:
    The 'id' value of the first match returned by the lookup, or None when
    the lookup returns nothing.
    """
    matches = players.find_players_by_full_name(player_name)

    # Guard clause: an empty result means the name matched no player.
    if not matches:
        return None

    # When several players match, only the first one is used.
    return matches[0]['id']

# Function to extract the opponent team from the matchup string
def extract_opponent(matchup):
    """
    Return the opponent abbreviation taken from the end of a matchup string.

    Parameters:
    matchup (str): The matchup string; its last three characters are
        treated as the opponent team abbreviation.

    Returns:
    str: The final three characters of ``matchup`` (the whole string when
    it is shorter than three characters).
    """
    # The opponent is read positionally, so the string must end with it.
    return matchup[-3:]

def get_opponent_stats(opp_statistics, opponent_team, column_indices):
    """
    Collect groups of statistics for one opponent team.

    Parameters:
    opp_statistics (DataFrame): Opponent statistics with a 'TEAM' column.
    opponent_team (str): Value to match against the 'TEAM' column.
    column_indices (list of lists): Groups of 1-based column numbers; each
        number ``n`` reads the column at position ``n - 1``.

    Returns:
    list or None: One list of floats per group, each followed by the mean of
    the DataFrame's column at position 2 (computed over all rows). None is
    returned, after printing a message, when no row matches the team.
    """
    team_rows = opp_statistics[opp_statistics['TEAM'] == opponent_team]

    if team_rows.empty:
        print(f"Opponent team '{opponent_team}' not found in the DataFrame.")
        return None

    # Only the first matching row is read; column numbers are shifted by one
    # because callers pass them 1-based.
    per_group = [
        [float(team_rows.iloc[0, number - 1]) for number in group]
        for group in column_indices
    ]

    # Mean of the column at position 2 across every team, appended as the
    # trailing element of each group.
    column_2_mean = np.mean(opp_statistics.iloc[:, 2].values)

    return [group_values + [column_2_mean] for group_values in per_group]

### Defining gradient boosting predictor model

In [3]:
# Function to perform hyperparameter tuning for Gradient Boosting Regressor using mean squared error (MSE)
def gradient_boosting_fit(features, targets):
    """
    Grid-search a GradientBoostingRegressor and return the best estimator.

    Parameters:
    features: Training inputs passed to ``GridSearchCV.fit``.
    targets: Training targets passed to ``GridSearchCV.fit``.

    Returns:
    The ``best_estimator_`` found by the search. Candidates are scored with
    negative mean squared error using ``cv=2``.
    """
    search = GridSearchCV(
        estimator=GradientBoostingRegressor(),
        param_grid={
            'n_estimators': [5, 15, 25, 40],          # number of boosting stages
            'learning_rate': [0.01, 0.05, 0.1, 0.2],  # shrinkage per stage
        },
        scoring='neg_mean_squared_error',
        cv=2,
    )
    search.fit(features, targets)

    # Only the winning model is returned; its parameters and score are not.
    return search.best_estimator_

# Function to make predictions using the best Gradient Boosting model
def gradient_boosting_predictor(features, targets, input_values, fitted_model):
    """
    Predict with an already-fitted model for the given input values.

    Parameters:
    features: 2-D training feature array; only its column count is used.
    targets: Unused; kept for signature compatibility.
    input_values: Scalar, sequence, or 2-D array of values to predict for.
    fitted_model: Object exposing ``predict``.

    Returns:
    Whatever ``fitted_model.predict`` returns for the prepared input.
    """
    # Promote scalars and flat sequences to a single-row 2-D array.
    query = np.atleast_2d(input_values)
    expected_width = features.shape[1]

    # On a width mismatch exactly one trailing column is trimmed; no further
    # check is made that the widths agree afterwards.
    if query.shape[1] != expected_width:
        query = query[:, :-1]

    return fitted_model.predict(query)

# Function to perform Gradient Boosting prediction and return rounded values
def GB_preditction(df, x_columns, y_columns, input_values, best_model):
    """
    Predict with a fitted Gradient Boosting model and round the result.

    Parameters:
    df (DataFrame): Training frame; columns are selected by position.
    x_columns (list of int): Positions of the feature columns in ``df``.
    y_columns (list of int): Positions of the target columns in ``df``.
    input_values: Values to predict for.
    best_model: Fitted model forwarded to ``gradient_boosting_predictor``.

    Returns:
    The predictions rounded to two decimal places.
    """
    train_features = df.iloc[:, x_columns].values
    train_targets = df.iloc[:, y_columns].values

    raw_predictions = gradient_boosting_predictor(
        train_features, train_targets, input_values, best_model
    )
    return np.round(raw_predictions, 2)

### Defining random forest predictor model 

In [4]:
# Function to perform hyperparameter tuning for Random Forest Regressor
def random_forest_fit(features, targets):
    """
    Grid-search a RandomForestRegressor and return the best estimator.

    Parameters:
    features: Training inputs passed to ``GridSearchCV.fit``.
    targets: Training targets passed to ``GridSearchCV.fit``.

    Returns:
    The ``best_estimator_`` found by the search. Candidates are scored with
    negative mean squared error using ``cv=2``.
    """
    search = GridSearchCV(
        estimator=RandomForestRegressor(),
        param_grid={
            'n_estimators': [5, 15, 25, 40],   # number of trees
            'max_depth': [None, 10, 20, 30],   # maximum depth of each tree
        },
        scoring='neg_mean_squared_error',
        cv=2,
    )
    search.fit(features, targets)

    # Only the winning model is returned; its parameters and score are not.
    return search.best_estimator_

# Function to make predictions using the best Random Forest model
def random_forest_predictor(features, targets, input_values, best_model):
    """
    Predict with an already-fitted model for the given input values.

    Parameters:
    features: 2-D training feature array; only its column count is used.
    targets: Unused; kept for signature compatibility.
    input_values: Scalar, sequence, or 2-D array of values to predict for.
    best_model: Object exposing ``predict``.

    Returns:
    Whatever ``best_model.predict`` returns for the prepared input.
    """
    # Promote scalars and flat sequences to a single-row 2-D array.
    query = np.atleast_2d(input_values)
    expected_width = features.shape[1]

    # On a width mismatch exactly one trailing column is trimmed; no further
    # check is made that the widths agree afterwards.
    if query.shape[1] != expected_width:
        query = query[:, :-1]

    return best_model.predict(query)

# Function to perform Random Forest prediction and return rounded values
def RF_preditction(df, x_columns, y_columns, input_values, best_model):
    """
    Predict with a fitted Random Forest model and round the result.

    Parameters:
    df (DataFrame): Training frame; columns are selected by position.
    x_columns (list of int): Positions of the feature columns in ``df``.
    y_columns (list of int): Positions of the target columns in ``df``.
    input_values: Values to predict for.
    best_model: Fitted model forwarded to ``random_forest_predictor``.

    Returns:
    The predictions rounded to two decimal places.
    """
    train_features = df.iloc[:, x_columns].values
    train_targets = df.iloc[:, y_columns].values

    raw_predictions = random_forest_predictor(
        train_features, train_targets, input_values, best_model
    )
    return np.round(raw_predictions, 2)


### Loading player pool data

In [5]:
# Read the player pool from the input CSV.
players_data = pd.read_csv('1_input_basketball.csv')

# Add one NaN-filled column per projected stat; the projection loop fills
# these in player by player.
for stat_column in ('Points', 'Rebounds', 'Assists', '3PM'):
    players_data[stat_column] = np.nan

### Generating projections for each player in the pool

In [6]:
# Season whose game logs are requested for every player.
SEASON = '2023-24'

# Game-log columns removed before merging with the opponent statistics.
GAMELOG_DROP_COLUMNS = ['SEASON_ID', 'Game_ID', 'WL', 'PLUS_MINUS', 'VIDEO_AVAILABLE',
                        'Player_ID', 'FTM', 'FT_PCT']

# One entry per projected stat:
#   (output column in players_data,
#    1-based column numbers in the defence-vs-position CSV (see get_opponent_stats),
#    positional feature columns in the merged game-log frame,
#    positional target column in the merged game-log frame)
# NOTE(review): the merged-frame positions depend on the exact column layout
# of the game log (after the drop above) followed by the opponent CSV columns;
# re-check them if either source changes.
PROJECTION_SPECS = [
    ('Points',   [4, 11, 10], [22, 29, 28], [18]),  # DPTS, DFG%, D3PM
    ('Rebounds', [5],         [23],         [12]),  # DREB
    ('Assists',  [6],         [24],         [13]),  # DAST
    ('3PM',      [10, 11, 4], [28, 29, 22], [6]),   # D3PM, DFG%, DPTS
]

# Opponent statistics are identical for every player at a position, so each
# CSV is read once and reused.
opp_stats_by_position = {}

# Loop through each row in the player data
for index, row in players_data.iterrows():

    # Player name, position and opponent are read by column position.
    player_name = row.iloc[0]
    position = row.iloc[3]
    opponent = row.iloc[5]

    # Load (or reuse) the defence-vs-position table for this position.
    if position not in opp_stats_by_position:
        opp_stats_by_position[position] = pd.read_csv(
            '1_input_rw-def-vs-pos (' + position + ').csv'
        )
    opp_stats = opp_stats_by_position[position]

    # Tonight's opponent statistics, one list per projected stat.
    values_list = get_opponent_stats(
        opp_stats, opponent, [spec[1] for spec in PROJECTION_SPECS]
    )

    # Progress output
    print(player_name)

    # get_opponent_stats returns None (after printing a message) when the
    # opponent is missing from the table; there is nothing to predict from,
    # so leave this player's projections as NaN instead of crashing in zip().
    if values_list is None:
        continue

    player_id = set_player_id(player_name)
    if not player_id:
        continue

    # Game log for the configured season.
    gamelog = playergamelog.PlayerGameLog(player_id=player_id, season=SEASON)
    player_stats = gamelog.get_data_frames()[0]

    # Players with fewer than 3 logged games are skipped.
    if len(player_stats) < 3:
        continue

    player_stats = player_stats.drop(columns=GAMELOG_DROP_COLUMNS)

    # Tag every logged game with its opponent so it can be joined with that
    # opponent's defensive statistics.
    player_stats['TEAM'] = player_stats['MATCHUP'].apply(extract_opponent)

    # NOTE(review): with how='left', games against a team absent from
    # opp_stats get NaN feature values — confirm the abbreviations in both
    # sources always agree.
    merged_df = pd.merge(player_stats, opp_stats, on='TEAM', how='left')

    # Tune both models per stat and average their predictions.
    for (stat_name, _, feature_cols, target_cols), values in zip(PROJECTION_SPECS, values_list):
        # The trailing element appended by get_opponent_stats is not a model
        # input, so it is dropped here.
        x_values = values[:-1]

        train_x = merged_df.iloc[:, feature_cols].values
        train_y = merged_df.iloc[:, target_cols].values

        # Random Forest: tune, then predict for tonight's opponent.
        rf_model = random_forest_fit(train_x, train_y)
        rf = RF_preditction(merged_df, feature_cols, target_cols, x_values, rf_model)

        # Gradient Boosting: tune, then predict for tonight's opponent.
        gb_model = gradient_boosting_fit(train_x, train_y)
        gb = GB_preditction(merged_df, feature_cols, target_cols, x_values, gb_model)

        # The projection is the mean of the two model predictions.
        players_data.at[index, stat_name] = (rf[0] + gb[0]) / 2


Stanley Umude
Gradey Dick
Thomas Bryant
Dalano Banton
Bruce Brown
Jontay Porter
Brice Sensabaugh
Payton Pritchard
Trayce Jackson-Davis
Orlando Robinson
Bobby Portis
Jordan Nwora
Andrew Nembhard
Ochai Agbaji
Shai Gilgeous-Alexander
Keon Ellis
Evan Fournier
Kris Dunn
Taylor Hendricks
Scoot Henderson
Jae Crowder
De'Aaron Fox
Tosan Evbuomwan
Keegan Murray
Myles Turner
Damian Lillard
Tyrese Haliburton
Kyle Lowry
Collin Sexton
Jaden Ivey
James Harden
Jusuf Nurkic
Royce O'Neale
Tobias Harris
Jimmy Butler
Chet Holmgren
Isaac Okoro
Domantas Sabonis
Terry Rozier
Patrick Beverley
Keyonte George
Tyrese Maxey
Malik Monk
Brandin Podziemski
Jonathan Kuminga
GG Jackson II
Kelly Olynyk
Aaron Nesmith
Georges Niang
Desmond Bane
Harrison Barnes
Cade Cunningham
John Collins
Jalen Duren
Ivica Zubac
Kris Murray
Sam Merrill
Marcus Sasser
Malik Beasley
Paul Reed
Walker Kessler
Darius Garland
Jarrett Allen
Devin Booker
Pascal Siakam
Caris LeVert
Jayson Tatum
Jalen Williams
Toumani Camara
Draymond Green
Jaime Ja

#### Calculating projected fantasy points for each player

In [7]:
# Projected fantasy points: weighted sum of the four projected stats.
# A NaN in any of the four columns makes the result NaN for that player.
players_data['final_projections'] = (
    1 * players_data['Points']
    + 1.25 * players_data['Rebounds']
    + 1.5 * players_data['Assists']
    + 0.5 * players_data['3PM']
)

In [8]:
# Drop every row that contains a NaN in any column.
# NOTE(review): presumably meant to remove players without projections, but
# dropna() with no arguments also drops rows with NaN in the input columns —
# confirm that is intended.
players_data = players_data.dropna()

In [9]:
# Display the resulting projections table as the cell output.
players_data

Unnamed: 0,First Name,Last Name,Salary,Position,Team,Opponent,Projection,Points,Rebounds,Assists,3PM,final_projections
0,Stanley Umude,1,4.7,SG,DET,IND,26.45,4.270,3.100,0.030,0.535,8.45750
1,Gradey Dick,1,5.3,SG,TOR,SAC,26.10,6.725,2.100,1.220,1.285,11.82250
2,Thomas Bryant,1,4.7,C,MIA,CLE,22.93,8.035,4.400,0.770,0.040,14.71000
3,Dalano Banton,1,5.3,PG,POR,LAC,25.25,7.430,2.220,3.165,1.030,15.46750
4,Bruce Brown,1,7.0,SG,TOR,SAC,32.81,9.890,4.010,2.805,0.485,19.35250
...,...,...,...,...,...,...,...,...,...,...,...,...
161,Javon Freeman-Liberty,1,4.5,SG,TOR,SAC,0.84,7.845,3.975,1.705,0.215,15.47875
162,Mike Muscala,1,4.4,C,OKC,UTA,0.65,2.885,3.700,0.460,0.415,8.40750
163,Kessler Edwards,1,4.3,SF,SAC,TOR,0.61,4.270,1.030,0.345,0.720,6.43500
164,Rayan Rupert,1,4.4,SG,POR,LAC,0.62,3.940,2.045,1.740,0.355,9.28375


In [10]:
# Write the projections to CSV without the DataFrame index column.
players_data.to_csv('1_output_players_projections.csv', index=False)