In [1]:
import os, sys

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath("__file__"))))
from nbafuns import *
from sklearnex import patch_sklearn
patch_sklearn()
from sklearn.linear_model import RidgeCV, Ridge
from joblib import Parallel, delayed, parallel_backend

# Directory the per-season possession parquet files are read from.
data_DIR = "../fdata/"
# Directory the RAPM result CSV files are written to.
export_DIR = "./fdata/"

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
def build_player_list(posessions):
    """Return the sorted list of distinct player ids found in any lineup slot.

    Parameters
    ----------
    posessions : pandas.DataFrame
        Frame containing the ten lineup columns ``off1``..``off5`` and
        ``def1``..``def5``.

    Returns
    -------
    list
        Every distinct value seen in those ten columns, in ascending order.
    """
    lineup_cols = [
        "off1", "off2", "off3", "off4", "off5",
        "def1", "def2", "def3", "def4", "def5",
    ]
    players = set()
    for col in lineup_cols:
        # Deduplicate every column with .unique() before merging: this avoids
        # iterating a full column in Python and guarantees all ids arrive as
        # the same scalar type, so the resulting list is homogeneous.
        players.update(posessions[col].unique())
    return sorted(players)

@njit
def map_players(stints_x_base, stint_X_rows, players, p_num):
    # Fill the design matrix in place, one stint per row:
    #   * the first five ids of a stint get +1 in column index(player)
    #   * the remaining ids get -1 in column index(player) + p_num
    # `players` supplies the id -> column position lookup via list.index.
    n_stints = len(stints_x_base)
    for stint_idx in range(n_stints):
        lineup = stints_x_base[stint_idx]
        offense = lineup[:5]
        defense = lineup[5:]
        for pid in offense:
            stint_X_rows[stint_idx, players.index(pid)] = 1
        for pid in defense:
            stint_X_rows[stint_idx, p_num + players.index(pid)] = -1
    return stint_X_rows

# Split the possessions frame into the design matrix X (n rows, two columns per
# player) and the target vector y. No weights are produced (not necessary
# because all rows will have 1 possession).
def convert_to_matricies(possessions, name, players):
    """Build the regression inputs from a possessions frame.

    Parameters
    ----------
    possessions : pandas.DataFrame
        Must contain the ten lineup columns and the target column ``name``.
    name : str
        Column holding the target value for each row.
    players : list
        Player ids; a player's position in this list selects its columns.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is an int8 array of shape
        ``(len(possessions), 2 * len(players))`` filled by ``map_players``
        and ``y`` is ``possessions[name]`` as a NumPy array.
    """
    lineup_cols = [
        "off1", "off2", "off3", "off4", "off5",
        "def1", "def2", "def3", "def4", "def5",
    ]
    # Player ids as an int64 matrix, one row per possession
    stints_x_base = possessions[lineup_cols].to_numpy(dtype=np.int64)

    # First p_num columns are the offensive slots, the next p_num the defensive
    p_num = len(players)
    design_shape = (stints_x_base.shape[0], 2 * p_num)
    blank = np.zeros(design_shape, dtype=np.int8)
    stint_X_rows = map_players(stints_x_base, blank, players, p_num)

    # Target values as a NumPy array
    stint_Y_rows = possessions[name].to_numpy()
    return stint_X_rows, stint_Y_rows

# Translate a lambda penalty into the alpha passed to the ridge model
def lambda_to_alpha(lambda_value, samples):
    """Return ``lambda_value * samples / 2``, the alpha matching this lambda."""
    scaled = lambda_value * samples
    return scaled / 2.0

# Inverse of lambda_to_alpha: recover the lambda value from a ridge alpha
def alpha_to_lambda(alpha_value, samples):
    """Return ``alpha_value * 2 / samples``, the lambda matching this alpha."""
    doubled = alpha_value * 2.0
    return doubled / samples

def calculate_rapm(train_x, train_y, lambdas, name, players):
    """Fit a ridge regression and tabulate per-player coefficients.

    Parameters
    ----------
    train_x : array-like, shape (n_samples, 2 * len(players))
        Design matrix; the first ``len(players)`` columns are read back as the
        offensive coefficients and the remaining columns as the defensive ones.
    train_y : array-like, shape (n_samples,)
        Target values.
    lambdas : float
        A single lambda penalty (despite the plural name). It is converted to
        the ridge ``alpha`` with ``lambda_to_alpha`` using the number of rows
        of ``train_x``.
    name : str
        Base metric name used to build the output column names.
    players : list
        Player ids, ordered like the columns of ``train_x``.

    Returns
    -------
    tuple
        ``(players_coef, model)``. ``players_coef`` is a DataFrame with the
        columns ``playerId``, ``O<name>``, ``D<name>``, ``<name>``,
        ``<name>_R``, ``O<name>_R`` and ``D<name>_R``; ``model`` is the fitted
        ``Ridge`` estimator.
    """
    n_players = len(players)
    off_col = "O{0}".format(name)
    def_col = "D{0}".format(name)

    # A plain Ridge with one fixed alpha is used here instead of a
    # cross-validated RidgeCV over several alphas. The target data is not
    # centered at 0, so an intercept is fitted.
    alpha = lambda_to_alpha(lambdas, train_x.shape[0])
    model = Ridge(alpha=alpha, fit_intercept=True).fit(train_x, train_y)

    # One row per player: id, offensive coefficient, defensive coefficient.
    # Stacking ids next to float coefficients promotes the ids to the common
    # array dtype.
    player_id_with_coef = np.column_stack(
        [np.array(players), model.coef_[:n_players], model.coef_[n_players:]]
    )
    players_coef = pd.DataFrame(
        player_id_with_coef, columns=["playerId", off_col, def_col]
    )

    # Add the offensive and defensive components together (we should really be
    # weighing this to the number of offensive and defensive possessions played
    # as they are often not equal).
    players_coef[name] = players_coef[off_col] + players_coef[def_col]

    # Rank each metric so the largest value gets rank 1
    players_coef["{0}_R".format(name)] = players_coef[name].rank(ascending=False)
    players_coef["{0}_R".format(off_col)] = players_coef[off_col].rank(ascending=False)
    players_coef["{0}_R".format(def_col)] = players_coef[def_col].rank(ascending=False)
    return players_coef, model

def get_df(results,season,player_dict):
    """Format a raw coefficient table for export.

    Rounds every value to two decimals, casts the id and the three rank
    columns to integers, maps ``playerId`` to a ``Player`` label through
    ``player_dict``, orders rows by ``RAPM`` (highest first), adds a ``Year``
    column equal to ``int(season) + 1`` and returns only the export columns.
    The input frame is not modified.
    """
    int_columns = {
        "playerId": "int",
        "RAPM_R": "int",
        "ORAPM_R": "int",
        "DRAPM_R": "int",
    }
    export_columns = [
        "Year",
        "Player",
        "ORAPM",
        "ORAPM_R",
        "DRAPM",
        "DRAPM_R",
        "RAPM",
        "RAPM_R",
    ]

    table = np.round(results, decimals=2)
    table = table.reindex(sorted(table.columns), axis=1)
    table = table.astype(int_columns)
    # Resolve ids to labels before the id column is dropped
    table["Player"] = table["playerId"].map(player_dict)
    table = (
        table.sort_values(by=["RAPM"], ascending=False)
        .reset_index(drop=True)
        .drop(columns=["playerId"])
    )
    table["Year"] = int(season)+1
    return table[export_columns]

In [3]:
def single_season_rapm(season, league = "NBA",lambdas = 0.01):
    """Fit RAPM on one season of possessions and export the table as CSV.

    Reads ``<data_DIR><league>_rapm_possessions_<season>.parquet``, fits the
    ridge model on ``100 * pts`` per possession, writes the formatted results
    to ``<export_DIR><league>_RAPM_<season>.csv`` and returns the fitted model
    with a ``year`` attribute set to ``int(season)``.

    ``season`` must be a string (it is concatenated into the file names);
    ``lambdas`` is the single lambda value forwarded to ``calculate_rapm``.
    """
    # Lookup handed to get_df, which uses it to map playerId -> Player
    player_dict = get_players_pbp(league=league)

    source_path = data_DIR + league +"_rapm_possessions_" + season + ".parquet"
    possessions = pd.read_parquet(source_path)
    player_list = build_player_list(possessions)

    # Target: points on the possession, scaled by 100
    possessions["PPP"] = possessions["pts"] * 100
    train_x, train_y = convert_to_matricies(possessions, "PPP", player_list)
    raw_results, model = calculate_rapm(train_x, train_y, lambdas, "RAPM", player_list)

    formatted = get_df(raw_results,season,player_dict)
    target_path = export_DIR + league + "_RAPM_" + season + ".csv"
    formatted.to_csv(target_path,index=False)

    model.year = int(season)
    return model

# Single Season RAPM

In [4]:
# season_start = 2000
# season_end = 2024
# seasons = np.arange(season_start, season_end, 1).astype(str)
# for season in seasons:
#     single_season_rapm(season, league = "NBA",lambdas = 0.01)

# Multiseason RAPM

In [5]:
def multiseason_season_rapm(season, league = "NBA",lambdas = 0.01, multi = 3):
    """Fit RAPM on a window of ``multi`` seasons ending at ``season``.

    The window covers seasons ``int(season) - multi + 1`` through
    ``int(season)``. One possessions parquet file is read per season from
    ``data_DIR``, the frames are concatenated, and a single ridge model is
    fitted on ``100 * pts`` per possession. The formatted table is written to
    ``<export_DIR><league>_RAPM_<first>_<last>.csv`` and the fitted model is
    returned.

    ``lambdas`` is the single lambda value forwarded to ``calculate_rapm``.
    """
    # Lookup handed to get_df, which uses it to map playerId -> Player
    player_dict = get_players_pbp(league=league)

    last_year = int(season)
    seasons = np.arange(last_year - multi + 1, last_year + 1).astype(str)
    season_frames = [
        pd.read_parquet(data_DIR + league +"_rapm_possessions_" + s + ".parquet")
        for s in seasons
    ]
    possessions = pd.concat(season_frames)
    player_list = build_player_list(possessions)

    # Target: points on the possession, scaled by 100
    possessions["PPP"] = possessions["pts"] * 100
    train_x, train_y = convert_to_matricies(possessions, "PPP", player_list)
    raw_results, model = calculate_rapm(train_x, train_y, lambdas, "RAPM", player_list)

    formatted = get_df(raw_results,season,player_dict)
    target_path = export_DIR + league + "_RAPM_" + seasons[0] + "_" + seasons[-1] + ".csv"
    formatted.to_csv(target_path,index=False)
    return model

In [6]:
# Fit and export a multi-season RAPM table for every end season in
# [season_start, season_end).
season_start, season_end = 2002, 2023
seasons = np.arange(season_start, season_end, 1).astype(str)
for season in seasons:
    multiseason_season_rapm(season, lambdas=0.01, league="NBA")