In [1]:
import os, sys

# NOTE: "__file__" is a quoted string here, not the module variable. abspath()
# therefore resolves it against the current working directory, and the two
# dirname() calls end up appending the *parent* of the working directory to
# sys.path. The result depends on where the kernel was started.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath("__file__"))))
# Names such as np, pd, chain, njit and get_players_pbp are used below without
# an explicit import in this cell, so they presumably come from this star
# import -- verify against nbafuns.
from nbafuns import *
from sklearnex import patch_sklearn
# patch_sklearn() is called before the sklearn import on the next line.
patch_sklearn()
from sklearn.linear_model import RidgeCV, Ridge
from joblib import Parallel, delayed, parallel_backend

# Directory the per-season possession parquet files are read from.
data_DIR = "../fdata/"
# Directory prefix the RAPM result CSVs are written under ("single_season/"
# and "multi_season/" sub-folders are appended by the functions below).
# NOTE(review): this is relative to "./" while data_DIR is relative to "../" --
# confirm that the difference is intentional.
export_DIR = "./fdata/"

# Not referenced in the visible part of this notebook.
fig_DIR = "../figs/rapm/"

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
def build_player_list(possessions):
    """Return the sorted list of distinct values found in the first ten columns.

    Parameters
    ----------
    possessions : pandas.DataFrame
        Frame whose first ten columns (by position) hold player ids.

    Returns
    -------
    list
        Every distinct id from those ten columns, in ascending order.
    """
    distinct_ids = set()
    for column_pos in range(10):
        # .unique() keeps the per-column work small before merging into the set.
        distinct_ids.update(possessions.iloc[:, column_pos].unique())
    return sorted(distinct_ids)

@njit
def map_players(stints_x_base, stint_X_rows, players, p_num):
    """Fill the design matrix in place from a matrix of lineup ids.

    For every row of ``stints_x_base`` the first five ids set a ``1`` in the
    column given by their position in ``players``; the remaining ids set a
    ``-1`` in that position shifted right by ``p_num``.

    ``stint_X_rows`` is modified in place and also returned.
    """
    n_rows = len(stints_x_base)
    for row_idx in range(n_rows):
        lineup = stints_x_base[row_idx]
        first_five = lineup[:5]
        remainder = lineup[5:]
        for player_id in first_five:
            plus_col = players.index(player_id)
            stint_X_rows[row_idx, plus_col] = 1
        for player_id in remainder:
            minus_col = players.index(player_id) + p_num
            stint_X_rows[row_idx, minus_col] = -1
    return stint_X_rows

def convert_to_matricies(possessions, name, players):
    """Split a possessions frame into a design matrix and a target vector.

    Parameters
    ----------
    possessions : pandas.DataFrame
        The first ten columns (by position) are read as int64 player ids;
        column ``name`` holds the target values.
    name : str
        Label of the target column.
    players : list
        Player ids; an id's position in this list selects its matrix column.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        An int8 matrix of shape ``(n_rows, 2 * len(players))`` filled by
        ``map_players`` and the target values as a 1-D array. No weights are
        returned.
    """
    # Player-id columns as a plain integer matrix.
    lineup_ids = possessions.iloc[:, :10].to_numpy(dtype=np.int64)

    # Two column blocks per player: one for +1 entries, one for -1 entries.
    p_num = len(players)
    design = np.zeros((lineup_ids.shape[0], 2 * p_num), dtype=np.int8)
    design = map_players(lineup_ids, design, players, p_num)

    # Target values for each row.
    targets = possessions[name].to_numpy()
    return design, targets

def lambda_to_alpha(lambda_value, samples):
    """Convert a lambda value to the alpha passed to the ridge model.

    Computes ``lambda_value * samples / 2``; inverse of ``alpha_to_lambda``.
    """
    scaled = lambda_value * samples
    return scaled / 2.0

def alpha_to_lambda(alpha_value, samples):
    """Convert a ridge alpha back into a lambda value.

    Computes ``2 * alpha_value / samples``; inverse of ``lambda_to_alpha``.
    """
    doubled = alpha_value * 2.0
    return doubled / samples

def calculate_rapm(train_x, train_y, lambdas, name, players, possessions):
    """Fit a fixed-alpha ridge regression and tabulate per-player coefficients.

    Parameters
    ----------
    train_x : numpy.ndarray
        Design matrix with ``2 * len(players)`` columns. The first
        ``len(players)`` coefficients are reported as the offensive part and
        the remaining ones as the defensive part.
    train_y : numpy.ndarray
        1-D target vector, one value per row of ``train_x``.
    lambdas : float
        Regularisation strength; converted to the ridge ``alpha`` with
        ``lambda_to_alpha`` using the number of rows in ``train_x``.
    name : str
        Base name of the output columns (``O{name}``, ``D{name}``, ``{name}``
        and their ``_R`` rank columns).
    players : list
        Player ids, one per coefficient column, in column order.
    possessions : pandas.DataFrame
        Frame whose first ten columns hold player ids; used only to count how
        many rows each player appears in.

    Returns
    -------
    (pandas.DataFrame, Ridge)
        One row per entry of ``players`` with the columns ``playerId``,
        ``O{name}``, ``D{name}``, ``Possessions``, ``{name}`` and the three
        rank columns (all stored as floats), plus the fitted model.
    """
    n_players = len(players)

    # Ridge expects alpha, so translate the lambda first.
    alpha = lambda_to_alpha(lambdas, train_x.shape[0])
    # Plain Ridge with a fixed alpha (no cross-validated search).
    model = Ridge(alpha=alpha, fit_intercept=True).fit(train_x, train_y)

    # Split the coefficient vector into its offensive and defensive halves.
    coef_ = model.coef_
    offensive_coef = coef_[:n_players]
    defensive_coef = coef_[n_players:]

    # Count appearances per player and align them to ``players`` BY ID.
    # Relying on np.unique's sort order only works when ``players`` is exactly
    # the sorted set of ids present in ``possessions``; reindexing keeps the
    # counts attached to the right player regardless of order and yields 0 for
    # a listed player who never appears.
    lineup_ids = possessions.iloc[:, :10].to_numpy()
    appearances = (
        pd.Series(lineup_ids.ravel())
        .value_counts()
        .reindex(players, fill_value=0)
        .to_numpy()
    )

    # Assemble the (n_players x 4) table; mixing ids with float coefficients
    # makes every column float, which callers cast back as needed.
    players_coef = pd.DataFrame(
        np.column_stack(
            [np.asarray(players), offensive_coef, defensive_coef, appearances]
        ),
        columns=["playerId", f"O{name}", f"D{name}", "Possessions"],
    )

    # Total is the plain sum of both parts (ideally this would be weighted by
    # the number of offensive and defensive possessions played, which are
    # often not equal).
    players_coef[name] = players_coef[f"O{name}"] + players_coef[f"D{name}"]

    # Rank each measure, highest value first.
    for column in (name, f"O{name}", f"D{name}"):
        players_coef[f"{column}_R"] = players_coef[column].rank(ascending=False)

    return players_coef, model

def get_df(results,season,player_dict):
    """Turn the raw coefficient table into the final, presentation-ready frame.

    Values are rounded to two decimals, the id / rank / possession columns are
    cast to integers, player names are looked up in ``player_dict``, rows are
    ordered by ``RAPM`` (highest first) and a ``Year`` column equal to
    ``int(season) + 1`` is added.

    Parameters
    ----------
    results : pandas.DataFrame
        Must contain ``playerId``, ``ORAPM``, ``DRAPM``, ``RAPM``, their
        ``_R`` rank columns and ``Possessions``.
    season : str or int
        Season label; anything ``int()`` accepts.
    player_dict : dict
        Mapping from player id to player name.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: Year, Player, ORAPM, ORAPM_R, DRAPM, DRAPM_R,
        RAPM, RAPM_R, Possessions.
    """
    integer_columns = ["playerId", "RAPM_R", "ORAPM_R", "DRAPM_R", "Possessions"]
    output_columns = [
        "Year",
        "Player",
        "ORAPM",
        "ORAPM_R",
        "DRAPM",
        "DRAPM_R",
        "RAPM",
        "RAPM_R",
        "Possessions",
    ]

    table = results.round(decimals=2)
    table = table.reindex(columns=sorted(table.columns))

    for column in integer_columns:
        table[column] = table[column].astype("int")

    table["Player"] = table["playerId"].map(player_dict)

    table = table.sort_values(by=["RAPM"], ascending=False)
    table = table.reset_index(drop=True)
    table["Year"] = int(season) + 1

    return table[output_columns]

# Single Season RAPM

In [3]:
def single_season_rapm(season, league = "NBA",lambdas = 0.01):
    """Compute RAPM for one season, write it to CSV and return the fitted model.

    Reads ``{league}_rapm_possessions_{season}.parquet`` from ``data_DIR``,
    fits the ridge model on ``100 * pts`` and writes the formatted table to
    ``export_DIR + "single_season/"``.

    Parameters
    ----------
    season : str
        Season label used in the file names.
    league : str
        League prefix used in the file names and passed to ``get_players_pbp``.
    lambdas : float
        Regularisation strength handed to ``calculate_rapm``.

    Returns
    -------
    The fitted model, with ``year`` (``int(season)``) and ``results`` (the
    formatted table) attached as attributes.
    """
    id_to_name = get_players_pbp(league=league)

    source_path = "".join([data_DIR, league, "_rapm_possessions_", season, ".parquet"])
    possessions = pd.read_parquet(source_path)

    roster = build_player_list(possessions)
    # Target: points scaled by 100.
    possessions["PPP"] = possessions["pts"] * 100

    design, target = convert_to_matricies(possessions, "PPP", roster)
    table, model = calculate_rapm(design, target, lambdas, "RAPM", roster, possessions)
    table = get_df(table, season, id_to_name)

    target_path = "".join([export_DIR, "single_season/", league, "_RAPM_", season, ".csv"])
    table.to_csv(target_path, index=False)

    # Keep the season and the table on the model for later inspection.
    model.year = int(season)
    model.results = table
    return model

In [4]:
# %%script echo skipping
# Fit and export single-season RAPM for every season label in
# [season_start, season_end).
season_start = 2000
season_end = 2023
seasons = np.arange(season_start, season_end).astype(str)
out = [
    single_season_rapm(season, league="NBA", lambdas=0.005)
    for season in seasons
]

# Multi Season RAPM

In [5]:
def multiseason_season_rapm(season, league = "NBA",lambdas = 0.002, multi = 3):
    """Compute RAPM over a window of ``multi`` seasons ending at ``season``.

    Possession files for the seasons ``int(season) - multi + 1`` through
    ``int(season)`` are read from ``data_DIR`` and stacked. The ridge model is
    fitted on ``100 * pts`` and the formatted table is written to
    ``export_DIR + "multi_season/"``.

    Parameters
    ----------
    season : str
        Label of the last season in the window; also used in the output name.
    league : str
        League prefix used in the file names and passed to ``get_players_pbp``.
    lambdas : float
        Regularisation strength handed to ``calculate_rapm``.
    multi : int
        Number of seasons in the window.

    Returns
    -------
    The fitted model, with ``year``, ``results``, ``train_x`` and ``train_y``
    attached as attributes.
    """
    player_dict = get_players_pbp(league=league)
    yr = int(season)
    # Season labels yr-multi+1 ... yr, as strings for the file names.
    seasons = (np.arange(yr - multi, yr) + 1).astype(str)
    frames = [
        pd.read_parquet(data_DIR + league + "_rapm_possessions_" + s + ".parquet")
        for s in seasons
    ]
    # ignore_index gives the stacked frame unique row labels instead of
    # repeating each season's own index.
    possessions = pd.concat(frames, ignore_index=True)
    player_list = build_player_list(possessions)
    possessions["PPP"] = 100 * possessions["pts"]
    train_x, train_y = convert_to_matricies(possessions, "PPP", player_list)
    results, model = calculate_rapm(train_x, train_y, lambdas, "RAPM", player_list,possessions)
    results = get_df(results,season,player_dict)
    results.to_csv(export_DIR + "multi_season/" + league + "_RAPM_" + season + "_" + f"{multi}_yr.csv",index=False)
    # Attach the same metadata as single_season_rapm (year, results) plus the
    # training data for later inspection.
    model.year = int(season)
    model.results = results
    model.train_x = train_x
    model.train_y = train_y

    return model

In [6]:
# multi, lambdas = 3, 0.002
# multi, lambdas = 5, 0.001

In [7]:
# Fit and export multi-season RAPM (function defaults) for every season label
# in [season_start, season_end).
season_start = 2022
season_end = 2023
seasons = np.arange(season_start, season_end).astype(str)

out = [
    multiseason_season_rapm(season)
    for season in seasons
]

In [8]:
# Peek at the training targets stored on the first fitted multi-season model.
out[0].train_y

array([  0, 200, 200, ..., 100, 300, 300], dtype=int64)

In [9]:
# Exploratory: list every attribute available on the first fitted model.
dir(out[0])

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_build_request_for_signature',
 '_check_feature_names',
 '_check_n_features',
 '_decision_function',
 '_estimator_type',
 '_get_default_requests',
 '_get_metadata_request',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_set_intercept',
 '_validate_data',
 '_validate_params',
 'alpha',
 'coef_',
 'copy_X',
 'daal_model_',
 'fit',
 'fit_intercept',
 'fit_shape_good_for_daal_',
 'get_metadata_routing',
 'get_params',
 'in