Importing and preparing the game log for the NBA Opener - Los Angeles Lakers vs. Denver Nuggets

In [20]:
import pandas as pd

def prepare_scoring_outcome_model(file_path):
    """Load a play-by-play game log and build the scoring-outcome feature table.

    Parameters
    ----------
    file_path : str or path-like
        Location of the game-log CSV. The first row of the file must contain
        the verbose column titles that appear as keys in the rename map below.

    Returns
    -------
    pandas.DataFrame
        One row per logged event with the columns ``converted_x``,
        ``converted_y``, ``shot_distance`` (numeric, NaN when unparsable),
        ``shot_type`` (keyword extracted from the event type, NaN when none
        matches), the 0/1 flags ``was_steal``, ``was_block``, ``was_foul``,
        the 0/1 target ``label`` and ``primary_defender``.

    Raises
    ------
    KeyError
        If the file lacks any of the columns the features are derived from.
    """
    # Load CSV and assign proper headers (the header row is read as data
    # first, then promoted to column names).
    raw_df = pd.read_csv(file_path, header=None)
    raw_df.columns = raw_df.iloc[0]
    df = raw_df[1:].copy().reset_index(drop=True)

    # Rename columns for easier access
    df = df.rename(columns={
        "Shot made or missed": "result",
        "Various types of events occurs in a game": "type",
        "Shot distance (feet)": "shot_distance",
        "X axis value converted to X coordinate on an standard NBA court \n(94 feet long and 50 feet wide)": "converted_x",
        "Y axis value converted to Y coordinate on an standard NBA court \n(94 feet long and 50 feet wide)": "converted_y",
        "More details on how the event happened": "reason",
        "Description of the play\nIMPORTANT LIMITATION:\nReferee names in descriptions are only available \nfrom the 2015-2016 season and onwards!": "description",
        "Home Team's\nFive-Men Lineup\n(Players On The Court)\n1": "h1",
    })

    # Fail early with a readable message instead of a bare KeyError on the
    # first missing column further down.
    required = ["type", "shot_distance", "converted_x", "converted_y",
                "reason", "description", "h1"]
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"Game log {file_path!r} is missing expected columns: {missing}")

    # Convert coordinate and distance features; unparsable cells become NaN.
    numeric_cols = ["shot_distance", "converted_x", "converted_y"]
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Extract shot type: the first keyword found in the lower-cased event type.
    df["shot_type"] = df["type"].astype(str).str.lower().str.extract(r"(layup|dunk|3pt|jump|hook|floater)", expand=False)

    # Defensive impact flags (case-insensitive substring matches)
    df["was_steal"] = df["description"].astype(str).str.contains("steal", case=False, na=False).astype(int)
    df["was_block"] = df["description"].astype(str).str.contains("block", case=False, na=False).astype(int)
    df["was_foul"] = df["reason"].astype(str).str.contains("foul", case=False, na=False).astype(int)

    # Assign proxy defender (first home player).
    # NOTE(review): this is the same home-lineup slot for every event,
    # including events where the home team has the ball - confirm the proxy
    # is acceptable for the analysis.
    df["primary_defender"] = df["h1"]

    # Define scoring outcome label:
    # 1 = scoring event (description includes "PTS")
    # 0 = all other events (miss, turnover, steal, block, etc.)
    # The match is a case-insensitive substring test, so any description
    # containing "pts" in any casing is labelled 1.
    df["label"] = df["description"].astype(str).str.contains("PTS", case=False, na=False).astype(int)

    # Select final features
    features = df[["converted_x", "converted_y", "shot_distance", "shot_type",
                   "was_steal", "was_block", "was_foul", "label", "primary_defender"]].copy()

    return features

# Apply to the season-opener LAL@DEN game log. This is the file named in the
# section heading and the one whose rows (Denver players) appear in the output.
GAME_LOG_PATH = "[2023-10-24]-LAL@DEN.csv"
features_from_game = prepare_scoring_outcome_model(GAME_LOG_PATH)
features_from_game.head(50)

# Keep only events where every model input is present; rows without shot
# coordinates, distance or a recognised shot type are dropped here.
cleaned_data = features_from_game.dropna(subset = ["converted_x", "converted_y", "shot_distance", "shot_type", "was_steal", "was_block", "was_foul", "label", "primary_defender"])
cleaned_data.head(50)

Unnamed: 0,converted_x,converted_y,shot_distance,shot_type,was_steal,was_block,was_foul,label,primary_defender
2,24.8,5.8,1.0,dunk,0,0,0,1,Nikola Jokic
3,26.3,82.5,7.0,jump,0,0,0,1,Nikola Jokic
4,1.3,10.6,24.0,3pt,0,0,0,1,Nikola Jokic
5,24.4,83.2,6.0,layup,0,0,0,1,Nikola Jokic
6,45.7,18.4,25.0,3pt,0,0,0,1,Nikola Jokic
7,12.6,65.9,26.0,3pt,0,0,0,1,Nikola Jokic
8,6.6,21.9,25.0,3pt,0,0,0,0,Nikola Jokic
10,14.1,19.7,18.0,jump,0,0,0,0,Nikola Jokic
12,27.0,87.7,2.0,layup,0,1,0,0,Nikola Jokic
14,25.0,7.4,2.0,layup,0,0,0,1,Nikola Jokic


Encoding Categorical Features - shot_type and primary_defender are converted from text into integer codes

In [24]:
from sklearn.preprocessing import LabelEncoder

# One encoder per categorical column, kept around so the integer codes can be
# mapped back to their original strings later.
le_shot = LabelEncoder()
le_def = LabelEncoder()

# assign() builds a new frame, so cleaned_data itself is left unchanged.
df_encoded = cleaned_data.assign(
    shot_type=le_shot.fit_transform(cleaned_data["shot_type"]),
    primary_defender=le_def.fit_transform(cleaned_data["primary_defender"]),
)
df_encoded.head(50)

Unnamed: 0,converted_x,converted_y,shot_distance,shot_type,was_steal,was_block,was_foul,label,primary_defender
2,24.8,5.8,1.0,1,0,0,0,1,4
3,26.3,82.5,7.0,3,0,0,0,1,4
4,1.3,10.6,24.0,0,0,0,0,1,4
5,24.4,83.2,6.0,4,0,0,0,1,4
6,45.7,18.4,25.0,0,0,0,0,1,4
7,12.6,65.9,26.0,0,0,0,0,1,4
8,6.6,21.9,25.0,0,0,0,0,0,4
10,14.1,19.7,18.0,3,0,0,0,0,4
12,27.0,87.7,2.0,4,0,1,0,0,4
14,25.0,7.4,2.0,4,0,0,0,1,4


Training an initial XGBoost model on the NBA opener game log

In [30]:
import pandas as pd 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb 
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss

# Split features and target.
# A later cell writes the model's own predictions into df_encoded as
# "score_proba". Dropping that column here (ignored when absent) keeps this
# cell safe to re-run: otherwise a second run would train on its own output.
X = df_encoded.drop(columns=["label", "score_proba"], errors="ignore")
y = df_encoded["label"]

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Train XGBoost model: a small, shallow ensemble
initial_model = xgb.XGBClassifier(eval_metric='logloss', n_estimators = 20, max_depth = 3, learning_rate = 0.1)
initial_model.fit(X_train, y_train)

# Predict and evaluate on the held-out split
y_pred = initial_model.predict(X_test)
y_proba = initial_model.predict_proba(X_test)[:, 1]  # probability of label == 1

results = {
    "Accuracy": accuracy_score(y_test, y_pred),
    "ROC - AUC": roc_auc_score(y_test, y_proba),
    "Log Loss": log_loss(y_test, y_proba)
}

print(results)




{'Accuracy': 0.6111111111111112, 'ROC - AUC': 0.725, 'Log Loss': 0.6432039039462246}


Rendering the Scoring Opportunity Outcome Predictor per Defender

In [37]:
# Score every encoded event with the fitted model (column 1 = probability of
# label == 1) and store the result next to the features.
df_encoded["score_proba"] = initial_model.predict_proba(X)[:, 1]

# Per-defender mean of the predicted scoring probability, lowest first, with
# the encoded defender id translated back to the player's name.
defender_scores_initial = (
    df_encoded.groupby("primary_defender")["score_proba"]
    .mean()
    .reset_index(name="avg_score_probability_allowed")
    .sort_values(by="avg_score_probability_allowed", ascending=True)
    .assign(primary_defender_name=lambda table: le_def.inverse_transform(table["primary_defender"]))
    [["primary_defender_name", "avg_score_probability_allowed"]]
)

print(defender_scores_initial.head(10))

  primary_defender_name  avg_score_probability_allowed
2          Jamal Murray                       0.437655
4          Nikola Jokic                       0.498377
3    Michael Porter Jr.                       0.561931
5         Peyton Watson                       0.573582
1       Christian Braun                       0.576039
6            Zeke Nnaji                       0.597822
0          Aaron Gordon                       0.622565
