In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import random
import json

In [2]:
# Load the raw player statistics export into a DataFrame.
data_full = pd.read_csv("nba_player_stats_20250304.csv")

# Preview the first rows to sanity-check the load.
data_full.head()

Unnamed: 0,SEASON_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,...,FTA_PER_GAME,OREB_PER_GAME,DREB_PER_GAME,REB_PER_GAME,AST_PER_GAME,STL_PER_GAME,BLK_PER_GAME,TOV_PER_GAME,PF_PER_GAME,PTS_PER_GAME
0,1985-86,LAL,22.0,82,1.0,1542.0,209,388,0.539,1.0,...,2.04,1.95,2.7,4.65,0.66,0.6,0.6,1.21,2.79,6.35
1,1986-87,LAL,23.0,79,72.0,2240.0,316,587,0.538,0.0,...,3.57,2.66,5.13,7.78,1.06,0.89,1.01,1.29,2.16,10.78
2,1987-88,LAL,24.0,82,64.0,2636.0,322,640,0.503,0.0,...,4.62,2.99,5.67,8.66,1.13,1.06,0.55,1.46,2.49,11.43
3,1988-89,LAL,25.0,82,82.0,2510.0,401,758,0.529,4.0,...,4.38,3.15,5.87,9.01,1.26,1.15,0.67,1.45,2.1,13.27
4,1989-90,LAL,26.0,82,82.0,2709.0,385,806,0.478,13.0,...,4.51,3.2,5.49,8.68,1.1,0.8,0.61,1.41,2.52,12.94


In [3]:
def check_for_nans(data):
    """
    Report on stdout whether the dataset holds at least one NaN.

    Prints a single line and returns nothing.
    """
    contains_missing = data.isna().any().any()
    message = "NaNs found in the data" if contains_missing else "No NaNs found in the data"
    print(message)

# Run the NaN check on the freshly loaded frame.
check_for_nans(data_full)

NaNs found in the data


In [4]:
def print_rows_with_nans(df):
    """
    Print the names of the columns that contain NaNs.

    Only the rows holding at least one NaN are inspected, and only the
    column names are printed (the rows themselves are not).

    Parameters:
    df (pd.DataFrame): The DataFrame to check for NaNs.
    """
    incomplete_rows = df[df.isna().any(axis=1)]
    column_has_nan = incomplete_rows.isna().any()
    nan_columns = incomplete_rows.columns[column_has_nan].tolist()
    print(f"Columns with NaNs: {nan_columns}")

# List which columns of the loaded frame hold NaNs.
print_rows_with_nans(data_full)

Columns with NaNs: ['GS', 'FG_PCT', 'FG3_PCT', 'FT_PCT']


In [5]:
# For players traded mid-season, the data has one row per team they played for in that season.
# This breaks the year-by-year sequential structure I want.
data_full["is_traded"] = data_full.duplicated(subset=["SEASON_ID", "PLAYER_NAME"], keep=False).astype(int) 
# New column: 1 on every row whose (SEASON_ID, PLAYER_NAME) pair occurs more than once, otherwise 0.
# NOTE(review): players are matched by name only, so two different players sharing a name
# in the same season would also be flagged — confirm that names are unique.
data_full = data_full[(data_full["TEAM_ABBREVIATION"] == "TOT") | (data_full["is_traded"] == 0)] # for flagged seasons keep only the "TOT" row; unflagged rows are kept as they are.

In [6]:
data_full = data_full.drop(columns=['GS', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A',
       'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL',
       'BLK', 'TOV', 'PF', 'PTS']) # these are season totals, I need per-game values

# The dropped list also contains GS, FG_PCT, FG3_PCT and FT_PCT, which are the columns reported to hold NaNs.
# They carry either redundant data or, in the case of GS, meaningless data (IMO), so dropping them loses little.
# Any NaN that might remain elsewhere is replaced by -999.0 further down, and the model's Masking layer uses that same value.
# An alternative would be to remove the old seasons instead, but there is already very little data.

In [7]:
# Drop the team and season-id columns. SEASON_START_YEAR and PLAYER_NAME are kept
# because they are needed for sorting and grouping later on.
data_full = data_full.drop(columns=["TEAM_ABBREVIATION", "SEASON_ID"])

In [8]:
# Just in case: replace any remaining NaN with -999.0, the value the rest of the notebook treats as "masked".
data_full = data_full.fillna(-999.0)

We need to normalize the data, but we can't do it the usual way: the masked values of -999.0 must be left out of the statistics and kept unchanged.

In [9]:
# The 2024-2025 season isn't complete, so it shouldn't be used in training.
# NOTE(review): this relies on that season being stored as SEASON_START_YEAR == 2025 — confirm the column's year convention.
data = data_full[data_full["SEASON_START_YEAR"] != 2025]

In [10]:
def compute_normalization_params(df, mask_value=-999.0):
    """
    Compute per-column means and stds ignoring the masked value.

    Only numeric columns are considered. Entries equal to ``mask_value`` are
    excluded from the statistics.

    Degenerate columns are made safe to normalize: when the standard deviation
    is zero (constant column) or undefined (fewer than two valid values) it is
    replaced by 1.0, and an undefined mean (no valid values) is replaced by 0.0.
    This keeps a later ``(x - mean) / std`` from producing inf/NaN.

    Parameters:
    df (pd.DataFrame): The data to compute the statistics from.
    mask_value (float): Placeholder value to exclude from the statistics.

    Returns:
    tuple[dict, dict]: ``(means, stds)``, each keyed by numeric column name.
    """
    columns = df.select_dtypes(include=[np.number]).columns.tolist()

    means = {}
    stds = {}
    for col in columns:
        # need to filter out the masked values before computing the statistics
        valid_data = df.loc[df[col] != mask_value, col]

        col_mean = valid_data.mean()
        col_std = valid_data.std()

        # no valid entries at all -> the mean is NaN; fall back to "no shift"
        if not np.isfinite(col_mean):
            col_mean = 0.0
        # constant column (std == 0) or fewer than two valid entries (std is NaN) -> "no scaling"
        if not np.isfinite(col_std) or col_std == 0:
            col_std = 1.0

        means[col] = col_mean
        stds[col] = col_std
    return means, stds

def normalize_data(df, means, stds, mask_value=-999.0):
    """
    Standardize every numeric column with the supplied statistics.

    Each numeric column is cast to float64 and transformed as
    ``(value - means[col]) / stds[col]``; entries equal to ``mask_value`` are
    kept as they are. The input frame is not modified.

    Parameters:
    df (pd.DataFrame): The data to normalize.
    means (dict): Per-column means, keyed by column name.
    stds (dict): Per-column standard deviations, keyed by column name.
    mask_value (float): Placeholder value that must stay untouched.

    Returns:
    pd.DataFrame: A normalized copy of ``df``.
    """
    normalized = df.copy()

    for col in df.select_dtypes(include=[np.number]).columns.tolist():
        values = normalized[col].astype("float64")
        is_masked = values == mask_value
        rescaled = (values - means[col]) / stds[col]
        # put the original placeholder back wherever the entry was masked
        normalized[col] = rescaled.mask(is_masked, values)

    return normalized

In [11]:
# Fit the normalization statistics on `data`, which excludes the incomplete season,
# so that a partial season does not skew the means/stds used for training.
# NOTE(review): `data` still contains the validation players; fitting on the training
# players only would require doing the player split before this cell.
means, stds = compute_normalization_params(data)

# Both frames are transformed with the same statistics so they stay on one scale.
data_full_normalized = normalize_data(data_full, means, stds, mask_value=-999.0)
data_normalized = normalize_data(data, means, stds, mask_value=-999.0)

In [12]:
# Split by player, not by row: each player's seasons land entirely in train or entirely in validation.
players = data["PLAYER_NAME"].unique()
train_players, val_players = train_test_split(players, test_size=0.1, random_state=42) 
# I split before creating the sequences. If I split afterwards, I would not be able to validate on out-of-sample data.

# Select the normalized rows belonging to each group of players.
train_data = data_normalized[data_normalized["PLAYER_NAME"].isin(train_players)]
val_data = data_normalized[data_normalized["PLAYER_NAME"].isin(val_players)]

In [13]:
# Show the columns that remain after preprocessing.
data.columns

Index(['PLAYER_AGE', 'GP', 'SEASON_START_YEAR', 'PLAYER_NAME', 'MIN_PER_GAME',
       'FGM_PER_GAME', 'FGA_PER_GAME', 'FG3M_PER_GAME', 'FG3A_PER_GAME',
       'FTM_PER_GAME', 'FTA_PER_GAME', 'OREB_PER_GAME', 'DREB_PER_GAME',
       'REB_PER_GAME', 'AST_PER_GAME', 'STL_PER_GAME', 'BLK_PER_GAME',
       'TOV_PER_GAME', 'PF_PER_GAME', 'PTS_PER_GAME', 'is_traded'],
      dtype='object')

In [14]:
# Save the preprocessed (not normalized) frames.
data_full.to_csv("data_full.csv", index=False)
data.to_csv("data.csv", index=False)

# The normalized frames go to separate files because every numeric column,
# including the season year, has been normalized.
data_full_normalized.to_csv('data_full_normalized.csv', index=False)
data_normalized.to_csv('data_normalized.csv', index=False)

In [15]:
def create_sequences(data):
    """
    Create sequences for the entire career of a player.

    Each player's rows are sorted by SEASON_START_YEAR. For every season after
    the first, one sample is emitted: the sequence holds all earlier seasons
    (so sequences grow in length) and the label holds the stats of the season
    being predicted.

    Parameters:
    data (pd.DataFrame): Must contain PLAYER_NAME, SEASON_START_YEAR, PLAYER_AGE
        and is_traded in addition to the stat columns.

    Returns:
    tuple[list, list, list]:
        sequences   - 2D arrays (seasons x features); features are all columns
                      except PLAYER_NAME and SEASON_START_YEAR.
        labels      - 1D arrays; the feature columns minus PLAYER_AGE and is_traded.
        player_info - (player_name, season_start_year) of the predicted season.
    Players with a single season contribute no samples.
    """
    sequences = []
    labels = []
    player_info = []

    for player_name, group in data.groupby("PLAYER_NAME"):
        group = group.sort_values(by="SEASON_START_YEAR").reset_index(drop=True)

        # The column selection does not depend on the season index, so build the
        # matrices once per player instead of re-dropping columns on every iteration.
        feature_matrix = group.drop(columns=["PLAYER_NAME", "SEASON_START_YEAR"]).values
        label_matrix = group.drop(
            columns=["PLAYER_AGE", "PLAYER_NAME", "SEASON_START_YEAR", "is_traded"]
        ).values
        season_years = group["SEASON_START_YEAR"].tolist()

        for i in range(1, len(group)):
            # copies keep each sample independent of the per-player matrices
            sequences.append(feature_matrix[:i].copy())
            # a single row of the label matrix is already the flat 1D label
            labels.append(label_matrix[i].copy())
            # need to retain info to know what and whose season we're predicting later on
            player_info.append((player_name, season_years[i]))

    return sequences, labels, player_info

In [16]:
# Build the growing-career sequences, labels and (player, season) info for each split.
train_sequences, train_labels, train_player_info = create_sequences(train_data)
val_sequences, val_labels, val_player_info = create_sequences(val_data)

In [17]:
# I'm padding the sequences since variable-length sequences can cause problems in the training process.
# Both splits are padded to one shared length: padded separately, each split would be padded
# to its own longest career, so train and validation could end up with different time dimensions.
max_seq_len = max(len(seq) for seq in (*train_sequences, *val_sequences))

# Padding goes in front ("pre") and uses -999.0, the same value the model's Masking layer looks for.
train_sequences = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, maxlen=max_seq_len, dtype="float32", padding="pre", value=-999.0)
val_sequences = tf.keras.preprocessing.sequence.pad_sequences(val_sequences, maxlen=max_seq_len, dtype="float32", padding="pre", value=-999.0)

In [18]:
# Stack the per-sample label vectors into float32 arrays so the labels have the correct dtype.
train_labels = np.array(train_labels, dtype="float32")
val_labels = np.array(val_labels, dtype="float32")

In [19]:
# Convert the (sequence, label) arrays to tf.data datasets, one element per sample.
train_dataset = tf.data.Dataset.from_tensor_slices((train_sequences, train_labels))
val_dataset = tf.data.Dataset.from_tensor_slices((val_sequences, val_labels))

In [20]:
batch_size = 32

# Training data: shuffle with a buffer as large as the whole training set, then batch and prefetch.
train_dataset = train_dataset.shuffle(len(train_sequences)).batch(batch_size).prefetch(tf.data.AUTOTUNE)
# Validation data: batched and prefetched only; it is not shuffled.
val_dataset = val_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [21]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Masking(mask_value= -999.0), # masking the padded parts of the sequence (padding uses the same -999.0 value)
    tf.keras.layers.GaussianNoise(stddev=0.01), # small input noise, intended for robustness
    tf.keras.layers.LSTM(128, return_sequences=False, dropout=0.1, recurrent_dropout=0.05, use_bias=False),  
    tf.keras.layers.Dense(train_labels.shape[1], use_bias=False) # one output per label column; no bias, with the intent that the model uses past values for predictions instead of learning averages
]) # I will optimize the model architecture in further iterations. So far, it seems like larger and deeper networks lead to overfitting.

# Since the data is by its nature very noisy, we need to be wary of overfitting to noise!

model.compile(optimizer="Adam", loss="MSE")
model.summary()

With LSTMs, adding more layers or more neurons seems to lead to overfitting issues.

In [22]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)
# Early stopping: stop training once the validation loss has stopped improving (patience of 5 epochs),
# i.e. when the model starts overfitting, and keep the best weights based on out-of-sample estimates.

reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.8, patience=1, min_lr=1e-6)
# reduce_lr lowers the learning rate (factor 0.8, floor 1e-6) when the validation loss plateaus,
# which helps the model stop diverging if the learning rate is too large as it gets closer to the minimum.

In [23]:
# train_dataset and val_dataset are already batched, so batch_size is not passed here:
# Keras documents that batch_size must not be specified for tf.data.Dataset inputs
# (some versions raise an error, others silently ignore the argument).
history = model.fit(train_dataset, epochs=1000,
                    validation_data=val_dataset, callbacks=[early_stopping, reduce_lr])

Epoch 1/1000
[1m445/445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - loss: 0.4660 - val_loss: 0.3177 - learning_rate: 0.0010
Epoch 2/1000
[1m445/445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - loss: 0.3163 - val_loss: 0.3076 - learning_rate: 0.0010
Epoch 3/1000
[1m445/445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - loss: 0.3085 - val_loss: 0.3060 - learning_rate: 0.0010
Epoch 4/1000
[1m445/445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - loss: 0.3016 - val_loss: 0.2998 - learning_rate: 0.0010
Epoch 5/1000
[1m445/445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - loss: 0.2983 - val_loss: 0.3008 - learning_rate: 0.0010
Epoch 6/1000
[1m445/445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - loss: 0.2923 - val_loss: 0.2989 - learning_rate: 8.0000e-04
Epoch 7/1000
[1m445/445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - loss: 0.2923 - val_loss

The MSE loss by itself doesn't mean much. Let's see some actual predictions to see how close we're getting!

In [24]:
# Names of the predicted stats: the same columns, in the same order, that create_sequences keeps for the labels.
stats_columns = data.drop(columns=["PLAYER_NAME", "SEASON_START_YEAR", "PLAYER_AGE", "is_traded"]).columns.tolist()

In [25]:
def inverse_normalize(normalized_preds, means, stds, columns):
    """
    Map normalized values back to the original scale.

    For the i-th name in ``columns`` the i-th entry (1D input) or the i-th
    column (otherwise) is transformed as ``value * stds[name] + means[name]``.
    The result has the same shape and dtype as ``normalized_preds``.
    """
    restored = np.empty_like(normalized_preds)
    single_vector = len(normalized_preds.shape) == 1

    for position, name in enumerate(columns):
        # a 1D input is indexed by position, anything else by its second axis
        target = position if single_vector else (slice(None), position)
        restored[target] = normalized_preds[target] * stds[name] + means[name]

    return restored

Let's predict a random historical season.

In [26]:
print("\nPredictions vs Actual Values:")

num_predictions = 1
# pick random validation samples (unseeded, so a different sample on every run)
random_indices = random.sample(range(len(val_sequences)), num_predictions)

for idx in random_indices:
    seq = val_sequences[idx:idx+1]  # slice (not index) so the batch dimension is kept
    actual = val_labels[idx]         # select the actual label
    prediction = model.predict(seq)  # make prediction
    # retransform the normalized prediction back to the original scale
    original_prediction = inverse_normalize(prediction, means, stds, stats_columns)
    original_actual = inverse_normalize(actual, means, stds, stats_columns)

    player_name, season_year = val_player_info[idx]  # get player info
    # The season year was normalized with the other numeric columns, so undo that here.
    # It is computed outside the f-string because reusing double quotes inside a
    # double-quoted f-string is a SyntaxError before Python 3.12.
    original_year = season_year * stds["SEASON_START_YEAR"] + means["SEASON_START_YEAR"]
    print(f"Player: {player_name}, Year: {original_year:.0f}")
    print("Stats:")
    for stat, actual_val, pred_val in zip(stats_columns, original_actual, original_prediction[0]):
        print(f"  {stat}: Actual = {actual_val:.1f}, Prediction = {pred_val:.1f}")


Predictions vs Actual Values:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 279ms/step
Player: Larry Kenon, Year: 1983
Stats:
  GP: Actual = 48.0, Prediction = 58.0
  MIN_PER_GAME: Actual = 16.0, Prediction = 17.9
  FGM_PER_GAME: Actual = 2.5, Prediction = 3.1
  FGA_PER_GAME: Actual = 5.3, Prediction = 6.4
  FG3M_PER_GAME: Actual = -0.0, Prediction = 0.0
  FG3A_PER_GAME: Actual = 0.0, Prediction = 0.0
  FTM_PER_GAME: Actual = 0.9, Prediction = 1.1
  FTA_PER_GAME: Actual = 1.2, Prediction = 1.6
  OREB_PER_GAME: Actual = 1.4, Prediction = 1.2
  DREB_PER_GAME: Actual = 1.7, Prediction = 2.0
  REB_PER_GAME: Actual = 3.1, Prediction = 3.2
  AST_PER_GAME: Actual = 0.8, Prediction = 1.3
  STL_PER_GAME: Actual = 0.5, Prediction = 0.6
  BLK_PER_GAME: Actual = 0.2, Prediction = 0.1
  TOV_PER_GAME: Actual = 1.0, Prediction = 1.3
  PF_PER_GAME: Actual = 1.3, Prediction = 1.6
  PTS_PER_GAME: Actual = 5.8, Prediction = 7.2


Finally, let's save the model and parameters to reuse them later.

In [27]:
# Saving the model for reuse

model.save("nba_stats_predictor_model.keras")

# Build the JSON payload before the file is opened, so a failing conversion
# cannot leave a truncated normalization_params.json behind.
# The statistics are cast to plain Python floats for serialization.
means_dict = {k: float(v) for k, v in means.items()}
stds_dict = {k: float(v) for k, v in stds.items()}

# Single source of truth for what gets written; `params` is kept as an alias
# because both names existed before.
normalization_params = {
    "means": means_dict,
    "stds": stds_dict,
    "stats_columns": stats_columns
}
params = normalization_params

with open("normalization_params.json", "w") as f:
    json.dump(params, f)