# NBA Player Stats Prediction

To start we can download the dataset using kagglehub and get an overview of the dataset

In [43]:
import pandas as pd
import numpy as np
import kagglehub
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Download dataset
print("Downloading dataset...")
path = kagglehub.dataset_download("eduardopalmieri/nba-player-stats-season-2425")
print(f"Path to dataset files: {path}")

# Load the dataset
import os
csv_files = [f for f in os.listdir(path) if f.endswith('.csv')]
print(f"\nAvailable CSV files: {csv_files}")

# Load the first CSV file (adjust if needed)
df = pd.read_csv(os.path.join(path, csv_files[0]))

print("\n" + "="*80)
print("DATASET OVERVIEW")
print("="*80)
print(f"\nDataset shape: {df.shape}")
print(f"\nColumn names:\n{df.columns.tolist()}")
print(f"\nFirst few rows:\n{df.head()}")

# Check for missing values
print("\n" + "="*80)
print("DATA QUALITY CHECK")
print("="*80)
missing_values = df.isnull().sum()
print(f"\nMissing values:\n{missing_values[missing_values > 0]}")

Downloading dataset...
Path to dataset files: /Users/momokaaung/.cache/kagglehub/datasets/eduardopalmieri/nba-player-stats-season-2425/versions/37

Available CSV files: ['database_24_25.csv']

DATASET OVERVIEW

Dataset shape: (16512, 25)

Column names:
['Player', 'Tm', 'Opp', 'Res', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc', 'Data']

First few rows:
          Player   Tm  Opp Res     MP  FG  FGA    FG%  3P  3PA  ...  DRB  TRB  \
0   Jayson Tatum  BOS  NYK   W  30.30  14   18  0.778   8   11  ...    4    4   
1  Anthony Davis  LAL  MIN   W  37.58  11   23  0.478   1    3  ...   13   16   
2  Derrick White  BOS  NYK   W  26.63   8   13  0.615   6   10  ...    3    3   
3   Jrue Holiday  BOS  NYK   W  30.52   7    9  0.778   4    6  ...    2    4   
4  Miles McBride  NYK  BOS   L  25.85   8   10  0.800   4    5  ...    0    0   

   AST  STL  BLK  TOV  PF  PTS  GmSc        Data  
0   10    1    1 

There aren't any missing values in the dataset which is good news.

In [44]:
# Define target variable
target_col = 'FG'  # What we're predicting

# Define features - use ALL columns except:
# - The target itself (FG)
# - Non-predictive columns (Player name, Date)
exclude_cols = ['FG', 'Player', 'Data', 'FG%', 'PTS', 'GmSc']  # columns to exclude

# Get all feature columns
feature_cols = [col for col in df.columns if col not in exclude_cols]

print(f"\nTarget variable: {target_col}")
print(f"\nFeature variables ({len(feature_cols)} total):")
print(feature_cols)

# Separate numeric and categorical features
numeric_cols = df[feature_cols].select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df[feature_cols].select_dtypes(exclude=[np.number]).columns.tolist()

print(f"\nNumeric features: {numeric_cols}")
print(f"\nCategorical features: {categorical_cols}")

# For now, let's use only numeric features (we can encode categoricals later if needed)
X = df[numeric_cols].copy()
y = df[target_col].copy()

# Handle missing values
print(f"\nRows before dropping NaN: {len(X)}")
valid_indices = X.notna().all(axis=1) & y.notna()
X = X[valid_indices]
y = y[valid_indices]
print(f"Rows after dropping NaN: {len(X)}")

# Handle any infinite values (from division by zero, etc.)
X = X.replace([np.inf, -np.inf], np.nan)
X = X.dropna()
y = y[X.index]

print(f"\nFinal dataset shape: X={X.shape}, y={y.shape}")


Target variable: FG

Feature variables (19 total):
['Tm', 'Opp', 'Res', 'MP', 'FGA', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF']

Numeric features: ['MP', 'FGA', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF']

Categorical features: ['Tm', 'Opp', 'Res']

Rows before dropping NaN: 16512
Rows after dropping NaN: 16512

Final dataset shape: X=(16512, 16), y=(16512,)


## Model Training
- Linear Regression
- Random Forest
- Gradient Boosting

In [45]:
# Dictionary to store results
results = {}

# Define models with their configurations
models = {
    'Linear Regression': {
        'model': LinearRegression(),
        'use_scaled': True,
        'has_coef': True
    },
    'Random Forest': {
        'model': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
        'use_scaled': False,
        'has_coef': False
    },
    'Gradient Boosting': {
        'model': GradientBoostingRegressor(n_estimators=100, random_state=42),
        'use_scaled': False,
        'has_coef': False
    }
}

# Train and evaluate all models
print("\n" + "="*80)
print("MODEL TRAINING")
print("="*80)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


for model_name, config in models.items():
    print("\n" + "-"*80)
    print(f"Model: {model_name}")
    print("-"*80)
    
    # Select appropriate training data
    X_train_use = X_train_scaled if config['use_scaled'] else X_train
    X_test_use = X_test_scaled if config['use_scaled'] else X_test
    
    # Train model
    model = config['model']
    model.fit(X_train_use, y_train)
    y_pred = model.predict(X_test_use)
    
    # Calculate and store metrics
    results[model_name] = {
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'MAE': mean_absolute_error(y_test, y_pred),
        'R²': r2_score(y_test, y_pred)
    }
    
    # Print metrics
    for metric, value in results[model_name].items():
        print(f"{metric}: {value:.4f}")
    
    # Print coefficients or feature importances
    if config['has_coef']:
        print("\nFeature Coefficients:")
        for feature, coef in zip(feature_cols, model.coef_):
            print(f"  {feature}: {coef:.4f}")
    else:
        print("\nFeature Importances:")
        for feature, importance in zip(feature_cols, model.feature_importances_):
            print(f"  {feature}: {importance:.4f}")


MODEL TRAINING

--------------------------------------------------------------------------------
Model: Linear Regression
--------------------------------------------------------------------------------
RMSE: 1.0880
MAE: 0.7948
R²: 0.8874

Feature Coefficients:
  Tm: 0.1313
  Opp: 3.2700
  Res: 1.5630
  MP: -1.8575
  FGA: -0.0442
  3P: -0.1396
  3PA: 0.1424
  3P%: 0.0408
  FT: 3766820443346.9692
  FTA: 7517534931008.0820
  FT%: -9582703120100.6758
  ORB: -0.0379
  DRB: 0.0194
  TRB: 0.0453
  AST: -0.0101
  STL: -0.0159

--------------------------------------------------------------------------------
Model: Random Forest
--------------------------------------------------------------------------------
RMSE: 1.1613
MAE: 0.8345
R²: 0.8717

Feature Importances:
  Tm: 0.0242
  Opp: 0.8051
  Res: 0.0032
  MP: 0.0336
  FGA: 0.0554
  3P: 0.0055
  3PA: 0.0069
  3P%: 0.0063
  FT: 0.0063
  FTA: 0.0086
  FT%: 0.0093
  ORB: 0.0101
  DRB: 0.0057
  TRB: 0.0048
  AST: 0.0076
  STL: 0.0075

-----------

## Summary of model results and finding the best model
Three ways to measure how good your predictions are:

1. R² (R-squared) - "How well does my model fit?"
   - Score: 0 to 1 (1 is perfect, higher is better)
   - Think of it as: A percentage score for your model
   - Example: R² = 0.85 means your model is 85% accurate at explaining the data
   - Simple rule: Above 0.7 is usually good, above 0.9 is excellent
   
2. RMSE (Root Mean Squared Error) - "How far off am I, on average?"
   - Score: Lower is better (0 is perfect)
   - Think of it as: Your typical prediction error in real units
   - Example: RMSE = 2.5 means you're off by about 2-3 field goals per prediction
   - Simple rule: Compare to your target values - if predicting 5-10 FG, 
     an RMSE of 0.5 is great, but 5.0 is terrible
   - Important: Punishes big mistakes harder (being off by 10 is worse than 
     being off by 1 ten times)
   
3. MAE (Mean Absolute Error) - "What's my average mistake?"
   - Score: Lower is better (0 is perfect)
   - Think of it as: The average size of your errors
   - Example: MAE = 2.0 means on average you're wrong by 2 field goals
   - Simple rule: Same as RMSE - lower is better, compare to your data range
   - Important: Treats all mistakes equally (being off by 10 once = being off 
     by 1 ten times)

Which should you use?
- R²: Quick overall grade (like a test score)
- RMSE: When big mistakes are really bad (predicting medical dosages)
- MAE: When you just want the typical error size (most common cases)

In [46]:
# Summary of results
print("\n" + "="*80)
print("SUMMARY OF RESULTS")
print("="*80)
results_df = pd.DataFrame(results).T
print(results_df.to_string())

# Pick best by R² (maximize)
best_model_r2 = results_df['R²'].idxmax()

# Pick best by RMSE (minimize)
best_model_rmse = results_df['RMSE'].idxmin()

# Pick best by MAE (minimize)
best_model_mae = results_df['MAE'].idxmin()

print(f"\nBest Model (by R²): {best_model_r2}")
print(f"Best Model (by RMSE): {best_model_rmse}")
print(f"Best Model (by MAE): {best_model_mae}")


SUMMARY OF RESULTS
                       RMSE       MAE        R²
Linear Regression  1.088000  0.794814  0.887364
Random Forest      1.161312  0.834471  0.871673
Gradient Boosting  1.108159  0.808517  0.883151

Best Model (by R²): Linear Regression
Best Model (by RMSE): Linear Regression
Best Model (by MAE): Linear Regression


In [47]:
# ================================================================================
# SECOND PREDICTION TASK: Predict FGA (Field Goal Attempts)
# ================================================================================
print("\n\n" + "="*80)
print("SECOND MODELING TASK: PREDICTING FGA (Field Goal Attempts)")
print("="*80)

# Define new target
target_col = 'FGA'

# Exclude columns related to FGA or non-predictive ones
exclude_cols = ['FGA', 'FG', 'Player', 'Data', 'FG%', 'PTS', 'GmSc']

# Select features
feature_cols = [col for col in df.columns if col not in exclude_cols]
numeric_cols = df[feature_cols].select_dtypes(include=[np.number]).columns.tolist()

X = df[numeric_cols].copy()
y = df[target_col].copy()

# Clean data
valid_indices = X.notna().all(axis=1) & y.notna()
X = X[valid_indices]
y = y.loc[X.index]
X = X.replace([np.inf, -np.inf], np.nan).dropna()
y = y.loc[X.index]

print(f"\nFinal dataset for FGA: X={X.shape}, y={y.shape}")

# Split and scale
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)





SECOND MODELING TASK: PREDICTING FGA (Field Goal Attempts)

Final dataset for FGA: X=(16512, 15), y=(16512,)


In [48]:
# ================================================================================
# MODEL TRAINING
# ================================================================================
print("\n" + "="*80)
print("MODEL TRAINING")
print("="*80)

fga_results = {}

for model_name, config in models.items():
    print("\n" + "-"*80)
    print(f"Model: {model_name}")
    print("-"*80)

    X_train_use = X_train_scaled if config['use_scaled'] else X_train
    X_test_use = X_test_scaled if config['use_scaled'] else X_test

    model = config['model']
    model.fit(X_train_use, y_train)
    y_pred = model.predict(X_test_use)

    # Metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    fga_results[model_name] = {'RMSE': rmse, 'MAE': mae, 'R²': r2}

    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"R²: {r2:.4f}")

    # Coefficients or feature importances
    if config['has_coef'] and hasattr(model, 'coef_'):
        print("\nFeature Coefficients:")
        for feature, coef in zip(numeric_cols, model.coef_):
            print(f"  {feature}: {coef:.4f}")
    elif hasattr(model, 'feature_importances_'):
        print("\nFeature Importances:")
        for feature, importance in zip(numeric_cols, model.feature_importances_):
            print(f"  {feature}: {importance:.4f}")
    else:
        print("\n(No coefficients or feature importances available for this model.)")


MODEL TRAINING

--------------------------------------------------------------------------------
Model: Linear Regression
--------------------------------------------------------------------------------
RMSE: 2.7505
MAE: 2.0328
R²: 0.7918

Feature Coefficients:
  MP: 1.9253
  3P: -0.0991
  3PA: 2.9083
  3P%: -0.1319
  FT: 0.0319
  FTA: 1.0524
  FT%: -0.0552
  ORB: -8198447534941.5078
  DRB: -16361840616219.9512
  TRB: 20856658806721.3750
  AST: 0.4037
  STL: 0.0829
  BLK: 0.0427
  TOV: 0.3908
  PF: -0.1837

--------------------------------------------------------------------------------
Model: Random Forest
--------------------------------------------------------------------------------
RMSE: 2.8018
MAE: 2.0417
R²: 0.7840

Feature Importances:
  MP: 0.6348
  3P: 0.0057
  3PA: 0.1662
  3P%: 0.0109
  FT: 0.0144
  FTA: 0.0308
  FT%: 0.0110
  ORB: 0.0162
  DRB: 0.0153
  TRB: 0.0200
  AST: 0.0222
  STL: 0.0113
  BLK: 0.0084
  TOV: 0.0176
  PF: 0.0152

--------------------------------------

In [49]:
# ================================================================================
# SUMMARY OF RESULTS
# ================================================================================
print("\n" + "="*80)
print("SUMMARY OF RESULTS")
print("="*80)

fga_results_df = pd.DataFrame(fga_results).T
print(f"{'':<20s}{'RMSE':>10s}{'MAE':>12s}{'R²':>10s}")
for model_name, row in fga_results_df.iterrows():
    print(f"{model_name:<20s}{row['RMSE']:>10.6f}{row['MAE']:>12.6f}{row['R²']:>10.6f}")

# Identify best models
best_model_r2 = fga_results_df['R²'].idxmax()
best_model_rmse = fga_results_df['RMSE'].idxmin()
best_model_mae = fga_results_df['MAE'].idxmin()

print(f"\nBest Model (by R²): {best_model_r2}")
print(f"Best Model (by RMSE): {best_model_rmse}")
print(f"Best Model (by MAE): {best_model_mae}")



SUMMARY OF RESULTS
                          RMSE         MAE        R²
Linear Regression     2.750458    2.032782  0.791823
Random Forest         2.801822    2.041688  0.783975
Gradient Boosting     2.724104    1.985679  0.795793

Best Model (by R²): Gradient Boosting
Best Model (by RMSE): Gradient Boosting
Best Model (by MAE): Gradient Boosting
