# NBA Player Stats Prediction - Refactored

**Team Members:** Ryan, Momoka, Jesus, Angel, Harshil   
**Course:** CS4661 - Introduction to Data Science  
**Objective:** Predict NBA player statistics using machine learning

---

## Project Overview

This notebook demonstrates a complete machine learning pipeline for predicting NBA player statistics:
- **Target Variables:** PTS (total points scored) and team win classification
- **Models:** Linear Regression, Random Forest, Gradient Boosting
- **Approach:** Modular, reusable functions for scalability and maintainability

## 1. Imports and Setup

In [10]:
import pandas as pd
import numpy as np
import kagglehub
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

## 2. Reusable Functions

These functions eliminate code duplication and make the pipeline modular.

In [11]:
def load_nba_data():
    """
    Download and load NBA player stats dataset from Kaggle.

    The dataset is fetched with ``kagglehub.dataset_download`` and the first
    CSV file (in alphabetical order) found in the download directory is read.
    A short overview (shape, column names, first rows, missing values) is
    printed as a side effect.

    Returns:
        pd.DataFrame: Raw dataset

    Raises:
        FileNotFoundError: If the download directory contains no CSV file.
    """
    print("Downloading dataset...")
    path = kagglehub.dataset_download("eduardopalmieri/nba-player-stats-season-2425")
    print(f"Path to dataset files: {path}")

    # os.listdir() returns entries in arbitrary order; sorting makes the file
    # that gets loaded the same on every machine and every run.
    csv_files = sorted(f for f in os.listdir(path) if f.endswith('.csv'))
    print(f"\nAvailable CSV files: {csv_files}")

    # Fail with an actionable message instead of a bare IndexError below.
    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in dataset directory: {path}")

    df = pd.read_csv(os.path.join(path, csv_files[0]))

    print("\n" + "="*80)
    print("DATASET OVERVIEW")
    print("="*80)
    print(f"\nDataset shape: {df.shape}")
    print(f"\nColumn names:\n{df.columns.tolist()}")
    print(f"\nFirst few rows:\n{df.head()}")

    # Report only the columns that actually contain missing values.
    missing_values = df.isnull().sum()
    if missing_values.sum() > 0:
        print(f"\nMissing values:\n{missing_values[missing_values > 0]}")
    else:
        print("\nNo missing values found!")

    return df


def prepare_features(df, target_col, exclude_cols=None):
    """
    Prepare features and target variable for modeling.

    Only numeric columns are used as features. Rows with a missing or
    infinite value in any feature or in the target are dropped.

    Args:
        df: DataFrame with raw data
        target_col: Name of target variable column
        exclude_cols: List of columns to exclude (default: auto-detected).
            The target column is always excluded from the features, even
            when it is not listed here.

    Returns:
        tuple: (X, y, feature_names) where X and y share the same index

    Raises:
        KeyError: If target_col is not a column of df.
    """
    if exclude_cols is None:
        # Default exclusions applied for every target.
        excluded = {target_col, 'Player', 'Data', 'FG%', 'PTS', 'GmSc'}

        # Extra columns dropped depending on which target is predicted.
        extra_by_target = {
            'FG': ('PTS', 'GmSc'),
            'FGA': ('PTS', 'GmSc'),
            'PTS': ('GmSc', 'FG'),
            'GmSc': ('PTS',),
        }
        excluded.update(extra_by_target.get(target_col, ()))
    else:
        # Never let the target leak into the feature matrix, even if the
        # caller forgot to list it. The caller's list is not mutated.
        excluded = set(exclude_cols) | {target_col}

    # Get feature columns (original column order is preserved)
    feature_cols = [col for col in df.columns if col not in excluded]

    # Select only numeric features for now
    numeric_cols = df[feature_cols].select_dtypes(include=[np.number]).columns.tolist()

    print(f"\nTarget variable: {target_col}")
    print(f"Feature variables ({len(numeric_cols)} total): {numeric_cols}")

    # Treat +/-inf like missing data so one mask handles both cases, for the
    # features and for the target.
    X = df[numeric_cols].replace([np.inf, -np.inf], np.nan)
    y = df[target_col].replace([np.inf, -np.inf], np.nan)

    # A single boolean mask keeps X and y aligned row-for-row; re-indexing y by
    # label (y[X.index]) would return extra rows when the index is not unique.
    valid_rows = X.notna().all(axis=1) & y.notna()
    X = X.loc[valid_rows].copy()
    y = y.loc[valid_rows].copy()

    print(f"Final dataset shape: X={X.shape}, y={y.shape}")

    return X, y, numeric_cols


def create_model_configs():
    """
    Build the baseline regression models together with their training flags.

    Each entry maps a display name to a dict with three keys:
        'model': a fresh, unfitted estimator instance
        'use_scaled': whether the model is trained on standardized features
        'has_coef': whether the fitted model's coefficients should be reported

    Returns:
        dict: Model configurations keyed by display name
    """
    def _config(estimator, scaled, coef):
        # Keep every entry's key set and key order uniform.
        return {'model': estimator, 'use_scaled': scaled, 'has_coef': coef}

    linear = LinearRegression()
    forest = RandomForestRegressor(n_estimators=100, random_state=42)
    boosting = GradientBoostingRegressor(n_estimators=100, random_state=42)

    return {
        'Linear Regression': _config(linear, True, True),
        'Random Forest': _config(forest, False, False),
        'Gradient Boosting': _config(boosting, False, False),
    }


def train_and_evaluate_models(X_train, X_test, y_train, y_test, feature_cols, target_name):
    """
    Fit every configured model on the training split and score it on the test split.

    Models come from create_model_configs(). A StandardScaler is fitted on
    X_train only and applied to both splits; models flagged 'use_scaled'
    receive the standardized data, the others the data as passed in.
    Metrics and per-feature coefficients/importances are printed.

    Args:
        X_train: Training features
        X_test: Test features
        y_train: Training target
        y_test: Test target
        feature_cols: List of feature column names, in the column order of X
        target_name: Name of target variable (for display)

    Returns:
        dict: {model name: {'RMSE': ..., 'MAE': ..., 'R²': ...}}
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    print("\n" + heavy_rule)
    print(f"MODEL TRAINING FOR {target_name}")
    print(heavy_rule)

    configs = create_model_configs()
    results = {}

    # Fit the scaler on the training split only, then reuse it for the test split.
    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(X_train)
    scaled_test = scaler.transform(X_test)

    for model_name in configs:
        config = configs[model_name]

        print("\n" + light_rule)
        print(f"Model: {model_name}")
        print(light_rule)

        # Pick the representation of the data this model was configured for.
        if config['use_scaled']:
            fit_X, eval_X = scaled_train, scaled_test
        else:
            fit_X, eval_X = X_train, X_test

        model = config['model']
        model.fit(fit_X, y_train)
        y_pred = model.predict(eval_X)

        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        results[model_name] = {'RMSE': rmse, 'MAE': mae, 'R²': r2}

        print(f"RMSE: {rmse:.4f}")
        print(f"MAE: {mae:.4f}")
        print(f"R²: {r2:.4f}")

        # Report coefficients when requested and available, otherwise fall
        # back to feature importances; models exposing neither print nothing.
        if config['has_coef'] and hasattr(model, 'coef_'):
            print("\nFeature Coefficients:")
            weights = model.coef_
        elif hasattr(model, 'feature_importances_'):
            print("\nFeature Importances:")
            weights = model.feature_importances_
        else:
            continue

        for feature, weight in zip(feature_cols, weights):
            print(f"  {feature}: {weight:.4f}")

    return results


def summarize_results(results, target_name):
    """
    Print a comparison table of model metrics and name the best model per metric.

    Args:
        results: Dictionary mapping model name to a dict with the keys
            'RMSE', 'MAE' and 'R²'
        target_name: Name of target variable (for display)

    Returns:
        pd.DataFrame: One row per model, one column per metric
    """
    rule = "=" * 80
    print("\n" + rule)
    print(f"SUMMARY OF RESULTS FOR {target_name}")
    print(rule)

    # Transpose so that models become rows and metrics become columns.
    results_df = pd.DataFrame(results).T

    print(f"\n{'':<20s}{'RMSE':>10s}{'MAE':>12s}{'R²':>10s}")
    table_rows = zip(
        results_df.index, results_df['RMSE'], results_df['MAE'], results_df['R²']
    )
    for model_name, rmse, mae, r2 in table_rows:
        print(f"{model_name:<20s}{rmse:>10.6f}{mae:>12.6f}{r2:>10.6f}")

    # Higher is better for R²; lower is better for the two error metrics.
    top_by_r2 = results_df['R²'].idxmax()
    top_by_rmse = results_df['RMSE'].idxmin()
    top_by_mae = results_df['MAE'].idxmin()

    print(f"\nBest Model (by R²): {top_by_r2}")
    print(f"Best Model (by RMSE): {top_by_rmse}")
    print(f"Best Model (by MAE): {top_by_mae}")

    return results_df


def predict_target(df, target_col, test_size=0.4, random_state=42):
    """
    Run the end-to-end pipeline for one target: prepare, split, train, summarize.

    Args:
        df: DataFrame with data
        target_col: Target variable to predict
        test_size: Proportion of data for testing
        random_state: Random seed for reproducibility

    Returns:
        tuple: (results, results_df) - the per-model metrics dict and the
            same metrics as a summary DataFrame
    """
    hash_rule = "#" * 80
    print("\n" + hash_rule)
    print(f"# PREDICTION PIPELINE FOR: {target_col}")
    print(hash_rule)

    # Feature matrix, target vector and the names of the feature columns.
    X, y, feature_cols = prepare_features(df, target_col)

    # Hold out part of the data for evaluation.
    split = train_test_split(X, y, test_size=test_size, random_state=random_state)
    X_train, X_test, y_train, y_test = split
    print(f"\nTrain set: {X_train.shape[0]} samples")
    print(f"Test set: {X_test.shape[0]} samples")

    # Fit all configured models and collect their test metrics.
    results = train_and_evaluate_models(
        X_train, X_test, y_train, y_test, feature_cols, target_col
    )

    return results, summarize_results(results, target_col)

In [12]:
def tune_hyperparameters(X_train, y_train, model_type='random_forest'):
    """
    Perform hyperparameter tuning using GridSearchCV.
    Cross-validation (5-fold) is handled automatically by GridSearchCV.

    Args:
        X_train: Training features
        y_train: Training target
        model_type: Type of model to tune. Supported: 'random_forest',
            'gradient_boosting'. 'xgboost' and 'lightgbm' are planned but
            not implemented yet.

    Returns:
        Fitted GridSearchCV object containing:
            - best_estimator_: The best trained model
            - best_params_: The best parameter combination
            - best_score_: The best cross-validation score (R²)
            - cv_results_: Detailed results for all parameter combinations

    Raises:
        NotImplementedError: If model_type is 'xgboost' or 'lightgbm'.
        ValueError: If model_type is not a known model type.

    Usage:
        grid_search = tune_hyperparameters(X_train, y_train, 'random_forest')
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        y_pred = best_model.predict(X_test)
    """
    param_grids = {
        'random_forest': {
            'n_estimators': [50, 100, 200],
            'max_depth': [10, 20, None],
            'min_samples_split': [2, 5, 10],
        },
        'gradient_boosting': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7],
        },
    }
    planned_model_types = ('xgboost', 'lightgbm')

    # Validate before doing any expensive work.
    if model_type in planned_model_types:
        # TODO(Jesus): add XGBoost / LightGBM grids once those packages are
        # part of the project.
        raise NotImplementedError(
            f"Hyperparameter tuning for '{model_type}' is not implemented yet"
        )
    if model_type not in param_grids:
        raise ValueError(
            f"Unknown model_type '{model_type}'; expected one of "
            f"{sorted(param_grids) + list(planned_model_types)}"
        )

    # Imported locally because the notebook's import cell does not pull it in.
    from sklearn.model_selection import GridSearchCV

    # Same seed as the baseline models so tuned and baseline runs are comparable.
    estimator_classes = {
        'random_forest': RandomForestRegressor,
        'gradient_boosting': GradientBoostingRegressor,
    }
    estimator = estimator_classes[model_type](random_state=42)

    grid_search = GridSearchCV(
        estimator=estimator,
        param_grid=param_grids[model_type],
        cv=5,  # 5-fold cross-validation
        scoring='r2',
        n_jobs=-1,  # Use all CPU cores
        verbose=1,
    )
    grid_search.fit(X_train, y_train)
    return grid_search

## 3. Load and Explore Data

In [13]:
# Load dataset (only need to do this once!) - the cells below reuse `df`,
# so re-running them does not trigger another download.
df = load_nba_data()

Downloading dataset...
Path to dataset files: /Users/ryan/.cache/kagglehub/datasets/eduardopalmieri/nba-player-stats-season-2425/versions/37

Available CSV files: ['database_24_25.csv']

DATASET OVERVIEW

Dataset shape: (16512, 25)

Column names:
['Player', 'Tm', 'Opp', 'Res', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc', 'Data']

First few rows:
          Player   Tm  Opp Res     MP  FG  FGA    FG%  3P  3PA  ...  DRB  TRB  \
0   Jayson Tatum  BOS  NYK   W  30.30  14   18  0.778   8   11  ...    4    4   
1  Anthony Davis  LAL  MIN   W  37.58  11   23  0.478   1    3  ...   13   16   
2  Derrick White  BOS  NYK   W  26.63   8   13  0.615   6   10  ...    3    3   
3   Jrue Holiday  BOS  NYK   W  30.52   7    9  0.778   4    6  ...    2    4   
4  Miles McBride  NYK  BOS   L  25.85   8   10  0.800   4    5  ...    0    0   

   AST  STL  BLK  TOV  PF  PTS  GmSc        Data  
0   10    1    1    1  

## 4. Predict Points Scored (PTS)

Points scored (PTS) is the total number of points a player scored in a single game.

In [14]:
# Run complete pipeline for PTS prediction
# (returns the per-model metrics dict and the summary DataFrame)
pts_results, pts_results_df = predict_target(df, "PTS")


################################################################################
# PREDICTION PIPELINE FOR: PTS
################################################################################

Target variable: PTS
Feature variables (16 total): ['MP', 'FGA', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF']
Final dataset shape: X=(16512, 16), y=(16512,)

Train set: 9907 samples
Test set: 6605 samples

MODEL TRAINING FOR PTS

--------------------------------------------------------------------------------
Model: Linear Regression
--------------------------------------------------------------------------------
RMSE: 2.1743
MAE: 1.5885
R²: 0.9391

Feature Coefficients:
  MP: 0.2254
  FGA: 6.5157
  3P: 4.6443
  3PA: -3.6856
  3P%: -0.0842
  FT: 1.9715
  FTA: 0.2723
  FT%: 0.0838
  ORB: -0.0956
  DRB: 0.1303
  TRB: 0.0647
  AST: -0.0781
  STL: 0.0363
  BLK: 0.0855
  TOV: -0.0170
  PF: -0.0298

----------------------------------------------------

## 7. Next Steps (To Be Completed)

### TODO List for Team:

1. **Exploratory Data Analysis (EDA)** - Assigned to: Angel (Week 1-2)
   - Distribution plots for PTS (player-level)
   - Distribution plots for aggregated team statistics (team-level)
   - Correlation heatmaps (both player and team level)
   - Win vs Loss feature comparisons
   - Temporal trends

2. **Feature Engineering** - Assigned to: Ryan + Momoka (Week 1)
   - Encode categorical variables (Tm, Opp, Res)
   - Create derived features (shooting efficiency, etc.)
   - Stretch goal: Rolling averages for player form
   - Stretch goal: PCA (Dimensionality Reduction)

3. **Team Win Prediction (Classification)** - Assigned to: **Ryan** (Week 1-2)
   - Transform player-level data to team-game level using aggregation
   - Binary classification models (Logistic Regression, Random Forest Classifier, Gradient Boosting Classifier)
   - Evaluate with accuracy, precision, recall, F1-score, ROC curves
   - Compare classification performance across models
   - **Deliverable:** New prediction pipeline for binary classification + results comparison

4. **Hyperparameter Tuning & Additional Modeling** - Assigned to: Jesus (Week 1-2)
   - Implement `tune_hyperparameters()` function with GridSearchCV (cv=5)
   - Add XGBoost and LightGBM for both regression (PTS) and classification (Team Win)
   - Tune hyperparameters for all models (regression and classification)
   - Compare tuned vs baseline models
   - **Deliverable:** Tuned models + comparison table
     
5. **Visualization & Analysis** - Assigned to: Harshil (Week 1-2)
   - Residual plots
   - Feature importance charts
   - Prediction vs actual scatter plots

6. **Documentation** - Assigned to: All (Week 2)
   - Executive summary (Ryan, Momoka)
   - Methodology explanation (Ryan, Jesus)
   - Results interpretation (Ryan, Angel, Harshil)
   - Conclusions and recommendations (All)



## Validity of Team Win Prediction (TODO item 3) with Our Dataset

In [15]:
# Quick verification of data structure: can player-level rows be grouped
# into team-game rows for the team win classification task?
print("Sample of data to check game structure:")
print(df[['Player', 'Tm', 'Opp', 'Data', 'Res']].head(20))

# Check: How many players per team per game?
# Each (team, opponent, date) combination is treated as one team-game.
# NOTE(review): assumes a team never plays the same opponent twice on one date.
players_per_game = df.groupby(['Tm', 'Opp', 'Data']).size()
print(f"\nPlayers per team-game:")
print(players_per_game.describe())
print(f"\nMin players in a team-game: {players_per_game.min()}")
print(f"Max players in a team-game: {players_per_game.max()}")
print(f"Average players in a team-game: {players_per_game.mean():.1f}")

# Check: How many unique games?
# For each date, count the distinct (Tm, Opp) pairs and halve the count.
# NOTE(review): the division by 2 assumes every game appears once from each
# team's perspective (e.g. BOS-NYK and NYK-BOS) - confirm against the data.
unique_games = df.groupby(['Data']).apply(lambda x: len(x[['Tm', 'Opp']].drop_duplicates()) / 2)
print(f"\nTotal unique games in dataset: {unique_games.sum():.0f}")

Sample of data to check game structure:
                      Player   Tm  Opp        Data Res
0               Jayson Tatum  BOS  NYK  2024-10-22   W
1              Anthony Davis  LAL  MIN  2024-10-22   W
2              Derrick White  BOS  NYK  2024-10-22   W
3               Jrue Holiday  BOS  NYK  2024-10-22   W
4              Miles McBride  NYK  BOS  2024-10-22   L
5              Rui Hachimura  LAL  MIN  2024-10-22   W
6               Jaylen Brown  BOS  NYK  2024-10-22   W
7                Rudy Gobert  MIN  LAL  2024-10-22   L
8              Julius Randle  MIN  LAL  2024-10-22   L
9                 Al Horford  BOS  NYK  2024-10-22   W
10             Jalen Brunson  NYK  BOS  2024-10-22   L
11           Anthony Edwards  MIN  LAL  2024-10-22   L
12        Karl-Anthony Towns  NYK  BOS  2024-10-22   L
13             Austin Reaves  LAL  MIN  2024-10-22   W
14                 Josh Hart  NYK  BOS  2024-10-22   L
15  Nickeil Alexander-Walker  MIN  LAL  2024-10-22   L
16              LeBron Ja