In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler # Added for scaling later

# Define file paths for your CSVs
per_100_poss_file = '../data/raw/NBA24-25PER100STATS.csv'
advanced_stats_file = '../data/raw/NBA24-25ADVANCEDSTATS.csv'
shooting_stats_file = '../data/raw/NBA24-25SHOOTINGSTATS.csv'

# Load each CSV into a Pandas DataFrame (no skiprows needed for clean files)
try:
    df_per_100 = pd.read_csv(per_100_poss_file)
    df_advanced = pd.read_csv(advanced_stats_file)
    df_shooting = pd.read_csv(shooting_stats_file)

    print("DataFrames loaded successfully. Head of each:")
    print("\nPer 100 Possessions:")
    print(df_per_100.head())
    print("\nAdvanced Stats:")
    print(df_advanced.head())
    print("\nShooting Stats:")
    print(df_shooting.head())

    # Let's check the actual column names
    print("\n=== COLUMN NAMES ===")
    print("Per 100 Possessions columns:")
    print(df_per_100.columns.tolist())
    print("\nAdvanced Stats columns:")
    print(df_advanced.columns.tolist())
    print("\nShooting Stats columns:")
    print(df_shooting.columns.tolist())

except FileNotFoundError as e:
    print(f"Error: One of the CSV files not found. Please ensure they are in the correct directory.")
    print(f"Missing file: {e.filename}")
    exit() # Exit if essential files are missing

# Check if the required columns exist in each DataFrame
required_columns = ['Player', 'Team', 'Age'] # Note: Using 'Player', 'Team' (capitalized as per your data)
for col in required_columns:
    print(f"\nChecking column '{col}':")
    print(f"  Per 100 Possessions: {'✓' if col in df_per_100.columns else '✗'}")
    print(f"  Advanced Stats: {'✓' if col in df_advanced.columns else '✗'}")
    print(f"  Shooting Stats: {'✓' if col in df_shooting.columns else '✗'}")

# Only proceed with data type conversion if columns exist
if 'Age' in df_per_100.columns and 'Age' in df_advanced.columns and 'Age' in df_shooting.columns:
    # Convert Age columns to the same type (float)
    df_per_100['Age'] = df_per_100['Age'].astype(float)
    df_advanced['Age'] = df_advanced['Age'].astype(float)
    df_shooting['Age'] = df_shooting['Age'].astype(float)

    print("\n✓ Age columns converted to float")
else:
    print("\n✗ Age column missing in one or more DataFrames")

if 'Player' in df_per_100.columns and 'Player' in df_advanced.columns and 'Player' in df_shooting.columns:
    # Also ensure Player and Team are strings
    df_per_100['Player'] = df_per_100['Player'].astype(str)
    df_advanced['Player'] = df_advanced['Player'].astype(str)
    df_shooting['Player'] = df_shooting['Player'].astype(str)

    print("✓ Player columns converted to string")
else:
    print("✗ Player column missing in one or more DataFrames")

if 'Team' in df_per_100.columns and 'Team' in df_advanced.columns and 'Team' in df_shooting.columns:
    df_per_100['Team'] = df_per_100['Team'].astype(str)
    df_advanced['Team'] = df_advanced['Team'].astype(str)
    df_shooting['Team'] = df_shooting['Team'].astype(str)

    print("✓ Team columns converted to string")
else:
    print("✗ Team column missing in one or more DataFrames")


# Check data types before merging
print("\n=== DATA TYPES BEFORE MERGING ===")
if 'Age' in df_per_100.columns:
    print("Per 100 Possessions - Age:", df_per_100['Age'].dtype)
if 'Age' in df_advanced.columns:
    print("Advanced Stats - Age:", df_advanced['Age'].dtype)
if 'Age' in df_shooting.columns:
    print("Shooting Stats - Age:", df_shooting['Age'].dtype)


# Now we can merge using the correct column names
# First merge: Per 100 Possessions and Advanced Stats
if all(col in df_per_100.columns for col in ['Player', 'Team', 'Age']) and all(col in df_advanced.columns for col in ['Player', 'Team', 'Age']):
    merged_df = pd.merge(df_per_100, df_advanced, on=['Player', 'Team', 'Age'], how='inner', suffixes=('_per100', '_adv'))
    print("\n✓ First merge completed successfully")

    # Second merge: Add Shooting Stats
    if all(col in merged_df.columns for col in ['Player', 'Team', 'Age']) and all(col in df_shooting.columns for col in ['Player', 'Team', 'Age']):
        final_df = pd.merge(merged_df, df_shooting, on=['Player', 'Team', 'Age'], how='inner', suffixes=('', '_shooting'))
        print("✓ Second merge completed successfully")

        print("\n--- Merged DataFrame Info ---")
        print(final_df.info())
        print("\nFinal Merged DataFrame Head:")
        print(final_df.head())

        # --- NEW CODE BLOCKS START HERE ---

        # --- Initial Data Cleaning and Feature Preparation ---

        # 1. Handle redundant/duplicate columns from merging
        # Identify columns that are duplicates from the suffixes but likely contain the same data
        # Common duplicated columns might be 'Rk', 'G', 'GS', 'MP' etc.
        # Check the columns of 'final_df' after the merge to identify actual duplicates.

        columns_to_drop_after_merge = []
        # Explicitly list some common ones to ensure they are handled, based on typical Basketball-Reference outputs
        # and preferring the non-suffixed or '_per100' version if duplicates exist.
        common_redundant_base_cols = ['Rk', 'G', 'GS', 'MP'] # Add more as you inspect final_df.columns if needed

        for base_col in common_redundant_base_cols:
            if f"{base_col}_adv" in final_df.columns and base_col in final_df.columns:
                columns_to_drop_after_merge.append(f"{base_col}_adv")
            if f"{base_col}_shooting" in final_df.columns and base_col in final_df.columns:
                columns_to_drop_after_merge.append(f"{base_col}_shooting")
            if f"{base_col}_adv" in final_df.columns and f"{base_col}_per100" in final_df.columns:
                columns_to_drop_after_merge.append(f"{base_col}_adv")
            if f"{base_col}_shooting" in final_df.columns and f"{base_col}_per100" in final_df.columns:
                columns_to_drop_after_merge.append(f"{base_col}_shooting")

        # You might have a 'Pos' column. If 'Pos' and 'Pos_adv' both exist, keep 'Pos'.
        if 'Pos_adv' in final_df.columns and 'Pos' in final_df.columns:
            columns_to_drop_after_merge.append('Pos_adv')


        # Remove any duplicates from the list of columns to drop
        columns_to_drop_after_merge = list(set(columns_to_drop_after_merge))

        final_df.drop(columns=columns_to_drop_after_merge, inplace=True, errors='ignore')
        print(f"\nDataFrame shape after dropping redundant columns: {final_df.shape}")
        print("Columns after dropping redundant ones:")
        print(final_df.columns.tolist())


        # 2. Filter out players with insufficient playing time
        min_mp_threshold = 500 # Example: Minimum 500 minutes played for a full season. Adjust if needed.

        # Ensure 'MP' column exists after dropping duplicates.
        # Based on Basketball-Reference, 'MP' (Minutes Played) is usually in the base Per Game/Per 100 table.
        # If your 'MP' became 'MP_per100' and you dropped the original 'MP', adjust the name.
        mp_col_name = 'MP' # Default expected column name
        if 'MP_per100' in final_df.columns:
            mp_col_name = 'MP_per100' # Use the suffixed one if that's what remained

        if mp_col_name not in final_df.columns:
            print(f"Error: Could not find a suitable minutes played column ('MP' or 'MP_per100'). Please check your data.")
            exit()

        final_df = final_df[final_df[mp_col_name] >= min_mp_threshold].copy()
        print(f"\nFiltered to players with >= {min_mp_threshold} minutes. New shape: {final_df.shape}")


        # 3. Handling Missing Values (Imputation)
        print("\n--- Missing Value Check (before imputation) ---")
        # Only show columns with NaNs and their counts
        nan_counts = final_df.isnull().sum()
        print(nan_counts[nan_counts > 0])

        # Impute missing numerical values in `final_df`
        for col in final_df.columns:
            if pd.api.types.is_numeric_dtype(final_df[col]): # Ensure it's a numeric column
                if final_df[col].isnull().any():
                    # Common imputation for percentages/ratios where NaN means 'no attempts' is 0
                    # Common patterns for percentage/ratio columns that might be 0 for NaNs
                    if any(s in col for s in ['%', 'Ar', 'Dist']) or col in ['PER', 'TS%', 'USG%', 'eFG%', 'FTAr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'BPM', 'VORP']: # Broaden this as needed based on your columns
                        final_df[col].fillna(0, inplace=True)
                    else:
                        # For other numerical stats, median is often more robust to outliers than mean
                        median_val = final_df[col].median()
                        final_df[col].fillna(median_val, inplace=True)
                        # print(f"Imputed missing values in '{col}' with median: {median_val}")
            # Handle non-numeric NaNs if any (e.g., in 'Pos' if it's missing for some players, unlikely for NBA stats)
            elif final_df[col].isnull().any():
                final_df[col].fillna('Unknown', inplace=True) # Example: fill with 'Unknown' for categorical 'Pos' if needed

        print("\n--- Missing Value Check (after imputation) ---")
        nan_counts_after = final_df.isnull().sum()
        print(nan_counts_after[nan_counts_after > 0]) # Should ideally be empty


        # --- NEW BLOCK START: Save the merged and cleaned (unscaled) data ---
        # This DataFrame is now ready for feature selection and scaling,
        # but retains all original columns (except dropped duplicates) and unscaled values.
        output_cleaned_file = '../data/processed/nba_2025_player_stats_merged_cleaned.csv' # Adjust path to be relative to notebook
        final_df.to_csv(output_cleaned_file, index=False)
        print(f"\nMerged and cleaned data (unscaled) saved to {output_cleaned_file}")
        # --- NEW BLOCK END ---


        # 4. Select features for clustering
        # This is where you define which columns will be used for K-Means.
        # This list is your 'df_clustering' columns.
        features_for_clustering = [
            # Per 100 Possessions (ensure column names match your final_df after merging)
            # Use the exact column names after your merges and dropping redundant ones.
            # Example: 'PTS', 'AST', 'TRB', 'STL', 'BLK', 'TOV', 'PF'
            # If your 'Pts' is 'PTS_per100', use that. Inspect your final_df.columns.
            # Common Basketball-Reference columns after merges often become:
            'PTS', 'AST', 'TRB', 'STL', 'BLK', 'TOV', 'PF',
            'FG%', '3P%', '2P%', 'FT%', # These might be just 'FG%', '3P%', 'FT%'
            'eFG%', 'TS%', # These are typically from advanced stats
            '3PAr', 'FTr', # Advanced stats
            'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', # Advanced stats percentages
            'OWS', 'DWS', 'WS', 'WS/48', 'BPM', 'VORP', # Advanced stats impact metrics
            # Shooting Stats (ensure column names match)
            'Dist.', # Average Shot Distance
            '2P_FG%', '0-3_FG%', '3-10_FG%', '10-16_FG%', '16-3P_FG%', # FG% by distance
            # You might also find 'Att_3P', 'Att_2P' etc. if they were kept
            # Add or remove based on the actual columns in your final_df and your clustering goals.
        ]

        # Dynamically check and refine features_for_clustering based on available columns
        actual_features = [col for col in features_for_clustering if col in final_df.columns]
        missing_features = [col for col in features_for_clustering if col not in final_df.columns]

        if missing_features:
            print(f"\nWarning: The following desired features are missing and will be excluded from clustering: {missing_features}")
            print("Please check your original CSVs and merge process if these are critical.")

        df_clustering = final_df[actual_features].copy() # This will be the input to scaling

        # Create player_info DataFrame (ensure all needed columns are present in final_df)
        # This should happen *before* scaling df_clustering.
        # Based on your prior error, 'PLAYER', 'TEAM', 'POS', 'AGE' might be 'Player', 'Team', 'Pos', 'Age'
        # Adjust these column names to match what you actually have in final_df
        player_info_cols = ['Player', 'Team', 'Pos', 'Age']
        # Verify these exist in final_df
        existing_player_info_cols = [col for col in player_info_cols if col in final_df.columns]
        if len(existing_player_info_cols) < len(player_info_cols):
            print(f"Warning: Not all player info columns found. Missing: {list(set(player_info_cols) - set(existing_player_info_cols))}. Player info might be incomplete.")
        player_info = final_df[existing_player_info_cols].copy()


        print(f"\nDataFrame for clustering created with shape: {df_clustering.shape}")
        print("Columns for clustering:")
        print(df_clustering.columns.tolist())


        # 5. Feature Scaling
        scaler = StandardScaler()
        scaled_features = scaler.fit_transform(df_clustering)
        df_scaled = pd.DataFrame(scaled_features, columns=df_clustering.columns)

        print("\n--- Scaled Features Head ---")
        print(df_scaled.head())


        # 6. Save Processed Data for Clustering (scaled and player info)
        df_scaled.to_csv('../data/processed/nba_2025_player_stats_scaled_for_clustering.csv', index=False)
        player_info.to_csv('../data/processed/nba_2025_player_info.csv', index=False)
        print("\nCleaned and scaled data for clustering saved.")
        print("Player info saved.")

    else:
        print("✗ Cannot perform second merge - missing required columns")
else:
    print("✗ Cannot perform first merge - missing required columns")


DataFrames loaded successfully. Head of each:

Per 100 Possessions:
  Rk           Player   Age Team Pos     G    GS      MP    FG   FGA  ...  \
0  1    Mikal Bridges  28.0  NYK  SF  82.0  82.0  3036.0   9.7  19.3  ...   
1  2        Josh Hart  29.0  NYK  SG  77.0  77.0  2897.0   6.9  13.2  ...   
2  3  Anthony Edwards  23.0  MIN  SG  79.0  79.0  2871.0  12.4  27.7  ...   
3  4     Devin Booker  28.0  PHO  SG  75.0  75.0  2795.0  11.6  25.1  ...   
4  5     James Harden  35.0  LAC  PG  79.0  79.0  2789.0   9.4  22.9  ...   

    AST  STL  BLK  TOV   PF   PTS   ORtg   DRtg  Awards  Player-additional\  
0   5.0  1.2  0.7  2.2  2.1  23.6  117.0  118.0     NaN          bridgmi01\  
1   7.8  2.0  0.5  2.7  3.4  18.0  125.0  112.0     NaN           hartjo01\  
2   6.2  1.6  0.9  4.3  2.6  37.4  115.0  112.0  ASNBA2          edwaran01\  
3   9.4  1.2  0.3  3.9  3.5  34.0  119.0  123.0     NaN          bookede01\  
4  12.1  2.1  1.0  6.0  2.9  31.8  114.0  110.0  ASNBA3          hardeja01\  



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  final_df[col].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  final_df[col].fillna('Unknown', inplace=True) # Example: fill with 'Unknown' for categorical 'Pos' if needed


In [2]:
# --- Initial Data Cleaning and Feature Selection ---

# First, let's make sure we have the merged DataFrame
# If final_df is not defined, we need to run the previous cell first
try:
    final_df
except NameError:
    print("Error: final_df is not defined. Please run the previous cell first to load and merge the data.")
    print("Make sure to run the cells in order: 1) Load and merge data, 2) Clean and prepare data")
    exit()

# 1. Handle redundant/duplicate columns from merging
# Identify columns that are duplicates (e.g., 'MP_per100', 'MP_adv', 'G_per100', 'G_adv', etc.)
# It's good practice to inspect `final_df.columns` to see what you have.

# Example: Drop redundant minute played columns, keeping one (e.g., from per_100_poss)
columns_to_drop_after_merge = []
for col in final_df.columns:
    if col.endswith('_adv') or col.endswith('_shooting') and col not in ['Player', 'Team', 'Age']: # Updated column names
        original_col_name = col.replace('_adv', '').replace('_shooting', '')
        if original_col_name in final_df.columns and original_col_name != col: # Check if original (un-suffixed) exists
            # We assume the first column (from df_per_100) is the one to keep, drop the others
            columns_to_drop_after_merge.append(col)
        elif original_col_name + '_per100' in final_df.columns and original_col_name != col:
             # If a column like 'MP' from advanced stats is now 'MP_adv' and 'MP_per100' exists, drop 'MP_adv'
            columns_to_drop_after_merge.append(col)


# Common columns that often exist in multiple tables but you only need one version:
common_stats = ['G', 'GS', 'MP'] # Games, Games Started, Minutes Played

for stat in common_stats:
    if f"{stat}_adv" in final_df.columns and f"{stat}_per100" in final_df.columns:
        columns_to_drop_after_merge.append(f"{stat}_adv") # Keep the per100 version
    elif f"{stat}_shooting" in final_df.columns and f"{stat}_per100" in final_df.columns:
         columns_to_drop_after_merge.append(f"{stat}_shooting") # Keep the per100 version


final_df.drop(columns=columns_to_drop_after_merge, inplace=True, errors='ignore') # Use errors='ignore' in case some aren't present


# 2. Filter out players with insufficient playing time
# This is crucial for meaningful archetypes. For a full season, 500-700 minutes is a good lower bound.
# Rookies might have less, so consider your minimum. A common threshold is 15-20 games OR 300-500 minutes.
min_mp_threshold = 500 # Example: Minimum 500 minutes played
final_df = final_df[final_df['MP'] >= min_mp_threshold].copy() # .copy() to avoid SettingWithCopyWarning

print(f"\nFiltered to players with >= {min_mp_threshold} minutes. New shape: {final_df.shape}")

# 3. Select features for clustering
# This is where your basketball knowledge comes in!
# Aim for a diverse set of stats that capture different aspects of play.
# Avoid highly correlated features initially to prevent redundancy (though PCA can handle this later).

# Example feature selection (you'll refine this extensively!)
features_for_clustering = [
    # Per 100 Possessions (rate stats are generally best for clustering)
    'PTS', 'AST', 'TRB', 'STL', 'BLK', 'TOV', 'PF', # Basic volume stats
    'FG%', '3P%', '2P%', 'FT%', # Shooting efficiency
    # Advanced Stats
    'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%',
    'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'BPM', 'VORP',
    # Shooting Stats (some might be redundant with advanced stats, choose carefully)
    'Dist.', # Average Shot Distance
    '2P_FG%', '0-3_FG%', '3-10_FG%', '10-16_FG%', '16-3P_FG%', # FG% by distance
    'Att_3P', 'Att_2P', # Shot attempts (might already be covered by USG% but can be useful)
]

# Ensure all selected features exist in your final DataFrame
# It's common to find some columns might not exist if they were merged with suffixes
# or were only present in one of the original CSVs.
# Remove any features from your list that aren't in final_df.columns
actual_features = [col for col in features_for_clustering if col in final_df.columns]
missing_features = [col for col in features_for_clustering if col not in final_df.columns]

if missing_features:
    print(f"\nWarning: The following desired features are missing and will be excluded: {missing_features}")

df_clustering = final_df[actual_features].copy()

# Add Player and Team for later interpretation (but exclude from clustering features)
player_info = final_df[['Player', 'Team', 'Pos', 'Age']].copy() # Updated column names

print(f"\nDataFrame for clustering created with shape: {df_clustering.shape}")
print("Columns for clustering:")
print(df_clustering.columns.tolist())


Filtered to players with >= 500 minutes. New shape: (436, 82)


DataFrame for clustering created with shape: (436, 30)
Columns for clustering:
['PTS', 'AST', 'TRB', 'STL', 'BLK', 'TOV', 'PF', 'FG%', '3P%', '2P%', 'FT%', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'BPM', 'VORP', 'Dist.']


In [3]:
# --- Handle Missing Values ---
print("\n--- Missing Value Check (before imputation) ---")
print(df_clustering.isnull().sum())

# Strategy: Impute missing numerical values.
# For percentages/ratios: often 0 makes sense if it's truly a "no attempts" scenario.
# For other stats: mean or median imputation can be used.
# A robust approach is to check each column individually.

for col in df_clustering.columns:
    if df_clustering[col].isnull().any():
        # A common imputation for stats where NaN means 'no attempts' is 0
        if 'FG%' in col or '3P%' in col or 'FT%' in col or 'Ar' in col or 'Dist' in col: # Check for percentage/attempt ratio columns
            df_clustering[col].fillna(0, inplace=True)
        else:
            # For other numerical stats, median is often more robust to outliers than mean
            median_val = df_clustering[col].median()
            df_clustering[col].fillna(median_val, inplace=True)
            print(f"Imputed missing values in '{col}' with median: {median_val}")

print("\n--- Missing Value Check (after imputation) ---")
print(df_clustering.isnull().sum())


--- Missing Value Check (before imputation) ---
PTS      0
AST      0
TRB      0
STL      0
BLK      0
TOV      0
PF       0
FG%      0
3P%      0
2P%      0
FT%      0
PER      0
TS%      0
3PAr     0
FTr      0
ORB%     0
DRB%     0
TRB%     0
AST%     0
STL%     0
BLK%     0
TOV%     0
USG%     0
OWS      0
DWS      0
WS       0
WS/48    0
BPM      0
VORP     0
Dist.    0
dtype: int64

--- Missing Value Check (after imputation) ---
PTS      0
AST      0
TRB      0
STL      0
BLK      0
TOV      0
PF       0
FG%      0
3P%      0
2P%      0
FT%      0
PER      0
TS%      0
3PAr     0
FTr      0
ORB%     0
DRB%     0
TRB%     0
AST%     0
STL%     0
BLK%     0
TOV%     0
USG%     0
OWS      0
DWS      0
WS       0
WS/48    0
BPM      0
VORP     0
Dist.    0
dtype: int64


In [4]:
from sklearn.preprocessing import StandardScaler

# --- Feature Scaling ---
# StandardScaler (Z-score normalization) is generally preferred for K-Means.
# It transforms data to have a mean of 0 and a standard deviation of 1.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df_clustering)

# Convert back to a DataFrame for easier handling, keeping column names
df_scaled = pd.DataFrame(scaled_features, columns=df_clustering.columns)

print("\n--- Scaled Features Head ---")
print(df_scaled.head())


--- Scaled Features Head ---
        PTS       AST       TRB       STL       BLK       TOV        PF  \
0  0.219874 -0.103204 -1.177485 -0.811858 -0.359945 -0.458698 -1.571965   
1 -0.603001  0.944976  0.917143  0.589310 -0.614092 -0.001050 -0.423486   
2  2.247671  0.346016 -0.304723 -0.111274 -0.105797  1.463427 -1.130242   
3  1.748069  1.543936 -0.878252 -0.811858 -0.868239  1.097307 -0.335142   
4  1.424797  2.554680 -0.204979  0.764456  0.021276  3.019432 -0.865209   

        FG%       3P%       2P%  ...      BLK%      TOV%      USG%       OWS  \
0  0.465977  0.247549  0.831608  ... -0.396690 -0.691456  0.052407  1.143497   
1  0.839114  0.040330  1.198810  ... -0.661859  0.818912 -0.713998  2.069792   
2 -0.325074  0.652118 -0.591299  ... -0.131520 -0.092517  2.155567  1.633889   
3 -0.116118  0.030463  0.036004  ... -0.927029 -0.118557  1.781276  2.451208   
4 -0.877318  0.227814 -1.035001  ...  0.067357  1.469933  1.834746  1.306961   

        DWS        WS     WS/48       

In [5]:
# Save the cleaned and scaled data for clustering
df_scaled.to_csv('../data/raw/nba_2025_player_stats_scaled_for_clustering.csv', index=False)
player_info.to_csv('../data/raw/nba_2025_player_info.csv', index=False) # Keep player info separate
print("\nCleaned and scaled data saved as 'nba_2025_player_stats_scaled_for_clustering.csv'")
print("Player info saved as 'nba_2025_player_info.csv'")


Cleaned and scaled data saved as 'nba_2025_player_stats_scaled_for_clustering.csv'
Player info saved as 'nba_2025_player_info.csv'
