In [1]:
import os
import glob
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm


ModuleNotFoundError: No module named 'torch'

In [None]:
# ============ CONFIGURATION ============
# Lookup used to normalise the `Team` column of the loaded spreadsheets.
# Keys that are absent from this table are left untouched by the loaders
# (they call `.map(TEAM_NAME_MAP).fillna(<original column>)`).
TEAM_NAME_MAP = {
    # --- abbreviation -> full franchise name ---
    'ATL': 'Atlanta Hawks',
    'BOS': 'Boston Celtics',
    'BRK': 'Brooklyn Nets',   # both 'BRK' and 'BKN' resolve to the same name
    'BKN': 'Brooklyn Nets',
    'CHA': 'Charlotte Hornets',
    'CHI': 'Chicago Bulls',
    'CLE': 'Cleveland Cavaliers',
    'DAL': 'Dallas Mavericks',
    'DEN': 'Denver Nuggets',
    'DET': 'Detroit Pistons',
    'GSW': 'Golden State Warriors',
    'HOU': 'Houston Rockets',
    'IND': 'Indiana Pacers',
    'LAC': 'Los Angeles Clippers',
    'LAL': 'Los Angeles Lakers',
    'MEM': 'Memphis Grizzlies',
    'MIA': 'Miami Heat',
    'MIL': 'Milwaukee Bucks',
    'MIN': 'Minnesota Timberwolves',
    'NOP': 'New Orleans Pelicans',
    'NYK': 'New York Knicks',
    'OKC': 'Oklahoma City Thunder',
    'ORL': 'Orlando Magic',
    'PHI': 'Philadelphia 76ers',
    'PHX': 'Phoenix Suns',
    'POR': 'Portland Trail Blazers',
    'SAC': 'Sacramento Kings',
    'SAS': 'San Antonio Spurs',
    'TOR': 'Toronto Raptors',
    'UTA': 'Utah Jazz',
    'WAS': 'Washington Wizards',
    # --- full names that map to themselves ---
    **{
        full_name: full_name
        for full_name in (
            'Detroit Pistons',
            'Indiana Pacers',
            'San Antonio Spurs',
            'New Jersey Nets',
            'Dallas Mavericks',
        )
    },
}


In [None]:
def load_player_stats(base_path):
    """Load and concatenate per-player stats from every season workbook.

    Reads each ``*_filtered.xlsx`` file under
    ``<base_path>/Player Stats Regular and Playoff``. The season label is the
    part of the file name before the first underscore.

    Parameters
    ----------
    base_path : str
        Directory that contains the ``Player Stats Regular and Playoff`` folder.

    Returns
    -------
    pandas.DataFrame
        One row per player row in the workbooks, with the columns
        ``Player, Team, Season, G, MP, FG%, 3P%, eFG%, FT%, TRB, AST, STL,
        BLK, TOV, PF, PTS``. Team values found in ``TEAM_NAME_MAP`` are
        replaced by the mapped name; missing values are filled with 0.

    Raises
    ------
    ValueError
        If no matching workbook is found.
    KeyError
        If a workbook lacks one of the expected columns.
    """
    stat_columns = ['Player', 'Team', 'Season', 'G', 'MP', 'FG%', '3P%', 'eFG%', 'FT%',
                    'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']

    pattern = os.path.join(base_path, "Player Stats Regular and Playoff", "*_filtered.xlsx")
    # glob gives no ordering guarantee; sort so the concatenated row order is
    # reproducible. Excel lock files ("~$...") are skipped based on the file
    # name only, so a "~$" in a parent directory cannot hide real data.
    player_files = sorted(
        path for path in glob.glob(pattern)
        if '~$' not in os.path.basename(path)
    )
    if not player_files:
        raise ValueError(f"No player stats files found matching {pattern!r}")

    dfs = []
    for file in player_files:
        season = os.path.basename(file).split('_')[0]
        df = pd.read_excel(file)
        # Unmapped team labels are kept as they appear in the workbook.
        df['Team'] = df['Team'].map(TEAM_NAME_MAP).fillna(df['Team'])
        df['Season'] = season

        # A single fill covers every column: missing shooting percentages are
        # treated as 0% (no attempts), and all other gaps also become 0.
        player_stats = df[stat_columns].fillna(0)
        dfs.append(player_stats)

    return pd.concat(dfs, ignore_index=True)


In [None]:
def load_playoff_stats(base_path):
    """Load the playoff ranking of every team for every season workbook.

    Reads each ``*__playoff_actual_team_stats.xlsx`` file under
    ``<base_path>/Actual Playoff Team Stats``. The season label is the part of
    the file name before the first double underscore. The ``Tm`` and ``Rk``
    columns are renamed to ``Team`` and ``Playoff_Rank``.

    The whole function lives in a single cell: a definition split across cells
    leaves the first half returning ``None`` and the second half invalid.

    Parameters
    ----------
    base_path : str
        Directory that contains the ``Actual Playoff Team Stats`` folder.

    Returns
    -------
    pandas.DataFrame
        Columns ``Team``, ``Playoff_Rank`` and ``Season``. Team values found
        in ``TEAM_NAME_MAP`` are replaced by the mapped name.

    Raises
    ------
    ValueError
        If no matching workbook is found.
    AssertionError
        If a season contains a rank below 1 or a duplicated rank.
    """
    pattern = os.path.join(base_path, "Actual Playoff Team Stats", "*__playoff_actual_team_stats.xlsx")
    # Sorted for a reproducible row order; Excel lock files ("~$...") are
    # skipped, consistent with load_player_stats.
    playoff_files = sorted(
        path for path in glob.glob(pattern)
        if '~$' not in os.path.basename(path)
    )
    if not playoff_files:
        raise ValueError(f"No playoff stats files found matching {pattern!r}")

    dfs = []
    for file in playoff_files:
        season = os.path.basename(file).split('__')[0]
        df = pd.read_excel(file)
        clean_df = df.rename(columns={'Tm': 'Team', 'Rk': 'Playoff_Rank'})[['Team', 'Playoff_Rank']].copy()
        # Unmapped team labels are kept as they appear in the workbook.
        clean_df['Team'] = clean_df['Team'].map(TEAM_NAME_MAP).fillna(clean_df['Team'])
        clean_df['Season'] = season
        dfs.append(clean_df)

    playoff_df = pd.concat(dfs, ignore_index=True)

    # validate playoff rankings season by season
    for season, season_ranks in playoff_df.groupby('Season', sort=False)['Playoff_Rank']:
        assert season_ranks.min() >= 1, f"Invalid rank <1 in {season}"
        assert season_ranks.is_unique, f"Duplicate ranks in {season}"

    return playoff_df


In [None]:
# ============ PREPROCESSING ============
def preprocess_data(player_df, playoff_df):
    """Build scaled per-player and per-team feature arrays with playoff targets.

    Steps:
      1. Aggregate ``player_df`` per (Team, Season) into team-level features.
      2. Inner-join the aggregates with ``playoff_df`` on (Team, Season).
      3. For every joined row, collect that team's player stat rows and
         zero-pad them to the largest roster size.
      4. Standardise player stats and team features with ``StandardScaler``.

    The whole function lives in a single cell so that it is one valid
    definition.

    Parameters
    ----------
    player_df : pandas.DataFrame
        Needs the columns ``Team, Season, G, MP, FG%, 3P%, eFG%, FT%, TRB,
        AST, STL, BLK, TOV, PF, PTS``.
    playoff_df : pandas.DataFrame
        Needs the columns ``Team``, ``Season`` and ``Playoff_Rank``.

    Returns
    -------
    tuple
        ``(player_arrays, team_features, targets, merged)`` where
        ``player_arrays`` has shape ``(n_rows, max_players, 12)``,
        ``team_features`` holds every column of ``merged`` except ``Team``,
        ``Season`` and ``Playoff_Rank`` (standardised), ``targets`` is
        ``merged['Playoff_Rank'].values`` and ``merged`` is the joined frame.

    Raises
    ------
    ValueError
        If the join of team aggregates and playoff data is empty.
    AssertionError
        If NaN values remain in any output.
    """
    # team aggregation with null handling
    def weighted_avg(values, weights):
        mask = weights > 0
        if mask.any():
            return np.average(values[mask], weights=weights[mask])
        return 0  # return 0 if no valid weights

    # Percentages are averaged with games played ('G') as the weight; the
    # "top3" features are the mean of the three largest values in the group
    # (0 when the group has fewer than three rows).
    team_agg = player_df.groupby(['Team', 'Season']).agg({
        'MP': 'sum',
        'FG%': lambda x: weighted_avg(x, player_df.loc[x.index, 'G']),
        '3P%': lambda x: weighted_avg(x, player_df.loc[x.index, 'G']),
        'eFG%': lambda x: weighted_avg(x, player_df.loc[x.index, 'G']),
        'FT%': lambda x: weighted_avg(x, player_df.loc[x.index, 'G']),
        'TRB': 'sum',
        'AST': ['sum', lambda x: x.nlargest(3).mean() if len(x) >= 3 else 0],
        'STL': 'sum',
        'BLK': 'sum',
        'TOV': 'sum',
        'PF': 'sum',
        'PTS': ['sum', lambda x: x.nlargest(3).mean() if len(x) >= 3 else 0],
        'G': ['mean', 'std']
    }).reset_index()

    # flatten multi-index columns (order must match the agg spec above)
    team_agg.columns = [
        'Team', 'Season', 'MP_sum', 'FG%_wt', '3P%_wt', 'eFG%_wt', 'FT%_wt',
        'TRB_sum', 'AST_sum', 'AST_top3', 'STL_sum', 'BLK_sum', 'TOV_sum',
        'PF_sum', 'PTS_sum', 'PTS_top3', 'G_mean', 'G_std'
    ]

    # fill any remaining null values (e.g. G_std of a single-row group)
    team_agg = team_agg.fillna(0)

    # merge with playoff data
    merged = pd.merge(team_agg, playoff_df, on=['Team', 'Season'], how='inner')
    if merged.empty:
        # Without this guard the padding step below fails with an opaque
        # "max() arg is an empty sequence".
        raise ValueError(
            "No team-season rows are shared by the player stats and the playoff data; "
            "check that Team and Season values match in both frames"
        )

    # prepare player arrays: group once instead of scanning player_df per row
    player_cols = ['MP', 'FG%', '3P%', 'eFG%', 'FT%', 'TRB', 'AST',
                   'STL', 'BLK', 'TOV', 'PF', 'PTS']
    players_by_team = {
        key: group[player_cols].values
        for key, group in player_df.groupby(['Team', 'Season'], sort=False)
    }
    player_arrays = [
        players_by_team[(team, season)]
        for team, season in zip(merged['Team'], merged['Season'])
    ]

    # pad player arrays to uniform size
    max_players = max(arr.shape[0] for arr in player_arrays)
    player_arrays = np.stack([
        np.pad(arr, ((0, max_players - arr.shape[0]), (0, 0)),
               mode='constant', constant_values=0)
        for arr in player_arrays
    ])

    # scale player stats with null protection
    # NOTE(review): the zero padding rows take part in the scaler fit, so they
    # influence the mean/std and are generally non-zero after scaling. Confirm
    # that downstream code does not rely on padding being exactly 0.
    player_scaler = StandardScaler()
    original_shape = player_arrays.shape
    player_arrays_reshaped = player_arrays.reshape(-1, original_shape[-1])
    player_arrays_reshaped = np.nan_to_num(player_arrays_reshaped, nan=0)
    player_arrays = player_scaler.fit_transform(player_arrays_reshaped).reshape(original_shape)

    # prepare team features
    team_features = merged.drop(['Team', 'Season', 'Playoff_Rank'], axis=1).values
    team_scaler = StandardScaler()
    team_features = np.nan_to_num(team_features, nan=0)
    team_features = team_scaler.fit_transform(team_features)

    # final validation
    assert not np.isnan(player_arrays).any(), "NaN values in player arrays"
    assert not np.isnan(team_features).any(), "NaN values in team features"
    assert not np.isnan(merged['Playoff_Rank'].values).any(), "NaN values in targets"

    return player_arrays, team_features, merged['Playoff_Rank'].values, merged

In [None]:
class NBADataset(Dataset):
    """Dataset wrapper around pre-computed player arrays, team features and targets.

    All three inputs are converted to ``torch.FloatTensor`` on construction and
    are indexed along their first axis. ``original_indices`` is kept as given;
    when it is omitted, ``np.arange(len(targets))`` is used instead.
    """

    def __init__(self, player_arrays, team_features, targets, original_indices=None):
        """Store the inputs as float tensors and remember the sample indices."""
        if original_indices is None:
            # Default to the positional index of each sample.
            original_indices = np.arange(len(targets))
        self.original_indices = original_indices

        self.player_data = torch.FloatTensor(player_arrays)
        self.team_data = torch.FloatTensor(team_features)
        self.targets = torch.FloatTensor(targets)

    def __len__(self):
        """Return the number of samples (length of the target tensor)."""
        return len(self.targets)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict of its four components."""
        sample = dict(
            player_stats=self.player_data[idx],
            team_features=self.team_data[idx],
            target=self.targets[idx],
            original_idx=self.original_indices[idx],
        )
        return sample
