In [None]:
# Uncomment Install necessary libraries
# !pip install nba_api

from nba_api.stats.endpoints import leaguegamefinder, teamgamelog, playergamelog
from nba_api.stats.static import teams, players
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import time

# Step 1: Fetch NBA Stats
def fetch_all_game_data():
    """
    Fetches all NBA game data from the API for all teams.
    Returns a DataFrame with historical game data.
    """
    nba_teams = teams.get_teams()
    all_games = []
    for team in nba_teams:
        team_id = team['id']
        gamefinder = leaguegamefinder.LeagueGameFinder(team_id_nullable=team_id)
        team_games = gamefinder.get_data_frames()[0]
        team_games['TEAM_ID'] = team_id
        all_games.append(team_games)
        time.sleep(1)  # Prevent rate limiting
    return pd.concat(all_games, ignore_index=True)

def fetch_team_stats():
    """
    Fetches team stats for all teams.
    Returns a DataFrame with team stats.
    """
    nba_teams = teams.get_teams()
    all_team_stats = []
    for team in nba_teams:
        team_id = team['id']
        stats = teamgamelog.TeamGameLog(team_id=team_id).get_data_frames()[0]
        stats['TEAM_ID'] = team_id
        all_team_stats.append(stats)
        time.sleep(1)
    return pd.concat(all_team_stats, ignore_index=True)

def fetch_player_stats():
    """
    Fetches player stats for all players.
    Returns a DataFrame with player stats.
    """
    nba_players = players.get_players()
    all_player_stats = []
    for player in nba_players[:50]:  # Limit to first 50 players to avoid overloading TODO:maybe remove
        player_id = player['id']
        try:
            stats = playergamelog.PlayerGameLog(player_id=player_id).get_data_frames()[0]
            stats['PLAYER_ID'] = player_id
            all_player_stats.append(stats)
            time.sleep(1)
        except Exception as e:
            print(f"Error fetching player stats for {player['full_name']}: {e}")
    return pd.concat(all_player_stats, ignore_index=True)


print("Fetching game data...")
games = fetch_all_game_data()

print("Fetching team stats...")
team_stats = fetch_team_stats()

print("Fetching player stats...")
player_stats = fetch_player_stats()

Fetching game data...
Fetching team stats...
Fetching player stats...


  return pd.concat(all_player_stats, ignore_index=True)


In [2]:
# Step 2: Preprocess Data

def contains_alpha(col):
    # Only apply to columns of type 'object' (string columns)
    if col.dtype == 'object':
        return col.str.contains(r'[a-zA-Z]', na=False).any()
    return False

def preprocess_data(games, team_stats, player_stats):
    """
    Prepares and merges game, team, and player stats for use with a Tabular Transformer.
    """
    # Merge games with team stats
    data = games.merge(team_stats, left_on=['GAME_ID', 'TEAM_ID'], right_on=['Game_ID', 'Team_ID'], how='left', suffixes=('', '_TEAM'))

    # Aggregate player stats by GAME_ID and TEAM_ID
    """ temporarily removed, TODO: add back to model
    player_stats_grouped = player_stats.groupby(['Game_ID', 'TEAM_ID']).mean().reset_index()
    player_stats_grouped.drop(columns=['PLAYER_ID'], inplace=True)

    # Merge aggregated player stats with games
    data = data.merge(player_stats_grouped, on=['GAME_ID', 'TEAM_ID'], how='left', suffixes=('', '_PLAYER'))
    """
    # Encode target variable (e.g., Win/Loss)
    data['TARGET'] = (data['WL'] == 'W').astype(int)

    # add new features
    data['Home'] = data['MATCHUP'].apply(lambda x: 1 if '@' in x else 0)
    data['Win_Streak'] = data.groupby('TEAM_ID')['W'].rolling(window=5).sum().reset_index(drop=True)
    data['Win_Pct_Last_10'] = data.groupby('TEAM_ID')['W'].rolling(window=10).mean().reset_index(drop=True)

    # Drop unnecessary columns
    columns_to_drop = ['GAME_DATE', 'MATCHUP', 'WL', 'GAME_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME']
    data.drop(columns=[col for col in columns_to_drop if col in data.columns], inplace=True)

    print(type(data))
    # Drop the columns with alphabetic characters
    columns_to_drop2 = [col for col in data.columns if contains_alpha(data[col])]
    print(columns_to_drop2)
    print(type(data))
    if(columns_to_drop2):
      data.drop(columns=[col for col in columns_to_drop2 if col in data.columns], inplace=True)
    print(type(data))

    # Handle missing values
    data.fillna(0, inplace=True)

    return data

# Step 3: Prepare Data for Tabular Transformer
def prepare_data_for_transformer(data):
    """
    Splits the data into train, validation, and test sets and scales numeric features.
    """
    # Split data into features and target
    X = data.drop(columns=['TARGET'])
    y = data['TARGET']

    # Split into train, validation, and test sets
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

    # Scale numeric features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    return X_train, X_val, X_test, y_train, y_val, y_test

In [3]:
""" just for debugging TODO:remove cell
print(games.columns)
print(team_stats.columns)
print(player_stats.columns)
print(games.head())
print(team_stats.head())
print(player_stats.sample())
"""




In [3]:
#preserve original data, work on copies
games_copy = games.copy()
team_stats_copy = team_stats.copy()
player_stats_copy = player_stats.copy()

print("Preprocessing data...")
data = preprocess_data(games_copy, team_stats_copy, player_stats_copy)

"""just for debugging TODO:remove from cell
print(data.columns)
print(data.head())
"""
print("Preparing data for Tabular Transformer...")
X_train, X_val, X_test, y_train, y_val, y_test = prepare_data_for_transformer(data)

print("Data is ready!")
print(f"Train shape: {X_train.shape}, Validation shape: {X_val.shape}, Test shape: {X_test.shape}")

Preprocessing data...
<class 'pandas.core.frame.DataFrame'>
['GAME_DATE_TEAM', 'MATCHUP_TEAM', 'WL_TEAM']
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
Preparing data for Tabular Transformer...
Data is ready!
Train shape: (75216, 50), Validation shape: (16118, 50), Test shape: (16118, 50)
