In [None]:
#TODO: remove all unneccessary debugand prints
# Install necessary libraries
!pip install nba_api

from nba_api.stats.endpoints import leaguegamefinder, teamgamelog, playergamelog
from nba_api.stats.static import teams, players
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import time

# Step 1: Fetch NBA Stats
def fetch_all_game_data():
    """
    Fetches all NBA game data from the API for all teams.
    Returns a DataFrame with historical game data.
    """
    nba_teams = teams.get_teams()
    all_games = []
    for team in nba_teams:
        team_id = team['id']
        gamefinder = leaguegamefinder.LeagueGameFinder(team_id_nullable=team_id)
        team_games = gamefinder.get_data_frames()[0]
        team_games['TEAM_ID'] = team_id
        all_games.append(team_games)
        time.sleep(1)  # Prevent rate limiting
    return pd.concat(all_games, ignore_index=True)

def fetch_team_stats():
    """
    Fetches team stats for all teams.
    Returns a DataFrame with team stats.
    """
    nba_teams = teams.get_teams()
    all_team_stats = []
    for team in nba_teams:
        team_id = team['id']
        stats = teamgamelog.TeamGameLog(team_id=team_id).get_data_frames()[0]
        stats['TEAM_ID'] = team_id
        all_team_stats.append(stats)
        time.sleep(1)
    return pd.concat(all_team_stats, ignore_index=True)

def fetch_player_stats():
    """
    Fetches player stats for all players.
    Returns a DataFrame with player stats.
    """
    nba_players = players.get_players()
    all_player_stats = []
    for player in nba_players[:50]:  # Limit to first 50 players to avoid overloading TODO:maybe remove
        player_id = player['id']
        try:
            stats = playergamelog.PlayerGameLog(player_id=player_id).get_data_frames()[0]
            stats['PLAYER_ID'] = player_id
            all_player_stats.append(stats)
            time.sleep(1)
        except Exception as e:
            print(f"Error fetching player stats for {player['full_name']}: {e}")
    return pd.concat(all_player_stats, ignore_index=True)


print("Fetching game data...")
games = fetch_all_game_data()

print("Fetching team stats...")
team_stats = fetch_team_stats()

print("Fetching player stats...")
player_stats = fetch_player_stats()

Collecting nba_api
  Downloading nba_api-1.7.0-py3-none-any.whl.metadata (5.5 kB)
Downloading nba_api-1.7.0-py3-none-any.whl (280 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/280.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.2/280.2 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nba_api
Successfully installed nba_api-1.7.0
Fetching game data...
Fetching team stats...
Fetching player stats...


  return pd.concat(all_player_stats, ignore_index=True)


In [3]:
# Step 2: Preprocess Data

# helper function that checks if column includes any alphabetical characters
def contains_alpha(col):
    # Only apply to columns of type 'object' (string columns)
    if col.dtype == 'object':
        return col.str.contains(r'[a-zA-Z]', na=False).any()
    return False

# data preperation function that combines the columns into one dataframe, creates new features and deletes unwanted features
def preprocess_data(games, team_stats, player_stats):
    """
    Prepares and merges game, team, and player stats for use with a Tabular Transformer.
    """
    # Merge games with team stats
    data = games.merge(team_stats, left_on=['GAME_ID', 'TEAM_ID'], right_on=['Game_ID', 'Team_ID'], how='left', suffixes=('', '_TEAM'))

    # Aggregate player stats by GAME_ID and TEAM_ID
    """ temporarily removed, TODO: add back to model
    player_stats_grouped = player_stats.groupby(['Game_ID', 'TEAM_ID']).mean().reset_index()
    player_stats_grouped.drop(columns=['PLAYER_ID'], inplace=True)

    # Merge aggregated player stats with games
    data = data.merge(player_stats_grouped, on=['GAME_ID', 'TEAM_ID'], how='left', suffixes=('', '_PLAYER'))
    """
    # Encode target variable (e.g., Win/Loss)
    data['TARGET'] = (data['WL'] == 'W').astype(int)

    # add new features
    data['Home'] = data['MATCHUP'].apply(lambda x: 1 if '@' in x else 0)
    data['Win_Streak'] = data.groupby('TEAM_ID')['W'].rolling(window=5).sum().reset_index(drop=True)
    data['Win_Pct_Last_10'] = data.groupby('TEAM_ID')['W'].rolling(window=10).mean().reset_index(drop=True)

    # Drop unnecessary columns
    columns_to_drop = ['GAME_DATE', 'MATCHUP', 'WL', 'GAME_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME']
    data.drop(columns=[col for col in columns_to_drop if col in data.columns], inplace=True)

    print(type(data))
    # Drop the columns with alphabetic characters
    columns_to_drop2 = [col for col in data.columns if contains_alpha(data[col])]
    print(columns_to_drop2)
    print(type(data))
    if(columns_to_drop2):
      data.drop(columns=[col for col in columns_to_drop2 if col in data.columns], inplace=True)
    print(type(data))

    # Handle missing values
    data.fillna(0, inplace=True)

    return data

# Step 3: Prepare Data for Tabular Transformer, split to train/val/test, normalize values
def prepare_data_for_transformer(data):
    """
    Splits the data into train, validation, and test sets and scales numeric features.
    """
    # Split data into features and target
    X = data
    y = data['TARGET']

    # Split into train, validation, and test sets
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

    # Define features for Standard and MinMax normalization
    standard_features = ['MIN', 'PTS', 'FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA', 'OREB', 'DREB', 'REB',
                         'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS', 'MIN_TEAM', 'FGM_TEAM', 'FGA_TEAM',
                         'FG3M_TEAM', 'FG3A_TEAM', 'FTM_TEAM', 'FTA_TEAM', 'OREB_TEAM', 'DREB_TEAM', 'REB_TEAM',
                         'AST_TEAM', 'STL_TEAM', 'BLK_TEAM', 'TOV_TEAM', 'PF_TEAM', 'PTS_TEAM', 'Win_Streak']

    minmax_features = ['FG_PCT', 'FG3_PCT', 'FT_PCT', 'Home', 'W', 'L', 'W_PCT', 'FG_PCT_TEAM',
                       'FG3_PCT_TEAM', 'FT_PCT_TEAM', 'Win_Pct_Last_10']

    print(type(X_train))
    # Standard Scaling
    standard_scaler = StandardScaler()
    X_train[standard_features] = standard_scaler.fit_transform(X_train[standard_features])
    X_val[standard_features] = standard_scaler.transform(X_val[standard_features])
    X_test[standard_features] = standard_scaler.transform(X_test[standard_features])

    print(type(X_train))
    # MinMax Scaling
    minmax_scaler = MinMaxScaler()
    X_train[minmax_features] = minmax_scaler.fit_transform(X_train[minmax_features])
    X_val[minmax_features] = minmax_scaler.transform(X_val[minmax_features])
    X_test[minmax_features] = minmax_scaler.transform(X_test[minmax_features])
    print(type(X_train))
    return X_train, X_val, X_test, y_train, y_val, y_test

In [8]:
""" just for debugging TODO:remove cell
print(games.columns)
print(team_stats.columns)
print(player_stats.columns)
print(games.head())
print(team_stats.head())
print(player_stats.sample())
"""

' just for debugging TODO:remove cell\nprint(games.columns)\nprint(team_stats.columns)\nprint(player_stats.columns)\nprint(games.head())\nprint(team_stats.head())\nprint(player_stats.sample())\n'

In [None]:
#preserve original data, work on copies
games_copy = games.copy()
team_stats_copy = team_stats.copy()
player_stats_copy = player_stats.copy()

print("Preprocessing data...")
data = preprocess_data(games_copy, team_stats_copy, player_stats_copy)

"""
just for debugging TODO:remove from cell
print("data columns")
print(data.columns)
print("data head()")
print(data.head())
"""

print("Preparing data for Tabular Transformer...")
X_train, X_val, X_test, y_train, y_val, y_test = prepare_data_for_transformer(data)
print(X_train)#TODO:remove
print("Data is ready!")
print(f"Train shape: {X_train.shape}, Validation shape: {X_val.shape}, Test shape: {X_test.shape}")


"""just for debugging TODO:remove from cell
print("X_train columns")
print(X_train.columns)
print("X_val columns")
print(X_val.columns)
print("X_test columns")
print(X_test.columns)
print("X_train head()")
print(X_train.head())
print("X_val head()")
print(X_val.head())
print("X_test head()")
print(X_test.head())
print("y_train head()")
print(y_train.head())
print("y_val head()")
print(y_val.head())
print("y_test head()")
print(y_test.head())
"""

Preprocessing data...
<class 'pandas.core.frame.DataFrame'>
['GAME_DATE_TEAM', 'MATCHUP_TEAM', 'WL_TEAM']
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
Preparing data for Tabular Transformer...
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
       SEASON_ID     TEAM_ID       MIN       PTS       FGM       FGA  \
39452      21991  1610612747 -0.033652 -0.164260  0.266745 -0.283549   
102982     22000  1610612765 -0.033652  0.545335  0.431987 -0.058440   
76543      22023  1610612758  2.000504  1.325890  0.597229  1.179659   
16752      21990  1610612741 -0.033652  0.545335  0.927713 -0.283549   
93582      41991  1610612762 -0.033652 -1.015775 -1.220433 -2.084420   
...          ...         ...       ...       ...       ...       ...   
102486     22005  1610612765 -0.115019 -0.093301  0.101503  0.391778   
75696      21991  1610612757 -0.033652  1.325890  0.266745 -0.396103   
19822      220

'just for debugging TODO:remove from cell\nprint("X_train columns")\nprint(X_train.columns)\nprint("X_val columns")\nprint(X_val.columns)\nprint("X_test columns")\nprint(X_test.columns)\nprint("X_train head()")\nprint(X_train.head())\nprint("X_val head()")\nprint(X_val.head())\nprint("X_test head()")\nprint(X_test.head())\nprint("y_train head()")\nprint(y_train.head())\nprint("y_val head()")\nprint(y_val.head())\nprint("y_test head()")\nprint(y_test.head())\n'

In [6]:
# Uncomment to install necessary libraries
#!pip install optuna pytorch-tabular

import optuna
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig
from pytorch_tabular.config import DataConfig, TrainerConfig, OptimizerConfig
from sklearn.metrics import accuracy_score

# Step 1: Define the Objective Function
def objective(trial):
    # Hyperparameters to optimize
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
    dropout = trial.suggest_uniform('dropout', 0.1, 0.5)
    hidden_layers = trial.suggest_int('hidden_layers', 1, 4)
    hidden_dim = trial.suggest_int('hidden_dim', 32, 256)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128])

    # Data Configuration
    data_config = DataConfig(
        target=['TARGET'],
        continuous_cols=[col for col in data.columns if col != 'TARGET'],
        categorical_cols=[],
    )

    # Model Configuration
    model_config = CategoryEmbeddingModelConfig(
        task="classification",
        layers="-".join([str(hidden_dim)] * hidden_layers),
        activation="ReLU",
        dropout=dropout,
    )

    # Trainer Configuration
    trainer_config = TrainerConfig(
        auto_lr_find=False,
        batch_size=batch_size,
        max_epochs=10,#originally was 20
    )

    # Optimizer Configuration
    optimizer_config = OptimizerConfig(
        optimizer="Adam",
    )

    # Initialize the Tabular Model
    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config
    )

    # Train the model
    tabular_model.fit(train=X_train, validation=X_val)

    # Predict and evaluate
    preds = tabular_model.predict(X_val)
    print(preds.columns)
    accuracy = accuracy_score(y_val, preds['TARGET_prediction'])

    return accuracy

# Step 2: Create Study and Optimize
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=2)#was 50 originally

# Step 3: Train Final Model with Best Parameters
best_params = study.best_params
print("Best Parameters:", best_params)

data_config = DataConfig(
        target=['TARGET'],
        continuous_cols=[col for col in data.columns if col != 'TARGET'],
        categorical_cols=[],
    )

final_model_config = CategoryEmbeddingModelConfig(
    task="classification",
    layers="-".join([str(best_params['hidden_dim'])] * best_params['hidden_layers']),
    activation="ReLU",
    dropout=best_params['dropout'],
)

final_trainer_config = TrainerConfig(
    auto_lr_find=False,
    batch_size=best_params['batch_size'],
    max_epochs=30,
)

final_optimizer_config = OptimizerConfig(
    optimizer="Adam",
    #learning_rate=best_params['learning_rate']
)

final_model = TabularModel(
    data_config=data_config,
    model_config=final_model_config,
    optimizer_config=final_optimizer_config,
    trainer_config=final_trainer_config
)

final_model.fit(train=X_train, validation=X_val)

# Step 4: Evaluate on Test Set
final_preds = final_model.predict(X_test)
final_accuracy = accuracy_score(y_test, final_preds['TARGET_prediction'])

print(f"Final Test Accuracy: {final_accuracy}")

[I 2025-02-02 10:25:29,942] A new study created in memory with name: no-name-6cdb0fa6-c862-4c3d-b836-5fd883c3865f
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
  dropout = trial.suggest_uniform('dropout', 0.1, 0.5)
INFO:pytorch_tabular.tabular_model:Experiment Tracking is turned off
INFO:lightning_fabric.utilities.seed:Seed set to 42
INFO:pytorch_tabular.tabular_model:Preparing the DataLoaders
INFO:pytorch_tabular.tabular_datamodule:Setting up the datamodule for classification task
 -0.0428363 ]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  data.loc[:, self.config.continuous_cols] = self.scaler.fit_transform(
 -0.62592182]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  data.loc[:, self.config.continuous_cols] = self.scaler.transform(data.loc[:, self.config.continuous_cols])
INFO:pytorch_tabular.tabular_model:Preparing the Model: CategoryEmbeddingModel
INFO:pytorch_tabular.ta

Output()

INFO:pytorch_tabular.tabular_model:Training the model completed
INFO:pytorch_tabular.tabular_model:Loading the best model
  return torch.load(f, map_location=map_location)
 -0.62592182]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  data.loc[:, self.config.continuous_cols] = self.scaler.transform(data.loc[:, self.config.continuous_cols])
[I 2025-02-02 10:35:41,943] Trial 0 finished with value: 0.9463964801388114 and parameters: {'learning_rate': 0.014623791481441733, 'dropout': 0.3926569304175621, 'hidden_layers': 2, 'hidden_dim': 142, 'batch_size': 32}. Best is trial 0 with value: 0.9463964801388114.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
  dropout = trial.suggest_uniform('dropout', 0.1, 0.5)
INFO:pytorch_tabular.tabular_model:Experiment Tracking is turned off
INFO:lightning_fabric.utilities.seed:Seed set to 42
INFO:pytorch_tabular.tabular_model:Preparing the DataLoaders
INFO:pytorch_tabular.tabular_datamodule

Index(['TARGET_0_probability', 'TARGET_1_probability', 'TARGET_prediction'], dtype='object')


 -0.0428363 ]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  data.loc[:, self.config.continuous_cols] = self.scaler.fit_transform(
 -0.62592182]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  data.loc[:, self.config.continuous_cols] = self.scaler.transform(data.loc[:, self.config.continuous_cols])
INFO:pytorch_tabular.tabular_model:Preparing the Model: CategoryEmbeddingModel
INFO:pytorch_tabular.tabular_model:Preparing the Trainer
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_tabular.tabular_model:Training Started
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /content/saved_models exists and is not empty.


Output()

INFO:pytorch_tabular.tabular_model:Training the model completed
INFO:pytorch_tabular.tabular_model:Loading the best model
  return torch.load(f, map_location=map_location)
 -0.62592182]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  data.loc[:, self.config.continuous_cols] = self.scaler.transform(data.loc[:, self.config.continuous_cols])
[I 2025-02-02 10:45:17,913] Trial 1 finished with value: 0.9518497862056144 and parameters: {'learning_rate': 0.0024417364316333913, 'dropout': 0.22804378369495684, 'hidden_layers': 4, 'hidden_dim': 71, 'batch_size': 64}. Best is trial 1 with value: 0.9518497862056144.


Index(['TARGET_0_probability', 'TARGET_1_probability', 'TARGET_prediction'], dtype='object')
Best Parameters: {'learning_rate': 0.0024417364316333913, 'dropout': 0.22804378369495684, 'hidden_layers': 4, 'hidden_dim': 71, 'batch_size': 64}


NameError: name 'data_config' is not defined