In [117]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.static import teams
import shap
import matplotlib.pyplot as plt
import seaborn as sns

# Setting a style for seaborn, optional
sns.set_style('whitegrid')

In [118]:
# Get all NBA teams info
nba_teams = teams.get_teams()

# Create a dictionary mapping team abbreviations to their team IDs
team_abbr_to_id = {team['abbreviation']: team['id'] for team in nba_teams}

# Initialize an empty DataFrame to store all game data
all_games = pd.DataFrame()

# Loop through all teams and fetch their games
for team in nba_teams:
    team_id = team['id']
    gamefinder = leaguegamefinder.LeagueGameFinder(team_id_nullable=team_id)
    games = gamefinder.get_data_frames()[0]
    all_games = pd.concat([all_games, games], ignore_index=True)

print(all_games.columns)

Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS'],
      dtype='object')


In [120]:
all_games['GAME_DATE'].max()

'2024-12-17'

In [109]:
# Convert GAME_DATE to datetime
all_games['GAME_DATE'] = pd.to_datetime(all_games['GAME_DATE'])

# Create a win column: 1 if W, else 0
all_games['WIN'] = all_games['WL'].apply(lambda x: 1 if x == 'W' else 0)

# Convert PTS to float
all_games['PTS'] = all_games['PTS'].astype(float)

# Calculate average points per game by team
all_games['Points_Per_Game'] = all_games.groupby('TEAM_ID')['PTS'].transform('mean')

def get_opponent_team_id(matchup, team_abbr_to_id, team_id):
    # Matchup looks like "LAL vs. BOS" or "LAL @ BOS"
    if '@' in matchup:
        opponent_abbr = matchup.split(' @ ')[-1]
    else:
        opponent_abbr = matchup.split(' vs. ')[-1]
    return team_abbr_to_id.get(opponent_abbr, team_id)

# Create OPPONENT_TEAM_ID column
all_games['OPPONENT_TEAM_ID'] = all_games.apply(
    lambda row: get_opponent_team_id(row['MATCHUP'], team_abbr_to_id, row['TEAM_ID']), axis=1
)

# HOME_GAME column: 1 if 'vs.' in matchup, else 0
all_games['HOME_GAME'] = all_games['MATCHUP'].apply(lambda x: 1 if 'vs.' in x else 0)

# LAST_GAME_RESULT: previous game's WIN result for each team
all_games['LAST_GAME_RESULT'] = all_games.groupby('TEAM_ID')['WIN'].shift(1).fillna(0)


In [110]:
le = LabelEncoder()
all_games['TEAM_ID'] = le.fit_transform(all_games['TEAM_ID'])
all_games['OPPONENT_TEAM_ID'] = le.fit_transform(all_games['OPPONENT_TEAM_ID'])


In [111]:
X = all_games[['TEAM_ID', 'OPPONENT_TEAM_ID', 'Points_Per_Game', 'HOME_GAME', 'LAST_GAME_RESULT']]
y = all_games['WIN']

# 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [112]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [113]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.5859327503156714
              precision    recall  f1-score   support

           0       0.58      0.59      0.59     10692
           1       0.59      0.58      0.58     10691

    accuracy                           0.59     21383
   macro avg       0.59      0.59      0.59     21383
weighted avg       0.59      0.59      0.59     21383



In [114]:
feature_importances = pd.DataFrame(model.feature_importances_,
                                   index=X_train.columns,
                                   columns=['importance']).sort_values('importance', ascending=False)
print("Feature Importances:\n", feature_importances)


Feature Importances:
                   importance
OPPONENT_TEAM_ID    0.481924
HOME_GAME           0.292751
Points_Per_Game     0.096894
TEAM_ID             0.086160
LAST_GAME_RESULT    0.042272


In [115]:
team_abbr = 'LAL'
opponent_abbr = 'BOS'
average_points_per_game = 110.5

new_data = pd.DataFrame({
    'TEAM_ID': [le.transform([team_abbr_to_id[team_abbr]])[0]],
    'OPPONENT_TEAM_ID': [le.transform([team_abbr_to_id[opponent_abbr]])[0]],
    'Points_Per_Game': [average_points_per_game],
    'HOME_GAME': [1],            # Assuming a home game for Lakers
    'LAST_GAME_RESULT': [1]      # Let's say they won their last game
})

predictions = model.predict(new_data)
prediction_probabilities = model.predict_proba(new_data)

print("Predictions: ", predictions)
print("Prediction Probabilities: ", prediction_probabilities)


Predictions:  [0]
Prediction Probabilities:  [[0.53105066 0.46894934]]


In [135]:
from nba_api.stats.endpoints import ScoreboardV2
from datetime import date, timedelta

def get_date_str(d):
    return d.strftime('%Y-%m-%d')

today_str = get_date_str(date.today())
tomorrow_str = get_date_str(date.today() + timedelta(days=1))

def get_scheduled_games_for_date(date_str):
    sb = ScoreboardV2(game_date=date_str, league_id='00')
    data = sb.get_dict()
    headers = data['resultSets'][0]['headers']
    print("Headers:", headers)  # Print headers once to identify indices
    games_data = data['resultSets'][0]['rowSet']
    return headers, games_data

headers_today, today_games = get_scheduled_games_for_date(today_str)
headers_tomorrow, tomorrow_games = get_scheduled_games_for_date(tomorrow_str)

future_games = today_games + tomorrow_games

print("Games scheduled today:", len(today_games))
print("Games scheduled tomorrow:", len(tomorrow_games))
print("Total future games to predict:", len(future_games))


Headers: ['GAME_DATE_EST', 'GAME_SEQUENCE', 'GAME_ID', 'GAME_STATUS_ID', 'GAME_STATUS_TEXT', 'GAMECODE', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID', 'SEASON', 'LIVE_PERIOD', 'LIVE_PC_TIME', 'NATL_TV_BROADCASTER_ABBREVIATION', 'HOME_TV_BROADCASTER_ABBREVIATION', 'AWAY_TV_BROADCASTER_ABBREVIATION', 'LIVE_PERIOD_TIME_BCAST', 'ARENA_NAME', 'WH_STATUS', 'WNBA_COMMISSIONER_FLAG']
Headers: ['GAME_DATE_EST', 'GAME_SEQUENCE', 'GAME_ID', 'GAME_STATUS_ID', 'GAME_STATUS_TEXT', 'GAMECODE', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID', 'SEASON', 'LIVE_PERIOD', 'LIVE_PC_TIME', 'NATL_TV_BROADCASTER_ABBREVIATION', 'HOME_TV_BROADCASTER_ABBREVIATION', 'AWAY_TV_BROADCASTER_ABBREVIATION', 'LIVE_PERIOD_TIME_BCAST', 'ARENA_NAME', 'WH_STATUS', 'WNBA_COMMISSIONER_FLAG']
Games scheduled today: 0
Games scheduled tomorrow: 13
Total future games to predict: 13


In [132]:
def get_team_ppg(team_id):
    team_games = all_games[all_games['TEAM_ID'] == team_id]
    if team_games.empty:
        return 100.0  # Default PPG if no data found
    else:
        return team_games['PTS'].mean()

def get_last_game_result(team_id):
    team_games = all_games[all_games['TEAM_ID'] == team_id].sort_values('GAME_DATE')
    if team_games.empty:
        return 0
    else:
        last_win = team_games.iloc[-1]['WIN']
        return last_win  # This is already 0 or 1

def encode_team_id(team_id_value):
    return le.transform([team_id_value])[0]


In [133]:
# Assuming WL column exists in all_games:
all_games['WIN'] = all_games['WL'].apply(lambda x: 1 if x == 'W' else 0)

print("Columns in all_games:", all_games.columns)

# Now you can safely call get_last_game_result
test_team_abbr = "LAL"
test_team_id = team_abbr_to_id.get(test_team_abbr, None)
if test_team_id:
    print("PPG for LAL:", get_team_ppg(test_team_id))
    print("Last Game Result for LAL:", get_last_game_result(test_team_id))
    print("Encoded LAL ID:", encode_team_id(test_team_id))
else:
    print("LAL not found in team_abbr_to_id")


Columns in all_games: Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS', 'WIN'],
      dtype='object')
PPG for LAL: 105.0952023988006
Last Game Result for LAL: 1
Encoded LAL ID: 10


In [141]:
if not future_games:
    print("No upcoming games found for today or tomorrow.")
else:
    predictions_for_future = []

    # Identify column indices for necessary fields
    GAME_ID_idx = headers_today.index('GAME_ID')           # or headers if consistent
    HOME_TEAM_ID_idx = headers_today.index('HOME_TEAM_ID')
    VISITOR_TEAM_ID_idx = headers_today.index('VISITOR_TEAM_ID')
    GAME_DATE_EST_idx = headers_today.index('GAME_DATE_EST')

    # Suppose you have a reverse mapping from team_id_to_abbr
    # If not, you need to create one based on your existing data and dictionary:
    team_id_to_abbr = {v: k for k, v in team_abbr_to_id.items()}

    for game in future_games:
        # Extract values by indices
        game_id = game[GAME_ID_idx]
        home_team_id = game[HOME_TEAM_ID_idx]
        away_team_id = game[VISITOR_TEAM_ID_idx]
        game_date_est = game[GAME_DATE_EST_idx]

        # Convert numeric IDs to abbreviations if you have that mapping
        home_team_abbr = team_id_to_abbr.get(home_team_id, None)
        away_team_abbr = team_id_to_abbr.get(away_team_id, None)

        # If abbreviations are not found, skip this game
        if home_team_abbr is None or away_team_abbr is None:
            continue

        # Predict from the home team's perspective
        encoded_home_id = encode_team_id(home_team_id)
        encoded_away_id = encode_team_id(away_team_id)
        home_ppg = get_team_ppg(home_team_id)
        home_last_result = get_last_game_result(home_team_id)

        home_input = pd.DataFrame({
            'TEAM_ID': [encoded_home_id],
            'OPPONENT_TEAM_ID': [encoded_away_id],
            'Points_Per_Game': [home_ppg],
            'HOME_GAME': [1],
            'LAST_GAME_RESULT': [home_last_result]
        })

        home_pred = model.predict(home_input)
        home_pred_proba = model.predict_proba(home_input)
        home_result_str = "WIN" if home_pred[0] == 1 else "LOSS"

        predictions_for_future.append({
            'gameId': game_id,
            'date': game_date_est,
            'team': home_team_abbr,
            'opponent': away_team_abbr,
            'home': True,
            'prediction': home_result_str,
            'win_probability': home_pred_proba[0][1]
        })

        # Predict from the away team's perspective (optional)
        away_ppg = get_team_ppg(away_team_id)
        away_last_result = get_last_game_result(away_team_id)

        away_input = pd.DataFrame({
            'TEAM_ID': [encoded_away_id],
            'OPPONENT_TEAM_ID': [encoded_home_id],
            'Points_Per_Game': [away_ppg],
            'HOME_GAME': [0],
            'LAST_GAME_RESULT': [away_last_result]
        })

        away_pred = model.predict(away_input)
        away_pred_proba = model.predict_proba(away_input)
        away_result_str = "WIN" if away_pred[0] == 1 else "LOSS"

        predictions_for_future.append({
            'gameId': game_id,
            'date': game_date_est,
            'team': away_team_abbr,
            'opponent': home_team_abbr,
            'home': False,
            'prediction': away_result_str,
            'win_probability': away_pred_proba[0][1]
        })

    # Print predictions
    for pred_data in predictions_for_future:
        print(
            f"Game ID: {pred_data['gameId']} on {pred_data['date']}: "
            f"{pred_data['team']} vs. {pred_data['opponent']} "
            f"(Home={pred_data['home']}): Predicted {pred_data['prediction']} with "
            f"win probability {pred_data['win_probability']:.2f}"
        )


Game ID: 0022400360 on 2024-12-19T00:00:00: DET vs. UTA (Home=True): Predicted LOSS with win probability 0.26
Game ID: 0022400360 on 2024-12-19T00:00:00: UTA vs. DET (Home=False): Predicted LOSS with win probability 0.49
Game ID: 0022400361 on 2024-12-19T00:00:00: ORL vs. OKC (Home=True): Predicted LOSS with win probability 0.21
Game ID: 0022400361 on 2024-12-19T00:00:00: OKC vs. ORL (Home=False): Predicted LOSS with win probability 0.36
Game ID: 0022400362 on 2024-12-19T00:00:00: WAS vs. CHA (Home=True): Predicted WIN with win probability 0.61
Game ID: 0022400362 on 2024-12-19T00:00:00: CHA vs. WAS (Home=False): Predicted LOSS with win probability 0.35
Game ID: 0022400363 on 2024-12-19T00:00:00: BOS vs. CHI (Home=True): Predicted WIN with win probability 0.74
Game ID: 0022400363 on 2024-12-19T00:00:00: CHI vs. BOS (Home=False): Predicted LOSS with win probability 0.43
Game ID: 0022400364 on 2024-12-19T00:00:00: TOR vs. BKN (Home=True): Predicted LOSS with win probability 0.49
Game ID: