In [2]:
import pandas as pd
import numpy as np
import json
import time
import requests

from tabulate import tabulate
from datetime import datetime, timezone
from dateutil import parser

from nba_api.stats.static import teams
from nba_api.stats.endpoints import LeagueGameLog
from nba_api.live.nba.endpoints import scoreboard

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import mean_squared_error, accuracy_score
import pickle

In [3]:
# Get a list of all NBA teams
all_teams = teams.get_teams()

# Function to print list of teams
def print_teams(team_list):
    print(tabulate(team_list, headers='keys', tablefmt='grid'))

print_teams(all_teams)

+------------+------------------------+----------------+---------------+---------------+----------------------+----------------+
|         id | full_name              | abbreviation   | nickname      | city          | state                |   year_founded |
| 1610612737 | Atlanta Hawks          | ATL            | Hawks         | Atlanta       | Georgia              |           1949 |
+------------+------------------------+----------------+---------------+---------------+----------------------+----------------+
| 1610612738 | Boston Celtics         | BOS            | Celtics       | Boston        | Massachusetts        |           1946 |
+------------+------------------------+----------------+---------------+---------------+----------------------+----------------+
| 1610612739 | Cleveland Cavaliers    | CLE            | Cavaliers     | Cleveland     | Ohio                 |           1970 |
+------------+------------------------+----------------+---------------+---------------+---------

In [4]:
# Query NBA scoreboard and list games in local time zone
def get_games():
    board = scoreboard.ScoreBoard()
    print("ScoreBoardDate: " + board.score_board_date)
    games = board.games.get_dict()
    for game in games:
        gameTimeLTZ = parser.parse(game["gameTimeUTC"]).replace(tzinfo=timezone.utc).astimezone(tz=None)

        print(f"{game['gameId']}: {game['awayTeam']['teamName']} (ID: {game['awayTeam']['teamId']}) vs. {game['homeTeam']['teamName']} (ID: {game['homeTeam']['teamId']}) @ {gameTimeLTZ}")

get_games()

ScoreBoardDate: 2024-03-10
0022300924: Bucks (ID: 1610612749) vs. Clippers (ID: 1610612746) @ 2024-03-10 22:00:00+03:00
0022300925: Pelicans (ID: 1610612740) vs. Hawks (ID: 1610612737) @ 2024-03-11 01:00:00+03:00
0022300926: Wizards (ID: 1610612764) vs. Heat (ID: 1610612748) @ 2024-03-11 01:00:00+03:00
0022300928: Pacers (ID: 1610612754) vs. Magic (ID: 1610612753) @ 2024-03-11 01:00:00+03:00
0022300929: Rockets (ID: 1610612745) vs. Kings (ID: 1610612758) @ 2024-03-11 01:00:00+03:00
0022300927: 76ers (ID: 1610612755) vs. Knicks (ID: 1610612752) @ 2024-03-11 02:00:00+03:00
0022300930: Nets (ID: 1610612751) vs. Cavaliers (ID: 1610612739) @ 2024-03-11 02:00:00+03:00
0022300931: Grizzlies (ID: 1610612763) vs. Thunder (ID: 1610612760) @ 2024-03-11 02:00:00+03:00
0022300932: Timberwolves (ID: 1610612750) vs. Lakers (ID: 1610612747) @ 2024-03-11 04:30:00+03:00


In [5]:
# Specify the season you're interested in
season = '2023-24'

# Request the game logs for the specified season
game_log = LeagueGameLog(season=season)

# Get the data
game_log_data = game_log.get_data_frames()[0]

# Drop irrelevant columns
columns_to_drop = ['SEASON_ID', 'GAME_ID', 'GAME_DATE', 'VIDEO_AVAILABLE', 'MIN', 'MATCHUP', 'TEAM_NAME', 'TEAM_ABBREVIATION']
game_log_data.drop(columns=columns_to_drop, inplace=True)

# Print game log data
print(tabulate(game_log_data.tail(10), headers='keys', tablefmt='grid'))

+------+------------+------+-------+-------+----------+--------+--------+-----------+-------+-------+----------+--------+--------+-------+-------+-------+-------+-------+------+-------+--------------+
|      |    TEAM_ID | WL   |   FGM |   FGA |   FG_PCT |   FG3M |   FG3A |   FG3_PCT |   FTM |   FTA |   FT_PCT |   OREB |   DREB |   REB |   AST |   STL |   BLK |   TOV |   PF |   PTS |   PLUS_MINUS |
| 1912 | 1610612753 | L    |    33 |    88 |    0.375 |      9 |     36 |     0.25  |    22 |    29 |    0.759 |     13 |     33 |    46 |    18 |     7 |     3 |    13 |   27 |    97 |          -14 |
+------+------------+------+-------+-------+----------+--------+--------+-----------+-------+-------+----------+--------+--------+-------+-------+-------+-------+-------+------+-------+--------------+
| 1913 | 1610612745 | W    |    39 |    84 |    0.464 |     12 |     29 |     0.414 |    22 |    27 |    0.815 |      6 |     36 |    42 |    23 |     8 |     1 |    12 |   18 |   112 |           

In [6]:
# Group game_log_data by TEAM_ID and aggregate statistics
team_stats = game_log_data.groupby('TEAM_ID').agg({
    'FGM': 'sum',
    'FGA': 'sum',
    'FG_PCT': 'mean',
    'FG3M': 'sum',
    'FG3A': 'sum',
    'FG3_PCT': 'mean',
    'FTM': 'sum',
    'FTA': 'sum',
    'FT_PCT': 'mean',
    'OREB': 'sum',
    'DREB': 'sum',
    'REB': 'sum',
    'AST': 'sum',
    'STL': 'sum',
    'BLK': 'sum',
    'TOV': 'sum',
    'PF': 'sum',
    'PTS': 'sum',
    'PLUS_MINUS': 'sum'
})

# Print aggregated statistics
print(tabulate(team_stats, headers='keys', tablefmt='grid'))

+------------+-------+-------+----------+--------+--------+-----------+-------+-------+----------+--------+--------+-------+-------+-------+-------+-------+------+-------+--------------+
|    TEAM_ID |   FGM |   FGA |   FG_PCT |   FG3M |   FG3A |   FG3_PCT |   FTM |   FTA |   FT_PCT |   OREB |   DREB |   REB |   AST |   STL |   BLK |   TOV |   PF |   PTS |   PLUS_MINUS |
| 1610612737 |  2766 |  5955 | 0.465438 |    864 |   2401 |  0.359938 |  1260 |  1561 | 0.807656 |    828 |   2057 |  2885 |  1670 |   474 |   291 |   855 | 1178 |  7656 |         -117 |
+------------+-------+-------+----------+--------+--------+-----------+-------+-------+----------+--------+--------+-------+-------+-------+-------+-------+------+-------+--------------+
| 1610612738 |  2752 |  5673 | 0.486095 |   1023 |   2658 |  0.384413 |  1080 |  1334 | 0.809524 |    668 |   2303 |  2971 |  1662 |   408 |   410 |   779 | 1063 |  7607 |          688 |
+------------+-------+-------+----------+--------+--------+------

In [7]:
# Prepare data for classification
X_classification = game_log_data.drop(columns=['WL'])
y_classification = game_log_data['WL']
label_encoder = LabelEncoder()
y_classification_encoded = label_encoder.fit_transform(y_classification)
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(X_classification, y_classification_encoded, test_size=0.2, random_state=42)
classifier = XGBClassifier()
classifier.fit(X_train_cls, y_train_cls)
y_pred_cls = classifier.predict(X_test_cls)
accuracy_cls = accuracy_score(y_test_cls, y_pred_cls)
print("Classification Accuracy:", accuracy_cls)

Classification Accuracy: 1.0


In [8]:
# Prepare data for regression
X_regression = game_log_data.drop(columns=['PTS', 'WL'])
y_regression = game_log_data['PTS']
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_regression, y_regression, test_size=0.2, random_state=42)
regressor = XGBRegressor()
regressor.fit(X_train_reg, y_train_reg)
y_pred_reg = regressor.predict(X_test_reg)
mse = mean_squared_error(y_test_reg, y_pred_reg)
print("Mean Squared Error:", mse)

Mean Squared Error: 2.825039939406213


In [9]:
# Save the trained model to a file
with open('xgboost_regression_model.pkl', 'wb') as f:
    pickle.dump(regressor, f)

# Load the trained model from the file
with open('xgboost_regression_model.pkl', 'rb') as f:
    regressor = pickle.load(f)

In [10]:
# Prepare input data for prediction
fixtures_data = []
board = scoreboard.ScoreBoard()
games = board.games.get_dict()
for game in games:
    home_team_id = game['homeTeam']['teamId']
    away_team_id = game['awayTeam']['teamId']

    home_team_stats = game_log_data[game_log_data['TEAM_ID'] == home_team_id]
    away_team_stats = game_log_data[game_log_data['TEAM_ID'] == away_team_id]

    fixture_data = {
        'TEAM_ID': home_team_id,
        'REB': home_team_stats['REB'],
        'FT_PCT': home_team_stats['FT_PCT'],
        'OREB': home_team_stats['OREB'],
        'TOV': home_team_stats['TOV'],
        'FTA': home_team_stats['FTA'],
        'BLK': home_team_stats['BLK'],
        'FG3A': home_team_stats['FG3A'],
        'FG_PCT': home_team_stats['FG_PCT'],
        'FTM': home_team_stats['FTM'],
        'PLUS_MINUS': home_team_stats['PLUS_MINUS'],
        'FGM': home_team_stats['FGM'],
        'FG3M': home_team_stats['FG3M'],
        'STL': home_team_stats['STL'],
        'FGA': home_team_stats['FGA'],
        'DREB': home_team_stats['DREB'],
        'PF': home_team_stats['PF'],
        'AST': home_team_stats['AST'],
        'FG3_PCT': home_team_stats['FG3_PCT']
    }
    fixtures_data.append(fixture_data)

    fixture_data = {
        'TEAM_ID': away_team_id,
        'REB': away_team_stats['REB'],
        'FT_PCT': away_team_stats['FT_PCT'],
        'OREB': away_team_stats['OREB'],
        'TOV': away_team_stats['TOV'],
        'FTA': away_team_stats['FTA'],
        'BLK': away_team_stats['BLK'],
        'FG3A': away_team_stats['FG3A'],
        'FG_PCT': away_team_stats['FG_PCT'],
        'FTM': away_team_stats['FTM'],
        'PLUS_MINUS': away_team_stats['PLUS_MINUS'],
        'FGM': away_team_stats['FGM'],
        'FG3M': away_team_stats['FG3M'],
        'STL': away_team_stats['STL'],
        'FGA': away_team_stats['FGA'],
        'DREB': away_team_stats['DREB'],
        'PF': away_team_stats['PF'],
        'AST': away_team_stats['AST'],
        'FG3_PCT': away_team_stats['FG3_PCT']
    }
    fixtures_data.append(fixture_data)

    

In [11]:
# Convert the fixtures data into a DataFrame
df_columns = ['TEAM_ID', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF']
fixtures_df = pd.DataFrame(fixtures_data, columns=df_columns)


# Identify object columns causing the issue
object_columns = fixtures_df.select_dtypes(include=['object']).columns

# Convert object columns to appropriate numeric types
numeric_columns = ['FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF']
for col in numeric_columns:
    fixtures_df[col] = pd.to_numeric(fixtures_df[col], errors='coerce')

# Make predictions using the trained model
predicted_scores = regressor.predict(fixtures_df)

# Output predictions
for idx, game in enumerate(games):
    print(f"Game ID: {game['gameId']}, Predicted Score: {predicted_scores[idx]}")

ValueError: feature_names mismatch: ['TEAM_ID', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS'] ['TEAM_ID', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF']
expected PLUS_MINUS in input data

In [None]:
# Make predictions using the trained model
predicted_scores = regressor.predict(fixtures_df)

# Output predictions
for idx, game in enumerate(games):
    home_team = game['homeTeam']['teamName']
    away_team = game['awayTeam']['teamName']
    game_id = game['gameId']
    predicted_score_home = predicted_scores[idx]
    predicted_score_away = predicted_scores[idx + 1]  # Assuming away team follows the home team in the list

    print(f"Game ID: {game_id}, Home Team: {home_team}, Predicted Score: {predicted_score_home}")
    print(f"Game ID: {game_id}, Away Team: {away_team}, Predicted Score: {predicted_score_away}")

ValueError: feature_names mismatch: ['TEAM_ID', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS'] ['TEAM_ID', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF']
expected PLUS_MINUS in input data

In [None]:
# Print out the input data for inspection
print(fixtures_df)

# Print out some intermediate steps during prediction for debugging
print("Shape of fixtures_df:", fixtures_df.shape)
print("Shape of predicted_scores:", predicted_scores.shape)

       TEAM_ID  FGM  FGA  FG_PCT  FG3M  FG3A  FG3_PCT  FTM  FTA  FT_PCT  OREB  \
0   1610612746  NaN  NaN     NaN   NaN   NaN      NaN  NaN  NaN     NaN   NaN   
1   1610612749  NaN  NaN     NaN   NaN   NaN      NaN  NaN  NaN     NaN   NaN   
2   1610612737  NaN  NaN     NaN   NaN   NaN      NaN  NaN  NaN     NaN   NaN   
3   1610612740  NaN  NaN     NaN   NaN   NaN      NaN  NaN  NaN     NaN   NaN   
4   1610612748  NaN  NaN     NaN   NaN   NaN      NaN  NaN  NaN     NaN   NaN   
5   1610612764  NaN  NaN     NaN   NaN   NaN      NaN  NaN  NaN     NaN   NaN   
6   1610612753  NaN  NaN     NaN   NaN   NaN      NaN  NaN  NaN     NaN   NaN   
7   1610612754  NaN  NaN     NaN   NaN   NaN      NaN  NaN  NaN     NaN   NaN   
8   1610612758  NaN  NaN     NaN   NaN   NaN      NaN  NaN  NaN     NaN   NaN   
9   1610612745  NaN  NaN     NaN   NaN   NaN      NaN  NaN  NaN     NaN   NaN   
10  1610612752  NaN  NaN     NaN   NaN   NaN      NaN  NaN  NaN     NaN   NaN   
11  1610612755  NaN  NaN    

NameError: name 'predicted_scores' is not defined

In [None]:
# Print out the raw game statistics for inspection
print(game_log_data.head())




      TEAM_ID  WL       FGM       FGA    FG_PCT      FG3M      FG3A   FG3_PCT  \
0  1610612747   0 -0.262132  0.113653 -0.364929 -0.751727 -0.903989 -0.262224   
1  1610612743   1  1.063483  0.257756  0.928016  0.299569 -0.154199  0.548940   
2  1610612756   1 -0.072758  0.834171 -0.619876 -0.488903 -0.304157 -0.407507   
3  1610612744   0 -1.209000  1.698792 -2.185979 -0.751727  1.195423 -1.618200   
4  1610612760   1  0.495362 -1.039176  1.328647  1.613689  0.595591  1.456960   

        FTM       FTA  ...      OREB      DREB       REB       AST       STL  \
0 -0.420893 -0.330335  ...  0.615830 -0.366597  0.059604 -0.726154 -0.858136   
1 -1.436673 -1.470126  ... -0.425510  0.004856 -0.244751  0.452549  0.567210   
2 -0.759486 -0.757757  ...  1.657169  1.862122  2.494442 -0.726154 -0.858136   
3  0.764184  0.809455  ...  1.917504 -0.366597  0.820491 -1.511957  1.279884   
4 -0.420893 -0.615283  ... -1.466850  0.933489 -0.092573  0.649000 -0.501800   

        BLK       TOV        PF 

In [None]:
# Fill missing values with zeros
fixtures_df.fillna(0, inplace=True)

# Convert all columns to numeric data type
fixtures_df = fixtures_df.apply(pd.to_numeric)

# Check the data after preprocessing
print(fixtures_df.head())


      TEAM_ID  FGM  FGA  FG_PCT  FG3M  FG3A  FG3_PCT  FTM  FTA  FT_PCT  OREB  \
0  1610612755  0.0  0.0     0.0   0.0   0.0      0.0  0.0  0.0     0.0   0.0   
1  1610612740  0.0  0.0     0.0   0.0   0.0      0.0  0.0  0.0     0.0   0.0   
2  1610612764  0.0  0.0     0.0   0.0   0.0      0.0  0.0  0.0     0.0   0.0   
3  1610612766  0.0  0.0     0.0   0.0   0.0      0.0  0.0  0.0     0.0   0.0   
4  1610612739  0.0  0.0     0.0   0.0   0.0      0.0  0.0  0.0     0.0   0.0   

   DREB  REB  AST  STL  BLK  TOV   PF  
0   0.0  0.0  0.0  0.0  0.0  0.0  0.0  
1   0.0  0.0  0.0  0.0  0.0  0.0  0.0  
2   0.0  0.0  0.0  0.0  0.0  0.0  0.0  
3   0.0  0.0  0.0  0.0  0.0  0.0  0.0  
4   0.0  0.0  0.0  0.0  0.0  0.0  0.0  
