#NBA Game Prediction Project

By: Rahul Pothineni

10/03/2025

Data: NBA.com, https://github.com/swar/nba_api?utm_source

Adapted off: https://www.youtube.com/watch?v=ViaGirGFJZY

##Import libraries

In [531]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

##Data Cleaning and Preparation

In [532]:
player_box = pd.read_csv("player_boxscores_2025.csv")
team_box = pd.read_csv("team_boxscores_2025.csv")

In [533]:
player_box.head()

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,NICKNAME,START_POSITION,COMMENT,MIN,...,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,GAME_DATE
0,22400061,1610612752,NYK,New York,1628384,OG Anunoby,OG,F,,34:10,...,5.0,5.0,3.0,1.0,0.0,0.0,3.0,4.0,-21.0,2024-10-22
1,22400061,1610612752,NYK,New York,1628404,Josh Hart,Josh,F,,24:30,...,3.0,4.0,3.0,0.0,0.0,1.0,0.0,12.0,-23.0,2024-10-22
2,22400061,1610612752,NYK,New York,1626157,Karl-Anthony Towns,Karl-Anthony,C,,23:37,...,7.0,7.0,3.0,0.0,0.0,0.0,1.0,12.0,-18.0,2024-10-22
3,22400061,1610612752,NYK,New York,1628969,Mikal Bridges,Mikal,G,,34:37,...,0.0,0.0,2.0,0.0,0.0,1.0,0.0,16.0,-33.0,2024-10-22
4,22400061,1610612752,NYK,New York,1628973,Jalen Brunson,Jalen,G,,24:30,...,1.0,1.0,2.0,0.0,0.0,4.0,3.0,22.0,-23.0,2024-10-22


In [534]:
print(player_box.columns)

Index(['GAME_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_CITY', 'PLAYER_ID',
       'PLAYER_NAME', 'NICKNAME', 'START_POSITION', 'COMMENT', 'MIN', 'FGM',
       'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT',
       'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS',
       'PLUS_MINUS', 'GAME_DATE'],
      dtype='object')


In [535]:
team_box.head()

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,TEAM_CITY,MIN,FGM,FGA,FG_PCT,FG3M,...,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,GAME_DATE
0,22400061,1610612752,Knicks,NYK,New York,240:00,43,78,0.551,11,...,29,34,20,2,3,11,12,109,-23,2024-10-22
1,22400061,1610612738,Celtics,BOS,Boston,240:00,48,95,0.505,29,...,29,40,33,6,3,3,15,132,23,2024-10-22
2,22400062,1610612750,Timberwolves,MIN,Minnesota,240:00,35,85,0.412,13,...,35,47,17,4,1,15,22,103,-7,2024-10-22
3,22400062,1610612747,Lakers,LAL,Los Angeles,240:00,42,95,0.442,5,...,31,46,22,7,8,7,22,110,7,2024-10-22
4,22400064,1610612751,Nets,BKN,Brooklyn,240:00,40,91,0.44,17,...,31,43,21,8,6,17,32,116,-4,2024-10-23


In [536]:
print(team_box.columns)

Index(['GAME_ID', 'TEAM_ID', 'TEAM_NAME', 'TEAM_ABBREVIATION', 'TEAM_CITY',
       'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA',
       'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS',
       'PLUS_MINUS', 'GAME_DATE'],
      dtype='object')


###Changing the Date Format

In [537]:
#Convert the 'GameDate' format in play-by-play data to match the 'Date' format in scores dataset
team_box['GAME_DATE'] = pd.to_datetime(team_box['GAME_DATE']).dt.strftime('%m/%d/%Y')

#Retry matching games for 2022 season using the updated method
def match_nba_games(team_box_df):
    matched_games = []

    #Group by each unique game
    for gid, group in team_box_df.groupby('GAME_ID'):
        if len(group) == 2:  #each game has two teams
            team1 = group.iloc[0]
            team2 = group.iloc[1]

            #Create one combined row for each matchup
            row = {
                'GAME_ID': gid,
                'GAME_DATE': team1['GAME_DATE'],
                'TEAM1': team1['TEAM_NAME'],
                'TEAM2': team2['TEAM_NAME'],
                'TEAM1_PTS': team1['PTS'],
                'TEAM2_PTS': team2['PTS'],
                'WINNER': team1['TEAM_NAME'] if team1['PTS'] > team2['PTS'] else team2['TEAM_NAME']
            }
            matched_games.append(row)

    return pd.DataFrame(matched_games)
matched_2025_games = match_nba_games(team_box)
matched_2025_games.head()

Unnamed: 0,GAME_ID,GAME_DATE,TEAM1,TEAM2,TEAM1_PTS,TEAM2_PTS,WINNER
0,22400001,11/12/2024,Celtics,Hawks,116,117,Hawks
1,22400002,11/12/2024,Heat,Pistons,121,123,Pistons
2,22400003,11/12/2024,Magic,Hornets,114,89,Magic
3,22400004,11/12/2024,76ers,Knicks,99,111,Knicks
4,22400005,11/12/2024,Raptors,Bucks,85,99,Bucks


###Changing Scores data to use abbreviations

In [538]:
#Extract unique team names and abbreviations from team box scores
unique_team_names = team_box['TEAM_NAME'].unique()
unique_team_abbrevs = team_box['TEAM_ABBREVIATION'].unique()

#Create a mapping (dictionary) between team names and abbreviations
team_name_to_abbrev = dict(zip(unique_team_names, unique_team_abbrevs))

#check
len(team_name_to_abbrev), list(team_name_to_abbrev.items())[:5]

(30,
 [('Knicks', 'NYK'),
  ('Celtics', 'BOS'),
  ('Timberwolves', 'MIN'),
  ('Lakers', 'LAL'),
  ('Nets', 'BKN')])

###Copying scores to play by play, matching based on date and teams

In [539]:
#Parse dates
team_box["GAME_DATE"] = pd.to_datetime(team_box["GAME_DATE"], errors="coerce")

#Keep only game_ids that have exactly two rows (one per team)
counts = tb["GAME_ID"].value_counts()
valid_game_ids = counts[counts == 2].index
tb = tb[tb["GAME_ID"].isin(valid_game_ids)].copy()

if tb.empty:
    raise ValueError("No complete games (with exactly 2 team rows) found in the selected data.")

#Sort data
for c in ["PTS","AST","REB","TO","FG_PCT","FG3_PCT","FT_PCT"]:
    if c in tb.columns:
        tb[c] = pd.to_numeric(tb[c], errors="coerce")

tb = tb.sort_values(["GAME_ID", "TEAM_ABBREVIATION"]).reset_index(drop=True)

#Build one row per game (T1 = alphabetically first by TEAM_ABBREVIATION)
rows = []
for gid, g in tb.groupby("GAME_ID", sort=False):
    if len(g) != 2:
        continue
    t1, t2 = g.iloc[0], g.iloc[1]
    rows.append({
        "GAME_ID": gid,
        "GAME_DATE": t1["GAME_DATE"].strftime("%m/%d/%Y") if pd.notna(t1["GAME_DATE"]) else None,
        "T1_TEAM": t1["TEAM_ABBREVIATION"], "T2_TEAM": t2["TEAM_ABBREVIATION"],
        "T1_PTS": t1.get("PTS", np.nan), "T2_PTS": t2.get("PTS", np.nan),
        "T1_AST": t1.get("AST", np.nan), "T2_AST": t2.get("AST", np.nan),
        "T1_REB": t1.get("REB", np.nan), "T2_REB": t2.get("REB", np.nan),
        "T1_TO":  t1.get("TO",  np.nan), "T2_TO":  t2.get("TO",  np.nan),
        "T1_FG_PCT":  t1.get("FG_PCT",  np.nan), "T2_FG_PCT":  t2.get("FG_PCT",  np.nan),
        "T1_FG3_PCT": t1.get("FG3_PCT", np.nan), "T2_FG3_PCT": t2.get("FG3_PCT", np.nan),
        "T1_FT_PCT":  t1.get("FT_PCT",  np.nan), "T2_FT_PCT":  t2.get("FT_PCT",  np.nan),
    })

merged_nba = pd.DataFrame(rows)

if merged_nba.empty:
    raise ValueError("merged_nba ended up empty. Check that CSV actually has paired team rows per GAME_ID.")

#Label and winner
merged_nba["T1_WIN"] = (pd.to_numeric(merged_nba["T1_PTS"], errors="coerce") >
                        pd.to_numeric(merged_nba["T2_PTS"], errors="coerce")).astype(int)
merged_nba["WINNER"] = np.where(merged_nba["T1_WIN"] == 1, merged_nba["T1_TEAM"], merged_nba["T2_TEAM"])

#diagnostics
print(f"Built merged_nba with {len(merged_nba)} games "
      f"({merged_nba['GAME_ID'].nunique()} unique GAME_IDs).")
display(merged_nba[["GAME_DATE","T1_TEAM","T2_TEAM","T1_PTS","T2_PTS","WINNER","T1_WIN"]].head())


Built merged_nba with 484 games (484 unique GAME_IDs).


Unnamed: 0,GAME_DATE,T1_TEAM,T2_TEAM,T1_PTS,T2_PTS,WINNER,T1_WIN
0,11/12/2024,ATL,BOS,117,116,ATL,1
1,11/12/2024,DET,MIA,123,121,DET,1
2,11/12/2024,CHA,ORL,89,114,ORL,0
3,11/12/2024,NYK,PHI,111,99,NYK,1
4,11/12/2024,MIL,TOR,99,85,MIL,1


###Adding HomeWon

In [540]:
#call Team 1 the 'home' team
merged_nba["HomeWon"] = merged_nba["T1_PTS"] > merged_nba["T2_PTS"]

#Preview
merged_nba[["GAME_DATE","T1_TEAM","T2_TEAM","T1_PTS","T2_PTS","HomeWon"]].head()

Unnamed: 0,GAME_DATE,T1_TEAM,T2_TEAM,T1_PTS,T2_PTS,HomeWon
0,11/12/2024,ATL,BOS,117,116,True
1,11/12/2024,DET,MIA,123,121,True
2,11/12/2024,CHA,ORL,89,114,False
3,11/12/2024,NYK,PHI,111,99,True
4,11/12/2024,MIL,TOR,99,85,True


##Team Feature Extraction

###Offensive Features

In [541]:
#Load NBA team box scores (one row per team per game)
team_box = pd.read_csv("team_boxscores_2025.csv")
team_box["GAME_DATE"] = pd.to_datetime(team_box["GAME_DATE"])

#Build one row per game with both teams side-by-side
games = team_box.sort_values(["GAME_ID","TEAM_ID"])
pairs = []
for gid, g in games.groupby("GAME_ID"):
    if len(g) != 2:
        continue
    t1, t2 = g.iloc[0], g.iloc[1]
    pairs.append({
        "GAME_ID": gid,
        "GAME_DATE": t1["GAME_DATE"],
        "T1_TEAM": t1["TEAM_ABBREVIATION"], "T1_PTS": t1["PTS"],
        "T2_TEAM": t2["TEAM_ABBREVIATION"], "T2_PTS": t2["PTS"]
    })
merged_nba = pd.DataFrame(pairs)

#format: one row per (team, opponent, game)
t1_long = merged_nba.rename(columns={
    "T1_TEAM":"TEAM", "T1_PTS":"PTS", "T2_TEAM":"OPP", "T2_PTS":"OPP_PTS"
})[["GAME_ID","GAME_DATE","TEAM","OPP","PTS","OPP_PTS"]]

t2_long = merged_nba.rename(columns={
    "T2_TEAM":"TEAM", "T2_PTS":"PTS", "T1_TEAM":"OPP", "T1_PTS":"OPP_PTS"
})[["GAME_ID","GAME_DATE","TEAM","OPP","PTS","OPP_PTS"]]

long_df = pd.concat([t1_long, t2_long], ignore_index=True)
long_df["WIN"] = (long_df["PTS"] > long_df["OPP_PTS"]).astype(int)

#Per-team features
avg_points_scored   = long_df.groupby("TEAM")["PTS"].mean()
avg_points_allowed  = long_df.groupby("TEAM")["OPP_PTS"].mean()
win_rate            = long_df.groupby("TEAM")["WIN"].mean()  #proportion of games won

team_features = pd.DataFrame({
    "TEAM": avg_points_scored.index,
    "AvgPointsScored": avg_points_scored.values,
    "AvgPointsAllowed": avg_points_allowed.values,
    "WinRate": win_rate.values
}).sort_values("TEAM").reset_index(drop=True)

team_features.head()

Unnamed: 0,TEAM,AvgPointsScored,AvgPointsAllowed,WinRate
0,ATL,117.848485,118.818182,0.545455
1,BKN,107.84375,112.78125,0.375
2,BOS,119.636364,108.909091,0.727273
3,CHA,106.1875,112.84375,0.21875
4,CHI,117.545455,121.030303,0.454545


In [542]:
upcoming_games.head()

Unnamed: 0,GAME_DATE,HOME_TEAM,AWAY_TEAM
0,2025-01-01,DET,ORL
1,2025-01-01,WAS,CHI
2,2025-01-01,MIA,NOP
3,2025-01-01,NYK,UTA
4,2025-01-01,TOR,BKN


###Defensive Features

Now we do the same thing but with defensive features

In [543]:

#Load team box scores
team_box = pd.read_csv("team_boxscores_2025.csv")
team_box["GAME_DATE"] = pd.to_datetime(team_box["GAME_DATE"])

#Build one row per game with both teams side-by-side
games = team_box.sort_values(["GAME_ID","TEAM_ID"])
pairs = []
for gid, g in games.groupby("GAME_ID"):
    if len(g) != 2:
        continue
    t1, t2 = g.iloc[0], g.iloc[1]
    pairs.append({
        "GAME_ID": gid,
        "GAME_DATE": t1["GAME_DATE"],
        #T1 (team)
        "T1": t1["TEAM_ABBREVIATION"], "T1_PTS": t1["PTS"], "T1_FGM": t1["FGM"], "T1_FGA": t1["FGA"],
        "T1_FG3M": t1["FG3M"], "T1_FG3A": t1["FG3A"], "T1_FTA": t1["FTA"], "T1_OREB": t1["OREB"],
        "T1_DREB": t1["DREB"], "T1_REB": t1["REB"], "T1_TO": t1["TO"], "T1_STL": t1["STL"], "T1_BLK": t1["BLK"],
        #T2 (opponent)
        "T2": t2["TEAM_ABBREVIATION"], "T2_PTS": t2["PTS"], "T2_FGM": t2["FGM"], "T2_FGA": t2["FGA"],
        "T2_FG3M": t2["FG3M"], "T2_FG3A": t2["FG3A"], "T2_FTA": t2["FTA"], "T2_OREB": t2["OREB"],
        "T2_DREB": t2["DREB"], "T2_REB": t2["REB"], "T2_TO": t2["TO"], "T2_STL": t2["STL"], "T2_BLK": t2["BLK"],
    })
merged = pd.DataFrame(pairs)

#format: one row per (TEAM vs OPP) with both sides' stats available
t1_long = merged.rename(columns={
    "T1":"TEAM", "T2":"OPP",
    "T1_PTS":"PTS", "T1_FGM":"FGM", "T1_FGA":"FGA", "T1_FG3M":"FG3M", "T1_FG3A":"FG3A", "T1_FTA":"FTA",
    "T1_OREB":"OREB", "T1_DREB":"DREB", "T1_REB":"REB", "T1_TO":"TO", "T1_STL":"STL", "T1_BLK":"BLK",
    "T2_PTS":"OPP_PTS", "T2_FGM":"OPP_FGM", "T2_FGA":"OPP_FGA", "T2_FG3M":"OPP_FG3M", "T2_FG3A":"OPP_FG3A",
    "T2_FTA":"OPP_FTA", "T2_OREB":"OPP_OREB", "T2_DREB":"OPP_DREB", "T2_REB":"OPP_REB", "T2_TO":"OPP_TO",
    "T2_STL":"OPP_STL", "T2_BLK":"OPP_BLK",
})[[
    "GAME_ID","GAME_DATE","TEAM","OPP","PTS","FGM","FGA","FG3M","FG3A","FTA","OREB","DREB","REB","TO","STL","BLK",
    "OPP_PTS","OPP_FGM","OPP_FGA","OPP_FG3M","OPP_FG3A","OPP_FTA","OPP_OREB","OPP_DREB","OPP_REB","OPP_TO","OPP_STL","OPP_BLK"
]]

t2_long = merged.rename(columns={
    "T2":"TEAM", "T1":"OPP",
    "T2_PTS":"PTS", "T2_FGM":"FGM", "T2_FGA":"FGA", "T2_FG3M":"FG3M", "T2_FG3A":"FG3A", "T2_FTA":"FTA",
    "T2_OREB":"OREB", "T2_DREB":"DREB", "T2_REB":"REB", "T2_TO":"TO", "T2_STL":"STL", "T2_BLK":"BLK",
    "T1_PTS":"OPP_PTS", "T1_FGM":"OPP_FGM", "T1_FGA":"OPP_FGA", "T1_FG3M":"OPP_FG3M", "T1_FG3A":"OPP_FG3A",
    "T1_FTA":"OPP_FTA", "T1_OREB":"OPP_OREB", "T1_DREB":"OPP_DREB", "T1_REB":"OPP_REB", "T1_TO":"OPP_TO",
    "T1_STL":"OPP_STL", "T1_BLK":"OPP_BLK",
})[[
    "GAME_ID","GAME_DATE","TEAM","OPP","PTS","FGM","FGA","FG3M","FG3A","FTA","OREB","DREB","REB","TO","STL","BLK",
    "OPP_PTS","OPP_FGM","OPP_FGA","OPP_FG3M","OPP_FG3A","OPP_FTA","OPP_OREB","OPP_DREB","OPP_REB","OPP_TO","OPP_STL","OPP_BLK"
]]

long_df = pd.concat([t1_long, t2_long], ignore_index=True)
long_df["WIN"] = (long_df["PTS"] > long_df["OPP_PTS"]).astype(int)

#Possessions estimate
#estimate: Poss = FGA - OREB + TO + 0.44*FTA (NBA.com)
long_df["TEAM_POSS"] = long_df["FGA"] - long_df["OREB"] + long_df["TO"] + 0.44 * long_df["FTA"]
long_df["OPP_POSS"]  = long_df["OPP_FGA"] - long_df["OPP_OREB"] + long_df["OPP_TO"] + 0.44 * long_df["OPP_FTA"]

#Defensive metrics (per game/team), averaged across games
#1) AvgPointsDefended (points allowed)
avg_points_defended = long_df.groupby("TEAM")["OPP_PTS"].mean()

#2) Opponent eFG% (defensive shot quality allowed)
#eFG% = (FGM + 0.5*FG3M) / FGA
opp_efg = ((long_df["OPP_FGM"] + 0.5 * long_df["OPP_FG3M"]) / long_df["OPP_FGA"]).replace([pd.NA, pd.NaT], 0).fillna(0)
avg_opp_efg = long_df.assign(OPP_EFG=opp_efg).groupby("TEAM")["OPP_EFG"].mean()

#3) Turnovers forced rate (opponent turnovers per opponent possession)
opp_tov_rate = (long_df["OPP_TO"] / long_df["OPP_POSS"]).replace([pd.NA, pd.NaT], 0).fillna(0)
avg_opp_tov_rate = long_df.assign(OPP_TOV_RATE=opp_tov_rate).groupby("TEAM")["OPP_TOV_RATE"].mean()

#4) Defensive rebounding percentage: DREB / (DREB + Opp OREB)
drb_pct = (long_df["DREB"] / (long_df["DREB"] + long_df["OPP_OREB"])).replace([pd.NA, pd.NaT], 0).fillna(0)
avg_drb_pct = long_df.assign(DRB_PCT=drb_pct).groupby("TEAM")["DRB_PCT"].mean()

#5) Steals & Blocks per game
stl_pg = long_df.groupby("TEAM")["STL"].mean()
blk_pg = long_df.groupby("TEAM")["BLK"].mean()

team_features_defensive = pd.DataFrame({
    "TEAM": avg_points_defended.index,
    "AvgPointsDefended": avg_points_defended.values,
    "AvgOppEFG": avg_opp_efg.values,
    "AvgOppTOVRate": avg_opp_tov_rate.values,
    "AvgDRB_PCT": avg_drb_pct.values,
    "Stl_perG": stl_pg.values,
    "Blk_perG": blk_pg.values,
}).sort_values("TEAM").reset_index(drop=True)

team_features_defensive.head()


Unnamed: 0,TEAM,AvgPointsDefended,AvgOppEFG,AvgOppTOVRate,AvgDRB_PCT,Stl_perG,Blk_perG
0,ATL,118.818182,0.556922,0.151263,0.759365,10.393939,5.363636
1,BKN,112.78125,0.56559,0.148566,0.759698,7.21875,3.71875
2,BOS,108.909091,0.524821,0.1271,0.774866,7.666667,5.575758
3,CHA,112.84375,0.548846,0.138484,0.760461,7.96875,4.9375
4,CHI,121.030303,0.546564,0.109633,0.773962,7.424242,4.636364


###Additional offensive features

In [544]:
games = team_box.sort_values(["GAME_ID","TEAM_ID"])
pairs = []
for gid, g in games.groupby("GAME_ID"):
    if len(g) != 2:
        continue
    t1, t2 = g.iloc[0], g.iloc[1]
    pairs.append({
        "GAME_ID": gid,
        "GAME_DATE": t1["GAME_DATE"],
        "T1": t1["TEAM_ABBREVIATION"], "T1_PTS": t1["PTS"], "T1_FGM": t1["FGM"], "T1_FGA": t1["FGA"], "T1_FG3M": t1["FG3M"], "T1_FG3A": t1["FG3A"], "T1_FTA": t1["FTA"], "T1_OREB": t1["OREB"], "T1_DREB": t1["DREB"], "T1_TO": t1["TO"], "T1_AST": t1["AST"],
        "T2": t2["TEAM_ABBREVIATION"], "T2_PTS": t2["PTS"], "T2_FGM": t2["FGM"], "T2_FGA": t2["FGA"], "T2_FG3M": t2["FG3M"], "T2_FG3A": t2["FG3A"], "T2_FTA": t2["FTA"], "T2_OREB": t2["OREB"], "T2_DREB": t2["DREB"], "T2_TO": t2["TO"], "T2_AST": t2["AST"],
    })
merged = pd.DataFrame(pairs)

t1_long = merged.rename(columns={
    "T1":"TEAM","T2":"OPP","T1_PTS":"PTS","T1_FGM":"FGM","T1_FGA":"FGA","T1_FG3M":"FG3M","T1_FG3A":"FG3A","T1_FTA":"FTA","T1_OREB":"OREB","T1_DREB":"DREB","T1_TO":"TO","T1_AST":"AST",
    "T2_PTS":"OPP_PTS","T2_FGM":"OPP_FGM","T2_FGA":"OPP_FGA","T2_FG3M":"OPP_FG3M","T2_FG3A":"OPP_FG3A","T2_FTA":"OPP_FTA","T2_OREB":"OPP_OREB","T2_DREB":"OPP_DREB","T2_TO":"OPP_TO"
})[["GAME_ID","GAME_DATE","TEAM","OPP","PTS","FGM","FGA","FG3M","FG3A","FTA","OREB","DREB","TO","AST","OPP_PTS","OPP_FGM","OPP_FGA","OPP_FG3M","OPP_FG3A","OPP_FTA","OPP_OREB","OPP_DREB","OPP_TO"]]

t2_long = merged.rename(columns={
    "T2":"TEAM","T1":"OPP","T2_PTS":"PTS","T2_FGM":"FGM","T2_FGA":"FGA","T2_FG3M":"FG3M","T2_FG3A":"FG3A","T2_FTA":"FTA","T2_OREB":"OREB","T2_DREB":"DREB","T2_TO":"TO","T2_AST":"AST",
    "T1_PTS":"OPP_PTS","T1_FGM":"OPP_FGM","T1_FGA":"OPP_FGA","T1_FG3M":"OPP_FG3M","T1_FG3A":"OPP_FG3A","T1_FTA":"OPP_FTA","T1_OREB":"OPP_OREB","T1_DREB":"OPP_DREB","T1_TO":"OPP_TO"
})[["GAME_ID","GAME_DATE","TEAM","OPP","PTS","FGM","FGA","FG3M","FG3A","FTA","OREB","DREB","TO","AST","OPP_PTS","OPP_FGM","OPP_FGA","OPP_FG3M","OPP_FG3A","OPP_FTA","OPP_OREB","OPP_DREB","OPP_TO"]]

long_df = pd.concat([t1_long, t2_long], ignore_index=True)
long_df["WIN"] = (long_df["PTS"] > long_df["OPP_PTS"]).astype(int)
long_df["TEAM_POSS"] = long_df["FGA"] - long_df["OREB"] + long_df["TO"] + 0.44 * long_df["FTA"]
long_df["OPP_POSS"]  = long_df["OPP_FGA"] - long_df["OPP_OREB"] + long_df["OPP_TO"] + 0.44 * long_df["OPP_FTA"]

avg_points_scored  = long_df.groupby("TEAM")["PTS"].mean()
avg_points_allowed = long_df.groupby("TEAM")["OPP_PTS"].mean()
win_rate           = long_df.groupby("TEAM")["WIN"].mean()

team_features = pd.DataFrame({
    "TEAM": avg_points_scored.index,
    "AvgPointsScored": avg_points_scored.values,
    "AvgPointsAllowed": avg_points_allowed.values,
    "WinRate": win_rate.values
}).sort_values("TEAM").reset_index(drop=True)

opp_efg = ((long_df["OPP_FGM"] + 0.5 * long_df["OPP_FG3M"]) / long_df["OPP_FGA"]).replace([np.inf, -np.inf], np.nan).fillna(0)
avg_opp_efg = long_df.assign(OPP_EFG=opp_efg).groupby("TEAM")["OPP_EFG"].mean()
opp_tov_rate = (long_df["OPP_TO"] / long_df["OPP_POSS"]).replace([np.inf, -np.inf], np.nan).fillna(0)
avg_opp_tov_rate = long_df.assign(OPP_TOV_RATE=opp_tov_rate).groupby("TEAM")["OPP_TOV_RATE"].mean()
drb_pct = (long_df["DREB"] / (long_df["DREB"] + long_df["OPP_OREB"])).replace([np.inf, -np.inf], np.nan).fillna(0)
avg_drb_pct = long_df.assign(DRB_PCT=drb_pct).groupby("TEAM")["DRB_PCT"].mean()
stl_pg = long_df.groupby("TEAM")["STL"].mean() if "STL" in long_df.columns else pd.Series(0, index=team_features["TEAM"])
blk_pg = long_df.groupby("TEAM")["BLK"].mean() if "BLK" in long_df.columns else pd.Series(0, index=team_features["TEAM"])

team_features_def = pd.DataFrame({
    "TEAM": avg_opp_efg.index,
    "AvgOppEFG": avg_opp_efg.values,
    "AvgOppTOVRate": avg_opp_tov_rate.values,
    "AvgDRB_PCT": avg_drb_pct.values,
    "Stl_perG": stl_pg.reindex(avg_opp_efg.index).fillna(0).values,
    "Blk_perG": blk_pg.reindex(avg_opp_efg.index).fillna(0).values,
}).sort_values("TEAM").reset_index(drop=True)

efg = ((long_df["FGM"] + 0.5 * long_df["FG3M"]) / long_df["FGA"]).replace([np.inf, -np.inf], np.nan).fillna(0)
ts = (long_df["PTS"] / (2 * (long_df["FGA"] + 0.44 * long_df["FTA"]).replace(0, np.nan))).replace([np.inf, -np.inf], np.nan).fillna(0)
to_rate = (long_df["TO"] / long_df["TEAM_POSS"]).replace([np.inf, -np.inf], np.nan).fillna(0)
orb_pct = (long_df["OREB"] / (long_df["OREB"] + long_df["OPP_DREB"])).replace([np.inf, -np.inf], np.nan).fillna(0)
ast_pg = long_df.groupby("TEAM")["AST"].mean() if "AST" in long_df.columns else pd.Series(0, index=team_features["TEAM"])
ppp = (long_df["PTS"] / long_df["TEAM_POSS"]).replace([np.inf, -np.inf], np.nan).fillna(0)

off_agg = long_df.assign(EFG=efg, TS=ts, TOR=to_rate, ORB_PCT=orb_pct, PPP=ppp) \
                 .groupby("TEAM")[["EFG","TS","TOR","ORB_PCT","PPP"]].mean()

new_offensive_features = off_agg.reset_index().rename(columns={
    "EFG":"AvgEFG","TS":"AvgTS","TOR":"AvgTOR","ORB_PCT":"AvgORB_PCT","PPP":"AvgPPP"
})
new_offensive_features["AST_perG"] = ast_pg.reindex(new_offensive_features["TEAM"]).values

team_features_combined = team_features.merge(team_features_def, on="TEAM", how="inner")
team_features_expanded = team_features_combined.merge(new_offensive_features, on="TEAM", how="inner")

team_features_expanded.head()


Unnamed: 0,TEAM,AvgPointsScored,AvgPointsAllowed,WinRate,AvgOppEFG,AvgOppTOVRate,AvgDRB_PCT,Stl_perG,Blk_perG,AvgEFG,AvgTS,AvgTOR,AvgORB_PCT,AvgPPP,AST_perG
0,ATL,117.848485,118.818182,0.545455,0.556922,0.151263,0.759365,0,0,0.539962,0.57729,0.147371,0.264902,1.113837,29.818182
1,BKN,107.84375,112.78125,0.375,0.56559,0.148566,0.759698,0,0,0.539947,0.577541,0.142202,0.21574,1.096739,25.25
2,BOS,119.636364,108.909091,0.727273,0.524821,0.1271,0.774866,0,0,0.562621,0.594572,0.113137,0.241094,1.182499,25.69697
3,CHA,106.1875,112.84375,0.21875,0.548846,0.138484,0.760461,0,0,0.508135,0.540144,0.150469,0.275891,1.055062,23.5
4,CHI,117.545455,121.030303,0.454545,0.546564,0.109633,0.773962,0,0,0.557682,0.586467,0.136454,0.20138,1.114538,29.454545


###Additional Defensive Features

In [545]:
games = team_box.sort_values(["GAME_ID","TEAM_ID"])
pairs = []
for gid, g in games.groupby("GAME_ID"):
    if len(g) != 2:
        continue
    t1, t2 = g.iloc[0], g.iloc[1]
    pairs.append({
        "GAME_ID": gid,
        "GAME_DATE": t1["GAME_DATE"],
        "T1": t1["TEAM_ABBREVIATION"], "T1_PTS": t1["PTS"], "T1_FGM": t1["FGM"], "T1_FGA": t1["FGA"], "T1_FG3M": t1["FG3M"], "T1_FG3A": t1["FG3A"], "T1_FTA": t1["FTA"], "T1_OREB": t1["OREB"], "T1_DREB": t1["DREB"], "T1_TO": t1["TO"],
        "T2": t2["TEAM_ABBREVIATION"], "T2_PTS": t2["PTS"], "T2_FGM": t2["FGM"], "T2_FGA": t2["FGA"], "T2_FG3M": t2["FG3M"], "T2_FG3A": t2["FG3A"], "T2_FTA": t2["FTA"], "T2_OREB": t2["OREB"], "T2_DREB": t2["DREB"], "T2_TO": t2["TO"],
    })
merged = pd.DataFrame(pairs)

t1_long = merged.rename(columns={
    "T1":"TEAM","T2":"OPP","T1_PTS":"PTS","T1_FGM":"FGM","T1_FGA":"FGA","T1_FG3M":"FG3M","T1_FG3A":"FG3A","T1_FTA":"FTA","T1_OREB":"OREB","T1_DREB":"DREB","T1_TO":"TO",
    "T2_PTS":"OPP_PTS","T2_FGM":"OPP_FGM","T2_FGA":"OPP_FGA","T2_FG3M":"OPP_FG3M","T2_FG3A":"OPP_FG3A","T2_FTA":"OPP_FTA","T2_OREB":"OPP_OREB","T2_DREB":"OPP_DREB","T2_TO":"OPP_TO"
})[["GAME_ID","GAME_DATE","TEAM","OPP","PTS","FGM","FGA","FG3M","FG3A","FTA","OREB","DREB","TO","OPP_PTS","OPP_FGM","OPP_FGA","OPP_FG3M","OPP_FG3A","OPP_FTA","OPP_OREB","OPP_DREB","OPP_TO"]]

t2_long = merged.rename(columns={
    "T2":"TEAM","T1":"OPP","T2_PTS":"PTS","T2_FGM":"FGM","T2_FGA":"FGA","T2_FG3M":"FG3M","T2_FG3A":"FG3A","T2_FTA":"FTA","T2_OREB":"OREB","T2_DREB":"DREB","T2_TO":"TO",
    "T1_PTS":"OPP_PTS","T1_FGM":"OPP_FGM","T1_FGA":"OPP_FGA","T1_FG3M":"OPP_FG3M","T1_FG3A":"OPP_FG3A","T1_FTA":"OPP_FTA","T1_OREB":"OPP_OREB","T1_DREB":"OPP_DREB","T1_TO":"OPP_TO"
})[["GAME_ID","GAME_DATE","TEAM","OPP","PTS","FGM","FGA","FG3M","FG3A","FTA","OREB","DREB","TO","OPP_PTS","OPP_FGM","OPP_FGA","OPP_FG3M","OPP_FG3A","OPP_FTA","OPP_OREB","OPP_DREB","OPP_TO"]]

long_df = pd.concat([t1_long, t2_long], ignore_index=True)

long_df["TEAM_POSS"] = long_df["FGA"] - long_df["OREB"] + long_df["TO"] + 0.44 * long_df["FTA"]
long_df["OPP_POSS"]  = long_df["OPP_FGA"] - long_df["OPP_OREB"] + long_df["OPP_TO"] + 0.44 * long_df["OPP_FTA"]

opp_efg      = ((long_df["OPP_FGM"] + 0.5 * long_df["OPP_FG3M"]) / long_df["OPP_FGA"]).replace([np.inf,-np.inf], np.nan).fillna(0)
opp_ppp      = (long_df["OPP_PTS"] / long_df["OPP_POSS"]).replace([np.inf,-np.inf], np.nan).fillna(0)
opp_3par     = (long_df["OPP_FG3A"] / long_df["OPP_FGA"]).replace([np.inf,-np.inf], np.nan).fillna(0)
opp_fta_rate = (long_df["OPP_FTA"] / long_df["OPP_FGA"]).replace([np.inf,-np.inf], np.nan).fillna(0)
opp_tov_rate = (long_df["OPP_TO"]  / long_df["OPP_POSS"]).replace([np.inf,-np.inf], np.nan).fillna(0)
drb_pct      = (long_df["DREB"]    / (long_df["DREB"] + long_df["OPP_OREB"])).replace([np.inf,-np.inf], np.nan).fillna(0)

new_defensive_features = long_df.assign(
    OPP_EFG=opp_efg,
    OPP_PPP=opp_ppp,
    OPP_3PAr=opp_3par,
    OPP_FTA_RATE=opp_fta_rate,
    OPP_TOV_RATE=opp_tov_rate,
    DRB_PCT=drb_pct
).groupby("TEAM")[["OPP_PTS","OPP_EFG","OPP_PPP","OPP_3PAr","OPP_FTA_RATE","OPP_TOV_RATE","DRB_PCT"]].mean().reset_index().rename(columns={
    "OPP_PTS":"AvgPointsDefended",
    "OPP_EFG":"AvgOppEFG",
    "OPP_PPP":"AvgOppPPP",
    "OPP_3PAr":"AvgOpp3PAr",
    "OPP_FTA_RATE":"AvgOppFTAr",
    "OPP_TOV_RATE":"AvgOppTOVRate",
    "DRB_PCT":"AvgDRB_PCT"
})

team_features_complete = team_features_expanded.merge(new_defensive_features, left_on="TEAM", right_on="TEAM", how="inner")

team_features_complete.head()


Unnamed: 0,TEAM,AvgPointsScored,AvgPointsAllowed,WinRate,AvgOppEFG_x,AvgOppTOVRate_x,AvgDRB_PCT_x,Stl_perG,Blk_perG,AvgEFG,...,AvgORB_PCT,AvgPPP,AST_perG,AvgPointsDefended,AvgOppEFG_y,AvgOppPPP,AvgOpp3PAr,AvgOppFTAr,AvgOppTOVRate_y,AvgDRB_PCT_y
0,ATL,117.848485,118.818182,0.545455,0.556922,0.151263,0.759365,0,0,0.539962,...,0.264902,1.113837,29.818182,118.818182,0.556922,1.116062,0.449563,0.238969,0.151263,0.759365
1,BKN,107.84375,112.78125,0.375,0.56559,0.148566,0.759698,0,0,0.539947,...,0.21574,1.096739,25.25,112.78125,0.56559,1.140037,0.434845,0.301065,0.148566,0.759698
2,BOS,119.636364,108.909091,0.727273,0.524821,0.1271,0.774866,0,0,0.562621,...,0.241094,1.182499,25.69697,108.909091,0.524821,1.079054,0.400123,0.202727,0.1271,0.774866
3,CHA,106.1875,112.84375,0.21875,0.548846,0.138484,0.760461,0,0,0.508135,...,0.275891,1.055062,23.5,112.84375,0.548846,1.124624,0.449503,0.274212,0.138484,0.760461
4,CHI,117.545455,121.030303,0.454545,0.546564,0.109633,0.773962,0,0,0.557682,...,0.20138,1.114538,29.454545,121.030303,0.546564,1.142793,0.419012,0.235122,0.109633,0.773962


###Encoding upcoming games with features

In [546]:
#Load upcoming games from your CSV
upcoming_games = pd.read_csv("nba_games_January_2025.csv")

#Clean column names to match what your model expects
upcoming_games["IS_HOME"] = upcoming_games["MATCHUP"].str.contains("vs")
home = upcoming_games[upcoming_games["IS_HOME"]].rename(columns={"TEAM_ABBREVIATION": "HOME_TEAM"})
away = upcoming_games[~upcoming_games["IS_HOME"]].rename(columns={"TEAM_ABBREVIATION": "AWAY_TEAM"})

#Merge home and away sides by GAME_ID
upcoming_games = pd.merge(home, away, on=["GAME_ID", "GAME_DATE"], suffixes=("_HOME", "_AWAY"))

#format for model
upcoming_games = upcoming_games[["GAME_DATE", "HOME_TEAM", "AWAY_TEAM"]]

print(upcoming_games.head())

upcoming_encoded_home = upcoming_games.merge(
    team_features_complete, left_on="HOME_TEAM", right_on="TEAM", how="left"
)
upcoming_encoded_both = upcoming_encoded_home.merge(
    team_features_complete,
    left_on="AWAY_TEAM",
    right_on="TEAM",
    suffixes=("_HOME", "_AWAY"),
    how="left"
)

upcoming_games.to_csv("BEST_New_Upcoming_Schedule.csv", index=False)
#Merge home team stats
upcoming_encoded_home = upcoming_games.merge(
    team_features_complete, left_on="HOME_TEAM", right_on="TEAM", how="left"
)

#Merge visitor stats
upcoming_encoded_both = upcoming_encoded_home.merge(
    team_features_complete,
    left_on="AWAY_TEAM",
    right_on="TEAM",
    suffixes=("_HOME", "_AWAY"),
    how="left"
)

    GAME_DATE HOME_TEAM AWAY_TEAM
0  2025-01-01       DET       ORL
1  2025-01-01       WAS       CHI
2  2025-01-01       MIA       NOP
3  2025-01-01       NYK       UTA
4  2025-01-01       TOR       BKN


In [547]:
upcoming_games

Unnamed: 0,GAME_DATE,HOME_TEAM,AWAY_TEAM
0,2025-01-01,DET,ORL
1,2025-01-01,WAS,CHI
2,2025-01-01,MIA,NOP
3,2025-01-01,NYK,UTA
4,2025-01-01,TOR,BKN
...,...,...,...
220,2025-01-31,PHI,DEN
221,2025-01-31,TOR,CHI
222,2025-01-31,NOP,BOS
223,2025-01-31,SAS,MIL


In [548]:
#Calculate the difference in NBA features between Home and Away teams
nba_diff_features = [
    'AvgPointsScored', 'AvgPointsAllowed', 'WinRate',
    'AvgOppEFG', 'AvgOppPPP', 'AvgOpp3PAr', 'AvgOppFTAr', 'AvgOppTOVRate',
    'AvgDRB_PCT', 'AvgEFG', 'AvgTS', 'AvgTOR', 'AvgORB_PCT', 'AvgPPP'
]

for col in nba_diff_features:
    home_col = f'{col}_HOME'
    away_col = f'{col}_AWAY'
    if home_col in upcoming_encoded_both.columns and away_col in upcoming_encoded_both.columns:
        upcoming_encoded_both[f'Diff_{col}'] = (
            upcoming_encoded_both[home_col] - upcoming_encoded_both[away_col]
        )

#Keep only the key identifying columns + differences
upcoming_encoded_final = upcoming_encoded_both[
    ['HOME_TEAM', 'AWAY_TEAM'] +
    [col for col in upcoming_encoded_both.columns if col.startswith('Diff_')]
]

upcoming_encoded_final.head()


Unnamed: 0,HOME_TEAM,AWAY_TEAM,Diff_AvgPointsScored,Diff_AvgPointsAllowed,Diff_WinRate,Diff_AvgOppPPP,Diff_AvgOpp3PAr,Diff_AvgOppFTAr,Diff_AvgEFG,Diff_AvgTS,Diff_AvgTOR,Diff_AvgORB_PCT,Diff_AvgPPP
0,DET,ORL,6.306985,11.626838,-0.150735,0.083539,0.035154,-0.034372,0.027248,0.017066,-0.000692,-0.00093,0.029177
1,WAS,CHI,-9.178788,1.369697,-0.287879,0.02935,0.008771,0.041537,-0.047653,-0.04223,0.012969,0.02133,-0.080963
2,MIA,NOP,4.366667,-9.239394,0.381818,-0.063488,-0.002745,-0.018514,0.032034,0.036185,-0.01918,-0.050044,0.063526
3,NYK,UTA,6.880743,-9.925709,0.471163,-0.075568,-0.010663,-0.018071,0.034721,0.028442,-0.050859,-0.036822,0.095858
4,TOR,BKN,3.247159,6.491477,-0.162879,0.017008,-0.004418,0.013172,-0.01715,-0.023197,0.013991,0.066972,-0.028538


##Training Data Preparation

In [549]:
games = team_box.sort_values(["GAME_ID","TEAM_ID"])
rows = []
for gid, g in games.groupby("GAME_ID"):
    if len(g) != 2:  #skip incomplete games
        continue
    t1, t2 = g.iloc[0], g.iloc[1]
    rows.append({
        "GAME_ID": gid,
        "GAME_DATE": t1["GAME_DATE"],
        "T1_TEAM": t1["TEAM_ABBREVIATION"],
        "T2_TEAM": t2["TEAM_ABBREVIATION"],
        "T1_PTS": t1["PTS"],
        "T2_PTS": t2["PTS"],
    })

merged_nba = pd.DataFrame(rows)
merged_nba["T1_WIN"] = (merged_nba["T1_PTS"] > merged_nba["T2_PTS"]).astype(int)

#   Merge team features for both sides.
t1_feat = team_features_complete.rename(columns={"TEAM":"T1_TEAM"})
t2_feat = team_features_complete.rename(columns={"TEAM":"T2_TEAM"})

training_encoded_both = (
    merged_nba
    .merge(t1_feat, on="T1_TEAM", how="left", suffixes=("", "_drop"))
    .merge(t2_feat, on="T2_TEAM", how="left", suffixes=("_T1", "_T2"))
)

#Build DIFFERENCE features (Team1 - Team2) for NBA stats that exist in features table.
nba_feature_candidates = [
    'AvgPointsScored', 'AvgPointsAllowed', 'WinRate',
    'AvgOppEFG', 'AvgOppPPP', 'AvgOpp3PAr', 'AvgOppFTAr', 'AvgOppTOVRate',
    'AvgDRB_PCT', 'AvgEFG', 'AvgTS', 'AvgTOR', 'AvgORB_PCT', 'AvgPPP'
]

for col in nba_feature_candidates:
    c1, c2 = f"{col}_T1", f"{col}_T2"
    if c1 in training_encoded_both.columns and c2 in training_encoded_both.columns:
        training_encoded_both[f"Diff_{col}"] = training_encoded_both[c1] - training_encoded_both[c2]

#Final training matrix (X) and labels (y)
diff_cols = [c for c in training_encoded_both.columns if c.startswith("Diff_")]
training_data   = training_encoded_both[diff_cols].fillna(0)
training_labels = training_encoded_both["T1_WIN"].astype(int)

print(training_data.shape, training_labels.shape)
training_data.head()


(484, 11) (484,)


Unnamed: 0,Diff_AvgPointsScored,Diff_AvgPointsAllowed,Diff_WinRate,Diff_AvgOppPPP,Diff_AvgOpp3PAr,Diff_AvgOppFTAr,Diff_AvgEFG,Diff_AvgTS,Diff_AvgTOR,Diff_AvgORB_PCT,Diff_AvgPPP
0,-1.787879,9.909091,-0.181818,0.037008,0.04944,0.036241,-0.022658,-0.017282,0.034233,0.023808,-0.068662
1,-1.185417,-6.45625,0.095833,-0.032783,0.002718,-0.044944,-0.008053,-0.000296,-0.023088,-0.03268,0.016491
2,-0.275735,-9.314338,0.369485,-0.072721,-0.061194,0.016863,0.00797,0.016588,-0.001525,-0.022504,0.015488
3,10.615152,-0.451515,0.263636,5.7e-05,-0.015793,-0.033726,0.060476,0.051133,-0.012685,0.007012,0.109614
4,1.58651,-7.917889,0.336266,-0.052872,-0.001184,-0.081373,0.041397,0.036588,-0.022254,-0.092735,0.049771


In [550]:
training_data.head()

Unnamed: 0,Diff_AvgPointsScored,Diff_AvgPointsAllowed,Diff_WinRate,Diff_AvgOppPPP,Diff_AvgOpp3PAr,Diff_AvgOppFTAr,Diff_AvgEFG,Diff_AvgTS,Diff_AvgTOR,Diff_AvgORB_PCT,Diff_AvgPPP
0,-1.787879,9.909091,-0.181818,0.037008,0.04944,0.036241,-0.022658,-0.017282,0.034233,0.023808,-0.068662
1,-1.185417,-6.45625,0.095833,-0.032783,0.002718,-0.044944,-0.008053,-0.000296,-0.023088,-0.03268,0.016491
2,-0.275735,-9.314338,0.369485,-0.072721,-0.061194,0.016863,0.00797,0.016588,-0.001525,-0.022504,0.015488
3,10.615152,-0.451515,0.263636,5.7e-05,-0.015793,-0.033726,0.060476,0.051133,-0.012685,0.007012,0.109614
4,1.58651,-7.917889,0.336266,-0.052872,-0.001184,-0.081373,0.041397,0.036588,-0.022254,-0.092735,0.049771


In [551]:
training_data.shape

(484, 11)

In [552]:
training_labels.head()

Unnamed: 0,T1_WIN
0,1
1,0
2,1
3,1
4,1


##AI Model Training

In [553]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

#Initialize the logistic regression model
logreg = LogisticRegression(max_iter=1000)

#Evaluate the model's performance using cross-validation
cross_val_scores = cross_val_score(logreg, training_data, training_labels, cv=5)

cross_val_scores_mean = cross_val_scores.mean()

cross_val_scores_mean


np.float64(0.6880584192439863)

In [554]:
#Checking the shape of the training data
training_data.shape

(484, 11)

In [555]:
#Checking for NaN values in the training data
nan_columns = training_data.columns[training_data.isna().any()].tolist()

#Displaying columns with NaN values and the number of NaN values in them
nan_counts = training_data[nan_columns].isna().sum()
nan_counts


Unnamed: 0,0


In [556]:
#Exclude rows with NaN values from the training data and labels
training_data_cleaned = training_data.dropna()
training_labels_cleaned = training_labels.loc[training_data_cleaned.index]

#Checking the shape of the cleaned data
training_data_cleaned.shape, training_labels_cleaned.shape


((484, 11), (484,))

In [557]:
#Re-evaluate the model's performance using cross-validation on the cleaned data
cross_val_scores_cleaned = cross_val_score(logreg, training_data_cleaned, training_labels_cleaned, cv=5)

cross_val_scores_cleaned_mean = cross_val_scores_cleaned.mean()

cross_val_scores_cleaned_mean


np.float64(0.6880584192439863)

In [558]:
#Train the logistic regression model on the entire cleaned training dataset
logreg.fit(training_data_cleaned, training_labels_cleaned)

##Make Predictions On Upcoming Games

In [559]:
#Predict the probability of the home team winning for the upcoming games
upcoming_game_probabilities = logreg.predict_proba(upcoming_encoded_final[[col for col in upcoming_encoded_final.columns if 'Diff_' in col]])

In [560]:
upcoming_game_probabilities

array([[0.64295045, 0.35704955],
       [0.74551782, 0.25448218],
       [0.16949884, 0.83050116],
       [0.12656011, 0.87343989],
       [0.61589544, 0.38410456],
       [0.45038786, 0.54961214],
       [0.40938399, 0.59061601],
       [0.43260087, 0.56739913],
       [0.38384491, 0.61615509],
       [0.68798788, 0.31201212],
       [0.32308912, 0.67691088],
       [0.23155802, 0.76844198],
       [0.40219884, 0.59780116],
       [0.29363554, 0.70636446],
       [0.34389421, 0.65610579],
       [0.78828654, 0.21171346],
       [0.58082321, 0.41917679],
       [0.44914873, 0.55085127],
       [0.35591977, 0.64408023],
       [0.68866166, 0.31133834],
       [0.42725131, 0.57274869],
       [0.69706311, 0.30293689],
       [0.48907841, 0.51092159],
       [0.53884332, 0.46115668],
       [0.60783486, 0.39216514],
       [0.49357202, 0.50642798],
       [0.20967587, 0.79032413],
       [0.74850985, 0.25149015],
       [0.24880236, 0.75119764],
       [0.54407808, 0.45592192],
       [0.

These numbers are the probabilities of each team winning the game

In [561]:
#Extract probability that the HOME team will win (second column)
upcoming_game_prob_home_win = upcoming_game_probabilities[:, 1]

#Add predictions to the DataFrame
upcoming_encoded_final['HomeWinProbability'] = upcoming_game_prob_home_win

#Sort for readability
upcoming_predictions = (
    upcoming_encoded_final[['HOME_TEAM', 'AWAY_TEAM', 'HomeWinProbability']]
    .sort_values(by='HomeWinProbability', ascending=False)
    .reset_index(drop=True)
)

#Display
upcoming_predictions


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  upcoming_encoded_final['HomeWinProbability'] = upcoming_game_prob_home_win


Unnamed: 0,HOME_TEAM,AWAY_TEAM,HomeWinProbability
0,OKC,UTA,0.921773
1,CLE,TOR,0.921043
2,BOS,NOP,0.917805
3,CLE,CHA,0.910357
4,MEM,NOP,0.899343
...,...,...,...
220,TOR,BOS,0.117789
221,WAS,HOU,0.113223
222,POR,OKC,0.102963
223,NOP,BOS,0.091406
