# Predict NBA Games Tutorial using LightGBM Classifier

Follows tutorial code from DataQuest to predict outcomes of NBA games. Use a **lightGBM classifier** model instead of a RidgeClassifier.

In [58]:
import pandas as pd

In [59]:
df = pd.read_csv("nba_games_runtime.csv", index_col=0) # read NBA data, first col is our pandas index
df

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240.0,240.0,37.0,96.0,0.385,12.0,29.0,0.414,20.0,26.0,...,57.1,33.8,258.0,121.0,ATL,94,1,2016,2015-10-27,True
1,240.0,240.0,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,...,33.3,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False
2,240.0,240.0,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,...,53.2,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False
3,240.0,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,...,30.4,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True
4,240.0,240.0,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,...,69.4,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22039,240.0,240.0,44.0,83.0,0.530,13.0,26.0,0.500,12.0,19.0,...,40.0,34.5,170.0,121.0,MIL,110,0,2024,2024-02-15,True
22040,240.0,240.0,46.0,89.0,0.517,11.0,30.0,0.367,25.0,27.0,...,25.8,31.5,200.0,137.0,POR,91,1,2024,2024-02-15,True
22041,240.0,240.0,31.0,83.0,0.373,8.0,29.0,0.276,21.0,30.0,...,33.3,33.0,181.0,101.0,MIN,128,0,2024,2024-02-15,False
22042,240.0,240.0,49.0,92.0,0.533,20.0,42.0,0.476,22.0,28.0,...,66.7,30.8,175.0,146.0,UTA,137,1,2024,2024-02-15,True


In [60]:
df = df.sort_values("date")
df = df.reset_index(drop=True) # drop old index
df

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240.0,240.0,37.0,96.0,0.385,12.0,29.0,0.414,20.0,26.0,...,57.1,33.8,258.0,121.0,ATL,94,1,2016,2015-10-27,True
1,240.0,240.0,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,...,33.3,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False
2,240.0,240.0,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,...,53.2,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False
3,240.0,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,...,30.4,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True
4,240.0,240.0,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,...,69.4,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22039,240.0,240.0,42.0,90.0,0.467,11.0,44.0,0.250,15.0,20.0,...,25.0,32.0,161.0,118.0,MEM,113,1,2024,2024-02-15,False
22040,240.0,240.0,44.0,83.0,0.530,13.0,26.0,0.500,12.0,19.0,...,40.0,34.5,170.0,121.0,MIL,110,0,2024,2024-02-15,True
22041,240.0,240.0,46.0,89.0,0.517,11.0,30.0,0.367,25.0,27.0,...,25.8,31.5,200.0,137.0,POR,91,1,2024,2024-02-15,True
22042,240.0,240.0,31.0,83.0,0.373,8.0,29.0,0.276,21.0,30.0,...,33.3,33.0,181.0,101.0,MIN,128,0,2024,2024-02-15,False


In [61]:
# Remove extraneous columns
del df["mp.1"]
del df["mp_opp.1"]
del df["index_opp"]
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240.0,37.0,96.0,0.385,12.0,29.0,0.414,20.0,26.0,0.769,...,57.1,33.8,258.0,121.0,ATL,94,1,2016,2015-10-27,True
1,240.0,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,0.800,...,33.3,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False
2,240.0,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,0.588,...,53.2,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False
3,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,0.696,...,30.4,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True
4,240.0,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,0.704,...,69.4,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22039,240.0,42.0,90.0,0.467,11.0,44.0,0.250,15.0,20.0,0.750,...,25.0,32.0,161.0,118.0,MEM,113,1,2024,2024-02-15,False
22040,240.0,44.0,83.0,0.530,13.0,26.0,0.500,12.0,19.0,0.632,...,40.0,34.5,170.0,121.0,MIL,110,0,2024,2024-02-15,True
22041,240.0,46.0,89.0,0.517,11.0,30.0,0.367,25.0,27.0,0.926,...,25.8,31.5,200.0,137.0,POR,91,1,2024,2024-02-15,True
22042,240.0,31.0,83.0,0.373,8.0,29.0,0.276,21.0,30.0,0.700,...,33.3,33.0,181.0,101.0,MIN,128,0,2024,2024-02-15,False


In [62]:
# This adds a "target" column to our data frame, which is what we should predict as the outcome of the subsequent game
def add_target(team):
    new_team_df = team.copy()
    new_team_df["target"] = new_team_df["won"].shift(-1)
    return new_team_df

df = df.groupby("team", group_keys=False).apply(add_target)

  df = df.groupby("team", group_keys=False).apply(add_target)


In [63]:
df["team"]

0        DET
1        ATL
2        CLE
3        CHI
4        NOP
        ... 
22039    MIL
22040    MEM
22041    MIN
22042    POR
22043    UTA
Name: team, Length: 22044, dtype: object

In [64]:
df[df["team"] == "TOR"]

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
16,240.0,36.0,80.0,0.450,7.0,18.0,0.389,27.0,39.0,0.692,...,31.1,127.0,110.0,IND,99,0,2016,2015-10-28,True,True
58,240.0,36.0,82.0,0.439,11.0,26.0,0.423,30.0,35.0,0.857,...,32.7,120.0,116.0,BOS,103,1,2016,2015-10-30,True,True
77,240.0,41.0,90.0,0.456,11.0,25.0,0.440,13.0,20.0,0.650,...,53.8,150.0,130.0,MIL,87,0,2016,2015-11-01,True,True
115,240.0,37.0,82.0,0.451,5.0,17.0,0.294,23.0,26.0,0.885,...,100.0,136.0,114.0,DAL,91,1,2016,2015-11-03,True,True
125,240.0,33.0,79.0,0.418,5.0,14.0,0.357,32.0,39.0,0.821,...,39.0,139.0,119.0,OKC,98,1,2016,2015-11-04,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21920,240.0,43.0,81.0,0.531,17.0,33.0,0.515,20.0,24.0,0.833,...,36.6,209.0,137.0,CHO,117,1,2024,2024-02-07,True,True
21942,240.0,40.0,95.0,0.421,10.0,33.0,0.303,17.0,19.0,0.895,...,30.9,300.0,118.0,HOU,104,0,2024,2024-02-09,True,False
21960,240.0,35.0,87.0,0.402,9.0,31.0,0.290,16.0,23.0,0.696,...,27.2,229.0,113.0,CLE,119,0,2024,2024-02-10,False,False
21981,240.0,39.0,98.0,0.398,8.0,31.0,0.258,13.0,16.0,0.813,...,34.3,155.0,107.0,SAS,122,0,2024,2024-02-12,False,False


In [65]:
df.loc[pd.isnull(df["target"]), "target"] = 2

In [66]:
df["target"] = df["target"].astype(int, errors="ignore")

In [67]:
# Check our dataset is balanced
df["won"].value_counts()

won
True     11022
False    11022
Name: count, dtype: int64

In [68]:
df["target"].value_counts()

target
1    11008
0    11006
2       30
Name: count, dtype: int64

In [69]:
nulls = pd.isnull(df)

In [70]:
nulls = nulls.sum()

In [71]:
nulls = nulls[nulls > 0]
nulls

+/-             22044
mp_max          22044
mp_max.1        22044
+/-_max            17
+/-_opp         22044
mp_max_opp      22044
mp_max_opp.1    22044
+/-_max_opp        17
dtype: int64

In [72]:
valid_columns = df.columns[~df.columns.isin(nulls.index)]
valid_columns

Index(['mp', 'fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%',
       ...
       'usg%_max_opp', 'ortg_max_opp', 'drtg_max_opp', 'team_opp', 'total_opp',
       'home_opp', 'season', 'date', 'won', 'target'],
      dtype='object', length=140)

In [73]:
df = df[valid_columns].copy()

In [74]:
df  # should have 142 columns

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,240.0,37.0,96.0,0.385,12.0,29.0,0.414,20.0,26.0,0.769,...,33.8,258.0,121.0,ATL,94,1,2016,2015-10-27,True,1
1,240.0,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,0.800,...,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False,1
2,240.0,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,0.588,...,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False,1
3,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,0.696,...,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True,1
4,240.0,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,0.704,...,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22039,240.0,42.0,90.0,0.467,11.0,44.0,0.250,15.0,20.0,0.750,...,32.0,161.0,118.0,MEM,113,1,2024,2024-02-15,False,2
22040,240.0,44.0,83.0,0.530,13.0,26.0,0.500,12.0,19.0,0.632,...,34.5,170.0,121.0,MIL,110,0,2024,2024-02-15,True,2
22041,240.0,46.0,89.0,0.517,11.0,30.0,0.367,25.0,27.0,0.926,...,31.5,200.0,137.0,POR,91,1,2024,2024-02-15,True,2
22042,240.0,31.0,83.0,0.373,8.0,29.0,0.276,21.0,30.0,0.700,...,33.0,181.0,101.0,MIN,128,0,2024,2024-02-15,False,2


In [75]:
removed_columns = ["season", "date", "won", "target", "team", "team_opp"]
selected_columns = df.columns[~df.columns.isin(removed_columns)]

from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Assuming 'df' is your DataFrame and 'selected_columns' are the features after initial preprocessing
X = df[selected_columns]
y = df['target']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate and train a LightGBM model
lgbm = LGBMClassifier(n_estimators=100, learning_rate=0.05, num_leaves=70, random_state=42)
lgbm.fit(X_train, y_train)

# Predictions
y_pred = lgbm.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005718 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15954
[LightGBM] [Info] Number of data points in the train set: 17635, number of used features: 132
[LightGBM] [Info] Start training from score -0.694339
[LightGBM] [Info] Start training from score -0.694679
[LightGBM] [Info] Start training from score -6.599587
Accuracy: 0.5271036516216829
