# Feature Engineering For Team Data

This notebook performs feature engineering on our raw game data. We remove extraneous and unimportant features and calculate advanced statistics such as Elo rating in order to gauge a team's likelihood of winning a game. This notebook references a previous project found [here](https://towardsdatascience.com/predicting-the-outcome-of-nba-games-with-machine-learning-a810bb768f20) and Nate Silver's formula for calculating Elo, which has been detailed by Matteo Hoch [here](https://www.ergosum.co/nate-silvers-nba-elo-algorithm/).

## Load the data
The data we will train our model on contains team stat summaries of every regular season game from the start of the 2015-16 season through 2/15/2024.

In [1]:
# Relative path to the raw team game data CSV that this notebook loads with `pd.read_csv`.
PATH_TO_TEAM_DATA = "../data/raw/nba_games_runtime.csv"

In [2]:
import pandas as pd

In [3]:
# Read the raw game data; the CSV's first column becomes the row index.
df = pd.read_csv(filepath_or_buffer=PATH_TO_TEAM_DATA, index_col=0)
df

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240.0,240.0,37.0,96.0,0.385,12.0,29.0,0.414,20.0,26.0,...,57.1,33.8,258.0,121.0,ATL,94,1,2016,2015-10-27,True
1,240.0,240.0,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,...,33.3,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False
2,240.0,240.0,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,...,53.2,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False
3,240.0,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,...,30.4,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True
4,240.0,240.0,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,...,69.4,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22039,240.0,240.0,44.0,83.0,0.530,13.0,26.0,0.500,12.0,19.0,...,40.0,34.5,170.0,121.0,MIL,110,0,2024,2024-02-15,True
22040,240.0,240.0,46.0,89.0,0.517,11.0,30.0,0.367,25.0,27.0,...,25.8,31.5,200.0,137.0,POR,91,1,2024,2024-02-15,True
22041,240.0,240.0,31.0,83.0,0.373,8.0,29.0,0.276,21.0,30.0,...,33.3,33.0,181.0,101.0,MIN,128,0,2024,2024-02-15,False
22042,240.0,240.0,49.0,92.0,0.533,20.0,42.0,0.476,22.0,28.0,...,66.7,30.8,175.0,146.0,UTA,137,1,2024,2024-02-15,True


In [4]:
# Show every feature name we are working with (the rich display above truncates them).
print(list(df.columns))

['mp', 'mp.1', 'fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', '+/-', 'ts%', 'efg%', '3par', 'ftr', 'orb%', 'drb%', 'trb%', 'ast%', 'stl%', 'blk%', 'tov%', 'usg%', 'ortg', 'drtg', 'mp_max', 'mp_max.1', 'fg_max', 'fga_max', 'fg%_max', '3p_max', '3pa_max', '3p%_max', 'ft_max', 'fta_max', 'ft%_max', 'orb_max', 'drb_max', 'trb_max', 'ast_max', 'stl_max', 'blk_max', 'tov_max', 'pf_max', 'pts_max', '+/-_max', 'ts%_max', 'efg%_max', '3par_max', 'ftr_max', 'orb%_max', 'drb%_max', 'trb%_max', 'ast%_max', 'stl%_max', 'blk%_max', 'tov%_max', 'usg%_max', 'ortg_max', 'drtg_max', 'team', 'total', 'home', 'index_opp', 'mp_opp', 'mp_opp.1', 'fg_opp', 'fga_opp', 'fg%_opp', '3p_opp', '3pa_opp', '3p%_opp', 'ft_opp', 'fta_opp', 'ft%_opp', 'orb_opp', 'drb_opp', 'trb_opp', 'ast_opp', 'stl_opp', 'blk_opp', 'tov_opp', 'pf_opp', 'pts_opp', '+/-_opp', 'ts%_opp', 'efg%_opp', '3par_opp', 'ftr_opp', 'orb%_opp', 'drb%_opp', 'trb%_opp', 'ast%_

## Data Cleaning and Feature Engineering

In [5]:
# Inspect the `mp_max.1` column on its own before cleaning.
df.loc[:, "mp_max.1"]

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
         ..
22039   NaN
22040   NaN
22041   NaN
22042   NaN
22043   NaN
Name: mp_max.1, Length: 22044, dtype: float64

In [6]:
# Remove extraneous columns: every `mp*` column plus the leftover `index_opp`.
# A missing column still raises KeyError, so a schema change is noticed immediately.
extraneous_columns = [
    "mp",
    "mp.1",
    "mp_opp",
    "mp_opp.1",
    "index_opp",
    "mp_max",
    "mp_max.1",
    "mp_max_opp",
    "mp_max_opp.1",
]
df = df.drop(columns=extraneous_columns)
df

Unnamed: 0,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,37.0,96.0,0.385,12.0,29.0,0.414,20.0,26.0,0.769,23.0,...,57.1,33.8,258.0,121.0,ATL,94,1,2016,2015-10-27,True
1,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,0.800,7.0,...,33.3,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False
2,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,0.588,11.0,...,53.2,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False
3,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,0.696,7.0,...,30.4,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True
4,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,0.704,8.0,...,69.4,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22039,44.0,83.0,0.530,13.0,26.0,0.500,12.0,19.0,0.632,7.0,...,40.0,34.5,170.0,121.0,MIL,110,0,2024,2024-02-15,True
22040,46.0,89.0,0.517,11.0,30.0,0.367,25.0,27.0,0.926,11.0,...,25.8,31.5,200.0,137.0,POR,91,1,2024,2024-02-15,True
22041,31.0,83.0,0.373,8.0,29.0,0.276,21.0,30.0,0.700,9.0,...,33.3,33.0,181.0,101.0,MIN,128,0,2024,2024-02-15,False
22042,49.0,92.0,0.533,20.0,42.0,0.476,22.0,28.0,0.786,13.0,...,66.7,30.8,175.0,146.0,UTA,137,1,2024,2024-02-15,True


### Fix Team Abbreviations

Some of the team abbreviations pulled from Basketball Reference do not match the official NBA abbreviations. We'll fix them now.

In [10]:
# Collect the distinct team codes (in order of first appearance) and print
# how many there are, followed by the codes themselves.
all_team_codes = df["team"].unique()

# We should have 30 teams
print(len(all_team_codes), all_team_codes, sep="\n")

30
['DET' 'ATL' 'CLE' 'CHI' 'NOP' 'GSW' 'SAS' 'OKC' 'WAS' 'DAL' 'PHO' 'LAC'
 'POR' 'MIL' 'SAC' 'IND' 'TOR' 'NYK' 'ORL' 'CHO' 'PHI' 'BOS' 'BRK' 'UTA'
 'MIA' 'HOU' 'MIN' 'LAL' 'MEM' 'DEN']


In [12]:
import numpy as np

# Fix team codes that do not match the NBA team codes
# BRK -> BKN
# CHO -> CHA
# PHO -> PHX
team_code_fixes = {"BRK": "BKN", "CHO": "CHA", "PHO": "PHX"}

# Only rewrite the two team-code columns. A frame-wide `df.replace(...)` scans
# every column and would silently alter any other column that happened to
# contain one of these strings.
df = df.replace({"team": team_code_fixes, "team_opp": team_code_fixes})
all_team_codes = np.sort(df["team"].unique())
all_opp_team_codes = np.sort(df["team_opp"].unique())

print(all_team_codes)

# Both columns must contain exactly the same set of codes after the fix.
assert np.array_equal(all_team_codes, all_opp_team_codes), (
    "team and team_opp contain different sets of team codes"
)

['ATL' 'BKN' 'BOS' 'CHA' 'CHI' 'CLE' 'DAL' 'DEN' 'DET' 'GSW' 'HOU' 'IND'
 'LAC' 'LAL' 'MEM' 'MIA' 'MIL' 'MIN' 'NOP' 'NYK' 'OKC' 'ORL' 'PHI' 'PHX'
 'POR' 'SAC' 'SAS' 'TOR' 'UTA' 'WAS']


### Elo Calculations

The Elo rating formula is from Nate Silver, founder of FiveThirtyEight. FiveThirtyEight has calculated Elo ratings for each team from 1946-2023 on [this website](https://projects.fivethirtyeight.com/complete-history-of-the-nba).

The formula has been replicated/detailed by Matteo Hoch in his blog [here](https://www.ergosum.co/nate-silvers-nba-elo-algorithm/).

We will be adhering to this formula. However, our dataset begins at the start of the 2015-16 NBA season.

To be as accurate to the "true" Elo as possible, I manually pulled the Elo ratings from the start of the 2015-16 season (from FiveThirtyEight) to use as our initial Elo ratings.

Note that our calculations will not include playoff games, so our ratings may deviate from FiveThirtyEight's in later seasons.

In [15]:
# Starting Elo rating for every team, keyed by team code.
# ELO RATINGS AT START OF 2015-16 SEASON (set manually)
initial_elo = {
    "ATL": 1562,
    "BKN": 1470,
    "BOS": 1520,
    "CHA": 1427,
    "CHI": 1570,
    "CLE": 1645,
    "DAL": 1544,
    "DEN": 1443,
    "DET": 1472,
    "GSW": 1743,
    "HOU": 1617,
    "IND": 1505,
    "LAC": 1647,
    "LAL": 1339,
    "MEM": 1583,
    "MIA": 1468,
    "MIL": 1459,
    "MIN": 1324,
    "NOP": 1521,
    "NYK": 1318,
    "OKC": 1564,
    "ORL": 1360,
    "PHI": 1333,
    "PHX": 1476,
    "POR": 1544,
    "SAC": 1440,
    "SAS": 1667,
    "TOR": 1502,
    "UTA": 1543,
    "WAS": 1536,
}

In [16]:
# Parse the `date` strings into datetimes, then order the games chronologically.
# `ignore_index=True` drops the old index and renumbers the rows from 0.
# NOTE(review): the default sort kind is not stable, so rows sharing a date may
# not keep their original relative order — confirm nothing downstream relies on it.
df = df.assign(date=pd.to_datetime(df["date"])).sort_values("date", ignore_index=True)
df

Unnamed: 0,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,37.0,96.0,0.385,12.0,29.0,0.414,20.0,26.0,0.769,23.0,...,57.1,33.8,258.0,121.0,ATL,94,1,2016,2015-10-27,True
1,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,0.800,7.0,...,33.3,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False
2,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,0.588,11.0,...,53.2,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False
3,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,0.696,7.0,...,30.4,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True
4,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,0.704,8.0,...,69.4,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22039,49.0,92.0,0.533,20.0,42.0,0.476,22.0,28.0,0.786,13.0,...,66.7,30.8,175.0,146.0,UTA,137,1,2024,2024-02-15,True
22040,42.0,90.0,0.467,11.0,44.0,0.250,15.0,20.0,0.750,11.0,...,25.0,32.0,161.0,118.0,MEM,113,1,2024,2024-02-15,False
22041,44.0,83.0,0.530,13.0,26.0,0.500,12.0,19.0,0.632,7.0,...,40.0,34.5,170.0,121.0,MIL,110,0,2024,2024-02-15,True
22042,46.0,89.0,0.517,11.0,30.0,0.367,25.0,27.0,0.926,11.0,...,25.8,31.5,200.0,137.0,POR,91,1,2024,2024-02-15,True


In [None]:
# Empty frame that will hold one row of Elo calculations per game:
# the matchup, plus each side's rating before and after the game.
elo_columns = [
    "date",
    "season",
    "team",
    "team_opp",
    "team_elo_before",
    "team_opp_elo_before",
    "team_elo_after",
    "team_opp_elo_after",
]
elo_df = pd.DataFrame(columns=elo_columns)

# TODO: Create a companion DataFrame holding the Elo of each team (not started yet).
# TODO: Find game_ids for each game!! Refer to the find_game_ids.ipynb notebook that I am working on!