# NBA Game Predictor
---
Predict outcome of NBA game based on box score data

DS 231 Assignment Documentation:

>I will predict the WINorLOSS column using a classification model. There are two classes within the target(W and L), and they are perfectly balanced at 50/50.
>I will use accuracy as my main evaluation metric, with some consideration given to precision and recall as well.
>To address the issue of leaky features, I will only be including a total combined score for each game and drop each team's point total from the data.

In [57]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from category_encoders import OrdinalEncoder

In [89]:
def wrangle(filepath):
    df = pd.read_csv(filepath,
                     parse_dates=['Date'],
                     index_col='Date')
    
    # Drop extraneous 'unnamed' column
    df.drop(columns='Unnamed: 0', inplace=True)
    
    # Transform Home feature into 1/0 (1 for home, 0 for away)
    df['Home'] = df['Home'].apply(lambda x: 1 if x == 'Home' else 0)
    
    # Transform WINorLOSS to Win and 1/0
    df['Win'] = df['WINorLOSS'].apply(lambda x: 1 if x == 'W' else 0)
    df.drop(columns=['WINorLOSS'], inplace=True)
    
    # Create Total Points feature
    df['TotalPoints'] = df['TeamPoints'] + df['OpponentPoints']
    
    # Create Defensive Rebounds feature
    df['DefRebound'] = df['TotalRebounds'] - df['OffRebounds']
    df['Opp.DefRebound'] = df['Opp.TotalRebounds'] - df['Opp.OffRebounds']
    
    ## Create features for the 'Four Factors'
    # Shooting : (FG + 0.5 * 3FG) / FGA
    df['Team_EFG'] = (df['FieldGoals'] + 0.5 * df['X3PointShots']) / df['FieldGoalsAttempted']
    df['Opp_EFG'] = (df['Opp.FieldGoals'] + 0.5 * df['Opp.3PointShots']) / df['Opp.FieldGoalsAttempted']
    
    # Turnovers : TOV / (FGA + 0.44 * FTA + TOV)
    df['Team_TOV'] = df['Turnovers'] / (df['FieldGoalsAttempted'] + 0.44 * df['FreeThrowsAttempted'] + df['Turnovers'])
    df['Opp_TOV'] = df['Opp.Turnovers'] / (df['Opp.FieldGoalsAttempted'] + 0.44 * df['Opp.FreeThrowsAttempted'] + df['Opp.Turnovers'])
    
    # Rebounding : DRB / (Opp ORB + DRB)
    df['Team_DRBP'] = df['DefRebound'] / (df['Opp.OffRebounds'] + df['DefRebound'])
    df['Opp_DRBP'] = df['Opp.DefRebound'] / (df['OffRebounds'] + df['Opp.DefRebound'])
    
    # Freethrows : FT / FGA
    df['Team_FTR'] = df['FreeThrows'] / df['FieldGoalsAttempted']
    df['Opp_FTR'] = df['Opp.FreeThrows'] / df['Opp.FieldGoalsAttempted']
    
    # Drop redundant features
    redundant = [col for col in df.columns if col[-1] == '.']
    df.drop(columns=redundant, inplace=True)
    
    # Drop leaky columns
    leaky_cols = ['TeamPoints', 'OpponentPoints']
    df.drop(columns=leaky_cols, inplace=True)
    
    return df

df = wrangle('data/nba.games.stats.csv')

In [55]:
print(df.columns)
print(df.shape)
#df.head()

Index(['Team', 'Game', 'Home', 'Opponent', 'FieldGoals', 'FieldGoalsAttempted',
       'X3PointShots', 'X3PointShotsAttempted', 'FreeThrows',
       'FreeThrowsAttempted', 'OffRebounds', 'TotalRebounds', 'Assists',
       'Steals', 'Blocks', 'Turnovers', 'TotalFouls', 'Opp.FieldGoals',
       'Opp.FieldGoalsAttempted', 'Opp.3PointShots',
       'Opp.3PointShotsAttempted', 'Opp.FreeThrows', 'Opp.FreeThrowsAttempted',
       'Opp.OffRebounds', 'Opp.TotalRebounds', 'Opp.Assists', 'Opp.Steals',
       'Opp.Blocks', 'Opp.Turnovers', 'Opp.TotalFouls', 'Win', 'TotalPoints',
       'DefRebound', 'Opp.DefRebound', 'Team_EFG', 'Opp_EFG', 'Team_TOV',
       'Opp_TOV', 'Team_DRBP', 'Opp_DRBP', 'Team_FTR', 'Opp_FTR'],
      dtype='object')
(9840, 42)


0

In [87]:
# Split Data
features = ['Team', 'Game', 'Home', 'Opponent', 'TotalPoints', 'Team_EFG', 'Opp_EFG', 'Team_TOV',
       'Opp_TOV', 'Team_DRBP', 'Opp_DRBP', 'Team_FTR', 'Opp_FTR']
target = 'Win'
X = df.drop(columns=target)
#X = df[features]
y = df[target]

# Train-Val-Test Split
train_cutoff = slice("2014-10","2016-04")
X_train, y_train = X.loc[train_cutoff], y.loc[train_cutoff]

val_cutoff = slice("2016-10","2017-04")
X_val, y_val = X.loc[val_cutoff], y.loc[val_cutoff]

test_cutoff = slice("2017-10","2018-04")
X_test, y_test = X.loc[test_cutoff], y.loc[test_cutoff]

  X_train, y_train = X.loc[train_cutoff], y.loc[train_cutoff]
  X_val, y_val = X.loc[val_cutoff], y.loc[val_cutoff]
  X_test, y_test = X.loc[test_cutoff], y.loc[test_cutoff]


In [88]:
# Build model
model = make_pipeline(
    OrdinalEncoder(),
    RandomForestClassifier(random_state=42, n_jobs=-1)
)
model.fit(X_train, y_train)
print('Accuracy:', accuracy_score(y_val, model.predict(X_val)))

Accuracy: 0.9089430894308943
