In [None]:
pip install scikeras

Collecting scikeras
  Downloading scikeras-0.12.0-py3-none-any.whl (27 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.12.0


In [None]:
pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.4.6-py3-none-any.whl (128 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.9/128.9 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.6 kt-legacy-1.0.5


In [None]:
import keras
import keras_tuner
import numpy as np
import pandas as pd
from keras import layers
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam
from keras_tuner.tuners import RandomSearch
from scikeras.wrappers import KerasClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [None]:
# Load the 2018-19 play-by-play CSV, discard the URL column, and preview the result.
raw_df = pd.read_csv('NBA_PBP_2018-19.csv').drop(columns=['URL'])
raw_df.head()

Unnamed: 0,GameType,Location,Date,Time,WinningTeam,Quarter,SecLeft,AwayTeam,AwayPlay,AwayScore,...,FreeThrowNum,EnterGame,LeaveGame,TurnoverPlayer,TurnoverType,TurnoverCause,TurnoverCauser,JumpballAwayPlayer,JumpballHomePlayer,JumpballPoss
0,regular,TD Garden Boston Massachusetts,October 16 2018,8:00 PM,BOS,1,720,PHI,Jump ball: J. Embiid vs. A. Horford (B. Simmon...,0,...,,,,,,,,J. Embiid - embiijo01,A. Horford - horfoal01,B. Simmons - simmobe01
1,regular,TD Garden Boston Massachusetts,October 16 2018,8:00 PM,BOS,1,700,PHI,R. Covington misses 3-pt jump shot from 27 ft,0,...,,,,,,,,,,
2,regular,TD Garden Boston Massachusetts,October 16 2018,8:00 PM,BOS,1,700,PHI,,0,...,,,,,,,,,,
3,regular,TD Garden Boston Massachusetts,October 16 2018,8:00 PM,BOS,1,675,PHI,,0,...,,,,,,,,,,
4,regular,TD Garden Boston Massachusetts,October 16 2018,8:00 PM,BOS,1,673,PHI,Defensive rebound by D. ari,0,...,,,,,,,,,,


In [None]:
# These columns are removed because they give away the play, i.e. the value
# used as the target class of the classification task.
dropped = [
    'Shooter', 'ShotType', 'ShotOutcome', 'ShotDist',
    'Assister', 'Blocker', 'FoulType', 'Fouler', 'Fouled',
    'Rebounder', 'ReboundType', 'ViolationPlayer',
    'ViolationType', 'FreeThrowShooter', 'FreeThrowOutcome',
    'FreeThrowNum', 'EnterGame', 'LeaveGame', 'TurnoverPlayer',
    'TurnoverType', 'TurnoverCause', 'TurnoverCauser',
    'JumpballAwayPlayer', 'JumpballHomePlayer', 'JumpballPoss', 'TimeoutTeam',
]

# DataFrame.drop returns a new frame, so raw_df itself is left untouched.
data = raw_df.drop(columns=dropped)

# Collapses the free-text AwayPlay / HomePlay descriptions into a few coarse play labels.
# The labels are stored in the `RefactoredAwayPlay` and `RefactoredHomePlay` columns.
def refactor_play(play):
    """Map one raw play description to a coarse play label.

    Parameters
    ----------
    play : str or missing value
        Text from the `AwayPlay` / `HomePlay` column; missing (NaN / None)
        when that team has no entry on the row.

    Returns
    -------
    str
        One of "no-play", "make", "miss", "rebound", "turnover",
        "jump-ball", "foul" or "other". Keywords are tested in that order,
        so the first match wins.
    """
    if pd.isnull(play):
        return "no-play"
    if "makes" in play:
        return "make"
    if "misses" in play:
        return "miss"
    if "rebound" in play:
        return "rebound"
    # The turnover keyword is matched case-insensitively: with the original
    # lower-case-only test the recorded class counts contained almost no
    # turnovers, which suggests the descriptions capitalise the word
    # (presumably "Turnover by ...") -- verify against the CSV.
    if "turnover" in play.lower():
        return "turnover"
    if "Jump ball" in play:
        return "jump-ball"
    if "foul" in play:
        return "foul"
    return "other"

# Label every away and home play, then discard the raw description columns.
data['RefactoredAwayPlay'] = data['AwayPlay'].apply(refactor_play)
data['RefactoredHomePlay'] = data['HomePlay'].apply(refactor_play)

data = data.drop(columns=['AwayPlay', 'HomePlay'])

data.head()

Unnamed: 0,GameType,Location,Date,Time,WinningTeam,Quarter,SecLeft,AwayTeam,AwayScore,HomeTeam,HomeScore,RefactoredAwayPlay,RefactoredHomePlay
0,regular,TD Garden Boston Massachusetts,October 16 2018,8:00 PM,BOS,1,720,PHI,0,BOS,0,jump-ball,no-play
1,regular,TD Garden Boston Massachusetts,October 16 2018,8:00 PM,BOS,1,700,PHI,0,BOS,0,miss,no-play
2,regular,TD Garden Boston Massachusetts,October 16 2018,8:00 PM,BOS,1,700,PHI,0,BOS,0,no-play,rebound
3,regular,TD Garden Boston Massachusetts,October 16 2018,8:00 PM,BOS,1,675,PHI,0,BOS,0,no-play,miss
4,regular,TD Garden Boston Massachusetts,October 16 2018,8:00 PM,BOS,1,673,PHI,0,BOS,0,rebound,no-play


In [None]:
# Turn the Date and Time columns into numeric year, month, day, hour and minute features.
game_date = pd.to_datetime(data['Date'])
data['Year'] = game_date.dt.year
data['Month'] = game_date.dt.month
data['Day'] = game_date.dt.day

# Parse the Time column once and read both parts from the same result
# (previously it was parsed, formatted back to a string, and re-parsed twice).
game_time = pd.to_datetime(data['Time'])
data['Hour'] = game_time.dt.hour
data['Minute'] = game_time.dt.minute

# The original text columns are no longer needed once the parts are extracted.
data = data.drop(columns=['Date', 'Time'])

data.head()

Unnamed: 0,GameType,Location,WinningTeam,Quarter,SecLeft,AwayTeam,AwayScore,HomeTeam,HomeScore,RefactoredAwayPlay,RefactoredHomePlay,Year,Month,Day,Hour,Minute
0,regular,TD Garden Boston Massachusetts,BOS,1,720,PHI,0,BOS,0,jump-ball,no-play,2018,10,16,20,0
1,regular,TD Garden Boston Massachusetts,BOS,1,700,PHI,0,BOS,0,miss,no-play,2018,10,16,20,0
2,regular,TD Garden Boston Massachusetts,BOS,1,700,PHI,0,BOS,0,no-play,rebound,2018,10,16,20,0
3,regular,TD Garden Boston Massachusetts,BOS,1,675,PHI,0,BOS,0,no-play,miss,2018,10,16,20,0
4,regular,TD Garden Boston Massachusetts,BOS,1,673,PHI,0,BOS,0,rebound,no-play,2018,10,16,20,0


In [None]:
# Encode the team and location columns as integer codes.
# Teams and locations are not one-hot encoded because that would produce too many columns;
# instead a dictionary maps every unique team / location to a unique index (starting at 1).
# Compare the `finalDf` preview below with the `data.head()` output above to see the difference.
finalDf = data.copy()
teams = pd.concat([finalDf['AwayTeam'], finalDf['HomeTeam']])
uniqueTeams = teams.unique()
teamDict = {value: index + 1 for index, value in enumerate(uniqueTeams)}

def encodeTeams(column):
    """Replace team abbreviations in `column` with their codes from `teamDict`.

    The Series is still modified in place, and is now also returned so the
    caller can assign it back to the frame explicitly. Relying on the in-place
    change to reach the parent DataFrame does not work under pandas
    Copy-on-Write.
    """
    column.replace(teamDict, inplace=True)
    return column

finalDf['AwayTeam'] = encodeTeams(finalDf['AwayTeam'])
finalDf['HomeTeam'] = encodeTeams(finalDf['HomeTeam'])
finalDf['WinningTeam'] = encodeTeams(finalDf['WinningTeam'])

locs = finalDf['Location'].unique()
locsDict = {value: index + 1 for index, value in enumerate(locs)}

def encodeLocs(column):
    """Replace location names in `column` with their codes from `locsDict`.

    Modifies the Series in place and returns it, for the same reason as
    `encodeTeams`.
    """
    column.replace(locsDict, inplace=True)
    return column

finalDf['Location'] = encodeLocs(finalDf['Location'])
finalDf.head()

Unnamed: 0,GameType,Location,WinningTeam,Quarter,SecLeft,AwayTeam,AwayScore,HomeTeam,HomeScore,RefactoredAwayPlay,RefactoredHomePlay,Year,Month,Day,Hour,Minute
0,regular,1,19,1,720,1,0,19,0,jump-ball,no-play,2018,10,16,20,0
1,regular,1,19,1,700,1,0,19,0,miss,no-play,2018,10,16,20,0
2,regular,1,19,1,700,1,0,19,0,no-play,rebound,2018,10,16,20,0
3,regular,1,19,1,675,1,0,19,0,no-play,miss,2018,10,16,20,0
4,regular,1,19,1,673,1,0,19,0,rebound,no-play,2018,10,16,20,0


In [None]:
# Display the team abbreviation -> integer code mapping.
teamDict

{'PHI': 1,
 'OKC': 2,
 'MIL': 3,
 'BRK': 4,
 'MEM': 5,
 'MIA': 6,
 'ATL': 7,
 'CLE': 8,
 'NOP': 9,
 'MIN': 10,
 'UTA': 11,
 'DEN': 12,
 'DAL': 13,
 'CHI': 14,
 'LAL': 15,
 'CHO': 16,
 'NYK': 17,
 'SAC': 18,
 'BOS': 19,
 'IND': 20,
 'GSW': 21,
 'TOR': 22,
 'ORL': 23,
 'DET': 24,
 'PHO': 25,
 'SAS': 26,
 'HOU': 27,
 'WAS': 28,
 'LAC': 29,
 'POR': 30}

In [None]:
# Display the location name -> integer code mapping.
locsDict

{'TD Garden Boston Massachusetts': 1,
 'Oracle Arena Oakland California': 2,
 'Spectrum Center Charlotte North Carolina': 3,
 'Little Caesars Arena Detroit Michigan': 4,
 'Bankers Life Fieldhouse Indianapolis Indiana': 5,
 'Amway Center Orlando Florida': 6,
 'Madison Square Garden (IV) New York New York': 7,
 'Scotiabank Arena Toronto Canada': 8,
 'Toyota Center Houston Texas': 9,
 'AT&T Center San Antonio Texas': 10,
 'Golden 1 Center Sacramento California': 11,
 'STAPLES Center Los Angeles California': 12,
 'Talking Stick Resort Arena Phoenix Arizona': 13,
 'Wells Fargo Center Philadelphia Pennsylvania': 14,
 'Capital One Arena Washington District of Columbia': 15,
 'Moda Center Portland Oregon': 16,
 'Barclays Center Brooklyn New York': 17,
 'FedEx Forum Memphis Tennessee': 18,
 'Target Center Minneapolis Minnesota': 19,
 'Smoothie King Center New Orleans Louisiana': 20,
 'Fiserv Forum Milwaukee Wisconsin': 21,
 'Vivint Smart Home Arena Salt Lake City Utah': 22,
 'United Center Chic

In [None]:
# Min-max scale every numeric column; the scaled values are stored in normDf.
numCols = [
    'Location', 'Quarter', 'SecLeft', 'AwayScore', 'HomeScore',
    'Year', 'Month', 'Day', 'Hour', 'Minute',
]
scaler = MinMaxScaler()
scaledValues = scaler.fit_transform(finalDf[numCols])
normDf = pd.DataFrame(scaledValues, columns=numCols)


# One-hot encode the categorical columns, naming each new column after its category.
categorical_cols = ['GameType', 'RefactoredAwayPlay', 'RefactoredHomePlay']
encoder = OneHotEncoder()
encodedCol = encoder.fit_transform(finalDf[categorical_cols])
encodedDf = pd.DataFrame(
    encodedCol.toarray(),
    columns=encoder.get_feature_names_out(categorical_cols),
)


# Join the scaled and encoded parts side by side. finalDf is rebound here, so
# re-running this cell requires re-running the encoding cell first.
finalDf = pd.concat([normDf, encodedDf], axis=1)
finalDf.head()

Unnamed: 0,Location,Quarter,SecLeft,AwayScore,HomeScore,Year,Month,Day,Hour,Minute,...,RefactoredAwayPlay_other,RefactoredAwayPlay_rebound,RefactoredAwayPlay_turnover,RefactoredHomePlay_foul,RefactoredHomePlay_make,RefactoredHomePlay_miss,RefactoredHomePlay_no-play,RefactoredHomePlay_other,RefactoredHomePlay_rebound,RefactoredHomePlay_turnover
0,0.0,0.0,1.0,0.0,0.0,0.0,0.818182,0.5,0.8,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.972222,0.0,0.0,0.0,0.818182,0.5,0.8,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.972222,0.0,0.0,0.0,0.818182,0.5,0.8,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.9375,0.0,0.0,0.0,0.818182,0.5,0.8,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.934722,0.0,0.0,0.0,0.818182,0.5,0.8,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [None]:
# Possible play categories: no-play, make, miss, rebound, turnover, jump ball, foul, other.

# TODO: the away side has 8 one-hot columns but the home side only 7
# (there is no home jump-ball column) -- confirm why.

targets = [
    'RefactoredAwayPlay_foul', 'RefactoredAwayPlay_make',
    'RefactoredAwayPlay_miss', 'RefactoredAwayPlay_no-play',
    'RefactoredAwayPlay_other', 'RefactoredAwayPlay_rebound',
    'RefactoredAwayPlay_turnover', 'RefactoredAwayPlay_jump-ball',
    'RefactoredHomePlay_make', 'RefactoredHomePlay_foul',
    'RefactoredHomePlay_miss', 'RefactoredHomePlay_no-play',
    'RefactoredHomePlay_other', 'RefactoredHomePlay_rebound',
    'RefactoredHomePlay_turnover',
]

# The one-hot play columns are the labels; every remaining column is a feature.
y = finalDf[targets]
X = finalDf.drop(columns=targets)

# Hold out 20% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Column sums of the one-hot training labels, i.e. the number of training rows per class.
y_train.sum()

RefactoredAwayPlay_foul          24443.0
RefactoredAwayPlay_make          60863.0
RefactoredAwayPlay_miss          56531.0
RefactoredAwayPlay_no-play      243357.0
RefactoredAwayPlay_other         49223.0
RefactoredAwayPlay_rebound       55400.0
RefactoredAwayPlay_turnover         38.0
RefactoredAwayPlay_jump-ball      1757.0
RefactoredHomePlay_make          62246.0
RefactoredHomePlay_foul          24745.0
RefactoredHomePlay_miss          55774.0
RefactoredHomePlay_no-play      248260.0
RefactoredHomePlay_other         43493.0
RefactoredHomePlay_rebound       57055.0
RefactoredHomePlay_turnover         39.0
dtype: float64

In [None]:
# Fully connected network: 12 inputs -> three 14-unit layers -> 15 outputs.
# The first layer has no activation; the next two use sigmoid.
# NOTE(review): softmax makes the 15 outputs sum to 1, but each label row
# appears to hold two 1s (one away column and one home column) -- confirm
# that a single softmax head is intended.
classifier = Sequential([
    Dense(units=14, input_dim=12),
    Dense(units=14, activation='sigmoid'),
    Dense(units=14, activation='sigmoid'),
    Dense(units=15, activation='softmax'),
])

In [None]:
# Adam optimiser with a 0.01 learning rate, minimising categorical cross-entropy.
opt = Adam(learning_rate=0.01)
classifier.compile(
    loss='categorical_crossentropy',
    optimizer=opt,
)

In [None]:
# Print the layer-by-layer output shapes and parameter counts.
classifier.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 14)                182       
                                                                 
 dense_1 (Dense)             (None, 14)                210       
                                                                 
 dense_2 (Dense)             (None, 14)                210       
                                                                 
 dense_3 (Dense)             (None, 15)                225       
                                                                 
Total params: 827 (3.23 KB)
Trainable params: 827 (3.23 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Train for 30 epochs in batches of 700 rows; the value returned by fit() is kept in `neuralnet`.
neuralnet = classifier.fit(
    X_train,
    y_train,
    epochs=30,
    batch_size=700,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [None]:
# Predict on the held-out test rows and display the resulting array.
preds = classifier.predict(X_test)
preds



array([[0.01088743, 0.05317276, 0.04512713, ..., 0.03820551, 0.06323677,
        0.        ],
       [0.01088743, 0.05317276, 0.04512713, ..., 0.03820551, 0.06323677,
        0.        ],
       [0.01088743, 0.05317276, 0.04512713, ..., 0.03820551, 0.06323677,
        0.        ],
       ...,
       [0.01088743, 0.05317276, 0.04512713, ..., 0.03820551, 0.06323677,
        0.        ],
       [0.01088743, 0.05317276, 0.04512713, ..., 0.03820551, 0.06323677,
        0.        ],
       [0.01088743, 0.05317276, 0.04512713, ..., 0.03820551, 0.06323677,
        0.        ]], dtype=float32)

In [None]:
# Turn each row of `preds` into a 0/1 indicator of its highest-scoring class.
# The array is overwritten in place, as before, because the evaluation cells
# that follow read `preds`. A single vectorised comparison replaces the
# element-by-element Python loop; entries tied for the row maximum all become
# 1, exactly as the loop did.
row_max = preds.max(axis=1, keepdims=True)
preds[...] = preds == row_max

In [None]:
# Confusion matrix between the argmax of each y_test row and the argmax of each preds row.
# NOTE(review): y_test rows look multi-hot (one away label plus one home label) and argmax
# returns the first maximum, so the true-label side would only ever be an away-play index
# while preds may point at a home-play column -- confirm this comparison is intended.
confusion_matrix(y_test.values.argmax(axis=1), preds.argmax(axis=1))

array([[    0,     0,     0,     0,     0,     0,     0,     0,  6087],
       [    0,     0,     0,     0,     0,     0,     0,     0, 15099],
       [    0,     0,     0,     0,     0,     0,     0,     0, 14138],
       [    0,     0,     0,     0,     0,     0,     0,     0, 60852],
       [    0,     0,     0,     0,     0,     0,     0,     0, 12388],
       [    0,     0,     0,     0,     0,     0,     0,     0, 13863],
       [    0,     0,     0,     0,     0,     0,     0,     0,     6],
       [    0,     0,     0,     0,     0,     0,     0,     0,   471],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0]])

In [None]:
# Classification report: per-class precision, recall, F1 and support for the 0/1 predictions in `preds`.
print(classification_report(y_test,preds))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00      6087
           1       0.00      0.00      0.00     15099
           2       0.00      0.00      0.00     14138
           3       0.00      0.00      0.00     60852
           4       0.00      0.00      0.00     12388
           5       0.00      0.00      0.00     13863
           6       0.00      0.00      0.00         6
           7       0.00      0.00      0.00       471
           8       0.00      0.00      0.00     15714
           9       0.00      0.00      0.00      6203
          10       0.00      0.00      0.00     13944
          11       0.50      1.00      0.67     62052
          12       0.00      0.00      0.00     10905
          13       0.00      0.00      0.00     14068
          14       0.00      0.00      0.00        18

   micro avg       0.50      0.25      0.34    245808
   macro avg       0.03      0.07      0.04    245808
weighted avg       0.13   

In [None]:
# Log loss on the test set.
# `preds` was overwritten with hard 0/1 values in an earlier cell, so the
# probabilities are predicted again here. This keeps the test loss comparable
# with the training loss, which is computed from predicted probabilities.
from sklearn.metrics import log_loss
test_probs = classifier.predict(X_test)
loss = log_loss(y_test, test_probs)
print(loss)

23.8357524123188


In [None]:
# Log loss on the training set, computed from the predicted probabilities.
from sklearn.metrics import log_loss

preds2 = classifier.predict(X_train)
loss = log_loss(y_train, preds2)
print(loss)

4.43106098661003


We can see that the training error is much lower than the testing error. This might be because the model is overfitting to the training data: when it sees new data in the test set, its accuracy is far off because it has seen nothing but the training data for so many iterations. As such, on the fitting graph we have seen in lecture, the model would sit far to the right. In the fitting graph, when model complexity is low, the training and test errors are both high (which is underfitting). As we increase the complexity, both errors get lower until, at a certain (optimal) complexity, the train and test errors reach their lowest collective point (where both are similar to each other). As we keep increasing model complexity, the training error gets much lower while the test error starts to increase a lot. At this point we are overfitting to the training data (which is where our current model is now). One caveat: the test log loss reported above (23.84) was computed after `preds` had been converted to hard 0/1 predictions, whereas the training log loss (4.43) used the raw predicted probabilities, so part of the gap comes from that difference rather than from overfitting.

# Next Two Models

Two more models for predicting NBA play outcomes could include a CNN and a DNN. A CNN and a DNN could be trained to learn spatial patterns and relationships within play descriptions or game context, leveraging features such as text embeddings. To accurately reflect and model the complexities of the data, there are many things we could improve to build a better neural network. One of the things we aim to improve is the major imbalance in the target classes. The no-play class in particular dominates, with around 200K rows, while the other classes have counts in the tens of thousands. In our next models, to address the data imbalance and poor model performance, we are going to preprocess our data differently by merging Away Play and Home Play into a single column that holds all the plays. This way, when we one-hot encode, we will end up with 8 classes instead of the 15 we have now.

# Conclusion

Our analysis of the model trained on the NBA dataset reveals several key findings. The model architecture consists of multiple layers with sigmoid and softmax activations, and prior to training, the dataset undergoes preprocessing steps including feature selection, one-hot encoding, and feature normalization in order to analyse the "Play" column accordingly. However, during the training process, the model demonstrates an increase in loss over epochs, which indicates potential issues with vanishing or exploding gradients. The classification report and log loss highlight poor performance across various classes and a significant discrepancy between the training and testing errors, suggesting overfitting. This overfitting is likely due to the model's high complexity, as indicated by the large number of parameters. To improve performance, we will need to resample the data and further refine the preprocessing steps.