In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [2]:
filename = '../../../data/NBAShot Locations1997-2020.csv'
df = pd.read_csv(filename)


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4729512 entries, 0 to 4729511
Data columns (total 22 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   Game ID            int64 
 1   Game Event ID      int64 
 2   Player ID          int64 
 3   Player Name        object
 4   Team ID            int64 
 5   Team Name          object
 6   Period             int64 
 7   Minutes Remaining  int64 
 8   Seconds Remaining  int64 
 9   Action Type        object
 10  Shot Type          object
 11  Shot Zone Basic    object
 12  Shot Zone Area     object
 13  Shot Zone Range    object
 14  Shot Distance      int64 
 15  X Location         int64 
 16  Y Location         int64 
 17  Shot Made Flag     int64 
 18  Game Date          int64 
 19  Home Team          object
 20  Away Team          object
 21  Season Type        object
dtypes: int64(12), object(10)
memory usage: 793.8+ MB


In [4]:
shot_type_map = {
    'Jump Shot': 'Jump Shot',
    'Layup Shot': 'Layup',
    'Driving Layup Shot': 'Layup',
    'Tip Shot': 'No Shot',  # updated mapping for Tip Shot
    'Running Jump Shot': 'Jump Shot',
    'Slam Dunk Shot': 'No Shot',  # updated mapping for Dunk Shot
    'Dunk Shot': 'No Shot',  # updated mapping for Dunk Shot
    'Driving Dunk Shot': 'No Shot',  # updated mapping for Dunk Shot
    'Hook Shot': 'Hook Shot',
    'No Shot': 'No Shot',
    'Turnaround Jump Shot': 'Jump Shot',
    'Reverse Layup Shot': 'Layup',
    'Running Layup Shot': 'Layup',
    'Driving Finger Roll Shot': 'Layup',
    'Alley Oop Dunk Shot': 'No Shot',  # updated mapping for Alley Oop Dunk Shot
    'Finger Roll Shot': 'Layup',
    'Reverse Dunk Shot': 'No Shot',  # updated mapping for Reverse Dunk Shot
    'Running Hook Shot': 'Hook Shot',
    'Running Dunk Shot': 'No Shot',  # updated mapping for Running Dunk Shot
    'Turnaround Hook Shot': 'Hook Shot',
    'Running Finger Roll Shot': 'Layup',
    'Driving Hook Shot': 'Hook Shot',
    'Running Tip Shot': 'No Shot',  # updated mapping for Running Tip Shot
    'Alley Oop Layup shot': 'Layup',
    'Turnaround Finger Roll Shot': 'Layup',
    'Fadeaway Jump Shot': 'Jump Shot',
    'Jump Hook Shot': 'Hook Shot',
    'Follow Up Dunk Shot': 'No Shot',  # updated mapping for Follow Up Dunk Shot
    'Jump Bank Shot': 'Jump Shot',
    'Hook Bank Shot': 'Hook Shot',
    'Driving Finger Roll Layup Shot': 'Layup',
    'Running Finger Roll Layup Shot': 'Layup',
    'Floating Jump shot': 'Jump Shot',
    'Putback Dunk Shot': 'No Shot',  # updated mapping for Putback Dunk Shot
    'Turnaround Fadeaway shot': 'Jump Shot',
    'Running Reverse Layup Shot': 'Layup',
    'Putback Layup Shot': 'Layup',
    'Finger Roll Layup Shot': 'Layup',
    'Driving Slam Dunk Shot': 'No Shot',  # updated mapping for Driving Slam Dunk Shot
    'Driving Reverse Layup Shot': 'Layup',
    'Pullup Jump shot': 'Jump Shot',
    'Running Bank shot': 'Jump Shot',
    'Step Back Jump shot': 'Jump Shot',
    'Driving Jump shot': 'Jump Shot',
    'Reverse Slam Dunk Shot': 'No Shot',  # updated mapping for Reverse Slam Dunk Shot
    'Driving Bank shot': 'Jump Shot',
    'Putback Slam Dunk Shot': 'No Shot',  # updated mapping for Putback Slam Dunk Shot
    'Driving Bank Hook Shot': 'Hook Shot',
    'Running Slam Dunk Shot': 'No Shot',  # updated mapping for Running Slam Dunk Shot
    'Turnaround Bank shot': 'Jump Shot',
    'Turnaround Bank Hook Shot': 'Hook Shot',
    'Jump Bank Hook Shot': 'Hook Shot',
    'Fadeaway Bank shot': 'Jump Shot',
    'Pullup Bank shot': 'Jump Shot',
    'Putback Reverse Dunk Shot': 'No Shot',
    'Running Bank Hook Shot': 'Hook Shot',
    'Tip Layup Shot': 'Layup',
    'Cutting Dunk Shot': 'No Shot',
    'Cutting Layup Shot': 'Layup',
    'Tip Dunk Shot': 'No Shot',
    'Running Alley Oop Dunk Shot': 'No Shot',
    'Running Alley Oop Layup Shot': 'Layup',
    'Driving Floating Jump Shot': 'Jump Shot',
    'Cutting Finger Roll Layup Shot': 'Layup',
    'Running Pull-Up Jump Shot': 'Jump Shot',
    'Driving Floating Bank Jump Shot' : 'Jump Shot', 
    'Step Back Bank Jump Shot': 'Jump Shot',
    'Turnaround Fadeaway Bank Jump Shot': 'Jump Shot', 
    'Driving Reverse Dunk Shot': 'Jump Shot',
    'Running Reverse Dunk Shot' : 'Jump Shot',
}

df['Action Type'] = df['Action Type'].map(shot_type_map)


In [5]:
le_action_type = LabelEncoder()
df['Action Type Encoded'] = le_action_type.fit_transform(df['Action Type'])

le_shot_zone_basic = LabelEncoder()
df['Shot Zone Basic Encoded'] = le_shot_zone_basic.fit_transform(df['Shot Zone Basic'])

le_shot_zone_area = LabelEncoder()
df['Shot Zone Area Encoded'] = le_shot_zone_area.fit_transform(df['Shot Zone Area'])

le_shot_zone_range = LabelEncoder()
df['Shot Zone Range Encoded'] = le_shot_zone_range.fit_transform(df['Shot Zone Range'])

le_shot_type = LabelEncoder()
df['Shot Type Encoded'] = le_shot_type.fit_transform(df['Shot Type'])


In [6]:
player_fg_percentage = df.groupby('Player ID').agg(
    player_name=('Player Name', 'first'),
    field_goal_percentage=('Shot Made Flag', 'mean')
).reset_index()

player_fg_percentage.head()

Unnamed: 0,Player ID,player_name,field_goal_percentage
0,3,Grant Long,0.42611
1,7,Dan Schayes,0.413965
2,12,Chris King,0.285714
3,15,Eric Piatkowski,0.433307
4,17,Clyde Drexler,0.419698


In [7]:
df = df.merge(player_fg_percentage[['Player ID', 'field_goal_percentage']], on='Player ID')


In [8]:
action_type_pct = df.groupby(['Player ID', 'Action Type']).agg(
    action_type_percentage=('Shot Made Flag', 'mean')
).reset_index()
action_type_pct.head()

shot_zone_basic_pct = df.groupby(['Player ID', 'Shot Zone Basic']).agg(
    shot_zone_basic_percentage=('Shot Made Flag', 'mean')
).reset_index()

# Calculate shooting percentage for each player based on Shot Zone Area
shot_zone_area_pct = df.groupby(['Player ID', 'Shot Zone Area']).agg(
    shot_zone_area_percentage=('Shot Made Flag', 'mean')
).reset_index()

df = df.merge(action_type_pct, on=['Player ID', 'Action Type'], how='left')
df = df.merge(shot_zone_basic_pct, on=['Player ID', 'Shot Zone Basic'], how='left')
df = df.merge(shot_zone_area_pct, on=['Player ID', 'Shot Zone Area'], how='left')

In [9]:
X = df[['Shot Distance', 'Action Type Encoded', 'Shot Zone Basic Encoded', 'Shot Zone Area Encoded', 'Period', 'Minutes Remaining', 'Seconds Remaining', 'field_goal_percentage', 'action_type_percentage', 'shot_zone_basic_percentage', 'shot_zone_area_percentage']]
y = df['Shot Made Flag']


# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

classifiers = {
    'Logistic Regression': LogisticRegression(random_state=42),
    #'SVM': SVC(random_state=42),
    'k-NN': KNeighborsClassifier(),
    'GBM': GradientBoostingClassifier(random_state=42),
}

for classifier_name, classifier in classifiers.items():
    # Train the classifier
    classifier.fit(X_train_scaled, y_train)
    
    # Make predictions on the test set
    y_pred = classifier.predict(X_test_scaled)
    
    # Evaluate the classifier
    print(f"{classifier_name} Results:")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nAccuracy Score:")
    print(accuracy_score(y_test, y_pred))
    print("\n" + "-" * 80 + "\n")


Logistic Regression Results:
Confusion Matrix:
[[409399 108176]
 [250404 177924]]

Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.79      0.70    517575
           1       0.62      0.42      0.50    428328

    accuracy                           0.62    945903
   macro avg       0.62      0.60      0.60    945903
weighted avg       0.62      0.62      0.61    945903


Accuracy Score:
0.6209125037133829

--------------------------------------------------------------------------------

k-NN Results:
Confusion Matrix:
[[332865 184710]
 [222010 206318]]

Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.64      0.62    517575
           1       0.53      0.48      0.50    428328

    accuracy                           0.57    945903
   macro avg       0.56      0.56      0.56    945903
weighted avg       0.57      0.57      0.57    945903


Accuracy Score:
0.57001933602

In [None]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(100,100,100), max_iter=100, learning_rate_init=0.01 ,learning_rate='adaptive', random_state=42, verbose=True, early_stopping=True)
mlp.fit(X_train_scaled, y_train)

y_pred = mlp.predict(X_test_scaled)
    
print(f"MLP Results:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))
print("\n" + "-" * 80 + "\n")

In [11]:
import xgboost as xgb

#xgb.set_config(verbosity=2)
# Train the XGBoost classifier
xgb_clf = xgb.XGBClassifier(random_state=42, use_label_encoder=False)
xgb_clf.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = xgb_clf.predict(X_test_scaled)

# Evaluate the classifier
print("XGBoost Results:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))



XGBoost Results:
Confusion Matrix:
[[421859  95716]
 [259192 169136]]

Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.82      0.70    517575
           1       0.64      0.39      0.49    428328

    accuracy                           0.62    945903
   macro avg       0.63      0.60      0.60    945903
weighted avg       0.63      0.62      0.61    945903


Accuracy Score:
0.62479450852783


In [12]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = rf_clf.predict(X_test_scaled)

# Evaluate the classifier
print("Random Forest Results:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))


Random Forest Results:
Confusion Matrix:
[[354095 163480]
 [236581 191747]]

Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.68      0.64    517575
           1       0.54      0.45      0.49    428328

    accuracy                           0.58    945903
   macro avg       0.57      0.57      0.56    945903
weighted avg       0.57      0.58      0.57    945903


Accuracy Score:
0.5770591699148856


In [16]:
# Make predictions on the test set
y_pred = rf_clf.predict(X_test_scaled)

# Evaluate the classifier
print("Random Forest Results:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))

Random Forest Results:
Confusion Matrix:
[[354095 163480]
 [236581 191747]]

Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.68      0.64    517575
           1       0.54      0.45      0.49    428328

    accuracy                           0.58    945903
   macro avg       0.57      0.57      0.56    945903
weighted avg       0.57      0.58      0.57    945903


Accuracy Score:
0.5770591699148856


In [18]:
from torch import nn
import torch
from torch.utils.data import Dataset, DataLoader

In [31]:
class NBAData(Dataset):
    def __init__(self, X, y):
        print(type(X))
        print(type(y))
        y = y.to_numpy()
        print(type(y))
        torchX = torch.from_numpy(X).float()
        torchy = torch.from_numpy(y).float()
        self.X = torchX
        self.y = torchy
        self.n = X.shape[0]
        
    def __len__(self):
        return self.n
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


In [32]:
X = df[['Shot Distance', 'Action Type Encoded', 'Shot Zone Basic Encoded', 'Shot Zone Area Encoded', 'Period', 'Minutes Remaining', 'Seconds Remaining', 'field_goal_percentage', 'action_type_percentage', 'shot_zone_basic_percentage', 'shot_zone_area_percentage']]
y = df['Shot Made Flag']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
dataset = NBAData(X_scaled, y)

print(len(dataset))
print(dataset[0])


<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>


TypeError: expected np.ndarray (got Series)

In [None]:
class 