<a href="https://colab.research.google.com/github/smccracken13/NBA-All-Star-Prediction-Project/blob/main/(McCracken)_NBA_Pre_pro_and_Training_Data_Dev.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Pre-processing and Training Data Development

The goals of this notebook are to:

1. Create dummies for categorical data ('pos', 'team')
2. create train_test_split
3. standardize the data

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

# sklearn libraries
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score,matthews_corrcoef,classification_report,roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
# load the nba_clean2 dataframe
url = 'https://github.com/smccracken13/NBA-All-Star-Prediction-Project/blob/main/Data/nba_clean2.csv?raw=true'
df = pd.read_csv(url)

Create dummy variables for categorical data

In [3]:
df.select_dtypes(exclude=['int','float']).dtypes

player    object
pos       object
tm        object
dtype: object

In [4]:
# value counts of teams
df['tm'].value_counts()

# value counts of teams for players that were all-stars
df[df['all_star']==1]['tm'].value_counts()

# I am going to drop the team column because I don't see it factoring into all_star selection.
# It was interesting however to see that Lakers are the most represented by quite a bit

LAL    51
SAS    41
UTA    34
HOU    33
PHO    33
PHI    28
BOS    24
GSW    24
DET    23
CHI    23
MIA    21
MIL    19
SEA    19
DAL    17
OKC    17
CLE    16
LAC    16
POR    16
ORL    16
NYK    14
MIN    14
DEN    13
SAC    13
ATL    12
IND    11
TOR     9
NJN     8
WAS     6
NOH     5
CHH     5
TOT     4
MEM     4
NOP     3
WSB     3
BRK     2
CHA     1
CHO     1
KCK     1
Name: tm, dtype: int64

In [5]:
# player and team will not be used in analysis but I would like to keep it for reference
reference_df = df[['season', 'player_id', 'player','tm']]
df = df.drop(['season', 'player_id', 'player','tm'],axis=1)
# drop weird extra columns (don't need this if I fix this issue earlier)
# set seas_id to index
df = df.drop(['Unnamed: 0.1', 'Unnamed: 0'],axis=1)
df.set_index('seas_id')

Unnamed: 0_level_0,pos,experience,g,ts_percent,x3p_ar,f_tr,orb_percent,trb_percent,ast_percent,stl_percent,...,ast,stl,blk,tov,pf,pts,all_star,attend,tm_win_percent,seas_avg_attend
seas_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
29646,PF,8,75,0.602,0.312,0.276,6.1,10.3,11.6,0.9,...,188,44.0,44.0,133.0,148,1126,0,695262.000000,0.585366,692948.753846
29647,SF,1,6,0.200,0.200,0.000,0.0,3.3,0.0,0.0,...,0,0.0,2.0,2.0,2,2,0,846867.000000,0.621951,692948.753846
29648,PG,4,63,0.544,0.305,0.201,2.6,6.5,20.7,2.0,...,153,42.0,9.0,67.0,92,400,0,692948.753846,0.500000,692948.753846
29649,PG,4,41,0.547,0.313,0.164,1.5,5.6,17.1,1.8,...,78,24.0,9.0,39.0,60,251,0,641499.000000,0.426829,692948.753846
29650,PG,4,22,0.538,0.290,0.266,4.6,8.2,27.4,2.4,...,75,18.0,0.0,28.0,32,149,0,663171.000000,0.780488,692948.753846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8343,C,12,82,0.545,0.003,0.328,11.2,18.5,15.6,1.0,...,366,65.0,61.0,153.0,249,794,0,588867.333333,0.475610,588867.333333
8344,SF,1,52,0.466,0.012,0.325,11.5,15.6,6.1,1.2,...,33,17.0,23.0,48.0,100,180,0,588867.333333,0.500000,588867.333333
8345,PG,4,62,0.442,0.225,0.165,5.4,6.1,29.3,3.2,...,259,75.0,1.0,95.0,110,299,0,588867.333333,0.451220,588867.333333
8346,SG,2,64,0.518,0.009,0.291,4.8,6.3,12.8,2.4,...,95,59.0,19.0,96.0,132,546,0,588867.333333,0.414634,588867.333333


In [6]:
# Check categories for position
df['pos'].value_counts()

PF          4384
SG          4245
C           4163
PG          4149
SF          3926
SF-SG         40
SG-SF         33
PG-SG         33
SG-PG         32
PF-C          31
C-PF          31
PF-SF         31
SF-PF         24
SG-PF          4
SG-PG-SF       1
SF-C           1
PG-SF          1
Name: pos, dtype: int64

In [7]:
# one_hot_encode position
df = pd.get_dummies(df, columns=['pos'], prefix='Pos', drop_first=True)

In [8]:
df.head().T

Unnamed: 0,0,1,2,3,4
seas_id,29646.0,29647.0,29648.0,29649.0,29650.0
experience,8.0,1.0,4.0,4.0,4.0
g,75.0,6.0,63.0,41.0,22.0
ts_percent,0.602,0.2,0.544,0.547,0.538
x3p_ar,0.312,0.2,0.305,0.313,0.29
f_tr,0.276,0.0,0.201,0.164,0.266
orb_percent,6.1,0.0,2.6,1.5,4.6
trb_percent,10.3,3.3,6.5,5.6,8.2
ast_percent,11.6,0.0,20.7,17.1,27.4
stl_percent,0.9,0.0,2.0,1.8,2.4


Split into Train and Test sets

In [9]:
# Create train_test_split
X=df.loc[:, df.columns != 'all_star']
y=df['all_star']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=1)

Standardize the data

In [10]:
# Instantiate StandardScaler
scaler = StandardScaler()

# Fit scaler to the training data
scaler.fit(X_train)

# Transform the train and test data
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)