In [1]:
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np
from numpy import diff
import plotly.graph_objects as go

In [57]:
url = 'https://raw.githubusercontent.com/silogramd/NBA-Draft-Project/master/draft_player_data'
df = pd.read_csv(url,index_col=0,parse_dates=[0])
df.fillna(0,inplace=True)
df = df[df['year'] > 1988]
df.reset_index(inplace=True)

index                               688
year                               2013
draft_pick                           29
name                     Archie Goodwin
link          /players/g/goodwar01.html
g_total                             165
mp_total                           2396
pts_total                          1040
trb_total                           331
ast_total                           206
fg                                0.429
fg3                               0.236
mp_pg                              14.5
pts_pg                              6.3
trb_pg                                2
ast_pg                              1.2
ws                                  1.1
ws/48                             0.022
bpm                                -4.2
vorp                               -1.3
Name: 148, dtype: object

DRAFT "BUCKETS":

1. 1st Overall Pick
2. High Lottery (2-5)
3. Rest of Lottery (6-14)
4. Rest of First Round (15-30)
5. Second Round (31-45)
6. Second Round 2 (46-60)

In [48]:
def bucket(pick):
    if pick == 1:
        return 1
    elif pick > 1 and pick < 6:
        return 2
    elif pick > 5 and pick < 15:
        return 3
    elif pick > 14 and pick < 31:
        return 4
    elif pick > 30 and pick < 46:
        return 5
    elif pick > 45 and pick < 61:
        return 6

df['bucket'] = df['draft_pick'].apply(bucket)

In [49]:
target = df['draft_pick']
features = df.drop(["draft_pick", 'name','link','g_total','mp_total','pts_total',
                   'trb_total','ast_total','ws'], axis=1)
features = StandardScaler().fit_transform(features)
#features['pos'] = df['pos']

In [20]:
pca_list = []

for n_components in range(2, 11):
    pca = PCA(n_components = n_components)
    components = pca.fit_transform(features)
    pca_variance = sum(pca.explained_variance_ratio_)
    pca_list.append(pca_variance)
    print("For n_components = {}, explained variance ratio is {}".format(n_components, pca_variance))
    
dx = 1
y = pca_list
dy = diff(y)/dx
frame = pd.DataFrame()
frame['dy']=dy
frame['n']=range(2,10)
px.line(frame, x='n',y='dy', title="Differential of Explained Variance vs. Number of Principal Components").show()

For n_components = 2, explained variance ratio is 0.6234816624348724
For n_components = 3, explained variance ratio is 0.7291215826607437
For n_components = 4, explained variance ratio is 0.8162563708419203
For n_components = 5, explained variance ratio is 0.8762311230505125
For n_components = 6, explained variance ratio is 0.9160733475394394
For n_components = 7, explained variance ratio is 0.9525568099064068
For n_components = 8, explained variance ratio is 0.973140360082333
For n_components = 9, explained variance ratio is 0.9869228543191638
For n_components = 10, explained variance ratio is 0.9957266626448401


In [50]:
pca = PCA(n_components = 7)

components = pca.fit_transform(features)

pca_df7 = pd.DataFrame(data = components, columns = ['PC 1', 'PC 2', 'PC 3', 'PC 4', 'PC 5', 'PC 6', 'pc 7'])

pca = PCA(n_components = 5)

components = pca.fit_transform(features)

pca_df5 = pd.DataFrame(data = components, columns = ['PC 1', 'PC 2', 'PC 3', 'PC 4', 'PC 5'])


CHANGING THE NUMBER OF COMPONENTS DRASTICALLY CHANGES HOW EACH MODEL PERFORMS COMPARED TO EACH OTHER.
BY GRAPH, THERE ARE TWO MAIN POINTS WHERE THE DERIVATIVE OF THE EXPLAINED VARIANCE CHANGES THE MOST:
N_COMPONENTS = 5, AND N_COMPONENTS = 7. 

In [51]:
features = df.drop(["draft_pick", 'name','link','g_total','mp_total','pts_total',
                   'trb_total','ast_total', 'ws','year','index'], axis=1)
target = df['bucket']

In [52]:
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
estimators = {
    'k-Nearest Neighbor': KNeighborsClassifier(), 
    'Support Vector Machine': SVC(gamma='scale'),
    'Gaussian Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier()}

In [59]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

pca_df = pca_df5
test = pd.DataFrame(pca_df.iloc[148]).T
X_train, X_test, y_train, y_test = train_test_split(pca_df, target, random_state=3000)
for name, obj in estimators.items():
    obj.fit(X=X_train, y=y_train)
    predicted = obj.predict(X=X_test)
    score = obj.score(X_test,y_test)
    print("Archie Godwin's predicted bucket:", str(obj.predict(test)))
    print(name + ": \n\t" + f'accuracy={score:.2%}, ' + "\n")

Archie Godwin's predicted bucket: [4]
k-Nearest Neighbor: 
	accuracy=57.80%, 

Archie Godwin's predicted bucket: [4]
Support Vector Machine: 
	accuracy=64.19%, 

Archie Godwin's predicted bucket: [4]
Gaussian Naive Bayes: 
	accuracy=43.48%, 

Archie Godwin's predicted bucket: [4]
Decision Tree: 
	accuracy=51.66%, 

Archie Godwin's predicted bucket: [4]
Random Forest: 
	accuracy=56.27%, 



In [58]:
from sklearn.model_selection import GridSearchCV 
  
# defining parameter range 
param_grid = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['rbf']}  
  
grid = GridSearchCV(SVC(), param_grid, refit = True) 
  
# fitting the model for grid search 
grid.fit(X_train, y_train) 

grid.best_params_





{'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'}

USING PCA MAKES MODEL LESS ACCURATE-- CERTAIN OUTLIER STATLINES WERE BEING CORRECTLY PREDICTED WHEN THERE WAS NO REASON FOR THEM TO BE

In [46]:
clusters = pd.read_csv("positional-clustering/final-csv-data/clustered-nba.csv",index_col=0)
clusters.head()

Unnamed: 0,Player,Cluster,Role,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,USG%
0,James Harden,7,Star ball handler,36.1,6.6,7.5,2.0,0.7,0.442,0.368,0.879,40.5
1,Paul George,7,Star ball handler,28.0,8.2,4.1,2.2,0.4,0.438,0.386,0.839,29.5
2,Giannis Antetokounmpo,5,Star big,27.7,12.5,5.9,1.3,1.5,0.578,0.256,0.729,32.3
3,Joel Embiid,5,Star big,27.5,13.6,3.7,0.7,1.9,0.484,0.3,0.804,33.3
4,LeBron James,7,Star ball handler,27.4,8.5,8.3,1.3,0.6,0.51,0.339,0.665,31.6


In [47]:
clusterT = clusters["Cluster"]
clusterFeatures = clusters.drop(["Player", "Role","Cluster"], axis=1)

In [51]:
for estimator_name, estimator_object in estimators.items():
    kfold = KFold(n_splits=10, random_state=3000, shuffle=True)
    
    scores = cross_val_score(estimator=estimator_object, X=clusterFeatures, y=clusterT, cv=kfold)
    
    print(estimator_name + ": \n\t" + f'mean accuracy={scores.mean():.2%}, ' + f'standard deviation={scores.std():.2%}' +"\n")

k-Nearest Neighbor: 
	mean accuracy=51.98%, standard deviation=8.81%

Support Vector Machine: 
	mean accuracy=44.55%, standard deviation=6.20%

Gaussian Naive Bayes: 
	mean accuracy=77.77%, standard deviation=7.84%

Decision Tree: 
	mean accuracy=59.77%, standard deviation=10.72%

