In [101]:
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np
from numpy import diff
import plotly.graph_objects as go

In [107]:
df = pd.read_csv("draft_player_data", index_col = 0)
df.dropna(inplace=True)
df = df[df['year'] > 1988]

DRAFT "BUCKETS":

1. 1st Overall Pick
2. High Lottery (2-5)
3. Rest of Lottery (6-14)
4. Rest of First Round (15-30)
5. Second Round (31-45)
6. Second Round 2 (46-60)

In [108]:
def bucket(pick):
    if pick == 1:
        return 1
    elif pick > 1 and pick < 6:
        return 2
    elif pick > 5 and pick < 15:
        return 3
    elif pick > 14 and pick < 31:
        return 4
    elif pick > 30 and pick < 46:
        return 5
    elif pick > 45 and pick < 61:
        return 6

df['bucket'] = df['draft_pick'].apply(bucket)
df['bucket']

0       1
1       2
2       2
3       2
4       2
5       3
6       3
7       3
8       3
9       3
10      3
11      3
12      3
13      3
14      4
15      4
16      4
17      4
18      4
19      4
20      4
21      4
22      4
23      4
24      4
26      4
27      4
28      4
29      4
30      5
       ..
2061    3
2062    4
2063    4
2064    4
2065    4
2066    4
2068    4
2069    4
2070    4
2071    4
2072    4
2073    4
2074    4
2075    4
2077    4
2078    5
2080    5
2081    5
2082    5
2083    5
2085    5
2086    5
2087    5
2088    5
2090    5
2092    5
2096    6
2097    6
2098    6
2099    6
Name: bucket, Length: 1644, dtype: int64

In [111]:
target = df['bucket']
features = df.drop(["draft_pick", 'name','link','g_total','mp_total','pts_total',
                   'trb_total','ast_total'], axis=1)
features = StandardScaler().fit_transform(features)
#features['pos'] = df['pos']

In [112]:
pca_list = []

for n_components in range(2, 11):
    pca = PCA(n_components = n_components)
    components = pca.fit_transform(features)
    pca_variance = sum(pca.explained_variance_ratio_)
    pca_list.append(pca_variance)
    print("For n_components = {}, explained variance ratio is {}".format(n_components, pca_variance))
    
dx = 1
y = pca_list
dy = diff(y)/dx
frame = pd.DataFrame()
frame['dy']=dy
frame['n']=range(2,10)
px.line(frame, x='n',y='dy').show()

For n_components = 2, explained variance ratio is 0.6332967266507588
For n_components = 3, explained variance ratio is 0.7358181254063225
For n_components = 4, explained variance ratio is 0.8146194821993697
For n_components = 5, explained variance ratio is 0.8760342125821551
For n_components = 6, explained variance ratio is 0.9170467797779729
For n_components = 7, explained variance ratio is 0.9510406324661898
For n_components = 8, explained variance ratio is 0.9708280641303056
For n_components = 9, explained variance ratio is 0.9836840748837153
For n_components = 10, explained variance ratio is 0.9915205330307962


In [113]:
pca = PCA(n_components = 5)

components = pca.fit_transform(features)

pca_df = pd.DataFrame(data = components, columns = ['PC 1', 'PC 2', 'PC 3', 'PC 4', 'PC 5'])

In [114]:
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
estimators = {
    'k-Nearest Neighbor': KNeighborsClassifier(), 
    'Support Vector Machine': SVC(gamma='scale'),
    'Gaussian Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier()}

In [115]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

for estimator_name, estimator_object in estimators.items():
    kfold = KFold(n_splits=10, random_state=3000, shuffle=True)
    scores = cross_val_score(estimator=estimator_object, X=pca_df, y=target, cv=kfold)
    print(estimator_name + ": \n\t" + f'mean accuracy={scores.mean():.2%}, ' + f'standard deviation={scores.std():.2%}' +"\n")

k-Nearest Neighbor: 
	mean accuracy=77.68%, standard deviation=3.40%

Support Vector Machine: 
	mean accuracy=85.28%, standard deviation=3.76%

Gaussian Naive Bayes: 
	mean accuracy=59.37%, standard deviation=3.68%

Decision Tree: 
	mean accuracy=83.82%, standard deviation=2.25%



In [46]:
clusters = pd.read_csv("positional-clustering/final-csv-data/clustered-nba.csv",index_col=0)
clusters.head()

Unnamed: 0,Player,Cluster,Role,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,USG%
0,James Harden,7,Star ball handler,36.1,6.6,7.5,2.0,0.7,0.442,0.368,0.879,40.5
1,Paul George,7,Star ball handler,28.0,8.2,4.1,2.2,0.4,0.438,0.386,0.839,29.5
2,Giannis Antetokounmpo,5,Star big,27.7,12.5,5.9,1.3,1.5,0.578,0.256,0.729,32.3
3,Joel Embiid,5,Star big,27.5,13.6,3.7,0.7,1.9,0.484,0.3,0.804,33.3
4,LeBron James,7,Star ball handler,27.4,8.5,8.3,1.3,0.6,0.51,0.339,0.665,31.6


In [47]:
clusterT = clusters["Cluster"]
clusterFeatures = clusters.drop(["Player", "Role","Cluster"], axis=1)

In [51]:
for estimator_name, estimator_object in estimators.items():
    kfold = KFold(n_splits=10, random_state=3000, shuffle=True)
    
    scores = cross_val_score(estimator=estimator_object, X=clusterFeatures, y=clusterT, cv=kfold)
    
    print(estimator_name + ": \n\t" + f'mean accuracy={scores.mean():.2%}, ' + f'standard deviation={scores.std():.2%}' +"\n")

k-Nearest Neighbor: 
	mean accuracy=51.98%, standard deviation=8.81%

Support Vector Machine: 
	mean accuracy=44.55%, standard deviation=6.20%

Gaussian Naive Bayes: 
	mean accuracy=77.77%, standard deviation=7.84%

Decision Tree: 
	mean accuracy=59.77%, standard deviation=10.72%

