# Analyzing Player Archetypes from Synergy Play-type data using KMeans Clustering
# For Blog: Method

In [None]:
import os, sys

# Put the parent of the current working directory on the import path.
# "__file__" is a string literal here (not the module attribute), so
# abspath() resolves it against the working directory; the two dirname()
# calls then strip that fake file name and one directory level.
# NOTE(review): presumably this is where the nbafuns module lives - confirm.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath("__file__"))))
# NOTE(review): pd, np, plt, sns and display are used below without an explicit
# import, so they presumably come from this star import - confirm.
from nbafuns import *
import seaborn.objects as snso
from sklearn.metrics.pairwise import pairwise_distances_argmin

# Directory prefix prepended to the parquet file names read below.
export_DIR = "../fdata/"

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [None]:
# Season key embedded in the parquet file names.
season = "2022"

# Read the two Synergy play-type exports for that season.
# NOTE(review): "PG" and "Tot" presumably mean per-game and season totals.
df = pd.read_parquet(f"{export_DIR}NBA_Synergy_Player_PG_{season}.parquet")
dft = pd.read_parquet(f"{export_DIR}NBA_Synergy_Player_Tot_{season}.parquet")

# Lower-case every column name so later cells can use a single spelling.
df.columns = [str.lower(name) for name in df.columns]
dft.columns = [str.lower(name) for name in dft.columns]

In [None]:
# Offensive rows only, reduced to player / play type / possession share,
# ordered by player then play type, with missing values replaced by 0.
df1 = (
    df.loc[
        df["type_grouping"] == "Offensive",
        ["player_name", "play_type", "poss_pct"],
    ]
    .sort_values(by=["player_name", "play_type"])
    .fillna(0)
)

In [None]:
# Wide table: one row per player, one column per play type, holding poss_pct.
# Duplicate (player, play type) rows are averaged; combinations that never
# occur become 0. The Misc and Transition columns are then discarded.
dfs = (
    pd.pivot_table(
        df1,
        index="player_name",
        columns="play_type",
        values="poss_pct",
        aggfunc="mean",
    )
    .fillna(0)
    .drop(columns=["Misc", "Transition"])
)

In [None]:
# Fix the feature columns and their order; the positional indices used by
# later plotting code refer to this order.
dfs = dfs.loc[
    :,
    [
        "PRBallHandler",
        "Isolation",
        "Spotup",
        "OffScreen",
        "Handoff",
        "Cut",
        "PRRollMan",
        "Postup",
        "OffRebound",
    ],
]

In [None]:
# Number of play-type feature columns.
dfs.shape[1]

In [None]:
# Total possessions per player, as a single-column frame named "poss"
# indexed (and ordered) by player name.
dftp = (
    dft.groupby("player_name")[["poss"]]
    .sum()
    .sort_values(by=["player_name"])
)

In [None]:
# How many players logged more than 300 total possessions.
int((dftp["poss"] > 300).sum())

In [None]:
# Keep only players with at least 200 total possessions.
# Membership is tested by player name instead of indexing dfs with a boolean
# Series built from dftp: pandas can only align such a mask when both frames
# list the same players, and raises (or warns) otherwise. When the two indexes
# do match, the result is the same rows in the same order.
qualified_players = dftp.index[dftp["poss"] >= 200]
dfs = dfs.loc[dfs.index.isin(qualified_players)]

In [None]:
# Feature matrix: one row per remaining player, one column per play type.
# No scaling is applied - "scaled_features" is a second name for the same array.
features = scaled_features = dfs.to_numpy()

# Finding Optimal Number of Clusters

In [None]:
# Settings shared by every KMeans fit in the cluster-count search.
kmeans_kwargs = dict(init="random", n_init=10, random_state=42)

# Elbow method: collect the fitted model's inertia (SSE) for k = 1..19.
sse = []
for k in range(1, 20):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs).fit(scaled_features)
    sse.append(kmeans.inertia_)

In [None]:
# Elbow plot: SSE against the number of clusters (sse holds one value per k,
# starting at k = 1).
plt.style.use("fivethirtyeight")
plt.plot(range(1, len(sse) + 1), sse)
plt.xticks(range(1, len(sse) + 1))
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.show()

In [None]:
# Silhouette coefficient for k = 2..19; the score needs at least two
# clusters, so the scan starts at 2.
silhouette_coefficients = []

for k in range(2, 20):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs).fit(scaled_features)
    score = silhouette_score(scaled_features, kmeans.labels_)
    silhouette_coefficients.append(score)

In [None]:
# Silhouette coefficient against the number of clusters (the list holds one
# value per k, starting at k = 2).
plt.style.use("fivethirtyeight")
plt.plot(range(2, len(silhouette_coefficients) + 2), silhouette_coefficients)
plt.xticks(range(2, len(silhouette_coefficients) + 2))
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Coefficient")
plt.show()

# Looking at clusters

In [None]:
def plot_clusters(df,dft,n,var1,var2):
    """Scatter two columns of two tables on one axis and label the second.

    df: table drawn first, as plain points.
    dft: table drawn on top of df; each of its points is also annotated
        with its "player_name" value, placed with valign="bottom".
    n: unused; kept so existing calls keep working.
    var1, var2: names of the columns used for the x and y axes.

    The calls in this file pass the player table as df and the centroid
    table as dft.
    """
    _, ax = plt.subplots(1, 1, figsize=(6.4, 4.8))
    for frame in (df, dft):
        sns.scatterplot(data=frame, x=var1, y=var2, ax=ax)
    annotated = snso.Plot(data=dft, x=var1, y=var2, text="player_name")
    annotated.add(snso.Text(valign="bottom")).on(ax).show()

## 3 Clusters

In [None]:
# Fit KMeans with 3 clusters and build two tables: dfc (players with grouped
# play-type columns and a cluster label) and dfct (the cluster centroids).
n = 3
kmeans = KMeans(
    init="random",
    n_clusters=n,
    n_init=10,
    max_iter=300,
    random_state=42,
)
kmeans.fit(scaled_features)
clusters = np.round(kmeans.cluster_centers_, 3)

# Append one row per centroid ("c1".."cn") below the players so the grouped
# columns are computed for players and centroids in one pass.
dfc = dfs.copy()
for i, center in enumerate(clusters):
    dfc.loc[f"c{i + 1}"] = center

# Collapse the nine play types into three coarse groups.
dfc["Ball"] = dfc["PRBallHandler"] + dfc["Isolation"]
dfc["Shoot"] = dfc["Spotup"] + dfc["OffScreen"] + dfc["Handoff"]
# OffRebound is the fourth big-man play type, so each feature column is
# counted exactly once across Ball / Shoot / Big.
dfc["Big"] = dfc["Cut"] + dfc["PRRollMan"] + dfc["Postup"] + dfc["OffRebound"]

# Split the centroid rows back off. Explicit copies keep the label assignment
# below from writing into a slice of the combined frame.
dfct = dfc.tail(n).copy()
dfc = dfc.head(-n).copy()
dfc["label"] = kmeans.labels_
display(dfct)

In [None]:
# Turn the numeric cluster ids into a categorical with readable names.
# NOTE(review): the name order assumes ids 0, 1, 2 correspond to wing, ball
# handler and big for this fit - confirm against the centroid table.
dfc["label"] = (
    dfc["label"].astype("category").cat.rename_categories(["wing", "ballh", "big"])
)
# Share of players that fall in each cluster.
dfc["label"].value_counts().div(len(dfc))

In [None]:
# Three 2-D views of the players with the labelled centroids on top.
for x_col, y_col in (
    ("PRBallHandler", "Spotup"),
    ("Cut", "Spotup"),
    ("PRRollMan", "Cut"),
):
    plot_clusters(dfc, dfct, n, x_col, y_col)

In [None]:
X = scaled_features
# Assign every player to the nearest centroid (using the rounded centroids).
k_means_labels = pairwise_distances_argmin(X, clusters)
colors = sns.color_palette(n_colors=n)
fig, ax = plt.subplots(1, 1)
# Positional feature columns plotted on the x and y axes.
var1, var2 = 0, 6
for k in range(n):
    col = colors[k]
    members = k_means_labels == k
    # Cluster members as dots, then the cluster's centroid as a circle.
    ax.plot(X[members, var1], X[members, var2], ".", markerfacecolor=col)
    ax.plot(clusters[k, var1], clusters[k, var2], "o", markerfacecolor=col)

## 12 Clusters

In [None]:
# Fit KMeans with 12 clusters and build two tables: dfc (players with grouped
# play-type columns and a cluster label) and dfct (the cluster centroids).
n = 12
kmeans = KMeans(
    init="random",
    n_clusters=n,
    n_init=10,
    max_iter=300,
    random_state=42,
)
kmeans.fit(scaled_features)
clusters = np.round(kmeans.cluster_centers_, 3)

# Append one row per centroid ("c1".."cn") below the players so the grouped
# columns are computed for players and centroids in one pass.
dfc = dfs.copy()
for i, center in enumerate(clusters):
    dfc.loc[f"c{i + 1}"] = center

# Collapse the nine play types into three coarse groups.
dfc["Ball"] = dfc["PRBallHandler"] + dfc["Isolation"]
dfc["Shoot"] = dfc["Spotup"] + dfc["OffScreen"] + dfc["Handoff"]
# OffRebound is the fourth big-man play type, so each feature column is
# counted exactly once across Ball / Shoot / Big.
dfc["Big"] = dfc["Cut"] + dfc["PRRollMan"] + dfc["Postup"] + dfc["OffRebound"]

# Split the centroid rows back off. Explicit copies keep the label assignment
# below from writing into a slice of the combined frame.
dfct = dfc.tail(n).copy()
dfc = dfc.head(-n).copy()
dfc["label"] = kmeans.labels_
# display(dfct)

In [None]:
# Centroid table ordered by pick-and-roll ball-handler share, lowest first.
dfct.sort_values("PRBallHandler")

c1: Spot up + Cut + Roll + Post up : Versatile Post up big  
c2: Secondary ball handler + Spot up: Shooter with ball handling  
c3: Ball handler  
c4: Ball handler + Spot up: Combo guard  
c5: Spot up: Standstill shooter  
c6: Jack of all trades  
c7: Spot up + Off Screen + Handoff: Movement shooter  
c8: Cut + Roll + OffReb: PnR Big  
c9: Spot up + Cut + Roll: Shooting Big  
c10: Ball handler + ISO + Spot up + Tall: Playmaker  
c11: Spot up + little bit of versatility: Utility Standstill shooter  
c12: Spot up + Cut + Tall: Tall Shooting Wing 

In [None]:
# Players assigned to cluster c11. Centroid rows are numbered from 1 while
# the KMeans labels start at 0, hence the i - 1.
i = 11
dfc[dfc["label"] == i - 1]