# Clustering

In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer, PowerTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from scipy import stats

# Visualization
import plotly.express as px
import plotly.io as pio
# Make pandas plotting methods (DataFrame.plot, DataFrame.boxplot, ...) render with plotly.
pd.options.plotting.backend = "plotly"
# Use the "seaborn" template as the default theme for plotly figures.
pio.templates.default = "seaborn"

In [None]:
# Load the player table; the first CSV column is used as the index.
# Optional filter kept for experimentation (restricts the data to gender "m"):
#   df_players = df_players[df_players.gender == "m"]
df_players = pd.read_csv(
    filepath_or_buffer="./datasets/players.csv",
    index_col=0,
)

In [None]:
# Columns of df_players used as clustering features.
# NOTE: the misspelled name `feautures` is kept because other cells reference it.
feautures = [
    "rel_df",
    "rel_1stIn",
    "rel_2ndWon",
    "1WonOn1In",
    "rel_bpSaved",
]

# Positional 0..n-1 index so rows line up with the label array produced by KMeans.
raw_features = df_players[feautures].reset_index(drop=True)

# Scaling alternatives that were tried and left disabled:
#   StandardScaler()
#   RobustScaler(unit_variance=True)
#   dropping rows with any |z-score| >= 2:
#       raw_features[(np.abs(stats.zscore(raw_features)) < 2).all(axis=1)]
quantile_scaled = QuantileTransformer().fit_transform(raw_features)
df_data = pd.DataFrame(quantile_scaled, columns=raw_features.columns).round(3)

# Visual check of the per-feature distributions after scaling.
df_data.boxplot(column=feautures)

In [None]:
# Display the transformed feature table that is fed to the clustering below.
df_data

## K-means

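Find the optimal K: fit K-means for a range of K values and compare the SSE (elbow), silhouette, Davies-Bouldin and Calinski-Harabasz scores.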

In [None]:
# Sweep K from 2 to max_k and record four clustering-quality measures per K.
max_k = 30

sse_scores = []
silhoutte_scores = []
davies_scores = []
calinski_harabasz_scores = []

for k in range(2, max_k + 1):
    kmeans = KMeans(n_clusters=k, n_init=10, max_iter=100, init="k-means++")
    labels = kmeans.fit(df_data).labels_

    # inertia_: sum of squared distances of samples to their closest cluster center
    sse_scores.append(kmeans.inertia_)
    silhoutte_scores.append(silhouette_score(df_data, labels))
    davies_scores.append(davies_bouldin_score(df_data, labels))
    calinski_harabasz_scores.append(calinski_harabasz_score(df_data, labels))


In [None]:
# Collect the sweep results into one table, one row per tested K.
df = pd.DataFrame(
    {
        "K": list(range(2, max_k + 1)),
        "sse": sse_scores,
        "sil": silhoutte_scores,
        "davies": davies_scores,
        "calinski": calinski_harabasz_scores,
    }
)

# One line chart per group of metrics; silhouette and Davies-Bouldin share a figure.
for metrics in (["sse"], ["calinski"], ["sil", "davies"]):
    fig = df.plot(x="K", y=metrics, kind="line")
    fig.update_traces(mode='lines+markers')
    fig.show()

In [None]:
# Number of clusters for the final model (presumably picked from the K-sweep curves — confirm).
optimal_k = 5

# random_state pins the centroid initialisation so cluster ids and the statistics
# derived from them are reproducible across notebook runs.
kmeans = KMeans(
    n_clusters=optimal_k,
    n_init=10,
    max_iter=300,
    init="k-means++",
    random_state=0,
)
kmeans.fit(df_data)

# Report the metrics of the model fitted in this cell. The previous version printed
# the values stored during the K sweep, which belong to a different fit
# (different random initialisation and max_iter=100) and could disagree with it.
print("SSE:", kmeans.inertia_, " - SILHOUETTE:", silhouette_score(df_data, kmeans.labels_))

In [None]:
# Count how many samples were assigned to each cluster.
# np.unique counts the labels that are actually present, so this does not rely on
# the labels being exactly 0..k-1 (as the histogram-with-integer-bins approach did).
cluster_ids, cluster_sizes = np.unique(kmeans.labels_, return_counts=True)

# Plain Python ints keep the printed mapping readable (no numpy scalar reprs).
clust_dict = {int(cid): int(size) for cid, size in zip(cluster_ids, cluster_sizes)}
print(clust_dict)

In [None]:
# Pairwise scatter plots of the clustering features, coloured by cluster.
# Labels are cast to str so plotly treats them as categories (discrete colours).
cluster_labels = kmeans.labels_.astype(str)
px.scatter_matrix(df_data, dimensions=feautures, color=cluster_labels)

In [None]:

# One histogram per column of df_players (every column except "name"),
# split into one facet per cluster and coloured by gender.
cluster_labels = kmeans.labels_.astype(str)
for feature in df_players.columns.drop(["name"]):
    fig = px.histogram(
        df_players,
        x=feature,
        facet_col=cluster_labels,
        color=df_players["gender"],
    )
    fig.show()

In [None]:
# Lift the column display limit so the wide summary table is not truncated.
pd.options.display.max_columns = None

# Attach the cluster id of each row to the player table, then summarise per cluster.
df_players["cluster"] = kmeans.labels_
df_players.groupby(by="cluster").describe()

## Density-based

## Hierarchical

## Optional

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=56da3ab5-e195-41aa-a609-f5fefeb3379d' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>