# Clustering

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

# KMEANS
from sklearn.cluster import KMeans
from yellowbrick.cluster.elbow import KElbowVisualizer 
from yellowbrick.cluster import silhouette_visualizer 

# DBSCAN
from sklearn.cluster import DBSCAN
from scipy.spatial.distance import pdist, squareform
from sklearn.neighbors import NearestNeighbors

# HIERARCHICAL
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
from sklearn.cluster import AgglomerativeClustering

# EMA
from pyclustering.cluster.ema import ema, ema_visualizer
from pyclustering.cluster import cluster_visualizer_multidim

# Visualization
import plotly.express as px
import plotly.io as pio
import plotly.figure_factory as ff
import plotly.graph_objects as go
# Route pandas plotting calls (`.hist()`, `.boxplot()`, ...) through plotly.
pd.options.plotting.backend = "plotly"
# Default plotly template applied to the figures created in this notebook.
pio.templates.default = "seaborn"
import matplotlib.pyplot as plt
import seaborn as sns

# Do not truncate the column list when displaying wide dataframes.
pd.set_option('display.max_columns', None)

## Preparation

In [None]:
# Columns used as input features by every clustering algorithm below.
# NOTE: the name `feautures` (sic) is kept because the other cells refer to it.
feautures = [
    'max_tourney_revenue',
    'mean_rank_points',
    'lrpOnMxrp',
    'matches_won_ratio',
]

# Player-level dataset; the first CSV column is used as the index.
df_players = pd.read_csv("./datasets/players.csv", index_col=0)

In [None]:
# Draw one histogram per clustering feature.
for feature in feautures:
    feature_histogram = df_players[feature].hist()
    feature_histogram.show()

## K-means

### Normalization

In this particular case, a logarithmic transformation is applied to `mean_rank_points` to make its distribution more Gaussian-like. (`variance_rank_points` is not among the selected features, so it is not transformed here.)

In [None]:
# Selected features with a fresh 0..n-1 index, rounded to 3 decimals.
df_data = df_players[feautures].reset_index(drop=True).round(3)

# Transformations: natural logarithm of the mean rank points.
df_data['mean_rank_points'] = np.log(df_data['mean_rank_points'])

# Plot the transformed feature before rescaling.
df_data['mean_rank_points'].hist().show()

# Rescale every feature with Min-Max scaling and show the resulting distributions.
scaled_values = MinMaxScaler().fit_transform(df_data)
df_data = pd.DataFrame(scaled_values, columns=df_data.columns)
df_data.boxplot(column=feautures).show()

### Find Optimal K

In [None]:
# Estimator shared by both sweeps below.
model = KMeans(init="k-means++", n_init=10, max_iter=100)
# Options common to the two elbow visualizers.
elbow_options = dict(k=(2, 8), timings=False)

# Sweep scored with the visualizer's default metric.
sse_visualizer = KElbowVisualizer(model, **elbow_options)
sse_visualizer.fit(df_data)
sse_visualizer.show()

# Same sweep scored with the silhouette metric.
sil_visualizer = KElbowVisualizer(model, metric="silhouette", **elbow_options)
sil_visualizer.fit(df_data)
sil_visualizer.show()

Picking optimal K

The optimal `k` is 4

In [None]:
# Number of clusters located at the elbow of the curve fitted by `sse_visualizer`.
optimal_k = sse_visualizer.elbow_value_
if optimal_k is None:
    # Fail with a clear message instead of handing n_clusters=None to KMeans.
    raise ValueError("No elbow was detected by the elbow visualizer: choose k manually.")

# A fixed seed keeps the clustering (and therefore the cluster numbering used
# in the interpretation) stable from one run to the next.
kmeans = KMeans(n_clusters=optimal_k, n_init=10, max_iter=100, init="k-means++", random_state=42)
kmeans.fit(df_data)

df_players["cluster_kmeans"] = kmeans.labels_.astype(str)

# Score the same model whose labels were stored above, rather than a second,
# independently initialised KMeans that may produce a different partition.
x = silhouette_visualizer(kmeans, df_data)
print("The silhoutte score is: " + str(x.silhouette_score_))

### Result analysis

In [None]:
# Features inspected when interpreting the clusters.
interesting_features = [
    'mean_rank_points',
    'lrpOnAvgrp',
    'age',
    'total_matches_played',
    'performance_index_entropy',
]

# Aggregation applied to each cluster: its size plus the mean of selected columns.
kmeans_summary_spec = {
    "cluster_kmeans": "count",
    "mean_rank_points": "mean",
    "lrpOnAvgrp": "mean",
    "age": "mean",
    "matches_won_ratio": "mean",
    "total_matches_played": "mean",
    "performance_index_entropy": "mean",
}

# Summary table, clusters with the highest mean rank points first.
(
    df_players.groupby("cluster_kmeans")
    .agg(kmeans_summary_spec)
    .sort_values(by="mean_rank_points", ascending=False)
    .round(2)
    .rename(columns={"cluster_kmeans": "cluster size"})
)

Plot of the k-means centers

In [None]:
plt.figure(figsize=(15, 4))

# One line per centroid: x is the feature position, y the centroid coordinate.
for cluster_id, center in enumerate(kmeans.cluster_centers_):
    plt.plot(center, marker='o', label=f'Cluster {cluster_id}')

# Label the x axis with the feature names.
feature_names = df_data.columns
plt.xticks(range(len(feature_names)), feature_names, fontsize=15)
plt.legend(fontsize=10)
plt.show()

#### PCA visualization

In [None]:
# Every numeric player column except 'ht', projected onto two principal components.
df = df_players.select_dtypes(include=np.number).drop(columns=['ht'])
projection = PCA(n_components=2).fit_transform(df)
components_df = pd.DataFrame(projection)

# Scatter of the two components, coloured by k-means cluster.
pca_figure = px.scatter(x=components_df[0], y=components_df[1], color=df_players["cluster_kmeans"])
pca_figure.show()

#### Scatter matrix of selected features

In [None]:
# Pairwise scatter plots of the clustering features, coloured by k-means cluster.
px.scatter_matrix(df_players, color="cluster_kmeans", dimensions=feautures)

#### Scatter matrix of interesting features

In [None]:
# Pairwise scatter plots of the interpretation features, coloured by k-means cluster.
px.scatter_matrix(df_players, color="cluster_kmeans", dimensions=interesting_features)

#### Histograms of interesting features by gender
The only notable difference between male and female players is that female players are more numerous than male players; apart from that, the clusters do not discriminate by gender.

In [None]:
# One histogram per feature: a facet per k-means cluster, bars coloured by gender.
for feature in interesting_features:
    gender_histogram = px.histogram(
        df_players,
        x=feature,
        color=df_players["gender"],
        facet_col="cluster_kmeans",
    )
    gender_histogram.show()

#### Interpretation
Looking at the values within the clusters, the following interpretation can be made:

- Players high ranked, played a lot of matches, performance *varies a lot* from one tournament to another
  - Cluster 0: decreasing performance and high age
  - Cluster 3: increasing performance and low age
- Players low ranked, played few matches, performance *varies* from one tournament to another
  - Cluster 2: decreasing performance and high age
  - Cluster 1: increasing performance and low age

In **summary** k-means identifies strong and weak players, and for both of them it identifies the ones with a rising score that are young and the ones with a decreasing score that are old. Stronger players have played more matches and attract more spectators and more money.

## Density-based

In [None]:
# Standardize the clustering features, then round to 3 decimals.
raw_features = df_players[feautures].reset_index(drop=True)
standardized = StandardScaler().fit_transform(raw_features)
df_data = pd.DataFrame(standardized, columns=raw_features.columns).round(3)

# Distribution of the standardized features.
df_data.boxplot(column=feautures)

In [None]:
# pair-wise distance and then compute distance matrix
dist = pdist(X=df_data, metric='euclidean')
dist = squareform(dist)

kmin, kmax = 3, 30
kth_distances = {k:[] for k in range(kmin, kmax + 1)}

for d in dist:
    indexes_to_sort_d = np.argsort(d)
    for k in range(kmin, kmax + 1):
        kth_distances[k].append(d[indexes_to_sort_d[k]])

fig = go.Figure()
for k in kth_distances.keys():
    fig.add_trace(go.Scatter(x = np.array(range(0, len(kth_distances[k]))), y = sorted(kth_distances[k]), mode = 'lines' , name = str(k)))
fig.show()

### Find optimal hyper-parameters

In [None]:
def get_metrics(eps, min_samples, dataset, iter_=None, n_neighbors=5):
    """Fit DBSCAN on ``dataset`` and summarise the result with two metrics.

    Parameters
    ----------
    eps, min_samples
        Forwarded unchanged to ``DBSCAN``.
    dataset
        Samples to cluster (one row per sample).
    iter_
        Unused. Kept (now optional) so that existing calls passing an
        iteration counter keep working.
    n_neighbors : int, default 5
        How many nearest neighbours, the point itself excluded, are averaged
        in the noise metric. The default reproduces the previously hard-coded
        behaviour (6 neighbours including the point itself).

    Returns
    -------
    tuple
        ``(noise_mean_distance, number_of_clusters)``. The first element is
        the mean distance, rounded to 3 decimals, between the noise points and
        their ``n_neighbors`` nearest neighbours, or ``None`` when DBSCAN
        labels no point as noise. The second is the number of distinct
        non-noise labels.
    """
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit(dataset).labels_

    # DBSCAN marks noise points with the label -1.
    noise_mask = labels == -1

    noise_mean_distance = None
    if noise_mask.any():
        # Ask for one extra neighbour: the first column returned for each
        # point is skipped below because it is the point itself.
        neighbors = NearestNeighbors(n_neighbors=n_neighbors + 1).fit(dataset)
        distances, _ = neighbors.kneighbors(dataset)
        noise_mean_distance = round(distances[noise_mask, 1:].mean(), 3)

    # Count the distinct labels, ignoring the noise label.
    number_of_clusters = len(set(labels[labels >= 0]))
    return noise_mean_distance, number_of_clusters

# Hyper-parameter grid explored for DBSCAN.
eps_to_test = [round(eps, 1) for eps in np.arange(0.1, 3, 0.1)]
min_samples_to_test = range(3, 30, 2)
grid_shape = (len(eps_to_test), len(min_samples_to_test))

# Table for the metric on the mean distance of the noise points from their
# nearest neighbours (rows: eps, columns: min_samples), initialised to zero.
results_noise = pd.DataFrame(
    data=np.zeros(grid_shape),
    index=eps_to_test,
    columns=min_samples_to_test,
)

# Table for the metric on the number of clusters, same layout.
results_clusters = pd.DataFrame(
    data=np.zeros(grid_shape),
    index=eps_to_test,
    columns=min_samples_to_test,
)

iter_ = 0
for eps in eps_to_test:
    for min_samples in min_samples_to_test:
        iter_ += 1
        # Compute both metrics for this (eps, min_samples) pair.
        noise_metric, cluster_metric = get_metrics(eps, min_samples, df_data, iter_)

        results_noise.loc[eps, min_samples] = noise_metric
        results_clusters.loc[eps, min_samples] = cluster_metric

In [None]:
# Boolean grid marking the parameter pairs that yield exactly 3 clusters.
# It is not used by the plots in this cell.
sm = results_clusters == 3

# Side-by-side heatmaps of the two grid-search metrics.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))
sns.heatmap(results_noise, annot=True, ax=ax1, cbar=False).set_title("METRIC: Mean Noise Points Distance")
sns.heatmap(results_clusters, annot=True, ax=ax2, cbar=False).set_title("METRIC: Number of clusters")

# Columns are the min_samples values, rows the eps values.
for ax in (ax1, ax2):
    ax.set_xlabel("N")
    ax.set_ylabel("EPSILON")

plt.tight_layout()
plt.show()

### Result analysis

In [None]:
# Alternative setting that was also tried: eps=0.9, min_samples=3.
dbscan = DBSCAN(eps=0.4, min_samples=29).fit(df_data)
# Author's notes on the resulting labels:
# -1: 977 players with
# 0: young 20, increasing a lot
# 1: old 27, decreasing a lot
# 2: 23, stable
results = np.unique(dbscan.labels_, return_counts=True)
print(f"Clusters labels: {results[0]}\nElements per cluster: {results[1]}")

df_players["cluster_dbscan"] = dbscan.labels_.astype(str)
# numeric_only=True: df_players also holds string columns (for example the
# cluster labels stored as str), and pandas >= 2.0 raises a TypeError when
# asked to average them instead of silently dropping them.
df_players.groupby("cluster_dbscan").mean(numeric_only=True)

#### PCA visualization

In [None]:
# TODO remove these line if you want to plot over the normalized data where k-means was executed
# Every numeric player column except 'ht'.
# NOTE: the projection below is computed on `df_data`, not on `df`.
df = df_players.select_dtypes(include=np.number).drop(columns=['ht'])

# Two principal components of the data DBSCAN was fitted on, coloured by DBSCAN label.
projection = PCA(n_components=2).fit_transform(df_data)
components_df = pd.DataFrame(projection)
pca_figure = px.scatter(x=components_df[0], y=components_df[1], color=df_players["cluster_dbscan"])
pca_figure.show()

#### Interpretation

In [None]:
# Most frequent value(s) of every column within each k-means cluster.
# NOTE(review): this cell sits in the density-based section but groups by
# `cluster_kmeans` - confirm that `cluster_dbscan` was not intended.
df_players.groupby("cluster_kmeans").agg(pd.Series.mode)

In [None]:
dbscan_groups = df_players.groupby("cluster_dbscan")

# Select the columns before averaging: taking the mean of the whole frame
# would also hit the string columns, which raises a TypeError on pandas >= 2.0.
dbscan_groups[["mean_rank_points", "lrpOnAvgrp", "age", "total_matches_played", "mean_minutes", "matches_won_ratio"]].mean().sort_values(by="mean_rank_points", ascending=False)

# Cluster size (count of the `cluster_kmeans` column) plus per-cluster means,
# clusters with the highest mean rank points first, rounded to 2 decimals.
x = dbscan_groups.agg({"cluster_kmeans": "count", "mean_rank_points": "mean", "lrpOnAvgrp": "mean", "age": "mean", "total_matches_played": "mean", "mean_tourney_revenue": "mean"}).sort_values(by="mean_rank_points", ascending=False)
x = x.round(2)
# Rename the column `cluster_kmeans` to `cluster size`.
x = x.rename(columns={"cluster_kmeans": "cluster size"})

import plotly.figure_factory as ff
ff.create_table(x, index=True, index_title="Cluster")

In [None]:
# Pairwise scatter plots of the clustering features, coloured by DBSCAN label.
px.scatter_matrix(df_players, color="cluster_dbscan", dimensions=feautures)

In [None]:
# Pairwise scatter plots of the interpretation features, coloured by DBSCAN label.
px.scatter_matrix(df_players, color="cluster_dbscan", dimensions=interesting_features)

## Hierarchical

In [None]:
# Standardize the clustering features, then round to 3 decimals.
raw_features = df_players[feautures].reset_index(drop=True)
standardized = StandardScaler().fit_transform(raw_features)
df_data = pd.DataFrame(standardized, columns=raw_features.columns).round(3)

# Distribution of the standardized features.
df_data.boxplot(column=feautures)

In [None]:
# Hierarchical linkage with Ward's method on Euclidean distances.
dend = linkage(df_data, metric="euclidean", method='ward')

# Truncated dendrogram (truncate_mode 'lastp' with p = 10).
dendrogram(dend, truncate_mode='lastp', p=10)
plt.show()

In [None]:
# Ward linkage only supports Euclidean distances, which is also the default,
# so the distance argument is omitted: the `affinity=` keyword was deprecated
# in scikit-learn 1.2 and removed in 1.4, where passing it raises a TypeError.
cluster = AgglomerativeClustering(n_clusters=6, linkage='ward')
cluster.fit(df_data)

results = np.unique(cluster.labels_, return_counts=True)
print(f"Clusters labels: {results[0]}\nElements per cluster: {results[1]}")

# Store the labels as strings and round the numeric columns to 3 decimals.
df_players["cluster_hierarchical"] = cluster.labels_.astype(str)
df_players = df_players.round(3)

### Result analysis

In [None]:
# TODO remove these line if you want to plot over the normalized data where k-means was executed
# Every numeric player column except 'ht', projected onto two principal components.
# TODO: drop the `df` line to plot over the normalized data the clustering was run on.
df = df_players.select_dtypes(include=np.number).drop(columns=['ht'])

projection = PCA(n_components=2).fit_transform(df)
components_df = pd.DataFrame(projection)

# Scatter of the two components, coloured by hierarchical cluster.
pca_figure = px.scatter(x=components_df[0], y=components_df[1], color=df_players["cluster_hierarchical"])
pca_figure.show()

In [None]:
# Fold hierarchical clusters "1" and "4" into cluster "0".
merged_into_zero = df_players["cluster_hierarchical"].isin(["1", "4"])
df_players.loc[merged_into_zero, "cluster_hierarchical"] = "0"


In [None]:
# Per-cluster means of the columns of interest. The columns are selected
# before averaging because a mean over the whole frame would also hit the
# string columns (e.g. the cluster labels), which raises on pandas >= 2.0.
df_players.groupby("cluster_hierarchical")[["age", "mean_rank_points", "lrpOnAvgrp", "total_matches_played", "mean_minutes", "matches_won_ratio"]].mean()

## Gaussian Mixture (EMA)

In [None]:
# Clustering features with a fresh 0..n-1 index, rounded to 3 decimals.
# The Min-Max rescaling step is currently disabled:
#df_data = pd.DataFrame(MinMaxScaler().fit_transform(df_data), columns=df_data.columns)
df_data = df_players[feautures].reset_index(drop=True).round(3)

In [None]:
# The EMA implementation is fed plain Python lists of samples.
df = df_data.values.tolist()
# Expectation-maximization with 5 clusters.
ema_instance = ema(df, 5)
ema_instance.process()

# Get clustering results.
clusters = ema_instance.get_clusters()
covariances = ema_instance.get_covariances()
means = ema_instance.get_centers()
# Visualize obtained clustering results (disabled).
# ema_visualizer.show_clusters(clusters, sample, covariances, means)
# x = cluster_visualizer_multidim()
# x.append_clusters(clusters, df)
# x.show()

# Each entry of `clusters` lists the row positions assigned to that cluster.
# Labels are written by position, so the result does not depend on the index
# labels of df_players being unique. The loop variable is not called
# `cluster`, to avoid overwriting the fitted hierarchical model of that name.
gm_labels = pd.Series(np.nan, index=df_players.index, dtype=object)
for i, members in enumerate(clusters):
    print(f"Cluster {i}: {len(members)}")
    gm_labels.iloc[members] = str(i)
df_players["cluster_gm"] = gm_labels

# Per-cluster means of the columns of interest. The columns are selected
# before averaging because a mean over the whole frame would also hit the
# string columns, which raises on pandas >= 2.0.
df_players.groupby("cluster_gm")[["age", "mean_rank_points", "lrpOnAvgrp", "total_matches_played", "mean_minutes", "matches_won_ratio"]].mean()

### Result analysis

In [None]:
# TODO remove these line if you want to plot over the normalized data where k-means was executed
# Every numeric player column except 'ht', projected onto two principal components.
# TODO: drop the `df` line to plot over the normalized data the clustering was run on.
df = df_players[df_players.select_dtypes(include=np.number).columns.tolist()].drop(columns=['ht'])

components_df = pd.DataFrame(PCA(n_components=2).fit_transform(df))
# This is the Gaussian-mixture result analysis, so colour the points by the
# Gaussian-mixture labels (`cluster_gm`) rather than by the DBSCAN labels.
px.scatter(x=components_df[0], y=components_df[1], color=df_players["cluster_gm"]).show()

In [None]:
# Per-cluster means of the numeric columns. numeric_only=True is required
# because df_players also holds string columns (e.g. the other cluster
# labels), which pandas >= 2.0 refuses to average.
df_players.groupby("cluster_gm").mean(numeric_only=True)