In [None]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import time
import pandas as pd
import plotly.express as px
import operator

from scipy.io import arff
from sklearn import cluster
from sklearn import metrics
from sklearn import neighbors
import hdbscan

In [None]:
def get_ops():
    """Return the lookup table from a comparison symbol ('>' or '<') to its operator function."""
    comparisons = {}
    comparisons['>'] = operator.gt
    comparisons['<'] = operator.lt
    return comparisons

def determine_eps(data, n_neighbors=5, quantile=0.85):
    """
    Determine a suitable value for eps for use with DBSCAN.

    For every point the mean distance to its `n_neighbors` nearest neighbours
    is computed. These mean distances are sorted, and the value sitting at the
    `quantile` position of that sorted sequence is returned.

    Args:
        data: zipped list containing data points, like [(x1,y1), (x2,y2), ...].
            Anything `np.asarray` converts to an array of points is accepted.
        n_neighbors: number of nearest neighbours averaged per point (default 5).
        quantile: fraction in [0, 1] selecting the position in the sorted mean
            distances (default 0.85).

    Returns:
        An eps value that is at least as large as the mean neighbour distance
        of roughly `quantile` of the points.

    Raises:
        ValueError: if `n_neighbors` < 1, `quantile` is outside [0, 1], or
            there are not more points than `n_neighbors`.
    """
    if n_neighbors < 1:
        raise ValueError("n_neighbors must be at least 1")
    if not 0 <= quantile <= 1:
        raise ValueError("quantile must lie in [0, 1]")

    points = np.asarray(data)
    n_points = len(points)
    if n_points <= n_neighbors:
        raise ValueError(
            f"need more than {n_neighbors} points to average {n_neighbors} neighbours, got {n_points}"
        )

    tree = neighbors.KDTree(points)

    # One batched query instead of one query per point. Each point is its own
    # nearest neighbour (distance 0), hence k = n_neighbors + 1 and the first
    # column is dropped before averaging.
    dist, _ = tree.query(points, k=n_neighbors + 1)
    mean_distances = np.sort(dist[:, 1:].mean(axis=1))

    # Optional histogram for sanity check:
    # plt.hist(mean_distances, bins=50)
    # plt.show()

    # Optional scatter plot for sanity check:
    # plt.scatter(np.linspace(1, n_points, n_points), mean_distances, s=1)
    # plt.show()

    # The sorted distances effectively give the cumulative distribution
    # "function"; clamp the index so quantile == 1 selects the largest value.
    position = min(int(np.floor(n_points * quantile)), n_points - 1)
    return mean_distances[position]

def run(algorithm, data, metric, metric_params):
    """
    Fit every candidate estimator on `data`, score it, and report the best one.

    Args:
        algorithm: iterable of unfitted estimators. Each must provide
            `fit(data)` returning an object with a `labels_` attribute.
        data: the samples handed to `fit` and to `metric`.
        metric: callable `metric(data, labels)` returning a comparable score.
        metric_params: dict with the keys
            'best_score' - starting threshold a score has to beat,
            'op'         - '>' or '<', looked up through get_ops(),
            'fn'         - callable such as `max` / `min` accepting a `key`
                           argument, used to pick the best result.

    Returns:
        Tuple `(best_n_clusters_guess, best_model, total_exec_time)`:
        the cluster count of the best-scoring candidate (the noise label -1
        used by DBSCAN/HDBSCAN is not counted as a cluster), the fitted model
        with the best score, and the wall-clock seconds spent fitting and
        scoring all candidates.

    Raises:
        ValueError: if `algorithm` yields no estimators.
    """
    candidates = list(algorithm)
    if not candidates:
        raise ValueError("run() needs at least one estimator to evaluate")

    best_score = metric_params['best_score']
    op = get_ops()[metric_params['op']]

    results = []  # one (n_clusters, score, fitted model) tuple per candidate
    best_model = None

    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments.
    time_start = time.perf_counter()

    for alg in candidates:
        model = alg.fit(data)
        score = metric(data, model.labels_)

        if op(score, best_score):
            best_score = score
            best_model = model

        # Label -1 marks noise points in DBSCAN/HDBSCAN; it is not a cluster.
        found_labels = set(model.labels_)
        found_labels.discard(-1)
        results.append((len(found_labels), score, model))

    total_exec_time = time.perf_counter() - time_start

    best_result = metric_params['fn'](results, key=lambda r: r[1])
    best_n_clusters_guess = best_result[0]

    # If no score beat the starting threshold, still hand back a fitted model
    # (the one `fn` selected) instead of a placeholder value.
    if best_model is None:
        best_model = best_result[2]

    return best_n_clusters_guess, best_model, total_exec_time

# Selection settings passed to run() as `metric_params`, one dict per index.
#   'best_score': starting threshold; infinite so that any real score beats it
#   'score':      placeholder initial score (not used for model selection)
#   'op':         comparison symbol resolved through get_ops()
#   'fn':         built-in used to pick the best result from all candidates

silhouette_mp = {
    'best_score': float('-inf'),  # Silhouette coefficient: higher is better
    'score': 0,                   # Initial score
    'op': '>',                    # Score comparison operator
    'fn': max
}

cal_hara_mp = {
    'best_score': float('-inf'),  # CH index: higher is better
    'score': 0,                   # Initial score
    'op': '>',                    # Score comparison operator
    'fn': max
}

dav_boul_mp = {
    'best_score': float('inf'),   # DB index: lower is better
    'score': 99999.1,             # Initial score
    'op': '<',                    # Score comparison operator
    'fn': min
}

def print_results(n_clusters, model, timing, algorithm, metric):
    """
    Print a four-line summary of one clustering experiment.

    Args:
        n_clusters: cluster count to report.
        model: accepted for call-site symmetry; not used here.
        timing: elapsed time, printed as seconds.
        algorithm: name of the clustering algorithm.
        metric: name of the evaluation metric.
    """
    labelled_values = (
        ("Algorithm: ", algorithm),
        ("Metric: ", metric),
        ("Number of clusters: ", n_clusters),
    )
    for label, value in labelled_values:
        print(label, value)
    print(f"Time: {timing} seconds")

def plot_results(data, model, algorithm):
    """
    Scatter-plot 2D data coloured by the cluster labels of a fitted model.

    Args:
        data: object indexable by "x" and "y" (e.g. a DataFrame with those columns).
        model: fitted estimator exposing `labels_`; when `algorithm` is
            "K-Means" it must also expose `cluster_centers_`.
        algorithm: display name used in the title. For "K-Means" the cluster
            centres are drawn as black crosses on top of the points.
    """
    plt.scatter(data["x"], data["y"], c=model.labels_, cmap='rainbow')

    if algorithm == "K-Means":
        centers = np.asarray(model.cluster_centers_)
        # Named colour instead of the ambiguous "000000" (a hex code needs '#').
        plt.scatter(centers[:, 0], centers[:, 1], marker="x", c="black")

    plt.xlabel("x")
    plt.ylabel("y")
    plt.title(f"Data after clustering with {algorithm}")
    plt.show()

def plot_results_3D(data, model, algorithm):
    """
    Scatter-plot 3D data coloured by the cluster labels of a fitted model.

    Args:
        data: object indexable by "x", "y" and "z" (e.g. a DataFrame with those columns).
        model: fitted estimator exposing `labels_`; when `algorithm` is
            "K-Means" it must also expose `cluster_centers_`.
        algorithm: display name used in the title. For "K-Means" the cluster
            centres are drawn as black crosses on top of the points.
    """
    threedee = plt.figure()
    ax = threedee.add_subplot(projection="3d")
    ax.scatter(data["x"], data["y"], data["z"], c=model.labels_, cmap='rainbow')

    if algorithm == "K-Means":
        centers = np.asarray(model.cluster_centers_)
        # Named colour instead of the ambiguous "000000" (a hex code needs '#').
        ax.scatter(centers[:, 0], centers[:, 1], centers[:, 2], marker="x", c="black")

    ax.set_xlabel("x")
    ax.set_ylabel("y")
    ax.set_zlabel("z")
    ax.set_title(f"Data after clustering with {algorithm}")
    plt.show()

## 2D

In [None]:
# Load the 2D dataset (space-separated) with its columns named x and y.
df2d = pd.read_csv(
    "./dataset/tr.data",
    names=["x", "y"],
    sep=" ",
)

In [None]:
# Quick look at the raw 2D points before any clustering.
df2d.plot.scatter("x", "y")

### k-Means on 2D datasets

In [None]:
# Sweep k = 2..20. The fixed random_state makes the k-means++ initialisation,
# and therefore the selected model, reproducible across Restart & Run All.
kmeans_ = [
    cluster.KMeans(n_clusters=i, init="k-means++", random_state=0)
    for i in range(2, 20+1)
]

n_clusters, model, timing = run(kmeans_, df2d, metrics.davies_bouldin_score, dav_boul_mp)
print_results(n_clusters, model, timing, "K-Means", "Davies-Bouldin score")
plot_results(df2d, model, "K-Means")

### Agglomerative clustering on 2D datasets

In [None]:
# Ward-linkage agglomerative clustering: one candidate per cluster count, 2 through 20.
agglo_ward = [
    cluster.AgglomerativeClustering(linkage='ward', n_clusters=i)
    for i in range(2, 21)
]

n_clusters, model, timing = run(
    agglo_ward,
    df2d,
    metrics.davies_bouldin_score,
    dav_boul_mp,
)
print_results(n_clusters, model, timing, "Agglomerative Clustering", "Davies-Bouldin score")
plot_results(df2d, model, "Agglomerative Clustering")

### DBSCAN on 2D datasets

In [None]:
# Estimate eps from the data, then vary min_samples from 2 through 20 at that eps.
eps = determine_eps([*zip(df2d['x'], df2d['y'])])
dbscan_ = [
    cluster.DBSCAN(min_samples=i, eps=eps)
    for i in range(2, 21)
]

n_clusters, model, timing = run(
    dbscan_,
    df2d,
    metrics.davies_bouldin_score,
    dav_boul_mp,
)
print_results(n_clusters, model, timing, "DBSCAN", "Davies-Bouldin score")
plot_results(df2d, model, "DBSCAN")

In [None]:
# DBSCAN with hand-picked parameters, for comparison with the automatic sweep.
dbscan = cluster.DBSCAN(min_samples=7, eps=0.3)
dbscan.fit(df2d)

plt.scatter(
    df2d["x"],
    df2d["y"],
    cmap='rainbow',
    c=dbscan.labels_,
)
plt.title("Data after clustering with DBSCAN manually (eps = 0.3, min_samples = 7)")
plt.show()

### HDBSCAN on 2D datasets

In [None]:
# HDBSCAN candidates with min_cluster_size running from 2 through 20.
hdbscan_ = [
    hdbscan.HDBSCAN(min_cluster_size=i)
    for i in range(2, 21)
]

n_clusters, model, timing = run(
    hdbscan_,
    df2d,
    metrics.davies_bouldin_score,
    dav_boul_mp,
)
print_results(n_clusters, model, timing, "HDBSCAN", "Davies-Bouldin score")
plot_results(df2d, model, "HDBSCAN")

In [None]:
# HDBSCAN with a hand-picked min_cluster_size, for comparison with the sweep.
maa = hdbscan.HDBSCAN(min_cluster_size=4)
maa.fit(df2d)

plt.scatter(
    df2d["x"],
    df2d["y"],
    cmap='rainbow',
    c=maa.labels_,
)
plt.title("Data after clustering with HDBSCAN manually (min_cluster_size = 4)")
plt.show()

## 3D

In [None]:
# Load the 3D dataset (tab-separated) with its columns named x, y and z.
df3d = pd.read_csv(
    "./dataset/a.data",
    names=["x", "y", "z"],
    sep="\t",
)

In [None]:
# Quick look at the raw 3D points before any clustering.
threedee = plt.figure()
ax = threedee.add_subplot(projection="3d")
ax.scatter(*(df3d[axis_name] for axis_name in ("x", "y", "z")))
plt.show()

### k-Means on 3D datasets

In [None]:
# Sweep k = 2..20. The fixed random_state makes the k-means++ initialisation,
# and therefore the selected model, reproducible across Restart & Run All.
kmeans_ = [
    cluster.KMeans(n_clusters=i, init="k-means++", random_state=0)
    for i in range(2, 20+1)
]

n_clusters, model, timing = run(kmeans_, df3d, metrics.davies_bouldin_score, dav_boul_mp)
print_results(n_clusters, model, timing, "K-Means", "Davies-Bouldin score")
plot_results_3D(df3d, model, "K-Means")

In [None]:
# K-Means with a hand-picked cluster count. The fixed random_state keeps the
# resulting labels reproducible across Restart & Run All.
kmeans_ = cluster.KMeans(n_clusters=2, init="k-means++", random_state=0).fit(df3d)

threedee = plt.figure()
ax = threedee.add_subplot(projection="3d")
ax.scatter(df3d["x"], df3d["y"], df3d["z"], c=kmeans_.labels_, cmap='rainbow')
plt.title("Data after clustering with K-Means manually (n_clusters = 2)")
plt.show()

### Agglomerative clustering on 3D datasets

In [None]:
# Ward-linkage agglomerative clustering: one candidate per cluster count, 2 through 20.
agglo_ward = [
    cluster.AgglomerativeClustering(linkage='ward', n_clusters=i)
    for i in range(2, 21)
]

n_clusters, model, timing = run(
    agglo_ward,
    df3d,
    metrics.davies_bouldin_score,
    dav_boul_mp,
)
print_results(n_clusters, model, timing, "Agglomerative Clustering", "Davies-Bouldin score")
plot_results_3D(df3d, model, "Agglomerative Clustering")

In [None]:
# Agglomerative clustering with a hand-picked cluster count.
agglo_ward = cluster.AgglomerativeClustering(n_clusters=2, linkage='ward').fit(df3d)

threedee = plt.figure()
ax = threedee.add_subplot(projection="3d")
# Colour by the agglomerative labels fitted above (previously this plotted the
# K-Means labels left over from an earlier cell).
ax.scatter(df3d["x"], df3d["y"], df3d["z"], c=agglo_ward.labels_, cmap='rainbow')
plt.title("Data after clustering with Agglomerative Clustering manually (n_clusters = 2)")
plt.show()

### DBSCAN on 3D datasets

In [None]:
# Estimate eps from the data, then vary min_samples from 2 through 20 at that eps.
eps = determine_eps([*zip(df3d['x'], df3d['y'], df3d['z'])])
dbscan_ = [
    cluster.DBSCAN(min_samples=i, eps=eps)
    for i in range(2, 21)
]

n_clusters, model, timing = run(
    dbscan_,
    df3d,
    metrics.davies_bouldin_score,
    dav_boul_mp,
)
print_results(n_clusters, model, timing, "DBSCAN", "Davies-Bouldin score")
plot_results_3D(df3d, model, "DBSCAN")

In [None]:
# DBSCAN with hand-picked parameters, for comparison with the automatic sweep.
dbscan_ = cluster.DBSCAN(min_samples=15, eps=5)
dbscan_.fit(df3d)

threedee = plt.figure()
ax = threedee.add_subplot(projection="3d")
ax.scatter(
    df3d["x"],
    df3d["y"],
    df3d["z"],
    cmap='rainbow',
    c=dbscan_.labels_,
)
plt.title("Data after clustering with DBSCAN manually (eps = 5, min_samples = 15)")
plt.show()

### HDBSCAN on 3D datasets

In [None]:
# HDBSCAN candidates with min_cluster_size running from 2 through 20.
hdbscan_ = [
    hdbscan.HDBSCAN(min_cluster_size=i)
    for i in range(2, 21)
]

n_clusters, model, timing = run(
    hdbscan_,
    df3d,
    metrics.davies_bouldin_score,
    dav_boul_mp,
)
print_results(n_clusters, model, timing, "HDBSCAN", "Davies-Bouldin score")
plot_results_3D(df3d, model, "HDBSCAN")
# Show the parameters of the selected estimator.
print(model)

In [None]:
# HDBSCAN with a hand-picked min_cluster_size, for comparison with the sweep.
hdbscan_ = hdbscan.HDBSCAN(min_cluster_size=5)
hdbscan_.fit(df3d)

threedee = plt.figure()
ax = threedee.add_subplot(projection="3d")
ax.scatter(
    df3d["x"],
    df3d["y"],
    df3d["z"],
    cmap='rainbow',
    c=hdbscan_.labels_,
)
plt.title("Data after clustering with HDBSCAN manually (min_cluster_size = 5)")
plt.show()

### Interactive 3D plot for fun and profit

In [None]:
# Interactive plotly view of the 3D points, coloured by the labels of the fitted `hdbscan_` model.
fig = px.scatter_3d(
    df3d,
    x="x",
    y="y",
    z="z",
    color=hdbscan_.labels_,
)
fig.show()