In this notebook, we explore k-means and DBSCAN.

In [None]:
# Let's start with k-means, and let's plot a cloud of points
%matplotlib inline
import pandas as pd
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import matplotlib.animation as animation

# Load the point cloud; columns 1 and 2 (0-based) hold the x and y coordinates.
df = pd.read_csv('PointsCoordinates.csv')
x, y = (df.iloc[:, col].tolist() for col in (1, 2))

# Number of points (the animation below uses it as its frame count).
n = len(x)

# Start from an empty scatter artist; the animation callbacks fill it in.
fig, ax = plt.subplots(figsize=(6, 8))
sc = ax.scatter([], [], s=5, color='blue')

def init():
    """Pin the axis limits to the bounding box of the data.

    Returns:
        A one-element tuple holding the scatter artist.
    """
    x_bounds = (min(x), max(x))
    ax.set_xlim(*x_bounds)
    y_bounds = (min(y), max(y))
    ax.set_ylim(*y_bounds)
    return (sc,)

# Coordinates currently shown by the scatter artist; `update` overwrites them
# in place on every frame.
x_data, y_data = [], []


def update(frame):
    """Show the first ``frame + 1`` points of the cloud.

    The visible coordinates are rebuilt from ``x`` and ``y`` on every call
    rather than appended to, so the picture depends only on ``frame``.
    Rendering the animation a second time (for example by calling
    ``to_jshtml()`` again) therefore does not stack duplicate points.

    Args:
        frame: Zero-based frame index supplied by ``FuncAnimation``.

    Returns:
        A one-element tuple holding the updated scatter artist.
    """
    x_data[:] = x[:frame + 1]
    y_data[:] = y[:frame + 1]
    sc.set_offsets(list(zip(x_data, y_data)))
    return sc,

# Reveal one point per frame; `interval` is the delay between frames in ms.
ani = animation.FuncAnimation(
    fig,
    update,
    frames=n,
    init_func=init,
    interval=2,
    blit=True,
    repeat=False,
)

# Label the axes of the figure being animated.
ax.set_title('Points from CSV on a 2D Graph')
ax.set_xlabel('X-axis')
ax.set_ylabel('Y-axis')

# Close the figure so that no static plot is shown next to the animation.
plt.close(fig)
display(HTML(ani.to_jshtml()))


In [None]:
# Now let's run k-means with 3 clusters on the same data and animate how the clustering progresses from one iteration to the next
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from sklearn.cluster import KMeans
from IPython.display import display, HTML

# Load the same CSV and keep columns 1 and 2 (0-based) as the x/y coordinates.
df = pd.read_csv('PointsCoordinates.csv')
data = df.iloc[:, 1:3].values

# One list of visited centre positions per cluster; `animate` appends to them.
centroid_paths = [[] for _ in range(3)]
fig, ax = plt.subplots(figsize=(6, 8))

def animate(i):
    """Draw frame ``i`` of the k-means progression on ``ax``.

    Frame 0 shows the raw points in black. Frame ``i >= 1`` fits a fresh
    ``KMeans`` capped at ``i`` iterations (always with the same
    ``random_state``, so every fit starts from the same initial centres),
    colours the points by cluster and draws each centre together with the
    path it has followed over the frames drawn so far.

    Args:
        i: Zero-based frame index supplied by ``FuncAnimation``.
    """
    ax.clear()

    # For the initial frame, plot all points in black
    if i == 0:
        ax.scatter(data[:, 0], data[:, 1], s=5, c='black')
        ax.set_title('Initial State')
        return

    # Fit KMeans with an increasing number of iterations and 'random' initialization
    kmeans = KMeans(n_clusters=3, init='random', n_init=1, max_iter=i, random_state=42)
    kmeans.fit(data)
    labels = kmeans.labels_

    # Plot points based on their cluster labels
    for cluster, colour in enumerate(('green', 'red', 'blue')):
        members = data[labels == cluster]
        ax.scatter(members[:, 0], members[:, 1], s=5, c=colour, label=f'Cluster {cluster + 1}')

    # Plot cluster centers and their movement
    for j, center in enumerate(kmeans.cluster_centers_):
        # Discard anything recorded for this or a later frame so that drawing
        # the animation again does not append duplicate path segments.
        del centroid_paths[j][i - 1:]
        centroid_paths[j].append(center)
        path = np.array(centroid_paths[j])
        # Black dashes so the path is visible on the default white background.
        ax.plot(path[:, 0], path[:, 1], 'k--', linewidth=1)
        ax.scatter(center[0], center[1], c='black', s=100, marker='X')

    ax.set_title(f'Iteration: {i}')
    ax.legend()

# 11 frames in total: the untouched data, then k-means capped at 1..10 iterations.
ani = animation.FuncAnimation(
    fig,
    animate,
    frames=11,
    interval=500,
    repeat=False,
)

# Close the figure so the static plot is not displayed next to the animation.
plt.close(fig)
display(HTML(ani.to_jshtml()))


In [None]:
# K-means "almost" always converges. Let's run it several times from different
# random starting centroids and, for each run, plot where the centroids started
# and where they ended up.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs

# --- Parameters ---
k, max_iters, runs = 4, 10, 6  # clusters, iteration cap per run, number of runs

# --- Generate synthetic data ---
# Seed NumPy's global generator, which the np.random calls in this cell use.
np.random.seed(42)
X, _ = make_blobs(n_samples=200, centers=4, cluster_std=1.0)

def initialize_centroids(X, k):
    """Draw ``k`` starting centroids uniformly from the bounding box of ``X``.

    Args:
        X: 2-D array of points, one row per point.
        k: Number of centroids to draw.

    Returns:
        Array of shape ``(k, X.shape[1])`` sampled with ``np.random.uniform``
        between the per-column minimum and maximum of ``X``.
    """
    lower = X.min(axis=0)
    upper = X.max(axis=0)
    shape = (k, X.shape[1])
    return np.random.uniform(lower, upper, size=shape)

def assign_clusters(X, centroids):
    """Label every point with the index of its nearest centroid.

    Args:
        X: Array of points, one row per point.
        centroids: Array of centroids, one row per centroid.

    Returns:
        1-D integer array with one entry per row of ``X``; ties go to the
        lowest centroid index.
    """
    # Broadcasting (n, 1, d) against (k, d) yields every point-to-centroid offset.
    offsets = X[:, np.newaxis] - centroids
    point_to_centroid = np.linalg.norm(offsets, axis=2)
    return point_to_centroid.argmin(axis=1)

def update_centroids(X, labels, k):
    """Recompute each centroid as the mean of the points assigned to it.

    A cluster with no assigned points has no mean; taking one anyway would
    yield a NaN centroid that poisons every later distance computation and
    convergence check. Such a cluster is instead re-seeded on a data point
    chosen at random with ``np.random.randint``.

    Args:
        X: Array of points, one row per point.
        labels: Cluster index of every row of ``X``.
        k: Number of clusters.

    Returns:
        Array with one centroid per row, ``k`` rows in total. Rows are NaN
        only in the degenerate case where ``X`` itself has no rows.
    """
    X = np.asarray(X)
    labels = np.asarray(labels)

    centroids = []
    for i in range(k):
        members = X[labels == i]
        if len(members) > 0:
            centroids.append(members.mean(axis=0))
        elif len(X) > 0:
            # Empty cluster: restart it on a random data point.
            centroids.append(X[np.random.randint(len(X))])
        else:
            # No data at all, nothing to re-seed from.
            centroids.append(np.full(X.shape[1:], np.nan))
    return np.array(centroids)

# --- Plotting Function ---
def run_kmeans_and_plot(run_idx):
    """Run k-means once from random centroids and plot the outcome.

    Reads the module-level ``X``, ``k`` and ``max_iters``. The run is drawn in
    cell ``run_idx + 1`` of a 2x3 subplot grid on the current figure: points
    coloured by their final cluster, one line per centroid tracing its
    positions, a black dot at each starting position and a red ``X`` at each
    final position.

    Args:
        run_idx: Zero-based index of the run; selects the subplot and the title.
    """
    centroids = initialize_centroids(X, k)
    centroid_history = [centroids.copy()]

    for _ in range(max_iters):
        labels = assign_clusters(X, centroids)
        new_centroids = update_centroids(X, labels, k)
        centroid_history.append(new_centroids.copy())

        converged = np.allclose(centroids, new_centroids)
        centroids = new_centroids
        if converged:
            break

    # Assign against the final centroids: the labels computed inside the loop
    # belong to the previous centroid positions, and they do not exist at all
    # when max_iters is 0.
    labels = assign_clusters(X, centroids)

    # Plot
    ax = plt.subplot(2, 3, run_idx + 1)
    ax.set_title(f"K-means Run {run_idx + 1}")

    for j in range(k):
        members = X[labels == j]
        ax.scatter(members[:, 0], members[:, 1], s=20, label=f"Cluster {j}")

    for c in range(k):
        # Positions of centroid c across all recorded steps, start to finish.
        path = np.array([step[c] for step in centroid_history])
        ax.plot(path[:, 0], path[:, 1], marker='x', markersize=8, linewidth=2)
        ax.scatter(path[0, 0], path[0, 1], c='black', marker='o', s=80)
        ax.scatter(path[-1, 0], path[-1, 1], c='red', marker='X', s=100)

    ax.set_xticks([])
    ax.set_yticks([])

# --- Run All and Plot ---
# One wide figure; each call below draws one run into its own subplot of it.
plt.figure(figsize=(18, 10))
for run in range(runs):
    run_kmeans_and_plot(run)
# Tidy the spacing between the subplots before showing the figure.
plt.tight_layout()
plt.show()


In [None]:
# There are scenarios however where k-means fails, like polar
# representations with a group at the center and another group as a 'ring'
import numpy as np
import matplotlib.pyplot as plt

def generate_polar_view_points(n_points=1000, *, center_fraction=0.3,
                               center_radius=0.2, inner_radius=0.5,
                               outer_radius=0.7):
    """Sample 2-D points forming a filled disc surrounded by a ring.

    Angles are uniform in ``[0, 2*pi)``. Radii are the square root of a
    uniform draw (of the squared radius), which spreads the points evenly
    over the area instead of bunching them near the origin.

    Args:
        n_points: Total number of points to return.
        center_fraction: Share of the points placed in the central disc,
            between 0 and 1; the count is truncated to an integer.
        center_radius: Radius of the central disc.
        inner_radius: Inner radius of the ring.
        outer_radius: Outer radius of the ring.

    Returns:
        Array of shape ``(n_points, 2)``: the disc points first, then the
        ring points.
    """
    n_center = int(n_points * center_fraction)
    # The ring takes whatever is left, so the two groups always add up to
    # n_points; truncating both shares independently could drop a point.
    n_ring = n_points - n_center

    # Central disc
    theta_center = 2 * np.pi * np.random.rand(n_center)
    r_center = center_radius * np.sqrt(np.random.rand(n_center))
    center_points = np.column_stack((r_center * np.cos(theta_center), r_center * np.sin(theta_center)))

    # Outer ring
    theta_ring = 2 * np.pi * np.random.rand(n_ring)
    r_ring = np.sqrt(np.random.uniform(inner_radius**2, outer_radius**2, n_ring))
    ring_points = np.column_stack((r_ring * np.cos(theta_ring), r_ring * np.sin(theta_ring)))

    # Combine points
    return np.vstack((center_points, ring_points))

# Draw a sample and plot it with equal axis scaling so the ring looks circular.
points = generate_polar_view_points()
plt.figure(figsize=(11, 11))
polar_ax = plt.gca()
polar_ax.scatter(points[:, 0], points[:, 1], s=5)
polar_ax.set_title("Polar View with Center Cluster and Outer Ring")
polar_ax.set_xlabel("X")
polar_ax.set_ylabel("Y")
polar_ax.grid(True)
polar_ax.set_aspect('equal', adjustable='box')
plt.show()


In [None]:
#k-means has a hard time with this polar structure
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from matplotlib.animation import FuncAnimation
from IPython.display import display, HTML

# Draw a fresh sample with the generate_polar_view_points() function defined
# in the earlier cell.
points = generate_polar_view_points()

# KMeans clustering with 2 clusters and random initialization. random_state is
# pinned so that every refit done by update() starts from the same initial
# centres; without it each frame would show an unrelated random restart rather
# than one more iteration of the same run.
kmeans = KMeans(n_clusters=2, init='random', n_init=1, max_iter=1, random_state=42)

# Set up the figure and axis
fig, ax = plt.subplots(figsize=(11, 11))
scatter = ax.scatter(points[:, 0], points[:, 1], s=5, color='black')  # Start with all points in black
centroids, = ax.plot([], [], 'kX', markersize=10)  # Cluster centers in black

def init():
    """Empty the centroid markers.

    Returns:
        The scatter and centroid artists, in that order.
    """
    centroids.set_data([], [])
    return (scatter, centroids)

def update(frame):
    """Refit k-means capped at ``frame`` iterations and recolour the points.

    Frame 0 leaves the artists untouched. For later frames, points labelled 0
    are drawn blue and all others red, and the centroid markers are moved to
    the fitted cluster centres.

    Args:
        frame: Zero-based frame index supplied by ``FuncAnimation``.

    Returns:
        The scatter and centroid artists, in that order.
    """
    artists = (scatter, centroids)
    if frame <= 0:
        return artists

    kmeans.max_iter = frame
    kmeans.fit(points)
    point_colors = ['blue' if cluster_id == 0 else 'red' for cluster_id in kmeans.labels_]
    scatter.set_color(point_colors)

    centers = kmeans.cluster_centers_
    centroids.set_data(centers[:, 0], centers[:, 1])
    return artists

# 16 frames, 500 ms apart: frame 0 is the uncoloured data, frames 1..15 refit
# k-means with the iteration cap set to the frame number.
ani = FuncAnimation(fig, update, frames=16, init_func=init, blit=True, repeat=False, interval=500)  # Set interval to 500 ms

# Close the figure so the notebook shows only the animation and not an extra
# static copy of the plot (same as the earlier animation cells).
plt.close(fig)

# Display the animation in Jupyter
display(HTML(ani.to_jshtml()))


In [None]:
# DBSCAN to the rescue: it immediately finds the right solution
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from matplotlib.animation import FuncAnimation
from IPython.display import display, HTML

# DBSCAN clustering
dbscan = DBSCAN(eps=0.1, min_samples=5)

# Draw a fresh sample with the generate_polar_view_points() function defined
# in the earlier cell.
points = generate_polar_view_points()

# Set up the figure and axis; every point starts out black.
fig, ax = plt.subplots(figsize=(11, 11))
scatter = ax.scatter(*points.T, s=5, color='black')

def init():
    """Return the scatter artist unchanged, as a one-element tuple."""
    return (scatter,)

def update(frame):
    """Colour the points by their DBSCAN label on frame 1.

    Label 0 is drawn blue, label 1 red, and every other label black. Any
    frame other than 1 leaves the scatter artist untouched.

    Args:
        frame: Zero-based frame index supplied by ``FuncAnimation``.

    Returns:
        A one-element tuple holding the scatter artist.
    """
    if frame == 1:
        palette = {0: 'blue', 1: 'red'}
        cluster_ids = dbscan.fit_predict(points)
        scatter.set_color([palette.get(cluster_id, 'black') for cluster_id in cluster_ids])
    return (scatter,)

# Two frames, 1000 ms apart: the uncoloured points, then the DBSCAN result.
ani = FuncAnimation(fig, update, frames=2, init_func=init, blit=True, repeat=False, interval=1000)  # Set interval to 1000 ms

# Close the figure so the notebook shows only the animation and not an extra
# static copy of the plot (same as the earlier animation cells).
plt.close(fig)

# Display the animation in Jupyter
display(HTML(ani.to_jshtml()))
