In [None]:
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.datasets import make_blobs 
from scipy.linalg import norm  

In [None]:
# Configuration for the synthetic dataset; all four values are passed to make_blobs below.
RANDOM_SEED = 342  # random_state for make_blobs
N_SAMPLES = 1000  # number of points to generate
N_FEATURES = 2  # number of features per point
N_CENTERS = 4  # number of blob centers (also used as k later on)

In [None]:
# Generate the synthetic dataset: X holds the points, y the blob label of each point.
X, y = make_blobs(n_samples=N_SAMPLES, n_features=N_FEATURES, centers=N_CENTERS, random_state=RANDOM_SEED)

In [None]:
# Shapes of the feature matrix and of the label vector.
print(X.shape, y.shape)

In [None]:
# Smallest and largest label value.
print(np.min(y), np.max(y))

In [None]:
# First five points together with their labels.
print(X[:5], y[:5])

In [None]:
def plot_clusters(X, labels, k):
  """Scatter-plot the first two columns of X, one colour per cluster label.

  Args:
    X: 2-D array of points; column 0 is drawn on the x-axis, column 1 on the y-axis.
    labels: one integer cluster label per row of X.
    k: number of clusters; points labelled 0 .. k-1 are drawn.
  """
  colors = ['r', 'g', 'b', 'y', 'm']
  # Use the labels that were passed in (not a notebook-level variable) so the
  # function colours the points by whatever assignment the caller supplies.
  labels = np.asarray(labels)
  plt.figure(figsize=(8, 8))
  for c in range(k):
    mask = labels == c
    # Cycle through the palette so k > len(colors) does not raise IndexError.
    plt.scatter(X[mask, 0], X[mask, 1], c=colors[c % len(colors)], marker=".", s=15)

In [None]:
# Plot the generated blobs using the labels returned by make_blobs.
plot_clusters(X, y, k=N_CENTERS)

In [None]:
# Toy population (0..9) used to try out np.random.choice.
a = range(10)
a

In [None]:
# Draw N_CENTERS distinct values from `a`; replace=False rules out repeats.
# NOTE(review): no seed is set here, so this draw changes between runs.
random_values = np.random.choice(a, size=N_CENTERS, replace=False)

In [None]:
random_values

In [None]:
centroid_idxs = np.random.choice(range(N_SAMPLES), size=N_CENTERS, replace=False)

In [None]:
centroid_idxs

In [None]:
X[centroid_idxs]

In [None]:
def plot_clusters_with_centers(X, labels, centroids, k):
  """Scatter-plot the points coloured by cluster label and overlay the centroids.

  Args:
    X: 2-D array of points; column 0 is drawn on the x-axis, column 1 on the y-axis.
    labels: one integer cluster label per row of X.
    centroids: 2-D array-like of centroid coordinates (same two columns as X).
    k: number of clusters; points labelled 0 .. k-1 are drawn.
  """
  colors = ['r', 'g', 'b', 'y', 'm']
  # Colour by the labels argument rather than a notebook-level variable, so
  # passing predicted labels actually shows the predicted clustering.
  labels = np.asarray(labels)
  # Accept a list of centroid vectors as well as an array.
  centroids = np.asarray(centroids)
  plt.figure(figsize=(8, 8))
  for c in range(k):
    mask = labels == c
    # Cycle through the palette so k > len(colors) does not raise IndexError.
    plt.scatter(X[mask, 0], X[mask, 1], c=colors[c % len(colors)], marker=".", s=15, alpha=0.3)
  plt.scatter(centroids[:, 0], centroids[:, 1], c='c', marker="x", s=100)

In [None]:
# Overlay the randomly selected initial centroids on the generated blobs.
plot_clusters_with_centers(X, y, X[centroid_idxs], k=N_CENTERS)

In [None]:
# K-means outline:
# 1. Initialize the centroids.
# 2. Compute the distance from every data point to each centroid and assign the point to the closest one.
# 3. Update the centroids.
# 4. Repeat steps 2 and 3 until convergence.

In [None]:
# Euclidean distance on a toy example: the integers 0..7 arranged in rows of two.
a = np.arange(0, 8).reshape(-1, 2)
a

In [None]:
# Reference point the distances are measured from.
center = np.array([1, 1])

In [None]:
# Distance from the first row of `a` to `center`, written out by hand.
((a[0,0] - center[0])**2 + (a[0, 1] - center[1])**2)**0.5

In [None]:
# The same hand-written formula for the second row.
((a[1,0] - center[0])**2 + (a[1, 1] - center[1])**2)**0.5

In [None]:
# Row-wise norm of the differences: the distance of every row to `center` in one call.
norm((a - center), axis=1)

In [None]:
# axis=0 sums down the columns ...
np.sum(a, axis=0)

In [None]:
# ... while axis=1 sums across each row (the axis used for the per-row norm above).
np.sum(a, axis=1)

In [None]:
centroids = X[centroid_idxs]

In [None]:
distances = np.empty(shape=(N_SAMPLES, N_CENTERS))
for i, centroid in enumerate(centroids):
  distance = np.array(norm(X-centroid, axis=1))
  distances[:, i] = distance 

In [None]:
distances[:5]

In [None]:
distances.shape

In [None]:
# Standardize the features with scikit-learn's StandardScaler before clustering.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Scaling must not change the shape of the data.
X_scaled.shape, X.shape

In [None]:
# Compare the first five scaled rows with the original ones.
X_scaled[:5], X[:5]

In [None]:
# Re-select the initial centroids from the scaled data, using the same row indices as before.
centroids = X_scaled[centroid_idxs]

In [None]:
centroids

In [None]:
# Walk through the assignment step by hand for the first four scaled points:
# print the distances to every centroid, the index of the nearest one, and its coordinates.
for i, point in enumerate(X_scaled[:4]):
  distances = [norm(point - centroid) for centroid in centroids]
  nearest = np.argmin(distances)
  print(distances)
  print(nearest)
  print(centroids[nearest, :])

In [None]:
def assign_clusters(X, centroids):
  """Return, for every row of X, the index of its nearest centroid.

  Distances are Euclidean; when a row is equally close to several centroids
  the lowest centroid index is returned (np.argmin picks the first minimum).
  """
  # One distance vector (length n_samples) per centroid.
  per_centroid = [norm(X - center, axis=1) for center in centroids]
  # Stack them so that column j holds the distances to centroid j.
  distance_table = np.column_stack(per_centroid)
  return np.argmin(distance_table, axis=1)

In [None]:
# Assignment step: index of the nearest current centroid for every scaled point.
temp_labels = assign_clusters(X_scaled, centroids)

In [None]:
temp_labels.shape

In [None]:
temp_labels[:10]

In [None]:
# Update step: keep a reference to the current centroids so they can be compared with the new ones.
prev_centroids = centroids

In [None]:
# Each new centroid is the mean of the points currently assigned to that cluster.
new_centroids = []
for c in range(N_CENTERS):
  cluster_data = X_scaled[temp_labels == c]
  new_centroids.append(np.mean(cluster_data, axis=0))

In [None]:
# One centroid per cluster, each with one coordinate per feature.
np.array(new_centroids).shape

In [None]:
new_centroids

In [None]:
prev_centroids

In [None]:
class KMeans():
  """Minimal k-means clustering with Euclidean distance.

  Attributes:
    k: number of clusters.
    tolerance: fit stops once the summed centroid displacement of an
      iteration drops below this value.
    max_iters: maximum number of assign/update iterations.
    random_state: optional integer seed for centroid initialisation; None
      (the default) draws from NumPy's global random state.
    inertia: within-cluster sum of squared distances computed by the most
      recent compute_inertia call.
    centroids: current centroids (one row per cluster once initialised).
  """

  def __init__(self, k, tolerance, max_iters, random_state=None):
    self.k = k
    self.tolerance = tolerance
    self.max_iters = max_iters
    self.random_state = random_state
    self.inertia = 0.0
    self.centroids = []
    # With no seed, keep using the global NumPy state (so np.random.seed still
    # controls the draw); with a seed, use a private, reproducible generator.
    self._rng = np.random if random_state is None else np.random.RandomState(random_state)

  # Step 1: Init centroids
  def init_centroids(self, X):
    """Pick k distinct rows of X as the initial centroids."""
    n_samples = X.shape[0]
    centroid_idxs = self._rng.choice(n_samples, size=self.k, replace=False)
    return X[centroid_idxs]

  # Step 2: Assign points to the clusters
  def assign_clusters(self, X):
    """Return the index of the nearest current centroid for every row of X."""
    n_samples = X.shape[0]
    distances = np.empty((n_samples, self.k))
    for c in range(self.k):
      distances[:, c] = norm(X - self.centroids[c], axis=1)
    return np.argmin(distances, axis=1)

  # Step 3: Update centroids
  def update_centroids(self, X, labels):
    """Return the mean of the points assigned to each cluster.

    A cluster with no assigned points keeps its current centroid; taking the
    mean of an empty selection would otherwise yield NaN coordinates.
    """
    new_centroids = []
    for c in range(self.k):
      cluster_data = X[labels == c]
      if len(cluster_data) == 0:
        new_centroids.append(np.asarray(self.centroids[c], dtype=float))
      else:
        new_centroids.append(np.mean(cluster_data, axis=0))
    return np.array(new_centroids)

  # Compute Inertia
  def compute_inertia(self, X, labels):
    """Compute, store and return the within-cluster sum of squared distances.

    The total is rebuilt from zero on every call, so repeated calls (or
    repeated fits on the same object) do not accumulate.
    """
    total = 0.0
    for c in range(self.k):
      cluster_data = X[labels == c]
      total += np.sum(norm(cluster_data - self.centroids[c], axis=1)**2)
    self.inertia = total
    return self.inertia

  def fit(self, X):
    """Run k-means on X.

    Returns:
      (centroids, labels, inertia): the final centroids, the index of the
      nearest final centroid for every row of X, and the matching inertia.
    """
    self.centroids = self.init_centroids(X)
    for i in range(self.max_iters):
      cluster_labels = self.assign_clusters(X)
      prev_centroids = self.centroids
      self.centroids = self.update_centroids(X, cluster_labels)

      # Check if converged: total distance the centroids moved this iteration.
      displacement = np.sum(norm(prev_centroids - self.centroids, axis=1))
      if displacement < self.tolerance:
        print(f"Converged in {i+1} iterations\n")
        break

    # Assign once more against the final centroids so the returned labels and
    # inertia agree with the returned centroids; this also defines the labels
    # when max_iters is 0 and the loop body never ran.
    cluster_labels = self.assign_clusters(X)
    self.compute_inertia(X, cluster_labels)
    return self.centroids, cluster_labels, self.inertia

In [None]:
my_kmeans = KMeans(k=N_CENTERS, tolerance=1e-4, max_iters=300)

In [None]:
final_centroids, final_labels, inertia = my_kmeans.fit(X_scaled)

In [None]:
print(final_centroids, inertia)

In [None]:
plot_clusters_with_centers(X_scaled, final_labels, final_centroids, k=N_CENTERS)

In [None]:
inertia_scores = []
for kk in range(2, 10):
  kmeans_obj = KMeans(kk, tolerance=1e-4, max_iters=300)
  centroids, labels, inertia_score = kmeans_obj.fit(X_scaled)
  inertia_scores.append(inertia_score)

In [None]:
plt.figure(figsize=(8,8))
plt.scatter(range(2,10),inertia_scores,c='r',marker='x',s=100)
plt.plot(range(2,10), inertia_scores)
plt.xlabel('k')
plt.ylabel('Inertia')

In [None]:
from sklearn.cluster import KMeans
sk_kmeans = KMeans(n_clusters=N_CENTERS)
clusters = sk_kmeans.fit_transform(X_scaled)

In [None]:
clusters.shape

In [None]:
sk_kmeans.inertia_

In [None]:
centers = sk_kmeans.cluster_centers_ 
plt.figure(figsize=(8,8))
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], s=10, c=sk_kmeans.labels_)
plt.scatter(centers[:, 0], centers[:, 1], c='r', s=20)