In [None]:
import numpy as np
import pandas as pd

import sklearn as sk
import matplotlib.pyplot as plt

In [None]:
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

from sklearn.cluster import KMeans

### 1. KMeans for circular dataset

In [None]:
from sklearn.datasets import make_blobs
X, y_true = make_blobs(n_samples=300, centers=4,
                       cluster_std=0.60, random_state=0)
plt.scatter(X[:, 0], X[:, 1]);

In [None]:
kmeans = KMeans(n_clusters=4)
kmeans.fit(X)

In [None]:
print(kmeans.cluster_centers_)
print(kmeans.labels_)

In [None]:
plt.scatter(X[:,0],X[:,1], c=kmeans.labels_, cmap='rainbow')
plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='black')

#### KMeans from scratch

In [None]:
rnd = np.random.RandomState(42)
record_count = X.shape[0]
print(f"record count is {record_count}")
all_indices = rnd.permutation(record_count)

In [None]:
X[all_indices[:4]]

In [None]:
def get_random_centroids(X, cluster_count):
    rnd = np.random.RandomState(42)
    record_count = X.shape[0]
    permutations = rnd.permutation(record_count)
    random_centroid_idx = permutations[:cluster_count]
    return X[random_centroid_idx]

In [None]:
num_clusters = 4
centroids = get_random_centroids(X, 4)
centroids

In [None]:
from sklearn.metrics import pairwise_distances, pairwise_distances_argmin 

In [None]:
blob_centroid_dists = pairwise_distances(X, centroids)
blob_centroid_dists

In [None]:
np.argmin(blob_centroid_dists, axis=1)

In [None]:
cluster_memberships = pairwise_distances_argmin(X, centroids)
cluster_memberships

In [None]:
cluster_memberships == 0

In [None]:
X[cluster_memberships == 0]

In [None]:
clusters = [X[cluster_memberships == i] for i in range(0,num_clusters)]

In [25]:
clusters = []
for i in range(0,num_clusters):
    clusters.append(X[cluster_memberships == i])

In [26]:
len(clusters)

4

In [None]:
clusters

In [None]:
new_centroids = np.array([X[cluster_memberships == i].mean() for i in range(0,num_clusters)])
new_centroids

In [None]:
new_centroids = np.array([X[cluster_memberships == i].mean(axis=0) for i in range(0,num_clusters)])
new_centroids

In [None]:
centroids == new_centroids

In [None]:
(centroids - new_centroids) < 1e-8

In [30]:
def kmeans(X, num_clusters=4):
    centroids = get_random_centroids(X, cluster_count=num_clusters)
    iter_idx = 1
    while True:
        print(f"Iter idx = {iter_idx}")
        cluster_memberships = pairwise_distances_argmin(X, centroids)
        new_centroids = np.array([X[cluster_memberships == i].mean(axis=0) for i in range(0,num_clusters)])

        if np.all(centroids == new_centroids):
            break

        centroids = new_centroids
        iter_idx += 1

    return centroids, np.arange(0,4)

In [31]:
kmeans(X, 4)

Iter idx = 1
Iter idx = 2
Iter idx = 3
Iter idx = 4
Iter idx = 5
Iter idx = 6


(array([[-1.58438467,  2.83081263],
        [ 1.98258281,  0.86771314],
        [ 0.94973532,  4.41906906],
        [-1.37324398,  7.75368871]]),
 array([0, 1, 2, 3]))