## K-means

K-means is an unsupervised machine learning algorithm used for clustering tasks. The goal of the algorithm is to partition a set of n data points into k clusters, where each data point belongs to the cluster with the nearest mean.

K-means is an iterative algorithm that aims to minimize the sum of squared distances between the data points and their respective cluster centroids. The algorithm starts with an initial set of k centroids, and then iteratively refines the centroids by re-assigning the data points to the nearest cluster and recalculating the centroid of each cluster.

K-means is a simple yet effective algorithm that can be used for a wide range of clustering tasks. It is widely used in applications such as customer segmentation, image processing, and anomaly detection. However, the algorithm can be sensitive to the initial choice of centroids, and may converge to a sub-optimal solution. To mitigate this, multiple runs of the algorithm with different initializations can be performed, and the solution with the lowest sum of squared distances can be chosen.

In [None]:
import numpy as np


def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between ``x1`` and ``x2``.

    The element-wise difference is squared, summed over all elements, and
    the square root of that total is returned.
    """
    delta = x2 - x1
    squared_total = np.sum(delta ** 2)
    return np.sqrt(squared_total)

    
class KMeans:
    """K-means clustering with random initialisation from the data.

    Centroids start as ``K`` distinct rows drawn at random from the input.
    Each iteration assigns every sample to its nearest centroid and then
    moves each centroid to the mean of the samples assigned to it, stopping
    early once the centroids no longer change.

    Args:
        K: Number of clusters.
        n_iters: Maximum number of assign/update iterations.
        plot_steps: Stored on the instance; no method of this class
            currently reads it.

    Attributes:
        clusters: List of ``K`` lists holding the sample indices of each
            cluster (filled by ``predict``).
        centroids: Current centroid coordinates (filled by ``predict``).
    """

    def __init__(self, K=3, n_iters=100, plot_steps=False):
        self.K = K
        self.n_iters = n_iters
        self.plot_steps = plot_steps

        self.clusters = [[] for _ in range(self.K)]
        self.centroids = []

    def predict(self, X):
        """Cluster ``X`` and return one cluster label per sample.

        Args:
            X: Two-dimensional array-like of shape (n_samples, n_features).

        Returns:
            A float array of length ``n_samples`` whose entries are the
            index of the cluster each sample was assigned to.

        Raises:
            ValueError: Propagated from ``np.random.choice`` when ``K``
                exceeds the number of samples.
        """
        self.X = np.asarray(X)
        self.n_samples, self.n_features = self.X.shape

        # Initialise: pick K distinct samples as the starting centroids.
        random_sample_idxs = np.random.choice(self.n_samples, self.K, replace=False)
        self.centroids = self.X[random_sample_idxs].astype(float)

        # Optimise: alternate assignment and centroid update.
        for _ in range(self.n_iters):
            self.clusters = self._create_clusters(self.centroids)

            centroids_old = self.centroids
            self.centroids = self._get_centroids(self.clusters)

            if self._is_converged(centroids_old, self.centroids):
                break

        # Classify: turn the index lists into a flat label array.
        return self._get_cluster_labels(self.clusters)

    def _squared_distances(self, centroids):
        """Return the (n_samples, n_centroids) matrix of squared distances."""
        centroids = np.asarray(centroids, dtype=float)
        diffs = self.X[:, np.newaxis, :] - centroids[np.newaxis, :, :]
        return np.sum(diffs ** 2, axis=2)

    def _create_clusters(self, centroids):
        """Group sample indices by their nearest centroid.

        Returns:
            A list of ``K`` lists; list ``k`` holds the indices of the
            samples whose nearest centroid is ``centroids[k]``.
        """
        # Squared distances give the same argmin as true distances, so the
        # square root is skipped.
        nearest = np.argmin(self._squared_distances(centroids), axis=1)
        return [np.flatnonzero(nearest == k).tolist() for k in range(self.K)]

    def _closest_centroid(self, sample, centroids):
        """Return the index of the centroid nearest to a single ``sample``."""
        centroids = np.asarray(centroids, dtype=float)
        return np.argmin(np.sum((centroids - sample) ** 2, axis=1))

    def _get_centroids(self, clusters):
        """Return the mean of each cluster as a (K, n_features) array.

        An empty cluster keeps its previous centroid from
        ``self.centroids``. Taking the mean of zero samples would yield NaN,
        and ``np.argmin`` would then select that NaN centroid for every
        sample on the next assignment step.
        """
        centroids = np.zeros((self.K, self.n_features))
        for cluster_idx, cluster in enumerate(clusters):
            if len(cluster) == 0:
                centroids[cluster_idx] = self.centroids[cluster_idx]
            else:
                centroids[cluster_idx] = np.mean(self.X[cluster], axis=0)
        return centroids

    def _is_converged(self, centroids_old, centroids):
        """Return True when no centroid moved between two iterations.

        Exact equality is intentional: once the assignment stops changing,
        the recomputed means are bit-for-bit identical.
        """
        return np.array_equal(np.asarray(centroids_old), np.asarray(centroids))

    def _get_cluster_labels(self, clusters):
        """Convert per-cluster index lists into a per-sample label array.

        Returns:
            A float array of length ``n_samples``. Samples that appear in no
            cluster are labelled ``-1``.
        """
        labels = np.full(self.n_samples, -1.0)
        for cluster_idx, cluster in enumerate(clusters):
            # Index with this cluster's own members only.
            labels[cluster] = cluster_idx
        return labels

    def plot(self):
        """Scatter-plot every cluster and mark the centroids with an 'x'.

        Each feature column is passed to ``ax.scatter`` as a separate
        positional argument, so this is meant for two-feature data.
        """
        # Imported here so the method does not rely on a module-level
        # import that appears after the class definition.
        import matplotlib.pyplot as plt

        _, ax = plt.subplots(figsize=(12, 8))

        for member_idxs in self.clusters:
            ax.scatter(*self.X[member_idxs].T)

        for centroid in self.centroids:
            ax.scatter(*centroid, marker="x", color="black", linewidth=2)

        plt.show()

In [None]:
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

# Build a reproducible toy dataset: 500 two-feature samples around 3 centers.
X, y = make_blobs(
    n_samples=500,
    n_features=2,
    centers=3,
    shuffle=True,
    random_state=40,
)
print(X.shape)

# Use the number of distinct ground-truth labels as the cluster count.
clusters = np.unique(y).size
print(clusters)

# Cluster the data and keep the per-sample labels.
k = KMeans(K=clusters, n_iters=150, plot_steps=True)
y_pred = k.predict(X)

# Show the resulting clusters and their centroids.
k.plot()