# K-Means

In [1]:
from typing import Union, Literal

import numpy as np
import pandas as pd

from scipy.optimize import linear_sum_assignment
import matplotlib.pyplot as plt
from torchvision import datasets

In [271]:
## TODO: check cost in one candidate case!
## TODO: update README

class KMeans:
    """
    Perform KMeans clustering on a dataset.
    """

    def __init__(self,
                 algorithm : Literal['lloyd', 'extended-hartigan', 'safe-hartigan', 'hartigan', 'binary-hartigan'] = 'lloyd',
                 init : Literal['random', 'random-data', 'k-means++', 'greedy'] = 'random',
                 seed : Union[int, None] = None):
        """
        Initialize the KMeans object.

        Parameters
        ----------
        algorithm : {'lloyd', 'extended-hartigan', 'safe-hartigan', 'hartigan', 'binary-hartigan'}
            Algorithm to use. Either 'lloyd' or 'extended-hartigan' or 'safe-hartigan' or 'hartigan' or 'binary-hartigan'

        init : {'random', 'random-data', 'k-means++', 'greedy'}
            Initialization method. Either 'random' or 'random-data' or 'k-means++' or 'greedy'

        seed : int
            Seed for random generator
        """

        assert algorithm in ['lloyd', 'extended-hartigan', 'safe-hartigan', 'hartigan', 'binary-hartigan'], "algorithm must be either 'lloyd', 'extended-hartigan', 'safe-hartigan', 'hartigan' or 'binary-hartigan'"
        assert init in ['random', 'random-data', 'k-means++', 'greedy'], "init must be either 'random', 'random-data', 'k-means++' or 'greedy'"
        assert seed is None or isinstance(seed, int), "seed must be an int or None"

        self.algorithm = algorithm
        self.init = init
        self.seed = seed

        self.data = None
        self.k = None
        self.centroids = None
        self.y_pred = None


    def fit(self, data : np.ndarray, k : int, debug : int = 0):
        """
        Fit the model to the data.

        Parameters
        ----------
        data : np.ndarray
            nxd DataFrame of n samples with d features
        k : int
            Number of clusters
        debug : int
            Debug level (0: no debug, 1: some debug, 2: all debug)

        Returns
        -------
        np.ndarray
            Array of shape (k, d) with cluster centroids
        np.ndarray
            Array of length n with cluster assignments for each sample
        """

        assert isinstance(data, np.ndarray), "data must be a numpy array"
        assert len(data.shape) == 2, "data must be a 2D array"
        assert isinstance(k, int), "k must be an int"
        assert 0 < k <= len(data), "k must be at least 0 and at most the number of samples"
        assert isinstance(debug, int) or debug, "debug must be an int"

        self.data = data
        self.k = k

        np.random.seed(self.seed)

        # initialize centroids
        self._init_centroids(debug)
        debug and print('initial centroids:\n', self.centroids)

        if self.algorithm == 'lloyd':
            self._lloyd(debug)
        elif self.algorithm == 'extended-hartigan':
            self._extended_hartigan(always_safe=False, debug=debug)
        elif self.algorithm == 'safe-hartigan':
            self._extended_hartigan(always_safe=True, binary_hartigan=False, debug=debug)
        elif self.algorithm == 'binary-hartigan':
            self._extended_hartigan(always_safe=False, binary_hartigan=True, debug=debug)
        elif self.algorithm == 'hartigan':
            self._hartigan(debug)
        
        print('final centroids:\n', self.centroids)
        print('final y_pred:', self.y_pred)


    def _init_centroids(self, debug=0):
        """
        Initialize the centroids.
        """

        if self.init == 'random':

            # choose k random data points as initial centroids
            idx = np.random.choice(self.data.shape[0], self.k, replace=False)
            self.centroids = self.data[idx]

        elif self.init == 'random-data':

            # assign each data point to a random cluster
            clusters = np.random.choice(self.k, self.data.shape[0])

            # check that at least one point is assigned to each cluster
            while len(set(clusters)) < self.k:
                clusters = np.random.choice(self.k, self.data.shape[0])
            self.y_pred = clusters
            self.centroids = self._move_centroids(None, debug > 1)

        elif self.init == 'k-means++':

            # choose first centroid randomly
            centroids = np.zeros((self.k, self.data.shape[1]))
            centroids[0] = self.data[np.random.choice(self.data.shape[0], 1, replace=False)[0]]
            debug and print('centroids:\n', centroids)

            # iterate over remaining k-1 centroids
            for i in range(1, self.k):
                debug and print('iteration', i)

                # calculate squared distance of each point to closest centroid
                dist = np.array([min([np.linalg.norm(c-x)**2 for c in centroids[:i]]) for x in self.data])

                # probabilities are given by the normalized distance squared
                probs = dist / dist.sum()
                debug and print('probs:', probs)

                # # cumulate probabilities
                # cumprobs = probs.cumsum()
                # r = np.random.rand()
                # debug and print('r:', r)
                # for j, p in enumerate(cumprobs):
                #     if r < p:
                #         break

                # choose next centroid randomly based on cumulated probabilities
                j = np.random.choice(len(self.data), p=probs)

                centroids[i] = self.data[j]
                debug and print('centroids:\n', centroids)

            self.centroids = centroids

        elif self.init == 'greedy':

            # choose first centroid randomly
            centroids = np.zeros((self.k, self.data.shape[1]))
            centroids[0] = self.data[np.random.choice(self.data.shape[0], 1, replace=False)[0]]
            debug and print('centroids:\n', centroids)

            # iterate over remaining k-1 centroids
            for i in range(1, self.k):
                debug and print('iteration', i)

                # calculate squared distance of each point to closest centroid
                dist = np.array([min([np.linalg.norm(c-x)**2 for c in centroids[:i]]) for x in self.data])

                # choose next centroid as the point with the maximum distance to the closest centroid
                centroids[i] = self.data[np.argmax(dist)]
                debug and print('centroids:\n', centroids)

            self.centroids = centroids



    def _lloyd(self, debug=0):
        """
        Lloyd's algorithm for k-means clustering.
        """

        debug and print('\nRunning Lloyd\'s algorithm...')

        while True:
            
            debug and print('New iteration')

            # assign each data point to the closest centroid
            self.y_pred = self._assign_clusters(debug > 1)
            debug and print('y_pred:', self.y_pred)

            # move centroids to the mean of their cluster
            new_centroids = self._move_centroids(None, debug > 1)

            # check for convergence
            if np.allclose(self.centroids, new_centroids):
                break

            self.centroids = new_centroids


    def _extended_hartigan(self, always_safe=False, binary_hartigan=False, debug=0):
        """
        Extended Hartigan algorithm for k-means clustering (unsafe+safe, always safe or binary mode).
        """

        debug and print('\nRunning Extended Hartigan algorithm...')

        # first assignment
        if self.y_pred is None:
            self.y_pred = self._assign_clusters(debug > 1)

        while True:
            # start with unsafe mode    
            safe_mode = False

            # create an empty dictionary of new candidates
            candidates = {}

            for datapoint_id in range(len(self.data)):
                debug and print('\ndatapoint_id:', datapoint_id)

                candidates = self._find_candidates(datapoint_id, candidates, debug)
                    
            debug and print('\ncandidates:', candidates)
            
            # break at convergence
            if not candidates:      ## [] -> False
                debug and print('no more candidates')
                break    

            # proceed in unsafe mode
            if not safe_mode and not always_safe:
                debug and print('\nentered in UNSAFE mode')

                # store current state for possible rollback
                rollback = self.y_pred.copy()

                # calculate original cost
                original_cost = self._tot_cluster_cost(self.centroids, self.y_pred, debug > 1)
                debug and print('original_cost:', original_cost)

                new_cost, new_centroids = self._accept_candidates(candidates, debug > 1)
                debug and print('new cost:', new_cost)

                if new_cost >= original_cost:
                    # new clustering is more expensive, proceed in safe mode
                    safe_mode = True
                    self.y_pred = rollback

            # start new condition since safe mode can be entered from unsafe mode
            if (safe_mode or always_safe) and not binary_hartigan:
                debug and print('\nentered in SAFE mode')

                unchanged_clusters = list(range(self.k))
                for _, [delta_cost, current_centroid_id, new_centroid_id] in sorted(candidates.items(), key=lambda e: e[1][1]):

                    # if both clusters are still unchanged, accept the candidate
                    if current_centroid_id in unchanged_clusters and new_centroid_id in unchanged_clusters:
                        debug and print(f'candidate {_} moved from {current_centroid_id} to {new_centroid_id}')
                        self.y_pred[datapoint_id] = new_centroid_id
                        unchanged_clusters.remove(current_centroid_id)
                        unchanged_clusters.remove(new_centroid_id)

                    # if we cannot operate on any more clusters, break
                    if not unchanged_clusters:
                        break

                new_centroids = self._move_centroids(None, debug > 1)

            # proceed in binary-hartigan if needed
            elif binary_hartigan:
                debug and print('\nentered in BINARY mode')

                # store current state for possible rollback
                rollback = self.y_pred.copy()

                # calculate original cost
                original_cost = self._tot_cluster_cost(self.centroids, self.y_pred, debug > 1)
                debug and print('original_cost:', original_cost)

                candidates_partition = [candidates]
                no_edit = True
                while no_edit:
                    debug and print('candidates_partition:', candidates_partition)

                    for part in candidates_partition:
                        # "binary" split
                        candidates_items = list(part.items())
                        half = len(candidates)//2

                        part_1 = dict(candidates_items[:half])                        
                        new_cost, new_centroids = self._accept_candidates(part_1, debug > 1)
                        debug and print('new_cost trying part_1:', new_cost)

                        if new_cost >= original_cost:
                            # new clustering accepting part_1 is more expensive
                            # rollback and try with part_2
                            self.y_pred = rollback

                            part_2 = dict(candidates_items[half:])
                            new_cost, new_centroids = self._accept_candidates(part_2, debug > 1)
                            debug and print('new_cost trying part_2:', new_cost)
                            
                            if new_cost >= original_cost:
                                # new clustering accepting part_2 is more expensive
                                # rollback and proceed with "binary" split
                                self.y_pred = rollback
                            else:
                                no_edit = False
                                break
                        else:
                            no_edit = False
                            break

                        # if no break was encountered, proceed with "binary" split
                        candidates_partition = [part_1, part_2]

            self.centroids = new_centroids

    def _hartigan(self, debug=0):
        """
        Hartigan algorithm for k-means clustering.
        """
        debug and print('\nRunning Hartigan algorithm...')

        # first assignment
        if self.y_pred is None:
            self.y_pred = self._assign_clusters(debug > 1)

        edit = True
        while edit:
            edit =  False
            for datapoint_id in range(len(self.data)):
                debug and print('\ndatapoint_id:', datapoint_id)

                candidate = self._find_candidates(datapoint_id, {}, debug)
                debug and print('candidate:', candidate)

                if candidate:
                    new_cost, new_centroids = self._accept_candidates(candidate, debug > 1)
                    self.centroids = new_centroids
                    edit = True
                    # the code continues with the next datapoint instead than starting from the first one again

    def _move_centroids(self, move_just = None, debug=0):
        """
        Move the centroids to the mean of their cluster.
        """

        debug and print('\n  moving centroids...')
        debug and print('  | y_pred:', self.y_pred)
        debug and print('  | data:\n', self.data)
        debug and print('  | centroids_before:\n', self.centroids)

        centroids = np.zeros((self.k, self.data.shape[1]))
        
        move = move_just if move_just is not None else range(self.k)
        debug and print('  | move:', move)
        for centroid_id in move:
            cluster_points = self.data[self.y_pred == centroid_id]
            
            # if centroid has no points assigned to it, reassign it randomly
            if len(cluster_points) == 0:
                debug and print(f"  Centroid {centroid_id} is empty. Reassigning.")
                new_centroid_id = np.random.choice(len(self.data))
                centroids[centroid_id] = self.data[new_centroid_id]
                self.y_pred[new_centroid_id] = centroid_id
            else:
                centroids[centroid_id] = np.mean(cluster_points, axis=0)

        debug and print('  centroids_after:\n', centroids)

        return centroids


    def _assign_clusters(self, debug=0):
        """
        Assign each data point to the closest centroid.
        """

        y_pred = np.zeros(len(self.data), dtype=int)
        for i, x in enumerate(self.data):
            debug and print(f'{i}:', [np.linalg.norm(x-c)**2 for c in self.centroids])
            y_pred[i] = np.argmin([np.linalg.norm(x-c)**2 for c in self.centroids])

        debug and print('y_pred:', y_pred)

        return y_pred


    def _delta_cost(self, cost, datapoint_id, centroid_id):
        """
        Compute the change in cost if datapoint is reassigned to centroid_id
        """

        cluster_size = np.where(self.y_pred == centroid_id)[0].shape[0]
        prefactor = cluster_size / (cluster_size + 1)

        # cost of new assignment
        new_cost = prefactor * np.linalg.norm(self.data[datapoint_id] - self.centroids[centroid_id])**2

        return new_cost - cost


    def _tot_cluster_cost(self, centroids, points_ids, debug=0):
        """
        Compute the overall cost of clustering
        """
        
        debug and print('\n  calculating _tot_cluster_cost')
        
        partial_sum = []
        for centroid_id in range(centroids.shape[0]):
            cluster_items = np.where(points_ids == centroid_id)[0]
            partial_sum.append(np.sum(np.square(self.data[cluster_items] - self.centroids[centroid_id])))

            debug and print('  | centroid_id:', centroid_id)
            debug and print('  | centroid:', centroids[centroid_id])
            debug and print('  | cluster_items:', cluster_items)
            debug and print('  | partial_sum:', partial_sum)
        
        debug and print('  partial_sum:', np.sum(partial_sum))
        debug and print('  _tot_cluster_cost:', np.sum(partial_sum))
        
        return np.sum(partial_sum)


    def _find_candidates(self, datapoint_id, candidates, debug=0):
        """
        Find candidates for reassignment of a single datapoint.
        """

        # calculate cost of current assignment which remains invariant
        current_centroid_id = self.y_pred[datapoint_id]
        cluster_size = np.where(self.y_pred == current_centroid_id)[0].shape[0]
        prefactor = cluster_size / (cluster_size - 1) if cluster_size > 1 else 0
        
        current_cost = prefactor * np.linalg.norm(self.data[datapoint_id] - self.centroids[current_centroid_id])**2
        debug and print('current_cost:', current_cost)

        # if current_cost is 0, delta_cost will always be positive
        if current_cost == 0:
            return candidates

        # iterate only on possible new centroid assignments
        for centroid_id in np.setdiff1d(self.y_pred, current_centroid_id):
            delta_cost = self._delta_cost(current_cost, datapoint_id, centroid_id)
            debug and print(f'delta_cost for datapoint {datapoint_id} from centroid {current_centroid_id} to centroid {centroid_id}:', delta_cost)

            # datapoint is a candidate if it reduces the cost
            # if more reassignments reduce the cost, the best one is stored (the one producing the most negfative delta_cost)
            if delta_cost < 0 and (candidates.get(datapoint_id) is None or delta_cost < candidates[datapoint_id][0]):
                candidates[datapoint_id] = [delta_cost, current_centroid_id, centroid_id]
        
        return candidates


    def _accept_candidates(self, candidates, debug=0):
        """
        Accepts all candidates passed as argument and calculates new total cluster cost.
        """
        # accept all candidates
        used_centroids = set()
        for candidate in candidates.keys():
            debug and print('candidate:', candidate)
            
            [delta_cost, current_centroid_id, new_centroid_id] = candidates[candidate]
            used_centroids.add(current_centroid_id)
            used_centroids.add(new_centroid_id)

            debug and print('y_pred before:', self.y_pred)

            # update closest_points_ids assigning datapoint to new_centroid_id
            self.y_pred[candidate] = new_centroid_id
            debug and print('y_pred after:', self.y_pred)

        new_centroids = self._move_centroids(move_just=used_centroids, debug = debug)

        return self._tot_cluster_cost(new_centroids, self.y_pred, debug), new_centroids



In [35]:
def accuracy(y_true : np.ndarray, y_pred : np.ndarray):
    """
    Score a clustering against ground-truth labels.

    Cluster ids are arbitrary, so predicted ids are first matched one-to-one
    to true labels with the Hungarian algorithm; the score is the fraction of
    samples that agree under the best matching.

    Parameters
    ----------
    y_true : np.ndarray
        True labels of the samples (non-negative integers)
    y_pred : np.ndarray
        Predicted labels of the samples (non-negative integers)

    Returns
    -------
    float
        Accuracy of the clustering through Hungarian algorithm
    """

    assert isinstance(y_true, np.ndarray), "y_true must be a numpy array"
    assert isinstance(y_pred, np.ndarray), "y_pred must be a numpy array"

    # contingency table: rows are true labels, columns are predicted labels
    size = max(max(y_true), max(y_pred)) + 1
    contingency = np.zeros((size, size), dtype=int)
    for row, col in zip(y_true, y_pred):
        contingency[row, col] += 1

    # best one-to-one matching = assignment maximising the matched counts
    # (the solver minimises, hence the sign flip)
    true_ids, pred_ids = linear_sum_assignment(-contingency)

    n_matched = contingency[true_ids, pred_ids].sum(axis=0)
    return n_matched / len(y_true)

In [270]:
# debug: single fully traced run (debug=2) of binary-hartigan with 'random' init
# on a toy dataset of five 2-D points, using a fixed seed
a = np.array([[1, 2], [3, 4], [11, 12], [13, 14], [19, 20]])

kmeans = KMeans(algorithm='binary-hartigan', init='random', seed=20000)
kmeans.fit(a, 3, debug=2)
# score the clustering against the labelling [0, 0, 1, 1, 2]
accuracy(np.array([0, 0, 1, 1, 2]), kmeans.y_pred)

initial centroids:
 [[13 14]
 [11 12]
 [ 3  4]]

Running Extended Hartigan algorithm...
0: [np.float64(287.99999999999994), np.float64(200.00000000000003), np.float64(8.000000000000002)]
1: [np.float64(200.00000000000003), np.float64(128.00000000000003), np.float64(0.0)]
2: [np.float64(8.000000000000002), np.float64(0.0), np.float64(128.00000000000003)]
3: [np.float64(0.0), np.float64(8.000000000000002), np.float64(200.00000000000003)]
4: [np.float64(71.99999999999999), np.float64(128.00000000000003), np.float64(512.0000000000001)]
y_pred: [2 2 1 0 0]

datapoint_id: 0
current_cost: 16.000000000000004
delta_cost for datapoint 0 from centroid 2 to centroid 0: 175.99999999999994
delta_cost for datapoint 0 from centroid 2 to centroid 1: 84.00000000000001

datapoint_id: 1
current_cost: 0.0

datapoint_id: 2
current_cost: 0.0

datapoint_id: 3
current_cost: 0.0

datapoint_id: 4
current_cost: 143.99999999999997
delta_cost for datapoint 4 from centroid 0 to centroid 1: -79.99999999999996
delta_c

KeyboardInterrupt: 

In [268]:
# debug: single fully traced run (debug=2) of hartigan with 'random-data' init
# on the five-point toy dataset; no seed is given
a = np.array([[1, 2], [3, 4], [11, 12], [13, 14], [19, 20]])

kmeans = KMeans(algorithm='hartigan', init='random-data')
kmeans.fit(a, 3, debug=2)
# score the clustering against the labelling [0, 0, 1, 1, 2]
accuracy(np.array([0, 0, 1, 1, 2]), kmeans.y_pred)


  moving centroids...
  | y_pred: [2 1 0 2 1]
  | data:
 [[ 1  2]
 [ 3  4]
 [11 12]
 [13 14]
 [19 20]]
  | centroids_before:
 None
  | move: range(0, 3)
  centroids_after:
 [[11. 12.]
 [11. 12.]
 [ 7.  8.]]
initial centroids:
 [[11. 12.]
 [11. 12.]
 [ 7.  8.]]

Running Hartigan algorithm...

datapoint_id: 0
current_cost: 143.99999999999997
delta_cost for datapoint 0 from centroid 2 to centroid 0: -43.99999999999996
delta_cost for datapoint 0 from centroid 2 to centroid 1: -10.666666666666629
candidate: {0: [np.float64(-43.99999999999996), np.int32(2), np.int32(0)]}
candidate: 0
y_pred before: [2 1 0 2 1]
y_pred after: [0 1 0 2 1]

  moving centroids...
  | y_pred: [0 1 0 2 1]
  | data:
 [[ 1  2]
 [ 3  4]
 [11 12]
 [13 14]
 [19 20]]
  | centroids_before:
 [[11. 12.]
 [11. 12.]
 [ 7.  8.]]
  | move: {np.int32(0), np.int32(2)}
  centroids_after:
 [[ 6.  7.]
 [ 0.  0.]
 [13. 14.]]

  calculating _tot_cluster_cost
  | centroid_id: 0
  | centroid: [6. 7.]
  | cluster_items: [0 2]
  | partia

np.float64(1.0)

In [272]:
# debug: mean accuracy of lloyd + 'k-means++' over seeds 20000..39999
# on the five-point toy dataset
a = np.array([[1, 2], [3, 4], [11, 12], [13, 14], [19, 20]])

accs = []
for i in range(20000, 40000):
    # print the seed so a failing or hanging run can be identified
    print(i)
    kmeans = KMeans(algorithm='lloyd', init='k-means++', seed=i)
    kmeans.fit(a, 3, debug=0)
    accs.append(accuracy(np.array([0, 0, 1, 1, 2]), kmeans.y_pred))
print(np.mean(accs))

20000
final centroids:
 [[19. 20.]
 [ 2.  3.]
 [12. 13.]]
final y_pred: [1 1 2 2 0]
20001
final centroids:
 [[ 2.  3.]
 [19. 20.]
 [12. 13.]]
final y_pred: [0 0 2 2 1]
20002
final centroids:
 [[14.33333333 15.33333333]
 [ 1.          2.        ]
 [ 3.          4.        ]]
final y_pred: [1 2 0 0 0]
20003
final centroids:
 [[12. 13.]
 [ 2.  3.]
 [19. 20.]]
final y_pred: [1 1 0 0 2]
20004
final centroids:
 [[19. 20.]
 [ 2.  3.]
 [12. 13.]]
final y_pred: [1 1 2 2 0]
20005
final centroids:
 [[12. 13.]
 [ 2.  3.]
 [19. 20.]]
final y_pred: [1 1 0 0 2]
20006
final centroids:
 [[14.33333333 15.33333333]
 [ 3.          4.        ]
 [ 1.          2.        ]]
final y_pred: [2 1 0 0 0]
20007
final centroids:
 [[12. 13.]
 [ 2.  3.]
 [19. 20.]]
final y_pred: [1 1 0 0 2]
20008
final centroids:
 [[19. 20.]
 [ 2.  3.]
 [12. 13.]]
final y_pred: [1 1 2 2 0]
20009
final centroids:
 [[ 3.          4.        ]
 [14.33333333 15.33333333]
 [ 1.          2.        ]]
final y_pred: [2 0 1 1 1]
20010
final cent

In [269]:
# debug: mean accuracy of hartigan + 'random-data' over seeds 0..9999
# on the five-point toy dataset
a = np.array([[1, 2], [3, 4], [11, 12], [13, 14], [19, 20]])

accs = []
for i in range(10000):
    # print the seed so a failing or hanging run can be identified
    print(i)
    kmeans = KMeans(algorithm='hartigan', init='random-data', seed=i)
    kmeans.fit(a, 3, debug=0)
    accs.append(accuracy(np.array([0, 0, 1, 1, 2]), kmeans.y_pred))
print(np.mean(accs))

0
final centroids:
 [[19. 20.]
 [ 0.  0.]
 [12. 13.]]
final y_pred: [1 1 2 2 0]
1
final centroids:
 [[12. 13.]
 [ 0.  0.]
 [19. 20.]]
final y_pred: [1 1 0 0 2]
2
final centroids:
 [[12. 13.]
 [ 0.  0.]
 [19. 20.]]
final y_pred: [1 1 0 0 2]
3
final centroids:
 [[12. 13.]
 [ 0.  0.]
 [19. 20.]]
final y_pred: [1 1 0 0 2]
4
final centroids:
 [[19. 20.]
 [12. 13.]
 [ 2.  3.]]
final y_pred: [2 2 1 1 0]
5
final centroids:
 [[12. 13.]
 [ 0.  0.]
 [19. 20.]]
final y_pred: [1 1 0 0 2]
6
final centroids:
 [[ 0.  0.]
 [19. 20.]
 [12. 13.]]
final y_pred: [0 0 2 2 1]
7
final centroids:
 [[12. 13.]
 [19. 20.]
 [ 0.  0.]]
final y_pred: [2 2 0 0 1]
8
final centroids:
 [[ 0.  0.]
 [19. 20.]
 [12. 13.]]
final y_pred: [0 0 2 2 1]
9
final centroids:
 [[ 0.  0.]
 [12. 13.]
 [19. 20.]]
final y_pred: [0 0 1 1 2]
10
final centroids:
 [[19. 20.]
 [12. 13.]
 [ 0.  0.]]
final y_pred: [2 2 1 1 0]
11
final centroids:
 [[19. 20.]
 [12. 13.]
 [ 0.  0.]]
final y_pred: [2 2 1 1 0]
12
final centroids:
 [[12. 13.]
 [ 0. 

In [148]:
# debug: mean accuracy of extended-hartigan + 'k-means++' over seeds 0..9999
# on the five-point toy dataset
a = np.array([[1, 2], [3, 4], [11, 12], [13, 14], [19, 20]])

accs = []
for i in range(10000):
    # print the seed so a failing or hanging run can be identified
    print(i)
    kmeans = KMeans(algorithm='extended-hartigan', init='k-means++', seed=i)
    kmeans.fit(a, 3, debug=0)
    accs.append(accuracy(np.array([0, 0, 1, 1, 2]), kmeans.y_pred))
print(np.mean(accs))

0
final centroids:
 [[11. 12.]
 [ 3.  4.]
 [19. 20.]]
final y_pred: [1 1 0 0 2]
1
final centroids:
 [[11. 12.]
 [ 1.  2.]
 [19. 20.]]
final y_pred: [1 1 0 0 2]
2
final centroids:
 [[11. 12.]
 [ 3.  4.]
 [19. 20.]]
final y_pred: [1 1 0 0 2]
3
final centroids:
 [[13. 14.]
 [ 1.  2.]
 [19. 20.]]
final y_pred: [1 1 0 0 2]
4
final centroids:
 [[ 1.  2.]
 [19. 20.]
 [11. 12.]]
final y_pred: [0 0 2 2 1]
5
final centroids:
 [[19. 20.]
 [ 1.  2.]
 [13. 14.]]
final y_pred: [1 1 2 2 0]
6
final centroids:
 [[13. 14.]
 [ 1.  2.]
 [19. 20.]]
final y_pred: [1 1 0 0 2]
7
final centroids:
 [[ 1.  2.]
 [19. 20.]
 [11. 12.]]
final y_pred: [0 0 2 2 1]
8
final centroids:
 [[19. 20.]
 [11. 12.]
 [ 1.  2.]]
final y_pred: [2 2 1 1 0]
9
final centroids:
 [[13. 14.]
 [ 1.  2.]
 [19. 20.]]
final y_pred: [1 1 0 0 2]
10
final centroids:
 [[11. 12.]
 [ 3.  4.]
 [19. 20.]]
final y_pred: [1 1 0 0 2]
11
final centroids:
 [[11. 12.]
 [ 3.  4.]
 [19. 20.]]
final y_pred: [1 1 0 0 2]
12
final centroids:
 [[ 1.  2.]
 [13. 

In [None]:
# debug: mean accuracy of lloyd + 'random-data' over seeds 0..9999
# on the five-point toy dataset
a = np.array([[1, 2], [3, 4], [11, 12], [13, 14], [19, 20]])

accs = []
for i in range(10000):
    # print the seed so a failing or hanging run can be identified
    print(i)
    kmeans = KMeans(algorithm='lloyd', init='random-data', seed=i)
    kmeans.fit(a, 3, debug=0)
    accs.append(accuracy(np.array([0, 0, 1, 1, 2]), kmeans.y_pred))
print(np.mean(accs))

In [226]:
# single unseeded run of lloyd + 'k-means++' on the five-point toy dataset
a = np.array([[1, 2], [3, 4], [11, 12], [13, 14], [19, 20]])

kmeans = KMeans(algorithm='lloyd', init='k-means++')
kmeans.fit(a, 3, debug=0)
# score the clustering against the labelling [0, 0, 1, 1, 2]
accuracy(np.array([0, 0, 1, 1, 2]), kmeans.y_pred)

centroids:
 [[1. 2.]
 [0. 0.]
 [0. 0.]]
iteration 1
probs: [0.         0.00699301 0.17482517 0.25174825 0.56643357]
r: 0.11165717709251399
centroids:
 [[ 1.  2.]
 [11. 12.]
 [ 0.  0.]]
iteration 2
probs: [0.         0.05555556 0.         0.05555556 0.88888889]
r: 0.9960458970299656
centroids:
 [[ 1.  2.]
 [11. 12.]
 [19. 20.]]
initial centroids:
 [[ 1.  2.]
 [11. 12.]
 [19. 20.]]
Running Lloyd's algorithm...
New iteration
[np.float64(0.0), np.float64(200.00000000000003), np.float64(648.0)]
[np.float64(8.000000000000002), np.float64(128.00000000000003), np.float64(512.0000000000001)]
[np.float64(200.00000000000003), np.float64(0.0), np.float64(128.00000000000003)]
[np.float64(287.99999999999994), np.float64(8.000000000000002), np.float64(71.99999999999999)]
[np.float64(648.0), np.float64(128.00000000000003), np.float64(0.0)]
y_pred: [0 0 1 1 2]
y_pred: [0 0 1 1 2]
data:
 [[ 1  2]
 [ 3  4]
 [11 12]
 [13 14]
 [19 20]]
centroids_before:
 [[ 1.  2.]
 [11. 12.]
 [19. 20.]]
centroids_after:
 

np.float64(1.0)

In [381]:
# single unseeded run of extended-hartigan + 'random' init on the five-point toy dataset
a = np.array([[1, 2], [3, 4], [11, 12], [13, 14], [19, 20]])

kmeans = KMeans(algorithm='extended-hartigan', init='random')
# the third positional argument is `debug`; True enables debug printing
kmeans.fit(a, 3, True)
# score the clustering against the labelling [0, 0, 1, 1, 2]
accuracy(np.array([0, 0, 1, 1, 2]), kmeans.y_pred)

initial centroids:
 [[13 14]
 [ 3  4]
 [19 20]]

Running Extended Hartigan algorithm...
0: [np.float64(287.99999999999994), np.float64(8.000000000000002), np.float64(648.0)]
1: [np.float64(200.00000000000003), np.float64(0.0), np.float64(512.0000000000001)]
2: [np.float64(8.000000000000002), np.float64(128.00000000000003), np.float64(128.00000000000003)]
3: [np.float64(0.0), np.float64(200.00000000000003), np.float64(71.99999999999999)]
4: [np.float64(71.99999999999999), np.float64(512.0000000000001), np.float64(0.0)]

datapoint_id: 0
current_cost: 16.000000000000004
delta_cost for datapoint 0 from centroid 1 to centroid 0: 175.99999999999994
delta_cost for datapoint 0 from centroid 1 to centroid 2: 308.0

datapoint_id: 1
current_cost: 0.0
delta_cost for datapoint 1 from centroid 1 to centroid 0: 133.33333333333334
delta_cost for datapoint 1 from centroid 1 to centroid 2: 256.00000000000006

datapoint_id: 2
current_cost: 16.000000000000004
delta_cost for datapoint 2 from centroid 0 to 

np.float64(1.0)

In [393]:
# single unseeded run of safe-hartigan + 'random' init on the five-point toy dataset
a = np.array([[1, 2], [3, 4], [11, 12], [13, 14], [19, 20]])

kmeans = KMeans(algorithm='safe-hartigan', init='random')
# the third positional argument is `debug`; True enables debug printing
kmeans.fit(a, 3, True)
# score the clustering against the labelling [0, 0, 1, 1, 2]
accuracy(np.array([0, 0, 1, 1, 2]), kmeans.y_pred)

initial centroids:
 [[19 20]
 [11 12]
 [ 1  2]]

Running Extended Hartigan algorithm...
0: [np.float64(648.0), np.float64(200.00000000000003), np.float64(0.0)]
1: [np.float64(512.0000000000001), np.float64(128.00000000000003), np.float64(8.000000000000002)]
2: [np.float64(128.00000000000003), np.float64(0.0), np.float64(200.00000000000003)]
3: [np.float64(71.99999999999999), np.float64(8.000000000000002), np.float64(287.99999999999994)]
4: [np.float64(0.0), np.float64(128.00000000000003), np.float64(648.0)]

datapoint_id: 0
current_cost: 0.0
delta_cost for datapoint 0 from centroid 2 to centroid 0: 324.0
delta_cost for datapoint 0 from centroid 2 to centroid 1: 133.33333333333334

datapoint_id: 1
current_cost: 16.000000000000004
delta_cost for datapoint 1 from centroid 2 to centroid 0: 240.00000000000006
delta_cost for datapoint 1 from centroid 2 to centroid 1: 69.33333333333334

datapoint_id: 2
current_cost: 0.0
delta_cost for datapoint 2 from centroid 1 to centroid 0: 64.000000000000

np.float64(1.0)

In [4]:
# MNIST training split through torchvision, rooted at 'data' (download=True requests a download)
mnist = datasets.MNIST('data', train=True, download=True)

In [12]:
# flatten every 28x28 image into a 784-feature row
data = mnist.data.numpy().reshape(-1, 28*28)
# lloyd + 'random' init, 10 clusters, no seed
kmeans = KMeans(algorithm='lloyd', init='random')
print('fitting...')
kmeans.fit(data, 10, debug=0)
print('done')
# score the clusters against the MNIST targets
accuracy(mnist.targets.numpy(), kmeans.y_pred)

fitting...
final centroids:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
final y_pred: [2 1 7 ... 2 3 3]
done


np.float64(0.5315)

In [11]:
# baseline: scikit-learn's KMeans on the same flattened data (`data` comes from an earlier cell)
from sklearn.cluster import KMeans as KMeans_sklearn

kmeans = KMeans_sklearn(n_clusters=10, random_state=0)
kmeans.fit(data)

# score sklearn's labels with this notebook's Hungarian-matching accuracy();
# accuracy_score is imported but not used in this cell
from sklearn.metrics import accuracy_score
accuracy(mnist.targets.numpy(), kmeans.labels_)


np.float64(0.5998666666666667)

In [None]:
# flatten every 28x28 image into a 784-feature row
data = mnist.data.numpy().reshape(-1, 28*28)
# lloyd + 'k-means++' init, 10 clusters, no seed
kmeans = KMeans(algorithm='lloyd', init='k-means++')
print('fitting...')
kmeans.fit(data, 10, debug=0)
print('done')
# score the clusters against the MNIST targets
accuracy(mnist.targets.numpy(), kmeans.y_pred)

fitting...
final centroids:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
final y_pred: [5 1 9 ... 5 4 0]
done


np.float64(0.5556333333333333)

In [33]:
# flatten every 28x28 image into a 784-feature row
data = mnist.data.numpy().reshape(-1, 28*28)
# safe-hartigan + 'k-means++' init with a fixed seed
kmeans = KMeans(algorithm='safe-hartigan', init='k-means++', seed=0)
print('fitting...')
# fit on the first 20 samples only
kmeans.fit(data[:20], 10, debug=0)
print('done')
# score against the targets of the same 20 samples
accuracy(mnist.targets.numpy()[:20], kmeans.y_pred)

AssertionError: init must be either 'random', 'random-data', 'kmeans++' or 'greedy'