# K-Means

In [4]:
import numpy as np
import pandas as pd
from scipy.optimize import linear_sum_assignment
import matplotlib.pyplot as plt
from operator import itemgetter

In [330]:
class KMeans:
    """
    Perform KMeans clustering on a dataset.

    Only the centroid initialization is implemented so far: `fit` seeds the
    centroids and then returns ``NotImplemented``, and so does `predict`.
    """

    # Accepted constructor values, kept in one place so that the validation
    # and the documentation below cannot drift apart.
    _ALGORITHMS = (
        'lloyd',
        'extended-hartigan', 'safe-hartigan', 'hartigan',
        'extended-hartigan-2', 'safe-hartigan-2', 'hartigan-2',
    )
    _INITS = ('random', 'random-data', 'kmeans++', 'greedy')

    def __init__(self, algorithm='lloyd', init='random', seed=None):
        """
        Initialize the KMeans object.

        Parameters
        ----------
        algorithm : {'lloyd', 'extended-hartigan', 'safe-hartigan', 'hartigan', 'extended-hartigan-2', 'safe-hartigan-2', 'hartigan-2'}
            Algorithm to use. Stored on the instance; no clustering algorithm
            is implemented yet.

        init : {'random', 'random-data', 'kmeans++', 'greedy'}
            Initialization method. 'random' picks k distinct data points,
            'random-data' assigns every point to a random cluster and uses the
            cluster means, 'kmeans++' uses distance-squared weighted sampling.
            'greedy' is accepted but not implemented yet.

        seed : int
            Seed for random generator

        Raises
        ------
        AssertionError
            If any argument is outside the documented set of values.
        """

        assert algorithm in self._ALGORITHMS, \
            "algorithm must be one of " + ", ".join(repr(a) for a in self._ALGORITHMS)
        assert init in self._INITS, "init must be either 'random', 'random-data', 'kmeans++' or 'greedy'"
        assert seed is None or isinstance(seed, int), "seed must be an int or None"

        self.algorithm = algorithm
        self.init = init
        self.seed = seed

        self.data = None
        self.k = None
        self.centroids = None
        self.y_pred = None


    def fit(self, data : np.ndarray, k : int, debug=False):
        """
        Fit the model to the data.

        Currently this only validates the input, seeds NumPy's global random
        generator with `self.seed` and initializes `self.centroids`.

        Parameters
        ----------
        data : np.ndarray
            nxd array of n samples with d features
        k : int
            Number of clusters, between 1 and n
        debug : bool
            If True, print intermediate values

        Returns
        -------
        NotImplemented
            The clustering step is not implemented yet. The intended return
            value is an array of shape (k, d) with cluster centroids and an
            array of length n with cluster assignments for each sample.

        Raises
        ------
        AssertionError
            If `data` is not a 2D numpy array or `k` is not an int in [1, n].
        """

        assert isinstance(data, np.ndarray), "data must be a numpy array"
        assert len(data.shape) == 2, "data must be a 2D array"
        assert isinstance(k, int), "k must be an int"
        assert k >= 1, "k must be at least 1"
        assert k <= len(data), "k must be at most the number of samples"

        self.data = data
        self.k = k

        np.random.seed(self.seed)

        # initialize centroids
        self._init_centroids(debug)
        if debug:
            print('initial centroids:\n', self.centroids)

        ## TODO: implement clustering algorithm

        return NotImplemented


    ## TODO: implement predict method
    def predict(self):
        return NotImplemented


    def _init_centroids(self, debug=False):
        """
        Initialize the centroids according to `self.init`.

        Writes the result to `self.centroids` (and, for 'random-data', the
        random assignment to `self.y_pred`). Returns None.
        """

        n_samples = self.data.shape[0]

        if self.init == 'random':

            # choose k distinct random data points as initial centroids
            idx = np.random.choice(n_samples, self.k, replace=False)
            self.centroids = self.data[idx]

        elif self.init == 'random-data':

            # assign each data point to a random cluster
            clusters = np.random.choice(self.k, n_samples)

            # redraw until at least one point is assigned to each cluster
            while len(set(clusters)) < self.k:
                clusters = np.random.choice(self.k, n_samples)
            self.y_pred = clusters

            # the centroids are the means of the randomly formed clusters
            self.centroids = self._move_centroids(debug)

        elif self.init == 'kmeans++':

            # choose first centroid randomly
            centroids = np.zeros((self.k, self.data.shape[1]))
            centroids[0] = self.data[np.random.choice(n_samples, 1, replace=False)[0]]
            if debug:
                print('centroids:\n', centroids)

            # squared distance of every point to its closest chosen centroid,
            # updated incrementally with the newest centroid only
            closest_sq = np.full(n_samples, np.inf)

            # iterate over remaining k-1 centroids
            for i in range(1, self.k):
                if debug:
                    print('iteration', i)

                to_newest = ((self.data - centroids[i - 1]) ** 2).sum(axis=1)
                closest_sq = np.minimum(closest_sq, to_newest)

                # probabilities are given by the normalized distance squared;
                # if every point coincides with a chosen centroid the weights
                # are all zero, so fall back to a uniform draw instead of 0/0
                total = closest_sq.sum()
                if total > 0:
                    probs = closest_sq / total
                else:
                    probs = np.full(n_samples, 1.0 / n_samples)
                if debug:
                    print('probs:', probs)

                # cumulate probabilities
                cumprobs = probs.cumsum()

                # choose next centroid randomly: first index whose cumulated
                # probability exceeds r
                r = np.random.rand()
                if debug:
                    print('r:', r)
                j = int(np.searchsorted(cumprobs, r, side='right'))
                if j >= n_samples:
                    # rounding left cumprobs[-1] <= r; take the last point
                    # that actually has a non-zero probability
                    j = int(np.flatnonzero(probs)[-1])

                centroids[i] = self.data[j]
                if debug:
                    print('centroids:\n', centroids)

            self.centroids = centroids

        ## TODO: implement 'greedy' initialization (self.centroids is currently left untouched)


    def _move_centroids(self, debug=False):
        """
        Compute the mean of every cluster given by `self.y_pred`.

        Returns
        -------
        np.ndarray
            Array of shape (k, d); row c is the mean of the samples assigned
            to cluster c. `self.centroids` is not modified here.
        """

        if debug:
            print('y_pred:', self.y_pred)
            print('data:\n', self.data)
            print('centroids_before:', self.centroids)

        centroids = np.zeros((self.k, self.data.shape[1]))
        for centroid_id in range(self.k):
            centroids[centroid_id] = np.mean(self.data[self.y_pred == centroid_id], axis=0)

        if debug:
            print('centroids_after:\n', centroids)

        return centroids



In [342]:
# Smoke test: seed three centroids with k-means++ on five 2-D points,
# printing the intermediate values.
a = np.array([
    [1, 2],
    [3, 4],
    [11, 12],
    [13, 14],
    [19, 20],
])

kmeans = KMeans(init='kmeans++', algorithm='lloyd')
kmeans.fit(a, k=3, debug=True)

centroids:
 [[3. 4.]
 [0. 0.]
 [0. 0.]]
iteration 1
probs: [0.00943396 0.         0.1509434  0.23584906 0.60377358]
r: 0.1755918004572875
centroids:
 [[ 3.  4.]
 [13. 14.]
 [ 0.  0.]]
iteration 2
probs: [0.09090909 0.         0.09090909 0.         0.81818182]
r: 0.12228487837182511
centroids:
 [[ 3.  4.]
 [13. 14.]
 [11. 12.]]
initial centroids:
 [[ 3.  4.]
 [13. 14.]
 [11. 12.]]


NotImplemented

In [329]:
# Scratch check of prefix slicing: a[:i] holds the first i rows.
a = np.repeat(np.arange(10), 2).reshape(-1, 2)
for i in range(1, 10):
    print(i, a[:i])

1 [[0 0]]
2 [[0 0]
 [1 1]]
3 [[0 0]
 [1 1]
 [2 2]]
4 [[0 0]
 [1 1]
 [2 2]
 [3 3]]
5 [[0 0]
 [1 1]
 [2 2]
 [3 3]
 [4 4]]
6 [[0 0]
 [1 1]
 [2 2]
 [3 3]
 [4 4]
 [5 5]]
7 [[0 0]
 [1 1]
 [2 2]
 [3 3]
 [4 4]
 [5 5]
 [6 6]]
8 [[0 0]
 [1 1]
 [2 2]
 [3 3]
 [4 4]
 [5 5]
 [6 6]
 [7 7]]
9 [[0 0]
 [1 1]
 [2 2]
 [3 3]
 [4 4]
 [5 5]
 [6 6]
 [7 7]
 [8 8]]


In [323]:
a = np.array([
    [0., 0.],
    [1., 1.],
    [2., 2.],
])

# boolean mask of the rows in which every entry is non-zero,
# then the rows selected by that mask
print((a != 0).all(axis=1))
a[(a != 0).all(axis=1)]


[False  True  True]


array([[1., 1.],
       [2., 2.]])

In [169]:
# draw 10 samples (with replacement) from range(5)
np.random.choice(5, size=10)

array([2, 3, 4, 3, 4, 3, 1, 4, 3, 1], dtype=int32)

In [None]:
def accuracy(y_true : np.ndarray, y_pred : np.ndarray):
    """
    Compute the accuracy of the clustering.

    Cluster ids are arbitrary, so the predicted labels are first matched to
    the true labels with the one-to-one mapping that maximizes the number of
    agreeing samples (Hungarian algorithm); the accuracy is the fraction of
    samples that agree under that mapping.

    Parameters
    ----------
    y_true : np.ndarray
        True labels of the samples (non-negative integers)
    y_pred : np.ndarray
        Predicted labels of the samples (non-negative integers), same length
        as `y_true`

    Returns
    -------
    float
        Accuracy of the clustering through Hungarian algorithm

    Raises
    ------
    AssertionError
        If an argument is not a numpy array or the lengths differ.
    ValueError
        If the arrays are empty.
    """

    assert isinstance(y_true, np.ndarray), "y_true must be a numpy array"
    assert isinstance(y_pred, np.ndarray), "y_pred must be a numpy array"
    # a length mismatch would otherwise be truncated silently while the
    # denominator stays len(y_true), yielding a wrong score
    assert len(y_true) == len(y_pred), "y_true and y_pred must have the same length"

    # contingency matrix: entry [t, p] counts samples with true label t and
    # predicted label p (np.add.at accumulates repeated index pairs)
    n_classes = int(max(y_true.max(), y_pred.max())) + 1
    contingency = np.zeros((n_classes, n_classes), dtype=int)
    np.add.at(contingency, (y_true, y_pred), 1)

    # Solve assignment problem; negate because the solver minimizes cost
    row_ind, col_ind = linear_sum_assignment(-contingency)

    # Calculate accuracy
    matched = contingency[row_ind, col_ind].sum()
    return matched / len(y_true)


In [3]:
from sklearn.cluster import KMeans as KMeans_sklearn

# NOTE(review): this rebinds the name `KMeans` to a scikit-learn estimator
# instance, shadowing the custom `KMeans` class defined earlier in this file
# for any code that runs afterwards — confirm this is intended.
KMeans = KMeans_sklearn()
# NOTE(review): `fit` is looked up on the class and called with no arguments,
# so this line presumably raises TypeError (no instance, no data) before the
# assignment takes place — TODO clarify what was meant here.
KMeans.fit = KMeans_sklearn.fit()