In [None]:
import numpy as np
from numpy.linalg import norm
class Kmeans:
    '''K-means clustering with Lloyd's algorithm.

    Parameters
    ----------
    k: number of clusters
    max_iter: maximum number of assign/update iterations performed by fit
    seed: seed used to pick the initial centroids reproducibly

    Attributes (set by fit)
    -----------------------
    centroids: array of shape (k, n_features) holding the cluster centres
    cluster_labels: array of shape (n_samples,) with the cluster index of each point
    error: sum of squared distances of every point to its assigned centroid
    '''

    def __init__(self, k, max_iter=100, seed=69):
        self.k = k
        self.max_iter = max_iter
        self.seed = seed

    def initialise_centroids(self, X):
        '''
        Select k distinct random data points as the initial centroids.

        Parameters
        ----------
        X: numpy array of original data, one row per data point

        Returns
        --------
        centroids: array of shape (k, n_features) copied from k rows of X

        Raises
        ------
        ValueError: if k is not between 1 and the number of rows of X
        '''
        n_samples = X.shape[0]
        if not 1 <= self.k <= n_samples:
            raise ValueError(
                'k must be between 1 and the number of data points '
                '(got k=%r for %d points)' % (self.k, n_samples)
            )

        # A private generator keeps the draw reproducible without touching
        # numpy's global random state.
        rng = np.random.RandomState(self.seed)
        random_indices = rng.permutation(n_samples)

        # The first k entries of a permutation are k distinct row indices.
        return X[random_indices[:self.k]]

    def compute_centroids(self, X):
        '''
        Compute the mean of the data points in each cluster as its new centroid.

        Reads the current assignment from self.cluster_labels. A cluster with
        no points keeps its previous centroid from self.centroids (or zeros if
        no centroids have been stored yet) instead of becoming NaN.

        Parameters
        ----------
        X: numpy array of original data

        Returns
        --------
        centroids: array of shape (k, n_features)
        '''
        centroids = np.zeros((self.k, X.shape[1]))
        previous = getattr(self, 'centroids', None)

        for i in range(self.k):
            # all points currently assigned to the ith cluster
            members = X[self.cluster_labels == i]
            if len(members):
                centroids[i] = members.mean(axis=0)
            elif previous is not None:
                # empty cluster: the mean is undefined, so keep the old centre
                centroids[i] = previous[i]
        return centroids

    def compute_distance_from_centroids(self, X):
        '''
        Compute the distance of the data from each cluster centre.

        Parameters
        ----------
        X: numpy array of original data

        Returns
        --------
        distance: array of shape (n_samples, k) containing the squared L2 norm
            distance of each data point from each centroid in self.centroids
        '''
        distance = np.zeros((X.shape[0], self.k))
        for i in range(self.k):
            difference = X - self.centroids[i, :]
            # row-wise sum of squares == squared L2 norm, without the sqrt round trip
            distance[:, i] = np.sum(np.square(difference), axis=1)
        return distance

    def find_closest_cluster(self, distance):
        '''
        Assign each data point to its closest cluster.

        Parameters
        ----------
        distance: array containing squared L2 norm distance of each data point
            from each cluster (rows are points, columns are clusters)

        Returns
        --------
        closest_cluster_labels: for each row, the index of the column (cluster)
            with the minimum distance
        '''
        return np.argmin(distance, axis=1)

    def compute_sse(self, X, labels, centroids):
        '''
        Compute the sum of squared errors of a clustering.

        Parameters
        ----------
        X: numpy array of original data
        labels: cluster index of each row of X
        centroids: array of shape (k, n_features) with the cluster centres

        Returns
        --------
        sse: sum over all points of the squared L2 distance to the centroid
            of the cluster the point is assigned to
        '''
        labels = np.asarray(labels)
        centroids = np.asarray(centroids)
        # centroids[labels] lines up each point with its own cluster centre
        residual = X - centroids[labels]
        return np.sum(np.square(residual))

    def fit(self, X):
        '''
        Cluster X, storing centroids, cluster_labels and error on the instance.

        Alternates between assigning points to the nearest centroid and
        recomputing centroids, stopping after max_iter iterations or as soon
        as the centroids stop changing.

        Parameters
        ----------
        X: array-like of shape (n_samples, n_features)

        Raises
        ------
        ValueError: if k is not between 1 and the number of rows of X
        '''
        # float conversion avoids integer wrap-around/truncation in the arithmetic
        X = np.asarray(X, dtype=float)

        self.centroids = self.initialise_centroids(X)

        for _ in range(self.max_iter):
            centroids_old = self.centroids

            # assignment step: nearest centroid for every data point
            distance = self.compute_distance_from_centroids(X)
            self.cluster_labels = self.find_closest_cluster(distance)

            # update step: centroid = mean of its assigned points
            self.centroids = self.compute_centroids(X)

            # stop if the centroids don't change for consecutive iterations
            if np.array_equal(centroids_old, self.centroids):
                break

        # Final assignment so the labels always match the final centroids,
        # including when max_iter is 0 or the loop ended without converging.
        distance = self.compute_distance_from_centroids(X)
        self.cluster_labels = self.find_closest_cluster(distance)

        # compute final error
        self.error = self.compute_sse(X, self.cluster_labels, self.centroids)
    
   
    
    
       
        

array([1, 2, 0])

In [36]:
# Scratch check: squared L2 norm of each row of X (axis=1 reduces along a row).
import numpy as np
X=np.array([[4,3,5],[8,6,7]])
np.square(np.linalg.norm(X,axis=1))

array([ 50., 149.])

In [39]:
# Scratch check: broadcasting subtracts the 1-D vector from every row of X.
X- np.array([1,2,4])

array([[3, 1, 1],
       [7, 4, 3]])

In [40]:
# Scratch check: index of the smallest value within each row of X.
np.argmin(X,axis=1)

array([1, 1])

In [45]:
# Two arrays that differ in exactly one element (3 vs 4 in the first row).
X=np.array([[1,2,3],[4,5,6]])
Y=np.array([[1,2,4],[4,5,6]])


In [47]:
# Element-wise comparison reduced with np.all: True only if every entry matches.
np.all(X==Y)

False