In [1]:
import numpy as np
from numpy.linalg import norm

In [11]:
class Kmeans:
    """Plain NumPy implementation of Lloyd's k-means clustering.

    Parameters
    ----------
    n_clusters : int
        Number of clusters (and therefore centroids) to fit.
    max_iter : int, default 100
        Upper bound on the number of assign/update iterations run by ``fit``.
    random_state : int, default 47
        Seed of the private random generator used to pick the initial
        centroids, so that repeated fits on the same data are reproducible.

    Attributes (set by ``fit``)
    ---------------------------
    centroids : ndarray of shape (n_clusters, n_features)
        Final centroid positions.
    labels : ndarray of shape (n_samples,)
        Index of the closest centroid for every training sample.
    error : float
        Sum of squared distances between each sample and its centroid.
    """

    def __init__(self, n_clusters, max_iter=100, random_state=47):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.random_state = random_state

    def initialize_centroids(self, X):
        """Pick ``n_clusters`` distinct rows of ``X`` as initial centroids.

        The rows are chosen with a generator seeded from ``random_state``,
        so the result does not depend on (or disturb) NumPy's global random
        state.
        """
        # Keep the seeded generator and draw from it; building it and then
        # calling np.random.permutation would silently ignore the seed.
        rng = np.random.RandomState(self.random_state)
        random_idx = rng.permutation(X.shape[0])
        return X[random_idx[:self.n_clusters]]

    def compute_centroids(self, X, labels, previous_centroids=None):
        """Return the mean of the samples assigned to each cluster.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
        labels : ndarray of shape (n_samples,)
            Cluster index of every row of ``X``.
        previous_centroids : ndarray of shape (n_clusters, n_features), optional
            Positions to fall back on for clusters that received no samples.
            When omitted, the centroid of an empty cluster is NaN.

        Returns
        -------
        ndarray of shape (n_clusters, n_features)
        """
        centroids = np.zeros((self.n_clusters, X.shape[1]))

        for k in range(self.n_clusters):
            members = X[labels == k, :]
            if members.shape[0] > 0:
                centroids[k, :] = np.mean(members, axis=0)
            elif previous_centroids is not None:
                # An empty cluster has no mean; keep it where it was instead
                # of letting a NaN centroid swallow every later assignment.
                centroids[k, :] = previous_centroids[k]
            else:
                centroids[k, :] = np.nan

        return centroids

    def compute_distance(self, X, centroids):
        """Return squared Euclidean distances, shape (n_samples, n_clusters)."""
        distance = np.zeros((X.shape[0], self.n_clusters))

        for k in range(self.n_clusters):
            row_norm = norm(X - centroids[k, :], axis=1)
            distance[:, k] = np.square(row_norm)

        return distance

    def find_closest_cluster(self, distance):
        """Return, for every row of ``distance``, the column of its minimum."""
        return np.argmin(distance, axis=1)

    def compute_sse(self, X, labels, centroids):
        """Return the sum of squared distances from samples to their centroid."""
        distance = np.zeros(X.shape[0])

        for k in range(self.n_clusters):
            mask = labels == k
            distance[mask] = norm(X[mask] - centroids[k], axis=1)

        return np.sum(np.square(distance))

    def fit(self, X):
        """Run k-means on ``X`` and store ``centroids``, ``labels``, ``error``.

        Iterates assignment and centroid update until the centroids stop
        changing or ``max_iter`` iterations have been performed.
        """
        self.centroids = self.initialize_centroids(X)

        for _ in range(self.max_iter):
            old_centroids = self.centroids
            distance = self.compute_distance(X, old_centroids)
            self.labels = self.find_closest_cluster(distance)
            # Pass the old positions so empty clusters stay put (no NaNs).
            self.centroids = self.compute_centroids(
                X, self.labels, old_centroids
            )

            if np.all(old_centroids == self.centroids):
                break

        self.error = self.compute_sse(X, self.labels, self.centroids)

    def predict(self, X):
        """Return the index of the closest fitted centroid for each row of ``X``."""
        distance = self.compute_distance(X, self.centroids)
        return self.find_closest_cluster(distance)




In [3]:
# Small 2x2 integer matrix used to try out row-wise norms below.
a = np.array(
    [
        [1, 2],
        [3, 4],
    ]
)

In [4]:
a

array([[1, 2],
       [3, 4]])

In [9]:
# Euclidean norm of each row of `a` (axis=1 reduces across the columns).
b = norm(
    a,
    axis=1,
)

In [10]:
b

array([2.23606798, 5.        ])