In [1]:
import numpy as np
from numpy import random

In [2]:
# Toy data set: 3 points with 2 features each.
X = np.array([[1,2], [4,5], [7,8]])
len(X)  # len() of a 2-D array counts its rows (the first axis)

3

In [3]:
# Draw 2 row indices from 0 .. len(X)-1 (the upper bound is excluded, like range).
# Sampling is with replacement by default, so the same index may appear twice.
np.random.choice(len(X), 2) # or np.random.randint(0,len(X), 2)
# range(num) -> excludes num

array([2, 2])

In [4]:
a = np.array([[11,11], [1,2]])  # shape (2, 2)
b = np.array([[21,22]])  # shape (1, 2)

# Broadcasting: the single row of b is subtracted from every row of a.
a-b

array([[-10, -11],
       [-20, -20]])

In [5]:
np.sum(a-b)  # no axis given: adds every element into a single scalar

-61

In [6]:
np.sum(a-b, axis = 0)  # collapse the rows: one sum per column

array([-30, -31])

In [7]:
np.sum(a-b, axis = 1)  # collapse the columns: one sum per row

array([-21, -40])

In [8]:
c = np.array([[11,10], [22,23]])
np.argmin(c) # without axis: index into the flattened array (the minimum 10 sits at flat position 1)

# In case of multiple occurrences of the minimum value, the index of the first occurrence is returned.

1

In [9]:
np.argmin(c, axis=0)  # per column: row index of the smallest entry

array([0, 0])

In [10]:
np.argmin(c, axis=1)  # per row: column index of the smallest entry

array([1, 0])

In [11]:
# Toy data set (4 points, 2 features) and k-means hyper-parameters.
X = np.array([[11,12], [21,22], [31,32], [41,42]])
k = 2
max_epochs = 3

# Pick k *distinct* rows as the initial centroids. The default (replace=True)
# may return the same index twice, which yields duplicate centroids and,
# later on, an empty cluster whose mean is NaN.
random_list = np.random.choice(len(X), k, replace=False)
centroids = [X[i] for i in random_list]

centroids

[array([21, 22]), array([41, 42])]

In [12]:
X[:2]  # slice: the first two rows

array([[11, 12],
       [21, 22]])

In [13]:
X[3]  # integer index: the fourth row, returned as a 1-D array

array([41, 42])

In [14]:
def point_assignment_cluster(point, centroids):
    """Return the index of the centroid closest to ``point`` (Euclidean distance).

    ``point`` is a 1-D array of features and ``centroids`` holds one centroid
    per row. Broadcasting subtracts ``point`` from every centroid row at once.
    On a tie the lowest centroid index wins, because ``np.argmin`` reports the
    first minimum.
    """
    offsets = np.subtract(centroids, point)
    squared_lengths = np.square(offsets).sum(axis=1)
    distances = np.sqrt(squared_lengths)
    return distances.argmin()

In [15]:
def cluster_assignment(X, centroids):
    """Label every row of ``X`` with the index of its nearest centroid.

    Returns a 1-D array with one entry per row of ``X``; each entry is the
    value ``point_assignment_cluster`` reports for that row.
    """
    labels = [point_assignment_cluster(row, centroids) for row in X]
    return np.array(labels)

In [16]:
# Placeholder label per point (all zeros, float dtype); the training loop reassigns it.
cluster_index = np.zeros(len(X))
cluster_index

array([0., 0., 0., 0.])

In [17]:
# Lloyd iterations: alternate between assigning points to their nearest
# centroid and moving each centroid to the mean of its assigned points.
for _ in range(max_epochs):
    cluster_index = cluster_assignment(X, centroids)
    new_centroids = []
    for cluster_id in range(k):
        members = X[cluster_index == cluster_id]
        if len(members) == 0:
            # An empty cluster has no mean (np.mean would give NaN and a
            # RuntimeWarning); keep its previous centroid instead.
            new_centroids.append(centroids[cluster_id])
        else:
            new_centroids.append(members.mean(axis=0))
    centroids = np.array(new_centroids)
        

In [18]:
# Time complexity: O(num data points * num clusters * num features * num epochs);
# each point-to-centroid distance costs O(num features).

In [19]:
# What if the number of data points is so large that the data cannot fit into memory?
# Answer: MapReduce.
# Map: assign each data point to its nearest centroid.
# Reduce: average the points of each cluster to compute its new centroid.

In [20]:
# fit: find cluster centroids
# predict: find which cluster these points belong to

class K_Means:
    """Minimal k-means clustering (Lloyd's algorithm) for 2-D NumPy data.

    Parameters
    ----------
    k : int
        Number of clusters / centroids.
    max_epochs : int
        Upper bound on the number of assign-then-update iterations.

    Attributes
    ----------
    centroids : empty list before ``fit``; afterwards an array with one
        centroid per row (``k`` rows, one column per feature).
    """

    def __init__(self, k, max_epochs):
        self.k = k
        self.max_epochs = max_epochs
        self.centroids = []  # replaced by an array in fit()

    def __find_random_centroids(self, X, k):
        """Return ``k`` distinct rows of ``X`` as the initial centroids.

        Raises
        ------
        ValueError
            If ``k`` exceeds the number of rows in ``X``.
        """
        if k > len(X):
            raise ValueError(
                "k (%d) cannot exceed the number of data points (%d)" % (k, len(X))
            )
        # replace=False: sampling with replacement could pick the same row
        # twice, giving duplicate centroids and therefore an empty cluster.
        random_list = np.random.choice(len(X), k, replace=False)
        return [X[i] for i in random_list]

    def __point_assignment_cluster(self, point, centroids):
        """Return the index of the centroid nearest to ``point`` (Euclidean).

        Ties go to the lowest index because ``np.argmin`` returns the first
        minimum.
        """
        # Broadcasting subtracts the 1-D point from every centroid row.
        euclidean_dist = np.sqrt(np.sum((centroids - point) ** 2, axis=1))
        return np.argmin(euclidean_dist)

    def __cluster_assignment(self, X, centroids):
        """Return an array holding the nearest-centroid index of every row of ``X``."""
        return np.array(
            [self.__point_assignment_cluster(point, centroids) for point in X]
        )

    def fit(self, X):
        """Learn ``self.centroids`` from the rows of ``X``.

        ``X`` is a 2-D array-like with one point per row. Iteration stops
        after ``max_epochs`` rounds, or earlier once an update leaves every
        centroid unchanged (further rounds would repeat the same result).

        Raises
        ------
        ValueError
            If ``k`` exceeds the number of rows in ``X``.
        """
        X = np.asarray(X)  # boolean-mask indexing below needs an ndarray
        self.centroids = np.array(self.__find_random_centroids(X, self.k))

        for _ in range(self.max_epochs):
            cluster_index = self.__cluster_assignment(X, self.centroids)

            new_centroids = []
            for c in range(self.k):
                members = X[cluster_index == c]
                if len(members) == 0:
                    # The mean of an empty cluster is NaN; keep the previous
                    # centroid so the model stays usable.
                    new_centroids.append(self.centroids[c])
                else:
                    new_centroids.append(np.mean(members, axis=0))
            new_centroids = np.array(new_centroids)

            converged = np.array_equal(new_centroids, self.centroids)
            self.centroids = new_centroids
            if converged:
                break

    def predict(self, X_test):
        """Return the index of the nearest learned centroid for each row of ``X_test``.

        Raises
        ------
        ValueError
            If called before ``fit`` (no centroids are available yet).
        """
        centroids = np.array(self.centroids)
        if centroids.size == 0:
            raise ValueError("K_Means.predict called before fit: no centroids available")
        return self.__cluster_assignment(np.asarray(X_test), centroids)
                

In [23]:
# Six 2-D points forming three tight pairs.
X = np.array([[0,1], [0,2], [5,3], [5,4], [10, 100], [10,101]])
kmeans = K_Means(3, 5)  # k=3 clusters, max_epochs=5
kmeans.fit(X)


In [24]:
kmeans.centroids  # learned centroids, one row per cluster; row order depends on the random initialisation

array([[  5. ,   3.5],
       [ 10. , 100.5],
       [  0. ,   1.5]])

In [25]:
X_test = np.array([[2,2], [11,95]])
kmeans.predict(X_test)  # index of the nearest learned centroid for each test point

array([2, 1])