In [317]:
# add necessary libraries here

import numpy as np
import pandas as pd

In [318]:
# read the excel file
# dataFile = 'KMC_student_dataset.xlsx'
dataFile = 'simpleDatasetKMC.xlsx'
dataFrame = pd.read_excel(dataFile)
print(dataFrame)

   X   Y
0  2  10
1  2   5
2  8   4
3  5   8
4  7   5
5  6   4
6  1   2
7  4   9


In [319]:
# convert pandas dataframe to numpy array to make calculations easy
# dataArray = dataFrame.to_numpy()
dataArray = np.array(dataFrame)

# convert all elemets to float for precise division
dataArray = dataArray.astype(float)
print(dataArray)

[[ 2. 10.]
 [ 2.  5.]
 [ 8.  4.]
 [ 5.  8.]
 [ 7.  5.]
 [ 6.  4.]
 [ 1.  2.]
 [ 4.  9.]]


In [320]:
# since some of the columns has small numbers and some of the columns has bigger numbers,
# let's convert all elements as 0~1 using the value difference in the same column values
# so, the numbers are changing but the difference is same in their category
# this is normalization | X_normalized = (X - X_min) / (max - min)


def getNormalizedArray(dataArray):
    rows, cols = dataArray.shape
    # print(f'rows={rows}, cols={cols}')
    normalizedArray = np.zeros_like(dataArray)

    for colIndex in range(cols):
        colElements = dataArray[:, colIndex] # get all elements in the same column
        xMin = np.min(colElements)
        # print(min)
        xMax = np.max(colElements)
        normalizedArray[:, colIndex] = (dataArray[:, colIndex] - xMin) / (xMax - xMin)
    
    return normalizedArray


# print(normalizeArray(dataArray))

In [321]:
# manhattanDistance = |x2 - x1|  + |y2 - y1|
# calculating distance
def calculateManhattanDistances(normalizedArray, centroids, K):
    rows, cols = normalizedArray.shape
    distances = np.zeros((rows, K))
    for i in range(K):
        distances[:,i] = np.sqrt( np.sum((normalizedArray - centroids[i])**2, axis=1) )
    return distances

In [322]:
def assignClusters(distances, centroids, normalizedArray):
    clusters = np.argmin(distances, axis=1)
    # print(clusters)

    newCentroids = np.zeros_like(centroids)
    # print(len(centroids))
    # print(centroids.shape[0]) # rows
    rows = centroids.shape[0]
    for i in range(rows):
        points = normalizedArray[i==clusters]
        newCentroids[i] = points.mean(axis=0)

    return newCentroids

In [323]:
def applyKMC(dataArray, K):

    ## step 1: Normalize all data
    normalizedArray = getNormalizedArray(dataArray)

    ## step 2: Randomly Select k centroids initially
    # genrating k random indices
    rows, cols = dataArray.shape
    random_indices = np.random.choice(rows, K, replace=False)
    # print(random_indices)
    
    # getting those points as centroids
    centroids = normalizedArray[random_indices]
    print(centroids)

    newCentroids = None
    while np.array_equal(centroids, newCentroids) == False:
        ## step 3: Get the distances of all elements from each centroid
        distances = calculateManhattanDistances(normalizedArray, centroids, K)
        # print(distances)

        ## step 4: assign them their clusters
        newCentroids = assignClusters(distances, centroids, normalizedArray)
        # print(newCentroids)

    print(newCentroids)

    # print(np.array_equal(centroids, newCentroids))


In [324]:
k = 3
applyKMC(dataArray, k)

[[0.14285714 1.        ]
 [0.42857143 0.875     ]
 [0.         0.        ]]


KeyboardInterrupt: 