In [32]:
import numpy as np
import pandas as pd

In [33]:
# Location of the Excel workbook that holds the sample points for clustering.
dataSetPath = "simpleDatasetKMC.xlsx"

In [34]:
# Load the workbook at dataSetPath into a pandas DataFrame and show its contents.
dataFrame = pd.read_excel(io=dataSetPath)
print(dataFrame)

   X   Y
0  2  10
1  2   5
2  8   4
3  5   8
4  7   5
5  6   4
6  1   2
7  4   9


In [35]:
# Turn the DataFrame into a NumPy array and cast every element to float
# in a single expression, then show the result.
dataArray = np.array(dataFrame).astype(float)
print(dataArray)

[[ 2. 10.]
 [ 2.  5.]
 [ 8.  4.]
 [ 5.  8.]
 [ 7.  5.]
 [ 6.  4.]
 [ 1.  2.]
 [ 4.  9.]]


In [36]:
# Allocate the output table for the normalized values: same shape and dtype
# as dataArray, with every entry starting at 0.
normalizedArray = np.zeros_like(dataArray)
print(normalizedArray)

[[0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]]


In [37]:
# Show the shape of the table, then unpack it into row and column counts.
# The array is built from a DataFrame, so its shape is a (rows, cols) pair.
print(normalizedArray.shape)
noOfRows, noOfCols = normalizedArray.shape

(8, 2)


In [38]:
# Min-max normalization: rescale every column of dataArray into [0, 1].
#   normalized_x = (x - min) / (max - min)
# dataArray[:, col] selects all rows of column `col`; the rescaled column is
# written into the matching column of normalizedArray.
for col in range(noOfCols):
  column = dataArray[:, col]
  colMin = np.min(column) # minimum element of the col
  colMax = np.max(column) # maximum element of the col
  colRange = colMax - colMin
  if colRange == 0:
    # Constant column: every value equals the minimum, so (x - min) is 0 and
    # the formula would evaluate 0/0 -> NaN. Map the whole column to 0 instead.
    normalizedArray[:, col] = 0.0
  else:
    normalizedArray[:, col] = (column - colMin) / colRange

print(normalizedArray)

[[0.14285714 1.        ]
 [0.14285714 0.375     ]
 [1.         0.25      ]
 [0.57142857 0.75      ]
 [0.85714286 0.375     ]
 [0.71428571 0.25      ]
 [0.         0.        ]
 [0.42857143 0.875     ]]


In [42]:
# Pick k distinct data points at random to serve as the initial centroids.
# A seeded, local random generator is used so that re-running the notebook
# from a fresh kernel selects the same centroids every time.
SEED = 42
k = 3
noOfPoints = normalizedArray.shape[0] # rows
rng = np.random.default_rng(SEED)
# replace=False: the same point must not be chosen twice
randomPoints = rng.choice(noOfPoints, size=k, replace=False)
print("random points:")
print(randomPoints)

# Index the normalized table with the chosen row numbers to get the centroids.
centroids = normalizedArray[randomPoints]
print("centroids:")
print(centroids)

[3 1 6]
[[0.57142857 0.75      ]
 [0.14285714 0.375     ]
 [0.         0.        ]]


In [50]:
# Distance table: one row per point, one column per centroid.
# Each entry is the sum of absolute coordinate differences (L1 / Manhattan
# distance) between the point and that centroid.
distances = np.zeros((noOfRows, k))

for centroidIdx in range(k):
  offsets = normalizedArray - centroids[centroidIdx]
  # axis=1 adds up the per-coordinate differences within each row
  distances[:, centroidIdx] = np.abs(offsets).sum(axis=1)

print(distances)

[[0.67857143 0.625      1.14285714]
 [0.80357143 0.         0.51785714]
 [0.92857143 0.98214286 1.25      ]
 [0.         0.80357143 1.32142857]
 [0.66071429 0.71428571 1.23214286]
 [0.64285714 0.69642857 0.96428571]
 [1.32142857 0.51785714 0.        ]
 [0.26785714 0.78571429 1.30357143]]


In [52]:
# For every point (row), take the index of the centroid with the smallest distance.
clusters = distances.argmin(axis=1)
print(clusters)


[1 1 0 0 0 0 2 0]


In [57]:
# Recompute each centroid as the mean of the points currently assigned to it.
newCentroids = np.zeros_like(centroids)
for i in range(k):
  points = normalizedArray[clusters == i]
  if len(points) == 0:
    # No point was assigned to this cluster (possible when two centroids
    # coincide, since argmin resolves ties to the first one). The mean of an
    # empty selection is NaN, so keep the previous centroid instead.
    newCentroids[i] = centroids[i]
  else:
    newCentroids[i] = points.mean(axis=0)

print(newCentroids)


[[0.71428571 0.5       ]
 [0.14285714 0.6875    ]
 [0.         0.        ]]
