## Writing my own k-means clustering algorithm

In [None]:
import math
import csv
import random

### Function to compute the Euclidean distance between two points


In [2]:
def get_distance(x, y):
    """Computes the Euclidean distance between two n-dimensional points.
    Assumes x and y are lists of numerical values (the point coordinates)
    with the same number of coordinates.
    Returns float (the Euclidean distance between x and y).
    Raises ValueError if x and y have different lengths.
    """

    # Without this check a longer y would be silently truncated to len(x),
    # producing a plausible-looking but wrong distance.
    if len(x) != len(y):
        raise ValueError(
            f"Points must have the same number of dimensions: "
            f"got {len(x)} and {len(y)}"
        )

    return math.sqrt(sum((a - b) ** 2 for a, b in zip(x, y)))

# Sanity check with a 3-4-5 right triangle: the expected distance is 5.0.
print(get_distance([0, 3, 0], [4, 0, 0]))


5.0


### Function to compute the centroid of a collection of points


In [3]:
# Test fixture: the eight corners of the unit cube in three dimensions.
test_lst = [[0,0,0], [0,0,1], [0,1,0], [1,0,0], 
            [0,1,1], [1,0,1], [1,1,0], [1,1,1]]

def get_centroid(points):
    """Computes the centroid (coordinate-wise mean) of a collection of points.
    Assumes points is a non-empty, indexable collection of lists of numerical
    values; the number of dimensions is taken from the first point.
    Returns a list of numerical values (the coordinates of the centroid).
    """

    total = len(points)
    width = len(points[0])

    # One mean per axis: gather that axis' coordinate from every point.
    return [sum([p[axis] for p in points]) / total for axis in range(width)]

# Sanity check on the test fixture: the expected centroid is [0.5, 0.5, 0.5].
print(get_centroid(test_lst))


[0.5, 0.5, 0.5]


### Function to read data

In [4]:
def get_data(path='../data/Wholesale customers data.csv'):
    """Reads a CSV file (by default Wholesale customers data.csv) and
    returns part of the data as a list of lists.
    Assumes path names a CSV file whose header row starts with 'Channel'
    and whose kept fields (third column onward) are integers.
    Returns a list with one list of ints per data row; the first two
    columns of every row are dropped, and the header row and blank rows
    are skipped.
    Raises ValueError if a kept field cannot be parsed as an int.
    """

    # newline='' lets the csv module handle line endings itself, as its
    # documentation recommends.
    with open(path, newline='') as f:
        reader = csv.reader(f)
        # csv.reader yields [] for blank lines; `row and ...` skips them
        # instead of failing on row[0].
        data = [
            [int(field) for field in row[2:]]
            for row in reader
            if row and row[0] != 'Channel'
        ]
    return data

# Load the dataset into the module-level name `data` and preview the first two rows.
data = get_data()
print(data[:2])

[[12669, 9656, 7561, 214, 2674, 1338], [7057, 9810, 9568, 1762, 3293, 1776]]


### Function to implement k-means algorithm

In [5]:

def kmeans(points, k, max_iter=100):
    """Clusters a collection of points into k clusters
    using a simplified version of the k-means algorithm.
    Assumes points is a list of equal-length lists of numerical values,
    k is an int with 1 <= k <= len(points), and max_iter is the maximum
    number of assign/update rounds (default 100).
    Initial centroids are k points drawn at random from points. Rounds stop
    early once an update leaves every centroid unchanged.
    Returns a tuple (clusters, centroids): clusters[i] is the list of points
    assigned to centroids[i] in the last round that was run.
    Raises ValueError if k is less than 1 or greater than len(points).
    """

    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    # Select k random points to use as initial centroids.
    # random.sample itself raises ValueError when k > len(points).
    centroids = random.sample(points, k)

    # clusters[i] holds the points assigned to centroids[i]; initialised here
    # so the return value is well-formed even when max_iter is 0.
    clusters = [[] for _ in centroids]

    for _ in range(max_iter):

        # Start every round with empty clusters.
        clusters = [[] for _ in centroids]

        # Assign each point to the cluster with the closest centroid.
        # min() keeps the first index on ties.
        for point in points:
            closest_index = min(
                range(len(centroids)),
                key=lambda i: get_distance(centroids[i], point),
            )
            clusters[closest_index].append(point)

        # Move each centroid to the mean of its cluster. A centroid whose
        # cluster is empty keeps its previous position.
        new_centroids = [
            get_centroid(cluster) if cluster else centroid
            for cluster, centroid in zip(clusters, centroids)
        ]

        # If no centroid moved, the next round would repeat this assignment
        # exactly, so further rounds cannot change the result.
        converged = new_centroids == centroids
        centroids = new_centroids
        if converged:
            break

    return clusters, centroids

# Cluster the dataset into three groups. kmeans picks its initial centroids
# with random.sample and no seed is set in the visible cells, so cluster
# sizes and centroids may differ from run to run.
clusters, centroids = kmeans(data, 3)

# Report the size and centroid of each of the three clusters.
for i in range(3):
    print(f"Number of points: {len(clusters[i])}")
    print(f"Centroid coordinates: {centroids[i]}")

Number of points: 73
Centroid coordinates: [33111.69863013698, 4918.465753424657, 5847.54794520548, 5554.027397260274, 1097.876712328767, 2097.123287671233]
Number of points: 50
Centroid coordinates: [8723.78, 19220.54, 27604.86, 2724.7, 12277.34, 3195.36]
Number of points: 317
Centroid coordinates: [7655.482649842272, 3881.0157728706627, 5335.79810725552, 2555.11356466877, 1810.236593059937, 1129.6056782334385]
