## Writing my own k-means clustering algorithm

In [2]:
import math
import csv
import random

### Function to compute the Euclidean distance between two points


In [3]:
def get_distance(pt1, pt2):
    """Return the Euclidean (straight-line) distance between two points.

    pt1 and pt2 are expected to be lists of numeric coordinates describing
    n-dimensional points. Coordinates are paired by position; if the lists
    differ in length, the surplus coordinates of the longer one are ignored.
    """
    # Pair up the coordinates axis by axis and total the squared gaps
    squared_total = sum((a - b) ** 2 for a, b in zip(pt1, pt2))

    # The distance is the square root of that total
    return math.sqrt(squared_total)

# Sanity check: the points differ by 4 and 3 along two axes (a 3-4-5 triangle), so the result is 5.0
get_distance([0,3,0],[4,0,0])

5.0

### Function to compute the centroid of a collection of points


In [4]:
# Test fixture: the eight corners of the unit cube in three dimensions
test_lst = [[0,0,0], [0,0,1], [0,1,0], [1,0,0], 
            [0,1,1], [1,0,1], [1,1,0], [1,1,1]]

def get_centroid(points_ls):
    """Return the centroid (coordinate-wise mean) of a collection of points.

    points_ls is expected to be a non-empty list of n-dimensional points,
    each given as a list of numbers. The dimensionality is taken from the
    first point. The result is a list of n floats.
    """
    count = len(points_ls)
    centroid = []

    # Walk one dimension at a time, averaging that coordinate over all points
    for axis in range(len(points_ls[0])):
        total = 0
        for pt in points_ls:
            total += pt[axis]
        centroid.append(total / count)

    return centroid

# Quick check: averaging each coordinate over the unit-cube corners gives 0.5 on every axis
get_centroid(test_lst)

[0.5, 0.5, 0.5]

### Function to read data

In [5]:
def get_data(file):
    """ Assumes file is a string representing the path to a CSV file whose
    first row is a header.

    Returns a list with one entry per data row. Each entry is a list of ints
    built from the row's values starting at the third column (the first two
    columns are dropped). Returns an empty list if the file has no data rows.

    Raises ValueError if a retained value cannot be converted to int.
    """
    # The `with` block guarantees the file handle is closed, even if a row
    # fails to parse. newline="" is how the csv module expects files to be opened.
    with open(file, newline="") as csv_file:
        reader = csv.reader(csv_file)

        # Skip the header row (no error if the file is empty)
        next(reader, None)

        return [[int(elem) for elem in row[2:]] for row in reader]
    
# Load the dataset (path is relative to the notebook's working directory)
# and preview the first two parsed rows
file = '../data/Wholesale customers data.csv'
data = get_data(file)
print(data[:2])


[[12669, 9656, 7561, 214, 2674, 1338], [7057, 9810, 9568, 1762, 3293, 1776]]


### Function to implement k-means algorithm

In [None]:

def kmeans(points, k, max_iterations=100):
    """Clusters a collection of points into k clusters.

    Assumes points is a list of n-dimensional points, each of which is a list,
    and k is an integer between 1 and len(points).
    max_iterations caps the number of assign/update rounds (default 100).

    Starting from k randomly chosen points as centroids, each round assigns
    every point to its nearest centroid and then moves each centroid to the
    mean of its assigned points. Iteration stops early once a round leaves
    every centroid unchanged.

    Returns a tuple (clusters, centroids): clusters is a list of k lists of
    points, and centroids is a list of k coordinate lists, in matching order.

    Raises ValueError if k is less than 1 or greater than len(points).
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    # Select k random points to use as initial centroids
    # (random.sample itself raises ValueError when k > len(points))
    centroids = random.sample(points, k)

    # One (initially empty) list of member points per centroid
    clusters = [[] for _ in centroids]

    for _ in range(max_iterations):

        # Assignment step: start from empty clusters and give each point
        # to the centroid it is closest to
        clusters = [[] for _ in centroids]
        for point in points:
            min_dist = float("inf")
            closest_index = None

            for i, centroid in enumerate(centroids):
                dist = get_distance(centroid, point)

                # Strict comparison: on a tie the lowest-index centroid wins
                if dist < min_dist:
                    min_dist = dist
                    closest_index = i

            clusters[closest_index].append(point)

        # Update step: move each centroid to the mean of its members.
        # A centroid that attracted no points keeps its previous position.
        new_centroids = [
            get_centroid(members) if members else previous
            for members, previous in zip(clusters, centroids)
        ]

        # Converged: no centroid moved, so further rounds would change nothing
        if new_centroids == centroids:
            break

        centroids = new_centroids

    return clusters, centroids

# Cluster the dataset into three groups and summarise each one
clusters, centroids = kmeans(data, 3)

for members, center in zip(clusters, centroids):
    print(f"Number of points: {len(members)}")
    print(f"Centroid coordinates: {center}")

Number of points: 75
Centroid coordinates: [32768.013333333336, 4827.68, 5723.1466666666665, 5535.92, 1074.12, 2066.64]
Number of points: 337
Centroid coordinates: [7390.958456973293, 4439.768545994065, 6292.19584569733, 2495.53412462908, 2238.652818991098, 1158.448071216617]
Number of points: 28
Centroid coordinates: [11849.17857142857, 24717.10714285714, 33887.71428571428, 3409.3214285714284, 15459.714285714286, 4483.857142857143]
