In [2]:
import random

def generateRandomData(num_points=10, dims=2, ranges=(-1,1), style='scatter'):
    if style == 'scatter':
        data = np.random.random((num_points, dims))*(ranges[1]-ranges[0])+ranges[0]
    return data

In [3]:
import sys
import numpy as np
import copy

def computeScore(data, clusters):
    cluster_nums = set(clusters)
    score = 0
    for i in cluster_nums:
        cluster = data[clusters == i]
        mid = np.mean(cluster, 0)
        score += np.sum((cluster - mid)**2)
    #print((clusters,score))
    return score
            
def exhuastiveSearch(data,k):
    n = len(data)
    current_try = np.array(n*[0])
    best = (sys.float_info.max,None)
    
    done = False
    while not done:
        score = computeScore(data, current_try)
        if score < best[0]:
            best = (score, copy.copy(current_try))
        i = 0
        while True:
            if i == n:
                done = True
                break
            if current_try[i] == (k-1) or current_try[i] == i:
                current_try[i] = 0
                i += 1
            else:
                current_try[i] += 1
                break
        
    return best

In [4]:
def hMeans(data, k):
    n, d = data.shape
    centers = data[np.random.randint(n, size=k)]
    clusters = np.zeros(n)
    clustersold = np.zeros(n)
    while True:
        valid = False
        while not valid:
            for i in range(n):
                clustersold[i] = clusters[i]
                clusters[i] = np.argmin(np.sum((centers - data[i])**2,1))
            missing = False
            for i in range(k):
                if i not in clusters:
                    centers[i] = data[np.random.randint(n)]
                    missing = True
            if not missing:
                valid = True
        if np.array_equal(clusters, clustersold):
            return computeScore(data, clusters), clusters
        for i in range(k):
            centers[i] = np.mean(data[clusters == i],0)
        
            

In [46]:
def kMeans(data, k):
    n = len(data)
    clusters = np.random.randint(k, size=n)
    clusters_try = clusters.copy()
    score = computeScore(data, clusters)
    while True:
        old_score = score
        for i in range(n):
            for j in range(k):
                clusters_try[i] = j
                score_try = computeScore(data, clusters_try) # change to compute score diffrence
                if score_try < score:
                    score = score_try
                    clusters[i] = j
                else:
                    clusters_try[i] = clusters[i]
        if score == old_score:
            break
    return score, clusters

In [47]:
d = generateRandomData()

In [48]:
hMeans(d,3)

(1.6500355597106484, array([2., 1., 0., 2., 1., 0., 2., 2., 1., 2.]))

In [49]:
kMeans(d,3)

(1.0851164154004769, array([0, 2, 2, 0, 2, 2, 1, 0, 2, 1]))

In [52]:
computeScore(d, np.array([0, 2, 2, 0, 2, 2, 1, 0, 2, 1]))

1.0851164154004769

In [53]:
exhuastiveSearch(d,3)

(1.085116415400477, array([0, 1, 1, 0, 1, 1, 2, 0, 1, 2]))

In [103]:
import time

timeaprox = []
timeexahust = []
scoreaprox = []
scoreexhaust = []
sizes = [5,6,7,8]

for s in sizes:
    for i in range(100):
        timeaproxtmp = []
        timeexahusttmp = []
        scoreaproxtmp = []
        scoreexhausttmp = []
        d = generateRandomData(s)
        
        t = time.process_time()
        res = kMeans(d,3)
        timeaproxtmp.append(time.process_time()-t)
        scoreaproxtmp.append(res[0])
        
        t = time.process_time()
        res = exhuastiveSearch(d,3)
        timeexahusttmp.append(time.process_time()-t)
        scoreexhausttmp.append(res[0])
    
    timeaprox.append(timeaproxtmp)
    timeexahust.append(timeexahusttmp)
    scoreaprox.append(scoreaproxtmp)
    scoreexhaust.append(scoreexhausttmp)

In [104]:
timeaprox

[[0.0], [0.0], [0.0], [0.0]]