In [147]:
import numpy as np
import random
import math

In [148]:
def load_data():
    """Load the dataset stored in ``data/z_scores.csv``.

    Blank (whitespace-only) lines are skipped. Every remaining line is
    stripped and split on commas: all fields except the last are parsed as
    floats and form the feature vector, and the last field is kept verbatim
    as the label string.

    Returns:
        tuple: ``(features, labels)`` where ``features`` is a list of lists
        of floats and ``labels`` is a list of strings, both in file order.

    Raises:
        ValueError: if a feature field cannot be converted to ``float``.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector to clean up.
    with open('data/z_scores.csv') as handle:
        rows = [line.strip().split(',') for line in handle if line.strip()]

    features = [[float(value) for value in row[:-1]] for row in rows]
    labels = [row[-1] for row in rows]
    return features, labels

In [225]:
class Cluster(object):
    """A k-means cluster: a collection of member vectors and their centroid."""

    def __init__(self, vectors):
        """Create a cluster from its initial member vectors.

        Args:
            vectors: sequence of numeric sequences (one per data point).
        """
        # Data points currently assigned to this cluster.
        self.vectors = vectors
        # Per-feature mean of the member vectors ([] if there are none).
        self.centroid = self.calcCentroid()

    def calcCentroid(self):
        """Return the per-feature mean of ``self.vectors``.

        Returns:
            list: one float per feature column; an empty list when the
            cluster holds no vectors.
        """
        size = len(self.vectors)
        # zip(*rows) transposes the rows into feature columns; fsum gives an
        # accurately rounded floating-point total for each column.
        return [math.fsum(column) / size for column in zip(*self.vectors)]

    def updateCentroid(self, vectors):
        """Replace the member vectors and recompute the centroid.

        If ``vectors`` is empty the previous centroid is kept: recomputing it
        would yield ``[]``, and because ``distance`` pairs components with
        ``zip`` every point would then be at distance 0 from this cluster.

        Args:
            vectors: the vectors now assigned to this cluster.

        Returns:
            float: distance between the previous and the new centroid
            (``0.0`` when ``vectors`` is empty and the centroid is kept).
        """
        self.vectors = vectors
        if len(vectors) == 0:
            # Nothing to average over; leave the centroid where it was.
            return 0.0

        previous_centroid = self.centroid
        self.centroid = self.calcCentroid()
        return distance(previous_centroid, self.centroid)

In [226]:
def distance(x, y):
    """Return the Euclidean distance between the vectors ``x`` and ``y``.

    Components are paired positionally with ``zip``, so when the two vectors
    differ in length the surplus components of the longer one are ignored.
    """
    squared_total = sum((a - b) ** 2 for a, b in zip(x, y))
    return math.sqrt(squared_total)

In [227]:
def kmeans(vectors, k, converaged):
    """Partition ``vectors`` into ``k`` clusters using k-means.

    ``k`` vectors are drawn at random as the initial centroids. Each pass then
    assigns every vector to the cluster with the nearest centroid and
    recomputes the centroids. Iteration stops once the largest centroid
    movement in a pass is below ``converaged``; the iteration count is then
    printed.

    Args:
        vectors: sequence of numeric feature vectors.
        k: number of clusters to build (at least 1).
        converaged: convergence threshold on the largest centroid shift.

    Returns:
        list: the ``k`` ``Cluster`` objects after the final pass.

    Raises:
        ValueError: if ``k`` is smaller than 1, or (from ``random.sample``)
            larger than ``len(vectors)``.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    # Select k random vectors as the initial centroids.
    init_vectors = random.sample(vectors, k)

    # Create k single-member clusters from those vectors.
    clusters = [Cluster([v]) for v in init_vectors]
    n_cluster = len(clusters)

    # Number of passes performed until the centroids stabilise.
    iteration = 0

    while True:
        iteration += 1

        # One bucket of assigned vectors per cluster for this pass.
        cluster_points = [[] for _ in clusters]

        for v in vectors:
            # min() keeps the first minimum, so ties go to the lowest index.
            nearest = min(
                range(n_cluster),
                key=lambda i: distance(v, clusters[i].centroid),
            )
            cluster_points[nearest].append(v)

        # Largest centroid movement across all clusters in this pass.
        shift = 0.0
        for i in range(n_cluster):
            shift = max(shift, clusters[i].updateCentroid(cluster_points[i]))

        # Compare the maximum shift, not just the last cluster's shift:
        # every centroid must have settled before stopping.
        if shift < converaged:
            print("Convereged after " + repr(iteration) + " iterations")
            break

    return clusters

In [228]:
# Read the dataset from disk: float feature vectors and their string labels.
features, labels = load_data()

In [243]:
# Convergence threshold handed to kmeans(): stop once centroid shift drops below it.
converaged = 2e-3

In [244]:
# Cluster the feature vectors into three groups.
clusters = kmeans(vectors=features, k=3, converaged=converaged)

Convereged after 8 iterations
