In [63]:
import random
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [64]:
def parse(x):
    """
    Parsing Reads to tuple (label,data)
    """
    digit = x.split(",")
    return (digit[0], np.asfarray(digit[1:]))

In [65]:
# data load from minist_test.csv
with open("mnist_test.csv","r") as f:
    digits = list(map(parse, f.readlines()))

In [75]:
ratio = int(len(digits)*0.25)
validation = digits[:ratio]
training = digits

In [77]:

def computeDistance(a, b):
    """
    calculate the vector distance
    """
    return np.sqrt(np.sum((a-b)**2))
    
def init_centroids(labelled_data,k):
    """
    Generates the first number of center points randomly k.
    """
    return list(map(lambda x: x[1], random.sample(labelled_data, k)))

def sum_cluster(labelled_cluster):
    """
    Function to add cluster vector values
    """
    # assumes len(cluster) > 0
    sum_ = labelled_cluster[0][1].copy()
    for (label,vector) in labelled_cluster[1:]:
        sum_ += vector
    return sum_

def mean_cluster(labelled_cluster):
    """
    Add the cluster vector and obtain the center value..
    """
    sum_of_points = sum_cluster(labelled_cluster)
    mean_of_points = sum_of_points * (1.0 / len(labelled_cluster))
    return mean_of_points

In [78]:
def assignLabels(labelled_data, unlabelled_centroids):
    """
    Clustering physical data at the center point.
    Assign to same cluster as near center point
    """
    # enumerate because centroids are arrays which are unhashable,
    centroids_indices = range(len(unlabelled_centroids))
    
    
    clusters = {c: [] for c in centroids_indices}
    
    for (label,Xi) in labelled_data:
        # for each datapoint, pick the closest centroid.
        smallest_distance = float("inf")
        for cj_index in centroids_indices:
            cj = unlabelled_centroids[cj_index]
            distance = computeDistance(Xi, cj)
            if distance < smallest_distance:
                closest_centroid_index = cj_index
                smallest_distance = distance
        # allocate that datapoint to the cluster of that centroid.
        clusters[closest_centroid_index].append((label,Xi))
        
    return list(clusters.values())

def computeCentroid(labelled_clusters):
    """
    Locate the center point within each cluster.
    """
    new_centroids = []
    for cluster in labelled_clusters:
        new_centroids.append(mean_cluster(cluster))
    return new_centroids

def repeat_until_convergence(labelled_data, labelled_clusters, unlabelled_centroids):
    """
    Continue to move the center point and proceed with clustering.
    Continue until the newly created center point and the previous center point are similar.
    """
    previous_max_difference = 0
    while True:
        unlabelled_old_centroids = unlabelled_centroids
        unlabelled_centroids = computeCentroid(labelled_clusters)
        labelled_clusters = assignLabels(labelled_data, unlabelled_centroids)
        # Calculate the difference between the newly created center point and the previous center point
        # If the difference is small, do not calculate any more and return the value.
        differences = list(map(lambda a, b: computeDistance(a, b), unlabelled_old_centroids,unlabelled_centroids))

        max_difference = max(differences)
        difference_change = abs((max_difference - previous_max_difference) / np.mean([previous_max_difference, max_difference])) * 100
        previous_max_difference = max_difference
        
        if np.isnan(difference_change):
            break
            
    return labelled_clusters, unlabelled_centroids

In [87]:
def kmeans(labelled_data, k):
    """
    k means algorithm
    Randomize the center point, assign the cluster to the newly created center point. 
    Continue until there is no difference between the new center points.
    """
    # Init centroid and assign cluster to labels
    centroids = init_centroids(labelled_data, k)
    clusters = assignLabels(labelled_data, centroids)
    
    final_clusters, final_centroids = repeat_until_convergence(labelled_data, clusters, centroids)
    labelled_centroids = assign_labels_to_centroids(final_clusters, final_centroids)    
    return labelled_centroids

In [88]:
def assign_labels_to_centroids(clusters, centroids):
    """
    A function that associates the center point created from the data with the actual label (number).
    Locate the label(number) of the generated center point.
    """
    
    labelled_centroids = []
    for i in range(len(clusters)):
        labels = list(map(lambda x: x[0], clusters[i]))
        # pick the most common label
        most_common = max(set(labels), key=labels.count)
        centroid = (most_common, centroids[i])
        labelled_centroids.append(centroid)
    return labelled_centroids

In [89]:
def classify_digit(digit, labelled_centroids):
    """
    given an unlabelled digit represented by a vector and a list of
    labelled centroids [(label,vector)], determine the closest centroid
    and thus classify the digit.
    """
    mindistance = float("inf")
    for (label, centroid) in labelled_centroids:
        distance = computeDistance(centroid, digit)
        if distance < mindistance:
            mindistance = distance
            closest_centroid_label = label
            
    return closest_centroid_label

def computeAccuracy(digits, labelled_centroids):
    """
    classifies a list of labelled digits. returns the error rate.
    """
    classified_incorrect = 0
    for (label,digit) in digits:
        classified_label = classify_digit(digit, labelled_centroids)
        if classified_label != label:
            classified_incorrect +=1
    error_rate = classified_incorrect / float(len(digits))
    
    return error_rate



In [90]:
k = 10
labelled_centroids = kmeans(training, k) # compute k means algorithm
accuracy = computeAccuracy(training, labelled_centroids)

print("#####################")
print("Accuracy is ", 1 - accuracy)
print("Error rate is ", accuracy)
print("#####################")




#####################
Accuracy is  0.6078
Error rate is  0.3922
#####################


In [None]:
"""
GIT HUB ADDRESS : https://github.com/seyeong3131/assignment-04.git
"""