In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.io as sio
from operator import itemgetter
%matplotlib inline

### Helper functions

In [2]:
# Loading data and transforming it to pandas dataframe
def load_data(path, col_name):
    data = sio.loadmat(path)
    data = pd.DataFrame(data['Y'])
    return data

In [3]:
# Getting the values and plotting it
def plotCluster(Y, x1, x2):
    f1 = np.ravel(Y[x1, :])
    f2 = np.ravel(Y[x2, :])
    plt.scatter(f1, f2, c='blue', s=7)
    plt.show()

In [4]:
# Assigns each data point to a centroid
def assignment(X, centroids):
    C = dict.fromkeys(range(X.shape[0]), np.inf)
    Z = {}
    for i in centroids.keys():
        for j in range(X.shape[0]):
            
            # Euclidean dist
            dist = abs(np.linalg.norm(X[j] - centroids[i]))
            
            # Change assignment if dist is lesser than previous
            if dist < C[j]:
                C[j] = dist
                Z[j] = i
    return Z

In [5]:
# Updates the centroids 
def update(centroids, X, Z):
    for i in centroids.keys():
        temp = []
        for j in range(X.shape[0]):
            if Z[j] == i:
                temp.append(X[j])
        centroids[i] = np.mean(temp, axis=0)
    return centroids

In [6]:
def compute_cost(centroids, X, Z):
    cost = 0
    for i in centroids.keys():
        temp = []
        for j in range(X.shape[0]):
            if Z[j] == i:
                temp.append(X[j])
                
        for k in range(len(temp)): 
            cost += abs(np.linalg.norm(temp[k] - centroids[i]))
    return cost

### K-means

In [7]:
def k_means(X, k, r):
    # Initialize metadata
    X = X.T
    clustering_run = {}
    tol = 0.01
    
    # Iterate over given r
    for itr in range(r):
        cost = [np.inf]
        
        # Initialize random centroids
        centroids = {}
        rand = np.random.choice(len(X), k, replace=False)
        for i in range(len(rand)):
            centroids[i] = X[i, :]
        
        # Loop till you converge
        while True:
            # Alternate between assignment and update
            Z = assignment(X, centroids)
            centroids = update(centroids, X, Z)
            new_Z = assignment(X, centroids)
            cost.append(compute_cost(centroids, X, new_Z))
            
            # if change in cost is less than tol the break
            if cost[-2] - cost[-1] < tol and cost[-2] - cost[-1] > 0:
                break
        
        clustering_run[cost[-1]] = new_Z
        
    return min(clustering_run.items(), key=itemgetter(0))[1]


In [8]:
data = load_data("HW3_Data/dataset1.mat", 'Y')
X = np.matrix(data.values)
Z = k_means(X, 3, 10)
print(Z)

{0: 0, 1: 1, 2: 0, 3: 0, 4: 1, 5: 0, 6: 1, 7: 0, 8: 1, 9: 1, 10: 0, 11: 1, 12: 1, 13: 0, 14: 1, 15: 1, 16: 0, 17: 0, 18: 1, 19: 0, 20: 1, 21: 0, 22: 0, 23: 0, 24: 0, 25: 1, 26: 0, 27: 1, 28: 1, 29: 1, 30: 1, 31: 1, 32: 0, 33: 0, 34: 0, 35: 1, 36: 1, 37: 1, 38: 1, 39: 1, 40: 1, 41: 1, 42: 1, 43: 1, 44: 0, 45: 0, 46: 1, 47: 0, 48: 1, 49: 1, 50: 0, 51: 1, 52: 0, 53: 0, 54: 0, 55: 0, 56: 1, 57: 1, 58: 1, 59: 1, 60: 0, 61: 1, 62: 1, 63: 0, 64: 1, 65: 0, 66: 0, 67: 1, 68: 1, 69: 0, 70: 1, 71: 1, 72: 1, 73: 0, 74: 0, 75: 1, 76: 1, 77: 0, 78: 1, 79: 1, 80: 0, 81: 1, 82: 0, 83: 1, 84: 1, 85: 0, 86: 0, 87: 1, 88: 1, 89: 1, 90: 1, 91: 1, 92: 1, 93: 0, 94: 0, 95: 0, 96: 1, 97: 0, 98: 0, 99: 1, 100: 2, 101: 2, 102: 2, 103: 2, 104: 2, 105: 2, 106: 2, 107: 2, 108: 0, 109: 2, 110: 2, 111: 2, 112: 2, 113: 2, 114: 2, 115: 0, 116: 2, 117: 2, 118: 2, 119: 2, 120: 2, 121: 2, 122: 2, 123: 2, 124: 2, 125: 2, 126: 2, 127: 2, 128: 2, 129: 2, 130: 2, 131: 2, 132: 2, 133: 2, 134: 2, 135: 0, 136: 2, 137: 2, 138: 