## K-Means Implementation
(with K-means++ initialization)
### Author: Yifan Wang

In [451]:
from  sklearn.datasets import load_digits
import numpy as np

### Load Data

In [452]:
X,y = load_digits(n_class = 3, return_X_y=True)

In [453]:
print(X.shape)
print(y.shape)

(537, 64)
(537,)


### Kmeans

In [454]:
class KMEANS:
    def __init__(self, k, iters=100,n_eval=10):
        '''
        Input:
        k: number of clusters
        iters: number of iterations the model will run
        n_eval: evaluate the Within-cluster sum of squares every how many rounds
        '''
        self.k = k
        self.iters = iters
        self.n_eval = n_eval
        
    def distance(self, data,centroid):
        '''vectorized euclidean distance

        Input:
        data: X matrix for the Kmeans, dimension is mxn
        centroid: centroid matrix, dimension is kxn

        Note: np.newaxis is used to broadcast vectorized operations
        '''

        return np.sqrt((np.square(data[:,np.newaxis]-centroid).sum(axis=2))) 


    def centroids_init(self,X,k):
        '''
        K-means++ centroid initialzation method.
        Details can be found: https://en.wikipedia.org/wiki/K-means%2B%2B
        Steps:


            1. Choose one center uniformly at random from among the data points.
            2. For each data point x, compute D(x), the distance between x and the nearest center that has already been chosen.
            3. Choose one new data point at random as a new center, using a weighted probability distribution where a point x is chosen with probability proportional to D(x)2.
            4. Repeat Steps 2 and 3 until k centers have been chosen.
        Now that the initial centers have been chosen, proceed using standard k-means clustering.
        '''
        dim = X.shape[1]
        centroids = np.zeros((k,dim))
        # first centroid:
        centroids[0,:] = X[np.random.randint(X.shape[0]),]
        # subsequent centroid:
        for i in range(1,k):
            sq_dist = distance(X, centroids[i-1,:].reshape(1,-1))**2
            prob = sq_dist/np.sum(sq_dist)
            cent_idx = np.random.choice(range(X.shape[0]),p = prob.ravel())
            centroids[i,:] = X[cent_idx,]

        return centroids
    
    
    
    def fit(self,X):
        centroids_mat = self.centroids_init(X,self.k)
        
        for round in range(self.iters):
            dists = self.distance(X,centroids_mat) # distance calc
            labels = np.argmin(dists,axis=1) # cluster labeling based on distances

            if round%self.n_eval==0:
                print("Within-Cluster Sum of Squares: {}".format(np.sum(np.min(dists,axis=1))))

            # Re-adjust centroid:
            if round < (self.iters-1):
                for i in range(self.k):
                    centroids_mat[i,:] = np.mean(X[labels==i,:],axis=0)
        self.cluster = labels




        

In [455]:
N_CLUSTER = 3
N_ITER = 150
N_EVAL =20

In [456]:
km_model = KMEANS(
    k = N_CLUSTER, 
    iters=N_ITER,
    n_eval=N_EVAL)

In [457]:
# Normalize scale:
X /= 255.0

In [458]:
km_model.fit(X)

Within-Cluster Sum of Squares: 82.4121597722791
Within-Cluster Sum of Squares: 52.74963388979773
Within-Cluster Sum of Squares: 52.74963388979773
Within-Cluster Sum of Squares: 52.74963388979773
Within-Cluster Sum of Squares: 52.74963388979773
Within-Cluster Sum of Squares: 52.74963388979773
Within-Cluster Sum of Squares: 52.74963388979773
Within-Cluster Sum of Squares: 52.74963388979773


### Result:

Let's check within each cluster, what are the actual digits labels

In [459]:
from collections import Counter

In [460]:

for cl in range(N_CLUSTER):
    
    idx = km_model.cluster==cl
    
    res = Counter(y[idx])
    
    
    print('For all data labeled %d :'%cl)
    print('The label:count pairs are: {}'.format(res))
    
    
    
    
    
    

For all data labeled 0 :
The label:count pairs are: Counter({0: 178, 2: 3, 1: 2})
For all data labeled 1 :
The label:count pairs are: Counter({1: 153, 2: 9})
For all data labeled 2 :
The label:count pairs are: Counter({2: 165, 1: 27})


### Summary:

According to the result, 

- In the first cluster,  178/183 = 97%   are belong to the same label;
- In the second cluster, 153/162 = 94%   are belong to the same label;
- In the third cluster,  165/192 = 85%   are belong to the same label;

### References:


https://en.wikipedia.org/wiki/K-means_clustering

https://en.wikipedia.org/wiki/K-means%2B%2B