# K - Means Overview

This is a pure Python implementation of the K-Means clustering algorithm.

https://www.naftaliharris.com/blog/visualizing-k-means-clustering/

In [None]:
import math
import random
import numpy as np
import pandas as pd

from copy import deepcopy

from sklearn.datasets import make_blobs
from sklearn.metrics.pairwise import euclidean_distances
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from bokeh.plotting import figure, show, ColumnDataSource
import bokeh.models as bmo
from bokeh.io import output_notebook
output_notebook()

## Simulated Data
scikit-learn has a couple of different methods for simulating data for testing and development purposes. We're generating a series of fake cluster data with 5 clusters and 500 observations. The data will only have two dimensions.

In [None]:
# Simulate 500 two-dimensional observations drawn from 5 blobs with scikit-learn.
X, y = make_blobs(
    n_samples=500,
    centers=5,
    n_features=2,
    random_state=745,
)

# One row per observation: both features, the true blob label 'Y', and the same
# label rendered as text in 'group'.
df = pd.DataFrame(
    {'X_1': X[:, 0], 'X_2': X[:, 1], 'Y': y}
).assign(group=lambda frame: frame['Y'].astype('str'))

In [None]:
df.head()

## Charting in Bokeh

In [None]:
# Hex colour codes handed to the categorical colour mapper below.
deloitte_palette = ["#000000","#86BC25","#C4D600","#43B02A","#046A38","#2C5234", "#0097A9", "#62B5E5",
                   "#00A3E0", "#0076A8", "#012169"]

# Wrap df so the glyph call can refer to its columns by name.
source = ColumnDataSource(df)

# Colour mapper keyed on the distinct values of the 'group' column.
color_map = bmo.CategoricalColorMapper(factors=df['group'].unique(), palette=deloitte_palette)

p = figure()

# One circle per observation, filled according to its 'group' value.
p.circle(x='X_1', y="X_2", radius=.23, 
         fill_alpha=0.6, source=source, 
         fill_color={'field' : 'group', 'transform': color_map})

In [None]:
show(p)

## K-Means Algorithm: *By-Hand Example*

1. [Choose $k$ initial centroids (note that $k$ is an input)](#Step1)
2. For each point $p$:
  - Find distance to each centroid
  - Assign point to nearest centroid
3. Recalculate centroid positions
4. Repeat steps 2-3 until stopping criteria met

### Step 1: Choosing Initial Centroids <a id=Step1></a>

There are several options for picking the initial centroid positions:
1. Randomly (may yield divergent behavior)
2. Perform alternative clustering task, use resulting centroids as initial k-means centroids
3. Start with global centroid, choose point at max distance, repeat (but might select outlier)

#### Random initial centroids

In [None]:
k = 5

# Draw k *distinct* row indices of X. Sampling with replacement (which is what
# np.random.randint does) can select the same observation twice; that gives two
# identical centroids, and the second one never wins the argmin in the
# assignment step, leaving an empty cluster.
r = np.random.choice(X.shape[0], size=k, replace=False)
initial = X[r, :]

print("Our initial centroids:")
initial

In [None]:
# Overlay the initial centroids on the existing figure as diamond-cross markers
# (column 0 of `initial` is the x-coordinate, column 1 the y-coordinate).
p.diamond_cross(x=initial[:,0], 
                y=initial[:,1], 
                size=20, 
                color="#386CB0", 
                fill_color=None, 
                line_width=2)

In [None]:
show(p)

### Step 2: Assessing Similarity

How do you determine which centroid a given point is most similar to? The similarity criterion is determined by the measure we choose. In the case of k-means clustering, the most common similarity metric is *__Euclidean distance:__*

$$ d(x_1,x_2) = \sqrt{\sum_{i=1}^N(x_{1i} - x_{2i})^2} $$

Both `numpy` and `sklearn` have implementations of Euclidean distance which we can leverage.

In [None]:
from sklearn.metrics.pairwise import euclidean_distances

# Pairwise Euclidean distances: dist[i, j] is the distance from observation i
# of X to initial centroid j.
dist = euclidean_distances(X, initial)

In [None]:
dist

In [None]:
# For every observation (row of dist), the index of its nearest centroid.
cluster = np.argmin(dist, axis=1)

In [None]:
cluster

### Step 3: Recompute the Center
How do we recompute the positions of the centers at each iteration of the algorithm?

We calculate the centroid at the geometric center of our newly assigned clusters. `pandas` offers significantly easier ways to perform group-by operations than `numpy`.

In [None]:
# Observations as a DataFrame, each tagged with the cluster it was assigned to,
# so per-cluster means can be taken with a group-by.
points = pd.DataFrame.from_records(X, columns=['x', 'y']).assign(cluster=cluster)

points.head()

In [None]:
# New centroid of each cluster: the mean x and mean y of the points assigned to
# it. The result is indexed by cluster id.
centroids = points.groupby('cluster').mean()
centroids.head()

In [None]:
# Overlay the recomputed centroids (colour #ff0000) on the same figure and
# redraw it.
p.diamond_cross(x=centroids.x, 
                y=centroids.y, 
                size=20, 
                color="#ff0000", 
                fill_color=None, 
                line_width=2)
show(p)

In [None]:
# Put each initial centroid (x_old, y_old) next to its recomputed position.
# NOTE(review): pd.concat(axis=1) aligns rows on index labels. `centroids` is
# indexed by cluster id while `old` is indexed 0..k-1, so this assumes every
# cluster received at least one point - confirm.
# NOTE(review): re-running this cell concatenates onto the already-widened
# `centroids` again.
old = pd.DataFrame.from_records(initial, columns=["x_old", "y_old"])
centroids = pd.concat([centroids,old],axis=1)
centroids.head()

def x_line(row):
    """Return the x-coordinates of a centroid's move as ``[new, old]``."""
    x_new, x_prev = row['x'], row['x_old']
    return [x_new, x_prev]

def y_line(row):
    """Return the y-coordinates of a centroid's move as ``[new, old]``."""
    y_new, y_prev = row['y'], row['y_old']
    return [y_new, y_prev]

# Row-wise: store each centroid's [new, old] x- and y-coordinates as lists, the
# per-segment form that p.multi_line is given in the next cell.
centroids['xs'] = centroids.apply(x_line, axis=1)
centroids['ys'] = centroids.apply(y_line, axis=1)

In [None]:
# Draw one segment per centroid joining its recomputed and initial positions,
# then redraw the figure.
p.multi_line(xs=centroids['xs'], ys=centroids['ys'],color="navy", alpha=0.3, line_width=4)
show(p)

### Step 4: Converge
We iterate until some stopping criteria are met; in general, suitable convergence is achieved in a small number of steps. 

Stopping criteria can be based on the centroids (e.g., if positions change by no more than $\epsilon$) or on the points (if no more than x% change clusters between iterations).

Up to this point, we have been using illustrative examples of the steps. Now, we will wrap up our work in a KMeans class with some helper functions to iterate through the steps and fit the model. 

In [None]:
np.inf

In [None]:
def centroid(data):
    """Return the mean of ``data`` along its first axis (one mean per column)."""
    return np.mean(data, axis=0)


def sse(data):
    """Calculate the SSE (sum of squared errors) of the given data.

    The error of a row is its Euclidean distance to the centroid of ``data``.
    The SSE is the sum of the *squares* of those distances, which is the
    quantity k-means minimises. Summing the unsquared distances instead is not
    guaranteed to decrease from one k-means iteration to the next.

    Args:
        data: 2-D array-like (ndarray or np.matrix), one observation per row.
    Returns:
        The scalar sum of squared distances from every row to the centroid.
    """
    u = centroid(data)
    # np.square is element-wise for np.matrix as well (``**`` would be a matrix
    # power), so this is correct for both input types.
    return np.sum(np.square(data - u))


class KMeansClusterer:
    """The standard k-means clustering algorithm.

    Attributes set by ``fit``:
        data:     The training data as an ``np.matrix`` (m rows, n features).
        k:        The number of clusters.
        min_gain: SSE improvement below which an epoch stops iterating.
        meta:     Initialised to an empty list; not otherwise used here.
        C:        List of k matrices holding the rows assigned to each cluster
                  in the best epoch (``None`` if no epoch was run).
        u:        k x n matrix of the centroids of the best epoch (``None`` if
                  no epoch was run).
    """

    def __init__(self, data=None, k=2, min_gain=0.01, max_iter=100,
                 max_epoch=10, verbose=True):
        """Learns from data if given; arguments are forwarded to ``fit``."""
        if data is not None:
            self.fit(data, k, min_gain, max_iter, max_epoch, verbose)

    def fit(self, data, k=2, min_gain=0.01, max_iter=100, max_epoch=10,
            verbose=True):
        """Learns from the given data.

        Runs ``max_epoch`` independent k-means runs from random starting
        centroids and keeps the run with the lowest SSE.

        Args:
            data:      The dataset with m rows each with n features
            k:         The number of clusters
            min_gain:  Minimum gain to keep iterating
            max_iter:  Maximum number of iterations to perform
            max_epoch: Number of random starts, to find global optimum
            verbose:   Print diagnostic message if True
        Returns:
            self
        Raises:
            ValueError: From ``np.random.choice`` when k exceeds the number of
                        rows (k distinct rows cannot be drawn).
        """
        # Pre-process
        self.data = np.matrix(data)
        self.k = k
        self.min_gain = min_gain
        self.meta = []
        # Forget the outcome of any earlier fit on this instance.
        self.C = None
        self.u = None

        n_rows = self.data.shape[0]
        # Plain-ndarray view of the data, used for broadcasting below.
        points = np.asarray(self.data)

        # Perform multiple random init for global optimum
        min_sse = np.inf
        for epoch in range(max_epoch):

            # Randomly initialize k centroids from k distinct rows. The float
            # copy keeps the in-place mean updates below from being truncated
            # when the input data is integer-typed.
            indices = np.random.choice(n_rows, k, replace=False)
            u = self.data[indices, :].astype(float)

            # Loop
            t = 0
            old_sse = np.inf
            while True:
                t += 1

                # Cluster assignment: distance of every row to every centroid
                # in one shot (m x k), then the index of the nearest centroid.
                offsets = (points[:, np.newaxis, :]
                           - np.asarray(u)[np.newaxis, :, :])
                nearest = np.argmin(np.linalg.norm(offsets, axis=2), axis=1)
                C = [self.data[nearest == j, :] for j in range(k)]

                # Centroid update. A cluster that received no rows keeps its
                # previous centroid instead of crashing on an empty mean.
                for j in range(k):
                    if C[j].shape[0]:
                        u[j] = centroid(C[j])

                new_sse = np.sum([sse(C[j]) for j in range(k)
                                  if C[j].shape[0]])
                gain = old_sse - new_sse
                if verbose:
                    line = "Epoch {:2d} Iter {:2d}: SSE={:10.4f}, GAIN={:10.4f}"
                    print(line.format(epoch, t, new_sse, gain))

                # Loop termination condition. The best-so-far bookkeeping runs
                # for *both* exits, so an epoch stopped by max_iter still
                # counts as a candidate result.
                if gain < self.min_gain or t >= max_iter:
                    if new_sse < min_sse:
                        min_sse, self.C, self.u = new_sse, C, u
                    break
                old_sse = new_sse

            if verbose:
                print('')  # blank line between every epoch

        return self

In [None]:
# Constructing with data fits immediately: 5 clusters on X, with the remaining
# arguments left at their defaults (so per-iteration diagnostics are printed).
t = KMeansClusterer(data=X, k=5)

### Using K means clustering from scikit learn

In [None]:
from sklearn.cluster import KMeans

#### Identifying appropriate K using Elbow method

In [None]:
df.head()

In [None]:
df2= df.copy()

In [None]:
# Remove the label columns in place so df2 keeps only the features X_1 and X_2.
cols=['Y','group']
df2.drop(columns=cols,axis=1,inplace=True)

In [None]:
df2.head(2)

In [None]:
# Inertia of a fitted KMeans model for every candidate k in 1..9.
distortions = []
K = range(1,10)
for k in K:
    # Fixed seed (the same one the final model uses) so the elbow curve, and
    # the conclusion drawn from it, is reproducible from run to run.
    kmeanModel = KMeans(n_clusters=k, random_state=123)
    kmeanModel.fit(df2)
    distortions.append(kmeanModel.inertia_)

### Plotting distortions or SSE of all the K values

In [None]:
# Plot inertia against k; the "elbow" is where the curve stops dropping sharply.
plt.figure(figsize=(16,8))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

1. The plot above shows that inertia no longer decreases significantly beyond K=4, so we proceed with K=4.

In [None]:
# Fit a seeded 4-cluster model on the feature columns; y_groups receives the
# cluster index assigned to each row of df2.
kmeans_object = KMeans(n_clusters=4,random_state=123)
y_groups=kmeans_object.fit_predict(df2)

In [None]:
# Identify the cluster centers for the 4 clusters.
# NOTE(review): this rebinds `centroids`, which the by-hand section earlier in
# the notebook uses for a DataFrame; re-running those cells afterwards will see
# this array instead.
centroids=kmeans_object.cluster_centers_