We want to compare the performance of the MiniBatchKMeans and KMeans: the MiniBatchKMeans is faster, but gives slightly different results (see [Mini Batch K-Means](http://scikit-learn.org/stable/modules/clustering.html#mini-batch-kmeans)).

We will cluster a set of data, first with KMeans and then with MiniBatchKMeans, and plot the results. We will also plot the points that are labelled differently between the two algorithms.

#### New to Plotly?
Plotly's Python library is free and open source! [Get started](https://plot.ly/python/getting-started/) by downloading the client and [reading the primer](https://plot.ly/python/getting-started/).
<br>You can set up Plotly to work in [online](https://plot.ly/python/getting-started/#initialization-for-online-plotting) or [offline](https://plot.ly/python/getting-started/#initialization-for-offline-plotting) mode, or in [jupyter notebooks](https://plot.ly/python/getting-started/#start-plotting-online).
<br>We also have a quick-reference [cheatsheet](https://images.plot.ly/plotly-documentation/images/python_cheat_sheet.pdf) (new!) to help you get started!

### Version

In [1]:
import sklearn
sklearn.__version__

'0.18'

### Imports

This tutorial imports [MiniBatchKMeans](http://scikit-learn.org/stable/modules/generated/sklearn.cluster.MiniBatchKMeans.html#sklearn.cluster.MiniBatchKMeans), [KMeans](http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans), [pairwise_distances_argmin](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances_argmin.html#sklearn.metrics.pairwise_distances_argmin) and [make_blobs](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_blobs.html#sklearn.datasets.make_blobs).

In [2]:
print(__doc__)

import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools

import time
import numpy as np

from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.metrics.pairwise import pairwise_distances_argmin
from sklearn.datasets.samples_generator import make_blobs

Automatically created module for IPython interactive environment


### Calculations

Generate sample data.

In [3]:
np.random.seed(0)

batch_size = 45
centers = [[1, 1], [-1, -1], [1, -1]]
n_clusters = len(centers)
X, labels_true = make_blobs(n_samples=3000, centers=centers, cluster_std=0.7)


Compute clustering with KMeans

In [4]:
k_means = KMeans(init='k-means++', n_clusters=3, n_init=10)
t0 = time.time()
k_means.fit(X)
t_batch = time.time() - t0

Compute clustering with MiniBatchKMeans

In [5]:
mbk = MiniBatchKMeans(init='k-means++', n_clusters=3, batch_size=batch_size,
                      n_init=10, max_no_improvement=10, verbose=0)
t0 = time.time()
mbk.fit(X)
t_mini_batch = time.time() - t0

### Plot Result

In [6]:
colors = ['#4EACC5', '#FF9C34', '#4E9A06']

# We want to have the same colors for the same cluster from the
# MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per
# closest one.
k_means_cluster_centers = np.sort(k_means.cluster_centers_, axis=0)
mbk_means_cluster_centers = np.sort(mbk.cluster_centers_, axis=0)
k_means_labels = pairwise_distances_argmin(X, k_means_cluster_centers)
mbk_means_labels = pairwise_distances_argmin(X, mbk_means_cluster_centers)
order = pairwise_distances_argmin(k_means_cluster_centers,
                                  mbk_means_cluster_centers)

fig = tools.make_subplots(rows=1, cols=3,
                          print_grid=False,
                          subplot_titles=('KMeans<br>train time: %.2fs\ninertia: %f' %
                                          (t_mini_batch, mbk.inertia_),
                                          'MiniBatchKmeans<br>train time: %.2fs\ninertia: %f' %
                                          (t_mini_batch, mbk.inertia_), 
                                          'Difference'))

### K Means

In [7]:
for k, col in zip(range(n_clusters), colors):
    my_members = k_means_labels == k
    cluster_center = k_means_cluster_centers[k]
    kmeans1 = go.Scatter(x=X[my_members, 0], y=X[my_members, 1],
                         showlegend=False,
                         mode='markers', marker=dict(color=col, size=4))
    kmeans2 = go.Scatter(x=[cluster_center[0]], y=[cluster_center[1]],
                         showlegend=False,
                         mode='markers', marker=dict(color=col, size=14,
                                                    line=dict(color='black',
                                                              width=1)))
    fig.append_trace(kmeans1, 1, 1)
    fig.append_trace(kmeans2, 1, 1)
    
fig['layout']['xaxis1'].update(showticklabels=False, ticks='',
                               zeroline=False, showgrid=False)
fig['layout']['yaxis1'].update(showticklabels=False, ticks='',
                               zeroline=False, showgrid=False)

### MiniBatchKMeans

In [8]:
for k, col in zip(range(n_clusters), colors):
    my_members = mbk_means_labels == order[k]
    cluster_center = mbk_means_cluster_centers[order[k]]
    minibatchkmeans1 = go.Scatter(x=X[my_members, 0], y=X[my_members, 1],
                                 showlegend=False,
                                 mode='markers', marker=dict(color=col, size=4))
    minibatchkmeans2 = go.Scatter(x=[cluster_center[0]], y=[cluster_center[1]],
                                 showlegend=False,
                                 mode='markers', marker=dict(color=col, size=14,
                                                            line=dict(color='black',
                                                                      width=1)))
    fig.append_trace(minibatchkmeans1, 1, 2)
    fig.append_trace(minibatchkmeans2, 1, 2)

fig['layout']['xaxis2'].update(showticklabels=False, ticks='',
                               zeroline=False, showgrid=False)
fig['layout']['yaxis2'].update(showticklabels=False, ticks='',
                               zeroline=False, showgrid=False)

### Difference

In [9]:
# Initialise the different array to all False
different = (mbk_means_labels == 4)

for k in range(n_clusters):
    different += ((k_means_labels == k) != (mbk_means_labels == order[k]))

identic = np.logical_not(different)
difference1 = go.Scatter(x=X[identic, 0], y=X[identic, 1],
                         showlegend=False,
                         mode='markers', marker=dict(color='#bbbbbb', size=4))
        
difference2 = go.Scatter(x=X[different, 0], y=X[different, 1], 
                         showlegend=False,
                         mode='markers', marker=dict(color='magenta', size=4))

fig.append_trace(difference1, 1, 3)
fig.append_trace(difference2, 1, 3)

fig['layout']['xaxis3'].update(showticklabels=False, ticks='',
                               zeroline=False, showgrid=False)
fig['layout']['yaxis3'].update(showticklabels=False, ticks='',
                               zeroline=False, showgrid=False)

In [10]:
py.iplot(fig)


Looks like you used a newline character: '\n'.

Plotly uses a subset of HTML escape characters
to do things like newline (<br>), bold (<b></b>),
italics (<i></i>), etc. Your newline characters 
have been converted to '<br>' so they will show 
up right on your Plotly figure!



In [2]:
from IPython.display import display, HTML

display(HTML('<link href="//fonts.googleapis.com/css?family=Open+Sans:600,400,300,200|Inconsolata|Ubuntu+Mono:400,700" rel="stylesheet" type="text/css" />'))
display(HTML('<link rel="stylesheet" type="text/css" href="http://help.plot.ly/documentation/all_static/css/ipython-notebook-custom.css">'))

! pip install git+https://github.com/plotly/publisher.git --upgrade
import publisher
publisher.publish(
    'Comparison of the K-Means and MiniBatchKMeans.ipynb', 'scikit-learn/plot-mini-batch-kmeans/', 'Comparison of the K-Means and MiniBatchKMeans clustering algorithms | plotly',
    ' ',
    title = 'Comparison of the K-Means and MiniBatchKMeans clustering algorithms | plotly',
    name = 'Comparison of the K-Means and MiniBatchKMeans clustering algorithms',
    has_thumbnail='true', thumbnail='thumbnail/kmeans-vs-minibatch.jpg', 
    language='scikit-learn', page_type='example_index',
    display_as='clustering', order=11,
    ipynb= '~Diksha_Gabha/2783')

Collecting git+https://github.com/plotly/publisher.git
  Cloning https://github.com/plotly/publisher.git to /tmp/pip-nC92g9-build
Installing collected packages: publisher
  Running setup.py install for publisher ... [?25l- error
    Complete output from command /usr/bin/python -u -c "import setuptools, tokenize;__file__='/tmp/pip-nC92g9-build/setup.py';exec(compile(getattr(tokenize, 'open', open)(__file__).read().replace('\r\n', '\n'), __file__, 'exec'))" install --record /tmp/pip-tT99EH-record/install-record.txt --single-version-externally-managed --compile:
    running install
    running build
    running build_py
    creating build
    creating build/lib.linux-x86_64-2.7
    creating build/lib.linux-x86_64-2.7/publisher
    copying publisher/publisher.py -> build/lib.linux-x86_64-2.7/publisher
    copying publisher/__init__.py -> build/lib.linux-x86_64-2.7/publisher
    running install_lib
    creating /usr/local/lib/python2.7/dist-packages/publisher
    error: could not create 