From d2b08e224cdc43dc8b174e8b6fbf6e17bec8dea0 Mon Sep 17 00:00:00 2001
From: Parijat Mazumdar
Date: Tue, 4 Feb 2014 00:49:11 +0530
Subject: [PATCH] KMeans notebook added

---
 doc/ipython-notebooks/clustering/KMeans.ipynb | 614 ++++++++++++++++++
 1 file changed, 614 insertions(+)
 create mode 100644 doc/ipython-notebooks/clustering/KMeans.ipynb

diff --git a/doc/ipython-notebooks/clustering/KMeans.ipynb b/doc/ipython-notebooks/clustering/KMeans.ipynb
new file mode 100644
index 00000000000..bd480e391f8
--- /dev/null
+++ b/doc/ipython-notebooks/clustering/KMeans.ipynb
@@ -0,0 +1,614 @@
+{
+ "metadata": {
+  "name": ""
+ },
+ "nbformat": 3,
+ "nbformat_minor": 0,
+ "worksheets": [
+  {
+   "cells": [
+    {
+     "cell_type": "heading",
+     "level": 1,
+     "metadata": {},
+     "source": [
+      "Clustering with KMeans in Shogun Machine Learning Toolbox"
+     ]
+    },
+    {
+     "cell_type": "heading",
+     "level": 4,
+     "metadata": {},
+     "source": [
+      "Notebook by Parijat Mazumdar (GitHub ID: mazumdarparijat)"
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "In this notebook we demonstrate how the Shogun Machine Learning Toolbox can be used for clustering with KMeans. In particular, we discuss the various options and choices that the KMeans implementation in Shogun offers its users."
+     ]
+    },
+    {
+     "cell_type": "heading",
+     "level": 2,
+     "metadata": {},
+     "source": [
+      "KMeans - An Overview"
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "The KMeans clustering algorithm partitions a set of n observations into k clusters. Each cluster is represented by the mean of the observation vectors assigned to it, and every observation carries the label of the cluster it belongs to. The algorithm thus takes the parameter k and an observation matrix (along with a distance metric over the observations) as input, and returns the mean of each of the k clusters together with labels indicating the cluster membership of each observation. Formally, KMeans chooses the cluster means $\\mu_1,\\ldots,\\mu_k$ so as to minimize the within-cluster sum of squared distances [2]:\n",
+      "\n",
+      "$$\\min_{\\mu_1,\\ldots,\\mu_k}\\sum_{j=1}^{k}\\sum_{x_i \\in C_j}\\Vert x_i-\\mu_j\\Vert^2$$\n",
+      "\n",
+      "where $C_j$ is the set of observations assigned to mean $\\mu_j$. Let us construct a simple example to understand how this is done in Shogun using the CKMeans class."
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Let us start by creating a toy dataset."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "from numpy import concatenate, array\n",
+      "from numpy.random import randn\n",
+      "\n",
+      "num=200\n",
+      "d1 = concatenate((randn(1,num),10.*randn(1,num)),0)\n",
+      "d2 = concatenate((randn(1,num),10.*randn(1,num)),0)+array([[10.],[0.]])\n",
+      "d3 = concatenate((randn(1,num),10.*randn(1,num)),0)+array([[0.],[100.]])\n",
+      "d4 = concatenate((randn(1,num),10.*randn(1,num)),0)+array([[10.],[100.]])\n",
+      "\n",
+      "rectangle = concatenate((d1,d2,d3,d4),1)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "The toy data created above consists of 4 Gaussian blobs, having 200 points each, centered around the vertices of a rectangle."
+     ]
+    },
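+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "As a quick check of the data layout (Shogun's RealFeatures expect one observation per column), let us print the shape of the data matrix:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# expected shape: (2, 800), i.e. 2 dimensions and 4x200 points\n",
+      "print rectangle.shape"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },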
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Let us plot the data for convenience:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "import matplotlib.pyplot as pyplot\n",
+      "%matplotlib inline\n",
+      "\n",
+      "figure,axis = pyplot.subplots(1,1)\n",
+      "axis.plot(rectangle[0], rectangle[1], 'o', color='red', markersize=3)\n",
+      "axis.set_xlim(-5,15)\n",
+      "axis.set_ylim(-50,150)\n",
+      "axis.set_title('Toy data : Rectangle')\n",
+      "pyplot.show()"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "With the data at our disposal, it is time to apply KMeans to it using the KMeans class in Shogun. First, we construct Shogun features from our data:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "from modshogun import *\n",
+      "\n",
+      "train_features = RealFeatures(rectangle)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Next, we specify the number of clusters we want and create a distance object specifying the distance metric to be used over our data for KMeans training:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# number of clusters\n",
+      "k = 2\n",
+      "\n",
+      "# distance metric over feature matrix - Euclidean distance\n",
+      "distance = EuclideanDistance(train_features, train_features)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Next, we create a KMeans object with our desired inputs/parameters and train:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# KMeans object created\n",
+      "kmeans = KMeans(k, distance)\n",
+      "\n",
+      "# KMeans training\n",
+      "kmeans.train()"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Now that training has been done, let us get the cluster centers and the label for each data point:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# cluster centers\n",
+      "centers = kmeans.get_cluster_centers()\n",
+      "\n",
+      "# labels for data points\n",
+      "result = kmeans.apply()"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Finally, let us plot the centers and the data points (in different colours for different clusters):"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "figure,axis = pyplot.subplots(1,1)\n",
+      "for i in xrange(800):\n",
+      "    if (result[i]==0.0):\n",
+      "        axis.plot(rectangle[0,i], rectangle[1,i], 'o', color='green', markersize=3)\n",
+      "    else:\n",
+      "        axis.plot(rectangle[0,i], rectangle[1,i], 'o', color='blue', markersize=3)\n",
+      "# plot the cluster centers once, outside the loop\n",
+      "axis.plot(centers[0,0], centers[1,0], 'x', color='green', markersize=10)\n",
+      "axis.plot(centers[0,1], centers[1,1], 'x', color='blue', markersize=10)\n",
+      "axis.set_xlim(-5,15)\n",
+      "axis.set_ylim(-50,150)\n",
+      "axis.set_title('KMeans Results')\n",
+      "pyplot.show()"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
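+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "As an optional sanity check (an addition to the basic flow; it assumes the labels object returned by apply() exposes get_labels(), yielding a numpy array), let us count how many points fall in each cluster. Each cluster should hold roughly half of the 800 points:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "from numpy import bincount\n",
+      "\n",
+      "# number of points assigned to each cluster label\n",
+      "print bincount(result.get_labels().astype(int))"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },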
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Now that we have worked through a simple KMeans example, it is time to understand certain specifics of the KMeans implementation and the options that Shogun provides to its users."
+     ]
+    },
+    {
+     "cell_type": "heading",
+     "level": 2,
+     "metadata": {},
+     "source": [
+      "Initialization of cluster centers"
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "The KMeans algorithm requires the cluster centers to be initialized with some values. Shogun offers 3 ways to do this: initialization by hand, initialization using the KMeans++ algorithm, and random initialization. Unless the user supplies initial centers or tells Shogun to use KMeans++, random initialization is the default method for cluster center initialization. This was precisely the case in the example discussed above."
+     ]
+    },
+    {
+     "cell_type": "heading",
+     "level": 3,
+     "metadata": {},
+     "source": [
+      "Initialization by hand"
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "There are 2 ways to initialize centers by hand. One way is to pass the centers during KMeans object creation, as follows:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "from numpy import array\n",
+      "initial_centers = array([[0.,10.],[50.,50.]])\n",
+      "\n",
+      "# initial centers passed\n",
+      "kmeans = KMeans(k, distance, initial_centers)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Now, let us get results by repeating the rest of the steps:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# KMeans training\n",
+      "kmeans.train(train_features)\n",
+      "\n",
+      "# cluster centers\n",
+      "centers = kmeans.get_cluster_centers()\n",
+      "\n",
+      "# labels for data points\n",
+      "result = kmeans.apply()\n",
+      "\n",
+      "figure,axis = pyplot.subplots(1,1)\n",
+      "for i in xrange(800):\n",
+      "    if (result[i]==0.0):\n",
+      "        axis.plot(rectangle[0,i], rectangle[1,i], 'o', color='green', markersize=3)\n",
+      "    else:\n",
+      "        axis.plot(rectangle[0,i], rectangle[1,i], 'o', color='blue', markersize=3)\n",
+      "# plot the cluster centers once, outside the loop\n",
+      "axis.plot(centers[0,0], centers[1,0], 'x', color='green', markersize=10)\n",
+      "axis.plot(centers[0,1], centers[1,1], 'x', color='blue', markersize=10)\n",
+      "axis.set_xlim(-5,15)\n",
+      "axis.set_ylim(-50,150)\n",
+      "axis.set_title('Hand initialized KMeans Results 1')\n",
+      "pyplot.show()"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "The other way to initialize centers by hand is as follows:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "new_initial_centers = array([[5.,5.],[0.,100.]])\n",
+      "\n",
+      "# set new initial centers\n",
+      "kmeans.set_initial_centers(new_initial_centers)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
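+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "A note on the layout (our reading of the convention, consistent with the feature matrix above): each column of the centers matrix is one center, so the matrix has shape dimensions x k. The columns of new_initial_centers are thus the points (5, 0) and (5, 100):"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# one center per column\n",
+      "print new_initial_centers[:,0]\n",
+      "print new_initial_centers[:,1]"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },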
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Let us complete the rest of the code to get results:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# KMeans training\n",
+      "kmeans.train(train_features)\n",
+      "\n",
+      "# cluster centers\n",
+      "centers = kmeans.get_cluster_centers()\n",
+      "\n",
+      "# labels for data points\n",
+      "result = kmeans.apply()\n",
+      "\n",
+      "figure,axis = pyplot.subplots(1,1)\n",
+      "for i in xrange(800):\n",
+      "    if (result[i]==0.0):\n",
+      "        axis.plot(rectangle[0,i], rectangle[1,i], 'o', color='green', markersize=3)\n",
+      "    else:\n",
+      "        axis.plot(rectangle[0,i], rectangle[1,i], 'o', color='blue', markersize=3)\n",
+      "# plot the cluster centers once, outside the loop\n",
+      "axis.plot(centers[0,0], centers[1,0], 'x', color='green', markersize=10)\n",
+      "axis.plot(centers[0,1], centers[1,1], 'x', color='blue', markersize=10)\n",
+      "axis.set_xlim(-5,15)\n",
+      "axis.set_ylim(-50,150)\n",
+      "axis.set_title('Hand initialized KMeans Results 2')\n",
+      "pyplot.show()"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "heading",
+     "level": 3,
+     "metadata": {},
+     "source": [
+      "Initialization using the KMeans++ algorithm"
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "In Shogun, a user can also use the KMeans++ algorithm for center initialization. KMeans++ is beneficial because it typically reduces the total number of iterations KMeans needs, and the final centers are more likely to be close to the global optimum of the clustering objective, which is often not the case with random initialization. The strategy picks the first center uniformly at random from the data and draws each subsequent center with probability proportional to its squared distance from the centers chosen so far, as sketched below."
+     ]
+    },
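+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "To build intuition, here is a minimal numpy sketch of this seeding strategy (an illustration only: Shogun performs the seeding internally when KMeans++ is enabled, and the function name kmeanspp_seed is ours, not part of Shogun's API). It returns k starting centers in the dimensions x k layout:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "import numpy as np\n",
+      "\n",
+      "def kmeanspp_seed(X, k, rng=np.random):\n",
+      "    # X is an (n, d) matrix with one observation per row\n",
+      "    centers = [X[rng.randint(X.shape[0])]]  # first center: uniform at random\n",
+      "    for _ in range(k - 1):\n",
+      "        # squared distance of every point to its nearest chosen center\n",
+      "        d2 = np.min([((X - c) ** 2).sum(axis=1) for c in centers], axis=0)\n",
+      "        # draw the next center with probability proportional to d2\n",
+      "        centers.append(X[rng.choice(X.shape[0], p=d2 / d2.sum())])\n",
+      "    return np.array(centers).T  # back to one center per column\n",
+      "\n",
+      "print kmeanspp_seed(rectangle.T, k)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },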
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "One of the ways to use KMeans++ in Shogun is to set the corresponding flag to true during KMeans object creation, as follows:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# set flag for using KMeans++\n",
+      "kmeans = KMeans(k, distance, True)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "The other way to initialize using KMeans++ is as follows:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# set KMeans++ flag\n",
+      "kmeans.set_use_kmeanspp(True)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Completing the rest of the steps to get the result:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# KMeans training\n",
+      "kmeans.train(train_features)\n",
+      "\n",
+      "# cluster centers\n",
+      "centers = kmeans.get_cluster_centers()\n",
+      "\n",
+      "# labels for data points\n",
+      "result = kmeans.apply()\n",
+      "\n",
+      "figure,axis = pyplot.subplots(1,1)\n",
+      "for i in xrange(800):\n",
+      "    if (result[i]==0.0):\n",
+      "        axis.plot(rectangle[0,i], rectangle[1,i], 'o', color='green', markersize=3)\n",
+      "    else:\n",
+      "        axis.plot(rectangle[0,i], rectangle[1,i], 'o', color='blue', markersize=3)\n",
+      "# plot the cluster centers once, outside the loop\n",
+      "axis.plot(centers[0,0], centers[1,0], 'x', color='green', markersize=10)\n",
+      "axis.plot(centers[0,1], centers[1,1], 'x', color='blue', markersize=10)\n",
+      "axis.set_xlim(-5,15)\n",
+      "axis.set_ylim(-50,150)\n",
+      "axis.set_title('KMeans with KMeans++ Results')\n",
+      "pyplot.show()"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "To switch back to random initialization, you may use:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# unset KMeans++ flag\n",
+      "kmeans.set_use_kmeanspp(False)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "heading",
+     "level": 2,
+     "metadata": {},
+     "source": [
+      "Training Methods"
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Shogun offers 2 training methods for KMeans clustering: the classical Lloyd's method and mini-batch KMeans [1]. Lloyd's method is used by default unless the user switches to the mini-batch method."
+     ]
+    },
+    {
+     "cell_type": "heading",
+     "level": 3,
+     "metadata": {},
+     "source": [
+      "Mini-Batch KMeans"
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Mini-batch KMeans is very useful for extremely large datasets and/or very high-dimensional data, which is often the case in text mining. Instead of visiting the whole dataset in every iteration, it updates the centers using small random batches of points [1]. One can switch to mini-batch KMeans training while creating the KMeans object, as follows:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# set training method to mini-batch\n",
+      "kmeans = KMeans(k, distance, KMM_MINI_BATCH)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "One can also switch to mini-batch KMeans by making use of the following method:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# set training method to mini-batch\n",
+      "kmeans.set_train_method(KMM_MINI_BATCH)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "In mini-batch KMeans it is compulsory to set the batch size and the number of iterations. These parameters can be set together or one after the other."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# set both parameters together: batch size 2 and number of iterations 100\n",
+      "kmeans.set_mbKMeans_params(2,100)\n",
+      "\n",
+      "# OR\n",
+      "\n",
+      "# set batch size to 2\n",
+      "kmeans.set_mbKMeans_batch_size(2)\n",
+      "\n",
+      "# set number of iterations to 100\n",
+      "kmeans.set_mbKMeans_iter(100)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Completing the code to get results:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# KMeans training\n",
+      "kmeans.train(train_features)\n",
+      "\n",
+      "# cluster centers\n",
+      "centers = kmeans.get_cluster_centers()\n",
+      "\n",
+      "# labels for data points\n",
+      "result = kmeans.apply()\n",
+      "\n",
+      "figure,axis = pyplot.subplots(1,1)\n",
+      "for i in xrange(800):\n",
+      "    if (result[i]==0.0):\n",
+      "        axis.plot(rectangle[0,i], rectangle[1,i], 'o', color='green', markersize=3)\n",
+      "    else:\n",
+      "        axis.plot(rectangle[0,i], rectangle[1,i], 'o', color='blue', markersize=3)\n",
+      "# plot the cluster centers once, outside the loop\n",
+      "axis.plot(centers[0,0], centers[1,0], 'x', color='green', markersize=10)\n",
+      "axis.plot(centers[0,1], centers[1,1], 'x', color='blue', markersize=10)\n",
+      "axis.set_xlim(-5,15)\n",
+      "axis.set_ylim(-50,150)\n",
+      "axis.set_title('Mini-batch KMeans Results')\n",
+      "pyplot.show()"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "One can switch back to Lloyd's KMeans in the following way:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# set training method back to Lloyd's\n",
+      "kmeans.set_train_method(KMM_LLOYD)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
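+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "For intuition about what mini-batch training does, here is a minimal numpy sketch of the per-center update rule from Sculley [1] (an illustration only, not Shogun's internal code; minibatch_step is our own name). Each step samples a small batch, assigns every sampled point to its nearest center, and moves that center toward the point with a per-center learning rate that decays as the center accumulates updates:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "import numpy as np\n",
+      "\n",
+      "def minibatch_step(X, centers, counts, batch_size, rng=np.random):\n",
+      "    # X: (n, d) data, centers: (k, d), counts: number of updates per center\n",
+      "    batch = X[rng.randint(0, X.shape[0], batch_size)]\n",
+      "    # index of the nearest center for every point in the batch\n",
+      "    d2 = ((batch[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2)\n",
+      "    nearest = d2.argmin(axis=1)\n",
+      "    for x, j in zip(batch, nearest):\n",
+      "        counts[j] += 1\n",
+      "        eta = 1.0 / counts[j]  # per-center learning rate\n",
+      "        centers[j] = (1.0 - eta) * centers[j] + eta * x\n",
+      "    return centers, counts\n",
+      "\n",
+      "# one update step, seeded from the hand-picked centers used earlier\n",
+      "c, n = minibatch_step(rectangle.T, new_initial_centers.T.copy(), np.zeros(k), 2)\n",
+      "print c.T  # centers back in the dimensions x k layout"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },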
+    {
+     "cell_type": "heading",
+     "level": 2,
+     "metadata": {},
+     "source": [
+      "References"
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "[1] D. Sculley. Web-scale k-means clustering. In Proceedings of the 19th International Conference on World Wide Web, pages 1177\u20131178. ACM, 2010.\n",
+      "\n",
+      "[2] C. M. Bishop. Pattern Recognition and Machine Learning. Springer, New York, 2006."
+     ]
+    }
+   ],
+   "metadata": {}
+  }
+ ]
+}
\ No newline at end of file