From 96539087fa78bbbc12181ec24ad6b2df105f6950 Mon Sep 17 00:00:00 2001 From: Parijat Mazumdar Date: Sun, 16 Feb 2014 21:13:50 +0530 Subject: [PATCH 1/2] added PCA+KMeans in KMeans notebook --- doc/ipython-notebooks/clustering/KMeans.ipynb | 392 ++++++++++++++++-- 1 file changed, 361 insertions(+), 31 deletions(-) diff --git a/doc/ipython-notebooks/clustering/KMeans.ipynb b/doc/ipython-notebooks/clustering/KMeans.ipynb index b3888d6efae..2d9f4e712cc 100644 --- a/doc/ipython-notebooks/clustering/KMeans.ipynb +++ b/doc/ipython-notebooks/clustering/KMeans.ipynb @@ -585,7 +585,7 @@ "cell_type": "code", "collapsed": false, "input": [ - "f = open('../../../data/iris.data')\n", + "f = open('../../../data/multiclass/iris.data')\n", "features = []\n", "# read data from file\n", "for line in f:\n", @@ -600,11 +600,11 @@ "# plot the data\n", "figure,axis = pyplot.subplots(1,1)\n", "# First 50 data belong to Iris Sentosa, plotted in green\n", - "axis.plot(obsmatrix[2,0:49], obsmatrix[3,0:49], 'o', color='green', markersize=5)\n", + "axis.plot(obsmatrix[2,0:50], obsmatrix[3,0:50], 'o', color='green', markersize=5)\n", "# Next 50 data belong to Iris Versicolour, plotted in red\n", - "axis.plot(obsmatrix[2,50:99], obsmatrix[3,50:99], 'o', color='red', markersize=5)\n", + "axis.plot(obsmatrix[2,50:100], obsmatrix[3,50:100], 'o', color='red', markersize=5)\n", "# Last 50 data belong to Iris Virginica, plotted in blue\n", - "axis.plot(obsmatrix[2,100:149], obsmatrix[3,100:149], 'o', color='blue', markersize=5)\n", + "axis.plot(obsmatrix[2,100:150], obsmatrix[3,100:150], 'o', color='blue', markersize=5)\n", "axis.set_xlim(-1,8)\n", "axis.set_ylim(-1,3)\n", "axis.set_title('3 varieties of Iris plants')\n", @@ -625,30 +625,34 @@ "cell_type": "code", "collapsed": false, "input": [ - "# wrap to Shogun features\n", - "train_features = RealFeatures(obsmatrix)\n", + "def apply_kmeans_iris(data):\n", + " # wrap to Shogun features\n", + " train_features = RealFeatures(data)\n", "\n", - "# number of cluster centers = 3\n", - "k = 3\n", + " # number of cluster centers = 3\n", + " k = 3\n", "\n", - "# distance function features - euclidean\n", - "distance = EuclideanDistance(train_features, train_features)\n", + " # distance function features - euclidean\n", + " distance = EuclideanDistance(train_features, train_features)\n", "\n", - "# initialize KMeans object\n", - "kmeans = KMeans(k, distance)\n", + " # initialize KMeans object\n", + " kmeans = KMeans(k, distance)\n", "\n", - "# use kmeans++ to initialize centers [play around: change it to False and compare results]\n", - "kmeans.set_use_kmeanspp(True)\n", + " # use kmeans++ to initialize centers [play around: change it to False and compare results]\n", + " kmeans.set_use_kmeanspp(True)\n", "\n", - "# training method is Lloyd by default [play around: change it to mini-batch by uncommenting the following lines]\n", - "#kmeans.set_train_method(KMM_MINI_BATCH)\n", - "#kmeans.set_mbKMeans_params(20,30)\n", + " # training method is Lloyd by default [play around: change it to mini-batch by uncommenting the following lines]\n", + " #kmeans.set_train_method(KMM_MINI_BATCH)\n", + " #kmeans.set_mbKMeans_params(20,30)\n", "\n", - "# training kmeans\n", - "kmeans.train(train_features)\n", + " # training kmeans\n", + " kmeans.train(train_features)\n", "\n", - "# labels for data points\n", - "result = kmeans.apply()" + " # labels for data points\n", + " result = kmeans.apply()\n", + " return result\n", + "\n", + "result = apply_kmeans_iris(obsmatrix)" ], "language": "python", 
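The refactored cell above keeps k-means++ seeding switched on via kmeans.set_use_kmeanspp(True). For readers who want to see what that initialization step actually does, here is a minimal NumPy-only sketch of k-means++ seeding. It is an illustration of the algorithm rather than Shogun's implementation; the helper name kmeanspp_init is hypothetical, and the (n_features, n_samples) column-major layout is chosen only to match obsmatrix.

    import numpy as np

    def kmeanspp_init(data, k, seed=None):
        # data: (n_features, n_samples) matrix, column-major like obsmatrix
        # returns a (n_features, k) matrix of initial cluster centers
        rng = np.random.RandomState(seed)
        n_samples = data.shape[1]
        # first center: a uniformly random data point
        centers = [data[:, rng.randint(n_samples)]]
        while len(centers) < k:
            # squared distance from every point to its nearest chosen center
            d2 = np.min([((data - c[:, np.newaxis]) ** 2).sum(axis=0) for c in centers], axis=0)
            # pick the next center with probability proportional to that distance
            centers.append(data[:, rng.choice(n_samples, p=d2 / d2.sum())])
        return np.column_stack(centers)

Spreading the initial centers out in proportion to squared distance is what usually lets Lloyd's iterations land in a better local optimum than purely random initialization; flipping set_use_kmeanspp to False in the cell above makes for an easy comparison.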
"metadata": {}, @@ -719,18 +723,24 @@ "collapsed": false, "input": [ "from numpy import nonzero\n", - "# shogun object for clustering accuracy\n", - "AccuracyEval = ClusteringAccuracy()\n", "\n", - "# changes the labels of result (keeping clusters intact) to produce a best match with ground truth\n", - "AccuracyEval.best_map(result, ground_truth)\n", + "def analyzeResult(result): \n", + " # shogun object for clustering accuracy\n", + " AccuracyEval = ClusteringAccuracy()\n", "\n", - "# evaluates clustering accuracy\n", - "print 'Accuracy : ' + str(AccuracyEval.evaluate(result, ground_truth))\n", + " # changes the labels of result (keeping clusters intact) to produce a best match with ground truth\n", + " AccuracyEval.best_map(result, ground_truth)\n", "\n", - "# find out which sample points differ from actual labels (or ground truth)\n", - "compare = result.get_labels()-labels\n", - "diff = nonzero(compare)\n", + " # evaluates clustering accuracy\n", + " accuracy = AccuracyEval.evaluate(result, ground_truth)\n", + "\n", + " # find out which sample points differ from actual labels (or ground truth)\n", + " compare = result.get_labels()-labels\n", + " diff = nonzero(compare)\n", + " return (diff,accuracy)\n", + "\n", + "(diff,accuracy_4d) = analyzeResult(result)\n", + "print 'Accuracy : ' + str(accuracy_4d)\n", "\n", "# plot the difference between ground truth and predicted clusters\n", "figure,axis = pyplot.subplots(1,1)\n", @@ -749,7 +759,327 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In the above plot, wrongly clustered data points are marked in red. We see that the Iris Sentosa plants are perfectly clustered without error. The Iris Versicolour plants and Iris Virginica plants are also clustered with high accuracy, but there are some plant samples of either class that have been clustered with the wrong class. This happens near the boundary of the 2 classes in the plot and was well expected. " + "In the above plot, wrongly clustered data points are marked in red. We see that the Iris Sentosa plants are perfectly clustered without error. The Iris Versicolour plants and Iris Virginica plants are also clustered with high accuracy, but there are some plant samples of either class that have been clustered with the wrong class. This happens near the boundary of the 2 classes in the plot and was well expected. Having mastered KMeans, it's time to move on to next interesting topic. " + ] + }, + { + "cell_type": "heading", + "level": 2, + "metadata": {}, + "source": [ + "PCA as a preprocessor to KMeans" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "KMeans is highly affected by the curse of dimensionality. So, dimension reduction becomes an important preprocessing step. Shogun offers a variety of dimension reduction techniques to choose from. Since our data is not very high dimensional, PCA is a good choice for dimension reduction. We have already seen the accuracy of KMeans when all four dimensions are used. In the following exercise we shall see how the accuracy varies as one chooses lower dimensions to represent data. 
" + ] + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "1-Dimensional representation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let us first apply PCA to reduce training features to 1 dimension" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from numpy import dot\n", + "def apply_pca_to_data(target_dims):\n", + " train_features = RealFeatures(obsmatrix)\n", + " submean = PruneVarSubMean(False)\n", + " submean.init(train_features)\n", + " submean.apply_to_feature_matrix(train_features)\n", + " preprocessor = PCA()\n", + " preprocessor.set_target_dim(target_dims)\n", + " preprocessor.init(train_features)\n", + " pca_transform = preprocessor.get_transformation_matrix()\n", + " new_features = dot(pca_transform.T, train_features)\n", + " return new_features\n", + "oneD_matrix = apply_pca_to_data(1)" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, let us get an idea of the data in 1-D by plotting it." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "figure,axis = pyplot.subplots(1,1)\n", + "# First 50 data belong to Iris Sentosa, plotted in green\n", + "axis.plot(oneD_matrix[0,0:50], zeros(50), 'o', color='green', markersize=5)\n", + "# Next 50 data belong to Iris Versicolour, plotted in red\n", + "axis.plot(oneD_matrix[0,50:100], zeros(50), 'o', color='red', markersize=5)\n", + "# Last 50 data belong to Iris Virginica, plotted in blue\n", + "axis.plot(oneD_matrix[0,100:150], zeros(50), 'o', color='blue', markersize=5)\n", + "axis.set_xlim(-5,5)\n", + "axis.set_ylim(-1,1)\n", + "axis.set_title('3 varieties of Iris plants')\n", + "pyplot.show()" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let us now apply KMeans to the 1-D data to get clusters." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "result = apply_kmeans_iris(oneD_matrix)" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have the results, the inevitable step is to check how good these results are. " + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "(diff,accuracy_1d) = analyzeResult(result)\n", + "print 'Accuracy : ' + str(accuracy_1d)\n", + "\n", + "# plot the difference between ground truth and predicted clusters\n", + "figure,axis = pyplot.subplots(1,1)\n", + "axis.plot(oneD_matrix[0,:],zeros(150),'x',color='black', markersize=5)\n", + "axis.plot(oneD_matrix[0,diff],zeros(len(diff)),'x',color='r', markersize=7)\n", + "axis.set_xlim(-5,5)\n", + "axis.set_ylim(-1,1)\n", + "axis.set_title('Difference')\n", + "pyplot.show()" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "2-Dimensional Representation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We follow the same steps as above and get the clustering accuracy." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "STEP 1 : Apply PCA and plot the data (plotting is optional)" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "twoD_matrix = apply_pca_to_data(2)\n", + "\n", + "figure,axis = pyplot.subplots(1,1)\n", + "# First 50 data belong to Iris Sentosa, plotted in green\n", + "axis.plot(twoD_matrix[0,0:50], twoD_matrix[1,0:50], 'o', color='green', markersize=5)\n", + "# Next 50 data belong to Iris Versicolour, plotted in red\n", + "axis.plot(twoD_matrix[0,50:100], twoD_matrix[1,50:100], 'o', color='red', markersize=5)\n", + "# Last 50 data belong to Iris Virginica, plotted in blue\n", + "axis.plot(twoD_matrix[0,100:150], twoD_matrix[1,100:150], 'o', color='blue', markersize=5)\n", + "axis.set_title('3 varieties of Iris plants')\n", + "pyplot.show()" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "STEP 2 : Apply KMeans to obtain clusters" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "result = apply_kmeans_iris(twoD_matrix)" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "STEP 3: Get the accuracy of the results" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "(diff,accuracy_2d) = analyzeResult(result)\n", + "print 'Accuracy : ' + str(accuracy_2d)\n", + "\n", + "# plot the difference between ground truth and predicted clusters\n", + "figure,axis = pyplot.subplots(1,1)\n", + "axis.plot(twoD_matrix[0,:],twoD_matrix[1,:],'x',color='black', markersize=5)\n", + "axis.plot(twoD_matrix[0,diff],twoD_matrix[1,diff],'x',color='r', markersize=7)\n", + "axis.set_title('Difference')\n", + "pyplot.show()" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "3-Dimensional Representation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Again, we follow the same steps, but skip plotting data (because plotting 3-D is not possible)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "STEP 1: Apply PCA to data" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "threeD_matrix = apply_pca_to_data(3)" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "STEP 2: Apply KMeans to 3-D representation of data" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "result = apply_kmeans_iris(threeD_matrix)" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "STEP 3: Get accuracy of results. In this step, the 'difference' plot positions data points based petal length \n", + " and petal width in the original data. This will enable us to visually campare these results with that of KMeans applied\n", + " to 4-Dimensional data (ie. 
our first result on Iris dataset)" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "(diff,accuracy_3d) = analyzeResult(result)\n", + "print 'Accuracy : ' + str(accuracy_3d)\n", + "\n", + "# plot the difference between ground truth and predicted clusters\n", + "figure,axis = pyplot.subplots(1,1)\n", + "axis.plot(obsmatrix[2,:],obsmatrix[3,:],'x',color='black', markersize=5)\n", + "axis.plot(obsmatrix[2,diff],obsmatrix[3,diff],'x',color='r', markersize=7)\n", + "axis.set_title('Difference')\n", + "axis.set_xlim(-1,8)\n", + "axis.set_ylim(-1,3)\n", + "pyplot.show()" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, let us plot clustering accuracy vs. number of dimensions to consolidate our results." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from scipy.interpolate import interp1d\n", + "from numpy import linspace\n", + "x = array([1, 2, 3, 4])\n", + "y = array([accuracy_1d, accuracy_2d, accuracy_3d, accuracy_4d])\n", + "f = interp1d(x, y)\n", + "xnew = linspace(1,4,10)\n", + "pyplot.plot(x,y,'o',xnew,f(xnew),'-')\n", + "pyplot.xlim([0,5])\n", + "pyplot.xlabel('no. of dims')\n", + "pyplot.ylabel('Clustering Accuracy')\n", + "pyplot.title('PCA Results')\n", + "pyplot.show()" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above plot is not very intuitive. The accuracy obtained by using just one latent dimension is much more than that obtained by taking all four features features. This shows the importance of PCA. Not only does it reduce the complexity of running KMeans, it also enhances results." ] }, { From d2b1a47d7cc3113dc46ba1069a8a66daf3983e1f Mon Sep 17 00:00:00 2001 From: Parijat Mazumdar Date: Tue, 18 Feb 2014 12:06:07 +0530 Subject: [PATCH 2/2] minor changes in PCA in KMeans notebook --- doc/ipython-notebooks/clustering/KMeans.ipynb | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/doc/ipython-notebooks/clustering/KMeans.ipynb b/doc/ipython-notebooks/clustering/KMeans.ipynb index 2d9f4e712cc..c85cdaf68cf 100644 --- a/doc/ipython-notebooks/clustering/KMeans.ipynb +++ b/doc/ipython-notebooks/clustering/KMeans.ipynb @@ -774,7 +774,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "KMeans is highly affected by the curse of dimensionality. So, dimension reduction becomes an important preprocessing step. Shogun offers a variety of dimension reduction techniques to choose from. Since our data is not very high dimensional, PCA is a good choice for dimension reduction. We have already seen the accuracy of KMeans when all four dimensions are used. In the following exercise we shall see how the accuracy varies as one chooses lower dimensions to represent data. " + "KMeans is highly affected by the curse of dimensionality. So, dimension reduction becomes an important preprocessing step. Shogun offers a variety of [dimension reduction techniques](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CDimensionReductionPreprocessor.html) to choose from. Since our data is not very high dimensional, PCA is a good choice for dimension reduction. We have already seen the accuracy of KMeans when all four dimensions are used. In the following exercise we shall see how the accuracy varies as one chooses lower dimensions to represent data. 
" ] }, { @@ -797,6 +797,7 @@ "collapsed": false, "input": [ "from numpy import dot\n", + "\n", "def apply_pca_to_data(target_dims):\n", " train_features = RealFeatures(obsmatrix)\n", " submean = PruneVarSubMean(False)\n", @@ -808,6 +809,7 @@ " pca_transform = preprocessor.get_transformation_matrix()\n", " new_features = dot(pca_transform.T, train_features)\n", " return new_features\n", + "\n", "oneD_matrix = apply_pca_to_data(1)" ], "language": "python", @@ -981,7 +983,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Again, we follow the same steps, but skip plotting data (because plotting 3-D is not possible)." + "Again, we follow the same steps, but skip plotting data." ] }, { @@ -1023,7 +1025,7 @@ "metadata": {}, "source": [ "STEP 3: Get accuracy of results. In this step, the 'difference' plot positions data points based petal length \n", - " and petal width in the original data. This will enable us to visually campare these results with that of KMeans applied\n", + " and petal width in the original data. This will enable us to visually compare these results with that of KMeans applied\n", " to 4-Dimensional data (ie. our first result on Iris dataset)" ] }, @@ -1060,6 +1062,7 @@ "input": [ "from scipy.interpolate import interp1d\n", "from numpy import linspace\n", + "\n", "x = array([1, 2, 3, 4])\n", "y = array([accuracy_1d, accuracy_2d, accuracy_3d, accuracy_4d])\n", "f = interp1d(x, y)\n", @@ -1079,7 +1082,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The above plot is not very intuitive. The accuracy obtained by using just one latent dimension is much more than that obtained by taking all four features features. This shows the importance of PCA. Not only does it reduce the complexity of running KMeans, it also enhances results." + "The above plot is not very intuitive theoretically. The accuracy obtained by using just one latent dimension is much more than that obtained by taking all four features features. A plausible explanation could be that the mixing of data points from Iris Versicolour and Iris Virginica is least along the single principal dimension chosen by PCA. Additional dimensions only aggrevate this inter-mixing, thus resulting in poorer clustering accuracy. While there could be other explanations to the observed results, our small experiment has successfully highlighted the importance of PCA. Not only does it reduce the complexity of running KMeans, it also enhances results at times." ] }, {