"""Graphical example illustrating improvement of convergence of KMeans
when cluster centers are initialized by the KMeans++ algorithm.

In this example, 4 vertices of a rectangle are chosen: (0,0) (0,100) (10,0) (10,100).
There are 500 points normally distributed about each vertex.
Therefore, the ideal cluster centers for k=2 are the global minima, i.e. (5,0) (5,100).

Two figures are produced: one for KMeans seeded by KMeans++ and one for
KMeans with its default initialization. Each shows the four point clouds,
the cluster centers found, and a circle of the reported radius around
every center.

Written (W) 2014 Parijat Mazumdar
"""
from pylab import figure,clf,plot,linspace,pi,show
from numpy import array,ones,zeros,cos,sin,concatenate
from numpy.random import randn

from modshogun import *

k = 2      # number of clusters requested from KMeans
num = 500  # number of points sampled around each rectangle vertex

# Rectangle vertices and the marker style used to draw the cloud sampled
# around each of them (same order as the clouds are generated).
vertices = ((0., 0.), (10., 0.), (0., 100.), (10., 100.))
styles = ('rx', 'bx', 'gx', 'cx')


def sample_around(x0, y0):
    """Return a 2 x num array of points scattered around (x0, y0).

    The x coordinates are drawn with unit spread and the y coordinates
    with a spread of 10, matching the elongated shape of the rectangle.
    """
    cloud = concatenate((randn(1, num), 10. * randn(1, num)), 0)
    return cloud + array([[x0], [y0]])


def plot_clustering(title, clouds, centers, radii):
    """Draw the point clouds, the cluster centers and their radii.

    title   -- name of the figure window
    clouds  -- sequence of 2 x num arrays, one per rectangle vertex
    centers -- matrix of cluster centers, one column per cluster
    radii   -- radius reported for each cluster
    """
    figure(title)
    clf()
    # Successive plot() calls draw onto the same axes by default, so the
    # `hold` keyword (rejected by newer matplotlib releases) is not needed.
    for cloud, style in zip(clouds, styles):
        plot(cloud[0], cloud[1], style)

    plot(centers[0, :], centers[1, :], 'ko')
    t = linspace(0, 2 * pi, 100)
    for i in range(k):
        plot(radii[i] * cos(t) + centers[0, i],
             radii[i] * sin(t) + centers[1, i], 'k-')


clouds = [sample_around(x0, y0) for x0, y0 in vertices]

traindata = concatenate(clouds, 1)
feat_train = RealFeatures(traindata)
distance = EuclideanDistance(feat_train, feat_train)

# First run: centers initialized by KMeans++ (third constructor argument).
# The class exported by modshogun is spelled `KMeans`.
kmeans = KMeans(k, distance, True)
kmeans.train()
centerspp = kmeans.get_cluster_centers()
radipp = kmeans.get_radiuses()

# Second run: same machine with KMeans++ switched off.
# NOTE(review): the first run stores the KMeans++ result as the machine's
# initial centers; confirm that train() does not reuse them here, otherwise
# a fresh KMeans(k, distance) instance should be used for this comparison.
kmeans.set_use_kmeanspp(False)
kmeans.train()
centers = kmeans.get_cluster_centers()
radi = kmeans.get_radiuses()

plot_clustering('KMeans with Kmeans++', clouds, centerspp, radipp)
plot_clustering('KMeans w/o KMeans++', clouds, centers, radi)

show()
+ float64_t chosen=CMath::random(0.0,sum); + while ((chosen-=dists[mu_next])>0) + mu_next++; + + mu_index[count]=mu_next; + } + + CDenseFeatures* lhs=(CDenseFeatures*)distance->get_lhs(); + int32_t dim=lhs->get_num_features(); + SGMatrix mat=SGMatrix(dim,k); + for (int32_t c_m=0;c_m feature=lhs->get_feature_vector(c_m); + for (int32_t r_m=0;r_muse KMeans++ false=>don't use KMeans++ + */ + void set_use_kmeanspp(bool kmpp); + + /* get use_kmeanspp attribute + * + * @return use_kmeanspp true=>use KMeans++ false=>don't use KMeans++ + */ + const bool get_use_kmeanspp(); + /** set fixed centers * * @param fixed true if fixed cluster centers are intended @@ -161,6 +174,12 @@ class CKMeans : public CDistanceMachine virtual bool train_require_labels() const { return false; } + /** kmeans++ algorithm to initialize cluster centers + * + * @return initial cluster centers: matrix (k columns, dim rows) + */ + SGMatrix kmeanspp(); + private: void init(); void set_random_centers(float64_t* weights_set, int32_t* ClList, int32_t XSize); @@ -186,7 +205,9 @@ class CKMeans : public CDistanceMachine ///initial centers supplied SGMatrix mus_initial; - + + ///flag to check if kmeans++ has to be used + bool use_kmeanspp; private: /* temp variable for cluster centers */ SGMatrix mus;