Skip to content

Commit

Permalink
Merge pull request #1817 from mazumdarparijat/new_kmeans
Browse files Browse the repository at this point in the history
kmeans++ added
  • Loading branch information
iglesias committed Jan 16, 2014
2 parents 944b0b8 + 05408fa commit c84dc0e
Show file tree
Hide file tree
Showing 3 changed files with 156 additions and 5 deletions.
61 changes: 61 additions & 0 deletions examples/undocumented/python_modular/graphical/cluster_kpp.py
@@ -0,0 +1,61 @@
"""Graphical example illustrating improvement of convergence of KMeans
when cluster centers are initialized by KMeans++ algorithm.
In this example, 4 vertices of a rectangle are chosen: (0,0) (0,100) (10,0) (10,100).
There are 500 points normally distributed about each vertex.
Therefore, the ideal cluster centers for k=2 are the global minima, i.e. (5,0) and (5,100).
Written (W) 2014 Parijat Mazumdar
"""
from pylab import figure,clf,plot,linspace,pi,show
from numpy import array,ones,zeros,cos,sin,concatenate
from numpy.random import randn

from modshogun import *

# Number of clusters to look for and number of points drawn around each vertex.
k=2
num=500

# Four point clouds around the rectangle vertices (0,0) (10,0) (0,100) (10,100).
# The second coordinate is scaled by 10, so each cloud is stretched along y.
d1=concatenate((randn(1,num),10.*randn(1,num)),0)
d2=concatenate((randn(1,num),10.*randn(1,num)),0)+array([[10.],[0.]])
d3=concatenate((randn(1,num),10.*randn(1,num)),0)+array([[0.],[100.]])
d4=concatenate((randn(1,num),10.*randn(1,num)),0)+array([[10.],[100.]])

# One column per data point.
traindata=concatenate((d1,d2,d3,d4),1)
feat_train=RealFeatures(traindata)
distance=EuclideanDistance(feat_train,feat_train)

# First run: centers initialized with KMeans++ (third constructor argument).
kmeans=KMeans(k, distance, True)
kmeans.train()
centerspp=kmeans.get_cluster_centers()
radipp=kmeans.get_radiuses()

# Second run on the same machine with KMeans++ initialization switched off.
kmeans.set_use_kmeanspp(False)
kmeans.train()
centers=kmeans.get_cluster_centers()
radi=kmeans.get_radiuses()

figure('KMeans with Kmeans++')
clf()
plot(d1[0],d1[1],'rx')
plot(d2[0],d2[1],'bx',hold=True)
plot(d3[0],d3[1],'gx',hold=True)
plot(d4[0],d4[1],'cx',hold=True)

# Mark each center and draw a circle of the cluster radius around it.
plot(centerspp[0,:], centerspp[1,:], 'ko',hold=True)
for i in range(k):
    t = linspace(0, 2*pi, 100)
    plot(radipp[i]*cos(t)+centerspp[0,i],radipp[i]*sin(t)+centerspp[1,i],'k-', hold=True)

figure('KMeans w/o KMeans++')
clf()
plot(d1[0],d1[1],'rx')
plot(d2[0],d2[1],'bx',hold=True)
plot(d3[0],d3[1],'gx',hold=True)
plot(d4[0],d4[1],'cx',hold=True)

# Mark each center and draw a circle of the cluster radius around it.
plot(centers[0,:], centers[1,:], 'ko',hold=True)
for i in range(k):
    t = linspace(0, 2*pi, 100)
    plot(radi[i]*cos(t)+centers[0,i],radi[i]*sin(t)+centers[1,i],'k-', hold=True)

show()
75 changes: 72 additions & 3 deletions src/shogun/clustering/KMeans.cpp
Expand Up @@ -30,12 +30,13 @@ CKMeans::CKMeans()
init();
}

/** Construct a KMeans machine for k clusters using the given distance.
 *
 * @param k_ number of clusters
 * @param d distance to cluster with
 * @param use_kmpp true to initialize the cluster centers with KMeans++
 */
CKMeans::CKMeans(int32_t k_, CDistance* d, bool use_kmpp)
: CDistanceMachine()
{
	/* init() resets every member to its default (including k and
	 * use_kmeanspp), so it has to run before the arguments are stored */
	init();
	k=k_;
	set_distance(d);
	use_kmeanspp=use_kmpp;
}

CKMeans::CKMeans(int32_t k_i, CDistance* d_i, SGMatrix<float64_t> centers_i)
Expand Down Expand Up @@ -224,6 +225,10 @@ bool CKMeans::train_machine(CFeatures* data)

ASSERT(XSize>0 && dimensions>0);

///if kmeans++ to be used
if (use_kmeanspp)
mus_initial=kmeanspp();

int32_t changed=1;
const int32_t XDimk=dimensions*k;
int32_t iter=0;
Expand Down Expand Up @@ -387,6 +392,15 @@ bool CKMeans::save(FILE* dstfile)
return false;
}

void CKMeans::set_use_kmeanspp(bool kmpp)
{
use_kmeanspp=kmpp;
}

const bool CKMeans::get_use_kmeanspp()
{
return use_kmeanspp;
}

void CKMeans::set_k(int32_t p_k)
{
Expand Down Expand Up @@ -454,13 +468,68 @@ void CKMeans::store_model_features()
SG_UNREF(rhs);
}

/** Choose k initial cluster centers with the KMeans++ seeding scheme.
 *
 * The first center is drawn uniformly from the lhs vectors of the distance.
 * Every further center is drawn with probability proportional to the squared
 * distance of a point to its nearest already chosen center.
 *
 * @return matrix with dim rows and k columns; column c holds the feature
 * vector of the c-th chosen center
 */
SGMatrix<float64_t> CKMeans::kmeanspp()
{
	int32_t num=distance->get_num_vec_lhs();
	/* drawing an index from [0,num-1] is meaningless without data */
	ASSERT(num>0);

	SGVector<float64_t> dists=SGVector<float64_t>(num);
	SGVector<int32_t> mu_index=SGVector<int32_t>(k);

	/* 1st center */
	int32_t mu_1=CMath::random((int32_t) 0,num-1);
	mu_index[0]=mu_1;

	/* choose a center - do k-1 times */
	int32_t count=0;
	while (++count<k)
	{
		float64_t sum=0.0;
		/* for each data point find distance to nearest already chosen center */
		for (int32_t point_idx=0;point_idx<num;point_idx++)
		{
			dists[point_idx]=distance->distance(mu_index[0],point_idx);
			int32_t cent_id=1;

			while (cent_id<count)
			{
				float64_t dist_temp=distance->distance(mu_index[cent_id],point_idx);
				if (dists[point_idx]>dist_temp)
					dists[point_idx]=dist_temp;
				cent_id++;
			}

			dists[point_idx]*=dists[point_idx];
			sum+=dists[point_idx];
		}

		/* random choosing - points weighted by square of distance from nearest center */
		int32_t mu_next=0;
		float64_t chosen=CMath::random(0.0,sum);
		/* stop at the last point at the latest: rounding in the running
		 * subtraction must not push the index past the end of dists */
		while (mu_next<num-1 && (chosen-=dists[mu_next])>0)
			mu_next++;

		mu_index[count]=mu_next;
	}

	CDenseFeatures<float64_t>* lhs=(CDenseFeatures<float64_t>*)distance->get_lhs();
	int32_t dim=lhs->get_num_features();
	SGMatrix<float64_t> mat=SGMatrix<float64_t>(dim,k);
	for (int32_t c_m=0;c_m<k;c_m++)
	{
		/* copy the sampled point, not simply the c_m-th data point */
		SGVector<float64_t> feature=lhs->get_feature_vector(mu_index[c_m]);
		for (int32_t r_m=0;r_m<dim;r_m++)
			mat(r_m,c_m)=feature[r_m];
	}
	SG_UNREF(lhs);
	return mat;
}

void CKMeans::init()
{
max_iter=10000;
k=3;
dimensions=0;
fixed_centers=false;

use_kmeanspp=false;
SG_ADD(&max_iter, "max_iter", "Maximum number of iterations", MS_AVAILABLE);
SG_ADD(&k, "k", "k, the number of clusters", MS_AVAILABLE);
SG_ADD(&dimensions, "dimensions", "Dimensions of data", MS_NOT_AVAILABLE);
Expand Down
25 changes: 23 additions & 2 deletions src/shogun/clustering/KMeans.h
Expand Up @@ -48,8 +48,9 @@ class CKMeans : public CDistanceMachine
*
* @param k parameter k
* @param d distance
* @param kmeanspp True for using KMeans++
*/
CKMeans(int32_t k, CDistance* d);
CKMeans(int32_t k, CDistance* d, bool kmeanspp=false);

/** constructor for supplying initial centers
* @param k_i parameter k
Expand Down Expand Up @@ -94,6 +95,18 @@ class CKMeans : public CDistanceMachine
*/
int32_t get_k();

/** set use_kmeanspp attribute
*
* @param kmpp true=>use KMeans++ false=>don't use KMeans++
*/
void set_use_kmeanspp(bool kmpp);

/** get use_kmeanspp attribute
*
* @return use_kmeanspp true=>use KMeans++ false=>don't use KMeans++
*/
const bool get_use_kmeanspp();

/** set fixed centers
*
* @param fixed true if fixed cluster centers are intended
Expand Down Expand Up @@ -161,6 +174,12 @@ class CKMeans : public CDistanceMachine

virtual bool train_require_labels() const { return false; }

/** kmeans++ algorithm to initialize cluster centers
*
* Reads the number of vectors and the features from the lhs of the
* distance, so the distance has to be set before calling this.
*
* @return initial cluster centers: matrix (k columns, dim rows)
*/
SGMatrix<float64_t> kmeanspp();

private:
void init();
void set_random_centers(float64_t* weights_set, int32_t* ClList, int32_t XSize);
Expand All @@ -186,7 +205,9 @@ class CKMeans : public CDistanceMachine

///initial centers supplied
SGMatrix<float64_t> mus_initial;


///flag to check if kmeans++ has to be used
bool use_kmeanspp;
private:
/* temp variable for cluster centers */
SGMatrix<float64_t> mus;
Expand Down

0 comments on commit c84dc0e

Please sign in to comment.