In [70]:
import numpy as np
from pandas import Series, DataFrame
import multiprocessing as mp
from multiprocessing import Pool, Value, Array


In [71]:
def dis(a,b):
    return np.sum((a[None,:] - b[:, None])**2, -1)**0.5

In [72]:
%load_ext cython

The cython extension is already loaded. To reload it, use:
  %reload_ext cython


In [73]:
%%cython -a
import numpy as np
import cython
def dis_cy(a,b):
    cdef int i, j
    cdef int m, n, l
    d=(a[None,:] - b[:, None])**2
    m,n,l=d.shape
    dis = np.zeros([m,n])
    for i in range(m):
        for j in range(n):
            dis[i,j]=(sum(d[i][j]))**0.5
    return dis

In [74]:
def cost(a,b):
    return sum(dis(a,b).min(axis=0))

In [75]:
%%cython -a
import cython
import numpy as np
from __main__ import dis_cy
def cost_cy(a,b):
#     d=np.sum((a[None,:] - b[:, None])**2, -1)**0.5
    d = dis_cy(a, b)
    return  sum(d.min(axis=0))

In [76]:
def modelcost(center,cluster):
    tot=[sum(dis(center[i],cluster[i])) for i in range(len(center))]
    return sum(tot)

In [77]:
%%cython -a
import cython
import numpy as np
from __main__ import dis_cy
def modelcost_cy(center,cluster):
    tot = [sum(dis_cy(center[i], cluster[i])) for i in range(len(center))]
#     tot=[sum(np.sum((center[i][None,:] - cluster[i][:, None])**2, -1)**0.5) for i in range(len(center))]
    return sum(tot)
    
   

In [91]:

def kmeans(data, ncenter, center, maxiter=1000, maxtol=10e-5):
    centroids=[center]
    i = 0
    diff=100
    while (diff>maxtol and i<maxiter):
        # assign data points to clusters
        
        index=np.argmin(dis(data,centroids[i]),axis=0)
        

        clusters=[data[index==i] for i in range(ncenter)]

        # recalculate centroids
        centroids.append(np.concatenate([np.mean(cluster,axis=0) for cluster in clusters]).reshape(ncenter,-1))
        diff=sum(np.sum((centroids[i+1]-centroids[i])**2,-1)**0.5)

        i+= 1
    return centroids, clusters,i

In [92]:
%%cython -a
import cython
import numpy as np
from __main__ import dis_cy, dis
@cython.boundscheck(False)
@cython.wraparound(False)

def kmeans_cy(data, ncenter, center, maxiter=1000, maxtol=10e-5):
    #m,center.shape
    centroids=[center]
    centroidsa=np.array(centroids)[0]
    i = 0
    diff=100
    while (diff>maxtol and i<maxiter):
        index=np.argmin(dis(data,centroids[i]),axis=0)
        clusters=[data[index==j] for j in range(ncenter)]
      
        temp = np.array([np.mean(cluster,axis=0) for cluster in clusters]).reshape(ncenter,-1)
       
        centroidsa=np.concatenate((centroidsa,temp),axis=1)
        
        centroids.append(temp)
        
     
        diff=sum(np.sum((centroids[i+1]-centroids[i])**2,-1)**0.5)
       
       
        i+= 1
    return centroids, clusters,i

In [80]:

def kpp(x,k,weight=1):
    index=[]
    index.append(np.random.choice(x.shape[0],1)[0])
    while(len(index)<k):
        
        prob=dis(x,x[index,]).min(axis=0)*weight/sum(dis(x,x[index,]).min(axis=0)*weight)  
        index.append(np.random.choice(x.shape[0],1,p=prob)[0])
       
    return x[index,]

In [81]:
%%cython -a
import cython
import numpy as np
def kpp_cy(x,k,weight=1):
    index=[]
    index.append(np.random.choice(x.shape[0],1)[0])
    while(len(index)<k):
        d=np.sum((x[None,:] - x[index,][:, None])**2, -1)**0.5
        prob=d.min(axis=0)*weight/sum(d.min(axis=0)*weight)  
        index.append(np.random.choice(x.shape[0],1,p=prob)[0])
       
    return x[index,]

In [82]:
def kII(x,k,l):
    index=set()
    index.add(np.random.choice(x.shape[0],1)[0])
    cost=sum(dis(x,x[list(index),]).min(axis=0))
    i=1
    while (i<np.log(cost)):
        prob=dis(x,x[list(index),]).min(axis=0)/sum(dis(x,x[list(index),]).min(axis=0)) 
        cp=np.random.choice(x.shape[0],l,p=prob,replace=True)
        index=index.union(set(cp))
        i=i+1
    center0=x[list(index),]
    w=np.sum(np.argsort(dis(center0,center0),axis=1)==1,axis=0)
    center1=kpp(center0,k,w)
    return center1

In [83]:
%%cython -a
import cython
import numpy as np
def kII_cy(x,k,l):
    index=set()
    index.add(np.random.choice(x.shape[0],1)[0])
    d=d=np.sum((x[None,:] - x[list(index),][:, None])**2, -1)**0.5
    cost=sum(d.min(axis=0))
    i=1
    while (i<np.log(cost)):
        prob=d.min(axis=0)/sum(d.min(axis=0)) 
        cp=np.random.choice(x.shape[0],l,p=prob,replace=True)
        index=index.union(set(cp))
        i=i+1
    center0=x[list(index),]
    dc=np.sum((center0[None,:] - center0[:, None])**2, -1)**0.5
    w=np.sum(np.argsort(dc,axis=1)==1,axis=0)
    indexpp=[]
    indexpp.append(np.random.choice(center0.shape[0],1)[0])
    while(len(indexpp)<k):
        d=np.sum((center0[None,:] - center0[indexpp,][:, None])**2, -1)**0.5
        prob=d.min(axis=0)*w/sum(d.min(axis=0)*w)  
        indexpp.append(np.random.choice(center0.shape[0],1,p=prob)[0])
        center1=center0[indexpp,]
    return center1

In [84]:

#simulation
def simudata(ncenter,nsize,dim):
    mean=[0]*dim
    cov=np.diag(np.random.choice([1,10,100],dim,replace=True))
    center=np.random.multivariate_normal(mean, cov, ncenter)
    s=np.random.multivariate_normal(center[0,], np.eye(dim),nsize)
    sample=np.concatenate((s, 0*np.ones(nsize)[:,None]), axis=1)
    for i in range(ncenter-1):
        s=np.random.multivariate_normal(center[i+1,], np.eye(dim),nsize)
        s=np.concatenate((s, (i+1)*np.ones(nsize)[:,None]), axis=1)
        sample=np.concatenate((sample,s),axis=0)
    return center, sample

In [85]:
%%cython -a
import cython
import numpy as np
def simudata_cy(ncenter,nsize,dim):
    mean=[0]*dim
    cov=np.diag(np.random.choice([1,10,100],dim,replace=True))
    center=np.random.multivariate_normal(mean, cov, ncenter)
    s=np.random.multivariate_normal(center[0,], np.eye(dim),nsize)
    sample=np.concatenate((s, 0*np.ones(nsize)[:,None]), axis=1)
    for i in range(ncenter-1):
        s=np.random.multivariate_normal(center[i+1,], np.eye(dim),nsize)
        s=np.concatenate((s, (i+1)*np.ones(nsize)[:,None]), axis=1)
        sample=np.concatenate((sample,s),axis=0)
    return center, sample

In [118]:
%%time
center,sample=simudata(20,1000,15)
sample
data=sample[:,0:15]
print(data.shape)


(20000, 15)
CPU times: user 27.2 ms, sys: 22.3 ms, total: 49.5 ms
Wall time: 47.9 ms


In [112]:
%%time
centercy,samplecy=simudata_cy(20,1000,15)
datacy=samplecy[:,0:15]
print(data.shape)


(20000, 15)
CPU times: user 27.6 ms, sys: 23.7 ms, total: 51.3 ms
Wall time: 49.9 ms


In [120]:
%%time
center0=kpp(sample[:,0:15],20)

CPU times: user 834 ms, sys: 597 ms, total: 1.43 s
Wall time: 1.47 s


In [121]:
%%time
center0cy=kpp_cy(sample[:,0:15],20)

CPU times: user 457 ms, sys: 470 ms, total: 927 ms
Wall time: 974 ms


In [123]:
%%time
center1=kII(sample[:,0:15],20,10)


CPU times: user 3.4 s, sys: 3.05 s, total: 6.45 s
Wall time: 6.75 s


In [124]:
%%time
center1cy=kII_cy(sample[:,0:15],20,10)

CPU times: user 61.6 ms, sys: 3.93 ms, total: 65.5 ms
Wall time: 68.7 ms


In [126]:
%%time
center,cluster, iter=kmeans(data, ncenter=20, center=center1cy, maxiter=1000, maxtol=10e-5)
print(iter)

28
CPU times: user 1.18 s, sys: 448 ms, total: 1.62 s
Wall time: 1.71 s


In [127]:
%%time
centercy,clustercy, itercy=kmeans(data, ncenter=20, center=center1cy, maxiter=1000, maxtol=10e-5)
print(iter)

28
CPU times: user 1.3 s, sys: 501 ms, total: 1.8 s
Wall time: 1.87 s


In [128]:
modelcost_cy(center[-1],cluster)

array([ 143828.29953304])

In [129]:
modelcost(center[-1],cluster)

array([ 143828.29953304])