Skip to content

Commit

Permalink
Use fused types in kmeans_elkan
Browse files Browse the repository at this point in the history
  • Loading branch information
yenchenlin committed Jun 3, 2016
1 parent 9567ef5 commit 243e013
Showing 1 changed file with 28 additions and 22 deletions.
50 changes: 28 additions & 22 deletions sklearn/cluster/_k_means_elkan.pyx
Expand Up @@ -10,6 +10,7 @@
import numpy as np
cimport numpy as np
cimport cython
from cython cimport floating

from libc.math cimport sqrt

Expand All @@ -18,8 +19,8 @@ from ._k_means import _centers_dense
from ..utils.fixes import partition


cdef double euclidian_dist(double* a, double* b, int n_features) nogil:
cdef double result, tmp
cdef floating euclidian_dist(floating* a, floating* b, int n_features) nogil:
cdef floating result, tmp
result = 0
cdef int i
for i in range(n_features):
Expand All @@ -29,8 +30,8 @@ cdef double euclidian_dist(double* a, double* b, int n_features) nogil:


cdef update_labels_distances_inplace(
double* X, double* centers, double[:, :] center_half_distances,
int[:] labels, double[:, :] lower_bounds, double[:] upper_bounds,
floating* X, floating* centers, floating[:, :] center_half_distances,
int[:] labels, floating[:, :] lower_bounds, floating[:] upper_bounds,
int n_samples, int n_features, int n_clusters):
"""
Calculate upper and lower bounds for each sample.
Expand Down Expand Up @@ -81,9 +82,9 @@ cdef update_labels_distances_inplace(
"""
# assigns closest center to X
# uses triangle inequality
cdef double* x
cdef double* c
cdef double d_c, dist
cdef floating* x
cdef floating* c
cdef floating d_c, dist
cdef int c_x, j, sample
for sample in range(n_samples):
# assign first cluster center
Expand All @@ -103,8 +104,8 @@ cdef update_labels_distances_inplace(
upper_bounds[sample] = d_c


def k_means_elkan(np.ndarray[np.float64_t, ndim=2, mode='c'] X_, int n_clusters,
np.ndarray[np.float64_t, ndim=2, mode='c'] init,
def k_means_elkan(np.ndarray[floating, ndim=2, mode='c'] X_, int n_clusters,
np.ndarray[floating, ndim=2, mode='c'] init,
float tol=1e-4, int max_iter=30, verbose=False):
"""Run Elkan's k-means.
Expand All @@ -128,30 +129,35 @@ def k_means_elkan(np.ndarray[np.float64_t, ndim=2, mode='c'] X_, int n_clusters,
Whether to be verbose.
"""
#initialize
cdef np.ndarray[np.float64_t, ndim=2, mode='c'] centers_ = init
cdef double* centers_p = <double*>centers_.data
cdef double* X_p = <double*>X_.data
cdef double* x_p
if floating is float:
dtype = np.float32
else:
dtype = np.float64

#initialize
cdef np.ndarray[floating, ndim=2, mode='c'] centers_ = init
cdef floating* centers_p = <floating*>centers_.data
cdef floating* X_p = <floating*>X_.data
cdef floating* x_p
cdef Py_ssize_t n_samples = X_.shape[0]
cdef Py_ssize_t n_features = X_.shape[1]
cdef int point_index, center_index, label
cdef float upper_bound, distance
cdef double[:, :] center_half_distances = euclidean_distances(centers_) / 2.
cdef double[:, :] lower_bounds = np.zeros((n_samples, n_clusters))
cdef double[:] distance_next_center
cdef floating upper_bound, distance
cdef floating[:, :] center_half_distances = euclidean_distances(centers_) / 2.
cdef floating[:, :] lower_bounds = np.zeros((n_samples, n_clusters), dtype=dtype)
cdef floating[:] distance_next_center
labels_ = np.empty(n_samples, dtype=np.int32)
cdef int[:] labels = labels_
upper_bounds_ = np.empty(n_samples, dtype=np.float)
cdef double[:] upper_bounds = upper_bounds_
upper_bounds_ = np.empty(n_samples, dtype=dtype)
cdef floating[:] upper_bounds = upper_bounds_

# Get the initial set of upper bounds and lower bounds for each sample.
update_labels_distances_inplace(X_p, centers_p, center_half_distances,
labels, lower_bounds, upper_bounds,
n_samples, n_features, n_clusters)
cdef np.uint8_t[:] bounds_tight = np.ones(n_samples, dtype=np.uint8)
cdef np.uint8_t[:] points_to_update = np.zeros(n_samples, dtype=np.uint8)
cdef np.ndarray[np.float64_t, ndim=2, mode='c'] new_centers
cdef np.ndarray[floating, ndim=2, mode='c'] new_centers

if max_iter <= 0:
raise ValueError('Number of iterations should be a positive number'
Expand Down Expand Up @@ -226,7 +232,7 @@ def k_means_elkan(np.ndarray[np.float64_t, ndim=2, mode='c'] X_, int n_clusters,

# reassign centers
centers_ = new_centers
centers_p = <double*>new_centers.data
centers_p = <floating*>new_centers.data

# update between-center distances
center_half_distances = euclidean_distances(centers_) / 2.
Expand Down

0 comments on commit 243e013

Please sign in to comment.