Skip to content

Commit

Permalink
MAINT: cluster: rewrite vq with fused types and BLAS Fortran ABI
Browse files Browse the repository at this point in the history
  • Loading branch information
cairijun committed May 31, 2014
1 parent 5541a1c commit a3736e2
Show file tree
Hide file tree
Showing 6 changed files with 124 additions and 166 deletions.
235 changes: 90 additions & 145 deletions scipy/cluster/_vq.pyx
Expand Up @@ -9,164 +9,103 @@ Translated to Cython by David Warde-Farley, October 2009.

import numpy as np
cimport numpy as np
from cblas cimport *
from cluster_blas cimport *

cdef extern from "math.h":
float sqrtf(float num)
double sqrt(double num)

cdef extern from "numpy/arrayobject.h":
object PyArray_EMPTY(int, np.npy_intp*, int, int)
cdef enum:
PyArray_INTP

cdef extern from "numpy/npy_math.h":
cdef enum:
NPY_INFINITY

# C types
ctypedef np.float64_t float64_t
ctypedef np.float32_t float32_t
ctypedef np.int32_t int32_t

# Initialize the NumPy C API
np.import_array()


cdef void float32_tvq(float32_t *obs, float32_t *code_book,
int ncodes, int nfeat, int nobs,
np.npy_intp *codes, float32_t *low_dist):
# Naive algorithm is prefered when nfeat is small
if nfeat < 5:
float32_tvq_small_nf(obs, code_book, ncodes, nfeat, nobs,
codes, low_dist)
return

cdef int obs_index, code_index
cdef float32_t *p_obs, *p_codes, dist_sqr
cdef np.ndarray[float32_t, ndim=1] obs_sqr, codes_sqr
cdef np.ndarray[float32_t, ndim=2] M
# Use Cython fused types for templating
# Define supported data types as vq_type
ctypedef fused vq_type:
float32_t
float64_t

obs_sqr = np.ndarray(nobs, np.float32)
codes_sqr = np.ndarray(ncodes, np.float32)
M = np.ndarray((nobs, ncodes), np.float32)
# When the number of features is less than this number,
# switch back to the naive algorithm to avoid high overhead.
DEF NFEATURES_CUTOFF=5

p_obs = obs
for obs_index in range(nobs):
# obs_sqr[i] is the inner product of the i-th observation with itself
obs_sqr[obs_index] = cblas_sdot(nfeat, p_obs, 1, p_obs, 1)
p_obs += nfeat
# Initialize the NumPy C API
np.import_array()

p_codes = code_book
for code_index in range(ncodes):
# codes_sqr[i] is the inner product of the i-th code with itself
codes_sqr[code_index] = cblas_sdot(nfeat, p_codes, 1, p_codes, 1)
p_codes += nfeat

# M[i][j] is the inner product of the i-th obs and j-th code
# M = Obs * Codes.T
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
nobs, ncodes, nfeat, -2.0, obs, nfeat, code_book, nfeat,
0.0, <float32_t *>M.data, ncodes)
cdef inline vq_type _sqrt(vq_type x):
if vq_type is float32_t:
return sqrtf(x)
else:
return sqrt(x)

for obs_index in range(nobs):
low_dist[obs_index] = NPY_INFINITY
for code_index in range(ncodes):
dist_sqr = (M[obs_index, code_index] +
obs_sqr[obs_index] + codes_sqr[code_index])
if dist_sqr < low_dist[obs_index]:
codes[obs_index] = code_index
low_dist[obs_index] = dist_sqr

# dist_sqr may be negative due to float point errors
if low_dist[obs_index] > 0:
low_dist[obs_index] = sqrtf(low_dist[obs_index])
else:
low_dist[obs_index] = 0
cdef inline vq_type vec_sqr(int n, vq_type *p):
cdef vq_type result = 0.0
cdef int i
for i in range(n):
result += p[i] * p[i]
return result


cdef void float32_tvq_small_nf(float32_t *obs, float32_t *code_book,
int ncodes, int nfeat, int nobs,
np.npy_intp *codes, float32_t *low_dist):
cdef inline void cal_M(int nobs, int ncodes, int nfeat, vq_type *obs,
vq_type *code_book, vq_type *M):
"""
Vector quantization for float32 using naive algorithm.
This is perfered when nfeat is small.
Calculate M = obs * code_book.T
"""
# Temporary variables
cdef float32_t dist_sqr, diff
cdef int32_t obs_index, code_index, feature
cdef int32_t offset = 0

# Index and pointer to keep track of the current position in
# both arrays so that we don't have to always do index * nfeat.
cdef int codebook_pos
cdef float32_t *current_obs

for obs_index in range(nobs):
codebook_pos = 0
low_dist[obs_index] = NPY_INFINITY
for code_index in range(ncodes):
dist_sqr = 0
cdef vq_type alpha = -2.0, beta = 0.0

# Distance between code_book[code_index] and obs[obs_index]
for feature in range(nfeat):

# Use current_obs pointer and codebook_pos to minimize
# pointer arithmetic necessary (i.e. no multiplications)
current_obs = &(obs[offset])
diff = code_book[codebook_pos] - current_obs[feature]
dist_sqr += diff * diff
codebook_pos += 1

# Replace the code assignment and record distance if necessary
if dist_sqr < low_dist[obs_index]:
codes[obs_index] = code_index
low_dist[obs_index] = dist_sqr

low_dist[obs_index] = sqrtf(low_dist[obs_index])

# Update the offset of the current observation
offset += nfeat
# Call BLAS functions with Fortran ABI
# Note that BLAS Fortran ABI uses column-major order
if vq_type is float32_t:
f_sgemm("T", "N", &ncodes, &nobs, &nfeat,
&alpha, code_book, &nfeat, obs, &nfeat, &beta, M, &ncodes)
else:
f_dgemm("T", "N", &ncodes, &nobs, &nfeat,
&alpha, code_book, &nfeat, obs, &nfeat, &beta, M, &ncodes)


cdef void float64_tvq(float64_t *obs, float64_t *code_book,
int ncodes, int nfeat, int nobs,
np.npy_intp *codes, float64_t *low_dist):
cdef void _vq(vq_type *obs, vq_type *code_book,
int ncodes, int nfeat, int nobs,
int32_t *codes, vq_type *low_dist):
# Naive algorithm is prefered when nfeat is small
if nfeat < 5:
float64_tvq_small_nf(obs, code_book, ncodes, nfeat, nobs,
codes, low_dist)
if nfeat < NFEATURES_CUTOFF:
_vq_small_nf(obs, code_book, ncodes, nfeat, nobs, codes, low_dist)
return

cdef int obs_index, code_index
cdef float64_t *p_obs, *p_codes, dist_sqr
cdef np.ndarray[float64_t, ndim=1] obs_sqr, codes_sqr
cdef np.ndarray[float64_t, ndim=2] M

obs_sqr = np.ndarray(nobs, np.float64)
codes_sqr = np.ndarray(ncodes, np.float64)
M = np.ndarray((nobs, ncodes), np.float64)
cdef vq_type *p_obs
cdef vq_type *p_codes
cdef vq_type dist_sqr
cdef np.ndarray[vq_type, ndim=1] obs_sqr, codes_sqr
cdef np.ndarray[vq_type, ndim=2] M

if vq_type is float32_t:
obs_sqr = np.ndarray(nobs, np.float32)
codes_sqr = np.ndarray(ncodes, np.float32)
M = np.ndarray((nobs, ncodes), np.float32)
else:
obs_sqr = np.ndarray(nobs, np.float64)
codes_sqr = np.ndarray(ncodes, np.float64)
M = np.ndarray((nobs, ncodes), np.float64)

p_obs = obs
for obs_index in range(nobs):
# obs_sqr[i] is the inner product of the i-th observation with itself
obs_sqr[obs_index] = cblas_ddot(nfeat, p_obs, 1, p_obs, 1)
obs_sqr[obs_index] = vec_sqr(nfeat, p_obs)
p_obs += nfeat

p_codes = code_book
for code_index in range(ncodes):
# codes_sqr[i] is the inner product of the i-th code with itself
codes_sqr[code_index] = cblas_ddot(nfeat, p_codes, 1, p_codes, 1)
codes_sqr[code_index] = vec_sqr(nfeat, p_codes)
p_codes += nfeat

# M[i][j] is the inner product of the i-th obs and j-th code
# M = Obs * Codes.T
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
nobs, ncodes, nfeat, -2.0, obs, nfeat, code_book, nfeat,
0.0, <float64_t *>M.data, ncodes)
# M = obs * codes.T
cal_M(nobs, ncodes, nfeat, obs, code_book, <vq_type *>M.data)

for obs_index in range(nobs):
low_dist[obs_index] = NPY_INFINITY
for code_index in range(ncodes):
dist_sqr = (M[obs_index, code_index] +
obs_sqr[obs_index] + codes_sqr[code_index])
Expand All @@ -176,37 +115,35 @@ cdef void float64_tvq(float64_t *obs, float64_t *code_book,

# dist_sqr may be negative due to float point errors
if low_dist[obs_index] > 0:
low_dist[obs_index] = sqrt(low_dist[obs_index])
low_dist[obs_index] = _sqrt(low_dist[obs_index])
else:
low_dist[obs_index] = 0


cdef void float64_tvq_small_nf(float64_t *obs, float64_t *code_book,
int ncodes, int nfeat, int nobs,
np.npy_intp *codes, float64_t *low_dist):
cdef void _vq_small_nf(vq_type *obs, vq_type *code_book,
int ncodes, int nfeat, int nobs,
int32_t *codes, vq_type *low_dist):
"""
Vector quantization for float64 using naive algorithm.
This is perfered when nfeat is small.
Vector quantization for float32 using naive algorithm.
This is prefered when nfeat is small.
"""
# Temporary variables
cdef float64_t dist_sqr, diff
cdef vq_type dist_sqr, diff
cdef int32_t obs_index, code_index, feature
cdef int32_t offset = 0

# Index and pointer to keep track of the current position in
# both arrays so that we don't have to always do index * nfeat.
cdef int codebook_pos
cdef float64_t *current_obs
cdef vq_type *current_obs

for obs_index in range(nobs):
codebook_pos = 0
low_dist[obs_index] = NPY_INFINITY
for code_index in range(ncodes):
dist_sqr = 0

# Distance between code_book[code_index] and obs[obs_index]
for feature in range(nfeat):

# Use current_obs pointer and codebook_pos to minimize
# pointer arithmetic necessary (i.e. no multiplications)
current_obs = &(obs[offset])
Expand All @@ -219,19 +156,29 @@ cdef void float64_tvq_small_nf(float64_t *obs, float64_t *code_book,
codes[obs_index] = code_index
low_dist[obs_index] = dist_sqr

low_dist[obs_index] = sqrt(low_dist[obs_index])
low_dist[obs_index] = _sqrt(low_dist[obs_index])

# Update the offset of the current observation
offset += nfeat


def vq(np.ndarray obs, np.ndarray codes):
"""
vq(obs, codes)
Vector quantization ndarray wrapper.
Vector quantization ndarray wrapper. Only support float32 and float64.
Parameters
----------
obs : ndarray
The observation matrix. Each row is an observation.
codes : ndarray
The code book matrix.
Notes
-----
The observation matrix and code book matrix should have same rank and
same number of columns (features). Only rank 1 and rank 2 are supported.
"""
cdef np.npy_intp nobs, ncodes, nfeat
cdef int nobs, ncodes, nfeat
cdef np.ndarray obs_a, codes_a
cdef np.ndarray outcodes, outdists
cdef int flags = np.NPY_CONTIGUOUS | np.NPY_NOTSWAPPED | np.NPY_ALIGNED
Expand All @@ -257,22 +204,20 @@ def vq(np.ndarray obs, np.ndarray codes):
else:
raise ValueError('rank different than 1 or 2 are not supported')

# We create this with the C API so that we can be sure that
# the resulting array has elements big enough to store indices
# on that platform. Hence, PyArray_INTP.
outcodes = PyArray_EMPTY(1, &nobs, PyArray_INTP, 0)

# This we just want to match the dtype of the input, so np.empty is fine.
# Initialize outdists and outcodes array.
# Outdists should be initialized as INF.
outdists = np.empty((nobs,), dtype=obs.dtype)
outdists.fill(np.inf)
outcodes = np.empty((nobs,), dtype=np.int32)

if obs.dtype == np.float32:
float32_tvq(<float32_t *>obs_a.data, <float32_t *>codes_a.data,
ncodes, nfeat, nobs, <np.npy_intp *>outcodes.data,
<float32_t *>outdists.data)
_vq(<float32_t *>obs_a.data, <float32_t *>codes_a.data,
ncodes, nfeat, nobs, <int32_t *>outcodes.data,
<float32_t *>outdists.data)
elif obs.dtype == np.float64:
float64_tvq(<float64_t *>obs_a.data, <float64_t *>codes_a.data,
ncodes, nfeat, nobs, <np.npy_intp *>outcodes.data,
<float64_t *>outdists.data)
_vq(<float64_t *>obs_a.data, <float64_t *>codes_a.data,
ncodes, nfeat, nobs, <int32_t *>outcodes.data,
<float64_t *>outdists.data)
else:
raise ValueError('type other than float or double not supported')

Expand Down
20 changes: 0 additions & 20 deletions scipy/cluster/cblas.pxd

This file was deleted.

20 changes: 20 additions & 0 deletions scipy/cluster/cluster_blas.h
@@ -0,0 +1,20 @@
/*
* Handle different Fortran conventions.
*/

#if defined(NO_APPEND_FORTRAN)
#if defined(UPPERCASE_FORTRAN)
#define F_FUNC(f,F) F
#else
#define F_FUNC(f,F) f
#endif
#else
#if defined(UPPERCASE_FORTRAN)
#define F_FUNC(f,F) F##_
#else
#define F_FUNC(f,F) f##_
#endif
#endif

#define f_sgemm F_FUNC(sgemm,SGEMM)
#define f_dgemm F_FUNC(dgemm,DGEMM)
10 changes: 10 additions & 0 deletions scipy/cluster/cluster_blas.pxd
@@ -0,0 +1,10 @@
cdef extern from "cluster_blas.h":

void f_sgemm(char *transA, char *transB, int *m, int *n, int *k,
float *alpha, float *A, int *lda, float *B, int *ldb,
float *beta, float *C, int *ldc) nogil

void f_dgemm(char *transA, char *transB, int *m, int *n, int *k,
double *alpha, double *A, int *lda, double *B, int *ldb,
double *beta, double *C, int *ldc) nogil

2 changes: 1 addition & 1 deletion scipy/cluster/setup.py
Expand Up @@ -16,7 +16,7 @@ def configuration(parent_package='', top_path=None):
from numpy.distutils.misc_util import Configuration, get_numpy_include_dirs
config = Configuration('cluster', parent_package, top_path)

blas_opt = get_info('blas_opt')
blas_opt = get_info('lapack_opt')

config.add_data_dir('tests')

Expand Down

0 comments on commit a3736e2

Please sign in to comment.