MAINT: cluster: rewrite vq with fused types and BLAS Fortran ABI

scipy · May 31, 2014 · a3736e2 · a3736e2
1 parent 5541a1c
commit a3736e2
Show file tree

Hide file tree

Showing 6 changed files with 124 additions and 166 deletions.
diff --git a/scipy/cluster/_vq.pyx b/scipy/cluster/_vq.pyx
@@ -9,164 +9,103 @@ Translated to Cython by David Warde-Farley, October 2009.
 
 import numpy as np
 cimport numpy as np
-from cblas cimport *
+from cluster_blas cimport *
 
 cdef extern from "math.h":
     float sqrtf(float num)
     double sqrt(double num)
 
-cdef extern from "numpy/arrayobject.h":
-    object PyArray_EMPTY(int, np.npy_intp*, int, int)
-    cdef enum:
-        PyArray_INTP
-
-cdef extern from "numpy/npy_math.h":
-    cdef enum:
-        NPY_INFINITY
-
-# C types
 ctypedef np.float64_t float64_t
 ctypedef np.float32_t float32_t
 ctypedef np.int32_t int32_t
 
-# Initialize the NumPy C API
-np.import_array()
-
-
-cdef void float32_tvq(float32_t *obs, float32_t *code_book,
-                      int ncodes, int nfeat, int nobs,
-                      np.npy_intp *codes, float32_t *low_dist):
-    # Naive algorithm is prefered when nfeat is small
-    if nfeat < 5:
-        float32_tvq_small_nf(obs, code_book, ncodes, nfeat, nobs,
-                             codes, low_dist)
-        return
-
-    cdef int obs_index, code_index
-    cdef float32_t *p_obs, *p_codes, dist_sqr
-    cdef np.ndarray[float32_t, ndim=1] obs_sqr, codes_sqr
-    cdef np.ndarray[float32_t, ndim=2] M
+# Use Cython fused types for templating
+# Define supported data types as vq_type
+ctypedef fused vq_type:
+    float32_t
+    float64_t
 
-    obs_sqr = np.ndarray(nobs, np.float32)
-    codes_sqr = np.ndarray(ncodes, np.float32)
-    M = np.ndarray((nobs, ncodes), np.float32)
+# When the number of features is less than this number,
+# switch back to the naive algorithm to avoid high overhead.
+DEF NFEATURES_CUTOFF=5
 
-    p_obs = obs
-    for obs_index in range(nobs):
-        # obs_sqr[i] is the inner product of the i-th observation with itself
-        obs_sqr[obs_index] = cblas_sdot(nfeat, p_obs, 1, p_obs, 1)
-        p_obs += nfeat
+# Initialize the NumPy C API
+np.import_array()
 
-    p_codes = code_book
-    for code_index in range(ncodes):
-        # codes_sqr[i] is the inner product of the i-th code with itself
-        codes_sqr[code_index] = cblas_sdot(nfeat, p_codes, 1, p_codes, 1)
-        p_codes += nfeat
 
-    # M[i][j] is the inner product of the i-th obs and j-th code
-    # M = Obs * Codes.T
-    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-                nobs, ncodes, nfeat, -2.0, obs, nfeat, code_book, nfeat,
-                0.0, <float32_t *>M.data, ncodes)
+cdef inline vq_type _sqrt(vq_type x):
+    if vq_type is float32_t:
+        return sqrtf(x)
+    else:
+        return sqrt(x)
 
-    for obs_index in range(nobs):
-        low_dist[obs_index] = NPY_INFINITY
-        for code_index in range(ncodes):
-            dist_sqr = (M[obs_index, code_index] +
-                    obs_sqr[obs_index] + codes_sqr[code_index])
-            if dist_sqr < low_dist[obs_index]:
-                codes[obs_index] = code_index
-                low_dist[obs_index] = dist_sqr
 
-        # dist_sqr may be negative due to float point errors
-        if low_dist[obs_index] > 0:
-            low_dist[obs_index] = sqrtf(low_dist[obs_index])
-        else:
-            low_dist[obs_index] = 0
+cdef inline vq_type vec_sqr(int n, vq_type *p):
+    cdef vq_type result = 0.0
+    cdef int i
+    for i in range(n):
+        result += p[i] * p[i]
+    return result
 
 
-cdef void float32_tvq_small_nf(float32_t *obs, float32_t *code_book,
-                               int ncodes, int nfeat, int nobs,
-                               np.npy_intp *codes, float32_t *low_dist):
+cdef inline void cal_M(int nobs, int ncodes, int nfeat, vq_type *obs,
+                       vq_type *code_book, vq_type *M):
     """
-    Vector quantization for float32 using naive algorithm.
-    This is perfered when nfeat is small.
+    Calculate M = obs * code_book.T
     """
-    # Temporary variables
-    cdef float32_t dist_sqr, diff
-    cdef int32_t obs_index, code_index, feature
-    cdef int32_t offset = 0
-
-    # Index and pointer to keep track of the current position in
-    # both arrays so that we don't have to always do index * nfeat.
-    cdef int codebook_pos
-    cdef float32_t *current_obs
-
-    for obs_index in range(nobs):
-        codebook_pos = 0
-        low_dist[obs_index] = NPY_INFINITY
-        for code_index in range(ncodes):
-            dist_sqr = 0
+    cdef vq_type alpha = -2.0, beta = 0.0
 
-            # Distance between code_book[code_index] and obs[obs_index]
-            for feature in range(nfeat):
-
-                # Use current_obs pointer and codebook_pos to minimize
-                # pointer arithmetic necessary (i.e. no multiplications)
-                current_obs = &(obs[offset])
-                diff = code_book[codebook_pos] - current_obs[feature]
-                dist_sqr += diff * diff
-                codebook_pos += 1
-
-            # Replace the code assignment and record distance if necessary
-            if dist_sqr < low_dist[obs_index]:
-                codes[obs_index] = code_index
-                low_dist[obs_index] = dist_sqr
-
-        low_dist[obs_index] = sqrtf(low_dist[obs_index])
-
-        # Update the offset of the current observation
-        offset += nfeat
+    # Call BLAS functions with Fortran ABI
+    # Note that BLAS Fortran ABI uses column-major order
+    if vq_type is float32_t:
+        f_sgemm("T", "N", &ncodes, &nobs, &nfeat,
+                &alpha, code_book, &nfeat, obs, &nfeat, &beta, M, &ncodes)
+    else:
+        f_dgemm("T", "N", &ncodes, &nobs, &nfeat,
+                &alpha, code_book, &nfeat, obs, &nfeat, &beta, M, &ncodes)
 
 
-cdef void float64_tvq(float64_t *obs, float64_t *code_book,
-                      int ncodes, int nfeat, int nobs,
-                      np.npy_intp *codes, float64_t *low_dist):
+cdef void _vq(vq_type *obs, vq_type *code_book,
+              int ncodes, int nfeat, int nobs,
+              int32_t *codes, vq_type *low_dist):
     # Naive algorithm is prefered when nfeat is small
-    if nfeat < 5:
-        float64_tvq_small_nf(obs, code_book, ncodes, nfeat, nobs,
-                             codes, low_dist)
+    if nfeat < NFEATURES_CUTOFF:
+        _vq_small_nf(obs, code_book, ncodes, nfeat, nobs, codes, low_dist)
         return
 
     cdef int obs_index, code_index
-    cdef float64_t *p_obs, *p_codes, dist_sqr
-    cdef np.ndarray[float64_t, ndim=1] obs_sqr, codes_sqr
-    cdef np.ndarray[float64_t, ndim=2] M
-
-    obs_sqr = np.ndarray(nobs, np.float64)
-    codes_sqr = np.ndarray(ncodes, np.float64)
-    M = np.ndarray((nobs, ncodes), np.float64)
+    cdef vq_type *p_obs
+    cdef vq_type *p_codes
+    cdef vq_type dist_sqr
+    cdef np.ndarray[vq_type, ndim=1] obs_sqr, codes_sqr
+    cdef np.ndarray[vq_type, ndim=2] M
+
+    if vq_type is float32_t:
+        obs_sqr = np.ndarray(nobs, np.float32)
+        codes_sqr = np.ndarray(ncodes, np.float32)
+        M = np.ndarray((nobs, ncodes), np.float32)
+    else:
+        obs_sqr = np.ndarray(nobs, np.float64)
+        codes_sqr = np.ndarray(ncodes, np.float64)
+        M = np.ndarray((nobs, ncodes), np.float64)
 
     p_obs = obs
     for obs_index in range(nobs):
         # obs_sqr[i] is the inner product of the i-th observation with itself
-        obs_sqr[obs_index] = cblas_ddot(nfeat, p_obs, 1, p_obs, 1)
+        obs_sqr[obs_index] = vec_sqr(nfeat, p_obs)
         p_obs += nfeat
 
     p_codes = code_book
     for code_index in range(ncodes):
         # codes_sqr[i] is the inner product of the i-th code with itself
-        codes_sqr[code_index] = cblas_ddot(nfeat, p_codes, 1, p_codes, 1)
+        codes_sqr[code_index] = vec_sqr(nfeat, p_codes)
         p_codes += nfeat
 
     # M[i][j] is the inner product of the i-th obs and j-th code
-    # M = Obs * Codes.T
-    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-                nobs, ncodes, nfeat, -2.0, obs, nfeat, code_book, nfeat,
-                0.0, <float64_t *>M.data, ncodes)
+    # M = obs * codes.T
+    cal_M(nobs, ncodes, nfeat, obs, code_book, <vq_type *>M.data)
 
     for obs_index in range(nobs):
-        low_dist[obs_index] = NPY_INFINITY
         for code_index in range(ncodes):
             dist_sqr = (M[obs_index, code_index] +
                     obs_sqr[obs_index] + codes_sqr[code_index])
@@ -176,37 +115,35 @@ cdef void float64_tvq(float64_t *obs, float64_t *code_book,
 
         # dist_sqr may be negative due to float point errors
         if low_dist[obs_index] > 0:
-            low_dist[obs_index] = sqrt(low_dist[obs_index])
+            low_dist[obs_index] = _sqrt(low_dist[obs_index])
         else:
             low_dist[obs_index] = 0
 
 
-cdef void float64_tvq_small_nf(float64_t *obs, float64_t *code_book,
-                               int ncodes, int nfeat, int nobs,
-                               np.npy_intp *codes, float64_t *low_dist):
+cdef void _vq_small_nf(vq_type *obs, vq_type *code_book,
+                       int ncodes, int nfeat, int nobs,
+                       int32_t *codes, vq_type *low_dist):
     """
-    Vector quantization for float64 using naive algorithm.
-    This is perfered when nfeat is small.
+    Vector quantization for float32 using naive algorithm.
+    This is prefered when nfeat is small.
     """
     # Temporary variables
-    cdef float64_t dist_sqr, diff
+    cdef vq_type dist_sqr, diff
     cdef int32_t obs_index, code_index, feature
     cdef int32_t offset = 0
 
     # Index and pointer to keep track of the current position in
     # both arrays so that we don't have to always do index * nfeat.
     cdef int codebook_pos
-    cdef float64_t *current_obs
+    cdef vq_type *current_obs
 
     for obs_index in range(nobs):
         codebook_pos = 0
-        low_dist[obs_index] = NPY_INFINITY
         for code_index in range(ncodes):
             dist_sqr = 0
 
             # Distance between code_book[code_index] and obs[obs_index]
             for feature in range(nfeat):
-
                 # Use current_obs pointer and codebook_pos to minimize
                 # pointer arithmetic necessary (i.e. no multiplications)
                 current_obs = &(obs[offset])
@@ -219,19 +156,29 @@ cdef void float64_tvq_small_nf(float64_t *obs, float64_t *code_book,
                 codes[obs_index] = code_index
                 low_dist[obs_index] = dist_sqr
 
-        low_dist[obs_index] = sqrt(low_dist[obs_index])
+        low_dist[obs_index] = _sqrt(low_dist[obs_index])
 
         # Update the offset of the current observation
         offset += nfeat
 
 
 def vq(np.ndarray obs, np.ndarray codes):
     """
-    vq(obs, codes)
-
-    Vector quantization ndarray wrapper.
+    Vector quantization ndarray wrapper. Only support float32 and float64.
+
+    Parameters
+    ----------
+    obs : ndarray
+        The observation matrix. Each row is an observation.
+    codes : ndarray
+        The code book matrix.
+
+    Notes
+    -----
+    The observation matrix and code book matrix should have same rank and
+    same number of columns (features). Only rank 1 and rank 2 are supported.
     """
-    cdef np.npy_intp nobs, ncodes, nfeat
+    cdef int nobs, ncodes, nfeat
     cdef np.ndarray obs_a, codes_a
     cdef np.ndarray outcodes, outdists
     cdef int flags = np.NPY_CONTIGUOUS | np.NPY_NOTSWAPPED | np.NPY_ALIGNED
@@ -257,22 +204,20 @@ def vq(np.ndarray obs, np.ndarray codes):
     else:
         raise ValueError('rank different than 1 or 2 are not supported')
 
-    # We create this with the C API so that we can be sure that
-    # the resulting array has elements big enough to store indices
-    # on that platform. Hence, PyArray_INTP.
-    outcodes = PyArray_EMPTY(1, &nobs, PyArray_INTP, 0)
-
-    # This we just want to match the dtype of the input, so np.empty is fine.
+    # Initialize outdists and outcodes array.
+    # Outdists should be initialized as INF.
     outdists = np.empty((nobs,), dtype=obs.dtype)
+    outdists.fill(np.inf)
+    outcodes = np.empty((nobs,), dtype=np.int32)
 
     if obs.dtype == np.float32:
-        float32_tvq(<float32_t *>obs_a.data, <float32_t *>codes_a.data,
-                    ncodes, nfeat, nobs, <np.npy_intp *>outcodes.data,
-                    <float32_t *>outdists.data)
+        _vq(<float32_t *>obs_a.data, <float32_t *>codes_a.data,
+            ncodes, nfeat, nobs, <int32_t *>outcodes.data,
+            <float32_t *>outdists.data)
     elif obs.dtype == np.float64:
-        float64_tvq(<float64_t *>obs_a.data, <float64_t *>codes_a.data,
-                    ncodes, nfeat, nobs, <np.npy_intp *>outcodes.data,
-                    <float64_t *>outdists.data)
+        _vq(<float64_t *>obs_a.data, <float64_t *>codes_a.data,
+            ncodes, nfeat, nobs, <int32_t *>outcodes.data,
+            <float64_t *>outdists.data)
     else:
         raise ValueError('type other than float or double not supported')
 

diff --git a/scipy/cluster/cblas.pxd b/scipy/cluster/cblas.pxd
diff --git a/scipy/cluster/cluster_blas.h b/scipy/cluster/cluster_blas.h
@@ -0,0 +1,20 @@
+/*
+ * Handle different Fortran conventions.
+ */
+
+#if defined(NO_APPEND_FORTRAN)
+#if defined(UPPERCASE_FORTRAN)
+#define F_FUNC(f,F) F
+#else
+#define F_FUNC(f,F) f
+#endif
+#else
+#if defined(UPPERCASE_FORTRAN)
+#define F_FUNC(f,F) F##_
+#else
+#define F_FUNC(f,F) f##_
+#endif
+#endif
+
+#define f_sgemm F_FUNC(sgemm,SGEMM)
+#define f_dgemm F_FUNC(dgemm,DGEMM)
diff --git a/scipy/cluster/cluster_blas.pxd b/scipy/cluster/cluster_blas.pxd
@@ -0,0 +1,10 @@
+cdef extern from "cluster_blas.h":
+
+    void f_sgemm(char *transA, char *transB, int *m, int *n, int *k,
+                 float *alpha, float *A, int *lda, float *B, int *ldb,
+                 float *beta, float *C, int *ldc) nogil
+
+    void f_dgemm(char *transA, char *transB, int *m, int *n, int *k,
+                 double *alpha, double *A, int *lda, double *B, int *ldb,
+                 double *beta, double *C, int *ldc) nogil
+
diff --git a/scipy/cluster/setup.py b/scipy/cluster/setup.py
@@ -16,7 +16,7 @@ def configuration(parent_package='', top_path=None):
     from numpy.distutils.misc_util import Configuration, get_numpy_include_dirs
     config = Configuration('cluster', parent_package, top_path)
 
-    blas_opt = get_info('blas_opt')
+    blas_opt = get_info('lapack_opt')
 
     config.add_data_dir('tests')