BUG: spatial: fix Canberra distance. Closes #1430.

Also fix potential problems with integer division in braycurtis and kulsinski.
scipy · Jun 12, 2011 · 32f9e3d · 32f9e3d
1 parent fba8f78
commit 32f9e3d
Show file tree

Hide file tree

Showing 3 changed files with 46 additions and 20 deletions.
diff --git a/scipy/spatial/distance.py b/scipy/spatial/distance.py
@@ -415,7 +415,7 @@ def kulsinski(u, v):
     """
     u = np.asarray(u, order='c')
     v = np.asarray(v, order='c')
-    n = len(u)
+    n = float(len(u))
     (nff, nft, ntf, ntt) = _nbool_correspond_all(u, v)
 
     return (ntf + nft - ntt + n) / (ntf + nft + n)
@@ -441,7 +441,7 @@ def seuclidean(u, v, V):
     """
     u = np.asarray(u, order='c')
     v = np.asarray(v, order='c')
-    V = np.asarray(V, order='c')
+    V = np.asarray(V, order='c', dtype=np.float64)
     if len(V.shape) != 1 or V.shape[0] != u.shape[0] or u.shape[0] != v.shape[0]:
         raise TypeError('V must be a 1-D array of the same dimension as u and v.')
     return np.sqrt(((u-v)**2 / V).sum())
@@ -533,6 +533,9 @@ def braycurtis(u, v):
 
        \sum{|u_i-v_i|} / \sum{|u_i+v_i|}.
 
+    The Bray-Curtis distance is in the range [0, 1] if all coordinates are
+    positive, and is undefined if the inputs are of length zero.
+
     Parameters
     ----------
     u : ndarray
@@ -546,8 +549,8 @@ def braycurtis(u, v):
         The Bray-Curtis distance between vectors ``u`` and ``v``.
     """
     u = np.asarray(u, order='c')
-    v = np.asarray(v, order='c')
-    return abs(u-v).sum() / abs(u+v).sum()
+    v = np.asarray(v, order='c', dtype=np.float64)
+    return abs(u - v).sum() / abs(u + v).sum()
 
 def canberra(u, v):
     r"""
@@ -556,9 +559,8 @@ def canberra(u, v):
 
     .. math::
 
-       \frac{\sum_i {|u_i-v_i|}}
-            {\sum_i {|u_i|+|v_i|}}.
-
+         \sum_u \frac{|u_i-v_i|}
+                     {(|u_i|+|v_i|)}.
 
     Parameters
     ----------
@@ -571,10 +573,21 @@ def canberra(u, v):
     -------
     d : double
         The Canberra distance between vectors ``u`` and ``v``.
+
+    Notes
+    -----
+    Whe u[i] and v[i] are 0 for given i, then the fraction 0/0 = 0 is used in
+    the calculation.
+
     """
     u = np.asarray(u, order='c')
-    v = np.asarray(v, order='c')
-    return abs(u-v).sum() / (abs(u).sum() + abs(v).sum())
+    v = np.asarray(v, order='c', dtype=np.float64)
+    olderr = np.seterr(invalid='ignore')
+    try:
+        d = np.nansum(abs(u - v) / (abs(u) + abs(v)))
+    finally:
+        np.seterr(**olderr)
+    return d
 
 def _nbool_correspond_all(u, v):
     if u.dtype != v.dtype:

diff --git a/scipy/spatial/src/distance.c b/scipy/spatial/src/distance.c
@@ -75,13 +75,15 @@ static NPY_INLINE double chebyshev_distance(const double *u, const double *v, in
 
 static NPY_INLINE double canberra_distance(const double *u, const double *v, int n) {
   int i;
-  double snum = 0.0, sdenom_u = 0.0, sdenom_v = 0.0;
+  double snum = 0.0, sdenom = 0.0, tot = 0.0;
   for (i = 0; i < n; i++) {
-    snum += fabs(u[i] - v[i]);
-    sdenom_u += fabs(u[i]);
-    sdenom_v += fabs(v[i]);
+    snum = fabs(u[i] - v[i]);
+    sdenom = fabs(u[i]) + fabs(v[i]);
+    if (sdenom > 0.0) {
+        tot += snum / sdenom;
+    }
   }
-  return snum / (sdenom_u + sdenom_v);
+  return tot;
 }
 
 static NPY_INLINE double bray_curtis_distance(const double *u, const double *v, int n) {

diff --git a/scipy/spatial/tests/test_distance.py b/scipy/spatial/tests/test_distance.py
@@ -38,11 +38,12 @@
 
 import numpy as np
 from numpy.testing import verbose, TestCase, run_module_suite, \
-        assert_raises, assert_array_equal
+        assert_raises, assert_array_equal, assert_equal, assert_almost_equal
 from scipy.spatial.distance import squareform, pdist, cdist, matching, \
                                    jaccard, dice, sokalsneath, rogerstanimoto, \
                                    russellrao, yule, num_obs_y, num_obs_dm, \
-                                   is_valid_dm, is_valid_y, wminkowski
+                                   is_valid_dm, is_valid_y, wminkowski, \
+                                   canberra, braycurtis
 
 _filenames = ["iris.txt",
               "cdist-X1.txt",
@@ -118,7 +119,7 @@ def test_cdist_euclidean_random(self):
         if verbose > 2:
             print (Y1-Y2).max()
         self.assertTrue(within_tol(Y1, Y2, eps))
-        
+
     def test_cdist_euclidean_random_unicode(self):
         "Tests cdist(X, u'euclidean') using unicode metric string"
         eps = 1e-07
@@ -130,7 +131,7 @@ def test_cdist_euclidean_random_unicode(self):
         if verbose > 2:
             print (Y1-Y2).max()
         self.assertTrue(within_tol(Y1, Y2, eps))
-        
+
     def test_cdist_sqeuclidean_random(self):
         "Tests cdist(X, 'sqeuclidean') on random data."
         eps = 1e-07
@@ -477,7 +478,7 @@ def test_pdist_euclidean_random(self):
 
         Y_test1 = pdist(X, 'euclidean')
         self.assertTrue(within_tol(Y_test1, Y_right, eps))
-        
+
     def test_pdist_euclidean_random_u(self):
         "Tests pdist(X, 'euclidean') with unicode metric string"
         eps = 1e-07
@@ -487,7 +488,7 @@ def test_pdist_euclidean_random_u(self):
 
         Y_test1 = pdist(X, u'euclidean')
         self.assertTrue(within_tol(Y_test1, Y_right, eps))
-        
+
     def test_pdist_euclidean_random_float32(self):
         "Tests pdist(X, 'euclidean') on random data (float32)."
         eps = 1e-07
@@ -1734,6 +1735,16 @@ def test_sokalsneath_all_false():
     """Regression test for ticket #876"""
     assert_raises(ValueError, sokalsneath, [False, False, False], [False, False, False])
 
+def test_canberra():
+    """Regression test for ticket #1430."""
+    assert_equal(canberra([1,2,3], [2,4,6]), 1)
+    assert_equal(canberra([1,1,0,0], [1,0,1,0]), 2)
+
+def test_braycurtis():
+    """Regression test for ticket #1430."""
+    assert_almost_equal(braycurtis([1,2,3], [2,4,6]), 1./3, decimal=15)
+    assert_almost_equal(braycurtis([1,1,0,0], [1,0,1,0]), 0.5, decimal=15)
+
 
 if __name__=="__main__":
     run_module_suite()