# scipy/scipy

BUG: spatial: fix Canberra distance. Closes #1430.

Also fix potential problems with integer division in braycurtis and kulsinski.
1 parent fba8f78 commit 32f9e3d8e0154da1b9413a9d2c6ebec578ee29af rgommers committed Jun 12, 2011
Showing with 46 additions and 20 deletions.
1. +22 −9 scipy/spatial/distance.py
2. +7 −5 scipy/spatial/src/distance.c
3. +17 −6 scipy/spatial/tests/test_distance.py
31 scipy/spatial/distance.py
 @@ -415,7 +415,7 @@ def kulsinski(u, v): """ u = np.asarray(u, order='c') v = np.asarray(v, order='c') - n = len(u) + n = float(len(u)) (nff, nft, ntf, ntt) = _nbool_correspond_all(u, v) return (ntf + nft - ntt + n) / (ntf + nft + n) @@ -441,7 +441,7 @@ def seuclidean(u, v, V): """ u = np.asarray(u, order='c') v = np.asarray(v, order='c') - V = np.asarray(V, order='c') + V = np.asarray(V, order='c', dtype=np.float64) if len(V.shape) != 1 or V.shape[0] != u.shape[0] or u.shape[0] != v.shape[0]: raise TypeError('V must be a 1-D array of the same dimension as u and v.') return np.sqrt(((u-v)**2 / V).sum()) @@ -533,6 +533,9 @@ def braycurtis(u, v): \sum{|u_i-v_i|} / \sum{|u_i+v_i|}. + The Bray-Curtis distance is in the range [0, 1] if all coordinates are + positive, and is undefined if the inputs are of length zero. + Parameters ---------- u : ndarray @@ -546,8 +549,8 @@ def braycurtis(u, v): The Bray-Curtis distance between vectors u and v. """ u = np.asarray(u, order='c') - v = np.asarray(v, order='c') - return abs(u-v).sum() / abs(u+v).sum() + v = np.asarray(v, order='c', dtype=np.float64) + return abs(u - v).sum() / abs(u + v).sum() def canberra(u, v): r""" @@ -556,9 +559,8 @@ def canberra(u, v): .. math:: - \frac{\sum_i {|u_i-v_i|}} - {\sum_i {|u_i|+|v_i|}}. - + \sum_u \frac{|u_i-v_i|} + {(|u_i|+|v_i|)}. Parameters ---------- @@ -571,10 +573,21 @@ def canberra(u, v): ------- d : double The Canberra distance between vectors u and v. + + Notes + ----- + Whe u[i] and v[i] are 0 for given i, then the fraction 0/0 = 0 is used in + the calculation. + """ u = np.asarray(u, order='c') - v = np.asarray(v, order='c') - return abs(u-v).sum() / (abs(u).sum() + abs(v).sum()) + v = np.asarray(v, order='c', dtype=np.float64) + olderr = np.seterr(invalid='ignore') + try: + d = np.nansum(abs(u - v) / (abs(u) + abs(v))) + finally: + np.seterr(**olderr) + return d def _nbool_correspond_all(u, v): if u.dtype != v.dtype:
12 scipy/spatial/src/distance.c
 @@ -75,13 +75,15 @@ static NPY_INLINE double chebyshev_distance(const double *u, const double *v, in static NPY_INLINE double canberra_distance(const double *u, const double *v, int n) { int i; - double snum = 0.0, sdenom_u = 0.0, sdenom_v = 0.0; + double snum = 0.0, sdenom = 0.0, tot = 0.0; for (i = 0; i < n; i++) { - snum += fabs(u[i] - v[i]); - sdenom_u += fabs(u[i]); - sdenom_v += fabs(v[i]); + snum = fabs(u[i] - v[i]); + sdenom = fabs(u[i]) + fabs(v[i]); + if (sdenom > 0.0) { + tot += snum / sdenom; + } } - return snum / (sdenom_u + sdenom_v); + return tot; } static NPY_INLINE double bray_curtis_distance(const double *u, const double *v, int n) {
23 scipy/spatial/tests/test_distance.py
 @@ -38,11 +38,12 @@ import numpy as np from numpy.testing import verbose, TestCase, run_module_suite, \ - assert_raises, assert_array_equal + assert_raises, assert_array_equal, assert_equal, assert_almost_equal from scipy.spatial.distance import squareform, pdist, cdist, matching, \ jaccard, dice, sokalsneath, rogerstanimoto, \ russellrao, yule, num_obs_y, num_obs_dm, \ - is_valid_dm, is_valid_y, wminkowski + is_valid_dm, is_valid_y, wminkowski, \ + canberra, braycurtis _filenames = ["iris.txt", "cdist-X1.txt", @@ -118,7 +119,7 @@ def test_cdist_euclidean_random(self): if verbose > 2: print (Y1-Y2).max() self.assertTrue(within_tol(Y1, Y2, eps)) - + def test_cdist_euclidean_random_unicode(self): "Tests cdist(X, u'euclidean') using unicode metric string" eps = 1e-07 @@ -130,7 +131,7 @@ def test_cdist_euclidean_random_unicode(self): if verbose > 2: print (Y1-Y2).max() self.assertTrue(within_tol(Y1, Y2, eps)) - + def test_cdist_sqeuclidean_random(self): "Tests cdist(X, 'sqeuclidean') on random data." eps = 1e-07 @@ -477,7 +478,7 @@ def test_pdist_euclidean_random(self): Y_test1 = pdist(X, 'euclidean') self.assertTrue(within_tol(Y_test1, Y_right, eps)) - + def test_pdist_euclidean_random_u(self): "Tests pdist(X, 'euclidean') with unicode metric string" eps = 1e-07 @@ -487,7 +488,7 @@ def test_pdist_euclidean_random_u(self): Y_test1 = pdist(X, u'euclidean') self.assertTrue(within_tol(Y_test1, Y_right, eps)) - + def test_pdist_euclidean_random_float32(self): "Tests pdist(X, 'euclidean') on random data (float32)." eps = 1e-07 @@ -1734,6 +1735,16 @@ def test_sokalsneath_all_false(): """Regression test for ticket #876""" assert_raises(ValueError, sokalsneath, [False, False, False], [False, False, False]) +def test_canberra(): + """Regression test for ticket #1430.""" + assert_equal(canberra([1,2,3], [2,4,6]), 1) + assert_equal(canberra([1,1,0,0], [1,0,1,0]), 2) + +def test_braycurtis(): + """Regression test for ticket #1430.""" + assert_almost_equal(braycurtis([1,2,3], [2,4,6]), 1./3, decimal=15) + assert_almost_equal(braycurtis([1,1,0,0], [1,0,1,0]), 0.5, decimal=15) + if __name__=="__main__": run_module_suite()