MAINT: stats: rewrite find_repeats

* No more Fortran extension for something we can do with NumPy * Refactored to remove duplicate implementation * Document the implicit float64 cast (no longer necessary...)
scipy · Oct 1, 2015 · 5e5c455 · 5e5c455
1 parent 81c0960
commit 5e5c455
Show file tree

Hide file tree

Showing 9 changed files with 57 additions and 192 deletions.
diff --git a/scipy/stats/_stats_mstats_common.py b/scipy/stats/_stats_mstats_common.py
@@ -3,7 +3,6 @@
 import numpy as np
 
 from . import distributions
-from . import futil
 
 
 __all__ = ['find_repeats', 'linregress', 'theilslopes']
@@ -213,42 +212,12 @@ def theilslopes(y, x=None, alpha=0.95):
     return medslope, medinter, delta[0], delta[1]
 
 
-def find_repeats(arr):
-    """
-    Find repeats and repeat counts.
-
-    Parameters
-    ----------
-    arr : array_like
-        Input array
-
-    Returns
-    -------
-    values : ndarray
-        The unique values from the (flattened) input that are repeated.
-
-    counts : ndarray
-        Number of times the corresponding 'value' is repeated.
-
-    Notes
-    -----
-    In numpy >= 1.9 `numpy.unique` provides similar functionality. The main
-    difference is that `find_repeats` only returns repeated values.
-
-    Examples
-    --------
-    >>> from scipy import stats
-    >>> stats.find_repeats([2, 1, 2, 3, 2, 2, 5])
-    RepeatedResults(values=array([ 2.]), counts=array([4], dtype=int32))
-
-    >>> stats.find_repeats([[10, 20, 1, 2], [5, 5, 4, 4]])
-    RepeatedResults(values=array([ 4.,  5.]), counts=array([2, 2], dtype=int32))
-
-    """
-    RepeatedResults = namedtuple('RepeatedResults', ('values', 'counts'))
-
-    if np.asarray(arr).size == 0:
-        return RepeatedResults([], [])
+def _find_repeats(arr):
+    # XXX This cast was previously needed for the Fortran implementation,
+    # should we ditch it?
+    arr = arr.astype(np.float64)
 
-    v1, v2, n = futil.dfreps(arr)
-    return RepeatedResults(v1[:n], v2[:n])
+    un, ind = np.unique(arr, return_inverse=True)
+    freq = np.bincount(ind)
+    atleast2 = freq > 1
+    return un[atleast2], freq[atleast2]
diff --git a/scipy/stats/bento.info b/scipy/stats/bento.info
@@ -8,8 +8,6 @@ Library:
             statlib/swilk.f
     Extension: statlib
         Sources: statlib.pyf
-    Extension: futil
-        Sources: futil.f
     Extension: mvn
         Sources: mvn.pyf, mvndst.f
     Extension: vonmises_cython

diff --git a/scipy/stats/bscript b/scipy/stats/bscript
@@ -12,5 +12,3 @@ def pre_build(context):
                              use="statlibimp CLIB")
     context.tweak_extension("mvn", features="c fc cshlib pyext bento f2py",
                           use="CLIB")
-    context.tweak_extension("futil", features="c fc cshlib pyext bento f2py f2py_fortran",
-                          use="CLIB")
diff --git a/scipy/stats/futil.f b/scipy/stats/futil.f
diff --git a/scipy/stats/mstats_basic.py b/scipy/stats/mstats_basic.py
@@ -47,8 +47,8 @@
 
 from . import distributions
 import scipy.special as special
-from . import futil
 from ._stats_mstats_common import (
+        _find_repeats,
         linregress as stats_linregress,
         theilslopes as stats_theilslopes
         )
@@ -137,7 +137,8 @@ def argstoarray(*args):
 
 def find_repeats(arr):
     """Find repeats in arr and return a tuple (repeats, repeat_count).
-    Masked values are discarded.
+
+    The input is cast to float64. Masked values are discarded.
 
     Parameters
     ----------
@@ -152,12 +153,7 @@ def find_repeats(arr):
         Array of counts.
 
     """
-    marr = ma.compressed(arr)
-    if not marr.size:
-        return (np.array(0), np.array(0))
-
-    (v1, v2, n) = futil.dfreps(ma.array(ma.compressed(arr), copy=True))
-    return (v1[:n], v2[:n])
+    return _find_repeats(ma.compressed(arr))
 
 
 def count_tied_groups(x, use_missing=False):

diff --git a/scipy/stats/setup.py b/scipy/stats/setup.py
@@ -31,11 +31,6 @@ def configuration(parent_package='',top_path=None):
         sources=['_rank.c'],          # FIXME: use cython source
     )
 
-    # add futil module
-    config.add_extension('futil',
-        sources=['futil.f'],
-    )
-
     # add mvn module
     config.add_extension('mvn',
         sources=['mvn.pyf','mvndst.f'],

diff --git a/scipy/stats/stats.py b/scipy/stats/stats.py
@@ -183,7 +183,7 @@
 from . import distributions
 from . import mstats_basic
 from ._distn_infrastructure import _lazywhere
-from ._stats_mstats_common import find_repeats, linregress, theilslopes
+from ._stats_mstats_common import _find_repeats, linregress, theilslopes
 
 from ._rank import rankdata, tiecorrect
 
@@ -5114,6 +5114,42 @@ def f_value_multivariate(ER, EF, dfnum, dfden):
 #         SUPPORT FUNCTIONS         #
 #####################################
 
+def find_repeats(arr):
+    """
+    Find repeats and repeat counts.
+
+    Parameters
+    ----------
+    arr : array_like
+        Input array. This is cast to float64.
+
+    Returns
+    -------
+    values : ndarray
+        The unique values from the (flattened) input that are repeated.
+
+    counts : ndarray
+        Number of times the corresponding 'value' is repeated.
+
+    Notes
+    -----
+    In numpy >= 1.9 `numpy.unique` provides similar functionality. The main
+    difference is that `find_repeats` only returns repeated values.
+
+    Examples
+    --------
+    >>> from scipy import stats
+    >>> stats.find_repeats([2, 1, 2, 3, 2, 2, 5])
+    RepeatedResults(values=array([ 2.]), counts=array([4]))
+
+    >>> stats.find_repeats([[10, 20, 1, 2], [5, 5, 4, 4]])
+    RepeatedResults(values=array([ 4.,  5.]), counts=array([2, 2]))
+
+    """
+    RepeatedResults = namedtuple('RepeatedResults', ('values', 'counts'))
+    return RepeatedResults(*_find_repeats(np.array(arr, dtype=np.float64)))
+
+
 @np.deprecate(message="scipy.stats.ss is deprecated in scipy 0.17.0")
 def ss(a, axis=0):
     return _sum_of_squares(a, axis)

diff --git a/scipy/stats/tests/test_mstats_basic.py b/scipy/stats/tests/test_mstats_basic.py
@@ -1180,11 +1180,14 @@ def test_find_repeats(self):
         tmp = np.asarray([1,1,2,2,3,3,3,4,4,4,4,5,5,5,5]).astype('float')
         mask = (tmp == 5.)
         xm = np.ma.array(tmp, mask=mask)
+        x_orig, xm_orig = x.copy(), xm.copy()
 
         r = stats.find_repeats(x)
         rm = stats.mstats.find_repeats(xm)
 
-        assert_equal(r,rm)
+        assert_equal(r, rm)
+        assert_equal(x, x_orig)
+        assert_equal(xm, xm_orig)
 
     def test_kendalltau(self):
         for n in self.get_n():

diff --git a/scipy/stats/tests/test_stats.py b/scipy/stats/tests/test_stats.py
@@ -751,10 +751,10 @@ def test_basic(self):
 
     def test_empty_result(self):
         # Check that empty arrays are returned when there are no repeats.
-        a = [10, 20, 50, 30, 40]
-        repeated, counts = stats.find_repeats(a)
-        assert_array_equal(repeated, [])
-        assert_array_equal(counts, [])
+        for a in [[10, 20, 50, 30, 40], []]:
+            repeated, counts = stats.find_repeats(a)
+            assert_array_equal(repeated, [])
+            assert_array_equal(counts, [])
 
 
 class TestRegression(TestCase):