Skip to content

Commit

Permalink
MAINT: stats: rewrite find_repeats
Browse files Browse the repository at this point in the history
* No more Fortran extension for something we can do with NumPy
* Refactored to remove duplicate implementation
* Document the implicit float64 cast (no longer necessary...)
  • Loading branch information
larsmans committed Oct 1, 2015
1 parent 81c0960 commit 5e5c455
Show file tree
Hide file tree
Showing 9 changed files with 57 additions and 192 deletions.
47 changes: 8 additions & 39 deletions scipy/stats/_stats_mstats_common.py
Expand Up @@ -3,7 +3,6 @@
import numpy as np

from . import distributions
from . import futil


__all__ = ['find_repeats', 'linregress', 'theilslopes']
Expand Down Expand Up @@ -213,42 +212,12 @@ def theilslopes(y, x=None, alpha=0.95):
return medslope, medinter, delta[0], delta[1]


def find_repeats(arr):
"""
Find repeats and repeat counts.
Parameters
----------
arr : array_like
Input array
Returns
-------
values : ndarray
The unique values from the (flattened) input that are repeated.
counts : ndarray
Number of times the corresponding 'value' is repeated.
Notes
-----
In numpy >= 1.9 `numpy.unique` provides similar functionality. The main
difference is that `find_repeats` only returns repeated values.
Examples
--------
>>> from scipy import stats
>>> stats.find_repeats([2, 1, 2, 3, 2, 2, 5])
RepeatedResults(values=array([ 2.]), counts=array([4], dtype=int32))
>>> stats.find_repeats([[10, 20, 1, 2], [5, 5, 4, 4]])
RepeatedResults(values=array([ 4., 5.]), counts=array([2, 2], dtype=int32))
"""
RepeatedResults = namedtuple('RepeatedResults', ('values', 'counts'))

if np.asarray(arr).size == 0:
return RepeatedResults([], [])
def _find_repeats(arr):
# XXX This cast was previously needed for the Fortran implementation,
# should we ditch it?
arr = arr.astype(np.float64)

v1, v2, n = futil.dfreps(arr)
return RepeatedResults(v1[:n], v2[:n])
un, ind = np.unique(arr, return_inverse=True)
freq = np.bincount(ind)
atleast2 = freq > 1
return un[atleast2], freq[atleast2]
2 changes: 0 additions & 2 deletions scipy/stats/bento.info
Expand Up @@ -8,8 +8,6 @@ Library:
statlib/swilk.f
Extension: statlib
Sources: statlib.pyf
Extension: futil
Sources: futil.f
Extension: mvn
Sources: mvn.pyf, mvndst.f
Extension: vonmises_cython
Expand Down
2 changes: 0 additions & 2 deletions scipy/stats/bscript
Expand Up @@ -12,5 +12,3 @@ def pre_build(context):
use="statlibimp CLIB")
context.tweak_extension("mvn", features="c fc cshlib pyext bento f2py",
use="CLIB")
context.tweak_extension("futil", features="c fc cshlib pyext bento f2py f2py_fortran",
use="CLIB")
130 changes: 0 additions & 130 deletions scipy/stats/futil.f

This file was deleted.

12 changes: 4 additions & 8 deletions scipy/stats/mstats_basic.py
Expand Up @@ -47,8 +47,8 @@

from . import distributions
import scipy.special as special
from . import futil
from ._stats_mstats_common import (
_find_repeats,
linregress as stats_linregress,
theilslopes as stats_theilslopes
)
Expand Down Expand Up @@ -137,7 +137,8 @@ def argstoarray(*args):

def find_repeats(arr):
"""Find repeats in arr and return a tuple (repeats, repeat_count).
Masked values are discarded.
The input is cast to float64. Masked values are discarded.
Parameters
----------
Expand All @@ -152,12 +153,7 @@ def find_repeats(arr):
Array of counts.
"""
marr = ma.compressed(arr)
if not marr.size:
return (np.array(0), np.array(0))

(v1, v2, n) = futil.dfreps(ma.array(ma.compressed(arr), copy=True))
return (v1[:n], v2[:n])
return _find_repeats(ma.compressed(arr))


def count_tied_groups(x, use_missing=False):
Expand Down
5 changes: 0 additions & 5 deletions scipy/stats/setup.py
Expand Up @@ -31,11 +31,6 @@ def configuration(parent_package='',top_path=None):
sources=['_rank.c'], # FIXME: use cython source
)

# add futil module
config.add_extension('futil',
sources=['futil.f'],
)

# add mvn module
config.add_extension('mvn',
sources=['mvn.pyf','mvndst.f'],
Expand Down
38 changes: 37 additions & 1 deletion scipy/stats/stats.py
Expand Up @@ -183,7 +183,7 @@
from . import distributions
from . import mstats_basic
from ._distn_infrastructure import _lazywhere
from ._stats_mstats_common import find_repeats, linregress, theilslopes
from ._stats_mstats_common import _find_repeats, linregress, theilslopes

from ._rank import rankdata, tiecorrect

Expand Down Expand Up @@ -5114,6 +5114,42 @@ def f_value_multivariate(ER, EF, dfnum, dfden):
# SUPPORT FUNCTIONS #
#####################################

def find_repeats(arr):
"""
Find repeats and repeat counts.
Parameters
----------
arr : array_like
Input array. This is cast to float64.
Returns
-------
values : ndarray
The unique values from the (flattened) input that are repeated.
counts : ndarray
Number of times the corresponding 'value' is repeated.
Notes
-----
In numpy >= 1.9 `numpy.unique` provides similar functionality. The main
difference is that `find_repeats` only returns repeated values.
Examples
--------
>>> from scipy import stats
>>> stats.find_repeats([2, 1, 2, 3, 2, 2, 5])
RepeatedResults(values=array([ 2.]), counts=array([4]))
>>> stats.find_repeats([[10, 20, 1, 2], [5, 5, 4, 4]])
RepeatedResults(values=array([ 4., 5.]), counts=array([2, 2]))
"""
RepeatedResults = namedtuple('RepeatedResults', ('values', 'counts'))
return RepeatedResults(*_find_repeats(np.array(arr, dtype=np.float64)))


@np.deprecate(message="scipy.stats.ss is deprecated in scipy 0.17.0")
def ss(a, axis=0):
return _sum_of_squares(a, axis)
Expand Down
5 changes: 4 additions & 1 deletion scipy/stats/tests/test_mstats_basic.py
Expand Up @@ -1180,11 +1180,14 @@ def test_find_repeats(self):
tmp = np.asarray([1,1,2,2,3,3,3,4,4,4,4,5,5,5,5]).astype('float')
mask = (tmp == 5.)
xm = np.ma.array(tmp, mask=mask)
x_orig, xm_orig = x.copy(), xm.copy()

r = stats.find_repeats(x)
rm = stats.mstats.find_repeats(xm)

assert_equal(r,rm)
assert_equal(r, rm)
assert_equal(x, x_orig)
assert_equal(xm, xm_orig)

def test_kendalltau(self):
for n in self.get_n():
Expand Down
8 changes: 4 additions & 4 deletions scipy/stats/tests/test_stats.py
Expand Up @@ -751,10 +751,10 @@ def test_basic(self):

def test_empty_result(self):
# Check that empty arrays are returned when there are no repeats.
a = [10, 20, 50, 30, 40]
repeated, counts = stats.find_repeats(a)
assert_array_equal(repeated, [])
assert_array_equal(counts, [])
for a in [[10, 20, 50, 30, 40], []]:
repeated, counts = stats.find_repeats(a)
assert_array_equal(repeated, [])
assert_array_equal(counts, [])


class TestRegression(TestCase):
Expand Down

0 comments on commit 5e5c455

Please sign in to comment.