Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

ENH: Cythonize stats.rankdata and stats.tiecorrect

  • Loading branch information...
commit ccd2f7d2e9a8dd09fd796ecd894b8401c5b6f851 1 parent 45791eb
@WarrenWeckesser WarrenWeckesser authored pv committed
View
5 scipy/stats/SConscript
@@ -32,5 +32,8 @@ env.NumpyPythonExtension('futil', source = futil_src + ['futil.f'])
# mvn extension
env.NumpyPythonExtension('mvn', source = ['mvn.pyf', 'mvndst.f'])
-# vonmisses_cython extension
+# vonmises_cython extension
env.NumpyPythonExtension('vonmises_cython', source = ['vonmises_cython.c'])
+
+# _rank extension
+env.NumpyPythonExtension('_rank', source = ['_rank.c'])
View
7,355 scipy/stats/_rank.c
7,355 additions, 0 deletions not shown
View
179 scipy/stats/_rank.pyx
@@ -0,0 +1,179 @@
+"""
+Some functions related to ranking data, implemented in Cython for speed.
+This file uses fused types, so Cython version 0.16 or later is required.
+
+This module provides the following functions:
+ rankdata
+ Ranks the data, dealing with ties appropriately.
+ tiecorrect
+ Tie correction factor for the Mann-Whitney U test (stats.mannwhitnyu)
+ and the Kruskal-Wallis H test (stats.kruskal).
+
+"""
+
+import numpy as _np
+cimport numpy as np
+cimport cython
+
+
+ctypedef fused array_data_type:
+ np.int64_t
+ np.uint64_t
+ np.float64_t
+
+
+# The function _rankdata_fused uses the fused data type, array_data_type,
+# so that arrays of unsigned integers, signed integers, floats are handled
+# separately. An alternative would be to convert any input array to, say,
+# float64, but that would result in spurious ties when given large integers.
+# For example float(2**60) == float(2**60 + 1), so the values 2**60 and
+# 2**60 + 1 would be assigned the same rank if the input array was first
+# converted to floats.
+
+@cython.boundscheck(False)
+@cython.cdivision(True)
+cdef _rankdata_fused(np.ndarray[array_data_type, ndim=1] b):
+ cdef unsigned int i, j, n, sumranks, dupcount, inext
+ cdef np.ndarray[np.int_t, ndim=1] ivec
+ cdef np.ndarray[array_data_type, ndim=1] svec
+ cdef np.ndarray[np.float64_t, ndim=1] ranks
+ cdef double averank
+
+ n = b.size
+ ivec = _np.argsort(b)
+ svec = b[ivec]
+ sumranks = 0
+ dupcount = 0
+ ranks = _np.zeros(n, float)
+ for i in xrange(n):
+ sumranks += i
+ dupcount += 1
+ inext = i + 1
+ if i == n-1 or svec[i] != svec[inext]:
+ averank = sumranks / float(dupcount) + 1
+ for j in xrange(inext - dupcount, inext):
+ ranks[ivec[j]] = averank
+ sumranks = 0
+ dupcount = 0
+ return ranks
+
+
+@cython.boundscheck(False)
+@cython.cdivision(True)
+def rankdata(a):
+ """
+ rankata(a)
+
+ Assign ranks to the data in `a`, dealing with ties appropriately.
+
+ Equal values are assigned a rank that is the average of the ranks that
+ would have been otherwise assigned to all of the values within that set.
+ Ranks begin at 1.
+
+ Parameters
+ ----------
+ a : array_like
+ This array is first flattened.
+
+ Returns
+ -------
+ ranks : ndarray
+ An array of length equal to the size of `a`, containing rank scores.
+
+ Notes
+ -----
+ All floating point types are converted to numpy.float64 before ranking.
+ This may result in spurious ties if the input array has a wider data
+ type than numpy.float64.
+
+ Examples
+ --------
+ >>> rankdata([0, 2, 3, 2])
+ array([ 1. , 2.5, 4. , 2.5])
+
+ """
+ cdef np.ndarray[np.int64_t, ndim=1] x_int64
+ cdef np.ndarray[np.uint64_t, ndim=1] x_uint64
+ cdef np.ndarray[np.float64_t, ndim=1] x_float64
+
+ cdef np.ndarray b = _np.ravel(_np.asarray(a))
+
+ if _np.issubdtype(b.dtype, _np.unsignedinteger):
+ x_uint64 = b.astype(_np.uint64)
+ ranks = _rankdata_fused(x_uint64)
+ elif _np.issubdtype(b.dtype, _np.integer):
+ x_int64 = b.astype(_np.int64)
+ ranks = _rankdata_fused(x_int64)
+ else:
+ x_float64 = b.astype(_np.float64)
+ ranks = _rankdata_fused(x_float64)
+
+ return ranks
+
+
+@cython.boundscheck(False)
+@cython.cdivision(True)
+def tiecorrect(np.ndarray[np.float64_t, ndim=1] rankvals):
+ """
+ tiecorrect(rankvals)
+
+ Tie correction factor for ties in the Mann-Whitney U and
+ Kruskal-Wallis H tests.
+
+ Parameters
+ ----------
+ rankvals : 1-d ndarray of type np.float64
+ For efficiency, this function requires a numpy array as its
+ argument. This is not a signficant inconvenience, because its
+ primary use is in the mannwhitneyu and kruskal functions, where
+ it is called with the result of stats.rankdata.
+
+ Returns
+ -------
+ T : float
+ Correction factor for U or H.
+
+ See Also
+ --------
+ rankdata : Assign ranks to the data
+ mannwhitney : Mann-Whitney rank test
+ kruskal : Kruskal-Wallis H test
+
+ Examples
+ --------
+ >>> ranks = rankdata([1, 3, 2, 4, 5, 7, 2, 8, 4])
+ >>> ranks
+ array([ 1. , 4. , 2.5, 5.5, 7. , 8. , 2.5, 9. , 5.5])
+ >>> tiecorrect(ranks)
+ 0.9833333333333333
+
+ References
+ ----------
+ .. [1] Siegel, S. (1956) Nonparametric Statistics for the Behavioral
+ Sciences. New York: McGraw-Hill.
+
+ """
+ cdef unsigned int i, inext, n, nties, T
+ cdef np.ndarray[np.int_t, ndim=1] posn
+ cdef np.ndarray[np.float64_t, ndim=1] sorted
+ cdef double t
+
+ posn = _np.argsort(rankvals)
+ sorted = rankvals[posn]
+ n = len(sorted)
+ if n < 2:
+ return 1.0
+ T = 0
+ i = 0
+ while i < n - 1:
+ inext = i + 1
+ if sorted[i] == sorted[inext]:
+ nties = 1
+ while i < n - 1 and sorted[i] == sorted[inext]:
+ nties += 1
+ i += 1
+ inext += 1
+ T = T + nties**3 - nties
+ i = inext
+ t = T / float(n**3 - n)
+ return 1.0 - t
View
2  scipy/stats/bento.info
@@ -14,3 +14,5 @@ Library:
Sources: mvn.pyf, mvndst.f
Extension: vonmises_cython
Sources: vonmises_cython.c
+ Extension: _rank
+ Sources: _rank.c
View
5 scipy/stats/setup.py
@@ -24,6 +24,11 @@ def configuration(parent_package='',top_path=None):
sources=['vonmises_cython.c'], # FIXME: use cython source
)
+ # add _rank module
+ config.add_extension('_rank',
+ sources=['_rank.c'], # FIXME: use cython source
+ )
+
# add futil module
config.add_extension('futil',
sources=['futil.f'],
View
81 scipy/stats/stats.py
@@ -200,6 +200,7 @@
# Local imports.
import _support
from _support import _chk_asarray, _chk2_asarray
+from _rank import rankdata, tiecorrect
__all__ = ['find_repeats', 'gmean', 'hmean', 'cmedian', 'mode',
'tmean', 'tvar', 'tmin', 'tmax', 'tstd', 'tsem',
@@ -3494,45 +3495,6 @@ def mannwhitneyu(x, y, use_continuity=True):
return smallu, distributions.norm.sf(z) #(1.0 - zprob(z))
-def tiecorrect(rankvals):
- """
- Tie-corrector for ties in Mann Whitney U and Kruskal Wallis H tests.
-
- Parameters
- ----------
- rankvals : array_like
- Input values
-
- Returns
- -------
- T correction factor for U or H
-
- Notes
- -----
- Code adapted from \\|STAT rankind.c code.
-
- References
- ----------
- Siegel, S. (1956) Nonparametric Statistics for the Behavioral
- Sciences. New York: McGraw-Hill.
-
- """
- sorted,posn = fastsort(asarray(rankvals))
- n = len(sorted)
- T = 0.0
- i = 0
- while (i<n-1):
- if sorted[i] == sorted[i+1]:
- nties = 1
- while (i<n-1) and (sorted[i] == sorted[i+1]):
- nties = nties +1
- i = i +1
- T = T + nties**3 - nties
- i = i+1
- T = T / float(n**3-n)
- return 1.0 - T
-
-
def ranksums(x, y):
"""
Compute the Wilcoxon rank-sum statistic for two samples.
@@ -3986,44 +3948,3 @@ def fastsort(a):
it = np.argsort(a)
as_ = a[it]
return as_, it
-
-def rankdata(a):
- """
- Ranks the data, dealing with ties appropriately.
-
- Equal values are assigned a rank that is the average of the ranks that
- would have been otherwise assigned to all of the values within that set.
- Ranks begin at 1, not 0.
-
- Parameters
- ----------
- a : array_like
- This array is first flattened.
-
- Returns
- -------
- rankdata : ndarray
- An array of length equal to the size of `a`, containing rank scores.
-
- Examples
- --------
- >>> stats.rankdata([0, 2, 2, 3])
- array([ 1. , 2.5, 2.5, 4. ])
-
- """
- a = np.ravel(a)
- n = len(a)
- svec, ivec = fastsort(a)
- sumranks = 0
- dupcount = 0
- newarray = np.zeros(n, float)
- for i in xrange(n):
- sumranks += i
- dupcount += 1
- if i==n-1 or svec[i] != svec[i+1]:
- averank = sumranks / float(dupcount) + 1
- for j in xrange(i-dupcount+1,i+1):
- newarray[ivec[j]] = averank
- sumranks = 0
- dupcount = 0
- return newarray
View
132 scipy/stats/tests/test_rank.py
@@ -0,0 +1,132 @@
+
+import numpy as np
+from numpy.testing import TestCase, run_module_suite, assert_equal, \
+ assert_array_equal
+
+from scipy.stats import rankdata, tiecorrect
+
+
+class TestTieCorrect(TestCase):
+
+ def test_empty(self):
+ """An empty array requires no correction, should return 1.0."""
+ ranks = np.array([], dtype=np.float64)
+ c = tiecorrect(ranks)
+ assert_equal(c, 1.0)
+
+ def test_one(self):
+ """A single element requires no correction, should return 1.0."""
+ ranks = np.array([1.0], dtype=np.float64)
+ c = tiecorrect(ranks)
+ assert_equal(c, 1.0)
+
+ def test_no_correction(self):
+ """Arrays with no ties require no correction."""
+ ranks = np.arange(2.0)
+ c = tiecorrect(ranks)
+ assert_equal(c, 1.0)
+ ranks = np.arange(3.0)
+ c = tiecorrect(ranks)
+ assert_equal(c, 1.0)
+
+ def test_basic(self):
+ """Check a few basic examples of the tie correction factor."""
+ # One tie of two elements
+ ranks = np.array([1.0, 2.5, 2.5])
+ c = tiecorrect(ranks)
+ T = 2.0
+ N = ranks.size
+ expected = 1.0 - (T**3 - T) / (N**3 - N)
+ assert_equal(c, expected)
+
+ # One tie of two elements (same as above, but tie is not at the end)
+ ranks = np.array([1.5, 1.5, 3.0])
+ c = tiecorrect(ranks)
+ T = 2.0
+ N = ranks.size
+ expected = 1.0 - (T**3 - T) / (N**3 - N)
+ assert_equal(c, expected)
+
+ # One tie of three elements
+ ranks = np.array([1.0, 3.0, 3.0, 3.0])
+ c = tiecorrect(ranks)
+ T = 3.0
+ N = ranks.size
+ expected = 1.0 - (T**3 - T) / (N**3 - N)
+ assert_equal(c, expected)
+
+ # Two ties, lengths 2 and 3.
+ ranks = np.array([1.5, 1.5, 4.0, 4.0, 4.0])
+ c = tiecorrect(ranks)
+ T1 = 2.0
+ T2 = 3.0
+ N = ranks.size
+ expected = 1.0 - ((T1**3 - T1) + (T2**3 - T2)) / (N**3 - N)
+ assert_equal(c, expected)
+
+
+class TestRankData(TestCase):
+
+ def test_empty(self):
+ """stats.rankdata([]) should return an empty array."""
+ a = np.array([], dtype=np.int)
+ r = rankdata(a)
+ assert_array_equal(r, np.array([], dtype=np.float64))
+ r = rankdata([])
+ assert_array_equal(r, np.array([], dtype=np.float64))
+
+ def test_one(self):
+ """Check stats.rankdata with an array of length 1."""
+ data = [100]
+ a = np.array(data, dtype=np.int)
+ r = rankdata(a)
+ assert_array_equal(r, np.array([1.0], dtype=np.float64))
+ r = rankdata(data)
+ assert_array_equal(r, np.array([1.0], dtype=np.float64))
+
+ def test_basic(self):
+ """Basic tests of stats.rankdata."""
+ data = [100, 10, 50]
+ expected = np.array([3.0, 1.0, 2.0], dtype=np.float64)
+ a = np.array(data, dtype=np.int)
+ r = rankdata(a)
+ assert_array_equal(r, expected)
+ r = rankdata(data)
+ assert_array_equal(r, expected)
+
+ data = [40, 10, 30, 10, 50]
+ expected = np.array([4.0, 1.5, 3.0, 1.5, 5.0], dtype=np.float64)
+ a = np.array(data, dtype=np.int)
+ r = rankdata(a)
+ assert_array_equal(r, expected)
+ r = rankdata(data)
+ assert_array_equal(r, expected)
+
+ data = [20, 20, 20, 10, 10, 10]
+ expected = np.array([5.0, 5.0, 5.0, 2.0, 2.0, 2.0], dtype=np.float64)
+ a = np.array(data, dtype=np.int)
+ r = rankdata(a)
+ assert_array_equal(r, expected)
+ r = rankdata(data)
+ assert_array_equal(r, expected)
+ # The docstring states explicitly that the argument is flattened.
+ a2d = a.reshape(2, 3)
+ r = rankdata(a2d)
+ assert_array_equal(r, expected)
+
+ def test_large_int(self):
+ data = np.array([2**60, 2**60+1], dtype=np.uint64)
+ r = rankdata(data)
+ assert_array_equal(r, [1.0, 2.0])
+
+ data = np.array([2**60, 2**60+1], dtype=np.int64)
+ r = rankdata(data)
+ assert_array_equal(r, [1.0, 2.0])
+
+ data = np.array([2**60, -2**60+1], dtype=np.int64)
+ r = rankdata(data)
+ assert_array_equal(r, [2.0, 1.0])
+
+
+if __name__ == "__main__":
+ run_module_suite()
Please sign in to comment.
Something went wrong with that request. Please try again.