
BUG: fix precision of survival function of hypergeometric distribution. Closes #1218.

1 parent 32f9e3d commit ad0901ae1ed1480dcc6560ed6a714635b27cc84a @rgommers committed Jun 5, 2011
Showing with 71 additions and 33 deletions.
  1. +38 −19 scipy/stats/distributions.py
  2. +1 −0 scipy/stats/stats.py
  3. +32 −14 scipy/stats/tests/test_distributions.py
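The precision problem behind this fix: when the true upper-tail probability is far below machine epsilon, 1 - cdf(k) returns 0 (or rounding noise) because the cdf has already rounded to 1.0 in double precision, while summing the pmf over the tail keeps the tiny result representable. A minimal sketch of the effect, using parameters taken from the new test_precision2 below (illustrative only; not part of this commit):

import numpy as np
from scipy import stats

# Values from the new test: ~2.09e5 fruits in total, 9.9e4 oranges,
# 3.8e4 fruits eaten; the true P(X > 2e4) is about 1.9e-114.
M, n, N, k = 209000, 99000, 38000, 20000

naive = 1.0 - stats.hypergeom.cdf(k, M, n, N)  # cdf rounds to 1.0, so this is ~0
print(naive)

# Summing the pmf over the upper tail, as the new _sf method does below.
k2 = np.arange(k + 1, N + 1)
print(stats.hypergeom.pmf(k2, M, n, N).sum())  # ~1.9e-114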
@@ -309,11 +309,10 @@ def instancemethod(func, obj, cls):
docdict_discrete['example'] = _doc_default_example.replace('[0.9,]',
'Replace with reasonable value')
-_doc_default_before_notes = ''.join([_doc_default_longsummary,
- _doc_allmethods,
- _doc_default_callparams,
- _doc_default_frozen_note])
-
+_doc_default_before_notes = ''.join([docdict_discrete['longsummary'],
+ docdict_discrete['allmethods'],
+ docdict_discrete['callparams'],
+ docdict_discrete['frozennote']])
docdict_discrete['before_notes'] = _doc_default_before_notes
_doc_default_disc = ''.join([docdict_discrete['longsummary'],
@@ -5780,7 +5779,6 @@ def pmf(self, k,*args, **kwds):
"""
Probability mass function at k of the given RV.
-
Parameters
----------
k : array_like
@@ -6593,6 +6591,26 @@ def _stats(self, pr):
## Hypergeometric distribution
class hypergeom_gen(rv_discrete):
+ """A hypergeometric discrete random variable.
+
+ The hypergeometric distribution models drawing objects from a bin.
+ M is the total number of objects, n is the total number of Type I objects.
+ The random variate represents the number of Type I objects in N drawn
+ without replacement from the total population.
+
+ %(before_notes)s
+
+ Notes
+ -----
+ The probability mass function is defined as::
+
+ pmf(k, M, n, N) = choose(n, k) * choose(M - n, N - k) / choose(M, N),
+ for N - (M-n) <= k <= min(n, N)
+
+ %(example)s
+
+ """
+
def _rvs(self, M, n, N):
return mtrand.hypergeometric(n,M-n,N,size=self._size)
def _argcheck(self, M, n, N):
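A quick numerical check of the pmf formula above (a small worked example, not part of the diff; the values M=10, n=3, N=5, k=1 are chosen arbitrarily):

from math import comb  # Python 3.8+

M, n, N, k = 10, 3, 5, 1  # 10 objects, 3 of Type I, 5 drawn, probability of exactly 1 Type I
pmf = comb(n, k) * comb(M - n, N - k) / comb(M, N)
print(pmf)  # 3 * 35 / 252 = 0.41666..., matching stats.hypergeom.pmf(1, 10, 3, 5)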
@@ -6634,20 +6652,21 @@ def _entropy(self, M, n, N):
vals = self.pmf(k,M,n,N)
lvals = where(vals==0.0,0.0,log(vals))
return -sum(vals*lvals,axis=0)
-hypergeom = hypergeom_gen(name='hypergeom',longname="A hypergeometric",
- shapes="M, n, N", extradoc="""
-
-Hypergeometric distribution
+ def _sf(self, k, M, n, N):
+ """More precise calculation, 1 - cdf doesn't cut it."""
+ # This for loop is needed because `k` can be an array. If that's the
+ # case, the sf() method makes M, n and N arrays of the same shape. We
+ # therefore unpack all input args, so we can do the manual integration.
+ res = []
+ for quant, tot, good, draw in zip(k, M, n, N):
+ # Manual integration over probability mass function. More accurate
+ # than integrate.quad.
+ k2 = np.arange(quant + 1, draw + 1)
+ res.append(np.sum(self._pmf(k2, tot, good, draw)))
+ return np.asarray(res)
+
+hypergeom = hypergeom_gen(name='hypergeom', shapes="M, n, N")
- Models drawing objects from a bin.
- M is total number of objects, n is total number of Type I objects.
- RV counts number of Type I objects in N drawn without replacement from
- population.
-
- hypergeom.pmf(k, M, n, N) = choose(n,k)*choose(M-n,N-k)/choose(M,N)
- for N - (M-n) <= k <= min(m,N)
-"""
- )
## Logarithmic (Log-Series), (Series) distribution
# FIXME: Fails _cdfvec
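The new _sf above sums the pmf from k+1 through N instead of computing 1 - cdf. The same idea as a standalone function (a sketch for illustration; the name hypergeom_sf_by_summation is made up and is not part of scipy):

import numpy as np
from scipy import stats

def hypergeom_sf_by_summation(k, M, n, N):
    # P(X > k), summing the pmf over the upper tail term by term,
    # mirroring the loop in the new _sf method.
    k, M, n, N = np.broadcast_arrays(*map(np.atleast_1d, (k, M, n, N)))
    res = []
    for quant, tot, good, draw in zip(k, M, n, N):
        k2 = np.arange(quant + 1, draw + 1)
        res.append(stats.hypergeom.pmf(k2, tot, good, draw).sum())
    return np.asarray(res)

# Agrees with stats.hypergeom.sf after this change, e.g. for the test values below:
print(hypergeom_sf_by_summation(20000, 209000, 99000, 42000))  # ~[0.1237904]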
@@ -2242,6 +2242,7 @@ def fisher_exact(table) :
--------
>>> fisher_exact([[100, 2], [1000, 5]])
(0.25, 0.13007593634330314)
+
"""
hypergeom = distributions.hypergeom
c = np.asarray(table, dtype=np.int64) # int32 is not enough for the algorithm
@@ -4,7 +4,7 @@
from numpy.testing import TestCase, run_module_suite, assert_equal, \
assert_array_equal, assert_almost_equal, assert_array_almost_equal, \
- assert_, rand, dec
+ assert_allclose, assert_, rand, dec
import numpy
@@ -195,6 +195,37 @@ def test_rvs(self):
assert_(isinstance(val, numpy.ndarray))
assert_(val.dtype.char in typecodes['AllInteger'])
+ def test_precision(self):
+ # comparison number from mpmath
+ M = 2500
+ n = 50
+ N = 500
+ tot = M
+ good = n
+ hgpmf = stats.hypergeom.pmf(2, tot, good, N)
+ assert_almost_equal(hgpmf, 0.0010114963068932233, 11)
+
+ def test_precision2(self):
+ """Test hypergeom precision for large numbers. See #1218."""
+ # Results compared with those from R.
+ oranges = 9.9e4
+ pears = 1.1e5
+ fruits_eaten = np.array([3, 3.8, 3.9, 4, 4.1, 4.2, 5]) * 1e4
+ quantile = 2e4
+ res = []
+ for eaten in fruits_eaten:
+ res.append(stats.hypergeom.sf(quantile, oranges + pears, oranges, eaten))
+ expected = np.array([0, 1.904153e-114, 2.752693e-66, 4.931217e-32,
+ 8.265601e-11, 0.1237904, 1])
+ assert_allclose(res, expected, atol=0, rtol=5e-7)
+
+ # Test with array_like first argument
+ quantiles = [1.9e4, 2e4, 2.1e4, 2.15e4]
+ res2 = stats.hypergeom.sf(quantiles, oranges + pears, oranges, 4.2e4)
+ expected2 = [1, 0.1237904, 6.511452e-34, 3.277667e-69]
+ assert_allclose(res2, expected2, atol=0, rtol=5e-7)
+
+
class TestLogser(TestCase):
def test_rvs(self):
vals = stats.logser.rvs(0.75, size=(2, 50))
@@ -343,19 +374,6 @@ def test_pdf(self):
assert_almost_equal(pdf, 0.1620358)
-class TestHypergeom(TestCase):
- def test_precision(self):
- # comparison number from mpmath
- M = 2500
- n = 50
- N = 500
- tot = M
- good = n
- hgpmf = stats.hypergeom.pmf(2, tot, good, N)
-
- assert_almost_equal(hgpmf, 0.0010114963068932233, 11)
-
-
class TestChi2(TestCase):
# regression tests after precision improvements, ticket:1041, not verified
def test_precision(self):
