BUG: Correct intermediate overflow in KS one asymptotic in SciPy.stats

Overflow can occur for 6*n when integer n is large. Change the use of exp to expm1 when appropriate. See #17775
scipy · Jan 13, 2023 · 5c445c0 · 5c445c0
1 parent c68b1a0
commit 5c445c0
Show file tree

Hide file tree

Showing 2 changed files with 44 additions and 3 deletions.
diff --git a/scipy/special/cephes/kolmogorov.c b/scipy/special/cephes/kolmogorov.c
@@ -774,9 +774,15 @@ _smirnov(int n, double x)
     /* Special case:  n is so big, take too long to compute */
     if (n > SMIRNOV_MAX_COMPUTE_N) {
         /* p ~ e^(-(6nx+1)^2 / 18n) */
-        double logp = -pow(6*n*x+1.0, 2)/18.0/n;
-        sf = exp(logp);
-        cdf = 1 - sf;
+        double logp = -pow(6.0*n*x+1.0, 2)/18.0/n;
+        /* Maximise precision for small p-value. */
+        if (logp < -M_LN2) {
+            sf = exp(logp);
+            cdf = 1 - sf;
+        } else {
+            cdf = -expm1(logp);
+            sf = 1 - cdf;
+        }
         pdf = (6 * nx + 1) * 2 * sf/3;
         RETURN_3PROBS(sf, cdf, pdf);
     }

diff --git a/scipy/stats/tests/test_continuous_basic.py b/scipy/stats/tests/test_continuous_basic.py
@@ -363,6 +363,41 @@ def test_rvs_broadcast(dist, shape_args):
     check_rvs_broadcast(distfunc, dist, allargs, bshape, shape_only, 'd')
 
 
+def test_gh17775_regression():
+    # Regression test for gh-17775. In scipy 1.9.3 and earlier, these tests would fail.
+    # KS one asymptotic sf ~ e^(-(6nx+1)^2 / 18n)
+    # Given a large 32-bit integer n, 6n will overflow in the c implementation.
+    ks = stats.ksone
+    n = 1000000000 # 10**9
+    # Tests use 1/n < x < 1-1/n. Larger x has a smaller sf.
+    # Expected values computed using mpmath to 50 dps and output at 20.
+    # sf(x) ~ cdf(x) ~ 0.5
+    x = 2.0e-5
+    sf, cdf, pdf = ks.sf(x, n), ks.cdf(x, n), ks.pdf(x, n)
+    npt.assert_equal(sf + cdf, 1.0)
+    npt.assert_almost_equal(sf, 0.44932297307934442379)  # noqa, NOT 0.9374359693473666
+    npt.assert_almost_equal(cdf, 0.55067702692065557621) # noqa, NOT 0.06256403065263338
+    npt.assert_almost_equal(pdf, 35946.137394996276407)  # noqa, NOT 74995.50250510222
+    # Small values require ulp comparison.
+    # The actual is limited by computing at double precision (~16 digits).
+    x = 2.0e-9
+    sf, cdf, pdf = ks.sf(x, n), ks.cdf(x, n), ks.pdf(x, n)
+    npt.assert_equal(sf + cdf, 1.0)
+    a = np.array([sf, cdf, pdf])
+    e = np.array([0.99999999061111115519, 9.3888888448132728224e-9, 8.6666665852962971765])
+    # noqa, NOT 0.999999998919518, 1.0804820371745905e-09, 8.666666657302489
+    npt.assert_array_almost_equal_nulp(a, e, 256) # 45-bit precision
+    x = 5.0e-4
+    sf, cdf, pdf = ks.sf(x, n), ks.cdf(x, n), ks.pdf(x, n)
+    npt.assert_equal(sf + cdf, 1.0)
+    a = np.array([sf, cdf, pdf])
+    e = np.array([7.1222019433090374624e-218, 1.0, 1.4244408634752704094e-211])
+    # noqa, NOT 2.914041108584085e-18, 1.0, 5.8280841598622425e-12
+    npt.assert_array_almost_equal_nulp(a, e, 256) # 45-bit precision
+    # Check inverting the small sf
+    npt.assert_equal(ks.isf(sf, n), x)
+
+
 def test_rvs_gh2069_regression():
     # Regression tests for gh-2069.  In scipy 0.17 and earlier,
     # these tests would fail.