scipy · WarrenWeckesser · Jun 28, 2020 · May 30, 2020 · Jun 27, 2020 · WarrenWeckesser
diff --git a/scipy/stats/stats.py b/scipy/stats/stats.py
@@ -6702,7 +6702,7 @@ def ks_2samp(data1, data2, alternative='two-sided', mode='auto'):
     cdf1 = np.searchsorted(data1, data_all, side='right') / n1
     cdf2 = np.searchsorted(data2, data_all, side='right') / n2
     cddiffs = cdf1 - cdf2
-    minS = -np.min(cddiffs)
+    minS = np.clip(-np.min(cddiffs), 0, 1)  # Ensure sign of minS is not negative.
     maxS = np.max(cddiffs)
     alt2Dvalue = {'less': minS, 'greater': maxS, 'two-sided': max(minS, maxS)}
     d = alt2Dvalue[alternative]
@@ -6718,8 +6718,8 @@ def ks_2samp(data1, data2, alternative='two-sided', mode='auto'):
         if n1g >= np.iinfo(np.int).max / n2g:
             mode = 'asymp'
             warnings.warn(
-                "Exact ks_2samp calculation not possible with samples sizes "
-                "%d and %d. Switching to 'asymp' " % (n1, n2), RuntimeWarning)
+                f"Exact ks_2samp calculation not possible with samples sizes "
+                f"{n1} and {n2}. Switching to 'asymp'.", RuntimeWarning)
 
     if mode == 'exact':
         success, d, prob = _attempt_exact_2kssamp(n1, n2, g, d, alternative)
@@ -6731,13 +6731,16 @@ def ks_2samp(data1, data2, alternative='two-sided', mode='auto'):
 
     if mode == 'asymp':
         # The product n1*n2 is large.  Use Smirnov's asymptoptic formula.
+        # Ensure float to avoid overflow in multiplication
+        # sorted because the one-sided formula is not symmetric in n1, n2
+        m, n = sorted(np.array([n1, n2], float), reverse=True)
-        m, n = sorted(np.array([n1, n2], float), reverse=True)
+        m, n = sorted([float(n1), float(n2)], reverse=True)
-        m, n = sorted(np.array([n1, n2], float), reverse=True)
+        m, n = sorted([float(n1), float(n2)], reverse=True)
+        en = m * n / (m + n)
         if alternative == 'two-sided':
-            en = n1 * n2 / (n1 + n2)
             prob = distributions.kstwo.sf(d, np.round(en))
         else:
-            m, n = max(n1, n2), min(n1, n2)
-            z = np.sqrt(m*n/(m+n)) * d
+            z = np.sqrt(en) * d
             # Use Hodges' suggested approximation Eqn 5.3
+            # Requires m to be the larger of (n1, n2)
             expt = -2 * z**2 - 2 * z * (m + 2*n)/np.sqrt(m*n*(m+n))/3.0
             prob = np.exp(expt)
 

diff --git a/scipy/stats/tests/test_stats.py b/scipy/stats/tests/test_stats.py
@@ -3265,6 +3265,18 @@ def test_argument_checking(self):
         assert_raises(ValueError, stats.ks_2samp, [1], [])
         assert_raises(ValueError, stats.ks_2samp, [], [])
 
+    def test_gh12218(self):
+        """Ensure gh-12218 is fixed."""
+        # gh-1228 triggered a TypeError calculating sqrt(n1*n2*(n1+n2)).
+        # n1, n2 both large integers, the product exceeded 2^64
+        np.random.seed(12345678)
+        n1 = 2097152  # 2*^21
+        rvs1 = stats.uniform.rvs(size=n1, loc=0., scale=1)
+        rvs2 = rvs1 + 1  # Exact value of rvs2 doesn't matter.
+        stats.ks_2samp(rvs1, rvs2, alternative='greater', mode='asymp')
+        stats.ks_2samp(rvs1, rvs2, alternative='less', mode='asymp')
+        stats.ks_2samp(rvs1, rvs2, alternative='two-sided', mode='asymp')
+
 
 def test_ttest_rel():
     # regression test