scipy · steppi · Aug 4, 2023 · Aug 4, 2020 · Aug 5, 2020 · Aug 5, 2020
diff --git a/THANKS.txt b/THANKS.txt
@@ -241,6 +241,8 @@ Wesley Alves for improvements to scipy.stats.jarque_bera and scipy.stats.shapiro
 Mark Borgerding for contributing linalg.convolution_matrix.
 Shashaank N for contributions to scipy.signal.
 Frank Torres for fixing a bug with solve_bvp for large problems.
+Romain Jacob for non-parametric confidence intervals for quantiles
+    added in scipy.stats.
 Ben West for updating the Gamma distribution documentation.
 
 Institutions

diff --git a/scipy/stats/__init__.py b/scipy/stats/__init__.py
@@ -201,6 +201,7 @@
    entropy
    median_absolute_deviation
    median_abs_deviation
+   confint_quantile
 
 Frequency statistics
 ====================

diff --git a/scipy/stats/stats.py b/scipy/stats/stats.py
@@ -179,14 +179,14 @@
 from scipy import linalg
 from . import distributions
 from . import mstats_basic
+from ._discrete_distns import binom
 from ._stats_mstats_common import (_find_repeats, linregress, theilslopes,
                                    siegelslopes)
 from ._stats import (_kendall_dis, _toint64, _weightedrankedtau,
                      _local_correlations)
 from ._rvs_sampling import rvs_ratio_uniforms
 from ._hypotests import epps_singleton_2samp
 
-
 __all__ = ['find_repeats', 'gmean', 'hmean', 'mode', 'tmean', 'tvar',
            'tmin', 'tmax', 'tstd', 'tsem', 'moment', 'variation',
            'skew', 'kurtosis', 'describe', 'skewtest', 'kurtosistest',
@@ -207,6 +207,7 @@
            'kstest', 'ks_1samp', 'ks_2samp',
            'chisquare', 'power_divergence', 'mannwhitneyu',
            'tiecorrect', 'ranksums', 'kruskal', 'friedmanchisquare',
+           'confint_quantile',
            'rankdata', 'rvs_ratio_uniforms',
            'combine_pvalues', 'wasserstein_distance', 'energy_distance',
            'brunnermunzel', 'epps_singleton_2samp']
@@ -7469,6 +7470,216 @@ def brunnermunzel(x, y, alternative="two-sided", distribution="t",
     return BrunnerMunzelResult(wbfn, p)
 
 
+def _confint_lowerbound(n, quantile, confidence):
+    r"""
+    Compute the lower bound for a one-sided confidence interval
+    for a given
+    - quantile (0<`quantile`<1)
+    - confidence level (0<`confidence`<1)
+    - number of samples `n`.
+
+    Returns the largest (0-based) sample index that is a valid lower bound,
+    as an `int`, or `None` if there are not enough samples to derive one.
+
+    Used by the public function confint_quantile().
+
+    .. versionadded:: 1.6.0
+    """
+
+    # binomial distribution with `n` trials and success probability `quantile`
+    bd = binom(n, quantile)
+
+    # the lower bound is the last index before the inverse survival function
+    # value for the target confidence level
+    lb = bd.isf(confidence) - 1
+
+    if lb < 0:  # isf(confidence) was below 1: no index is left before it
+        return None
+    else:
+        return int(lb)
+
+
+def confint_quantile(x, quantile, confidence, type='one-sided'):
+    r"""Compute non-parametric confidence intervals for any quantile.
+
+    This function implements a non-parametric approach to compute
+    confidence intervals for quantiles. The approach is attributed to Thompson [1]_
+    and later proven to be applicable to any set of i.i.d. samples [2]_.
+    The computation is based on the observation that the probability of a quantile
+    :math:`q` to be larger than any sample :math:`x_m (1\leq m \leq N)`
+    can be computed as
+
+    .. math::
+
+        \mathbb{P}(x_m \leq q) = 1 - \sum_{k=0}^{m-1} \binom{N}{k} q^k(1-q)^{N-k}
+
+    Furthermore, these probabilities are symmetric, which allows computing both
+    upper and lower bounds from the same computation:
+
+    .. math::
+
+        \mathbb{P}(x_m \leq q) = \mathbb{P}(x_{N-m+1} \geq 1-q).
+
+    The function computes confidence intervals for a given quantile and
+    confidence level, based on `x` which is either a set of samples
+    (one-dimensional array_like) or the number of samples available.
+    The confidence intervals are valid if and only if the samples are i.i.d.
+
+    Both one-sided and two-sided confidence intervals can be obtained
+    (default is one-sided). The function returns two values: either the bounds for the two one-sided
+    confidence intervals, or the lower and upper bounds of a two-sided confidence interval.
+    The return values are either the indexes of the bounds (if `x` is an integer) or
+    sample values (if `x` is the set of samples).
+    `None` is returned when there are not enough samples to compute
+    the desired confidence interval.
+
+    There is no uniqueness of the two-sided confidence interval (see Notes below).
+    Without further assumption on the samples (e.g., the nature of the underlying distribution),
+    the one-sided intervals are optimally tight.
+
+    Parameters
+    ----------
+    x : array_like or int
+        Array of samples, should be one-dimensional.
+        If an integer (Python or NumPy), taken as the number of samples (strictly positive).
+    quantile : float
+        The quantile for which we want to compute the confidence interval.
+        Must be strictly between 0 and 1.
+    confidence : float
+        The desired confidence level of the confidence interval.
+        Must be strictly between 0 and 1.
+    type : {'one-sided', 'two-sided'}, optional
+        Defines the type of confidence interval computed.
+        Default is 'one-sided'.
+
+          * 'one-sided' : computes the best possible one-sided confidence intervals (both lower and upper bounds) for the given quantile.
+          * 'two-sided' : computes a two-sided confidence interval by combination of two one-sided intervals. E.g., a 90% two-sided interval is computed by combining two 95% one-sided intervals.
+
+    Returns
+    -------
+    LB : float or int or `None`
+        value or index of the lower bound of
+
+        * the right-open one-sided confidence interval (default, ``type='one-sided'``),
+        * a two-sided confidence interval (if ``type='two-sided'``)
+
+        `None` is returned when there are not enough samples to compute
+        the confidence interval with the desired level of confidence.
+    UB : float or int or `None`
+        value or index of the upper bound of
+
+        * the left-open one-sided confidence interval (default, ``type='one-sided'``),
+        * a two-sided confidence interval (if ``type='two-sided'``)
+
+        `None` is returned when there are not enough samples to compute
+        the confidence interval with the desired level of confidence.
+
+    Notes
+    -----
+    Two-sided confidence intervals are not guaranteed to be optimal.
+    I.e., there may exist a tighter interval that may contain the quantile
+    of interest with probability larger than the confidence level.
+    These intervals may be found by exhaustive search,
+    which we do not do for efficiency reasons.
+
+    References
+    ----------
+    .. [1] W. R. Thompson, "On Confidence Ranges for the Median and
+       Other Expectation Distributions for Populations of Unknown
+       Distribution Form," The Annals of Mathematical Statistics,
+       vol. 7, no. 3, pp. 122-128, 1936,
+       Accessed: Sep. 18, 2019. [Online].
+       Available: https://www.jstor.org/stable/2957563.
+    .. [2] H. A. David and H. N. Nagaraja, "Order Statistics in
+       Nonparametric Inference" in Order Statistics,
+       John Wiley & Sons, Ltd, 2005, pp. 159-170.
+
+
+    Examples
+    --------
+    >>> from scipy.stats import confint_quantile
+    >>> x = [2, 8, 3, 6, 4, 1, 5, 9, 7]
+    >>> confint_quantile(x, 0.5, 0.95)
+    (2, 8)
+
+    To compute a two-sided interval instead, use the `type` parameter.
+
+    >>> confint_quantile(x, 0.5, 0.99, type='two-sided')
+    (1, 9)
+
+    You can also pass the number of samples as argument (instead of the
+    samples themselves). The returned values are then the indexes of the lower
+    and upper bounds for the confidence intervals.
+
+    >>> N = 20
+    >>> confint_quantile(N, 0.75, 0.90)
+    (11, 17)
+
+
+    .. versionadded:: 1.6.0
+    """
+
+    ##
+    # Checking the inputs
+    #
+    # x can be either an integer (Python or NumPy) or a one-dimensional array-like
+    if isinstance(x, (int, np.integer)):
+        if x < 1:
+            raise ValueError("Invalid parameter: "+repr(x)+", `x` must be either a strictly positive integer or one-dimensional array-like.")
+        n = int(x)  # plain int, so that the returned indexes are Python ints
+        return_index = True  # The function will return the confint indexes
+    else:
+        x = np.asarray(x)
+        if x.ndim != 1:
+            raise ValueError("Invalid parameter: "+repr(x)+", `x` must be either a strictly positive integer or one-dimensional array-like.")
+        x = np.sort(x, axis=0)
+        n = x.shape[0]
+        return_index = False  # The function will return the confint as values of x
+    #
+    # `confidence` and `quantile` must be between 0 and 1
+    if confidence >= 1 or confidence <= 0:
+        raise ValueError("Invalid `confidence`: "+repr(confidence)+". Provide a real number strictly between 0 and 1.")
+    if quantile >= 1 or quantile <= 0:
+        raise ValueError("Invalid `quantile`: "+repr(quantile)+". Provide a real number strictly between 0 and 1.")
+    #
+    # `type` can be only `one-sided` or `two-sided`
+    if type not in ('one-sided', 'two-sided'):
+        raise ValueError("Invalid parameter: "+repr(type)+". Valid 'type' values: 'one-sided' or 'two-sided'")
+    ##
+
+    # Handle the type of intervals (one- or two-sided)
+    if type == 'two-sided':
+        conf_working = (1+confidence)/2
+    else:
+        # type == 'one-sided'
+        conf_working = confidence
+
+    # Compute the lower bound
+    LB = _confint_lowerbound(n, quantile, conf_working)
+
+    # Compute the upper bound
+    # -> deduced from the lower bound of (1-quantile)
+    lb = _confint_lowerbound(n, 1-quantile, conf_working)
+    if lb is None:
+        UB = None
+    else:
+        UB = ((n-1) - lb)   # First index is 0 (not 1), hence the -1
+
+    if return_index:
+        return LB, UB
+    else:
+        # Handle unfeasible bounds
+        if LB is None:
+            x_lb = None
+        else:
+            x_lb = x[LB]
+        if UB is None:
+            x_ub = None
+        else:
+            x_ub = x[UB]
+        return x_lb, x_ub
+
+
 def combine_pvalues(pvalues, method='fisher', weights=None):
     """
     Combine p-values from independent tests bearing upon the same hypothesis.

diff --git a/scipy/stats/tests/test_stats.py b/scipy/stats/tests/test_stats.py
@@ -5700,3 +5700,24 @@ def test_dist_perm(self):
                                                                random_state=1)
         assert_approx_equal(stat_dist, 0.163, significant=1)
         assert_approx_equal(pvalue_dist, 0.001, significant=1)
+
+class TestConfInt(object):
+    """Test non-parametric confidence intervals for quantiles."""
+    # Unsorted samples; once sorted they are 1..9, so value == index + 1.
+    X = array([2, 8, 3, 6, 4, 1, 5, 9, 7], float)
+
+    def test_index_equal_value(self):
+        assert_equal(stats.confint_quantile(self.X, 0.5, 0.9), (3.0, 7.0))
+        assert_equal(stats.confint_quantile(self.X.shape[0], 0.5, 0.9), (2, 6))
+
+    def test_twosided(self):
+        assert_equal(stats.confint_quantile(self.X.shape[0], 0.5, 0.9,
+                                            type='two-sided'), (1, 7))
+
+    def test_values(self):
+        N, q, c = 100, 0.75, 0.95
+        assert_equal(stats.confint_quantile(N, q, c), (67, 82))
+        N, q, c = 10, 0.75, 0.95
+        assert_equal(stats.confint_quantile(N, q, c), (4, None))
+        N, q, c = 20, 0.175, 0.95
+        assert_equal(stats.confint_quantile(N, q, c), (0, 6))