From 6ecda5f88d93192ac1e217b7f9957ce7378a4e5b Mon Sep 17 00:00:00 2001 From: Josef Perktold Date: Wed, 19 Aug 2020 14:58:20 -0400 Subject: [PATCH 1/6] ENH: add prob(x1 > x2) effect size measure, inference based on brunner munzel --- statsmodels/stats/nonparametric.py | 252 ++++++++++++++++++ statsmodels/stats/tests/test_nonparametric.py | 124 ++++++++- 2 files changed, 374 insertions(+), 2 deletions(-) create mode 100644 statsmodels/stats/nonparametric.py diff --git a/statsmodels/stats/nonparametric.py b/statsmodels/stats/nonparametric.py new file mode 100644 index 00000000000..8513bd03967 --- /dev/null +++ b/statsmodels/stats/nonparametric.py @@ -0,0 +1,252 @@ +# -*- coding: utf-8 -*- +""" +Rank based methods for inferential statistics + +Created on Sat Aug 15 10:18:53 2020 + +Author: Josef Perktold +License: BSD-3 + +""" + + +import numpy as np + +from scipy import stats +from scipy.stats import rankdata + +from statsmodels.stats.base import HolderTuple +from statsmodels.stats.weightstats import ( + _zconfint_generic, _tconfint_generic, _zstat_generic, _tstat_generic) + + +class BrunnerMunzelResult(HolderTuple): + """Results for rank comparison + """ + + def conf_int(self, alpha=0.05, value=None, alternative="two-sided"): + + p0 = value + if p0 is None: + p0 = 0 + diff = self.prob1 - p0 + std_diff = np.sqrt(self.var / self.nobs) + + if self.df is None: + return _zconfint_generic(diff, std_diff, alpha, alternative) + else: + return _tconfint_generic(diff, std_diff, self.df, alpha, + alternative) + + def test_prob_superior(self, value=0.5, alternative="two-sided"): + """test for superiority probability + + H0: P(x1 > x2) + 0.5 * P(x1 = x2) = value + """ + + p0 = value # alias + # diff = self.prob1 - p0 # for reporting, not used in computation + # TODO: use var_prob + std_diff = np.sqrt(self.var / self.nobs) + + # TODO: return HolderTuple + # corresponds to a one-sample test and either p0 or diff could be used + if self.df is None: + return _zstat_generic(self.prob1, p0, std_diff, alternative, + diff=0) + else: + return _tstat_generic(self.prob1, p0, std_diff, self.df, + alternative, diff=0) + + def tost_prob_superior(self, low, upp): + '''test of stochastic (non-)equivalence of p = P(x1 > x2) + + null hypothesis: p < low or p > upp + alternative hypothesis: low < p < upp + + where p is the probability that a random draw from the population of + the first sample has a larger value than a random draw from the + population of the second sample, specifically + + p = P(x1 > x2) + 0.5 * P(x1 = x2) + + If the pvalue is smaller than a threshold, say 0.05, then we reject the + hypothesis that the probability p that distribution 1 is stochastically + superior to distribution 2 is outside of the interval given by + thresholds low and upp. + + Parameters + ---------- + low, upp : float + equivalence interval low < mean < upp + + Returns + ------- + pvalue : float + pvalue of the non-equivalence test + t1, pv1, df1 : tuple + test statistic, pvalue and degrees of freedom for lower threshold + test + t2, pv2, df2 : tuple + test statistic, pvalue and degrees of freedom for upper threshold + test + + ''' + + t1, pv1 = self.test_prob_superior(low, alternative='larger') + t2, pv2 = self.test_prob_superior(upp, alternative='smaller') + df1 = df2 = None + # TODO: return HolderTuple + return np.maximum(pv1, pv2), (t1, pv1, df1), (t2, pv2, df2) + + +def brunnermunzel(x, y, alternative="two-sided", distribution="t", + nan_policy='propagate'): + """ + Compute the Brunner-Munzel test on samples x and y. 
+ The Brunner-Munzel test is a nonparametric test of the null hypothesis that + when values are taken one by one from each group, the probabilities of + getting large values in both groups are equal. + Unlike the Wilcoxon-Mann-Whitney's U test, this does not require the + assumption of equivariance of two groups. Note that this does not assume + the distributions are same. This test works on two independent samples, + which may have different sizes. + Parameters + ---------- + x, y : array_like + Array of samples, should be one-dimensional. + alternative : {'two-sided', 'less', 'greater'}, optional + Defines the alternative hypothesis. + The following options are available (default is 'two-sided'): + * 'two-sided' + * 'less': one-sided + * 'greater': one-sided + distribution : {'t', 'normal'}, optional + Defines how to get the p-value. + The following options are available (default is 't'): + * 't': get the p-value by t-distribution + * 'normal': get the p-value by standard normal distribution. + nan_policy : {'propagate', 'raise', 'omit'}, optional + Defines how to handle when input contains nan. + The following options are available (default is 'propagate'): + * 'propagate': returns nan + * 'raise': throws an error + * 'omit': performs the calculations ignoring nan values + Returns + ------- + statistic : float + The Brunner-Munzer W statistic. + pvalue : float + p-value assuming an t distribution. One-sided or + two-sided, depending on the choice of `alternative` and `distribution`. + See Also + -------- + mannwhitneyu : Mann-Whitney rank test on two samples. + Notes + ----- + Brunner and Munzel recommended to estimate the p-value by t-distribution + when the size of data is 50 or less. If the size is lower than 10, it would + be better to use permuted Brunner Munzel test (see [2]_). + References + ---------- + .. [1] Brunner, E. and Munzel, U. "The nonparametric Benhrens-Fisher + problem: Asymptotic theory and a small-sample approximation". + Biometrical Journal. Vol. 42(2000): 17-25. + .. [2] Neubert, K. and Brunner, E. "A studentized permutation test for the + non-parametric Behrens-Fisher problem". Computational Statistics and + Data Analysis. Vol. 51(2007): 5192-5204. 
+ Examples + -------- + >>> from scipy import stats + >>> x1 = [1,2,1,1,1,1,1,1,1,1,2,4,1,1] + >>> x2 = [3,3,4,3,1,2,3,1,1,5,4] + >>> w, p_value = stats.brunnermunzel(x1, x2) + >>> w + 3.1374674823029505 + >>> p_value + 0.0057862086661515377 + """ + x = np.asarray(x) + y = np.asarray(y) + +# # check both x and y +# cnx, npx = _contains_nan(x, nan_policy) +# cny, npy = _contains_nan(y, nan_policy) +# contains_nan = cnx or cny +# if npx == "omit" or npy == "omit": +# nan_policy = "omit" + +# if contains_nan and nan_policy == "propagate": +# return BrunnerMunzelResult(np.nan, np.nan) +# elif contains_nan and nan_policy == "omit": +# x = ma.masked_invalid(x) +# y = ma.masked_invalid(y) +# return mstats_basic.brunnermunzel(x, y, alternative, distribution) + + nx = len(x) + ny = len(y) + nobs = nx + ny + if nx == 0 or ny == 0: + return BrunnerMunzelResult(np.nan, np.nan) + rankc = rankdata(np.concatenate((x, y))) + rankcx = rankc[0:nx] + rankcy = rankc[nx:nx+ny] + rankcx_mean = np.mean(rankcx) + rankcy_mean = np.mean(rankcy) + rankx = rankdata(x) + ranky = rankdata(y) + rankx_mean = np.mean(rankx) + ranky_mean = np.mean(ranky) + + Sx = np.sum(np.power(rankcx - rankx - rankcx_mean + rankx_mean, 2.0)) + Sx /= nx - 1 + Sy = np.sum(np.power(rankcy - ranky - rankcy_mean + ranky_mean, 2.0)) + Sy /= ny - 1 + + wbfn = nx * ny * (rankcy_mean - rankcx_mean) + wbfn /= (nx + ny) * np.sqrt(nx * Sx + ny * Sy) + + if distribution == "t": + df_numer = np.power(nx * Sx + ny * Sy, 2.0) + df_denom = np.power(nx * Sx, 2.0) / (nx - 1) + df_denom += np.power(ny * Sy, 2.0) / (ny - 1) + df = df_numer / df_denom + p = stats.t.cdf(wbfn, df) + elif distribution == "normal": + p = stats.norm.cdf(wbfn) + df = None + else: + raise ValueError( + "distribution should be 't' or 'normal'") + + if alternative == "greater": + pass + elif alternative == "less": + p = 1 - p + elif alternative == "two-sided": + p = 2 * np.min([p, 1-p]) + else: + raise ValueError( + "alternative should be 'less', 'greater' or 'two-sided'") + + # other info + nobs1, nobs2 = nx, ny # rename + mean1 = rankcx_mean + mean2 = rankcy_mean + + var1 = Sx / (nobs - nx)**2 + var2 = Sy / (nobs - ny)**2 + var_prob = (var1 / nobs1 + var2 / nobs2) + var = nobs * (var1 / nobs1 + var2 / nobs2) + prob1 = (mean1 - (nobs1 + 1) / 2) / nobs2 + prob2 = (mean2 - (nobs2 + 1) / 2) / nobs1 + + return BrunnerMunzelResult(statistic=wbfn, pvalue=p, x1=Sx, s2=Sy, + var1=var1, var2=var2, var=var, + var_prob=var_prob, + nobs1=nx, nobs2=ny, nobs=nobs, + mean1=mean1, mean2=mean2, + prob1=prob1, prob2=prob2, + somersd1=prob1 * 2 - 1, somersd2=prob2 * 2 - 1, + df=df + ) diff --git a/statsmodels/stats/tests/test_nonparametric.py b/statsmodels/stats/tests/test_nonparametric.py index 65ac6a54cf9..6552692b554 100644 --- a/statsmodels/stats/tests/test_nonparametric.py +++ b/statsmodels/stats/tests/test_nonparametric.py @@ -2,18 +2,23 @@ """ Created on Fri Jul 05 14:05:24 2013 +Aug 15 2020: add brunnermunzel, rank_compare_2indep Author: Josef Perktold """ from statsmodels.compat.python import lzip import numpy as np -from numpy.testing import assert_allclose, assert_almost_equal +from numpy.testing import (assert_allclose, assert_almost_equal, + assert_approx_equal, assert_) import pytest -from statsmodels.stats.contingency_tables import mcnemar, cochrans_q, SquareTable +from statsmodels.stats.contingency_tables import ( + mcnemar, cochrans_q, SquareTable) from statsmodels.sandbox.stats.runs import (Runs, runstest_1samp, runstest_2samp) from statsmodels.sandbox.stats.runs import mcnemar as 
sbmcnemar +from statsmodels.stats.nonparametric import brunnermunzel +from statsmodels.tools.testing import Holder def _expand_table(table): @@ -274,3 +279,118 @@ def test_runstest_2sample(): # check cutoff res2_1s = runstest_1samp(xy, xy.mean()) assert_allclose(res2_1s, res_1s, rtol=1e-6) + + +def test_brunnermunzel_one_sided(): + # copied from scipy with adjustment + x = [1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 4, 1, 1] + y = [3, 3, 4, 3, 1, 2, 3, 1, 1, 5, 4] + significant = 13 + + # Results are compared with R's lawstat package. + u1, p1 = brunnermunzel(x, y, alternative='less') + u2, p2 = brunnermunzel(y, x, alternative='greater') + u3, p3 = brunnermunzel(x, y, alternative='greater') + u4, p4 = brunnermunzel(y, x, alternative='less') + + assert_approx_equal(p1, p2, significant=significant) + assert_approx_equal(p3, p4, significant=significant) + assert_(p1 != p3) + assert_approx_equal(u1, 3.1374674823029505, + significant=significant) + assert_approx_equal(u2, -3.1374674823029505, + significant=significant) + assert_approx_equal(u3, 3.1374674823029505, + significant=significant) + assert_approx_equal(u4, -3.1374674823029505, + significant=significant) + assert_approx_equal(p1, 0.0028931043330757342, + significant=significant) + assert_approx_equal(p3, 0.99710689566692423, + significant=significant) + + +def test_brunnermunzel_two_sided(): + # copied from scipy with adjustment + x = [1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 4, 1, 1] + y = [3, 3, 4, 3, 1, 2, 3, 1, 1, 5, 4] + significant = 13 + + # Results are compared with R's lawstat package. + u1, p1 = brunnermunzel(x, y, alternative='two-sided') + u2, p2 = brunnermunzel(y, x, alternative='two-sided') + + assert_approx_equal(p1, p2, significant=significant) + assert_approx_equal(u1, 3.1374674823029505, + significant=significant) + assert_approx_equal(u2, -3.1374674823029505, + significant=significant) + assert_approx_equal(p1, 0.0057862086661515377, + significant=significant) + + +def test_rank_compare1(): + # Example from Munzel and Hauschke 2003 + # data is given by counts, expand to observations + levels = [-2, -1, 0, 1, 2] + new = [24, 37, 21, 19, 6] + active = [11, 51, 22, 21, 7] + x1 = np.repeat(levels, new) + x2 = np.repeat(levels, active) + + # using lawstat + # > brunner.munzel.test(xn, xa) #brunnermunzel.test(x, y) + res2_t = Holder(statistic=1.1757561456582, + df=204.2984239868, + pvalue=0.2410606649547, + ci=[0.4700629827705593, 0.6183882855872511], + prob=0.5442256341789052) + + res = brunnermunzel(x1, x2, distribution="normal") + assert_allclose(res.statistic, res2_t.statistic, rtol=1e-13) + assert_allclose(res.prob1, 1 - res2_t.prob, rtol=1e-13) + assert_allclose(res.prob2, res2_t.prob, rtol=1e-13) + tt = res.test_prob_superior() + # TODO: return HolderTuple + # assert_allclose(tt.statistic, res2_t.statistic) + # TODO: check sign/direction in lawstat + assert_allclose(tt[0], -res2_t.statistic, rtol=1e-13) + + ci = res.conf_int(alpha=0.05) + # we compare normal confint with t confint, lower rtol + assert_allclose(ci, 1 - np.array(res2_t.ci)[::-1], rtol=0.005) + # test consistency of test and confint + res_lb = res.test_prob_superior(value=ci[0]) + assert_allclose(res_lb[1], 0.05, rtol=1e-13) + res_ub = res.test_prob_superior(value=ci[1]) + assert_allclose(res_ub[1], 0.05, rtol=1e-13) + + # test consistency of tost and confint + res_tost = res.tost_prob_superior(*ci) + assert_allclose(res_tost[1][1], 0.025, rtol=1e-13) + assert_allclose(res_tost[2][1], 0.025, rtol=1e-13) + + res = brunnermunzel(x1, x2, distribution="t") + 
assert_allclose(res.statistic, res2_t.statistic, rtol=1e-13) + tt = res.test_prob_superior() + # TODO: return HolderTuple + # assert_allclose(tt.statistic, res2_t.statistic) + # TODO: check sign/direction in lawstat, reversed from ours + assert_allclose(tt[0], -res2_t.statistic, rtol=1e-13) + assert_allclose(tt[1], res2_t.pvalue, rtol=1e-13) + assert_allclose(res.pvalue, res2_t.pvalue, rtol=1e-13) + assert_allclose(res.df, res2_t.df, rtol=1e-13) + + ci = res.conf_int(alpha=0.05) + # our ranking is defined as reversed from lawstat, and BM article + assert_allclose(ci, 1 - np.array(res2_t.ci)[::-1], rtol=1e-11) + # test consistency of test and confint + res_lb = res.test_prob_superior(value=ci[0]) + assert_allclose(res_lb[1], 0.05, rtol=1e-11) + res_ub = res.test_prob_superior(value=ci[1]) + assert_allclose(res_ub[1], 0.05, rtol=1e-11) + + # test consistency of tost and confint + res_tost = res.tost_prob_superior(*ci) + assert_allclose(res_tost[1][1], 0.025, rtol=1e-11) + assert_allclose(res_tost[2][1], 0.025, rtol=1e-11) From 58d83059c13f5dd4a95f9b20e0f514b45862120b Mon Sep 17 00:00:00 2001 From: Josef Perktold Date: Fri, 21 Aug 2020 11:31:47 -0400 Subject: [PATCH 2/6] REF: rename function, adjust namings to statsmodels --- statsmodels/stats/nonparametric.py | 186 +++++++++--------- statsmodels/stats/tests/test_nonparametric.py | 35 ++-- 2 files changed, 118 insertions(+), 103 deletions(-) diff --git a/statsmodels/stats/nonparametric.py b/statsmodels/stats/nonparametric.py index 8513bd03967..b147c4fc143 100644 --- a/statsmodels/stats/nonparametric.py +++ b/statsmodels/stats/nonparametric.py @@ -20,7 +20,7 @@ _zconfint_generic, _tconfint_generic, _zstat_generic, _tstat_generic) -class BrunnerMunzelResult(HolderTuple): +class RankCompareResult(HolderTuple): """Results for rank comparison """ @@ -100,53 +100,76 @@ def tost_prob_superior(self, low, upp): return np.maximum(pv1, pv2), (t1, pv1, df1), (t2, pv2, df2) -def brunnermunzel(x, y, alternative="two-sided", distribution="t", - nan_policy='propagate'): +def rank_compare(x1, x2, alternative="two-sided", use_t=True): """ - Compute the Brunner-Munzel test on samples x and y. - The Brunner-Munzel test is a nonparametric test of the null hypothesis that - when values are taken one by one from each group, the probabilities of - getting large values in both groups are equal. - Unlike the Wilcoxon-Mann-Whitney's U test, this does not require the - assumption of equivariance of two groups. Note that this does not assume - the distributions are same. This test works on two independent samples, - which may have different sizes. + Statistics and tests for the probability that x1 has larger values than x2. + + p is the probability that a random draw from the population of + the first sample has a larger value than a random draw from the + population of the second sample, specifically + + p = P(x1 > x2) + 0.5 * P(x1 = x2) + + This is a measure underlying Wilcoxon-Mann-Whitney's U test, + Fligner-Policello test and Brunner-Munzel test, and + Inference is based on the asymptotic distribution of the Brunner-Munzel + test. + + The Null hypothesis for stochastic equality is p = 0.5, which corresponds + to the Brunner-Munzel test. + Parameters ---------- - x, y : array_like + x1, x2 : array_like Array of samples, should be one-dimensional. alternative : {'two-sided', 'less', 'greater'}, optional Defines the alternative hypothesis. 
The following options are available (default is 'two-sided'): * 'two-sided' - * 'less': one-sided - * 'greater': one-sided - distribution : {'t', 'normal'}, optional - Defines how to get the p-value. - The following options are available (default is 't'): - * 't': get the p-value by t-distribution - * 'normal': get the p-value by standard normal distribution. - nan_policy : {'propagate', 'raise', 'omit'}, optional - Defines how to handle when input contains nan. - The following options are available (default is 'propagate'): - * 'propagate': returns nan - * 'raise': throws an error - * 'omit': performs the calculations ignoring nan values + * 'smaller': one-sided + * 'larger': one-sided + use_t : poolean + If use_t is true, the t distribution with Welch-Satterthwaite type + degrees of freedom is used for p-value and confidence interval. + If use_t is false, then the normal distribution is used. + Returns ------- - statistic : float - The Brunner-Munzer W statistic. - pvalue : float - p-value assuming an t distribution. One-sided or - two-sided, depending on the choice of `alternative` and `distribution`. + res : RankCompareResult + + statistic : float + The Brunner-Munzer W statistic. + pvalue : float + p-value assuming an t distribution. One-sided or + two-sided, depending on the choice of `alternative` and `use_t`. + + See Also -------- - mannwhitneyu : Mann-Whitney rank test on two samples. + scipy.stats.brunnermunzel : Brunner-Munzel test for stochastic equality + scipy.stats.mannwhitneyu : Mann-Whitney rank test on two samples. + Notes ----- + Wilcoxon-Mann-Whitney assumes equal variance or equal distribution under + the Null hypothesis. Fligner-Policello test allows for unequal variances + but assumes continuous distribution, i.e. no ties. + Brunner-Munzel extend the test to allow for unequal variance and discrete + or ordered categorical random variables. + Brunner and Munzel recommended to estimate the p-value by t-distribution when the size of data is 50 or less. If the size is lower than 10, it would be better to use permuted Brunner Munzel test (see [2]_). + + This measure has been introduced in the literature under many different + names relying on a variety of assumptions. + In psychology, ... introduced it as Common Language effect size for the + continuous, normal distribution case, ... extended it to the nonparameteric + continuous distribution case as in Fligner-Policello. + + Note: Brunner-Munzel define the probability for x1 to be stochastically + smaller than x2, while here we use stochastically larger. + References ---------- .. [1] Brunner, E. and Munzel, U. 
"The nonparametric Benhrens-Fisher @@ -166,87 +189,70 @@ def brunnermunzel(x, y, alternative="two-sided", distribution="t", >>> p_value 0.0057862086661515377 """ - x = np.asarray(x) - y = np.asarray(y) - -# # check both x and y -# cnx, npx = _contains_nan(x, nan_policy) -# cny, npy = _contains_nan(y, nan_policy) -# contains_nan = cnx or cny -# if npx == "omit" or npy == "omit": -# nan_policy = "omit" - -# if contains_nan and nan_policy == "propagate": -# return BrunnerMunzelResult(np.nan, np.nan) -# elif contains_nan and nan_policy == "omit": -# x = ma.masked_invalid(x) -# y = ma.masked_invalid(y) -# return mstats_basic.brunnermunzel(x, y, alternative, distribution) - - nx = len(x) - ny = len(y) - nobs = nx + ny - if nx == 0 or ny == 0: - return BrunnerMunzelResult(np.nan, np.nan) - rankc = rankdata(np.concatenate((x, y))) - rankcx = rankc[0:nx] - rankcy = rankc[nx:nx+ny] + x1 = np.asarray(x1) + x2 = np.asarray(x2) + + nobs1 = len(x1) + nobs2 = len(x2) + nobs = nobs1 + nobs2 + if nobs1 == 0 or nobs2 == 0: + return RankCompareResult(statistic=np.nan, pvalue=np.nan) + rankc = rankdata(np.concatenate((x1, x2))) + rankcx = rankc[0:nobs1] + rankcy = rankc[nobs1:nobs1+nobs2] rankcx_mean = np.mean(rankcx) rankcy_mean = np.mean(rankcy) - rankx = rankdata(x) - ranky = rankdata(y) + rankx = rankdata(x1) + ranky = rankdata(x2) rankx_mean = np.mean(rankx) ranky_mean = np.mean(ranky) - Sx = np.sum(np.power(rankcx - rankx - rankcx_mean + rankx_mean, 2.0)) - Sx /= nx - 1 - Sy = np.sum(np.power(rankcy - ranky - rankcy_mean + ranky_mean, 2.0)) - Sy /= ny - 1 + S1 = np.sum(np.power(rankcx - rankx - rankcx_mean + rankx_mean, 2.0)) + S1 /= nobs1 - 1 + S2 = np.sum(np.power(rankcy - ranky - rankcy_mean + ranky_mean, 2.0)) + S2 /= nobs2 - 1 - wbfn = nx * ny * (rankcy_mean - rankcx_mean) - wbfn /= (nx + ny) * np.sqrt(nx * Sx + ny * Sy) + wbfn = nobs1 * nobs2 * (rankcx_mean - rankcy_mean) + wbfn /= (nobs1 + nobs2) * np.sqrt(nobs1 * S1 + nobs2 * S2) - if distribution == "t": - df_numer = np.power(nx * Sx + ny * Sy, 2.0) - df_denom = np.power(nx * Sx, 2.0) / (nx - 1) - df_denom += np.power(ny * Sy, 2.0) / (ny - 1) + if use_t: + df_numer = np.power(nobs1 * S1 + nobs2 * S2, 2.0) + df_denom = np.power(nobs1 * S1, 2.0) / (nobs1 - 1) + df_denom += np.power(nobs2 * S2, 2.0) / (nobs2 - 1) df = df_numer / df_denom - p = stats.t.cdf(wbfn, df) - elif distribution == "normal": - p = stats.norm.cdf(wbfn) - df = None + pvalue = stats.t.cdf(wbfn, df) else: - raise ValueError( - "distribution should be 't' or 'normal'") + pvalue = stats.norm.cdf(wbfn) + df = None - if alternative == "greater": + if alternative == "larger": pass - elif alternative == "less": - p = 1 - p + elif alternative == "smaller": + pvalue = 1 - pvalue elif alternative == "two-sided": - p = 2 * np.min([p, 1-p]) + pvalue = 2 * np.min([pvalue, 1-pvalue]) else: raise ValueError( "alternative should be 'less', 'greater' or 'two-sided'") # other info - nobs1, nobs2 = nx, ny # rename + nobs1, nobs2 = nobs1, nobs2 # rename mean1 = rankcx_mean mean2 = rankcy_mean - var1 = Sx / (nobs - nx)**2 - var2 = Sy / (nobs - ny)**2 + var1 = S1 / (nobs - nobs1)**2 + var2 = S2 / (nobs - nobs2)**2 var_prob = (var1 / nobs1 + var2 / nobs2) var = nobs * (var1 / nobs1 + var2 / nobs2) prob1 = (mean1 - (nobs1 + 1) / 2) / nobs2 prob2 = (mean2 - (nobs2 + 1) / 2) / nobs1 - return BrunnerMunzelResult(statistic=wbfn, pvalue=p, x1=Sx, s2=Sy, - var1=var1, var2=var2, var=var, - var_prob=var_prob, - nobs1=nx, nobs2=ny, nobs=nobs, - mean1=mean1, mean2=mean2, - prob1=prob1, prob2=prob2, - 
somersd1=prob1 * 2 - 1, somersd2=prob2 * 2 - 1, - df=df - ) + return RankCompareResult(statistic=wbfn, pvalue=pvalue, s1=S1, s2=S2, + var1=var1, var2=var2, var=var, + var_prob=var_prob, + nobs1=nobs1, nobs2=nobs2, nobs=nobs, + mean1=mean1, mean2=mean2, + prob1=prob1, prob2=prob2, + somersd1=prob1 * 2 - 1, somersd2=prob2 * 2 - 1, + df=df + ) diff --git a/statsmodels/stats/tests/test_nonparametric.py b/statsmodels/stats/tests/test_nonparametric.py index 6552692b554..f6ecf548667 100644 --- a/statsmodels/stats/tests/test_nonparametric.py +++ b/statsmodels/stats/tests/test_nonparametric.py @@ -17,7 +17,7 @@ from statsmodels.sandbox.stats.runs import (Runs, runstest_1samp, runstest_2samp) from statsmodels.sandbox.stats.runs import mcnemar as sbmcnemar -from statsmodels.stats.nonparametric import brunnermunzel +from statsmodels.stats.nonparametric import rank_compare from statsmodels.tools.testing import Holder @@ -287,11 +287,14 @@ def test_brunnermunzel_one_sided(): y = [3, 3, 4, 3, 1, 2, 3, 1, 1, 5, 4] significant = 13 + # revere direction to match our definition + x, y = y, x + # Results are compared with R's lawstat package. - u1, p1 = brunnermunzel(x, y, alternative='less') - u2, p2 = brunnermunzel(y, x, alternative='greater') - u3, p3 = brunnermunzel(x, y, alternative='greater') - u4, p4 = brunnermunzel(y, x, alternative='less') + u1, p1 = rank_compare(x, y, alternative='smaller') + u2, p2 = rank_compare(y, x, alternative='larger') + u3, p3 = rank_compare(x, y, alternative='larger') + u4, p4 = rank_compare(y, x, alternative='smaller') assert_approx_equal(p1, p2, significant=significant) assert_approx_equal(p3, p4, significant=significant) @@ -316,9 +319,12 @@ def test_brunnermunzel_two_sided(): y = [3, 3, 4, 3, 1, 2, 3, 1, 1, 5, 4] significant = 13 + # revere direction to match our definition + x, y = y, x + # Results are compared with R's lawstat package. 
- u1, p1 = brunnermunzel(x, y, alternative='two-sided') - u2, p2 = brunnermunzel(y, x, alternative='two-sided') + u1, p1 = rank_compare(x, y, alternative='two-sided') + u2, p2 = rank_compare(y, x, alternative='two-sided') assert_approx_equal(p1, p2, significant=significant) assert_approx_equal(u1, 3.1374674823029505, @@ -346,8 +352,8 @@ def test_rank_compare1(): ci=[0.4700629827705593, 0.6183882855872511], prob=0.5442256341789052) - res = brunnermunzel(x1, x2, distribution="normal") - assert_allclose(res.statistic, res2_t.statistic, rtol=1e-13) + res = rank_compare(x1, x2, use_t=False) + assert_allclose(res.statistic, -res2_t.statistic, rtol=1e-13) assert_allclose(res.prob1, 1 - res2_t.prob, rtol=1e-13) assert_allclose(res.prob2, res2_t.prob, rtol=1e-13) tt = res.test_prob_superior() @@ -370,20 +376,23 @@ def test_rank_compare1(): assert_allclose(res_tost[1][1], 0.025, rtol=1e-13) assert_allclose(res_tost[2][1], 0.025, rtol=1e-13) - res = brunnermunzel(x1, x2, distribution="t") + # use t-distribution + # our ranking is defined as reversed from lawstat, and BM article + # revere direction to match our definition + x1, x2 = x2, x1 + res = rank_compare(x1, x2, use_t=True) assert_allclose(res.statistic, res2_t.statistic, rtol=1e-13) tt = res.test_prob_superior() # TODO: return HolderTuple # assert_allclose(tt.statistic, res2_t.statistic) # TODO: check sign/direction in lawstat, reversed from ours - assert_allclose(tt[0], -res2_t.statistic, rtol=1e-13) + assert_allclose(tt[0], res2_t.statistic, rtol=1e-13) assert_allclose(tt[1], res2_t.pvalue, rtol=1e-13) assert_allclose(res.pvalue, res2_t.pvalue, rtol=1e-13) assert_allclose(res.df, res2_t.df, rtol=1e-13) ci = res.conf_int(alpha=0.05) - # our ranking is defined as reversed from lawstat, and BM article - assert_allclose(ci, 1 - np.array(res2_t.ci)[::-1], rtol=1e-11) + assert_allclose(ci, res2_t.ci, rtol=1e-11) # test consistency of test and confint res_lb = res.test_prob_superior(value=ci[0]) assert_allclose(res_lb[1], 0.05, rtol=1e-11) From 4bddf82f3dffeeb181d71ed02b47fde9d608b718 Mon Sep 17 00:00:00 2001 From: Josef Perktold Date: Sun, 23 Aug 2020 14:25:59 -0400 Subject: [PATCH 3/6] REF: more cleanup refactoring and renaming, add rankdata_2samp function --- statsmodels/stats/nonparametric.py | 98 +++++++++++-------- statsmodels/stats/tests/test_nonparametric.py | 37 ++++--- 2 files changed, 82 insertions(+), 53 deletions(-) diff --git a/statsmodels/stats/nonparametric.py b/statsmodels/stats/nonparametric.py index b147c4fc143..5430c757dde 100644 --- a/statsmodels/stats/nonparametric.py +++ b/statsmodels/stats/nonparametric.py @@ -20,6 +20,23 @@ _zconfint_generic, _tconfint_generic, _zstat_generic, _tstat_generic) +def rankdata_2samp(x1, x2): + x1 = np.asarray(x1) + x2 = np.asarray(x2) + + nobs1 = len(x1) + nobs2 = len(x2) + if nobs1 == 0 or nobs2 == 0: + raise ValueError("one sample has zero length") + + rank = rankdata(np.concatenate((x1, x2))) + rank1 = rank[:nobs1] + rank2 = rank[nobs1:] + ranki1 = rankdata(x1) + ranki2 = rankdata(x2) + return rank1, rank2, ranki1, ranki2 + + class RankCompareResult(HolderTuple): """Results for rank comparison """ @@ -99,8 +116,30 @@ def tost_prob_superior(self, low, upp): # TODO: return HolderTuple return np.maximum(pv1, pv2), (t1, pv1, df1), (t2, pv2, df2) + def summary(self, alpha=0.05): + + xname = ["prob(x1>x2)"] + yname = "None" + effect = np.atleast_1d(self.prob1) + pvalues = np.atleast_1d(self.pvalue) + ci = np.array(self.conf_int(alpha))[None, :] + use_t = (self.df is not None) + sd = 
np.atleast_1d(np.sqrt(self.var_prob)) + statistic = np.atleast_1d(self.statistic) + if xname is None: + xname = ['c%d' % ii for ii in range(len(self.effect))] -def rank_compare(x1, x2, alternative="two-sided", use_t=True): + title = "Probability sample 1 is stochastically larger" + from statsmodels.iolib.summary import summary_params + + summ = summary_params((self, effect, sd, statistic, + pvalues, ci), + yname=yname, xname=xname, use_t=use_t, + title=title, alpha=alpha) + return summ + + +def rank_compare_2indep(x1, x2, use_t=True): """ Statistics and tests for the probability that x1 has larger values than x2. @@ -122,12 +161,6 @@ def rank_compare(x1, x2, alternative="two-sided", use_t=True): ---------- x1, x2 : array_like Array of samples, should be one-dimensional. - alternative : {'two-sided', 'less', 'greater'}, optional - Defines the alternative hypothesis. - The following options are available (default is 'two-sided'): - * 'two-sided' - * 'smaller': one-sided - * 'larger': one-sided use_t : poolean If use_t is true, the t distribution with Welch-Satterthwaite type degrees of freedom is used for p-value and confidence interval. @@ -196,62 +229,47 @@ def rank_compare(x1, x2, alternative="two-sided", use_t=True): nobs2 = len(x2) nobs = nobs1 + nobs2 if nobs1 == 0 or nobs2 == 0: - return RankCompareResult(statistic=np.nan, pvalue=np.nan) - rankc = rankdata(np.concatenate((x1, x2))) - rankcx = rankc[0:nobs1] - rankcy = rankc[nobs1:nobs1+nobs2] - rankcx_mean = np.mean(rankcx) - rankcy_mean = np.mean(rankcy) - rankx = rankdata(x1) - ranky = rankdata(x2) - rankx_mean = np.mean(rankx) - ranky_mean = np.mean(ranky) - - S1 = np.sum(np.power(rankcx - rankx - rankcx_mean + rankx_mean, 2.0)) + raise ValueError("one sample has zero length") + + rank1, rank2, ranki1, ranki2 = rankdata_2samp(x1, x2) + + meanr1 = np.mean(rank1) + meanr2 = np.mean(rank2) + meanri1 = np.mean(ranki1) + meanri2 = np.mean(ranki2) + + S1 = np.sum(np.power(rank1 - ranki1 - meanr1 + meanri1, 2.0)) S1 /= nobs1 - 1 - S2 = np.sum(np.power(rankcy - ranky - rankcy_mean + ranky_mean, 2.0)) + S2 = np.sum(np.power(rank2 - ranki2 - meanr2 + meanri2, 2.0)) S2 /= nobs2 - 1 - wbfn = nobs1 * nobs2 * (rankcx_mean - rankcy_mean) + wbfn = nobs1 * nobs2 * (meanr1 - meanr2) wbfn /= (nobs1 + nobs2) * np.sqrt(nobs1 * S1 + nobs2 * S2) + # Here we only use alternative == "two-sided" if use_t: df_numer = np.power(nobs1 * S1 + nobs2 * S2, 2.0) df_denom = np.power(nobs1 * S1, 2.0) / (nobs1 - 1) df_denom += np.power(nobs2 * S2, 2.0) / (nobs2 - 1) df = df_numer / df_denom - pvalue = stats.t.cdf(wbfn, df) + pvalue = 2 * stats.t.sf(np.abs(wbfn), df) else: - pvalue = stats.norm.cdf(wbfn) + pvalue = 2 * stats.norm.sf(np.abs(wbfn)) df = None - if alternative == "larger": - pass - elif alternative == "smaller": - pvalue = 1 - pvalue - elif alternative == "two-sided": - pvalue = 2 * np.min([pvalue, 1-pvalue]) - else: - raise ValueError( - "alternative should be 'less', 'greater' or 'two-sided'") - # other info - nobs1, nobs2 = nobs1, nobs2 # rename - mean1 = rankcx_mean - mean2 = rankcy_mean - var1 = S1 / (nobs - nobs1)**2 var2 = S2 / (nobs - nobs2)**2 var_prob = (var1 / nobs1 + var2 / nobs2) var = nobs * (var1 / nobs1 + var2 / nobs2) - prob1 = (mean1 - (nobs1 + 1) / 2) / nobs2 - prob2 = (mean2 - (nobs2 + 1) / 2) / nobs1 + prob1 = (meanr1 - (nobs1 + 1) / 2) / nobs2 + prob2 = (meanr2 - (nobs2 + 1) / 2) / nobs1 return RankCompareResult(statistic=wbfn, pvalue=pvalue, s1=S1, s2=S2, var1=var1, var2=var2, var=var, var_prob=var_prob, nobs1=nobs1, nobs2=nobs2, 
nobs=nobs, - mean1=mean1, mean2=mean2, + mean1=meanr1, mean2=meanr2, prob1=prob1, prob2=prob2, somersd1=prob1 * 2 - 1, somersd2=prob2 * 2 - 1, df=df diff --git a/statsmodels/stats/tests/test_nonparametric.py b/statsmodels/stats/tests/test_nonparametric.py index f6ecf548667..2b25e5c27eb 100644 --- a/statsmodels/stats/tests/test_nonparametric.py +++ b/statsmodels/stats/tests/test_nonparametric.py @@ -17,7 +17,7 @@ from statsmodels.sandbox.stats.runs import (Runs, runstest_1samp, runstest_2samp) from statsmodels.sandbox.stats.runs import mcnemar as sbmcnemar -from statsmodels.stats.nonparametric import rank_compare +from statsmodels.stats.nonparametric import rank_compare_2indep from statsmodels.tools.testing import Holder @@ -291,10 +291,10 @@ def test_brunnermunzel_one_sided(): x, y = y, x # Results are compared with R's lawstat package. - u1, p1 = rank_compare(x, y, alternative='smaller') - u2, p2 = rank_compare(y, x, alternative='larger') - u3, p3 = rank_compare(x, y, alternative='larger') - u4, p4 = rank_compare(y, x, alternative='smaller') + u1, p1 = rank_compare_2indep(x, y).test_prob_superior(alternative='smaller') + u2, p2 = rank_compare_2indep(y, x).test_prob_superior(alternative='larger') + u3, p3 = rank_compare_2indep(x, y).test_prob_superior(alternative='larger') + u4, p4 = rank_compare_2indep(y, x).test_prob_superior(alternative='smaller') assert_approx_equal(p1, p2, significant=significant) assert_approx_equal(p3, p4, significant=significant) @@ -307,9 +307,11 @@ def test_brunnermunzel_one_sided(): significant=significant) assert_approx_equal(u4, -3.1374674823029505, significant=significant) - assert_approx_equal(p1, 0.0028931043330757342, + + # Note: scipy and lawstat tail is reversed compared to test statistic + assert_approx_equal(p3, 0.0028931043330757342, significant=significant) - assert_approx_equal(p3, 0.99710689566692423, + assert_approx_equal(p1, 0.99710689566692423, significant=significant) @@ -323,19 +325,28 @@ def test_brunnermunzel_two_sided(): x, y = y, x # Results are compared with R's lawstat package. 
- u1, p1 = rank_compare(x, y, alternative='two-sided') - u2, p2 = rank_compare(y, x, alternative='two-sided') + res1 = rank_compare_2indep(x, y) + u1, p1 = res1 + t1 = res1.test_prob_superior(alternative='two-sided') + res2 = rank_compare_2indep(y, x) + u2, p2 = res2 + t2 = res2.test_prob_superior(alternative='two-sided') assert_approx_equal(p1, p2, significant=significant) assert_approx_equal(u1, 3.1374674823029505, significant=significant) assert_approx_equal(u2, -3.1374674823029505, significant=significant) - assert_approx_equal(p1, 0.0057862086661515377, + assert_approx_equal(p2, 0.0057862086661515377, significant=significant) + assert_allclose(t1[0], u1, rtol=1e-13) + assert_allclose(t2[0], u2, rtol=1e-13) + assert_allclose(t1[1], p1, rtol=1e-13) + assert_allclose(t2[1], p2, rtol=1e-13) + -def test_rank_compare1(): +def test_rank_compare_2indep1(): # Example from Munzel and Hauschke 2003 # data is given by counts, expand to observations levels = [-2, -1, 0, 1, 2] @@ -352,7 +363,7 @@ def test_rank_compare1(): ci=[0.4700629827705593, 0.6183882855872511], prob=0.5442256341789052) - res = rank_compare(x1, x2, use_t=False) + res = rank_compare_2indep(x1, x2, use_t=False) assert_allclose(res.statistic, -res2_t.statistic, rtol=1e-13) assert_allclose(res.prob1, 1 - res2_t.prob, rtol=1e-13) assert_allclose(res.prob2, res2_t.prob, rtol=1e-13) @@ -380,7 +391,7 @@ def test_rank_compare1(): # our ranking is defined as reversed from lawstat, and BM article # revere direction to match our definition x1, x2 = x2, x1 - res = rank_compare(x1, x2, use_t=True) + res = rank_compare_2indep(x1, x2, use_t=True) assert_allclose(res.statistic, res2_t.statistic, rtol=1e-13) tt = res.test_prob_superior() # TODO: return HolderTuple From 772279cf9e887cf66d9812b0fcb8b51ce3556d6d Mon Sep 17 00:00:00 2001 From: Josef Perktold Date: Mon, 24 Aug 2020 16:22:27 -0400 Subject: [PATCH 4/6] ENH: allow vectorized, return HolderTuple in test and tost --- statsmodels/stats/nonparametric.py | 79 +++++++++++++------ statsmodels/stats/tests/test_nonparametric.py | 34 +++++++- statsmodels/tools/testing.py | 8 +- 3 files changed, 91 insertions(+), 30 deletions(-) diff --git a/statsmodels/stats/nonparametric.py b/statsmodels/stats/nonparametric.py index 5430c757dde..0ea9d17f892 100644 --- a/statsmodels/stats/nonparametric.py +++ b/statsmodels/stats/nonparametric.py @@ -29,11 +29,19 @@ def rankdata_2samp(x1, x2): if nobs1 == 0 or nobs2 == 0: raise ValueError("one sample has zero length") - rank = rankdata(np.concatenate((x1, x2))) + x_combined = np.concatenate((x1, x2)) + if x_combined.ndim > 1: + rank = np.apply_along_axis(rankdata, 0, x_combined) + else: + rank = rankdata(x_combined) # no axis in older scipy rank1 = rank[:nobs1] rank2 = rank[nobs1:] - ranki1 = rankdata(x1) - ranki2 = rankdata(x2) + if x_combined.ndim > 1: + ranki1 = np.apply_along_axis(rankdata, 0, x1) + ranki2 = np.apply_along_axis(rankdata, 0, x2) + else: + ranki1 = rankdata(x1) + ranki2 = rankdata(x2) return rank1, rank2, ranki1, ranki2 @@ -69,11 +77,20 @@ def test_prob_superior(self, value=0.5, alternative="two-sided"): # TODO: return HolderTuple # corresponds to a one-sample test and either p0 or diff could be used if self.df is None: - return _zstat_generic(self.prob1, p0, std_diff, alternative, - diff=0) + stat, pv = _zstat_generic(self.prob1, p0, std_diff, alternative, + diff=0) + distr = "normal" else: - return _tstat_generic(self.prob1, p0, std_diff, self.df, - alternative, diff=0) + stat, pv = _tstat_generic(self.prob1, p0, std_diff, self.df, + 
alternative, diff=0) + distr = "t" + + res = HolderTuple(statistic=stat, + pvalue=pv, + df=self.df, + distribution=distr + ) + return res def tost_prob_superior(self, low, upp): '''test of stochastic (non-)equivalence of p = P(x1 > x2) @@ -110,31 +127,45 @@ def tost_prob_superior(self, low, upp): ''' - t1, pv1 = self.test_prob_superior(low, alternative='larger') - t2, pv2 = self.test_prob_superior(upp, alternative='smaller') - df1 = df2 = None - # TODO: return HolderTuple - return np.maximum(pv1, pv2), (t1, pv1, df1), (t2, pv2, df2) - - def summary(self, alpha=0.05): + t1 = self.test_prob_superior(low, alternative='larger') + t2 = self.test_prob_superior(upp, alternative='smaller') + + # idx_max = 0 if t1.pvalue < t2.pvalue else 1 + idx_max = np.asarray(t1.pvalue < t2.pvalue, int) + title = "Equivalence test for Prob(x1 > x2) + 0.5 Prob(x1 = x2) " + res = HolderTuple(statistic=np.choose(idx_max, + [t1.statistic, t2.statistic]), + # pvalue=[t1.pvalue, t2.pvalue][idx_max], # python + # use np.choose for vectorized selection + pvalue=np.choose(idx_max, [t1.pvalue, t2.pvalue]), + results_larger=t1, + results_smaller=t2, + title=title + ) + return res + + def summary(self, alpha=0.05, xname=None): - xname = ["prob(x1>x2)"] yname = "None" effect = np.atleast_1d(self.prob1) pvalues = np.atleast_1d(self.pvalue) - ci = np.array(self.conf_int(alpha))[None, :] + ci = np.atleast_2d(self.conf_int(alpha)) + if ci.shape[0] > 1: + ci = ci.T use_t = (self.df is not None) sd = np.atleast_1d(np.sqrt(self.var_prob)) statistic = np.atleast_1d(self.statistic) if xname is None: - xname = ['c%d' % ii for ii in range(len(self.effect))] + xname = ['c%d' % ii for ii in range(len(effect))] + + xname2 = ['prob(x1>x2) %s' % ii for ii in xname] title = "Probability sample 1 is stochastically larger" from statsmodels.iolib.summary import summary_params summ = summary_params((self, effect, sd, statistic, pvalues, ci), - yname=yname, xname=xname, use_t=use_t, + yname=yname, xname=xname2, use_t=use_t, title=title, alpha=alpha) return summ @@ -233,14 +264,14 @@ def rank_compare_2indep(x1, x2, use_t=True): rank1, rank2, ranki1, ranki2 = rankdata_2samp(x1, x2) - meanr1 = np.mean(rank1) - meanr2 = np.mean(rank2) - meanri1 = np.mean(ranki1) - meanri2 = np.mean(ranki2) + meanr1 = np.mean(rank1, axis=0) + meanr2 = np.mean(rank2, axis=0) + meanri1 = np.mean(ranki1, axis=0) + meanri2 = np.mean(ranki2, axis=0) - S1 = np.sum(np.power(rank1 - ranki1 - meanr1 + meanri1, 2.0)) + S1 = np.sum(np.power(rank1 - ranki1 - meanr1 + meanri1, 2.0), axis=0) S1 /= nobs1 - 1 - S2 = np.sum(np.power(rank2 - ranki2 - meanr2 + meanri2, 2.0)) + S2 = np.sum(np.power(rank2 - ranki2 - meanr2 + meanri2, 2.0), axis=0) S2 /= nobs2 - 1 wbfn = nobs1 * nobs2 * (meanr1 - meanr2) diff --git a/statsmodels/stats/tests/test_nonparametric.py b/statsmodels/stats/tests/test_nonparametric.py index 2b25e5c27eb..d0d09b77ddf 100644 --- a/statsmodels/stats/tests/test_nonparametric.py +++ b/statsmodels/stats/tests/test_nonparametric.py @@ -384,8 +384,8 @@ def test_rank_compare_2indep1(): # test consistency of tost and confint res_tost = res.tost_prob_superior(*ci) - assert_allclose(res_tost[1][1], 0.025, rtol=1e-13) - assert_allclose(res_tost[2][1], 0.025, rtol=1e-13) + assert_allclose(res_tost.results_smaller.pvalue, 0.025, rtol=1e-13) + assert_allclose(res_tost.results_larger.pvalue, 0.025, rtol=1e-13) # use t-distribution # our ranking is defined as reversed from lawstat, and BM article @@ -412,5 +412,31 @@ def test_rank_compare_2indep1(): # test consistency of tost and 
confint res_tost = res.tost_prob_superior(*ci) - assert_allclose(res_tost[1][1], 0.025, rtol=1e-11) - assert_allclose(res_tost[2][1], 0.025, rtol=1e-11) + assert_allclose(res_tost.results_smaller.pvalue, 0.025, rtol=1e-11) + assert_allclose(res_tost.results_larger.pvalue, 0.025, rtol=1e-11) + + +def test_rank_compare_vectorized(): + np.random.seed(987126) + x1 = np.random.randint(0, 20, (50, 3)) + x2 = np.random.randint(5, 25, (50, 3)) + res = rank_compare_2indep(x1, x2) + tst = res.test_prob_superior(0.5) + tost = res.tost_prob_superior(0.4, 0.6) + + # smoke test for summary + res.summary() + + for i in range(3): + res_i = rank_compare_2indep(x1[:, i], x2[:, i]) + assert_allclose(res.statistic[i], res_i.statistic, rtol=1e-14) + assert_allclose(res.pvalue[i], res_i.pvalue, rtol=1e-14) + assert_allclose(res.prob1[i], res_i.prob1, rtol=1e-14) + + tst_i = res_i.test_prob_superior(0.5) + assert_allclose(tst.statistic[i], tst_i.statistic, rtol=1e-14) + assert_allclose(tst.pvalue[i], tst_i.pvalue, rtol=1e-14) + + tost_i = res_i.tost_prob_superior(0.4, 0.6) + assert_allclose(tost.statistic[i], tost_i.statistic, rtol=1e-14) + assert_allclose(tost.pvalue[i], tost_i.pvalue, rtol=1e-14) diff --git a/statsmodels/tools/testing.py b/statsmodels/tools/testing.py index ead4e16bfe1..bd6571668ca 100644 --- a/statsmodels/tools/testing.py +++ b/statsmodels/tools/testing.py @@ -54,11 +54,15 @@ def __init__(self, **kwds): self.__dict__.update(kwds) def __str__(self): - ss = "\n".join(str(k) + " = " + str(v) for k, v in vars(self).items()) + ss = "\n".join(str(k) + " = " + str(v).replace('\n', '\n ') + for k, v in vars(self).items()) return ss def __repr__(self): - ss = str(self.__class__) + "\n" + self.__str__() + # use repr for values including nested cases as in tost + ss = "\n".join(str(k) + " = " + repr(v).replace('\n', '\n ') + for k, v in vars(self).items()) + ss = str(self.__class__) + "\n" + ss return ss From 18a490acf7e1d562ee976b594d0e83636c60ab1f Mon Sep 17 00:00:00 2001 From: Josef Perktold Date: Thu, 27 Aug 2020 16:25:58 -0400 Subject: [PATCH 5/6] ENH: add function for ordinal counts, add normal reference --- statsmodels/stats/nonparametric.py | 171 +++++++++++++++++- statsmodels/stats/tests/test_nonparametric.py | 32 +++- 2 files changed, 196 insertions(+), 7 deletions(-) diff --git a/statsmodels/stats/nonparametric.py b/statsmodels/stats/nonparametric.py index 0ea9d17f892..0270c698290 100644 --- a/statsmodels/stats/nonparametric.py +++ b/statsmodels/stats/nonparametric.py @@ -57,7 +57,7 @@ def conf_int(self, alpha=0.05, value=None, alternative="two-sided"): diff = self.prob1 - p0 std_diff = np.sqrt(self.var / self.nobs) - if self.df is None: + if self.use_t is False: return _zconfint_generic(diff, std_diff, alpha, alternative) else: return _tconfint_generic(diff, std_diff, self.df, alpha, @@ -76,7 +76,7 @@ def test_prob_superior(self, value=0.5, alternative="two-sided"): # TODO: return HolderTuple # corresponds to a one-sample test and either p0 or diff could be used - if self.df is None: + if not self.use_t: stat, pv = _zstat_generic(self.prob1, p0, std_diff, alternative, diff=0) distr = "normal" @@ -144,17 +144,63 @@ def tost_prob_superior(self, low, upp): ) return res + def confint_lintransf(self, const=-1, slope=2, alpha=0.05, + alternative="two-sided"): + """confidence interval of a linear transformation of prob1 + + This computes the confidence interval for + + d = const + slope * prob1 + + Default values correspond to Somers' d. 
+ + Parameters + ---------- + const, slope : float + Constant and slope for linear transformation. + alpha : float in [0, 1] + alternative : + + """ + + low_p, upp_p = self.conf_int(alpha=alpha, alternative=alternative) + low = const + slope * low_p + upp = const + slope * upp_p + if slope < 0: + low, upp = upp, low + return low, upp + + def effectsize_normal(self): + """ + Cohen's d, standardized mean difference under normality assumption. + + This computes the standardized mean difference effect size that is + equivalent to the rank based probability of superiority estimate, + if we assume that the data is normally distributed. + + Returns + ------- + equivalent smd effect size + + """ + return stats.norm.ppf(self.prob1) * np.sqrt(2) + def summary(self, alpha=0.05, xname=None): yname = "None" effect = np.atleast_1d(self.prob1) - pvalues = np.atleast_1d(self.pvalue) + if self.pvalue is None: + statistic, pvalue = self.test_prob_superior() + else: + pvalue = self.pvalue + statistic = self.statistic + pvalues = np.atleast_1d(pvalue) ci = np.atleast_2d(self.conf_int(alpha)) if ci.shape[0] > 1: ci = ci.T - use_t = (self.df is not None) + use_t = self.use_t sd = np.atleast_1d(np.sqrt(self.var_prob)) - statistic = np.atleast_1d(self.statistic) + statistic = np.atleast_1d(statistic) if xname is None: xname = ['c%d' % ii for ii in range(len(effect))] @@ -303,5 +349,118 @@ def rank_compare_2indep(x1, x2, use_t=True): mean1=meanr1, mean2=meanr2, prob1=prob1, prob2=prob2, somersd1=prob1 * 2 - 1, somersd2=prob2 * 2 - 1, - df=df + df=df, use_t=use_t ) + + +def rank_compare_2ordinal(count1, count2, ddof=1, use_t=True): + """stochastically larger probability for 2 independend ordinal sample + + This is a special case of `rank_compare_2indep` when the data are given as + counts of two independent ordinal, i.e. ordered multinomial samples. + + The statistic of interest is the probability that a random draw from the + population of the first sample has a larger value than a random draw from + the population of the second sample, specifically + + p = P(x1 > x2) + 0.5 * P(x1 = x2) + + Parameters + ---------- + count1 : array_like + counts of the first sample, categories are assumed to be ordered. + count1 : array_like + counts of the second sample, number of categories and ordering needs + to be the same as for sample 1 + ddof : scalar + Degrees of freedom correction for variance estimation. The default + ddof=1 corresponds to `rank_compare_2indep`. + use_t : bool + If use_t is true, the t distribution with Welch-Satterthwaite type + degrees of freedom is used for p-value and confidence interval. + If use_t is false, then the normal distribution is used. + + Returns + ------- + res : RankCompareResult + This includes methods for hypothesis tests and confidence intervals + for the probability that sample 1 is stochastically larger than + sample 2. 
+ """ + + count1 = np.asarray(count1) + count2 = np.asarray(count2) + nobs1, nobs2 = count1.sum(), count2.sum() + freq1 = count1 / nobs1 + freq2 = count2 / nobs2 + cdf1 = np.concatenate(([0], freq1)).cumsum(axis=0) + cdf2 = np.concatenate(([0], freq2)).cumsum(axis=0) + + # mid rank cdf + cdfm1 = (cdf1[1:] + cdf1[:-1]) / 2 + cdfm2 = (cdf2[1:] + cdf2[:-1]) / 2 + prob1 = (cdfm2 * freq1).sum() + prob2 = (cdfm1 * freq2).sum() + + var1 = (cdfm2**2 * freq1).sum() - prob1**2 + var2 = (cdfm1**2 * freq2).sum() - prob2**2 + + var_prob = (var1 / (nobs1 - ddof) + var2 / (nobs2 - ddof)) + nobs = nobs1 + nobs2 + var = nobs * var_prob + vn1 = var1 * nobs2 * nobs1 / (nobs1 - ddof) + vn2 = var2 * nobs1 * nobs2 / (nobs2 - ddof) + df = (vn1 + vn2)**2 / (vn1**2 / (nobs1 - 1) + vn2**2 / (nobs2 - 1)) + res = RankCompareResult(statistic=None, pvalue=None, s1=None, s2=None, + var1=var1, var2=var2, var=var, + var_prob=var_prob, + nobs1=nobs1, nobs2=nobs2, nobs=nobs, + mean1=None, mean2=None, + prob1=prob1, prob2=prob2, + somersd1=prob1 * 2 - 1, somersd2=prob2 * 2 - 1, + df=df, use_t=use_t + ) + + return res + + +def prob_larger_continuous(distr1, distr2): + """probability that distr1 is stochastically larger than distr2 + + This computes + + p = P(x1 > x2) + + for two distributions. + + Parameters + ---------- + distr1, distr2 : distributions + Two instances of scipy.stats.distributions. Methods that are required + are cdf, pdf and expect. + + Returns + ------- + p : probability x1 is larger than xw + + + Notes + ----- + This is a one-liner that is added mainly as reference. + + Examples + -------- + >>> from scipy import stats + >>> prob_larger_continuous(stats.norm, stats.t(5)) + 0.4999999999999999 + + # which is the same as + >>> stats.norm.expect(stats.t(5).cdf) + 0.4999999999999999 + + >>> prob_larger_continuous(stats.norm, stats.norm(loc=1)) + 0.23975006109347669 + + """ + + return distr1.expect(distr2.cdf) diff --git a/statsmodels/stats/tests/test_nonparametric.py b/statsmodels/stats/tests/test_nonparametric.py index d0d09b77ddf..5c1d36b68bc 100644 --- a/statsmodels/stats/tests/test_nonparametric.py +++ b/statsmodels/stats/tests/test_nonparametric.py @@ -10,6 +10,8 @@ import numpy as np from numpy.testing import (assert_allclose, assert_almost_equal, assert_approx_equal, assert_) + +from scipy import stats import pytest from statsmodels.stats.contingency_tables import ( @@ -17,7 +19,8 @@ from statsmodels.sandbox.stats.runs import (Runs, runstest_1samp, runstest_2samp) from statsmodels.sandbox.stats.runs import mcnemar as sbmcnemar -from statsmodels.stats.nonparametric import rank_compare_2indep +from statsmodels.stats.nonparametric import ( + rank_compare_2indep, rank_compare_2ordinal, prob_larger_continuous) from statsmodels.tools.testing import Holder @@ -415,6 +418,33 @@ def test_rank_compare_2indep1(): assert_allclose(res_tost.results_smaller.pvalue, 0.025, rtol=1e-11) assert_allclose(res_tost.results_larger.pvalue, 0.025, rtol=1e-11) + # extras + # cohen's d + esd = res.effectsize_normal() + p = prob_larger_continuous(stats.norm(loc=esd), stats.norm) + # round trip + assert_allclose(p, res.prob1, rtol=1e-13) + + +def test_rank_compare_ord(): + # compare ordinal count version with full version + # Example from Munzel and Hauschke 2003 + # data is given by counts, expand to observations + levels = [-2, -1, 0, 1, 2] + new = [24, 37, 21, 19, 6] + active = [11, 51, 22, 21, 7] + x1 = np.repeat(levels, new) + x2 = np.repeat(levels, active) + + for use_t in [False, True]: + res2 = rank_compare_2indep(x1, x2, 
use_t=use_t) + res1 = rank_compare_2ordinal(new, active, use_t=use_t) + assert_allclose(res2.prob1, res1.prob1, rtol=1e-13) + assert_allclose(res2.var_prob, res1.var_prob, rtol=1e-13) + s1 = str(res1.summary()) + s2 = str(res2.summary()) + assert s1 == s2 + def test_rank_compare_vectorized(): np.random.seed(987126) From c33d27c1481325733e4d1ca4ee1a2ad17d15ea0a Mon Sep 17 00:00:00 2001 From: Josef Perktold Date: Tue, 1 Sep 2020 11:52:26 -0400 Subject: [PATCH 6/6] DOC: improve docstrings, add to stats.rst, unit test for cohensd2problarger --- docs/source/stats.rst | 11 + statsmodels/stats/nonparametric.py | 280 +++++++++++++++--- statsmodels/stats/tests/test_nonparametric.py | 22 +- 3 files changed, 261 insertions(+), 52 deletions(-) diff --git a/docs/source/stats.rst b/docs/source/stats.rst index ea7deb8ffd3..aa7859ba4c9 100644 --- a/docs/source/stats.rst +++ b/docs/source/stats.rst @@ -181,6 +181,17 @@ Non-Parametric Tests sign_test +.. currentmodule:: statsmodels.stats.nonparametric + +.. autosummary:: + :toctree: generated/ + + rank_compare_2indep + rank_compare_2ordinal + cohensd2problarger + prob_larger_continuous + rankdata_2samp + Descriptive Statistics ---------------------- diff --git a/statsmodels/stats/nonparametric.py b/statsmodels/stats/nonparametric.py index 0270c698290..ff814ec3eb6 100644 --- a/statsmodels/stats/nonparametric.py +++ b/statsmodels/stats/nonparametric.py @@ -21,6 +21,25 @@ def rankdata_2samp(x1, x2): + """Compute midranks for two samples + + Parameters + ---------- + x1, x2 : array_like + Original data for two samples that will be converted to midranks. + + Returns + ------- + rank1 : ndarray + Midranks of the first sample in the pooled sample. + rank2 : ndarray + Midranks of the second sample in the pooled sample. + ranki1 : ndarray + Internal midranks of the first sample. + ranki2 : ndarray + Internal midranks of the second sample. + + """ x1 = np.asarray(x1) x2 = np.asarray(x2) @@ -47,9 +66,45 @@ def rankdata_2samp(x1, x2): class RankCompareResult(HolderTuple): """Results for rank comparison + + This is a subclass of HolderTuple that includes results from intermediate + computations, as well as methods for hypothesis tests, confidence intervals + ans summary. """ - def conf_int(self, alpha=0.05, value=None, alternative="two-sided"): + def conf_int(self, value=None, alpha=0.05, alternative="two-sided"): + """ + Confidence interval for probability that sample 1 has larger values + + Confidence interval is for the shifted probability + + P(x1 > x2) + 0.5 * P(x1 = x2) - value + + Parameters + ---------- + value : float + Value, default 0, shifts the confidence interval, + e.g. ``value=0.5`` centers the confidence interval at zero. + alpha : float + Significance level for the confidence interval, coverage is + ``1-alpha`` + alternative : str + The alternative hypothesis, H1, has to be one of the following + + * 'two-sided' : H1: ``prob - value`` not equal to 0. + * 'larger' : H1: ``prob - value > 0`` + * 'smaller' : H1: ``prob - value < 0`` + + Returns + ------- + lower : float or ndarray + Lower confidence limit. This is -inf for the one-sided alternative + "smaller". + upper : float or ndarray + Upper confidence limit. This is inf for the one-sided alternative + "larger". 
+ + """ p0 = value if p0 is None: @@ -67,6 +122,31 @@ def test_prob_superior(self, value=0.5, alternative="two-sided"): """test for superiority probability H0: P(x1 > x2) + 0.5 * P(x1 = x2) = value + + The alternative is that the probability is either not equal, larger + or smaller than the null-value depending on the chosen alternative. + + Parameters + ---------- + value : float + Value of the probability under the Null hypothesis. + alternative : str + The alternative hypothesis, H1, has to be one of the following + + * 'two-sided' : H1: ``prob - value`` not equal to 0. + * 'larger' : H1: ``prob - value > 0`` + * 'smaller' : H1: ``prob - value < 0`` + + Returns + ------- + res : HolderTuple + HolderTuple instance with the following main attributes + + statistic : float + Test statistic for z- or t-test + pvalue : float + Pvalue of the test based on either normal or t distribution. + """ p0 = value # alias @@ -95,8 +175,8 @@ def test_prob_superior(self, value=0.5, alternative="two-sided"): def tost_prob_superior(self, low, upp): '''test of stochastic (non-)equivalence of p = P(x1 > x2) - null hypothesis: p < low or p > upp - alternative hypothesis: low < p < upp + Null hypothesis: p < low or p > upp + Alternative hypothesis: low < p < upp where p is the probability that a random draw from the population of the first sample has a larger value than a random draw from the @@ -116,14 +196,21 @@ def tost_prob_superior(self, low, upp): Returns ------- - pvalue : float - pvalue of the non-equivalence test - t1, pv1, df1 : tuple - test statistic, pvalue and degrees of freedom for lower threshold - test - t2, pv2, df2 : tuple - test statistic, pvalue and degrees of freedom for upper threshold - test + res : HolderTuple + HolderTuple instance with the following main attributes + + pvalue : float + Pvalue of the equivalence test given by the larger pvalue of + the two one-sided tests. + statistic : float + Test statistic of the one-sided test that has the larger + pvalue. + results_larger : HolderTuple + Results instanc with test statistic, pvalue and degrees of + freedom for lower threshold test. + results_smaller : HolderTuple + Results instanc with test statistic, pvalue and degrees of + freedom for upper threshold test. ''' @@ -157,9 +244,25 @@ def confint_lintransf(self, const=-1, slope=2, alpha=0.05, Parameters ---------- const, slope : float - Constant and slope for linear transformation. - alpha : float in [0, 1] - alternative : + Constant and slope for linear (affine) transformation. + alpha : float + Significance level for the confidence interval, coverage is + ``1-alpha`` + alternative : str + The alternative hypothesis, H1, has to be one of the following + + * 'two-sided' : H1: ``prob - value`` not equal to 0. + * 'larger' : H1: ``prob - value > 0`` + * 'smaller' : H1: ``prob - value < 0`` + + Returns + ------- + lower : float or ndarray + Lower confidence limit. This is -inf for the one-sided alternative + "smaller". + upper : float or ndarray + Upper confidence limit. This is inf for the one-sided alternative + "larger". """ @@ -170,22 +273,52 @@ def confint_lintransf(self, const=-1, slope=2, alpha=0.05, low, upp = upp, low return low, upp - def effectsize_normal(self): + def effectsize_normal(self, prob=None): """ Cohen's d, standardized mean difference under normality assumption. - This computes the standardized mean difference effect size that is - equivalent to the rank based probability of superiority estimate, - if we assume that the data is normally distributed. 
-    def effectsize_normal(self):
+    def effectsize_normal(self, prob=None):
         """
         Cohen's d, standardized mean difference under normality assumption.

-        This computes the standardized mean difference effect size that is
-        equivalent to the rank based probability of superiority estimate,
-        if we assume that the data is normally distributed.
+        This computes the standardized mean difference, Cohen's d, effect size
+        that is equivalent to the rank based probability ``p`` of being
+        stochastically larger if we assume that the data is normally
+        distributed, given by
+
+        :math:`d = F^{-1}(p) \\cdot \\sqrt{2}`
+
+        where :math:`F^{-1}` is the inverse of the cdf of the normal
+        distribution.
+
+        Parameters
+        ----------
+        prob : float in (0, 1)
+            Probability to be converted to Cohen's d effect size.
+            If prob is None, then the ``prob1`` attribute is used.

         Returns
         -------
-        equivalent smd effect size
+        equivalent Cohen's d effect size under normality assumption.

         """

-        return stats.norm.ppf(self.prob1) * np.sqrt(2)
+        if prob is None:
+            prob = self.prob1
+        return stats.norm.ppf(prob) * np.sqrt(2)

     def summary(self, alpha=0.05, xname=None):
+        """summary table for probability that random draw x1 is larger than x2
+
+        Parameters
+        ----------
+        alpha : float
+            Significance level for confidence intervals. Coverage is
+            ``1 - alpha``.
+        xname : None or list of str
+            If None, then each row has a name column with generic names.
+            If xname is a list of strings, then it will be included as part
+            of those names.
+
+        Returns
+        -------
+        SimpleTable instance with methods to convert to different output
+        formats.
+        """

         yname = "None"
         effect = np.atleast_1d(self.prob1)
@@ -229,7 +362,8 @@ def rank_compare_2indep(x1, x2, use_t=True):
     This is a measure underlying Wilcoxon-Mann-Whitney's U test,
     Fligner-Policello test and Brunner-Munzel test, and
     Inference is based on the asymptotic distribution of the Brunner-Munzel
-    test.
+    test. The half probability for ties corresponds to the use of midranks
+    and makes it valid for discrete variables.

     The Null hypothesis for stochastic equality is p = 0.5, which corresponds
     to the Brunner-Munzel test.
@@ -253,7 +387,6 @@ def rank_compare_2indep(x1, x2, use_t=True):
         p-value assuming an t distribution. One-sided or
         two-sided, depending on the choice of `alternative` and `use_t`.

-
     See Also
     --------
     scipy.stats.brunnermunzel : Brunner-Munzel test for stochastic equality
@@ -269,16 +402,28 @@ def rank_compare_2indep(x1, x2, use_t=True):

     Brunner and Munzel recommended to estimate the p-value by t-distribution
     when the size of data is 50 or less. If the size is lower than 10, it would
-    be better to use permuted Brunner Munzel test (see [2]_).
+    be better to use the permuted Brunner-Munzel test (see [2]_) for the test
+    of stochastic equality.

     This measure has been introduced in the literature under many different
     names relying on a variety of assumptions.
-    In psychology, ... introduced it as Common Language effect size for the
-    continuous, normal distribution case, ... extended it to the nonparameteric
+    In psychology, McGraw and Wong (1992) introduced it as Common Language
+    effect size for the continuous, normal distribution case;
+    Vargha and Delaney (2000) [3]_ extended it to the nonparametric
     continuous distribution case as in Fligner-Policello.

-    Note: Brunner-Munzel define the probability for x1 to be stochastically
-    smaller than x2, while here we use stochastically larger.
+    WMW and related tests can be interpreted as tests of medians or tests of
+    central location only under very restrictive additional assumptions, such
+    as that both distributions are identical under the equality null
+    hypothesis (assumed by Mann-Whitney) or that both distributions are
+    symmetric (shown by Fligner-Policello). If the distributions of the two
+    samples can differ in an arbitrary way, then the equality null hypothesis
+    corresponds to p = 0.5 against an alternative p != 0.5. See, for example,
+    Conroy (2012) [4]_ and Divine et al. (2018) [5]_.
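A small numerical check of the ``effectsize_normal`` conversion documented a few lines up (a sketch; the value 0.6 is arbitrary and the results are approximate):

>>> import numpy as np
>>> from scipy import stats
>>> p = 0.6
>>> d = stats.norm.ppf(p) * np.sqrt(2)  # formula used by effectsize_normal, roughly 0.358
>>> stats.norm.cdf(d / np.sqrt(2))      # cohensd2problarger inverts it, giving back 0.6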
+
+    Note: Brunner-Munzel and related literature define the probability that x1
+    is stochastically smaller than x2, while here we use stochastically larger.
+    This is equivalent to switching x1 and x2 in the two-sample case.

     References
     ----------
@@ -288,16 +433,19 @@ def rank_compare_2indep(x1, x2, use_t=True):
     .. [2] Neubert, K. and Brunner, E. "A studentized permutation test for the
        non-parametric Behrens-Fisher problem". Computational Statistics and
        Data Analysis. Vol. 51(2007): 5192-5204.
-    Examples
-    --------
-    >>> from scipy import stats
-    >>> x1 = [1,2,1,1,1,1,1,1,1,1,2,4,1,1]
-    >>> x2 = [3,3,4,3,1,2,3,1,1,5,4]
-    >>> w, p_value = stats.brunnermunzel(x1, x2)
-    >>> w
-    3.1374674823029505
-    >>> p_value
-    0.0057862086661515377
+    .. [3] Vargha, András, and Harold D. Delaney. 2000. “A Critique and
+       Improvement of the CL Common Language Effect Size Statistics of
+       McGraw and Wong.” Journal of Educational and Behavioral Statistics
+       25 (2): 101–32. https://doi.org/10.3102/10769986025002101.
+    .. [4] Conroy, Ronán M. 2012. “What Hypotheses Do ‘Nonparametric’ Two-Group
+       Tests Actually Test?” The Stata Journal: Promoting Communications on
+       Statistics and Stata 12 (2): 182–90.
+       https://doi.org/10.1177/1536867X1201200202.
+    .. [5] Divine, George W., H. James Norton, Anna E. Barón, and Elizabeth
+       Juarez-Colunga. 2018. “The Wilcoxon–Mann–Whitney Procedure Fails as
+       a Test of Medians.” The American Statistician 72 (3): 278–86.
+       https://doi.org/10.1080/00031305.2017.1305291.
+
     """
     x1 = np.asarray(x1)
     x2 = np.asarray(x2)
@@ -354,10 +502,10 @@ def rank_compare_2indep(x1, x2, use_t=True):

 def rank_compare_2ordinal(count1, count2, ddof=1, use_t=True):
-    """stochastically larger probability for 2 independend ordinal sample
+    """stochastically larger probability for 2 independent ordinal samples

     This is a special case of `rank_compare_2indep` when the data are given as
-    counts of two independent ordinal, i.e. ordered multinomial samples.
+    counts of two independent ordinal, i.e. ordered multinomial, samples.

     The statistic of interest is the probability that a random draw from the
     population of the first sample has a larger value than a random draw from
     the population of the second sample, specifically

         p = P(x1 > x2) + 0.5 * P(x1 = x2)

     Parameters
     ----------
     count1 : array_like
-        counts of the first sample, categories are assumed to be ordered.
-    count1 : array_like
-        counts of the second sample, number of categories and ordering needs
+        Counts of the first sample, categories are assumed to be ordered.
+    count2 : array_like
+        Counts of the second sample, number of categories and ordering need
         to be the same as for sample 1
     ddof : scalar
         Degrees of freedom correction for variance estimation. The default
@@ -386,6 +534,13 @@ def rank_compare_2ordinal(count1, count2, ddof=1, use_t=True):
         This includes methods for hypothesis tests and confidence intervals
         for the probability that sample 1 is stochastically larger than
         sample 2.
+
+    Notes
+    -----
+    The implementation is based on the appendix of Munzel and Hauschke (2003)
+    with the addition of ``ddof`` so that the results match the general
+    function `rank_compare_2indep`.
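A minimal sketch of the ordinal-count interface documented above (not from the patch; the counts are made up, and both samples must share the same ordered categories):

>>> from statsmodels.stats.nonparametric import rank_compare_2ordinal
>>> count1 = [10, 5, 3, 2]  # made-up frequencies over four ordered categories
>>> count2 = [3, 4, 6, 7]
>>> res = rank_compare_2ordinal(count1, count2, use_t=True)
>>> res.prob1               # P(x1 > x2) + 0.5 * P(x1 = x2) estimated from the counts
>>> res.conf_int(alpha=0.05)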
+ """ count1 = np.asarray(count1) @@ -425,23 +580,24 @@ def rank_compare_2ordinal(count1, count2, ddof=1, use_t=True): def prob_larger_continuous(distr1, distr2): - """probability that distr1 is stochastically larger than distr2 + """probability indicating that distr1 is stochastically larger than distr2 This computes p = P(x1 > x2) - for two distributions. + for two continuous distributions, where `distr1` and `distr2` are the + distributions of random variables x1 and x2 respectively. Parameters ---------- distr1, distr2 : distributions - Two instances of scipy.stats.distributions. Methods that are required - are cdf, pdf and expect. + Two instances of scipy.stats.distributions. The required methods are + cdf of the second distribution and expect of the first distribution. Returns ------- - p : probability x1 is larger than xw + p : probability x1 is larger than x2 Notes @@ -458,9 +614,39 @@ def prob_larger_continuous(distr1, distr2): >>> stats.norm.expect(stats.t(5).cdf) 0.4999999999999999 + # distribution 1 with smaller mean (loc) than distribution 2 >>> prob_larger_continuous(stats.norm, stats.norm(loc=1)) 0.23975006109347669 """ return distr1.expect(distr2.cdf) + + +def cohensd2problarger(d): + """convert Cohen's d effect size to stochastically-larger-probability + + This assumes observations are normally distributed. + + Computed as + + p = Prob(x1 > x2) = F(d / sqrt(2)) + + where `F` is cdf of normal distribution. Cohen's d is defined as + + d = (mean1 - mean2) / std + + where ``std`` is the pooled within standard deviation. + + Parameters + ---------- + d : float or array_like + Cohen's d effect size for difference mean1 - mean2. + + Returns + ------- + prob : float or ndarray + Prob(x1 > x2) + """ + + return stats.norm.cdf(d / np.sqrt(2)) diff --git a/statsmodels/stats/tests/test_nonparametric.py b/statsmodels/stats/tests/test_nonparametric.py index 5c1d36b68bc..c5bac26aee3 100644 --- a/statsmodels/stats/tests/test_nonparametric.py +++ b/statsmodels/stats/tests/test_nonparametric.py @@ -20,7 +20,8 @@ runstest_1samp, runstest_2samp) from statsmodels.sandbox.stats.runs import mcnemar as sbmcnemar from statsmodels.stats.nonparametric import ( - rank_compare_2indep, rank_compare_2ordinal, prob_larger_continuous) + rank_compare_2indep, rank_compare_2ordinal, prob_larger_continuous, + cohensd2problarger) from statsmodels.tools.testing import Holder @@ -294,10 +295,14 @@ def test_brunnermunzel_one_sided(): x, y = y, x # Results are compared with R's lawstat package. 
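The helpers ``prob_larger_continuous`` and ``cohensd2problarger`` documented above can be cross-checked against each other: for two unit-variance normal distributions whose means differ by d, both should give Phi(d / sqrt(2)). A sketch with an arbitrary d (approximate value noted in the comment):

>>> from scipy import stats
>>> from statsmodels.stats.nonparametric import (
...     cohensd2problarger, prob_larger_continuous)
>>> d = 0.5
>>> cohensd2problarger(d)                                  # roughly 0.638
>>> prob_larger_continuous(stats.norm(loc=d), stats.norm)  # same value from the distributions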
-    u1, p1 = rank_compare_2indep(x, y).test_prob_superior(alternative='smaller')
-    u2, p2 = rank_compare_2indep(y, x).test_prob_superior(alternative='larger')
-    u3, p3 = rank_compare_2indep(x, y).test_prob_superior(alternative='larger')
-    u4, p4 = rank_compare_2indep(y, x).test_prob_superior(alternative='smaller')
+    u1, p1 = rank_compare_2indep(x, y).test_prob_superior(
+        alternative='smaller')
+    u2, p2 = rank_compare_2indep(y, x).test_prob_superior(
+        alternative='larger')
+    u3, p3 = rank_compare_2indep(x, y).test_prob_superior(
+        alternative='larger')
+    u4, p4 = rank_compare_2indep(y, x).test_prob_superior(
+        alternative='smaller')

     assert_approx_equal(p1, p2, significant=significant)
     assert_approx_equal(p3, p4, significant=significant)
@@ -425,6 +430,13 @@ def test_rank_compare_2indep1():
     # round trip
     assert_allclose(p, res.prob1, rtol=1e-13)

+    # round trip with Cohen's d
+    pc = cohensd2problarger(esd)
+    assert_allclose(pc, res.prob1, rtol=1e-13)
+
+    ci_tr = res.confint_lintransf(1, -1)
+    assert_allclose(ci_tr, 1 - np.array(res2_t.ci)[::-1], rtol=0.005)
+

 def test_rank_compare_ord():
     # compare ordinal count version with full version