Merge b04b51c into 26d901a

statsmodels · Aug 24, 2020 · a1236f3 · a1236f3
2 parents 26d901a + b04b51c
commit a1236f3
Show file tree

Hide file tree

Showing 7 changed files with 308 additions and 1 deletion.
diff --git a/docs/source/release/version0.12.rst b/docs/source/release/version0.12.rst
@@ -293,6 +293,10 @@ Submodules
 - Add expanding initialization to RollingOLS/WLS  (:pr:`6838`)
 - Add  a note when R2 is uncentered  (:pr:`6844`)
 
+``robust``
+~~~~~~~~~~~~~~
+- Add an implementation of the Qn robust scale estimator (:pr:`6977`)
+
 ``stats``
 ~~~~~~~~~
 - Multivariate mean tests and confint  (:pr:`4107`)
@@ -800,6 +804,7 @@ New Functions
 * :func:`statsmodels.multivariate.plots.plot_loadings`
 * :func:`statsmodels.multivariate.plots.plot_scree`
 * :func:`statsmodels.robust.scale.iqr`
+* :func:`statsmodels.robust.scale.qn`
 * :func:`statsmodels.stats.contrast.wald_test_noncent`
 * :func:`statsmodels.stats.contrast.wald_test_noncent_generic`
 * :func:`statsmodels.stats.descriptivestats.describe`

diff --git a/docs/source/rlm.rst b/docs/source/rlm.rst
@@ -44,6 +44,7 @@ References
 * PJ Huber. ‘Robust Statistics’ John Wiley and Sons, Inc., New York. 1981.
 * PJ Huber. 1973, ‘The 1972 Wald Memorial Lectures: Robust Regression: Asymptotics, Conjectures, and Monte Carlo.’ The Annals of Statistics, 1.5, 799-821.
 * R Venables, B Ripley. ‘Modern Applied Statistics in S’ Springer, New York,
+* C Croux, PJ Rousseeuw, 'Time-efficient algorithms for two highly robust estimators of scale' Computational statistics. Physica, Heidelberg, 1992.
 
 Module Reference
 ----------------
@@ -105,3 +106,4 @@ Scale
     mad
     hubers_scale
     iqr
+    qn
diff --git a/examples/notebooks/robust_models_1.ipynb b/examples/notebooks/robust_models_1.ipynb
@@ -435,7 +435,27 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The IQR is less robust than the MAD in the sense that it has a lower breakdown point: it can withstand 25\\% outlying observations before being completely ruined, whereas the MAD can withstand 50\\% outlying observations. However, the IQR is better suited for asymmetric distributions. See Rousseeuw & Croux (1993), 'Alternatives to the Median Absolute Deviation' for more details."
+    "The IQR is less robust than the MAD in the sense that it has a lower breakdown point: it can withstand 25\\% outlying observations before being completely ruined, whereas the MAD can withstand 50\\% outlying observations. However, the IQR is better suited for asymmetric distributions."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Yet another robust estimator of scale is the $Q_n$ estimator, introduced in Rousseeuw & Croux (1993), 'Alternatives to the Median Absolute Deviation'. Then $Q_n$ estimator is given by\n",
+    "$$\n",
+    "Q_n = K \\left\\lbrace \\vert X_{i} - X_{j}\\vert : i<j\\right\\rbrace_{(h)}\n",
+    "$$\n",
+    "where $h\\approx (1/4){{n}\\choose{2}}$ and $K$ is a given constant. In words, the $Q_n$ estimator is the normalized $h$-th order statistic of the absolute differences of the data. The normalizing constant $K$ is usually chosen as 2.219144, to make the estimator consistent for the standard deviation in the case of normal data. The $Q_n$ estimator has a 50\\% breakdown point and a 82\\% asymptotic efficiency at the normal distribution, much higher than the 37\\% efficiency of the MAD."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sm.robust.scale.qn(x)"
    ]
   },
   {

diff --git a/setup.py b/setup.py
@@ -159,6 +159,7 @@
     _kim_smoother={'source': 'statsmodels/tsa/regime_switching/_kim_smoother.pyx.in'},  # noqa: E501
     _arma_innovations={'source': 'statsmodels/tsa/innovations/_arma_innovations.pyx.in'},  # noqa: E501
     linbin={'source': 'statsmodels/nonparametric/linbin.pyx'},
+    _qn={'source': 'statsmodels/robust/_qn.pyx'},
     _smoothers_lowess={'source': 'statsmodels/nonparametric/_smoothers_lowess.pyx'},  # noqa: E501
     kalman_loglike={'source': 'statsmodels/tsa/kalmanf/kalman_loglike.pyx',
                     'include_dirs': ['statsmodels/src'],

diff --git a/statsmodels/robust/_qn.pyx b/statsmodels/robust/_qn.pyx
@@ -0,0 +1,135 @@
+#!python
+#cython: wraparound=False, boundscheck=False, cdivision=True
+
+import numpy as np
+
+
+def _high_weighted_median(double[::1] a, int[::1] weights):
+    """
+    Computes a weighted high median of a. This is defined as the
+    smallest a[j] such that the sum over all a[i]<=a[j] is strictly
+    greater than half the total sum of the weights
+    """
+    cdef:
+        double[::1] a_cp = np.copy(a)
+        int[::1] weights_cp = np.copy(weights)
+        int n = a_cp.shape[0]
+        double[::1] sorted_a = np.zeros((n,), dtype=np.double)
+        double[::1] a_cand = np.zeros((n,), dtype=np.double)
+        int[::1] weights_cand = np.zeros((n,), dtype=np.intc)
+        int kcand = 0
+        int wleft, wright, wmid, wtot, wrest = 0
+        double trial = 0
+    wtot = np.sum(weights_cp)
+    while True:
+        wleft = 0
+        wmid = 0
+        wright = 0
+        for i in range(n):
+            sorted_a[i] = a_cp[i]
+        sorted_a = np.partition(sorted_a, kth=n//2)
+        trial = sorted_a[n//2]
+        for i in range(n):
+            if a_cp[i] < trial:
+                wleft = wleft + weights_cp[i]
+            elif a_cp[i] > trial:
+                wright = wright + weights_cp[i]
+            else:
+                wmid = wmid + weights_cp[i]
+        kcand = 0
+        if 2 * (wrest + wleft) > wtot:
+            for i in range(n):
+                if a_cp[i] < trial:
+                    a_cand[kcand] = a_cp[i]
+                    weights_cand[kcand] = weights_cp[i]
+                    kcand = kcand + 1
+        elif 2 * (wrest + wleft + wmid) <= wtot:
+            for i in range(n):
+                if a_cp[i] > trial:
+                    a_cand[kcand] = a_cp[i]
+                    weights_cand[kcand] = weights_cp[i]
+                    kcand = kcand + 1
+            wrest = wrest + wleft + wmid
+        else:
+            break
+        n = kcand
+        for i in range(n):
+            a_cp[i] = a_cand[i]
+            weights_cp[i] = weights_cand[i]
+    return trial
+
+
+def _qn(double[:] a, double c):
+    """
+    Computes the Qn robust estimator of scale, a more efficient alternative
+    to the MAD. The implementation follows the algorithm described in Croux
+    and Rousseeuw (1992).
+
+    Parameters
+    ----------
+    a : array_like
+        Input array.
+    c : float, optional
+        The normalization constant, used to get consistent estimates of the
+        standard deviation at the normal distribution.  Defined as
+        1/(np.sqrt(2) * scipy.stats.norm.ppf(5/8)), which is 2.219144.
+
+    Returns
+    -------
+    The Qn robust estimator of scale
+    """
+    cdef:
+        int n = a.shape[0]
+        int h = n/2 + 1
+        int k = h * (h - 1) / 2
+        int n_left = n * (n + 1) / 2
+        int n_right = n * n
+        int k_new = k + n_left
+        Py_ssize_t i, j, jh, l = 0
+        int sump, sumq = 0
+        double trial, output = 0
+        double[::1] a_sorted = np.sort(a)
+        int[::1] left = np.array([n - i + 1 for i in range(0, n)], dtype=np.intc)
+        int[::1] right = np.array([n if i <= h else n - (i - h) for i in range(0, n)], dtype=np.intc)
+        int[::1] weights = np.zeros((n,), dtype=np.intc)
+        double[::1] work = np.zeros((n,), dtype=np.double)
+        int[::1] p = np.zeros((n,), dtype=np.intc)
+        int[::1] q = np.zeros((n,), dtype=np.intc)
+    while n_right - n_left > n:
+        j = 0
+        for i in range(1, n):
+            if left[i] <= right[i]:
+                weights[j] = right[i] - left[i] + 1
+                jh = left[i] + weights[j] // 2
+                work[j] = a_sorted[i] - a_sorted[n - jh]
+                j = j + 1
+        trial = _high_weighted_median(work[:j], weights[:j])
+        j = 0
+        for i in range(n - 1, -1, -1):
+            while j < n and (a_sorted[i] - a_sorted[n - j - 1]) < trial:
+                j = j + 1
+            p[i] = j
+        j = n + 1
+        for i in range(n):
+            while (a_sorted[i] - a_sorted[n - j + 1]) > trial:
+                j = j - 1
+            q[i] = j
+        sump = np.sum(p)
+        sumq = np.sum(q) - n
+        if k_new <= sump:
+            right = np.copy(p)
+            n_right = sump
+        elif k_new > sumq:
+            left = np.copy(q)
+            n_left = sumq
+        else:
+            output = c * trial
+            return output
+    j = 0
+    for i in range(1, n):
+        for l in range(left[i], right[i] + 1):
+            work[j] = a_sorted[i] - a_sorted[n - l]
+            j = j + 1
+    k_new = k_new - (n_left + 1)
+    output = c * np.sort(work[:j])[k_new]
+    return output
diff --git a/statsmodels/robust/scale.py b/statsmodels/robust/scale.py
@@ -7,12 +7,16 @@
 
 R Venables, B Ripley. 'Modern Applied Statistics in S'
     Springer, New York, 2002.
+
+C Croux, PJ Rousseeuw, 'Time-efficient algorithms for two highly robust
+estimators of scale' Computational statistics. Physica, Heidelberg, 1992.
 """
 import numpy as np
 from scipy.stats import norm as Gaussian
 from . import norms
 from statsmodels.tools import tools
 from statsmodels.tools.validation import array_like, float_like
+from ._qn import _qn
 
 
 def mad(a, c=Gaussian.ppf(3/4.), axis=0, center=np.median):
@@ -81,6 +85,79 @@ def iqr(a, c=Gaussian.ppf(3/4) - Gaussian.ppf(1/4), axis=0):
         return np.squeeze(np.diff(quantiles, axis=0) / c)
 
 
+def qn_scale(a, c=1 / (np.sqrt(2) * Gaussian.ppf(5 / 8)), axis=0):
+    """
+    Computes the Qn robust estimator of scale
+
+    The Qn scale estimator is a more efficient alternative to the MAD.
+    The Qn scale estimator of an array a of length n is defined as
+    c * {abs(a[i] - a[j]): i<j}_(k), for k equal to [n/2] + 1 choose 2. Thus,
+    the Qn estimator is the k-th order statistic of the absolute differences
+    of the array. The optional constant is used to normalize the estimate
+    as explained below. The implementation follows the algorithm described
+    in Croux and Rousseeuw (1992).
+
+    Parameters
+    ----------
+    a : array_like
+        Input array.
+    c : float, optional
+        The normalization constant. The default value is used to get consistent
+        estimates of the standard deviation at the normal distribution.
+    axis : int, optional
+        The default is 0.
+
+    Returns
+    -------
+    {float, ndarray}
+        The Qn robust estimator of scale
+    """
+    a = array_like(a, 'a', ndim=None, dtype=np.float64, contiguous=True,
+                   order='C')
+    c = float_like(c, 'c')
+    if a.ndim == 0:
+        raise ValueError("a should have at least one dimension")
+    elif a.size == 0:
+        return np.nan
+    else:
+        out = np.apply_along_axis(_qn, axis=axis, arr=a, c=c)
+        if out.ndim == 0:
+            return float(out)
+        return out
+
+
+def _qn_naive(a, c=1 / (np.sqrt(2) * Gaussian.ppf(5 / 8))):
+    """
+    A naive implementation of the Qn robust estimator of scale, used solely
+    to test the faster, more involved one
+
+    Parameters
+    ----------
+    a : array_like
+        Input array.
+    c : float, optional
+        The normalization constant, used to get consistent estimates of the
+        standard deviation at the normal distribution.  Defined as
+        1/(np.sqrt(2) * scipy.stats.norm.ppf(5/8)), which is 2.219144.
+
+    Returns
+    -------
+    The Qn robust estimator of scale
+    """
+    a = np.squeeze(a)
+    n = a.shape[0]
+    if a.size == 0:
+        return np.nan
+    else:
+        h = int(n // 2 + 1)
+        k = int(h * (h - 1) / 2)
+        idx = np.triu_indices(n, k=1)
+        diffs = np.abs(a[idx[0]] - a[idx[1]])
+        output = np.partition(diffs, kth=k - 1)[k - 1]
+        output = c * output
+        return output
+
+
 class Huber(object):
     """
     Huber's proposal 2 for estimating location and scale jointly.

diff --git a/statsmodels/robust/tests/test_scale.py b/statsmodels/robust/tests/test_scale.py
@@ -9,6 +9,7 @@
 # Example from Section 5.5, Venables & Ripley (2002)
 
 import statsmodels.robust.scale as scale
+import statsmodels.api as sm
 
 DECIMAL = 4
 # TODO: Can replicate these tests using stackloss data and R if this
@@ -35,6 +36,9 @@ def test_mad(self):
     def test_iqr(self):
         assert_almost_equal(scale.iqr(self.chem), 0.68570, DECIMAL)
 
+    def test_qn(self):
+        assert_almost_equal(scale.qn_scale(self.chem), 0.73231, DECIMAL)
+
     def test_huber_scale(self):
         assert_almost_equal(scale.huber(self.chem)[0], 3.20549, DECIMAL)
 
@@ -147,6 +151,69 @@ def test_axisneg1(self):
         assert_equal(m.shape, (40, 10))
 
 
+class TestQn(object):
+    @classmethod
+    def setup_class(cls):
+        np.random.seed(54321)
+        cls.normal = standard_normal(size=40)
+        cls.range = np.arange(0, 40)
+        cls.exponential = np.random.exponential(size=40)
+        cls.stackloss = sm.datasets.stackloss.load_pandas().data
+        cls.sunspot = sm.datasets.sunspots.load_pandas().data.SUNACTIVITY
+
+    def test_qn_naive(self):
+        assert_almost_equal(scale.qn_scale(self.normal),
+                            scale._qn_naive(self.normal), DECIMAL)
+        assert_almost_equal(scale.qn_scale(self.range),
+                            scale._qn_naive(self.range), DECIMAL)
+        assert_almost_equal(scale.qn_scale(self.exponential),
+                            scale._qn_naive(self.exponential), DECIMAL)
+
+    def test_qn_robustbase(self):
+        # from R's robustbase with finite.corr = FALSE
+        assert_almost_equal(scale.qn_scale(self.range), 13.3148, DECIMAL)
+        assert_almost_equal(scale.qn_scale(self.stackloss),
+                            np.array([8.87656, 8.87656, 2.21914, 4.43828]),
+                            DECIMAL)
+        # sunspot.year from datasets in R only goes up to 289
+        assert_almost_equal(scale.qn_scale(self.sunspot[0:289]), 33.50901,
+                            DECIMAL)
+
+    def test_qn_empty(self):
+        empty = np.empty(0)
+        assert np.isnan(scale.qn_scale(empty))
+        empty = np.empty((10, 100, 0))
+        assert_equal(scale.qn_scale(empty, axis=1), np.empty((10, 0)))
+        empty = np.empty((100, 100, 0, 0))
+        assert_equal(scale.qn_scale(empty, axis=-1), np.empty((100, 100, 0)))
+        empty = np.empty(shape=())
+        with pytest.raises(ValueError):
+            scale.iqr(empty)
+
+
+class TestQnAxes(object):
+    @classmethod
+    def setup_class(cls):
+        np.random.seed(54321)
+        cls.X = standard_normal((40, 10, 30))
+
+    def test_axis0(self):
+        m = scale.qn_scale(self.X, axis=0)
+        assert_equal(m.shape, (10, 30))
+
+    def test_axis1(self):
+        m = scale.qn_scale(self.X, axis=1)
+        assert_equal(m.shape, (40, 30))
+
+    def test_axis2(self):
+        m = scale.qn_scale(self.X, axis=2)
+        assert_equal(m.shape, (40, 10))
+
+    def test_axisneg1(self):
+        m = scale.qn_scale(self.X, axis=-1)
+        assert_equal(m.shape, (40, 10))
+
+
 class TestHuber(object):
     @classmethod
     def setup_class(cls):