Skip to content

Commit

Permalink
Merge b04b51c into 26d901a
Browse files Browse the repository at this point in the history
  • Loading branch information
esmucler committed Aug 24, 2020
2 parents 26d901a + b04b51c commit a1236f3
Show file tree
Hide file tree
Showing 7 changed files with 308 additions and 1 deletion.
5 changes: 5 additions & 0 deletions docs/source/release/version0.12.rst
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,10 @@ Submodules
- Add expanding initialization to RollingOLS/WLS (:pr:`6838`)
- Add a note when R2 is uncentered (:pr:`6844`)

``robust``
~~~~~~~~~~~~~~
- Add an implementation of the Qn robust scale estimator (:pr:`6977`)

``stats``
~~~~~~~~~
- Multivariate mean tests and confint (:pr:`4107`)
Expand Down Expand Up @@ -800,6 +804,7 @@ New Functions
* :func:`statsmodels.multivariate.plots.plot_loadings`
* :func:`statsmodels.multivariate.plots.plot_scree`
* :func:`statsmodels.robust.scale.iqr`
* :func:`statsmodels.robust.scale.qn`
* :func:`statsmodels.stats.contrast.wald_test_noncent`
* :func:`statsmodels.stats.contrast.wald_test_noncent_generic`
* :func:`statsmodels.stats.descriptivestats.describe`
Expand Down
2 changes: 2 additions & 0 deletions docs/source/rlm.rst
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ References
* PJ Huber. ‘Robust Statistics’ John Wiley and Sons, Inc., New York. 1981.
* PJ Huber. 1973, ‘The 1972 Wald Memorial Lectures: Robust Regression: Asymptotics, Conjectures, and Monte Carlo.’ The Annals of Statistics, 1.5, 799-821.
* R Venables, B Ripley. ‘Modern Applied Statistics in S’ Springer, New York,
* C Croux, PJ Rousseeuw, 'Time-efficient algorithms for two highly robust estimators of scale' Computational statistics. Physica, Heidelberg, 1992.

Module Reference
----------------
Expand Down Expand Up @@ -105,3 +106,4 @@ Scale
mad
hubers_scale
iqr
qn
22 changes: 21 additions & 1 deletion examples/notebooks/robust_models_1.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -435,7 +435,27 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"The IQR is less robust than the MAD in the sense that it has a lower breakdown point: it can withstand 25\\% outlying observations before being completely ruined, whereas the MAD can withstand 50\\% outlying observations. However, the IQR is better suited for asymmetric distributions. See Rousseeuw & Croux (1993), 'Alternatives to the Median Absolute Deviation' for more details."
"The IQR is less robust than the MAD in the sense that it has a lower breakdown point: it can withstand 25\\% outlying observations before being completely ruined, whereas the MAD can withstand 50\\% outlying observations. However, the IQR is better suited for asymmetric distributions."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Yet another robust estimator of scale is the $Q_n$ estimator, introduced in Rousseeuw & Croux (1993), 'Alternatives to the Median Absolute Deviation'. Then $Q_n$ estimator is given by\n",
"$$\n",
"Q_n = K \\left\\lbrace \\vert X_{i} - X_{j}\\vert : i<j\\right\\rbrace_{(h)}\n",
"$$\n",
"where $h\\approx (1/4){{n}\\choose{2}}$ and $K$ is a given constant. In words, the $Q_n$ estimator is the normalized $h$-th order statistic of the absolute differences of the data. The normalizing constant $K$ is usually chosen as 2.219144, to make the estimator consistent for the standard deviation in the case of normal data. The $Q_n$ estimator has a 50\\% breakdown point and a 82\\% asymptotic efficiency at the normal distribution, much higher than the 37\\% efficiency of the MAD."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sm.robust.scale.qn(x)"
]
},
{
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@
_kim_smoother={'source': 'statsmodels/tsa/regime_switching/_kim_smoother.pyx.in'}, # noqa: E501
_arma_innovations={'source': 'statsmodels/tsa/innovations/_arma_innovations.pyx.in'}, # noqa: E501
linbin={'source': 'statsmodels/nonparametric/linbin.pyx'},
_qn={'source': 'statsmodels/robust/_qn.pyx'},
_smoothers_lowess={'source': 'statsmodels/nonparametric/_smoothers_lowess.pyx'}, # noqa: E501
kalman_loglike={'source': 'statsmodels/tsa/kalmanf/kalman_loglike.pyx',
'include_dirs': ['statsmodels/src'],
Expand Down
135 changes: 135 additions & 0 deletions statsmodels/robust/_qn.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
#!python
#cython: wraparound=False, boundscheck=False, cdivision=True

import numpy as np


def _high_weighted_median(double[::1] a, int[::1] weights):
"""
Computes a weighted high median of a. This is defined as the
smallest a[j] such that the sum over all a[i]<=a[j] is strictly
greater than half the total sum of the weights
"""
cdef:
double[::1] a_cp = np.copy(a)
int[::1] weights_cp = np.copy(weights)
int n = a_cp.shape[0]
double[::1] sorted_a = np.zeros((n,), dtype=np.double)
double[::1] a_cand = np.zeros((n,), dtype=np.double)
int[::1] weights_cand = np.zeros((n,), dtype=np.intc)
int kcand = 0
int wleft, wright, wmid, wtot, wrest = 0
double trial = 0
wtot = np.sum(weights_cp)
while True:
wleft = 0
wmid = 0
wright = 0
for i in range(n):
sorted_a[i] = a_cp[i]
sorted_a = np.partition(sorted_a, kth=n//2)
trial = sorted_a[n//2]
for i in range(n):
if a_cp[i] < trial:
wleft = wleft + weights_cp[i]
elif a_cp[i] > trial:
wright = wright + weights_cp[i]
else:
wmid = wmid + weights_cp[i]
kcand = 0
if 2 * (wrest + wleft) > wtot:
for i in range(n):
if a_cp[i] < trial:
a_cand[kcand] = a_cp[i]
weights_cand[kcand] = weights_cp[i]
kcand = kcand + 1
elif 2 * (wrest + wleft + wmid) <= wtot:
for i in range(n):
if a_cp[i] > trial:
a_cand[kcand] = a_cp[i]
weights_cand[kcand] = weights_cp[i]
kcand = kcand + 1
wrest = wrest + wleft + wmid
else:
break
n = kcand
for i in range(n):
a_cp[i] = a_cand[i]
weights_cp[i] = weights_cand[i]
return trial


def _qn(double[:] a, double c):
"""
Computes the Qn robust estimator of scale, a more efficient alternative
to the MAD. The implementation follows the algorithm described in Croux
and Rousseeuw (1992).
Parameters
----------
a : array_like
Input array.
c : float, optional
The normalization constant, used to get consistent estimates of the
standard deviation at the normal distribution. Defined as
1/(np.sqrt(2) * scipy.stats.norm.ppf(5/8)), which is 2.219144.
Returns
-------
The Qn robust estimator of scale
"""
cdef:
int n = a.shape[0]
int h = n/2 + 1
int k = h * (h - 1) / 2
int n_left = n * (n + 1) / 2
int n_right = n * n
int k_new = k + n_left
Py_ssize_t i, j, jh, l = 0
int sump, sumq = 0
double trial, output = 0
double[::1] a_sorted = np.sort(a)
int[::1] left = np.array([n - i + 1 for i in range(0, n)], dtype=np.intc)
int[::1] right = np.array([n if i <= h else n - (i - h) for i in range(0, n)], dtype=np.intc)
int[::1] weights = np.zeros((n,), dtype=np.intc)
double[::1] work = np.zeros((n,), dtype=np.double)
int[::1] p = np.zeros((n,), dtype=np.intc)
int[::1] q = np.zeros((n,), dtype=np.intc)
while n_right - n_left > n:
j = 0
for i in range(1, n):
if left[i] <= right[i]:
weights[j] = right[i] - left[i] + 1
jh = left[i] + weights[j] // 2
work[j] = a_sorted[i] - a_sorted[n - jh]
j = j + 1
trial = _high_weighted_median(work[:j], weights[:j])
j = 0
for i in range(n - 1, -1, -1):
while j < n and (a_sorted[i] - a_sorted[n - j - 1]) < trial:
j = j + 1
p[i] = j
j = n + 1
for i in range(n):
while (a_sorted[i] - a_sorted[n - j + 1]) > trial:
j = j - 1
q[i] = j
sump = np.sum(p)
sumq = np.sum(q) - n
if k_new <= sump:
right = np.copy(p)
n_right = sump
elif k_new > sumq:
left = np.copy(q)
n_left = sumq
else:
output = c * trial
return output
j = 0
for i in range(1, n):
for l in range(left[i], right[i] + 1):
work[j] = a_sorted[i] - a_sorted[n - l]
j = j + 1
k_new = k_new - (n_left + 1)
output = c * np.sort(work[:j])[k_new]
return output
77 changes: 77 additions & 0 deletions statsmodels/robust/scale.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,16 @@
R Venables, B Ripley. 'Modern Applied Statistics in S'
Springer, New York, 2002.
C Croux, PJ Rousseeuw, 'Time-efficient algorithms for two highly robust
estimators of scale' Computational statistics. Physica, Heidelberg, 1992.
"""
import numpy as np
from scipy.stats import norm as Gaussian
from . import norms
from statsmodels.tools import tools
from statsmodels.tools.validation import array_like, float_like
from ._qn import _qn


def mad(a, c=Gaussian.ppf(3/4.), axis=0, center=np.median):
Expand Down Expand Up @@ -81,6 +85,79 @@ def iqr(a, c=Gaussian.ppf(3/4) - Gaussian.ppf(1/4), axis=0):
return np.squeeze(np.diff(quantiles, axis=0) / c)


def qn_scale(a, c=1 / (np.sqrt(2) * Gaussian.ppf(5 / 8)), axis=0):
"""
Computes the Qn robust estimator of scale
The Qn scale estimator is a more efficient alternative to the MAD.
The Qn scale estimator of an array a of length n is defined as
c * {abs(a[i] - a[j]): i<j}_(k), for k equal to [n/2] + 1 choose 2. Thus,
the Qn estimator is the k-th order statistic of the absolute differences
of the array. The optional constant is used to normalize the estimate
as explained below. The implementation follows the algorithm described
in Croux and Rousseeuw (1992).
Parameters
----------
a : array_like
Input array.
c : float, optional
The normalization constant. The default value is used to get consistent
estimates of the standard deviation at the normal distribution.
axis : int, optional
The default is 0.
Returns
-------
{float, ndarray}
The Qn robust estimator of scale
"""
a = array_like(a, 'a', ndim=None, dtype=np.float64, contiguous=True,
order='C')
c = float_like(c, 'c')
if a.ndim == 0:
raise ValueError("a should have at least one dimension")
elif a.size == 0:
return np.nan
else:
out = np.apply_along_axis(_qn, axis=axis, arr=a, c=c)
if out.ndim == 0:
return float(out)
return out


def _qn_naive(a, c=1 / (np.sqrt(2) * Gaussian.ppf(5 / 8))):
"""
A naive implementation of the Qn robust estimator of scale, used solely
to test the faster, more involved one
Parameters
----------
a : array_like
Input array.
c : float, optional
The normalization constant, used to get consistent estimates of the
standard deviation at the normal distribution. Defined as
1/(np.sqrt(2) * scipy.stats.norm.ppf(5/8)), which is 2.219144.
Returns
-------
The Qn robust estimator of scale
"""
a = np.squeeze(a)
n = a.shape[0]
if a.size == 0:
return np.nan
else:
h = int(n // 2 + 1)
k = int(h * (h - 1) / 2)
idx = np.triu_indices(n, k=1)
diffs = np.abs(a[idx[0]] - a[idx[1]])
output = np.partition(diffs, kth=k - 1)[k - 1]
output = c * output
return output


class Huber(object):
"""
Huber's proposal 2 for estimating location and scale jointly.
Expand Down
67 changes: 67 additions & 0 deletions statsmodels/robust/tests/test_scale.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
# Example from Section 5.5, Venables & Ripley (2002)

import statsmodels.robust.scale as scale
import statsmodels.api as sm

DECIMAL = 4
# TODO: Can replicate these tests using stackloss data and R if this
Expand All @@ -35,6 +36,9 @@ def test_mad(self):
def test_iqr(self):
assert_almost_equal(scale.iqr(self.chem), 0.68570, DECIMAL)

def test_qn(self):
assert_almost_equal(scale.qn_scale(self.chem), 0.73231, DECIMAL)

def test_huber_scale(self):
assert_almost_equal(scale.huber(self.chem)[0], 3.20549, DECIMAL)

Expand Down Expand Up @@ -147,6 +151,69 @@ def test_axisneg1(self):
assert_equal(m.shape, (40, 10))


class TestQn(object):
@classmethod
def setup_class(cls):
np.random.seed(54321)
cls.normal = standard_normal(size=40)
cls.range = np.arange(0, 40)
cls.exponential = np.random.exponential(size=40)
cls.stackloss = sm.datasets.stackloss.load_pandas().data
cls.sunspot = sm.datasets.sunspots.load_pandas().data.SUNACTIVITY

def test_qn_naive(self):
assert_almost_equal(scale.qn_scale(self.normal),
scale._qn_naive(self.normal), DECIMAL)
assert_almost_equal(scale.qn_scale(self.range),
scale._qn_naive(self.range), DECIMAL)
assert_almost_equal(scale.qn_scale(self.exponential),
scale._qn_naive(self.exponential), DECIMAL)

def test_qn_robustbase(self):
# from R's robustbase with finite.corr = FALSE
assert_almost_equal(scale.qn_scale(self.range), 13.3148, DECIMAL)
assert_almost_equal(scale.qn_scale(self.stackloss),
np.array([8.87656, 8.87656, 2.21914, 4.43828]),
DECIMAL)
# sunspot.year from datasets in R only goes up to 289
assert_almost_equal(scale.qn_scale(self.sunspot[0:289]), 33.50901,
DECIMAL)

def test_qn_empty(self):
empty = np.empty(0)
assert np.isnan(scale.qn_scale(empty))
empty = np.empty((10, 100, 0))
assert_equal(scale.qn_scale(empty, axis=1), np.empty((10, 0)))
empty = np.empty((100, 100, 0, 0))
assert_equal(scale.qn_scale(empty, axis=-1), np.empty((100, 100, 0)))
empty = np.empty(shape=())
with pytest.raises(ValueError):
scale.iqr(empty)


class TestQnAxes(object):
@classmethod
def setup_class(cls):
np.random.seed(54321)
cls.X = standard_normal((40, 10, 30))

def test_axis0(self):
m = scale.qn_scale(self.X, axis=0)
assert_equal(m.shape, (10, 30))

def test_axis1(self):
m = scale.qn_scale(self.X, axis=1)
assert_equal(m.shape, (40, 30))

def test_axis2(self):
m = scale.qn_scale(self.X, axis=2)
assert_equal(m.shape, (40, 10))

def test_axisneg1(self):
m = scale.qn_scale(self.X, axis=-1)
assert_equal(m.shape, (40, 10))


class TestHuber(object):
@classmethod
def setup_class(cls):
Expand Down

0 comments on commit a1236f3

Please sign in to comment.