BUG: Correct axis=None case
Correct handling of axis=None in mad

closes #7027
Kevin Sheppard committed Sep 6, 2020
1 parent 3d6a493 commit 4ffb563
Showing 2 changed files with 140 additions and 62 deletions.
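
In brief: before this commit, mad() passed axis=None straight to
np.apply_over_axes, which does not accept None, so a callable center made
mad raise instead of reducing over all elements. A minimal sketch of the
corrected behavior, using the array from the new test below (an
illustration, not part of the commit):

import numpy as np
from statsmodels.robust.scale import mad

a = np.array([[0, 1, 2], [2, 3, 2]])
# With the fix, a callable center is applied to the raveled array when
# axis is None, so these two calls agree:
assert np.allclose(mad(a, axis=None), mad(a.ravel(), axis=0))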
101 changes: 63 additions & 38 deletions statsmodels/robust/scale.py
@@ -19,8 +19,7 @@
from ._qn import _qn


def mad(a, c=Gaussian.ppf(3/4.), axis=0, center=np.median):
# c \approx .6745
def mad(a, c=Gaussian.ppf(3 / 4.0), axis=0, center=np.median):
"""
The Median Absolute Deviation along given axis of an array
@@ -30,7 +29,7 @@ def mad(a, c=Gaussian.ppf(3/4.), axis=0, center=np.median):
Input array.
c : float, optional
The normalization constant. Defined as scipy.stats.norm.ppf(3/4.),
which is approximately .6745.
which is approximately 0.6745.
axis : int, optional
The default is 0. Can also be None.
center : callable or float
@@ -43,19 +42,22 @@ def mad(a, c=Gaussian.ppf(3/4.), axis=0, center=np.median):
mad : float
`mad` = median(abs(`a` - center))/`c`
"""
a = array_like(a, 'a', ndim=None)
c = float_like(c, 'c')
a = array_like(a, "a", ndim=None)
c = float_like(c, "c")
if not a.size:
center = 0.0
center_val = 0.0
elif callable(center):
center = np.apply_over_axes(center, a, axis)
if axis is not None:
center_val = np.apply_over_axes(center, a, axis)
else:
center_val = center(a.ravel())
else:
center = float_like(center, "center")
center_val = float_like(center, "center")

return np.median((np.abs(a-center)) / c, axis=axis)
return np.median((np.abs(a - center_val)) / c, axis=axis)
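
A usage sketch of the documented formula mad = median(abs(a - center))/c on
illustrative N(0, 1) draws; with the default c, mad is a consistent
estimator of the standard deviation, so the result is close to 1 here:

import numpy as np
from scipy.stats import norm as Gaussian
from statsmodels.robust.scale import mad

x = np.random.RandomState(0).standard_normal(1000)  # illustrative data
by_hand = np.median(np.abs(x - np.median(x))) / Gaussian.ppf(0.75)
np.testing.assert_allclose(mad(x), by_hand)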


def iqr(a, c=Gaussian.ppf(3/4) - Gaussian.ppf(1/4), axis=0):
def iqr(a, c=Gaussian.ppf(3 / 4) - Gaussian.ppf(1 / 4), axis=0):
"""
The normalized interquartile range along given axis of an array
@@ -75,8 +77,8 @@ def iqr(a, c=Gaussian.ppf(3/4) - Gaussian.ppf(1/4), axis=0):
-------
The normalized interquartile range
"""
a = array_like(a, 'a', ndim=None)
c = float_like(c, 'c')
a = array_like(a, "a", ndim=None)
c = float_like(c, "c")

if a.ndim == 0:
raise ValueError("a should have at least one dimension")
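
A usage sketch for iqr on illustrative data (not part of the commit): with
the default c = Gaussian.ppf(3/4) - Gaussian.ppf(1/4), roughly 1.349, the
normalized interquartile range of standard-normal draws is close to 1:

import numpy as np
from statsmodels.robust.scale import iqr

x = np.random.RandomState(7).standard_normal(2000)  # illustrative data
print(iqr(x))  # approximately 1.0 for N(0, 1) samples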
@@ -114,9 +116,10 @@ def qn_scale(a, c=1 / (np.sqrt(2) * Gaussian.ppf(5 / 8)), axis=0):
{float, ndarray}
The Qn robust estimator of scale
"""
a = array_like(a, 'a', ndim=None, dtype=np.float64, contiguous=True,
order='C')
c = float_like(c, 'c')
a = array_like(
a, "a", ndim=None, dtype=np.float64, contiguous=True, order="C"
)
c = float_like(c, "c")
if a.ndim == 0:
raise ValueError("a should have at least one dimension")
elif a.size == 0:
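
A usage sketch for qn_scale on illustrative data (not part of the commit):
the default c = 1 / (sqrt(2) * Gaussian.ppf(5/8)) makes the Qn estimator
consistent for the standard deviation of Gaussian data:

import numpy as np
from statsmodels.robust.scale import qn_scale

x = np.random.RandomState(1234).standard_normal(1000)  # illustrative data
print(qn_scale(x))  # close to 1.0 for N(0, 1) samples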
@@ -197,7 +200,7 @@ def __init__(self, c=1.5, tol=1.0e-08, maxiter=30, norm=None):
self.tol = tol
self.norm = norm
tmp = 2 * Gaussian.cdf(c) - 1
self.gamma = tmp + c**2 * (1 - tmp) - 2 * c * Gaussian.pdf(c)
self.gamma = tmp + c ** 2 * (1 - tmp) - 2 * c * Gaussian.pdf(c)

def __call__(self, a, mu=None, initscale=None, axis=0):
"""
@@ -262,35 +265,44 @@ def _estimate_both(self, a, scale, mu, axis, est_mu, n):
# This is a one-step fixed-point estimator
# if self.norm == norms.HuberT
# It should be faster than using norms.HuberT
nmu = np.clip(a, mu-self.c*scale,
mu+self.c*scale).sum(axis) / a.shape[axis]
nmu = (
np.clip(
a, mu - self.c * scale, mu + self.c * scale
).sum(axis)
/ a.shape[axis]
)
else:
nmu = norms.estimate_location(a, scale, self.norm, axis,
mu, self.maxiter, self.tol)
nmu = norms.estimate_location(
a, scale, self.norm, axis, mu, self.maxiter, self.tol
)
else:
# Effectively, do nothing
nmu = mu.squeeze()
nmu = tools.unsqueeze(nmu, axis, a.shape)

subset = np.less_equal(np.abs((a - mu)/scale), self.c)
subset = np.less_equal(np.abs((a - mu) / scale), self.c)
card = subset.sum(axis)

scale_num = np.sum(subset * (a - nmu)**2, axis)
scale_denom = (n * self.gamma - (a.shape[axis] - card) * self.c**2)
scale_num = np.sum(subset * (a - nmu) ** 2, axis)
scale_denom = n * self.gamma - (a.shape[axis] - card) * self.c ** 2
nscale = np.sqrt(scale_num / scale_denom)
nscale = tools.unsqueeze(nscale, axis, a.shape)

test1 = np.alltrue(np.less_equal(np.abs(scale - nscale),
nscale * self.tol))
test1 = np.alltrue(
np.less_equal(np.abs(scale - nscale), nscale * self.tol)
)
test2 = np.alltrue(
np.less_equal(np.abs(mu - nmu), nscale * self.tol))
np.less_equal(np.abs(mu - nmu), nscale * self.tol)
)
if not (test1 and test2):
mu = nmu
scale = nscale
else:
return nmu.squeeze(), nscale.squeeze()
raise ValueError('joint estimation of location and scale failed '
'to converge in %d iterations' % self.maxiter)
raise ValueError(
"joint estimation of location and scale failed "
"to converge in %d iterations" % self.maxiter
)


huber = Huber()
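
A usage sketch for the module-level instance created above, on illustrative
data; calling it returns joint robust location and scale estimates, as the
tests below exercise via scale.huber(self.chem):

import numpy as np
import statsmodels.robust.scale as scale

x = np.random.RandomState(0).standard_normal(500)  # illustrative data
loc, scl = scale.huber(x)  # joint estimates; near 0 and 1 for this data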
@@ -331,32 +343,45 @@ class HuberScale(object):
and the Huber constant h = (n-p)/n * (d**2 + (1-d**2) *
scipy.stats.norm.cdf(d) - 0.5 - d*sqrt(2*pi)*exp(-0.5*d**2))
"""

def __init__(self, d=2.5, tol=1e-08, maxiter=30):
self.d = d
self.tol = tol
self.maxiter = maxiter

def __call__(self, df_resid, nobs, resid):
h = df_resid / nobs * (
self.d ** 2
+ (1 - self.d ** 2) * Gaussian.cdf(self.d)
- .5 - self.d / (np.sqrt(2 * np.pi)) * np.exp(-.5 * self.d ** 2)
h = (
df_resid
/ nobs
* (
self.d ** 2
+ (1 - self.d ** 2) * Gaussian.cdf(self.d)
- 0.5
- self.d / (np.sqrt(2 * np.pi)) * np.exp(-0.5 * self.d ** 2)
)
)
s = mad(resid)

def subset(x):
return np.less(np.abs(resid / x), self.d)

def chi(s):
return subset(s) * (resid / s) ** 2 / 2 + (1 - subset(s)) * \
(self.d ** 2 / 2)
return subset(s) * (resid / s) ** 2 / 2 + (1 - subset(s)) * (
self.d ** 2 / 2
)

scalehist = [np.inf, s]
niter = 1
while (np.abs(scalehist[niter - 1] - scalehist[niter]) > self.tol
and niter < self.maxiter):
nscale = np.sqrt(1 / (nobs * h) * np.sum(chi(scalehist[-1])) *
scalehist[-1] ** 2)
while (
np.abs(scalehist[niter - 1] - scalehist[niter]) > self.tol
and niter < self.maxiter
):
nscale = np.sqrt(
1
/ (nobs * h)
* np.sum(chi(scalehist[-1]))
* scalehist[-1] ** 2
)
scalehist.append(nscale)
niter += 1
# TODO: raise on convergence failure?
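
A usage sketch for HuberScale on illustrative residuals; this assumes, as
in released statsmodels, that the iteration above returns the last entry of
scalehist:

import numpy as np
from statsmodels.robust.scale import HuberScale

resid = np.random.RandomState(42).standard_normal(100)  # illustrative
hs = HuberScale()
print(hs(df_resid=98, nobs=100, resid=resid))  # robust residual scale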
101 changes: 77 additions & 24 deletions statsmodels/robust/tests/test_scale.py
@@ -7,23 +7,50 @@
from numpy.testing import assert_almost_equal, assert_equal
from scipy.stats import norm as Gaussian
import pytest

# Example from Section 5.5, Venables & Ripley (2002)

import statsmodels.robust.scale as scale
import statsmodels.api as sm
import numpy as np
from statsmodels.robust.scale import mad

DECIMAL = 4
# TODO: Can replicate these tests using stackloss data and R if this
# data is a problem


class TestChem(object):
@classmethod
def setup_class(cls):
cls.chem = np.array([
2.20, 2.20, 2.4, 2.4, 2.5, 2.7, 2.8, 2.9, 3.03,
3.03, 3.10, 3.37, 3.4, 3.4, 3.4, 3.5, 3.6, 3.7, 3.7, 3.7, 3.7,
3.77, 5.28, 28.95])
cls.chem = np.array(
[
2.20,
2.20,
2.4,
2.4,
2.5,
2.7,
2.8,
2.9,
3.03,
3.03,
3.10,
3.37,
3.4,
3.4,
3.4,
3.5,
3.6,
3.7,
3.7,
3.7,
3.7,
3.77,
5.28,
28.95,
]
)

def test_mean(self):
assert_almost_equal(np.mean(self.chem), 4.2804, DECIMAL)
@@ -50,10 +77,12 @@ def test_huber_huberT(self):
n = scale.norms.HuberT()
n.t = 1.5
h = scale.Huber(norm=n)
assert_almost_equal(scale.huber(self.chem)[0], h(self.chem)[0],
DECIMAL)
assert_almost_equal(scale.huber(self.chem)[1], h(self.chem)[1],
DECIMAL)
assert_almost_equal(
scale.huber(self.chem)[0], h(self.chem)[0], DECIMAL
)
assert_almost_equal(
scale.huber(self.chem)[1], h(self.chem)[1], DECIMAL
)

def test_huber_Hampel(self):
hh = scale.Huber(norm=scale.norms.Hampel())
@@ -84,10 +113,11 @@ def test_mad_center(self):
assert_equal(n.shape, (10,))
with pytest.raises(TypeError):
scale.mad(self.X, center=None)
assert_almost_equal(scale.mad(self.X, center=1),
np.median(np.abs(self.X - 1),
axis=0)/Gaussian.ppf(3/4.),
DECIMAL)
assert_almost_equal(
scale.mad(self.X, center=1),
np.median(np.abs(self.X - 1), axis=0) / Gaussian.ppf(3 / 4.0),
DECIMAL,
)


class TestMadAxes(object):
@@ -169,22 +199,30 @@ def setup_class(cls):
cls.sunspot = sm.datasets.sunspots.load_pandas().data.SUNACTIVITY

def test_qn_naive(self):
assert_almost_equal(scale.qn_scale(self.normal),
scale._qn_naive(self.normal), DECIMAL)
assert_almost_equal(scale.qn_scale(self.range),
scale._qn_naive(self.range), DECIMAL)
assert_almost_equal(scale.qn_scale(self.exponential),
scale._qn_naive(self.exponential), DECIMAL)
assert_almost_equal(
scale.qn_scale(self.normal), scale._qn_naive(self.normal), DECIMAL
)
assert_almost_equal(
scale.qn_scale(self.range), scale._qn_naive(self.range), DECIMAL
)
assert_almost_equal(
scale.qn_scale(self.exponential),
scale._qn_naive(self.exponential),
DECIMAL,
)

def test_qn_robustbase(self):
# from R's robustbase with finite.corr = FALSE
assert_almost_equal(scale.qn_scale(self.range), 13.3148, DECIMAL)
assert_almost_equal(scale.qn_scale(self.stackloss),
np.array([8.87656, 8.87656, 2.21914, 4.43828]),
DECIMAL)
assert_almost_equal(
scale.qn_scale(self.stackloss),
np.array([8.87656, 8.87656, 2.21914, 4.43828]),
DECIMAL,
)
# sunspot.year from datasets in R only goes up to 289
assert_almost_equal(scale.qn_scale(self.sunspot[0:289]), 33.50901,
DECIMAL)
assert_almost_equal(
scale.qn_scale(self.sunspot[0:289]), 33.50901, DECIMAL
)

def test_qn_empty(self):
empty = np.empty(0)
@@ -255,3 +293,18 @@ def test_axis2(self):
def test_axisneg1(self):
m, s = self.h(self.X, axis=-1)
assert_equal(m.shape, (40, 10))


def test_mad_axis_none():
# GH 7027
a = np.array([[0, 1, 2], [2, 3, 2]])

def m(x):
return np.median(x)

direct = mad(a=a, axis=None)
custom = mad(a=a, axis=None, center=m)
axis0 = mad(a=a.ravel(), axis=0)

np.testing.assert_allclose(direct, custom)
np.testing.assert_allclose(direct, axis0)
