BUG: Correct axis=None case
Correct handling of axis=None in mad

closes #7027
Kevin Sheppard committed Sep 6, 2020
1 parent 3d6a493 commit 4ffb563
Showing 2 changed files with 140 additions and 62 deletions.
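
In brief: before this commit, mad() passed axis=None straight to
np.apply_over_axes, which does not accept None, so a callable center made
mad raise instead of reducing over all elements. A minimal sketch of the
corrected behavior, using the array from the new test below (an
illustration, not part of the commit):

import numpy as np
from statsmodels.robust.scale import mad

a = np.array([[0, 1, 2], [2, 3, 2]])
# With the fix, a callable center is applied to the raveled array when
# axis is None, so these two calls agree:
assert np.allclose(mad(a, axis=None), mad(a.ravel(), axis=0))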
101 changes: 63 additions & 38 deletions statsmodels/robust/scale.py
@@ -19,8 +19,7 @@
from ._qn import _qn


def mad(a, c=Gaussian.ppf(3/4.), axis=0, center=np.median):
# c \approx .6745
def mad(a, c=Gaussian.ppf(3 / 4.0), axis=0, center=np.median):
"""
The Median Absolute Deviation along given axis of an array
@@ -30,7 +29,7 @@ def mad(a, c=Gaussian.ppf(3/4.), axis=0, center=np.median):
Input array.
c : float, optional
The normalization constant. Defined as scipy.stats.norm.ppf(3/4.),
which is approximately .6745.
which is approximately 0.6745.
axis : int, optional
The default is 0. Can also be None.
center : callable or float
@@ -43,19 +42,22 @@ def mad(a, c=Gaussian.ppf(3/4.), axis=0, center=np.median):
mad : float
`mad` = median(abs(`a` - center))/`c`
"""
a = array_like(a, 'a', ndim=None)
c = float_like(c, 'c')
a = array_like(a, "a", ndim=None)
c = float_like(c, "c")
if not a.size:
center = 0.0
center_val = 0.0
elif callable(center):
center = np.apply_over_axes(center, a, axis)
if axis is not None:
center_val = np.apply_over_axes(center, a, axis)
else:
center_val = center(a.ravel())
else:
center = float_like(center, "center")
center_val = float_like(center, "center")

return np.median((np.abs(a-center)) / c, axis=axis)
return np.median((np.abs(a - center_val)) / c, axis=axis)
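
A usage sketch of the documented formula mad = median(abs(a - center))/c on
illustrative N(0, 1) draws; with the default c, mad is a consistent
estimator of the standard deviation, so the result is close to 1 here:

import numpy as np
from scipy.stats import norm as Gaussian
from statsmodels.robust.scale import mad

x = np.random.RandomState(0).standard_normal(1000)  # illustrative data
by_hand = np.median(np.abs(x - np.median(x))) / Gaussian.ppf(0.75)
np.testing.assert_allclose(mad(x), by_hand)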


def iqr(a, c=Gaussian.ppf(3/4) - Gaussian.ppf(1/4), axis=0):
def iqr(a, c=Gaussian.ppf(3 / 4) - Gaussian.ppf(1 / 4), axis=0):
"""
The normalized interquartile range along given axis of an array
@@ -75,8 +77,8 @@ def iqr(a, c=Gaussian.ppf(3/4) - Gaussian.ppf(1/4), axis=0):
-------
The normalized interquartile range
"""
a = array_like(a, 'a', ndim=None)
c = float_like(c, 'c')
a = array_like(a, "a", ndim=None)
c = float_like(c, "c")

if a.ndim == 0:
raise ValueError("a should have at least one dimension")
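
A usage sketch for iqr on illustrative data (not part of the commit): with
the default c = Gaussian.ppf(3/4) - Gaussian.ppf(1/4), roughly 1.349, the
normalized interquartile range of standard-normal draws is close to 1:

import numpy as np
from statsmodels.robust.scale import iqr

x = np.random.RandomState(7).standard_normal(2000)  # illustrative data
print(iqr(x))  # approximately 1.0 for N(0, 1) samples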
@@ -114,9 +116,10 @@ def qn_scale(a, c=1 / (np.sqrt(2) * Gaussian.ppf(5 / 8)), axis=0):
{float, ndarray}
The Qn robust estimator of scale
"""
a = array_like(a, 'a', ndim=None, dtype=np.float64, contiguous=True,
order='C')
c = float_like(c, 'c')
a = array_like(
a, "a", ndim=None, dtype=np.float64, contiguous=True, order="C"
)
c = float_like(c, "c")
if a.ndim == 0:
raise ValueError("a should have at least one dimension")
elif a.size == 0:
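
A usage sketch for qn_scale on illustrative data (not part of the commit):
the default c = 1 / (sqrt(2) * Gaussian.ppf(5/8)) makes the Qn estimator
consistent for the standard deviation of Gaussian data:

import numpy as np
from statsmodels.robust.scale import qn_scale

x = np.random.RandomState(1234).standard_normal(1000)  # illustrative data
print(qn_scale(x))  # close to 1.0 for N(0, 1) samples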
@@ -197,7 +200,7 @@ def __init__(self, c=1.5, tol=1.0e-08, maxiter=30, norm=None):
self.tol = tol
self.norm = norm
tmp = 2 * Gaussian.cdf(c) - 1
self.gamma = tmp + c**2 * (1 - tmp) - 2 * c * Gaussian.pdf(c)
self.gamma = tmp + c ** 2 * (1 - tmp) - 2 * c * Gaussian.pdf(c)

def __call__(self, a, mu=None, initscale=None, axis=0):
"""
@@ -262,35 +265,44 @@ def _estimate_both(self, a, scale, mu, axis, est_mu, n):
# This is a one-step fixed-point estimator
# if self.norm == norms.HuberT
# It should be faster than using norms.HuberT
nmu = np.clip(a, mu-self.c*scale,
mu+self.c*scale).sum(axis) / a.shape[axis]
nmu = (
np.clip(
a, mu - self.c * scale, mu + self.c * scale
).sum(axis)
/ a.shape[axis]
)
else:
nmu = norms.estimate_location(a, scale, self.norm, axis,
mu, self.maxiter, self.tol)
nmu = norms.estimate_location(
a, scale, self.norm, axis, mu, self.maxiter, self.tol
)
else:
# Effectively, do nothing
nmu = mu.squeeze()
nmu = tools.unsqueeze(nmu, axis, a.shape)

subset = np.less_equal(np.abs((a - mu)/scale), self.c)
subset = np.less_equal(np.abs((a - mu) / scale), self.c)
card = subset.sum(axis)

scale_num = np.sum(subset * (a - nmu)**2, axis)
scale_denom = (n * self.gamma - (a.shape[axis] - card) * self.c**2)
scale_num = np.sum(subset * (a - nmu) ** 2, axis)
scale_denom = n * self.gamma - (a.shape[axis] - card) * self.c ** 2
nscale = np.sqrt(scale_num / scale_denom)
nscale = tools.unsqueeze(nscale, axis, a.shape)

test1 = np.alltrue(np.less_equal(np.abs(scale - nscale),
nscale * self.tol))
test1 = np.alltrue(
np.less_equal(np.abs(scale - nscale), nscale * self.tol)
)
test2 = np.alltrue(
np.less_equal(np.abs(mu - nmu), nscale * self.tol))
np.less_equal(np.abs(mu - nmu), nscale * self.tol)
)
if not (test1 and test2):
mu = nmu
scale = nscale
else:
return nmu.squeeze(), nscale.squeeze()
raise ValueError('joint estimation of location and scale failed '
'to converge in %d iterations' % self.maxiter)
raise ValueError(
"joint estimation of location and scale failed "
"to converge in %d iterations" % self.maxiter
)


huber = Huber()
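
A usage sketch for the module-level instance created above, on illustrative
data; calling it returns joint robust location and scale estimates, as the
tests below exercise via scale.huber(self.chem):

import numpy as np
import statsmodels.robust.scale as scale

x = np.random.RandomState(0).standard_normal(500)  # illustrative data
loc, scl = scale.huber(x)  # joint estimates; near 0 and 1 for this data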
@@ -331,32 +343,45 @@ class HuberScale(object):
and the Huber constant h = (n-p)/n * (d**2 + (1-d**2) *
scipy.stats.norm.cdf(d) - 0.5 - d*sqrt(2*pi)*exp(-0.5*d**2))
"""

def __init__(self, d=2.5, tol=1e-08, maxiter=30):
self.d = d
self.tol = tol
self.maxiter = maxiter

def __call__(self, df_resid, nobs, resid):
h = df_resid / nobs * (
self.d ** 2
+ (1 - self.d ** 2) * Gaussian.cdf(self.d)
- .5 - self.d / (np.sqrt(2 * np.pi)) * np.exp(-.5 * self.d ** 2)
h = (
df_resid
/ nobs
* (
self.d ** 2
+ (1 - self.d ** 2) * Gaussian.cdf(self.d)
- 0.5
- self.d / (np.sqrt(2 * np.pi)) * np.exp(-0.5 * self.d ** 2)
)
)
s = mad(resid)

def subset(x):
return np.less(np.abs(resid / x), self.d)

def chi(s):
return subset(s) * (resid / s) ** 2 / 2 + (1 - subset(s)) * \
(self.d ** 2 / 2)
return subset(s) * (resid / s) ** 2 / 2 + (1 - subset(s)) * (
self.d ** 2 / 2
)

scalehist = [np.inf, s]
niter = 1
while (np.abs(scalehist[niter - 1] - scalehist[niter]) > self.tol
and niter < self.maxiter):
nscale = np.sqrt(1 / (nobs * h) * np.sum(chi(scalehist[-1])) *
scalehist[-1] ** 2)
while (
np.abs(scalehist[niter - 1] - scalehist[niter]) > self.tol
and niter < self.maxiter
):
nscale = np.sqrt(
1
/ (nobs * h)
* np.sum(chi(scalehist[-1]))
* scalehist[-1] ** 2
)
scalehist.append(nscale)
niter += 1
# TODO: raise on convergence failure?
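
A usage sketch for HuberScale on illustrative residuals; this assumes, as
in released statsmodels, that the iteration above returns the last entry of
scalehist:

import numpy as np
from statsmodels.robust.scale import HuberScale

resid = np.random.RandomState(42).standard_normal(100)  # illustrative
hs = HuberScale()
print(hs(df_resid=98, nobs=100, resid=resid))  # robust residual scale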
101 changes: 77 additions & 24 deletions statsmodels/robust/tests/test_scale.py
@@ -7,23 +7,50 @@
from numpy.testing import assert_almost_equal, assert_equal
from scipy.stats import norm as Gaussian
import pytest

# Example from Section 5.5, Venables & Ripley (2002)

import statsmodels.robust.scale as scale
import statsmodels.api as sm
import numpy as np
from statsmodels.robust.scale import mad

DECIMAL = 4
# TODO: Can replicate these tests using stackloss data and R if this
# data is a problem


class TestChem(object):
@classmethod
def setup_class(cls):
cls.chem = np.array([
2.20, 2.20, 2.4, 2.4, 2.5, 2.7, 2.8, 2.9, 3.03,
3.03, 3.10, 3.37, 3.4, 3.4, 3.4, 3.5, 3.6, 3.7, 3.7, 3.7, 3.7,
3.77, 5.28, 28.95])
cls.chem = np.array(
[
2.20,
2.20,
2.4,
2.4,
2.5,
2.7,
2.8,
2.9,
3.03,
3.03,
3.10,
3.37,
3.4,
3.4,
3.4,
3.5,
3.6,
3.7,
3.7,
3.7,
3.7,
3.77,
5.28,
28.95,
]
)

def test_mean(self):
assert_almost_equal(np.mean(self.chem), 4.2804, DECIMAL)
@@ -50,10 +77,12 @@ def test_huber_huberT(self):
n = scale.norms.HuberT()
n.t = 1.5
h = scale.Huber(norm=n)
assert_almost_equal(scale.huber(self.chem)[0], h(self.chem)[0],
DECIMAL)
assert_almost_equal(scale.huber(self.chem)[1], h(self.chem)[1],
DECIMAL)
assert_almost_equal(
scale.huber(self.chem)[0], h(self.chem)[0], DECIMAL
)
assert_almost_equal(
scale.huber(self.chem)[1], h(self.chem)[1], DECIMAL
)

def test_huber_Hampel(self):
hh = scale.Huber(norm=scale.norms.Hampel())
@@ -84,10 +113,11 @@ def test_mad_center(self):
assert_equal(n.shape, (10,))
with pytest.raises(TypeError):
scale.mad(self.X, center=None)
assert_almost_equal(scale.mad(self.X, center=1),
np.median(np.abs(self.X - 1),
axis=0)/Gaussian.ppf(3/4.),
DECIMAL)
assert_almost_equal(
scale.mad(self.X, center=1),
np.median(np.abs(self.X - 1), axis=0) / Gaussian.ppf(3 / 4.0),
DECIMAL,
)


class TestMadAxes(object):
@@ -169,22 +199,30 @@ def setup_class(cls):
cls.sunspot = sm.datasets.sunspots.load_pandas().data.SUNACTIVITY

def test_qn_naive(self):
assert_almost_equal(scale.qn_scale(self.normal),
scale._qn_naive(self.normal), DECIMAL)
assert_almost_equal(scale.qn_scale(self.range),
scale._qn_naive(self.range), DECIMAL)
assert_almost_equal(scale.qn_scale(self.exponential),
scale._qn_naive(self.exponential), DECIMAL)
assert_almost_equal(
scale.qn_scale(self.normal), scale._qn_naive(self.normal), DECIMAL
)
assert_almost_equal(
scale.qn_scale(self.range), scale._qn_naive(self.range), DECIMAL
)
assert_almost_equal(
scale.qn_scale(self.exponential),
scale._qn_naive(self.exponential),
DECIMAL,
)

def test_qn_robustbase(self):
# from R's robustbase with finite.corr = FALSE
assert_almost_equal(scale.qn_scale(self.range), 13.3148, DECIMAL)
assert_almost_equal(scale.qn_scale(self.stackloss),
np.array([8.87656, 8.87656, 2.21914, 4.43828]),
DECIMAL)
assert_almost_equal(
scale.qn_scale(self.stackloss),
np.array([8.87656, 8.87656, 2.21914, 4.43828]),
DECIMAL,
)
# sunspot.year from datasets in R only goes up to 289
assert_almost_equal(scale.qn_scale(self.sunspot[0:289]), 33.50901,
DECIMAL)
assert_almost_equal(
scale.qn_scale(self.sunspot[0:289]), 33.50901, DECIMAL
)

def test_qn_empty(self):
empty = np.empty(0)
@@ -255,3 +293,18 @@ def test_axis2(self):
def test_axisneg1(self):
m, s = self.h(self.X, axis=-1)
assert_equal(m.shape, (40, 10))


def test_mad_axis_none():
# GH 7027
a = np.array([[0, 1, 2], [2, 3, 2]])

def m(x):
return np.median(x)

direct = mad(a=a, axis=None)
custom = mad(a=a, axis=None, center=m)
axis0 = mad(a=a.ravel(), axis=0)

np.testing.assert_allclose(direct, custom)
np.testing.assert_allclose(direct, axis0)
