REF/BUG: fix CovDetMCD

statsmodels · May 12, 2024 · 1adf4c5 · 1adf4c5
1 parent 133de40
commit 1adf4c5
Show file tree

Hide file tree

Showing 2 changed files with 46 additions and 12 deletions.
diff --git a/statsmodels/robust/covariance.py b/statsmodels/robust/covariance.py
@@ -1298,6 +1298,7 @@ def _get_detcov_startidx(z, h, options_start=None, methods_cov="all"):
         nobs = z.shape[0]
         idx_sel = np.argpartition(np.abs(z), h)[:h]
         idx_all = [(idx_sel, "abs-resid")]
+        # next uses symmetric equal-tail trimming
         idx_sorted = np.argsort(z)
         h_tail = (nobs - h) // 2
         idx_all.append((idx_sorted[h_tail : h_tail + h], "trimmed-tail"))
@@ -1306,6 +1307,7 @@ def _get_detcov_startidx(z, h, options_start=None, methods_cov="all"):
     # continue if more than 1 random variable
     cov_all = _cov_starting(z, standardize=False, quantile=0.5)
 
+    # orthogonalization step
     idx_all = []
     for c in cov_all:
         if not hasattr(c, "method"):
@@ -1324,10 +1326,14 @@ class CovM:
     """
 
     def __init__(self, data, norm_mean=None, norm_scatter=None,
-                 scale_bias=None):  #, method="S"):
+                 scale_bias=None, method="S"):
         # todo: method defines how norm_mean and norm_scatter are linked
         #       currently I try for S-estimator
 
+        if method.lower() not in ["s"]:
+            msg = f"method {method} option not recognize or implemented"
+            raise ValueError(msg)
+
         self.data = np.asarray(data)
         self.k_vars = k_vars = self.data.shape[1]
 
@@ -1453,11 +1459,11 @@ class CovDetMCD:
     optimization change.
     """
 
-    def __init__(self):
+    def __init__(self, data):
         # no options yet, methods were written as functions
-        pass
+        self.data = np.asarray(data)
 
-    def cstep_mcd(self, x, mean, cov, h, maxiter=2, tol=1e-8):
+    def _cstep(self, x, mean, cov, h, maxiter=2, tol=1e-8):
         """C-step for mcd iteration
 
         x is data, perc is percentile h / nobs, don't need perc when we
@@ -1468,7 +1474,7 @@ def cstep_mcd(self, x, mean, cov, h, maxiter=2, tol=1e-8):
         converged = False
 
         for i in range(maxiter):
-            d = mahalanobis(x, mean, cov)
+            d = mahalanobis(x - mean, cov)
             idx_sel = np.argpartition(d, h)[:h]
             x_sel = x[idx_sel]
             mean = x_sel.mean(0)
@@ -1479,9 +1485,11 @@ def cstep_mcd(self, x, mean, cov, h, maxiter=2, tol=1e-8):
                 converged = True
                 break
 
+            cov = cov_new
+
         return mean, cov
 
-    def _mcd_one(self, x, idx, h, maxiter=2, mean=None, cov=None):
+    def _fit_one(self, x, idx, h, maxiter=2, mean=None, cov=None):
         """Compute mcd for one starting set of observations.
 
         Parameters
@@ -1509,19 +1517,22 @@ def _mcd_one(self, x, idx, h, maxiter=2, mean=None, cov=None):
         This does not do any preprocessing of the data and returns the empirical mean
         and covariance of evaluation set of the data ``x``.
         """
-        x_sel = x[idx]
+        if idx is not None:
+            x_sel = x[idx]
+        else:
+            x_sel = x
         if mean is None:
             mean = x_sel.mean(0)
         if cov is None:
             cov = np.cov(x_sel.T, ddof=1)
 
         # updated with c-step
-        mean, cov = self.cstep_mcd(x, mean, cov, h, maxiter=maxiter)
+        mean, cov = self._cstep(x, mean, cov, h, maxiter=maxiter)
         det = np.linalg.det(cov)
 
         return mean, cov, det
 
-    def mcd_det(self, x, h, *, h_start=None, mean_func=None, scale_func=None,
+    def fit(self, h, *, h_start=None, mean_func=None, scale_func=None,
                 maxiter=100, options_start=None, reweight=True):
         """
         Compute minimum covariance determinant estimate of mean and covariance.
@@ -1546,7 +1557,7 @@ def mcd_det(self, x, h, *, h_start=None, mean_func=None, scale_func=None,
         Holder instance with results
         """
 
-        x = np.asarray(x)
+        x = self.data
         nobs, k_vars = x.shape
 
         if h is None:
@@ -1573,7 +1584,7 @@ def mcd_det(self, x, h, *, h_start=None, mean_func=None, scale_func=None,
         res = {}
         for ii, ini in enumerate(starts):
             idx_sel, method = ini
-            mean, cov, det = self._mcd_one(x, idx_sel, h, maxiter=maxiter)
+            mean, cov, det = self._fit_one(x, idx_sel, h, maxiter=maxiter)
             res[ii] = Holder(
                 mean=mean,
                 cov=cov * fac_trunc,

diff --git a/statsmodels/robust/tests/test_covariance.py b/statsmodels/robust/tests/test_covariance.py
@@ -179,9 +179,11 @@ def test_tyler():
 
 
 def test_cov_ms():
-    # use CovM as local Cov@
+    # use CovM as local CovS
 
     # > CovSest(x)  using package rrcov
+    # same result with > CovSest(x, method="sdet")
+    # but scale difers with method="biweight"
     mean_r = np.array([1.53420879676, 1.82865741024, 1.65565146981])
     cov_r = np.array([
         [1.8090846049573, 0.0479283121828, 0.2446369025717],
@@ -210,6 +212,27 @@ def test_cov_ms():
     assert_allclose(res.scale, scale_r, rtol=1e-5)
 
 
+def test_covdetmcd():
+
+    # results from rrcov
+    # > cdet = CovMcd(x = hbk, raw.only = TRUE, nsamp = "deterministic",
+    #                 use.correction=FALSE)
+    cov_dmcd_r = np.array("""
+    2.2059619213639   0.0223939863695   0.7898958050933   0.4060613360808
+    0.0223939863695   1.1384166802155   0.4315534571891  -0.2344041030201
+    0.7898958050933   0.4315534571891   1.8930117467493  -0.3292893001459
+    0.4060613360808  -0.2344041030201  -0.3292893001459   0.6179686100845
+    """.split(),
+    float).reshape(4,4)
+
+    mean_dmcd_r = np.array([1.7725, 2.2050, 1.5375, -0.0575])
+
+    mod = robcov.CovDetMCD(dta_hbk)
+    # with default start, default start could get wrong local optimum
+    res = mod.fit(40)
+    assert_allclose(res.mean, mean_dmcd_r, rtol=1e-5)
+    assert_allclose(res.cov, cov_dmcd_r, rtol=1e-5)
+
 def test_robcov_SMOKE():
     # currently only smoke test or very loose comparisons to dgp
     nobs, k_vars = 100, 3