diff --git a/.gitignore b/.gitignore
index 0c32bf53261..1bc8670606d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -70,6 +70,7 @@ coverage_html_report/
 
 # VS Code
 .vscode/
+.vs/
 
 # Project specific
 statsmodels/version.py
diff --git a/statsmodels/robust/covariance.py b/statsmodels/robust/covariance.py
index f89e40583e1..51acfb41111 100644
--- a/statsmodels/robust/covariance.py
+++ b/statsmodels/robust/covariance.py
@@ -308,107 +308,6 @@ def _outlier_gy(d, distr=None, k_endog=1, trim_prob=0.975):
 
 # ## GK and OGK ###
 
-def _weight_mean(x, c):
-    """Tukey-biweight, bisquare weights used in tau scale.
-
-    Parameters
-    ----------
-    x : ndarray
-        Data
-    c : float
-        Parameter for bisquare weights
-
-    Returns
-    -------
-    ndarray : weights
-    """
-    x = np.asarray(x)
-    w = (1 - (x / c)**2)**2 * (np.abs(x) <= c)
-    return w
-
-
-def _winsor(x, c):
-    """Winsorized squared data used in tau scale.
-
-    Parameters
-    ----------
-    x : ndarray
-        Data
-    c : float
-        threshold
-
-    Returns
-    -------
-    winsorized squared data, ``np.minimum(x**2, c**2)``
-    """
-    return np.minimum(x**2, c**2)
-
-
-def scale_tau(data, cm=4.5, cs=3, weight_mean=_weight_mean,
-              weight_scale=_winsor, normalize=True, ddof=0):
-    """Tau estimator of univariate scale.
-
-    Experimental, API will change
-
-    Parameters
-    ----------
-    data : array_like, 1-D or 2-D
-        If data is 2d, then the location and scale estimates
-        are calculated for each column
-    cm : float
-        constant used in call to weight_mean
-    cs : float
-        constant used in call to weight_scale
-    weight_mean : callable
-        function to calculate weights for weighted mean
-    weight_scale : callable
-        function to calculate scale, "rho" function
-    normalize : bool
-        rescale the scale estimate so it is consistent when the data is
-        normally distributed. The computation assumes winsorized (truncated)
-        variance.
-
-    Returns
-    -------
-    mean : nd_array
-        robust mean
-    std : nd_array
-        robust estimate of scale (standard deviation)
-
-    Notes
-    -----
-    Uses definition of Maronna and Zamar 2002, with weighted mean and
-    trimmed variance.
-    The normalization has been added to match R robustbase.
-    R robustbase uses by default ddof=0, with option to set it to 2.
-
-    References
-    ----------
-    .. [1] Maronna, Ricardo A, and Ruben H Zamar. “Robust Estimates of Location
-       and Dispersion for High-Dimensional Datasets.” Technometrics 44, no. 4
-       (November 1, 2002): 307–17. https://doi.org/10.1198/004017002188618509.
- """ - - x = np.asarray(data) - nobs = x.shape[0] - - med_x = np.median(x, 0) - xdm = x - med_x - mad_x = np.median(np.abs(xdm), 0) - wm = weight_mean(xdm / mad_x, cm) - mean = (wm * x).sum(0) / wm.sum(0) - var = (mad_x**2 * weight_scale((x - mean) / mad_x, cs).sum(0) / - (nobs - ddof)) - - cf = 1 - if normalize: - c = cs * stats.norm.ppf(0.75) - cf = 2 * ((1 - c**2) * stats.norm.cdf(c) - c * stats.norm.pdf(c) - + c**2) - 1 - # return Holder(loc=mean, scale=np.sqrt(var / cf)) - return mean, np.sqrt(var / cf) - - def mahalanobis(data, cov=None, cov_inv=None, sqrt=False): """Mahalanobis distance squared diff --git a/statsmodels/robust/robust_linear_model.py b/statsmodels/robust/robust_linear_model.py index 0b92c90d6e6..d3c10573130 100644 --- a/statsmodels/robust/robust_linear_model.py +++ b/statsmodels/robust/robust_linear_model.py @@ -191,10 +191,12 @@ def _estimate_scale(self, resid): elif isinstance(self.scale_est, scale.HuberScale): return self.scale_est(self.df_resid, self.nobs, resid) else: - return scale.scale_est(self, resid) ** 2 + # use df correction to match HuberScale + return self.scale_est(resid) * np.sqrt(self.nobs / self.df_resid) def fit(self, maxiter=50, tol=1e-8, scale_est='mad', init=None, cov='H1', - update_scale=True, conv='dev', start_params=None): + update_scale=True, conv='dev', start_params=None, start_scale=None, + ): """ Fits the model using iteratively reweighted least squares. @@ -217,6 +219,7 @@ def fit(self, maxiter=50, tol=1e-8, scale_est='mad', init=None, cov='H1', Specifies method for the initial estimates of the parameters. Default is None, which means that the least squares estimate is used. Currently it is the only available choice. + Deprecated and will be removed. There is no choice here. maxiter : int The maximum number of iterations to try. Default is 50. scale_est : str or HuberScale() @@ -236,6 +239,11 @@ def fit(self, maxiter=50, tol=1e-8, scale_est='mad', init=None, cov='H1', start_params : array_like, optional Initial guess of the solution of the optimizer. If not provided, the initial parameters are computed using OLS. + start_scale : float, optional + Initial scale. If update_scale is False, then the scale will be + fixed at this level for the estimation of the mean parameters. + during iteration. 
+            mean parameters. If not provided, then the initial scale is
+            estimated from the OLS residuals.
 
         Returns
         -------
@@ -264,8 +272,10 @@ def fit(self, maxiter=50, tol=1e-8, scale_est='mad', init=None, cov='H1',
                                check_weights=False)
             wls_results = fake_wls.results(start_params)
 
-        if not init:
+        if not init and not start_scale:
             self.scale = self._estimate_scale(wls_results.resid)
+        elif start_scale:
+            self.scale = start_scale
 
         history = dict(params=[np.inf], scale=[])
         if conv == 'coefs':
diff --git a/statsmodels/robust/scale.py b/statsmodels/robust/scale.py
index e938a83747a..d866d52f152 100644
--- a/statsmodels/robust/scale.py
+++ b/statsmodels/robust/scale.py
@@ -248,11 +248,11 @@ def __call__(self, a, mu=None, initscale=None, axis=0):
         """
         a = np.asarray(a)
         if mu is None:
-            n = a.shape[0] - 1
+            n = a.shape[axis] - 1
             mu = np.median(a, axis=axis)
             est_mu = True
         else:
-            n = a.shape[0]
+            n = a.shape[axis]
             mu = mu
             est_mu = False
 
@@ -298,10 +298,10 @@ def _estimate_both(self, a, scale, mu, axis, est_mu, n):
             nmu = tools.unsqueeze(nmu, axis, a.shape)
 
             subset = np.less_equal(np.abs((a - mu) / scale), self.c)
-            card = subset.sum(axis)
-
-            scale_num = np.sum(subset * (a - nmu) ** 2, axis)
-            scale_denom = n * self.gamma - (a.shape[axis] - card) * self.c ** 2
+            scale_num = np.sum(subset * (a - nmu) ** 2
+                               + (1 - subset) * (scale * self.c)**2, axis)
+            scale_denom = n * self.gamma
+
             nscale = np.sqrt(scale_num / scale_denom)
             nscale = tools.unsqueeze(nscale, axis, a.shape)
 
@@ -408,6 +408,63 @@ def chi(s):
 hubers_scale = HuberScale()
 
 
+class MScale:
+    """M-scale estimation.
+
+    Experimental interface, arguments and options will still change.
+
+    Parameters
+    ----------
+    chi_func : callable
+        The rho or chi function for the moment condition for estimating scale.
+    scale_bias : float
+        Factor in the moment condition to obtain Fisher consistency of the
+        scale estimate at the normal distribution.
+    """
+
+    def __init__(self, chi_func, scale_bias):
+        self.chi_func = chi_func
+        self.scale_bias = scale_bias
+
+    def __call__(self, data, **kwds):
+        return self.fit(data, **kwds)
+
+    def fit(self, data, start_scale='mad', maxiter=100, rtol=1e-6, atol=1e-8):
+        """
+        Estimate M-scale using iteration.
+
+        Parameters
+        ----------
+        data : array_like
+            Data, currently assumed to be 1-dimensional.
+        start_scale : str or float
+            Starting value of scale or method to compute the starting value.
+            Default is 'mad'; no other string options are available.
+        maxiter : int
+            Maximum number of iterations.
+        rtol : float
+            Relative convergence tolerance.
+        atol : float
+            Absolute convergence tolerance.
+
+        Returns
+        -------
+        float : Scale estimate. The estimated variance is scale squared.
+            Todo: switch to Holder instance with more information.
+
+        """
+
+        scale = _scale_iter(
+            data,
+            scale0=start_scale,
+            maxiter=maxiter, rtol=rtol, atol=atol,
+            meef_scale=self.chi_func,
+            scale_bias=self.scale_bias,
+            )
+
+        return scale
+
+
 def scale_trimmed(data, alpha, center='median', axis=0, distr=None,
                   distargs=None):
     """scale estimate based on symmetrically trimmed sample
@@ -524,35 +581,134 @@ def scale_trimmed(data, alpha, center='median', axis=0, distr=None,
     return res
 
 
+def _weight_mean(x, c):
+    """Tukey-biweight, bisquare weights used in tau scale.
+
+    Parameters
+    ----------
+    x : ndarray
+        Data
+    c : float
+        Parameter for bisquare weights
+
+    Returns
+    -------
+    ndarray : weights
+    """
+    x = np.asarray(x)
+    w = (1 - (x / c)**2)**2 * (np.abs(x) <= c)
+    return w
+
+
+def _winsor(x, c):
+    """Winsorized squared data used in tau scale.
+
+    Parameters
+    ----------
+    x : ndarray
+        Data
+    c : float
+        threshold
+
+    Returns
+    -------
+    winsorized squared data, ``np.minimum(x**2, c**2)``
+    """
+    return np.minimum(x**2, c**2)
+
+
+def scale_tau(data, cm=4.5, cs=3, weight_mean=_weight_mean,
+              weight_scale=_winsor, normalize=True, ddof=0):
+    """Tau estimator of univariate scale.
+
+    Experimental, API will change
+
+    Parameters
+    ----------
+    data : array_like, 1-D or 2-D
+        If data is 2d, then the location and scale estimates
+        are calculated for each column
+    cm : float
+        constant used in call to weight_mean
+    cs : float
+        constant used in call to weight_scale
+    weight_mean : callable
+        function to calculate weights for weighted mean
+    weight_scale : callable
+        function to calculate scale, "rho" function
+    normalize : bool
+        rescale the scale estimate so it is consistent when the data is
+        normally distributed. The computation assumes winsorized (truncated)
+        variance.
+
+    Returns
+    -------
+    mean : ndarray
+        robust mean
+    std : ndarray
+        robust estimate of scale (standard deviation)
+
+    Notes
+    -----
+    Uses definition of Maronna and Zamar 2002, with weighted mean and
+    trimmed variance.
+    The normalization has been added to match R robustbase.
+    R robustbase uses by default ddof=0, with option to set it to 2.
+
+    References
+    ----------
+    .. [1] Maronna, Ricardo A, and Ruben H Zamar. “Robust Estimates of Location
+       and Dispersion for High-Dimensional Datasets.” Technometrics 44, no. 4
+       (November 1, 2002): 307–17. https://doi.org/10.1198/004017002188618509.
+    """
+
+    x = np.asarray(data)
+    nobs = x.shape[0]
+
+    med_x = np.median(x, 0)
+    xdm = x - med_x
+    mad_x = np.median(np.abs(xdm), 0)
+    wm = weight_mean(xdm / mad_x, cm)
+    mean = (wm * x).sum(0) / wm.sum(0)
+    var = (mad_x**2 * weight_scale((x - mean) / mad_x, cs).sum(0) /
+           (nobs - ddof))
+
+    cf = 1
+    if normalize:
+        c = cs * stats.norm.ppf(0.75)
+        cf = 2 * ((1 - c**2) * stats.norm.cdf(c) - c * stats.norm.pdf(c)
+                  + c**2) - 1
+    # return Holder(loc=mean, scale=np.sqrt(var / cf))
+    return mean, np.sqrt(var / cf)
+
+
 debug = 0
 
 
-def _scale_iter(data, scale0='mad', maxiter=10, rtol=1e-6, atol=1e-8,
-                meef_scale=None, scale_bias=None):
+def _scale_iter(data, scale0='mad', maxiter=100, rtol=1e-6, atol=1e-8,
+                meef_scale=None, scale_bias=None, iter_method="rho", ddof=0):
     """iterative scale estimate base on "rho" function
 
     """
     x = np.asarray(data)
+    nobs = x.shape[0]
 
     if scale0 == 'mad':
-        scale0 = mad(x)
+        scale0 = mad(x, center=0)
 
-    scale = scale0
-    scale_old = scale
     for i in range(maxiter):
-        x_scaled = x / scale
-        weights_scale = meef_scale(x_scaled) / (1e-50 + x_scaled**2)
-        if debug:
-            print('weights sum', weights_scale.sum(), end=" ")
-        scale_old = scale
-        scale2 = (weights_scale * x**2).mean()
-        if debug:
-            print(scale2, end=" ")
-        scale2 /= scale_bias
-        scale = np.sqrt(scale2)
+        x_scaled = x / scale0
+        if iter_method == "rho":
+            scale = scale0 * np.sqrt(
+                np.sum(meef_scale(x / scale0)) / scale_bias / (nobs - ddof))
+        else:
+            weights_scale = meef_scale(x_scaled) / (1e-50 + x_scaled**2)
+            scale2 = (weights_scale * x**2).sum() / (nobs - ddof)
+            scale2 /= scale_bias
+            scale = np.sqrt(scale2)
         if debug:
             print(scale)
-        if np.allclose(scale, scale_old, atol=atol, rtol=rtol):
+        if np.allclose(scale, scale0, atol=atol, rtol=rtol):
             break
-        scale_old = scale
+        scale0 = scale
 
     return scale
diff --git a/statsmodels/robust/tests/test_covariance.py b/statsmodels/robust/tests/test_covariance.py
index d272e153eb1..2af06c68a3f 100644
--- a/statsmodels/robust/tests/test_covariance.py
+++ b/statsmodels/robust/tests/test_covariance.py
@@ -8,6 +8,7 @@
 
 from statsmodels import robust
 import statsmodels.robust.covariance as robcov
+import statsmodels.robust.scale as robscale
 
 from .results import results_cov as res_cov
 
@@ -107,7 +108,7 @@ class TestOGKTau(TestOGKMad):
     def setup_class(cls):
 
         def sfunc(x):
-            return robcov.scale_tau(x, normalize=False, ddof=0)[1]
+            return robscale.scale_tau(x, normalize=False, ddof=0)[1]
 
         cls.res1 = robcov.cov_ogk(dta_hbk,
                                   scale_func=sfunc,
diff --git a/statsmodels/robust/tests/test_scale.py b/statsmodels/robust/tests/test_scale.py
index b5074d6fcb0..0528f647405 100644
--- a/statsmodels/robust/tests/test_scale.py
+++ b/statsmodels/robust/tests/test_scale.py
@@ -8,15 +8,23 @@
 from numpy.random import standard_normal
 from numpy.testing import assert_almost_equal, assert_equal, assert_allclose
 import pytest
+
+import pandas as pd
+
 from scipy.stats import norm as Gaussian
 from scipy import stats
 
 import statsmodels.api as sm
 import statsmodels.robust.scale as scale
-from statsmodels.robust.scale import mad
+from statsmodels.robust.scale import mad, scale_tau
 import statsmodels.robust.norms as rnorms
-from statsmodels.robust.covariance import scale_tau  # TODO: will be moved
 
+cur_dir = os.path.abspath(os.path.dirname(__file__))
+
+file_name = 'hbk.csv'
+file_path = os.path.join(cur_dir, 'results', file_name)
+dta_hbk = pd.read_csv(file_path)
+
 
 # Example from Section 5.5, Venables & Ripley (2002)
 
@@ -281,7 +289,7 @@ class TestHuberAxes:
     def setup_class(cls):
         np.random.seed(54321)
         cls.X = standard_normal((40, 10, 30))
-        cls.h = scale.Huber(maxiter=1000, tol=1.0e-05)
+        cls.h = scale.Huber(maxiter=100, tol=1.0e-05)
 
     def test_default(self):
         m, s = self.h(self.X, axis=0)
@@ -391,6 +399,49 @@ def test_scale_iter():
     assert_allclose(s, v, rtol=1e-1)
     assert_allclose(s, 1.0683298, rtol=1e-6)  # regression test number
 
+    chi = rnorms.TukeyBiweight()
+    scale_bias = 0.43684963023076195
+    mscale_biw = scale.MScale(chi, scale_bias)
+    s_biw = mscale_biw(x)
+    assert_allclose(s_biw, s, rtol=1e-10)
+
+    # regression test with 50% breakdown tuning
+    chi = rnorms.TukeyBiweight(c=1.547)
+    scale_bias = 0.1995
+    mscale_biw = scale.MScale(chi, scale_bias)
+    s_biw = mscale_biw(x)
+    assert_allclose(s_biw, 1.0326176662, rtol=1e-9)  # regression test number
+
+
+class TestMScale():
+
+    def test_huber_equivalence(self):
+        np.random.seed(54321)
+        nobs = 50
+        x = 1.5 * standard_normal(nobs)
+
+        # test equivalence of HuberScale and TrimmedMean M-scale
+        chi_tm = rnorms.TrimmedMean(c=2.5)
+        scale_bias_tm = 0.4887799917273257
+        mscale_tm = scale.MScale(chi_tm, scale_bias_tm)
+        s_tm = mscale_tm(x)
+
+        mscale_hub = scale.HuberScale()
+        s_hub = mscale_hub(nobs, nobs, x)
+
+        assert_allclose(s_tm, s_hub, rtol=1e-6)
+
+    def test_biweight(self):
+        y = dta_hbk["Y"].to_numpy()
+        ry = y - np.median(y)
+
+        chi = rnorms.TukeyBiweight(c=1.54764)
+        scale_bias = 0.19959963130721095
+        mscale_biw = scale.MScale(chi, scale_bias)
+        scale0 = mscale_biw(ry)
+        scale1 = 0.817260483784376  # from R RobStatTM scaleM
+        assert_allclose(scale0, scale1, rtol=1e-6)
+
 
 def test_scale_trimmed_approx():
     scale_trimmed = scale.scale_trimmed  # shorthand
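
Usage sketch (not part of the patch): a minimal example of how the new MScale estimator added to statsmodels/robust/scale.py is meant to be called, assuming this diff is applied. The random data and seed are illustrative assumptions; the scale_bias constant for the default TukeyBiweight chi function is the value used in the new test_scale_iter regression test above.

import numpy as np

import statsmodels.robust.norms as rnorms
import statsmodels.robust.scale as rscale

rng = np.random.default_rng(12345)   # illustrative data, not from the PR
x = 2.0 * rng.standard_normal(500)

# Tukey biweight chi function with default tuning; scale_bias is chosen so the
# estimate is Fisher consistent at the normal distribution.
chi = rnorms.TukeyBiweight()
mscale = rscale.MScale(chi, scale_bias=0.43684963023076195)

s = mscale(x)                                        # __call__ forwards to fit
s2 = mscale.fit(x, start_scale='mad', maxiter=100)   # explicit fit call
print(s, s2)   # both should be roughly the true scale of 2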
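
A second sketch, again assuming the patch is applied, for the new start_scale argument of RLM.fit: with update_scale=False the scale stays fixed at the supplied value while the mean parameters are iterated, otherwise start_scale is only the starting value. The simulated data, seed, and coefficients are made up for illustration.

import numpy as np

import statsmodels.api as sm

rng = np.random.default_rng(54321)
nobs = 200
exog = sm.add_constant(rng.standard_normal((nobs, 2)))
endog = exog @ np.array([1.0, 0.5, -0.2]) + rng.standard_normal(nobs)

# scale fixed at the supplied value during the IRLS iterations
res_fixed = sm.RLM(endog, exog).fit(start_scale=1.0, update_scale=False)

# start_scale only provides the starting value, scale is re-estimated each step
res_upd = sm.RLM(endog, exog).fit(start_scale=1.0)
print(res_fixed.scale, res_upd.scale)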