diff --git a/.gitignore b/.gitignore
index 0c32bf53261..1bc8670606d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -70,6 +70,7 @@ coverage_html_report/
 
 # VS Code
 .vscode/
+.vs/
 
 # Project specific
 statsmodels/version.py
diff --git a/statsmodels/robust/covariance.py b/statsmodels/robust/covariance.py
index f89e40583e1..51acfb41111 100644
--- a/statsmodels/robust/covariance.py
+++ b/statsmodels/robust/covariance.py
@@ -308,107 +308,6 @@ def _outlier_gy(d, distr=None, k_endog=1, trim_prob=0.975):
 
 # ## GK and OGK ###
 
-def _weight_mean(x, c):
-    """Tukey-biweight, bisquare weights used in tau scale.
-
-    Parameters
-    ----------
-    x : ndarray
-        Data
-    c : float
-        Parameter for bisquare weights
-
-    Returns
-    -------
-    ndarray : weights
-    """
-    x = np.asarray(x)
-    w = (1 - (x / c)**2)**2 * (np.abs(x) <= c)
-    return w
-
-
-def _winsor(x, c):
-    """Winsorized squared data used in tau scale.
-
-    Parameters
-    ----------
-    x : ndarray
-        Data
-    c : float
-        threshold
-
-    Returns
-    -------
-    winsorized squared data, ``np.minimum(x**2, c**2)``
-    """
-    return np.minimum(x**2, c**2)
-
-
-def scale_tau(data, cm=4.5, cs=3, weight_mean=_weight_mean,
-              weight_scale=_winsor, normalize=True, ddof=0):
-    """Tau estimator of univariate scale.
-
-    Experimental, API will change
-
-    Parameters
-    ----------
-    data : array_like, 1-D or 2-D
-        If data is 2d, then the location and scale estimates
-        are calculated for each column
-    cm : float
-        constant used in call to weight_mean
-    cs : float
-        constant used in call to weight_scale
-    weight_mean : callable
-        function to calculate weights for weighted mean
-    weight_scale : callable
-        function to calculate scale, "rho" function
-    normalize : bool
-        rescale the scale estimate so it is consistent when the data is
-        normally distributed. The computation assumes winsorized (truncated)
-        variance.
-
-    Returns
-    -------
-    mean : nd_array
-        robust mean
-    std : nd_array
-        robust estimate of scale (standard deviation)
-
-    Notes
-    -----
-    Uses definition of Maronna and Zamar 2002, with weighted mean and
-    trimmed variance.
-    The normalization has been added to match R robustbase.
-    R robustbase uses by default ddof=0, with option to set it to 2.
-
-    References
-    ----------
-    .. [1] Maronna, Ricardo A, and Ruben H Zamar. “Robust Estimates of Location
-       and Dispersion for High-Dimensional Datasets.” Technometrics 44, no. 4
-       (November 1, 2002): 307–17. https://doi.org/10.1198/004017002188618509.
- """ - - x = np.asarray(data) - nobs = x.shape[0] - - med_x = np.median(x, 0) - xdm = x - med_x - mad_x = np.median(np.abs(xdm), 0) - wm = weight_mean(xdm / mad_x, cm) - mean = (wm * x).sum(0) / wm.sum(0) - var = (mad_x**2 * weight_scale((x - mean) / mad_x, cs).sum(0) / - (nobs - ddof)) - - cf = 1 - if normalize: - c = cs * stats.norm.ppf(0.75) - cf = 2 * ((1 - c**2) * stats.norm.cdf(c) - c * stats.norm.pdf(c) - + c**2) - 1 - # return Holder(loc=mean, scale=np.sqrt(var / cf)) - return mean, np.sqrt(var / cf) - - def mahalanobis(data, cov=None, cov_inv=None, sqrt=False): """Mahalanobis distance squared diff --git a/statsmodels/robust/robust_linear_model.py b/statsmodels/robust/robust_linear_model.py index 0b92c90d6e6..d3c10573130 100644 --- a/statsmodels/robust/robust_linear_model.py +++ b/statsmodels/robust/robust_linear_model.py @@ -191,10 +191,12 @@ def _estimate_scale(self, resid): elif isinstance(self.scale_est, scale.HuberScale): return self.scale_est(self.df_resid, self.nobs, resid) else: - return scale.scale_est(self, resid) ** 2 + # use df correction to match HuberScale + return self.scale_est(resid) * np.sqrt(self.nobs / self.df_resid) def fit(self, maxiter=50, tol=1e-8, scale_est='mad', init=None, cov='H1', - update_scale=True, conv='dev', start_params=None): + update_scale=True, conv='dev', start_params=None, start_scale=None, + ): """ Fits the model using iteratively reweighted least squares. @@ -217,6 +219,7 @@ def fit(self, maxiter=50, tol=1e-8, scale_est='mad', init=None, cov='H1', Specifies method for the initial estimates of the parameters. Default is None, which means that the least squares estimate is used. Currently it is the only available choice. + Deprecated and will be removed. There is no choice here. maxiter : int The maximum number of iterations to try. Default is 50. scale_est : str or HuberScale() @@ -236,6 +239,11 @@ def fit(self, maxiter=50, tol=1e-8, scale_est='mad', init=None, cov='H1', start_params : array_like, optional Initial guess of the solution of the optimizer. If not provided, the initial parameters are computed using OLS. + start_scale : float, optional + Initial scale. If update_scale is False, then the scale will be + fixed at this level for the estimation of the mean parameters. + during iteration. 
+            mean parameters. If not provided, then the initial scale is
+            estimated from the OLS residuals.
 
         Returns
         -------
@@ -264,8 +272,10 @@ def fit(self, maxiter=50, tol=1e-8, scale_est='mad', init=None, cov='H1',
                                check_weights=False)
             wls_results = fake_wls.results(start_params)
 
-        if not init:
+        if not init and not start_scale:
             self.scale = self._estimate_scale(wls_results.resid)
+        elif start_scale:
+            self.scale = start_scale
 
         history = dict(params=[np.inf], scale=[])
         if conv == 'coefs':
diff --git a/statsmodels/robust/scale.py b/statsmodels/robust/scale.py
index e938a83747a..d866d52f152 100644
--- a/statsmodels/robust/scale.py
+++ b/statsmodels/robust/scale.py
@@ -248,11 +248,11 @@ def __call__(self, a, mu=None, initscale=None, axis=0):
         """
         a = np.asarray(a)
         if mu is None:
-            n = a.shape[0] - 1
+            n = a.shape[axis] - 1
             mu = np.median(a, axis=axis)
             est_mu = True
         else:
-            n = a.shape[0]
+            n = a.shape[axis]
             mu = mu
             est_mu = False
 
@@ -298,10 +298,10 @@ def _estimate_both(self, a, scale, mu, axis, est_mu, n):
             nmu = tools.unsqueeze(nmu, axis, a.shape)
 
             subset = np.less_equal(np.abs((a - mu) / scale), self.c)
-            card = subset.sum(axis)
-
-            scale_num = np.sum(subset * (a - nmu) ** 2, axis)
-            scale_denom = n * self.gamma - (a.shape[axis] - card) * self.c ** 2
+            scale_num = np.sum(subset * (a - nmu) ** 2
+                               + (1 - subset) * (scale * self.c)**2, axis)
+            scale_denom = n * self.gamma
+
             nscale = np.sqrt(scale_num / scale_denom)
             nscale = tools.unsqueeze(nscale, axis, a.shape)
 
@@ -408,6 +408,63 @@ def chi(s):
 hubers_scale = HuberScale()
 
 
+class MScale:
+    """M-scale estimation.
+
+    Experimental interface, arguments and options will still change.
+
+    Parameters
+    ----------
+    chi_func : callable
+        The rho or chi function for the moment condition for estimating scale.
+    scale_bias : float
+        Factor in the moment condition to obtain Fisher consistency of the
+        scale estimate at the normal distribution.
+    """
+
+    def __init__(self, chi_func, scale_bias):
+        self.chi_func = chi_func
+        self.scale_bias = scale_bias
+
+    def __call__(self, data, **kwds):
+        return self.fit(data, **kwds)
+
+    def fit(self, data, start_scale='mad', maxiter=100, rtol=1e-6, atol=1e-8):
+        """
+        Estimate M-scale using iteration.
+
+        Parameters
+        ----------
+        data : array_like
+            Data, currently assumed to be 1-dimensional.
+        start_scale : str or float
+            Starting value of scale or method to compute the starting value.
+            Default is 'mad'; no other string options are available.
+        maxiter : int
+            Maximum number of iterations.
+        rtol : float
+            Relative convergence tolerance.
+        atol : float
+            Absolute convergence tolerance.
+
+        Returns
+        -------
+        float : Scale estimate. The estimated variance is scale squared.
+            Todo: switch to Holder instance with more information.
+
+        """
+
+        scale = _scale_iter(
+            data,
+            scale0=start_scale,
+            maxiter=maxiter, rtol=rtol, atol=atol,
+            meef_scale=self.chi_func,
+            scale_bias=self.scale_bias,
+            )
+
+        return scale
+
+
 def scale_trimmed(data, alpha, center='median', axis=0, distr=None,
                   distargs=None):
     """scale estimate based on symmetrically trimmed sample
@@ -524,35 +581,134 @@ def scale_trimmed(data, alpha, center='median', axis=0, distr=None,
     return res
 
 
+def _weight_mean(x, c):
+    """Tukey-biweight, bisquare weights used in tau scale.
+
+    Parameters
+    ----------
+    x : ndarray
+        Data
+    c : float
+        Parameter for bisquare weights
+
+    Returns
+    -------
+    ndarray : weights
+    """
+    x = np.asarray(x)
+    w = (1 - (x / c)**2)**2 * (np.abs(x) <= c)
+    return w
+
+
+def _winsor(x, c):
+    """Winsorized squared data used in tau scale.
+
+    Parameters
+    ----------
+    x : ndarray
+        Data
+    c : float
+        threshold
+
+    Returns
+    -------
+    winsorized squared data, ``np.minimum(x**2, c**2)``
+    """
+    return np.minimum(x**2, c**2)
+
+
+def scale_tau(data, cm=4.5, cs=3, weight_mean=_weight_mean,
+              weight_scale=_winsor, normalize=True, ddof=0):
+    """Tau estimator of univariate scale.
+
+    Experimental, API will change
+
+    Parameters
+    ----------
+    data : array_like, 1-D or 2-D
+        If data is 2d, then the location and scale estimates
+        are calculated for each column
+    cm : float
+        constant used in call to weight_mean
+    cs : float
+        constant used in call to weight_scale
+    weight_mean : callable
+        function to calculate weights for weighted mean
+    weight_scale : callable
+        function to calculate scale, "rho" function
+    normalize : bool
+        rescale the scale estimate so it is consistent when the data is
+        normally distributed. The computation assumes winsorized (truncated)
+        variance.
+
+    Returns
+    -------
+    mean : ndarray
+        robust mean
+    std : ndarray
+        robust estimate of scale (standard deviation)
+
+    Notes
+    -----
+    Uses definition of Maronna and Zamar 2002, with weighted mean and
+    trimmed variance.
+    The normalization has been added to match R robustbase.
+    R robustbase uses by default ddof=0, with option to set it to 2.
+
+    References
+    ----------
+    .. [1] Maronna, Ricardo A, and Ruben H Zamar. “Robust Estimates of Location
+       and Dispersion for High-Dimensional Datasets.” Technometrics 44, no. 4
+       (November 1, 2002): 307–17. https://doi.org/10.1198/004017002188618509.
+    """
+
+    x = np.asarray(data)
+    nobs = x.shape[0]
+
+    med_x = np.median(x, 0)
+    xdm = x - med_x
+    mad_x = np.median(np.abs(xdm), 0)
+    wm = weight_mean(xdm / mad_x, cm)
+    mean = (wm * x).sum(0) / wm.sum(0)
+    var = (mad_x**2 * weight_scale((x - mean) / mad_x, cs).sum(0) /
+           (nobs - ddof))
+
+    cf = 1
+    if normalize:
+        c = cs * stats.norm.ppf(0.75)
+        cf = 2 * ((1 - c**2) * stats.norm.cdf(c) - c * stats.norm.pdf(c)
+                  + c**2) - 1
+    # return Holder(loc=mean, scale=np.sqrt(var / cf))
+    return mean, np.sqrt(var / cf)
+
+
 debug = 0
 
 
-def _scale_iter(data, scale0='mad', maxiter=10, rtol=1e-6, atol=1e-8,
-                meef_scale=None, scale_bias=None):
+def _scale_iter(data, scale0='mad', maxiter=100, rtol=1e-6, atol=1e-8,
+                meef_scale=None, scale_bias=None, iter_method="rho", ddof=0):
     """iterative scale estimate base on "rho" function
 
     """
     x = np.asarray(data)
+    nobs = x.shape[0]
 
     if scale0 == 'mad':
-        scale0 = mad(x)
+        scale0 = mad(x, center=0)
 
-    scale = scale0
-    scale_old = scale
     for i in range(maxiter):
-        x_scaled = x / scale
-        weights_scale = meef_scale(x_scaled) / (1e-50 + x_scaled**2)
-        if debug:
-            print('weights sum', weights_scale.sum(), end=" ")
-        scale_old = scale
-        scale2 = (weights_scale * x**2).mean()
-        if debug:
-            print(scale2, end=" ")
-        scale2 /= scale_bias
-        scale = np.sqrt(scale2)
+        x_scaled = x / scale0
+        if iter_method == "rho":
+            scale = scale0 * np.sqrt(
+                np.sum(meef_scale(x / scale0)) / scale_bias / (nobs - ddof))
+        else:
+            weights_scale = meef_scale(x_scaled) / (1e-50 + x_scaled**2)
+            scale2 = (weights_scale * x**2).sum() / (nobs - ddof)
+            scale2 /= scale_bias
+            scale = np.sqrt(scale2)
         if debug:
             print(scale)
-        if np.allclose(scale, scale_old, atol=atol, rtol=rtol):
+        if np.allclose(scale, scale0, atol=atol, rtol=rtol):
             break
-        scale_old = scale
+        scale0 = scale
 
     return scale
diff --git a/statsmodels/robust/tests/test_covariance.py b/statsmodels/robust/tests/test_covariance.py
index d272e153eb1..2af06c68a3f 100644
--- a/statsmodels/robust/tests/test_covariance.py
+++ b/statsmodels/robust/tests/test_covariance.py
@@ -8,6 +8,7 @@
 
 from statsmodels import robust
 import statsmodels.robust.covariance as robcov
+import statsmodels.robust.scale as robscale
 
 from .results import results_cov as res_cov
 
@@ -107,7 +108,7 @@ class TestOGKTau(TestOGKMad):
     def setup_class(cls):
 
         def sfunc(x):
-            return robcov.scale_tau(x, normalize=False, ddof=0)[1]
+            return robscale.scale_tau(x, normalize=False, ddof=0)[1]
 
         cls.res1 = robcov.cov_ogk(dta_hbk,
                                   scale_func=sfunc,
diff --git a/statsmodels/robust/tests/test_scale.py b/statsmodels/robust/tests/test_scale.py
index b5074d6fcb0..0528f647405 100644
--- a/statsmodels/robust/tests/test_scale.py
+++ b/statsmodels/robust/tests/test_scale.py
@@ -8,15 +8,23 @@
 from numpy.random import standard_normal
 from numpy.testing import assert_almost_equal, assert_equal, assert_allclose
 import pytest
+
+import pandas as pd
+
 from scipy.stats import norm as Gaussian
 from scipy import stats
 
 import statsmodels.api as sm
 import statsmodels.robust.scale as scale
-from statsmodels.robust.scale import mad
+from statsmodels.robust.scale import mad, scale_tau
 import statsmodels.robust.norms as rnorms
-from statsmodels.robust.covariance import scale_tau  # TODO: will be moved
 
+cur_dir = os.path.abspath(os.path.dirname(__file__))
+
+file_name = 'hbk.csv'
+file_path = os.path.join(cur_dir, 'results', file_name)
+dta_hbk = pd.read_csv(file_path)
+
 
 # Example from Section 5.5, Venables & Ripley (2002)
 
@@ -281,7 +289,7 @@ class TestHuberAxes:
     def setup_class(cls):
         np.random.seed(54321)
         cls.X = standard_normal((40, 10, 30))
-        cls.h = scale.Huber(maxiter=1000, tol=1.0e-05)
+        cls.h = scale.Huber(maxiter=100, tol=1.0e-05)
 
     def test_default(self):
         m, s = self.h(self.X, axis=0)
@@ -391,6 +399,49 @@ def test_scale_iter():
     assert_allclose(s, v, rtol=1e-1)
     assert_allclose(s, 1.0683298, rtol=1e-6)  # regression test number
 
+    chi = rnorms.TukeyBiweight()
+    scale_bias = 0.43684963023076195
+    mscale_biw = scale.MScale(chi, scale_bias)
+    s_biw = mscale_biw(x)
+    assert_allclose(s_biw, s, rtol=1e-10)
+
+    # regression test with 50% breakdown tuning
+    chi = rnorms.TukeyBiweight(c=1.547)
+    scale_bias = 0.1995
+    mscale_biw = scale.MScale(chi, scale_bias)
+    s_biw = mscale_biw(x)
+    assert_allclose(s_biw, 1.0326176662, rtol=1e-9)  # regression test number
+
+
+class TestMScale():
+
+    def test_huber_equivalence(self):
+        np.random.seed(54321)
+        nobs = 50
+        x = 1.5 * standard_normal(nobs)
+
+        # test equivalence of HuberScale and TrimmedMean M-scale
+        chi_tm = rnorms.TrimmedMean(c=2.5)
+        scale_bias_tm = 0.4887799917273257
+        mscale_tm = scale.MScale(chi_tm, scale_bias_tm)
+        s_tm = mscale_tm(x)
+
+        mscale_hub = scale.HuberScale()
+        s_hub = mscale_hub(nobs, nobs, x)
+
+        assert_allclose(s_tm, s_hub, rtol=1e-6)
+
+    def test_biweight(self):
+        y = dta_hbk["Y"].to_numpy()
+        ry = y - np.median(y)
+
+        chi = rnorms.TukeyBiweight(c=1.54764)
+        scale_bias = 0.19959963130721095
+        mscale_biw = scale.MScale(chi, scale_bias)
+        scale0 = mscale_biw(ry)
+        scale1 = 0.817260483784376  # from R RobStatTM scaleM
+        assert_allclose(scale0, scale1, rtol=1e-6)
+
 
 def test_scale_trimmed_approx():
     scale_trimmed = scale.scale_trimmed  # shorthand
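
Usage sketch (not part of the patch): a minimal example of how the new MScale estimator added to statsmodels/robust/scale.py is meant to be called, assuming this diff is applied. The random data and seed are illustrative assumptions; the scale_bias constant for the default TukeyBiweight chi function is the value used in the new test_scale_iter regression test above.

import numpy as np

import statsmodels.robust.norms as rnorms
import statsmodels.robust.scale as rscale

rng = np.random.default_rng(12345)   # illustrative data, not from the PR
x = 2.0 * rng.standard_normal(500)

# Tukey biweight chi function with default tuning; scale_bias is chosen so the
# estimate is Fisher consistent at the normal distribution.
chi = rnorms.TukeyBiweight()
mscale = rscale.MScale(chi, scale_bias=0.43684963023076195)

s = mscale(x)                                        # __call__ forwards to fit
s2 = mscale.fit(x, start_scale='mad', maxiter=100)   # explicit fit call
print(s, s2)   # both should be roughly the true scale of 2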
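
A second sketch, again assuming the patch is applied, for the new start_scale argument of RLM.fit: with update_scale=False the scale stays fixed at the supplied value while the mean parameters are iterated, otherwise start_scale is only the starting value. The simulated data, seed, and coefficients are made up for illustration.

import numpy as np

import statsmodels.api as sm

rng = np.random.default_rng(54321)
nobs = 200
exog = sm.add_constant(rng.standard_normal((nobs, 2)))
endog = exog @ np.array([1.0, 0.5, -0.2]) + rng.standard_normal(nobs)

# scale fixed at the supplied value during the IRLS iterations
res_fixed = sm.RLM(endog, exog).fit(start_scale=1.0, update_scale=False)

# start_scale only provides the starting value, scale is re-estimated each step
res_upd = sm.RLM(endog, exog).fit(start_scale=1.0)
print(res_fixed.scale, res_upd.scale)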