ENH: add Siegel slopes (robust regression) to scipy.stats (#9101)

* ENH add robust regression (Siegel) to scipy.stats
scipy · Aug 25, 2018 · 8a8fec9 · 8a8fec9
1 parent ce320b7
commit 8a8fec9
Show file tree

Hide file tree

Showing 6 changed files with 302 additions and 83 deletions.
diff --git a/scipy/stats/__init__.py b/scipy/stats/__init__.py
@@ -226,6 +226,7 @@
    kendalltau
    weightedtau
    linregress
+   siegelslopes
    theilslopes
 
 Statistical tests

diff --git a/scipy/stats/_stats_mstats_common.py b/scipy/stats/_stats_mstats_common.py
@@ -5,7 +5,7 @@
 from . import distributions
 
 
-__all__ = ['_find_repeats', 'linregress', 'theilslopes']
+__all__ = ['_find_repeats', 'linregress', 'theilslopes', 'siegelslopes']
 
 LinregressResult = namedtuple('LinregressResult', ('slope', 'intercept',
                                                    'rvalue', 'pvalue',
@@ -158,6 +158,10 @@ def theilslopes(y, x=None, alpha=0.95):
     up_slope : float
         Upper bound of the confidence interval on `medslope`.
 
+    See also
+    --------
+    siegelslopes : a similar technique using repeated medians
+
     Notes
     -----
     The implementation of `theilslopes` follows [1]_. The intercept is
@@ -264,3 +268,122 @@ def _find_repeats(arr):
     freq = np.diff(change_idx)
     atleast2 = freq > 1
     return unique[atleast2], freq[atleast2]
+
+
+def siegelslopes(y, x=None, method="hierarchical"):
+    r"""
+    Computes the Siegel estimator for a set of points (x, y).
+
+    `siegelslopes` implements a method for robust linear regression
+    using repeated medians (see [1]_) to fit a line to the points (x, y).
+    The method is robust to outliers with an asymptotic breakdown point
+    of 50%.
+
+    Parameters
+    ----------
+    y : array_like
+        Dependent variable.
+    x : array_like or None, optional
+        Independent variable. If None, use ``arange(len(y))`` instead.
+    method : {'hierarchical', 'separate'}
+        If 'hierarchical', estimate the intercept using the estimated
+        slope ``medslope`` (default option).
+        If 'separate', estimate the intercept independent of the estimated
+        slope. See Notes for details.
+
+    Returns
+    -------
+    medslope : float
+        Estimate of the slope of the regression line.
+    medintercept : float
+        Estimate of the intercept of the regression line.
+
+    See also
+    --------
+    theilslopes : a similar technique without repeated medians
+
+    Notes
+    -----
+    With ``n = len(y)``, compute ``m_j`` as the median of
+    the slopes from the point ``(x[j], y[j])`` to all other `n-1` points.
+    ``medslope`` is then the median of all slopes ``m_j``.
+    Two ways are given to estimate the intercept in [1]_ which can be chosen
+    via the parameter ``method``.
+    The hierarchical approach uses the estimated slope ``medslope``
+    and computes ``medintercept`` as the median of ``y - medslope*x``.
+    The other approach estimates the intercept separately as follows: for
+    each point ``(x[j], y[j])``, compute the intercepts of all the `n-1`
+    lines through the remaining points and take the median ``i_j``.
+    ``medintercept`` is the median of the ``i_j``.
+
+    The implementation computes `n` times the median of a vector of size `n`
+    which can be slow for large vectors. There are more efficient algorithms
+    (see [2]_) which are not implemented here.
+
+    References
+    ----------
+    .. [1] A. Siegel, "Robust Regression Using Repeated Medians",
+           Biometrika, Vol. 69, pp. 242-244, 1982.
+
+    .. [2] A. Stein and M. Werman, "Finding the repeated median regression
+           line", Proceedings of the Third Annual ACM-SIAM Symposium on
+           Discrete Algorithms, pp. 409–413, 1992.
+
+    Examples
+    --------
+    >>> from scipy import stats
+    >>> import matplotlib.pyplot as plt
+
+    >>> x = np.linspace(-5, 5, num=150)
+    >>> y = x + np.random.normal(size=x.size)
+    >>> y[11:15] += 10  # add outliers
+    >>> y[-5:] -= 7
+
+    Compute the slope and intercept.  For comparison, also compute the
+    least-squares fit with `linregress`:
+
+    >>> res = stats.siegelslopes(y, x)
+    >>> lsq_res = stats.linregress(x, y)
+
+    Plot the results. The Siegel regression line is shown in red. The green
+    line shows the least-squares fit for comparison.
+
+    >>> fig = plt.figure()
+    >>> ax = fig.add_subplot(111)
+    >>> ax.plot(x, y, 'b.')
+    >>> ax.plot(x, res[1] + res[0] * x, 'r-')
+    >>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-')
+    >>> plt.show()
+
+    """
+    if method not in ['hierarchical', 'separate']:
+        raise ValueError("method can only be 'hierarchical' or 'separate'")
+    y = np.asarray(y).ravel()
+    if x is None:
+        x = np.arange(len(y), dtype=float)
+    else:
+        x = np.asarray(x, dtype=float).ravel()
+        if len(x) != len(y):
+            raise ValueError("Incompatible lengths ! (%s<>%s)" % (len(y), len(x)))
+
+    deltax = x[:, np.newaxis] - x
+    deltay = y[:, np.newaxis] - y
+    slopes, intercepts = [], []
+
+    for j in range(len(x)):
+        id_nonzero = deltax[j, :] != 0
+        slopes_j = deltay[j, id_nonzero] / deltax[j, id_nonzero]
+        medslope_j = np.median(slopes_j)
+        slopes.append(medslope_j)
+        if method == 'separate':
+            z = y*x[j] - y[j]*x
+            medintercept_j = np.median(z[id_nonzero] / deltax[j, id_nonzero])
+            intercepts.append(medintercept_j)
+
+    medslope = np.median(np.asarray(slopes))
+    if method == "separate":
+        medinter = np.median(np.asarray(intercepts))
+    else:
+        medinter = np.median(y - medslope*x)
+
+    return medslope, medinter
diff --git a/scipy/stats/mstats.py b/scipy/stats/mstats.py
@@ -66,6 +66,7 @@
    kendalltau
    kendalltau_seasonal
    linregress
+   siegelslopes
    theilslopes
    sen_seasonal_slopes
 

diff --git a/scipy/stats/mstats_basic.py b/scipy/stats/mstats_basic.py
@@ -25,7 +25,8 @@
            'rankdata',
            'scoreatpercentile','sem',
            'sen_seasonal_slopes','skew','skewtest','spearmanr',
-           'theilslopes','tmax','tmean','tmin','trim','trimboth',
+           'siegelslopes', 'theilslopes',
+           'tmax','tmean','tmin','trim','trimboth',
            'trimtail','trima','trimr','trimmed_mean','trimmed_std',
            'trimmed_stde','trimmed_var','tsem','ttest_1samp','ttest_onesamp',
            'ttest_ind','ttest_rel','tvar',
@@ -50,7 +51,8 @@
 from ._stats_mstats_common import (
         _find_repeats,
         linregress as stats_linregress,
-        theilslopes as stats_theilslopes
+        theilslopes as stats_theilslopes,
+        siegelslopes as stats_siegelslopes 
         )
 
 
@@ -829,6 +831,11 @@ def theilslopes(y, x=None, alpha=0.95):
     up_slope : float
         Upper bound of the confidence interval on `medslope`.
 
+    See also
+    --------
+    siegelslopes : a similar technique with repeated medians
+
+
     Notes
     -----
     For more details on `theilslopes`, see `stats.theilslopes`.
@@ -851,6 +858,60 @@ def theilslopes(y, x=None, alpha=0.95):
     return stats_theilslopes(y, x, alpha=alpha)
 
 
+def siegelslopes(y, x=None, method="hierarchical"):
+    r"""
+    Computes the Siegel estimator for a set of points (x, y).
+
+    `siegelslopes` implements a method for robust linear regression
+    using repeated medians to fit a line to the points (x, y).
+    The method is robust to outliers with an asymptotic breakdown point
+    of 50%.
+
+    Parameters
+    ----------
+    y : array_like
+        Dependent variable.
+    x : array_like or None, optional
+        Independent variable. If None, use ``arange(len(y))`` instead.
+    method : {'hierarchical', 'separate'}
+        If 'hierarchical', estimate the intercept using the estimated
+        slope ``medslope`` (default option).
+        If 'separate', estimate the intercept independent of the estimated
+        slope. See Notes for details.
+
+    Returns
+    -------
+    medslope : float
+        Estimate of the slope of the regression line.
+    medintercept : float
+        Estimate of the intercept of the regression line.
+
+    See also
+    --------
+    theilslopes : a similar technique without repeated medians
+
+    Notes
+    -----
+    For more details on `siegelslopes`, see `scipy.stats.siegelslopes`.
+
+    """
+    y = ma.asarray(y).ravel()
+    if x is None:
+        x = ma.arange(len(y), dtype=float)
+    else:
+        x = ma.asarray(x).ravel()
+        if len(x) != len(y):
+            raise ValueError("Incompatible lengths ! (%s<>%s)" % (len(y), len(x)))
+
+    m = ma.mask_or(ma.getmask(x), ma.getmask(y))
+    y._mask = x._mask = m
+    # Disregard any masked elements of x or y
+    y = y.compressed()
+    x = x.compressed().astype(float)
+    # We now have unmasked arrays so can use `stats.siegelslopes`
+    return stats_siegelslopes(y, x)
+
+
 def sen_seasonal_slopes(x):
     x = ma.array(x, subok=True, copy=False, ndmin=2)
     (n,_) = x.shape

diff --git a/scipy/stats/stats.py b/scipy/stats/stats.py
@@ -172,7 +172,7 @@
 import scipy.linalg as linalg
 from . import distributions
 from . import mstats_basic
-from ._stats_mstats_common import _find_repeats, linregress, theilslopes
+from ._stats_mstats_common import _find_repeats, linregress, theilslopes, siegelslopes
 from ._stats import _kendall_dis, _toint64, _weightedrankedtau
 
 
@@ -186,7 +186,7 @@
            'sigmaclip', 'trimboth', 'trim1', 'trim_mean', 'f_oneway',
            'pearsonr', 'fisher_exact', 'spearmanr', 'pointbiserialr',
            'kendalltau', 'weightedtau',
-           'linregress', 'theilslopes', 'ttest_1samp',
+           'linregress', 'siegelslopes', 'theilslopes', 'ttest_1samp',
            'ttest_ind', 'ttest_ind_from_stats', 'ttest_rel', 'kstest',
            'chisquare', 'power_divergence', 'ks_2samp', 'mannwhitneyu',
            'tiecorrect', 'ranksums', 'kruskal', 'friedmanchisquare',