Merge b7da439 into d476518

raybellwaves · Sep 9, 2020 · 171fa4a · 171fa4a
2 parents d476518 + b7da439
commit 171fa4a
Show file tree

Hide file tree

Showing 8 changed files with 287 additions and 5 deletions.
diff --git a/ci/run-linter.sh b/ci/run-linter.sh
@@ -13,7 +13,7 @@ echo "[black]"
 black --check --line-length=88 -S xskillscore
 
 echo "[isort]"
-isort --recursive --check-only xskillscore
+isort --check-only xskillscore
 
 echo "[doc8]"
 doc8 *.rst
diff --git a/docs/source/api.rst b/docs/source/api.rst
@@ -108,3 +108,11 @@ Multi-Category Metrics
     Contingency.heidke_score
     Contingency.peirce_score
     Contingency.gerrity_score
+
+Tests
+-----
+
+.. autosummary::
+    :toctree: api/
+
+    sign_test
diff --git a/docs/source/api/xskillscore.sign_test.rst b/docs/source/api/xskillscore.sign_test.rst
@@ -0,0 +1,6 @@
+xskillscore.sign\_test
+======================
+
+.. currentmodule:: xskillscore
+
+.. autofunction:: sign_test
diff --git a/docs/source/quick-start.ipynb b/docs/source/quick-start.ipynb
diff --git a/xskillscore/__init__.py b/xskillscore/__init__.py
@@ -30,6 +30,7 @@
     rps,
     threshold_brier_score,
 )
+from .core.test import sign_test
 from .versioning.print_versions import show_versions
 
 try:

diff --git a/xskillscore/core/test.py b/xskillscore/core/test.py
@@ -0,0 +1,110 @@
+import scipy.stats as st
+import xarray as xr
+
+
+def sign_test(
+    forecast1, forecast2, observation, dim=None, categorical=False, alpha=0.05
+):
+    """
+        Returns the Delsole and Tippett sign test over the given time dimension.
+
+        Parameters
+        ----------
+        forecast1 : xarray.Dataset or xarray.DataArray
+            forecast1 to be compared to observation
+        forecast2 : xarray.Dataset or xarray.DataArray
+            forecast2 to be compared to observation
+        observation : xarray.Dataset or xarray.DataArray
+            observation to be compared to both forecasts
+
+            if str, then assume that forecast1 and forecast2 have already been compared
+            to observations and choose from:
+
+                * ``negatively_oriented_already_evaluated``: metric between forecast1
+                  (forecast2) and observations. Distances are positively oriented,
+                  therefore the smaller distance wins.
+                * ``positively_oriented_already_evaluated``: metric between forecast1
+                  (forecast2) and observations. The larger positively oriented metric
+                  wins.
+                * ``categorical_already_evaluated``: categorical data following
+                  ``logical(forecast1)==logical(forecast2)`` where ``logical`` is a
+                  function return binary output
+
+        dim : str
+            time dimension of dimension over which to compute the random walk.
+            This dimension is not reduced, unlike in other xskillscore functions.
+        alpha : float
+            significance level for random walk.
+        categorical : bool, optional
+            If True, the winning forecast is only rewarded a point if it exactly equals
+            the observations
+
+        Returns
+        -------
+        xarray.DataArray or xarray.Dataset reduced by dim containing the sign test and
+            confidence as new dimension results:
+
+                * ``sign_test``: positive (negative) number shows how many times over
+                    ``dim`` ``forecast1`` is better (worse) than ``forecast2``.
+                * ``confidence``: Positive boundary for the random walk at significance
+                    ``alpha``.
+
+
+        Examples
+        --------
+        >>> f1 = xr.DataArray(np.random.normal(size=(30)),
+        ...      coords=[('time', np.arange(30))])
+        >>> f2 = xr.DataArray(np.random.normal(size=(30)),
+        ...      coords=[('time', np.arange(30))])
+        >>> o = xr.DataArray(np.random.normal(size=(30)),
+        ...      coords=[('time', np.arange(30))])
+        >>> st = sign_test(f1, f2, o, dim='time')
+        >>> st.sel(results='sign_test').plot()
+        >>> st.sel(results='confidence').plot(c='gray')
+        >>> (-1*st.sel(results='confidence')).plot(c='gray')
+
+        References
+        ----------
+        * DelSole, T., & Tippett, M. K. (2016). Forecast Comparison Based on Random
+          Walks. Monthly Weather Review, 144(2), 615–626. doi: 10/f782pf
+    """
+    # two shortcuts for climpred
+    climpred_keys = [
+        'negatively_oriented_already_evaluated',
+        'positively_oriented_already_evaluated',
+        'categorical_already_evaluated',
+    ]
+    if isinstance(observation, str):
+        if observation == 'negatively_oriented_already_evaluated':  # mse, mae, rmse
+            diff1 = forecast1
+            diff2 = forecast2
+        elif observation == 'positively_oriented_already_evaluated':  # 1-mse/std, msss
+            diff1 = forecast1
+            diff2 = forecast2
+        elif observation == 'categorical_already_evaluated':
+            diff1 = ~forecast1
+            diff2 = ~forecast2
+        else:
+            raise ValueError(f'special key not found in {climpred_keys}')
+    else:
+        if categorical:
+            diff1 = -1 * (forecast1 == observation)
+            diff2 = -1 * (forecast2 == observation)
+        else:
+            diff1 = abs(
+                forecast1 - observation
+            )  # is like xs.mae(forecast1,observation,dim=[])
+            diff2 = abs(forecast2 - observation)
+
+    sign_test = (1 * (diff1 < diff2) - 1 * (diff2 < diff1)).cumsum(dim)
+
+    # Estimate 95% confidence interval -----
+    notnan = 1 * (diff1.notnull() & diff2.notnull())
+    N = notnan.cumsum(dim)
+    # z_alpha is the value at which the standardized cumulative Gaussian distributed exceeds alpha
+    confidence = st.norm.ppf(1 - alpha / 2) * xr.ufuncs.sqrt(N)
+    confidence.coords['alpha'] = alpha
+
+    res = xr.concat([sign_test, confidence], dim='results')
+    res['results'] = ['sign_test', 'confidence']
+    return res
diff --git a/xskillscore/tests/conftest.py b/xskillscore/tests/conftest.py
@@ -2,7 +2,9 @@
 import pytest
 import xarray as xr
 
-PERIODS = 10  # effective_p_value produces nans for shorter periods
+PERIODS = 12  # effective_p_value produces nans for shorter periods
+
+np.random.seed(42)
 
 
 @pytest.fixture

diff --git a/xskillscore/tests/test_test.py b/xskillscore/tests/test_test.py
@@ -0,0 +1,73 @@
+import numpy as np
+import pytest
+
+from xskillscore import sign_test
+
+OFFSET = -1
+
+
+@pytest.fixture
+def a_1d_worse(a_1d):
+    return a_1d + OFFSET
+
+
+def logical(ds):
+    return ds > 0.5
+
+
+def test_sign_test_raw(a_1d, a_1d_worse, b_1d):
+    actual = sign_test(a_1d, a_1d_worse, b_1d, dim='time', alpha=0.05)
+    walk_larger_significance = actual.sel(results='sign_test') > actual.sel(
+        results='confidence'
+    )
+    crossing_after_timesteps = walk_larger_significance.argmax(dim='time')
+    # check timesteps after which sign_test larger confidence
+    assert crossing_after_timesteps == 3
+
+
+def test_sign_test_categorical(a_1d, a_1d_worse, b_1d):
+    """Test sign_test categorical."""
+    a_1d = logical(a_1d)
+    a_1d_worse = logical(a_1d_worse)
+    b_1d = logical(b_1d)
+    sign_test(a_1d, a_1d_worse, b_1d, dim='time', categorical=True)
+
+
+@pytest.mark.parametrize('categorical', [True, False])
+def test_sign_test_identical(a_1d, a_1d_worse, b_1d, categorical):
+    """Test that identical forecasts show no walk step."""
+    identicals = [1, 7]
+    for i in identicals:
+        # set a_1d_worse = a_1d identical for time=i
+        a_1d_worse = a_1d_worse.where(a_1d_worse != a_1d_worse.isel(time=i)).fillna(
+            a_1d.isel(time=i)
+        )
+    if categorical:
+        a_1d = logical(a_1d)
+        a_1d_worse = logical(a_1d_worse)
+        b_1d = logical(b_1d)
+    actual = sign_test(a_1d, a_1d_worse, b_1d, dim='time', categorical=categorical)
+    # check flat
+    assert (
+        actual.sel(results='sign_test')
+        .diff(dim='time')
+        .isel(time=[i - 1 for i in identicals])
+        == 0
+    ).all()
+
+
+def test_sign_test_alpha(a_1d, a_1d_worse, b_1d):
+    """Test that larger alpha leads to small confidence bounds in sign_test."""
+    actual_large_alpha = sign_test(a_1d, a_1d_worse, b_1d, dim='time', alpha=0.1)
+    actual_small_alpha = sign_test(a_1d, a_1d_worse, b_1d, dim='time', alpha=0.01)
+    # check difference in confidence
+    assert (
+        actual_large_alpha.sel(results='confidence')
+        < actual_small_alpha.sel(results='confidence')
+    ).all()
+    # check identical sign_test
+    assert (
+        actual_large_alpha.sel(results='sign_test')
+        .drop('alpha')
+        .equals(actual_small_alpha.sel(results='sign_test').drop('alpha'))
+    )