Merge 673837a into 18bdc1c

pvlib · Jun 9, 2020 · a54a2cb · a54a2cb
2 parents 18bdc1c + 673837a
commit a54a2cb
Show file tree

Hide file tree

Showing 3 changed files with 158 additions and 0 deletions.
diff --git a/docs/api.rst b/docs/api.rst
@@ -51,7 +51,23 @@ Identify gaps in the data.
    :toctree: generated/
 
    quality.gaps.interpolation_diff
+
+Data sometimes contains sequences of values that are "stale" or
+"stuck." These are contiguous spans of data where the value does not
+change within the precision given. The functions below
+can be used to detect stale values.
+
+.. note::
+
+   If the data has been altered in some way (e.g. temperature that has
+   been rounded to an integer value) before being passed to these
+   functions you may see unexpectedly large amounts of stale data.
+
+.. autosummary::
+   :toctree: generated/
+
    quality.gaps.stale_values_diff
+   quality.gaps.stale_values_round
 
 The following functions identify days with incomplete data.
 

diff --git a/pvanalytics/quality/gaps.py b/pvanalytics/quality/gaps.py
@@ -125,6 +125,57 @@ def stale_values_diff(x, window=6, rtol=1e-5, atol=1e-8, mark='tail'):
     return _mark(flags, window, mark)
 
 
+def stale_values_round(x, decimals=3, window=6, mark='tail'):
+    """Identify stale values by rounding.
+
+    A value is considered stale if it is part of a sequence of length
+    `window` of values that are identical when rounded to `decimals`
+    decimal places.
+
+    Parameters
+    ----------
+    x : Series
+        Data to be processed.
+    decimals : int, default 3
+        Number of decimal places to round to.
+    window : int, default 6
+        Number of consecutive identical values for a data point to be
+        considered stale.
+    mark : str, default 'tail'
+        How much of the window to mark ``True`` when a sequence of
+        stale values is detected. Can be one of 'tail', 'end', or 'all'.
+
+        - If 'tail' (the default) then every point in the window
+          *except* the first point is marked ``True``.
+        - If 'end' then only the endpoints of the window are marked
+          ``True``. The first `window - 1` values in a stale
+          sequence are marked ``False``.
+        - If 'all' then every point in the window *including* the
+          first point is marked ``True``.
+
+    Returns
+    -------
+    Series
+        True for each value that is part of a stale sequence of data.
+
+    Raises
+    ------
+    ValueError
+        If `mark` is not one of 'tail', 'end', or 'all'.
+
+    Notes
+    -----
+        Based on code from the pvfleets_qa_analysis project. Copyright
+        (c) 2020 Alliance for Sustainable Energy, LLC.
+
+    """
+    rounded_diff = x.round(decimals=decimals).diff()
+    endpoints = rounded_diff.rolling(window=window-1).apply(
+        lambda xs: len(xs[xs == 0]) == window-1
+    ).fillna(False).astype(bool)
+    return _mark(endpoints, window, mark)
+
+
 def interpolation_diff(x, window=6, rtol=1e-5, atol=1e-8, mark='tail'):
     """Identify sequences which appear to be linear.
 

diff --git a/pvanalytics/tests/quality/test_gaps.py b/pvanalytics/tests/quality/test_gaps.py
@@ -651,3 +651,94 @@ def test_complete():
         pd.Series(True, index=data.index),
         gaps.complete(data, minimum_completeness=0.2)
     )
+
+
+def test_stale_values_round_no_stale():
+    """No stale values in a monotonically increasing sequence."""
+    data = pd.Series(np.linspace(0, 10))
+    assert not gaps.stale_values_round(data, mark='all').any()
+
+
+def test_stale_values_round_all_same():
+    """If all data is identical, then all values are stale."""
+    data = pd.Series(1, index=range(0, 10))
+    assert gaps.stale_values_round(data, mark='all').all()
+
+
+def test_stale_values_round_noisy():
+    """If all values are the same +/- 0.0005, then all values are stale."""
+    data = pd.Series(
+        [1.555, 1.5551, 1.5549, 1.555, 1.555, 1.5548, 1.5553]
+    )
+    assert gaps.stale_values_round(data, decimals=3, mark='all').all()
+
+
+def test_stale_values_round_span_in_middle():
+    """A span of stale values between not-stale data."""
+    data = pd.Series(
+        [1.0, 1.1, 1.2, 1.5, 1.5, 1.5, 1.5, 1.9, 2.0, 2.2]
+    )
+    assert_series_equal(
+        gaps.stale_values_round(data, window=4, mark='all'),
+        pd.Series([False, False, False,
+                   True, True, True, True,
+                   False, False, False], dtype='bool')
+    )
+
+
+def test_stale_values_larger_window():
+    """Increasing the window size excludes short spans of repeated
+    values."""
+    data = pd.Series(
+        [1, 2, 2, 2, 2, 3, 4, 4, 4, 4, 4, 6]
+    )
+    assert_series_equal(
+        gaps.stale_values_round(data, window=4, mark='all'),
+        (data == 2) | (data == 4)
+    )
+    assert_series_equal(
+        gaps.stale_values_round(data, window=5, mark='all'),
+        (data == 4)
+    )
+
+
+def test_stale_values_round_bad_mark():
+    """Passing an invalid value for `mark` raises a ValueError."""
+    data = pd.Series(1, index=range(1, 10))
+    with pytest.raises(ValueError):
+        gaps.stale_values_round(data, mark='bad')
+
+
+def test_stale_values_round_mark():
+    """Test that different values for `mark` have the correct semantics."""
+    data = pd.Series(1, index=range(0, 10))
+    expected = pd.Series(True, index=range(0, 10))
+    assert_series_equal(
+        expected,
+        gaps.stale_values_round(data, mark='all')
+    )
+    expected.iloc[0] = False
+    assert_series_equal(
+        expected,
+        gaps.stale_values_round(data)
+    )
+    assert_series_equal(
+        expected,
+        gaps.stale_values_round(data, mark='tail')
+    )
+    expected.iloc[1] = False
+    assert_series_equal(
+        expected,
+        gaps.stale_values_round(data, window=3, mark='end')
+    )
+
+
+def test_stale_values_round_smaller_window():
+    """Decreasing window size includes shorter spans of repeated values."""
+    data = pd.Series(
+        [1, 2, 2, 2, 2, 3, 3, 4, 4, 4, 5, 6]
+    )
+    assert_series_equal(
+        gaps.stale_values_round(data, window=3, mark='all'),
+        (data == 2) | (data == 4)
+    )