diff --git a/docs/api.rst b/docs/api.rst index 04935e29..0ed31d90 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -51,7 +51,23 @@ Identify gaps in the data. :toctree: generated/ quality.gaps.interpolation_diff + +Data sometimes contains sequences of values that are "stale" or +"stuck." These are contiguous spans of data where the value does not +change within the precision given. The functions below +can be used to detect stale values. + +.. note:: + + If the data has been altered in some way (e.g. temperature that has + been rounded to an integer value) before being passed to these + functions you may see unexpectedly large amounts of stale data. + +.. autosummary:: + :toctree: generated/ + quality.gaps.stale_values_diff + quality.gaps.stale_values_round The following functions identify days with incomplete data. diff --git a/pvanalytics/quality/gaps.py b/pvanalytics/quality/gaps.py index 82c1ddcd..171d8ca3 100644 --- a/pvanalytics/quality/gaps.py +++ b/pvanalytics/quality/gaps.py @@ -84,13 +84,14 @@ def stale_values_diff(x, window=6, rtol=1e-5, atol=1e-8, mark='tail'): absolute tolerance for detecting a change in data values mark : str, default 'tail' How much of the window to mark ``True`` when a sequence of - stale values is detected. Can be of 'tail', 'end', or 'all'. + stale values is detected. Can be one of 'tail', 'end', or + 'all'. - If 'tail' (the default) then every point in the window *except* the first point is marked ``True``. - - If 'end' then only the endpoints of the window are marked - ``True``. The first `window - 1` values in a stale sequence - sequence are marked ``False``. + - If 'end' then the first `window - 1` values in a stale + sequence are marked ``False`` and all subsequent + values in the sequence are marked ``True``. - If 'all' then every point in the window *including* the first point is marked ``True``. 
def stale_values_round(x, window=6, decimals=3, mark='tail'):
    """Identify stale values by rounding.

    A value is considered stale if it is part of a sequence of length
    `window` of values that are identical when rounded to `decimals`
    decimal places.

    Parameters
    ----------
    x : Series
        Data to be processed.
    window : int, default 6
        Number of consecutive identical values for a data point to be
        considered stale.
    decimals : int, default 3
        Number of decimal places to round to.
    mark : str, default 'tail'
        How much of the window to mark ``True`` when a sequence of
        stale values is detected. Can be one of 'tail', 'end', or
        'all'.

        - If 'tail' (the default) then every point in the window
          *except* the first point is marked ``True``.
        - If 'end' then the first `window - 1` values in a stale
          sequence are marked ``False`` and all subsequent values in
          the sequence are marked ``True``.
        - If 'all' then every point in the window *including* the
          first point is marked ``True``.

    Returns
    -------
    Series
        True for each value that is part of a stale sequence of data.

    Raises
    ------
    ValueError
        If `mark` is not one of 'tail', 'end', or 'all'.

    Notes
    -----
    Based on code from the pvfleets_qa_analysis project. Copyright
    (c) 2020 Alliance for Sustainable Energy, LLC.

    """
    # A zero first difference of the rounded data means the value did
    # not change from the previous point. NaN differences (the first
    # point, or gaps in the data) compare unequal to zero, so they can
    # never contribute to a stale run.
    unchanged = x.round(decimals=decimals).diff().eq(0).astype(float)
    # `window` identical values produce `window - 1` consecutive zero
    # differences. Counting them with a rolling sum keeps the work
    # vectorized instead of invoking a Python callback for every window;
    # incomplete leading windows yield NaN, which compares as False.
    endpoints = unchanged.rolling(window=window - 1).sum().eq(window - 1)
    return _mark(endpoints, window, mark)
def test_stale_values_round_no_stale():
    """A strictly increasing series contains no stale values."""
    increasing = pd.Series(np.linspace(0, 10))
    flags = gaps.stale_values_round(increasing, mark='all')
    assert not flags.any()


def test_stale_values_round_all_same():
    """Every value is stale when the whole series is constant."""
    constant = pd.Series(1, index=range(0, 10))
    flags = gaps.stale_values_round(constant, mark='all')
    assert flags.all()


def test_stale_values_round_noisy():
    """Values that agree to within +/- 0.0005 are all stale when
    rounded to three decimal places."""
    noisy = pd.Series(
        [1.555, 1.5551, 1.5549, 1.555, 1.555, 1.5548, 1.5553]
    )
    flags = gaps.stale_values_round(noisy, decimals=3, mark='all')
    assert flags.all()


def test_stale_values_round_span_in_middle():
    """Only the repeated span surrounded by changing data is flagged."""
    data = pd.Series(
        [1.0, 1.1, 1.2, 1.5, 1.5, 1.5, 1.5, 1.9, 2.0, 2.2]
    )
    expected = pd.Series(
        [False] * 3 + [True] * 4 + [False] * 3,
        dtype='bool'
    )
    assert_series_equal(
        gaps.stale_values_round(data, window=4, mark='all'),
        expected
    )


def test_stale_values_larger_window():
    """A larger window stops flagging the shorter runs of repeated
    values."""
    data = pd.Series(
        [1, 2, 2, 2, 2, 3, 4, 4, 4, 4, 4, 6]
    )
    run_of_five = (data == 4)
    run_of_four_or_more = (data == 2) | (data == 4)
    assert_series_equal(
        gaps.stale_values_round(data, window=4, mark='all'),
        run_of_four_or_more
    )
    assert_series_equal(
        gaps.stale_values_round(data, window=5, mark='all'),
        run_of_five
    )


def test_stale_values_round_bad_mark():
    """An unrecognized `mark` value raises a ValueError."""
    data = pd.Series(1, index=range(1, 10))
    with pytest.raises(ValueError):
        gaps.stale_values_round(data, mark='bad')


def test_stale_values_round_mark():
    """Each `mark` option flags the expected part of a stale run."""
    data = pd.Series(1, index=range(0, 10))

    # 'all': every point of the constant series is flagged.
    all_marked = pd.Series(True, index=range(0, 10))
    assert_series_equal(
        all_marked,
        gaps.stale_values_round(data, mark='all')
    )

    # 'tail' (also the default): everything except the first point.
    tail_marked = all_marked.copy()
    tail_marked.iloc[0] = False
    assert_series_equal(
        tail_marked,
        gaps.stale_values_round(data)
    )
    assert_series_equal(
        tail_marked,
        gaps.stale_values_round(data, mark='tail')
    )

    # 'end' with window=3: the first two points are left unflagged.
    end_marked = tail_marked.copy()
    end_marked.iloc[1] = False
    assert_series_equal(
        end_marked,
        gaps.stale_values_round(data, window=3, mark='end')
    )


def test_stale_values_round_smaller_window():
    """A smaller window also flags shorter runs of repeated values."""
    data = pd.Series(
        [1, 2, 2, 2, 2, 3, 3, 4, 4, 4, 5, 6]
    )
    runs_of_three_or_more = (data == 2) | (data == 4)
    assert_series_equal(
        gaps.stale_values_round(data, window=3, mark='all'),
        runs_of_three_or_more
    )