Skip to content

Commit

Permalink
Merge 673837a into 18bdc1c
Browse files Browse the repository at this point in the history
  • Loading branch information
wfvining committed Jun 9, 2020
2 parents 18bdc1c + 673837a commit a54a2cb
Show file tree
Hide file tree
Showing 3 changed files with 158 additions and 0 deletions.
16 changes: 16 additions & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,23 @@ Identify gaps in the data.
:toctree: generated/

quality.gaps.interpolation_diff

Data sometimes contains sequences of values that are "stale" or
"stuck." These are contiguous spans of data where the value does not
change within the precision given. The functions below
can be used to detect stale values.

.. note::

If the data has been altered in some way (e.g. temperature that has
been rounded to an integer value) before being passed to these
functions, you may see unexpectedly large amounts of stale data.

.. autosummary::
:toctree: generated/

quality.gaps.stale_values_diff
quality.gaps.stale_values_round

The following functions identify days with incomplete data.

Expand Down
51 changes: 51 additions & 0 deletions pvanalytics/quality/gaps.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,57 @@ def stale_values_diff(x, window=6, rtol=1e-5, atol=1e-8, mark='tail'):
return _mark(flags, window, mark)


def stale_values_round(x, decimals=3, window=6, mark='tail'):
    """Identify stale values by rounding.

    A value is considered stale if it is part of a sequence of length
    `window` of values that are identical when rounded to `decimals`
    decimal places.

    Parameters
    ----------
    x : Series
        Data to be processed.
    decimals : int, default 3
        Number of decimal places to round to.
    window : int, default 6
        Number of consecutive identical values for a data point to be
        considered stale.
    mark : str, default 'tail'
        How much of the window to mark ``True`` when a sequence of
        stale values is detected. Can be one of 'tail', 'end', or 'all'.

        - If 'tail' (the default) then every point in the window
          *except* the first point is marked ``True``.
        - If 'end' then only the endpoints of the window are marked
          ``True``. The first `window - 1` values in a stale sequence
          are marked ``False``.
        - If 'all' then every point in the window *including* the
          first point is marked ``True``.

    Returns
    -------
    Series
        True for each value that is part of a stale sequence of data.

    Raises
    ------
    ValueError
        If `mark` is not one of 'tail', 'end', or 'all'.

    Notes
    -----
    Based on code from the pvfleets_qa_analysis project. Copyright
    (c) 2020 Alliance for Sustainable Energy, LLC.

    """
    rounded_diff = x.round(decimals=decimals).diff()
    # 1.0 where a value equals its predecessor after rounding, else 0.0.
    # NaN differences (the first element, or gaps in the data) compare
    # unequal to zero, so they can never contribute to a stale run.
    unchanged = rounded_diff.eq(0).astype(float)
    # `window` identical values produce `window - 1` consecutive zero
    # differences; a rolling sum finds the last point of each such run
    # without calling back into Python for every window.
    endpoints = unchanged.rolling(window=window - 1).sum() == window - 1
    return _mark(endpoints, window, mark)


def interpolation_diff(x, window=6, rtol=1e-5, atol=1e-8, mark='tail'):
"""Identify sequences which appear to be linear.
Expand Down
91 changes: 91 additions & 0 deletions pvanalytics/tests/quality/test_gaps.py
Original file line number Diff line number Diff line change
Expand Up @@ -651,3 +651,94 @@ def test_complete():
pd.Series(True, index=data.index),
gaps.complete(data, minimum_completeness=0.2)
)


def test_stale_values_round_no_stale():
    """A strictly increasing series contains no stale values."""
    increasing = pd.Series(np.linspace(0, 10))
    flags = gaps.stale_values_round(increasing, mark='all')
    assert not flags.any()


def test_stale_values_round_all_same():
    """A constant series is stale at every point when marking 'all'."""
    constant = pd.Series(1, index=range(0, 10))
    flags = gaps.stale_values_round(constant, mark='all')
    assert flags.all()


def test_stale_values_round_noisy():
    """Values that agree once rounded to 3 decimals are all stale."""
    noisy = pd.Series([
        1.555, 1.5551, 1.5549, 1.555, 1.555, 1.5548, 1.5553,
    ])
    flags = gaps.stale_values_round(noisy, decimals=3, mark='all')
    assert flags.all()


def test_stale_values_round_span_in_middle():
    """Only the repeated span surrounded by changing data is flagged."""
    data = pd.Series(
        [1.0, 1.1, 1.2, 1.5, 1.5, 1.5, 1.5, 1.9, 2.0, 2.2]
    )
    expected = pd.Series(
        [False, False, False,
         True, True, True, True,
         False, False, False],
        dtype='bool'
    )
    actual = gaps.stale_values_round(data, window=4, mark='all')
    assert_series_equal(actual, expected)


def test_stale_values_larger_window():
    """A larger window drops runs that are shorter than the window."""
    data = pd.Series(
        [1, 2, 2, 2, 2, 3, 4, 4, 4, 4, 4, 6]
    )
    # With window=4 both the run of four 2s and the run of five 4s count.
    with_window_4 = gaps.stale_values_round(data, window=4, mark='all')
    assert_series_equal(with_window_4, data.isin([2, 4]))
    # With window=5 only the run of five 4s is long enough.
    with_window_5 = gaps.stale_values_round(data, window=5, mark='all')
    assert_series_equal(with_window_5, data == 4)


def test_stale_values_round_bad_mark():
    """An unrecognized `mark` value raises a ValueError."""
    constant = pd.Series(1, index=range(1, 10))
    with pytest.raises(ValueError):
        gaps.stale_values_round(constant, mark='bad')


def test_stale_values_round_mark():
    """Each supported `mark` option flags the expected part of a run."""
    data = pd.Series(1, index=range(0, 10))
    expected = pd.Series(True, index=range(0, 10))

    # 'all': every point of the constant series is flagged.
    assert_series_equal(
        expected,
        gaps.stale_values_round(data, mark='all')
    )

    # 'tail' (also the default): the first point is left unflagged.
    expected.iloc[0] = False
    for kwargs in ({}, {'mark': 'tail'}):
        assert_series_equal(
            expected,
            gaps.stale_values_round(data, **kwargs)
        )

    # 'end' with window=3: the first two points are left unflagged.
    expected.iloc[1] = False
    assert_series_equal(
        expected,
        gaps.stale_values_round(data, window=3, mark='end')
    )


def test_stale_values_round_smaller_window():
    """A smaller window picks up shorter runs of repeated values."""
    data = pd.Series(
        [1, 2, 2, 2, 2, 3, 3, 4, 4, 4, 5, 6]
    )
    # The pair of 3s is shorter than the window, so it is not flagged.
    expected = data.isin([2, 4])
    actual = gaps.stale_values_round(data, window=3, mark='all')
    assert_series_equal(actual, expected)

0 comments on commit a54a2cb

Please sign in to comment.