Skip to content

Commit

Permalink
Merge 7c56a05 into 18bdc1c
Browse files Browse the repository at this point in the history
  • Loading branch information
wfvining committed Jun 9, 2020
2 parents 18bdc1c + 7c56a05 commit fa19732
Show file tree
Hide file tree
Showing 3 changed files with 163 additions and 3 deletions.
16 changes: 16 additions & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,23 @@ Identify gaps in the data.
:toctree: generated/

quality.gaps.interpolation_diff

Data sometimes contains sequences of values that are "stale" or
"stuck." These are contiguous spans of data where the value does not
change within the precision given. The functions below
can be used to detect stale values.

.. note::

If the data has been altered in some way (e.g. temperature that has
been rounded to an integer value) before being passed to these
functions you may see unexpectedly large amounts of stale data.

.. autosummary::
:toctree: generated/

quality.gaps.stale_values_diff
quality.gaps.stale_values_round

The following functions identify days with incomplete data.

Expand Down
59 changes: 56 additions & 3 deletions pvanalytics/quality/gaps.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ def stale_values_diff(x, window=6, rtol=1e-5, atol=1e-8, mark='tail'):
absolute tolerance for detecting a change in data values
mark : str, default 'tail'
How much of the window to mark ``True`` when a sequence of
stale values is detected. Can be of 'tail', 'end', or 'all'.
stale values is detected. Can be one of 'tail', 'end', or
'all'.
- If 'tail' (the default) then every point in the window
*except* the first point is marked ``True``.
Expand Down Expand Up @@ -125,6 +126,58 @@ def stale_values_diff(x, window=6, rtol=1e-5, atol=1e-8, mark='tail'):
return _mark(flags, window, mark)


def stale_values_round(x, decimals=3, window=6, mark='tail'):
    """Identify stale values by rounding.

    A value is considered stale if it is part of a sequence of length
    `window` of values that are identical when rounded to `decimals`
    decimal places.

    Parameters
    ----------
    x : Series
        Data to be processed.
    decimals : int, default 3
        Number of decimal places to round to.
    window : int, default 6
        Number of consecutive identical values for a data point to be
        considered stale.
    mark : str, default 'tail'
        How much of the window to mark ``True`` when a sequence of
        stale values is detected. Can be one of 'tail', 'end', or
        'all'.

        - If 'tail' (the default) then every point in the window
          *except* the first point is marked ``True``.
        - If 'end' then only the endpoints of the window are marked
          ``True``. The first `window - 1` values in a stale
          sequence are marked ``False``.
        - If 'all' then every point in the window *including* the
          first point is marked ``True``.

    Returns
    -------
    Series
        True for each value that is part of a stale sequence of data.

    Raises
    ------
    ValueError
        If `mark` is not one of 'tail', 'end', or 'all'.

    Notes
    -----
    Based on code from the pvfleets_qa_analysis project. Copyright
    (c) 2020 Alliance for Sustainable Energy, LLC.

    """
    # `window` identical rounded values appear as `window - 1`
    # consecutive zeros in the first difference of the rounded data.
    span = window - 1
    rounded_diff = x.round(decimals=decimals).diff()
    # raw=True passes each window to the callable as a numpy array
    # rather than building a Series per window; the result is the same
    # but the rolling apply is considerably cheaper.
    endpoints = rounded_diff.rolling(window=span).apply(
        lambda xs: len(xs[xs == 0]) == span,
        raw=True
    ).fillna(False).astype(bool)
    # Positions where the rolling result is NaN were mapped to False
    # above, i.e. they are treated as not stale.
    return _mark(endpoints, window, mark)


def interpolation_diff(x, window=6, rtol=1e-5, atol=1e-8, mark='tail'):
"""Identify sequences which appear to be linear.
Expand All @@ -148,8 +201,8 @@ def interpolation_diff(x, window=6, rtol=1e-5, atol=1e-8, mark='tail'):
absolute tolerance for detecting a change in first difference
mark : str, default 'tail'
How much of the window to mark ``True`` when a sequence of
interpolated values is detected. Can be 'tail', 'end', or
'all'.
interpolated values is detected. Can be one of 'tail', 'end',
or 'all'.
- If 'tail' (the default) then every point in the window
*except* the first point is marked ``True``.
Expand Down
91 changes: 91 additions & 0 deletions pvanalytics/tests/quality/test_gaps.py
Original file line number Diff line number Diff line change
Expand Up @@ -651,3 +651,94 @@ def test_complete():
pd.Series(True, index=data.index),
gaps.complete(data, minimum_completeness=0.2)
)


def test_stale_values_round_no_stale():
    """A monotonically increasing series contains no stale values."""
    increasing = pd.Series(np.linspace(0, 10))
    flags = gaps.stale_values_round(increasing, mark='all')
    assert not flags.any()


def test_stale_values_round_all_same():
    """A constant series is flagged as stale at every position."""
    constant = pd.Series(1, index=range(0, 10))
    flags = gaps.stale_values_round(constant, mark='all')
    assert flags.all()


def test_stale_values_round_noisy():
    """Values equal to within +/- 0.0005 are all stale at 3 decimals."""
    samples = [1.555, 1.5551, 1.5549, 1.555, 1.555, 1.5548, 1.5553]
    noisy = pd.Series(samples)
    flags = gaps.stale_values_round(noisy, decimals=3, mark='all')
    assert flags.all()


def test_stale_values_round_span_in_middle():
    """Only the repeated span surrounded by changing data is flagged."""
    data = pd.Series([1.0, 1.1, 1.2, 1.5, 1.5, 1.5, 1.5, 1.9, 2.0, 2.2])
    # Three changing values, four repeated values, three changing values.
    expected = pd.Series(
        [False] * 3 + [True] * 4 + [False] * 3,
        dtype='bool'
    )
    actual = gaps.stale_values_round(data, window=4, mark='all')
    assert_series_equal(actual, expected)


def test_stale_values_larger_window():
    """A larger window no longer flags the shorter runs of repeats."""
    data = pd.Series([1, 2, 2, 2, 2, 3, 4, 4, 4, 4, 4, 6])

    # The run of four 2s and the run of five 4s both fill a window of 4.
    flagged_with_4 = gaps.stale_values_round(data, window=4, mark='all')
    assert_series_equal(flagged_with_4, data.isin([2, 4]))

    # Only the run of five 4s is long enough for a window of 5.
    flagged_with_5 = gaps.stale_values_round(data, window=5, mark='all')
    assert_series_equal(flagged_with_5, data.eq(4))


def test_stale_values_round_bad_mark():
    """An unrecognized value for `mark` raises a ValueError."""
    constant = pd.Series(1, index=range(1, 10))
    with pytest.raises(ValueError):
        gaps.stale_values_round(constant, mark='bad')


def test_stale_values_round_mark():
    """Each accepted value of `mark` flags the expected positions."""
    data = pd.Series(1, index=range(0, 10))

    # 'all': every point of the constant series is flagged.
    every_point = pd.Series(True, index=range(0, 10))
    assert_series_equal(
        every_point,
        gaps.stale_values_round(data, mark='all')
    )

    # 'tail': the first point is left unflagged. The same result is
    # expected when `mark` is omitted.
    all_but_first = every_point.copy()
    all_but_first.iloc[0] = False
    assert_series_equal(all_but_first, gaps.stale_values_round(data))
    assert_series_equal(
        all_but_first,
        gaps.stale_values_round(data, mark='tail')
    )

    # 'end' with window=3: the first two points are left unflagged.
    from_third = all_but_first.copy()
    from_third.iloc[1] = False
    assert_series_equal(
        from_third,
        gaps.stale_values_round(data, window=3, mark='end')
    )


def test_stale_values_round_smaller_window():
    """A smaller window also flags the shorter runs of repeats."""
    data = pd.Series([1, 2, 2, 2, 2, 3, 3, 4, 4, 4, 5, 6])
    # The pair of 3s is too short for a window of 3; the 2s and 4s are not.
    expected = data.isin([2, 4])
    actual = gaps.stale_values_round(data, window=3, mark='all')
    assert_series_equal(actual, expected)

0 comments on commit fa19732

Please sign in to comment.