Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 31 additions & 5 deletions pvanalytics/quality/gaps.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,29 @@ def _all_close_to_first(x, rtol=1e-5, atol=1e-8):
return np.allclose(a=x, b=x[0], rtol=rtol, atol=atol)


def stale_values_diff(x, window=3, rtol=1e-5, atol=1e-8):
def _backfill_window(endpoints, window):
    """Propagate each True in `endpoints` back over its whole window.

    A right-labelled rolling operation marks only the last element of
    each window. This spreads every True back so that all `window`
    positions ending at a flagged endpoint are True as well.

    Parameters
    ----------
    endpoints : Series
        Boolean flags. Must be the output of Series.rolling with
        ``label='right'``.
    window : int
        Number of positions in each window. For values of 1 or less
        there is nothing to propagate and `endpoints` is returned
        as is.

    Returns
    -------
    Series
        Boolean flags on the same index as `endpoints`. The input
        series is not modified.
    """
    flags = endpoints
    # A shift of 0 would only OR `endpoints` with itself, so the
    # offsets start at 1 and stop at window - 1.
    for offset in range(1, window):
        # fill_value=False keeps the shifted series boolean. Shifting
        # without it introduces NaN (object dtype) at the tail, which
        # then has to be filled and implicitly downcast again.
        flags = flags | endpoints.shift(-offset, fill_value=False)
    return flags


def stale_values_diff(x, window=3, rtol=1e-5, atol=1e-8, label_all=False):
"""Identify stale values in the data.

For a window of length N, the last value (index N-1) is considered
stale if all values in the window are close to the first value
(index 0).

Parameters `rtol` and `atol` have the same meaning as in
:py:func:`numpy.allclose`.

Parameters
----------
x : Series
Expand All @@ -53,9 +69,9 @@ def stale_values_diff(x, window=3, rtol=1e-5, atol=1e-8):
relative tolerance for detecting a change in data values
atol : float, default 1e-8
absolute tolerance for detecting a change in data values

Parameters rtol and atol have the same meaning as in
numpy.allclose
label_all : bool, default False
Whether to label all values in the window. If False, then only
the right endpoint of the window is labeled.

Returns
-------
Expand Down Expand Up @@ -84,16 +100,21 @@ def stale_values_diff(x, window=3, rtol=1e-5, atol=1e-8):
raw=True,
kwargs={'rtol': rtol, 'atol': atol}
).fillna(False).astype(bool)
if label_all:
return _backfill_window(flags, window)
return flags


def interpolation_diff(x, window=3, rtol=1e-5, atol=1e-8):
def interpolation_diff(x, window=3, rtol=1e-5, atol=1e-8, label_all=False):
"""Identify sequences which appear to be linear.

Sequences are linear if the first difference appears to be
constant. For a window of length N, the last value (index N-1) is
flagged if all values in the window appear to be a line segment.

Parameters `rtol` and `atol` have the same meaning as in
:py:func:`numpy.allclose`.

Parameters
----------
x : Series
Expand All @@ -105,6 +126,9 @@ def interpolation_diff(x, window=3, rtol=1e-5, atol=1e-8):
tolerance relative to max(abs(x.diff())) for detecting a change
atol : float, default 1e-8
absolute tolerance for detecting a change in first difference
label_all : bool, default False
Whether to label all values in the window. If False, then only the
right endpoint of the window is labeled.

Returns
-------
Expand Down Expand Up @@ -135,6 +159,8 @@ def interpolation_diff(x, window=3, rtol=1e-5, atol=1e-8):
rtol=rtol,
atol=atol
)
if label_all:
return _backfill_window(flags, window)
return flags


Expand Down
22 changes: 22 additions & 0 deletions pvanalytics/tests/quality/test_gaps.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,17 @@ def test_stale_values_diff_raises_error(stale_data):
gaps.stale_values_diff(stale_data, window=1)


def test_stale_values_diff_label_all(stale_data):
    """With label_all=True every point of a stale window is flagged."""
    expected = pd.Series([
        False,
        True, True, True, True, True, True, True,
        False, False,
    ])
    flagged = gaps.stale_values_diff(stale_data, window=4, label_all=True)
    assert_series_equal(expected, flagged)


@pytest.fixture
def interpolated_data():
"""A series that contains linear interpolation.
Expand All @@ -136,6 +147,17 @@ def interpolated_data():
return pd.Series(data=data)


def test_interpolation_diff_label_all(interpolated_data):
    """With label_all=True the full window is marked interpolated."""
    flagged = gaps.interpolation_diff(
        interpolated_data, window=3, label_all=True
    )
    expected = pd.Series([
        False, False, False, False, False,
        True, True, True,
        False, False, False,
        True, True, True, True, True,
        False,
    ])
    assert_series_equal(flagged, expected)


def test_interpolation_diff(interpolated_data):
"""Interpolation is detected correclty.

Expand Down