From 681abd164a1dede7a85cb9f06215c6ff753272e6 Mon Sep 17 00:00:00 2001 From: Will Vining Date: Tue, 26 May 2020 12:36:26 -0600 Subject: [PATCH 1/4] Optionally label every interpolated/stale point (not just endpoints) Adds kwarg 'label_all' and _backfill_window function. If label_all is True then every point in a window that appears interpolated or stale is marked 'True'. If label_all is False then only the endpoint of the window is marked 'True'. --- pvanalytics/quality/gaps.py | 27 ++++++++++++++++++++++++-- pvanalytics/tests/quality/test_gaps.py | 22 +++++++++++++++++++++ 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/pvanalytics/quality/gaps.py b/pvanalytics/quality/gaps.py index a08bf03c..5a57c5b2 100644 --- a/pvanalytics/quality/gaps.py +++ b/pvanalytics/quality/gaps.py @@ -35,7 +35,20 @@ def _all_close_to_first(x, rtol=1e-5, atol=1e-8): return np.allclose(a=x, b=x[0], rtol=rtol, atol=atol) -def stale_values_diff(x, window=3, rtol=1e-5, atol=1e-8): +def _backfill_window(endpoints, window): + # propagate Trues in `endpoints` back `window` periods. This + # makes Trues fill the entire window, rather than just marking the + # endpoints of the window. + # + # `endpoints` must be the output of Series.rolling with `label='right'` + flags = endpoints + while window > 0: + window = window - 1 + flags = flags | endpoints.shift(-window).fillna(False) + return flags + + +def stale_values_diff(x, window=3, rtol=1e-5, atol=1e-8, label_all=False): """Identify stale values in the data. For a window of length N, the last value (index N-1) is considered @@ -53,6 +66,9 @@ def stale_values_diff(x, window=3, rtol=1e-5, atol=1e-8): relative tolerance for detecting a change in data values atol : float, default 1e-8 absolute tolerance for detecting a change in data values + label_all : bool, default False + Whether to label the full window. If False, then just the + endpoints of the window are labeled. 
Parameters rtol and atol have the same meaning as in numpy.allclose @@ -84,10 +100,12 @@ def stale_values_diff(x, window=3, rtol=1e-5, atol=1e-8): raw=True, kwargs={'rtol': rtol, 'atol': atol} ).fillna(False).astype(bool) + if label_all: + return _backfill_window(flags, window) return flags -def interpolation_diff(x, window=3, rtol=1e-5, atol=1e-8): +def interpolation_diff(x, window=3, rtol=1e-5, atol=1e-8, label_all=False): """Identify sequences which appear to be linear. Sequences are linear if the first difference appears to be @@ -105,6 +123,9 @@ def interpolation_diff(x, window=3, rtol=1e-5, atol=1e-8): tolerance relative to max(abs(x.diff()) for detecting a change atol : float, default 1e-8 absolute tolerance for detecting a change in first difference + label_all : bool, default False + Whether to label all values in the window. If False only the + endpoints of the window are labeled. Returns ------- @@ -135,6 +156,8 @@ def interpolation_diff(x, window=3, rtol=1e-5, atol=1e-8): rtol=rtol, atol=atol ) + if label_all: + return _backfill_window(flags, window) return flags diff --git a/pvanalytics/tests/quality/test_gaps.py b/pvanalytics/tests/quality/test_gaps.py index c4ce6452..036225c6 100644 --- a/pvanalytics/tests/quality/test_gaps.py +++ b/pvanalytics/tests/quality/test_gaps.py @@ -118,6 +118,17 @@ def test_stale_values_diff_raises_error(stale_data): gaps.stale_values_diff(stale_data, window=1) +def test_stale_values_diff_label_all(stale_data): + """When label_all is True the full window is marked stale""" + assert_series_equal( + pd.Series([False, True, True, True, True, + True, True, True, False, False]), + gaps.stale_values_diff( + stale_data, window=4, label_all=True + ) + ) + + @pytest.fixture def interpolated_data(): """A series that contains linear interpolation. 
@@ -136,6 +147,17 @@ def interpolated_data(): return pd.Series(data=data) +def test_interpolation_diff_label_all(interpolated_data): + """When label_all is True the full window is marked interpoated""" + assert_series_equal( + gaps.interpolation_diff(interpolated_data, window=3, label_all=True), + pd.Series([False, False, False, False, False, + True, True, True, False, False, + False, True, True, True, True, True, + False]) + ) + + def test_interpolation_diff(interpolated_data): """Interpolation is detected correclty. From 11b944cbf72229cf5f692db5b0a9e1e74c7924e0 Mon Sep 17 00:00:00 2001 From: Will Vining Date: Wed, 27 May 2020 14:27:32 -0600 Subject: [PATCH 2/4] Clean up documentation Move note about meaning of rtol and atol out of the parameters block. --- pvanalytics/quality/gaps.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pvanalytics/quality/gaps.py b/pvanalytics/quality/gaps.py index 5a57c5b2..823285ad 100644 --- a/pvanalytics/quality/gaps.py +++ b/pvanalytics/quality/gaps.py @@ -55,6 +55,9 @@ def stale_values_diff(x, window=3, rtol=1e-5, atol=1e-8, label_all=False): stale if all values in the window are close to the first value (index 0). + Parameters `rtol` and `atol` have the same meaning as in + :py:func:`numpy.allclose`. + Parameters ---------- x : Series @@ -70,9 +73,6 @@ def stale_values_diff(x, window=3, rtol=1e-5, atol=1e-8, label_all=False): Whether to label the full window. If False, then just the endpoints of the window are labeled. - Parameters rtol and atol have the same meaning as in - numpy.allclose - Returns ------- Series @@ -112,6 +112,9 @@ def interpolation_diff(x, window=3, rtol=1e-5, atol=1e-8, label_all=False): constant. For a window of length N, the last value (index N-1) is flagged if all values in the window appear to be a line segment. + Parameters `rtol` and `atol` have the same meaning as in + :py:func:`numpy.allclose`. 
+ Parameters ---------- x : Series From 9a412ed36b24955db2d13a96774c6acf3d9b37aa Mon Sep 17 00:00:00 2001 From: Will Vining Date: Thu, 28 May 2020 13:36:34 -0600 Subject: [PATCH 3/4] Improve wording in documentation & comments Co-authored-by: Cliff Hansen --- pvanalytics/quality/gaps.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pvanalytics/quality/gaps.py b/pvanalytics/quality/gaps.py index 823285ad..8cf789e4 100644 --- a/pvanalytics/quality/gaps.py +++ b/pvanalytics/quality/gaps.py @@ -38,7 +38,7 @@ def _all_close_to_first(x, rtol=1e-5, atol=1e-8): def _backfill_window(endpoints, window): # propagate Trues in `endpoints` back `window` periods. This # makes Trues fill the entire window, rather than just marking the - # endpoints of the window. + # right endpoint of each window. # # `endpoints` must be the output of Series.rolling with `label='right'` flags = endpoints @@ -70,8 +70,8 @@ def stale_values_diff(x, window=3, rtol=1e-5, atol=1e-8, label_all=False): atol : float, default 1e-8 absolute tolerance for detecting a change in data values label_all : bool, default False - Whether to label the full window. If False, then just the - endpoints of the window are labeled. + Whether to label the full window. If False, then only the right + endpoint of the window is labeled. Returns ------- @@ -127,8 +127,8 @@ def interpolation_diff(x, window=3, rtol=1e-5, atol=1e-8, label_all=False): atol : float, default 1e-8 absolute tolerance for detecting a change in first difference label_all : bool, default False - Whether to label all values in the window. If False only the - endpoints of the window are labeled. + Whether to label all values in the window. If False, then only the + right endpoint of the window is labeled. 
Returns ------- From 6d8cffeb575464835b401f9b448ff3bf44cc6fb9 Mon Sep 17 00:00:00 2001 From: Will Vining Date: Thu, 28 May 2020 14:13:05 -0600 Subject: [PATCH 4/4] make description of label_all parameter consistent Use same wording for both stale_values_diff and interpolation_diff --- pvanalytics/quality/gaps.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pvanalytics/quality/gaps.py b/pvanalytics/quality/gaps.py index 8cf789e4..61d78e97 100644 --- a/pvanalytics/quality/gaps.py +++ b/pvanalytics/quality/gaps.py @@ -70,8 +70,8 @@ def stale_values_diff(x, window=3, rtol=1e-5, atol=1e-8, label_all=False): atol : float, default 1e-8 absolute tolerance for detecting a change in data values label_all : bool, default False - Whether to label the full window. If False, then only the right - endpoint of the window is labeled. + Whether to label all values in the window. If False, then only + the right endpoint of the window is labeled. Returns -------