From 6e2b6e3eb3ed2799395ec8590676c2e3fa390eb2 Mon Sep 17 00:00:00 2001 From: Will Vining Date: Tue, 31 Mar 2020 14:56:48 -0600 Subject: [PATCH 01/30] Function to detect and remove leading and trailing gaps Looks for sporadic/sparse data at the beginning and end of a series and removes those leading and trailing periods. --- pvanalytics/quality/gaps.py | 88 +++++++++++++ pvanalytics/tests/quality/test_gaps.py | 165 +++++++++++++++++++++++++ 2 files changed, 253 insertions(+) diff --git a/pvanalytics/quality/gaps.py b/pvanalytics/quality/gaps.py index 9fc69150..b5c00aa6 100644 --- a/pvanalytics/quality/gaps.py +++ b/pvanalytics/quality/gaps.py @@ -5,6 +5,7 @@ """ import numpy as np +import pandas as pd def _all_close_to_first(x, rtol=1e-5, atol=1e-8): @@ -135,3 +136,90 @@ def interpolation_diff(x, window=3, rtol=1e-5, atol=1e-8): atol=atol ) return flags + + +def valid_between(series, days=10, minimum_hours=7.75, freq=None): + """Get the start and end of valid data. + + The start and end dates returned by this function can be used to + remove large periods of missing data from the begining and end of + the series. The valid data begins when there are `days` + consecutive days with data covering at least `minimum_hours` on + each day. Valid data ends on the last day with `days` consecutive + days with data covering at least `minimum_hours` preceeding it. + + Parameters + ---------- + series : Series + A datetime indexed series. + days : int + The minimum number of consecutive valid days for data to be + considered valid. + minimum_hours : float + The number of hours that must have valid data for a day to be + considered valid. + freq : string or None, default None + The frequency to the series. If None, then frequescy is + inferred from the index. + + Returns + ------- + start : Datetime or None + The first valid day. If there are no sufficiently long periods + of valid days then None is returned. + end : Datetime or None + The last valid day. 
None if start is None. + + """ + freq_hours = (pd.Timedelta(freq or pd.infer_freq(series.index)).seconds + / (60.0*60.0)) + daily_hours = (series.dropna().resample('D').count()*freq_hours) + good_days_preceeding = daily_hours[daily_hours >= minimum_hours].rolling( + str(days)+'D', closed='right' + ).count() + good_days_following = good_days_preceeding.shift(periods=-(days-1)) + + following_above_threshold = good_days_following[ + good_days_following >= days + ] + preceeding_above_threshold = good_days_preceeding[ + good_days_preceeding >= days + ] + + start = None + end = None + + if len(following_above_threshold) > 0: + start = following_above_threshold.index[0] + + if len(preceeding_above_threshold) > 0: + end = preceeding_above_threshold.index[-1] + + return start, end + + +def trim(series, **kwargs): + """Remove missing data from the begining and end of the dataset. + + Missing data is determined by the criteria in + :py:func:`valid_between`. + + Parameters + ---------- + series : Series + A DatatimeIndexed series + kwargs : + Any of the keyword arguments that can be passed to + :py:func:`valid_between` + + Returns + ------- + Series or None + The same series with leading and trailing `NA`s removed. 
If + there is no valid data None is returned + + """ + start, end = valid_between(series, **kwargs) + if start: + return series[start.date():end.date()] + return None diff --git a/pvanalytics/tests/quality/test_gaps.py b/pvanalytics/tests/quality/test_gaps.py index a082c03c..838c640e 100644 --- a/pvanalytics/tests/quality/test_gaps.py +++ b/pvanalytics/tests/quality/test_gaps.py @@ -1,6 +1,7 @@ """Tests for gaps quality control functions.""" import pytest import pandas as pd +import numpy as np from pandas.util.testing import assert_series_equal from pvanalytics.quality import gaps @@ -203,3 +204,167 @@ def test_interpolation_diff_raises_error(interpolated_data): """ with pytest.raises(ValueError): gaps.interpolation_diff(interpolated_data, window=2) + + +def test_valid_between_no_missing_data(): + """If there is no missing data firstlastvaliddays should return the + start and end of the series. + + """ + index = pd.date_range( + freq='15T', + start='01-01-2020', + end='08-01-2020 23:00' + ) + series = pd.Series( + data=np.full(len(index), 10), + index=index + ) + firstvalid, lastvalid = gaps.valid_between(series) + assert firstvalid.date() == pd.Timestamp('01-01-2020').date() + assert lastvalid.date() == pd.Timestamp('08-01-2020').date() + + +def test_first_day_missing_data(): + """If the first day is missing data, the first valid date should be + the second day. 
+ + """ + index = pd.date_range( + freq='15T', + start='01-01-2020', + end='08-01-2020 23:00' + ) + data = np.full(len(index), 10) + series = pd.Series(data=data, index=index) + series['01-01-2020 00:00':'01-02-2020 00:00'] = np.nan + firstvalid, lastvalid = gaps.valid_between(series) + assert firstvalid.date() == pd.Timestamp('01-02-2020').date() + assert lastvalid.date() == pd.Timestamp('08-01-2020').date() + + +def test_first_and_fifth_days_missing(): + """First valid date should be the sixth of January.""" + index = pd.date_range( + freq='15T', + start='01-01-2020', + end='08-01-2020 23:00' + ) + data = np.full(len(index), 10) + series = pd.Series(data=data, index=index) + series['01-01-2020 00:00':'01-02-2020 00:00'] = np.nan + series['01-05-2020 00:00':'01-06-2020 00:00'] = np.nan + firstvalid, lastvalid = gaps.valid_between(series) + assert firstvalid.date() == pd.Timestamp('01-06-2020').date() + assert lastvalid.date() == pd.Timestamp('08-01-2020').date() + + +def test_last_two_days_missing(): + """If the last two days of data are missing last valid day should be + July 30. + + """ + index = pd.date_range( + freq='15T', + start='01-01-2020', + end='08-01-2020 23:00' + ) + data = np.full(len(index), 10) + series = pd.Series(data=data, index=index) + series['07-31-2020 00:00':'08-01-2020 23:00'] = np.nan + firstvalid, lastvalid = gaps.valid_between(series) + assert firstvalid.date() == pd.Timestamp('01-01-2020').date() + assert lastvalid.date() == pd.Timestamp('07-30-2020').date() + + +def test_valid_between_no_data(): + """If the passed to valid_between is empty the returns (None, None).""" + index = pd.date_range( + freq='15T', + start='01-01-2020', + end='08-01-2020 23:00' + ) + series = pd.Series(index=index, data=np.full(len(index), np.nan)) + assert (None, None) == gaps.valid_between(series) + + +def test_valid_between_sparse_data(): + """Check that days with only a few hours of data aren't considered + valid. 
+ + """ + index = pd.date_range( + freq='15T', + start='01-01-2020', + end='08-01-2020 23:00' + ) + series = pd.Series(index=index, data=np.full(len(index), 2.3)) + series['01-02-2020 00:00':'01-02-2020 06:00'] = np.nan + series['01-02-2020 08:00':'01-02-2020 21:00'] = np.nan + series['07-31-2020 07:00':] = np.nan + start, end = gaps.valid_between(series) + assert start.date() == pd.Timestamp('01-03-2020').date() + assert end.date() == pd.Timestamp('07-30-2020').date() + + +def test_valid_between_not_enough_data(): + """Only one day of data is not ehough for any valid days.""" + index = pd.date_range( + freq='15T', + start='01-01-2020', + end='08-01-2020 23:00' + ) + series = pd.Series(index=index, dtype='float64') + series['02-23-2020 08:00':'02-24-2020 08:00'] = 1 + assert (None, None) == gaps.valid_between(series) + + +def test_valid_between_one_day(): + """Works when there is exactly the minimum number of consecutive + days with data. + + """ + index = pd.date_range( + freq='15T', + start='01-01-2020', + end='08-01-2020 23:00' + ) + series = pd.Series(index=index, dtype='float64') + series['05-05-2020'] = 2 + start, end = gaps.valid_between(series, days=1) + assert start.date() == pd.Timestamp('05-05-2020').date() + assert end.date() == pd.Timestamp('05-05-2020').date() + + +def test_valid_between_with_gaps_in_middle(): + """When there are gaps in the data longer than `days` valid between + should include those gaps, as long as there are `days` consecutive + days with enough data some time after the gap. 
+ + """ + index = pd.date_range( + freq='15T', + start='01-01-2020', + end='08-01-2020 23:00' + ) + series = pd.Series(index=index, data=np.full(len(index), 1)) + series['03-05-2020':'03-25-2020'] = np.nan + start, end = gaps.valid_between(series, days=5) + assert start.date() == index[0].date() + assert end.date() == index[-1].date() + + +def test_trim(): + index = pd.date_range( + freq='15T', + start='01-01-2020', + end='08-01-2020 23:00' + ) + series = pd.Series(index=index, data=np.full(len(index), 1)) + series['01-02-2020':'01-07-2020 13:00'] = np.nan + series['01-10-2020':'01-11-2020'] = np.nan + valid_series = gaps.trim(series, days=3) + assert_series_equal( + valid_series, + series['01-07-2020':'08-01-2020 00:00'] + ) From ef9554f40eba6c1b9a434242359ef1b512647f06 Mon Sep 17 00:00:00 2001 From: Will Vining Date: Thu, 2 Apr 2020 08:08:31 -0600 Subject: [PATCH 02/30] Return boolean mask instead of trimming the series. To keep consistent with the rest of the quality functions `gaps.trim` should return a boolean mask with False for the entries that are being trimmed rather than a slice of the series. --- pvanalytics/quality/gaps.py | 19 +++++++++++-------- pvanalytics/tests/quality/test_gaps.py | 3 +-- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/pvanalytics/quality/gaps.py b/pvanalytics/quality/gaps.py index b5c00aa6..12b9e479 100644 --- a/pvanalytics/quality/gaps.py +++ b/pvanalytics/quality/gaps.py @@ -199,7 +199,7 @@ def valid_between(series, days=10, minimum_hours=7.75, freq=None): def trim(series, **kwargs): - """Remove missing data from the begining and end of the dataset. + """Mask out missing data from the begining and end of the data. Missing data is determined by the criteria in :py:func:`valid_between`. @@ -207,19 +207,22 @@ def trim(series, **kwargs): Parameters ---------- series : Series - A DatatimeIndexed series + A DatatimeIndexed series. 
kwargs : Any of the keyword arguments that can be passed to - :py:func:`valid_between` + :py:func:`valid_between`. Returns ------- - Series or None - The same series with leading and trailing `NA`s removed. If - there is no valid data None is returned + Series + A series of booleans whith the same index as `series` with False + up to the first good day, True from the first to the last good + day, and False from the last good day to the end. """ start, end = valid_between(series, **kwargs) + s = pd.Series(index=series.index, dtype='bool') + s.loc[:] = False if start: - return series[start.date():end.date()] - return None + s.loc[start.date():end.date()] = True + return s diff --git a/pvanalytics/tests/quality/test_gaps.py b/pvanalytics/tests/quality/test_gaps.py index 838c640e..ae77a7ed 100644 --- a/pvanalytics/tests/quality/test_gaps.py +++ b/pvanalytics/tests/quality/test_gaps.py @@ -363,8 +363,7 @@ def test_trim(): series = pd.Series(index=index, data=np.full(len(index), 1)) series['01-02-2020':'01-07-2020 13:00'] = np.nan series['01-10-2020':'01-11-2020'] = np.nan - valid_series = gaps.trim(series, days=3) assert_series_equal( - valid_series, + series[gaps.trim(series, days=3)], series['01-07-2020':'08-01-2020 00:00'] ) From 0a23469a5805b217dd0d2acf4679ab679b8d1a60 Mon Sep 17 00:00:00 2001 From: Will Vining Date: Thu, 2 Apr 2020 08:49:17 -0600 Subject: [PATCH 03/30] Trimming a series with no valid days returns all False Did not previously test this important edge-case --- pvanalytics/tests/quality/test_gaps.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pvanalytics/tests/quality/test_gaps.py b/pvanalytics/tests/quality/test_gaps.py index ae77a7ed..a6e4ac56 100644 --- a/pvanalytics/tests/quality/test_gaps.py +++ b/pvanalytics/tests/quality/test_gaps.py @@ -355,6 +355,10 @@ def test_valid_between_with_gaps_in_middle(): def test_trim(): + """gaps.trim() should return a boolean mask that selects only the good + data in the middle of a series. 
+ + """ index = pd.date_range( freq='15T', start='01-01-2020', @@ -367,3 +371,15 @@ def test_trim(): series[gaps.trim(series, days=3)], series['01-07-2020':'08-01-2020 00:00'] ) + + +def test_trim_empty(): + """gaps.trim() returns all False for series with no valid days.""" + index = pd.date_range( + freq='15T', + start='01-01-2020', + end='08-01-2020 23:00' + ) + series = pd.Series(index=index, dtype='float64') + series.iloc[::(24*60)] = 1 + assert (~gaps.trim(series, days=3)).all() From 0cc6335a18a6f9f6364f19371f6274859918b2aa Mon Sep 17 00:00:00 2001 From: Will Vining Date: Thu, 2 Apr 2020 08:57:04 -0600 Subject: [PATCH 04/30] Add gaps.valid_between and gaps.trim to API documentation --- docs/api.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docs/api.rst b/docs/api.rst index ae8f4efa..a1cc73f8 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -53,6 +53,15 @@ Identify gaps in the data. quality.gaps.interpolation_diff quality.gaps.stale_values_diff +Many data sets may have leading and trailing periods with sparodic or +no data. The following functions can be used to remove those periods. + +.. autosummary:: + :toctree: generated/ + + quality.gaps.valid_between + quality.gaps.trim + Outliers -------- From f95e83fbb3283a13fc8a8323ce3f0d2daaa2a0ba Mon Sep 17 00:00:00 2001 From: Will Vining Date: Tue, 21 Apr 2020 13:55:33 -0600 Subject: [PATCH 05/30] Add license and attribution for pvfleets_qa_analysis --- LICENSES/PVFLEETS_QA_LICENSE | 29 +++++++++++++++++++++++++++++ pvanalytics/quality/gaps.py | 9 +++++++++ 2 files changed, 38 insertions(+) create mode 100644 LICENSES/PVFLEETS_QA_LICENSE diff --git a/LICENSES/PVFLEETS_QA_LICENSE b/LICENSES/PVFLEETS_QA_LICENSE new file mode 100644 index 00000000..797c6425 --- /dev/null +++ b/LICENSES/PVFLEETS_QA_LICENSE @@ -0,0 +1,29 @@ +Copyright (c) 2020 Alliance for Sustainable Energy, LLC. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the + distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/pvanalytics/quality/gaps.py b/pvanalytics/quality/gaps.py index 12b9e479..77e48dad 100644 --- a/pvanalytics/quality/gaps.py +++ b/pvanalytics/quality/gaps.py @@ -170,6 +170,15 @@ def valid_between(series, days=10, minimum_hours=7.75, freq=None): end : Datetime or None The last valid day. None if start is None. + Notes + ----- + This function was derived from the pvfleets_qa_analysis project, + Copyright (c) 2020 Alliance for Sustainable Energy, LLC. 
See the + file LICENSES/PVFLEETS_QA_LICENSE at the top level directory of + this distribution and at ``_ for more + information. + """ freq_hours = (pd.Timedelta(freq or pd.infer_freq(series.index)).seconds / (60.0*60.0)) From 62e74da1206f624eb514a5aba5142703d479d0df Mon Sep 17 00:00:00 2001 From: Will Vining Date: Tue, 21 Apr 2020 13:57:15 -0600 Subject: [PATCH 06/30] Fix docstring indentation and spelling for gaps.trim() --- pvanalytics/quality/gaps.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pvanalytics/quality/gaps.py b/pvanalytics/quality/gaps.py index 77e48dad..1a6b328c 100644 --- a/pvanalytics/quality/gaps.py +++ b/pvanalytics/quality/gaps.py @@ -224,9 +224,9 @@ def trim(series, **kwargs): Returns ------- Series - A series of booleans whith the same index as `series` with False - up to the first good day, True from the first to the last good - day, and False from the last good day to the end. + A series of booleans with the same index as `series` with + False up to the first good day, True from the first to the + last good day, and False from the last good day to the end. """ start, end = valid_between(series, **kwargs) From 013d2e2cf930768c850a52e7f51d62ec2e4da13f Mon Sep 17 00:00:00 2001 From: Will Vining Date: Tue, 21 Apr 2020 14:14:40 -0600 Subject: [PATCH 07/30] clarify valid data in documentation for 'valid_between' --- pvanalytics/quality/gaps.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pvanalytics/quality/gaps.py b/pvanalytics/quality/gaps.py index 1a6b328c..3130a441 100644 --- a/pvanalytics/quality/gaps.py +++ b/pvanalytics/quality/gaps.py @@ -144,9 +144,13 @@ def valid_between(series, days=10, minimum_hours=7.75, freq=None): The start and end dates returned by this function can be used to remove large periods of missing data from the begining and end of the series. 
The valid data begins when there are `days` - consecutive days with data covering at least `minimum_hours` on - each day. Valid data ends on the last day with `days` consecutive - days with data covering at least `minimum_hours` preceeding it. + consecutive days with valid data covering at least `minimum_hours` + on each day. Valid data ends on the last day with `days` + consecutive days with data covering at least `minimum_hours` + preceeding it. + + Any data point with a value other than `NaN` is considered valid + data. Parameters ---------- From d2ceb5467cf361c4fec3bdf7ea3e8d4c93a23696 Mon Sep 17 00:00:00 2001 From: Will Vining Date: Tue, 5 May 2020 15:02:47 -0600 Subject: [PATCH 08/30] Tests for function that calculates a daily completeness index Completeness is the fraction of the day that has data (a timestamp exists and its value is not NaN). --- pvanalytics/quality/gaps.py | 29 ++++++++++ pvanalytics/tests/quality/test_gaps.py | 76 ++++++++++++++++++++++++++ 2 files changed, 105 insertions(+) diff --git a/pvanalytics/quality/gaps.py b/pvanalytics/quality/gaps.py index 3130a441..391dfac3 100644 --- a/pvanalytics/quality/gaps.py +++ b/pvanalytics/quality/gaps.py @@ -211,6 +211,35 @@ def valid_between(series, days=10, minimum_hours=7.75, freq=None): return start, end +def daily_completeness(series, freq=None): + """Calculate a completeness index for each day in the data. + + The completeness for a given day is the fraction of time in the + day for which there is data (a value other than NaN). + + Parameters + ---------- + series : Series + A DatetimeIndexed series. + freq : string, default None + interval between samples in the series. If None, the frequency + is inferred using :py:func:`pandas.infer_freq`. + + Returns + ------- + Series + A series of floats indexed by day giving the completeness of + each day (fraction of hours in the day for which `series` has + data). 
+ + """ + seconds_per_sample = pd.Timedelta( + freq or pd.infer_freq(series.index) + ).seconds + daily_counts = series.resample('D').count() + return (daily_counts * seconds_per_sample) / (1440*60) + + def trim(series, **kwargs): """Mask out missing data from the begining and end of the data. diff --git a/pvanalytics/tests/quality/test_gaps.py b/pvanalytics/tests/quality/test_gaps.py index a6e4ac56..16241471 100644 --- a/pvanalytics/tests/quality/test_gaps.py +++ b/pvanalytics/tests/quality/test_gaps.py @@ -383,3 +383,79 @@ def test_trim_empty(): series = pd.Series(index=index, dtype='float64') series.iloc[::(24*60)] = 1 assert (~gaps.trim(series, days=3)).all() + + +def test_daily_completeness_all_nans(): + """A data set with all nans has completeness 0 for each day.""" + completeness = gaps.daily_completeness( + pd.Series( + np.nan, + index=pd.date_range('01/01/2020 00:00', freq='1H', periods=48), + dtype='float64' + ) + ) + assert_series_equal( + pd.Series( + 0, + index=pd.date_range(start='01/01/2020', freq='D', periods=2) + ), + completeness + ) + + +def test_daily_completeness_no_data(): + """A data set with completely missing timestamps and NaNs has + completeness 0.""" + two_days = pd.date_range(start='01/01/2020', freq='D', periods=2) + completeness = gaps.daily_completeness( + pd.Series(index=two_days, dtype='float64'), freq='15T' + ) + assert_series_equal( + pd.Series(0.0, index=two_days), + completeness + ) + + +def test_daily_completeness_incomplete_index(): + """A series with one data point per hour has 25% completeness at + 15-minute sample frequency""" + data = pd.Series( + 1, + index=pd.date_range(start='01/01/2020', freq='1H', periods=72), + ) + completeness = gaps.daily_completeness(data, freq='15T') + assert_series_equal( + pd.Series( + 0.25, + index=pd.date_range(start='01/01/2020', freq='D', periods=3) + ), + completeness + ) + + +def test_daily_completeness_complete(): + """A series with data at every point has completeness 1.0""" + data = 
pd.Series( + 1, index=pd.date_range(start='01/01/2020', freq='15T', periods=24*4*2) + ) + completeness = gaps.daily_completeness(data) + assert_series_equal( + pd.Series( + 1.0, + index=pd.date_range(start='01/01/2020', freq='D', periods=2) + ), + completeness + ) + + +def test_daily_completeness_freq_too_high(): + """If the infered freq is shorter than the passed freq an exception is + raised.""" + data = pd.Series( + 1, + index=pd.date_range(start='1/1/2020', freq='15T', periods=24*4*4) + ) + with pytest.raises(ValueError): + gaps.daily_completeness(data, freq='16T') + with pytest.raises(ValueError): + gaps.daily_completeness(data, freq='1H') From a0894100bd77870583fadaebe45e49deac76e224 Mon Sep 17 00:00:00 2001 From: Will Vining Date: Thu, 7 May 2020 08:31:38 -0600 Subject: [PATCH 09/30] Fix data types and use longer series in tests Longer series is necessary for infer_freq, it is reasonable to include a longer series here since the function under test aggregates data by day, it will not be called with very short time series. 
--- pvanalytics/tests/quality/test_gaps.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pvanalytics/tests/quality/test_gaps.py b/pvanalytics/tests/quality/test_gaps.py index 16241471..a85ad004 100644 --- a/pvanalytics/tests/quality/test_gaps.py +++ b/pvanalytics/tests/quality/test_gaps.py @@ -396,7 +396,7 @@ def test_daily_completeness_all_nans(): ) assert_series_equal( pd.Series( - 0, + 0.0, index=pd.date_range(start='01/01/2020', freq='D', periods=2) ), completeness @@ -406,12 +406,12 @@ def test_daily_completeness_all_nans(): def test_daily_completeness_no_data(): """A data set with completely missing timestamps and NaNs has completeness 0.""" - two_days = pd.date_range(start='01/01/2020', freq='D', periods=2) + four_days = pd.date_range(start='01/01/2020', freq='D', periods=4) completeness = gaps.daily_completeness( - pd.Series(index=two_days, dtype='float64'), freq='15T' + pd.Series(index=four_days, dtype='float64'), freq='15T' ) assert_series_equal( - pd.Series(0.0, index=two_days), + pd.Series(0.0, index=four_days), completeness ) From 2c5d61878ec1225d1367d5e36a7c06b282a3cd59 Mon Sep 17 00:00:00 2001 From: Will Vining Date: Thu, 7 May 2020 08:33:17 -0600 Subject: [PATCH 10/30] Raise a value error if the frequency passed to the function is bad It doesn't make sense to pass a frequency that is longer than the inferred frequency of the series. 
--- pvanalytics/quality/gaps.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/pvanalytics/quality/gaps.py b/pvanalytics/quality/gaps.py index 391dfac3..7d5eac60 100644 --- a/pvanalytics/quality/gaps.py +++ b/pvanalytics/quality/gaps.py @@ -211,6 +211,15 @@ def valid_between(series, days=10, minimum_hours=7.75, freq=None): return start, end +def _freq_to_seconds(freq): + if not freq: + return None + if freq.isalpha(): + freq = '1' + freq + delta = pd.to_timedelta(freq) + return delta.days * (1440 * 60) + delta.seconds + + def daily_completeness(series, freq=None): """Calculate a completeness index for each day in the data. @@ -232,10 +241,18 @@ def daily_completeness(series, freq=None): each day (fraction of hours in the day for which `series` has data). + Raises + ------ + ValueError + If `freq` is longer than the frequency inferred from `series`. + """ - seconds_per_sample = pd.Timedelta( - freq or pd.infer_freq(series.index) - ).seconds + inferred_seconds = _freq_to_seconds(pd.infer_freq(series.index)) + freq_seconds = _freq_to_seconds(freq) + seconds_per_sample = freq_seconds or inferred_seconds + if freq and inferred_seconds < freq_seconds: + raise ValueError("freq must be less than or equal to the" + + " frequency of the series") daily_counts = series.resample('D').count() return (daily_counts * seconds_per_sample) / (1440*60) From 324c6c48262a57ada4a02cd0c81f268ebd323013 Mon Sep 17 00:00:00 2001 From: Will Vining Date: Thu, 7 May 2020 09:58:16 -0600 Subject: [PATCH 11/30] Tests for completeness filtering function. Adds function, documentation, and initial tests covering edge cases for the threshold parameter as well as tests covering basic functionality. 
--- pvanalytics/quality/gaps.py | 28 +++++++ pvanalytics/tests/quality/test_gaps.py | 103 +++++++++++++++++++++++++ 2 files changed, 131 insertions(+) diff --git a/pvanalytics/quality/gaps.py b/pvanalytics/quality/gaps.py index 7d5eac60..bfd61d94 100644 --- a/pvanalytics/quality/gaps.py +++ b/pvanalytics/quality/gaps.py @@ -257,6 +257,34 @@ def daily_completeness(series, freq=None): return (daily_counts * seconds_per_sample) / (1440*60) +def complete(series, threshold=0.333, freq=None): + """Select only data points that are part of a day with complete data. + + Parameters + ---------- + series : Series + The data to be checked for completeness. + threshold : float, default 0.333 + Fraction of the day that must have data. + freq : str, default None + The expected frequency of the data in `series`. If none then + the frequency is inferred from the data. + + Returns + ------- + Series + A series of booleans with True for each value that is part of + a day with completeness greater than `threshold`. + + Raises + ------ + ValueError + See :py:func:`daily_completeness`. + + """ + pass + + def trim(series, **kwargs): """Mask out missing data from the begining and end of the data. 
diff --git a/pvanalytics/tests/quality/test_gaps.py b/pvanalytics/tests/quality/test_gaps.py index a85ad004..9082101f 100644 --- a/pvanalytics/tests/quality/test_gaps.py +++ b/pvanalytics/tests/quality/test_gaps.py @@ -459,3 +459,106 @@ def test_daily_completeness_freq_too_high(): gaps.daily_completeness(data, freq='16T') with pytest.raises(ValueError): gaps.daily_completeness(data, freq='1H') + + +def test_complete_threshold_zero(): + """threshold of 0 returns all True regardless of data.""" + ten_days = pd.date_range( + start='01/01/2020', freq='15T', end='1/10/2020', closed='left') + data = pd.Series(index=ten_days, dtype='float64') + assert_series_equal( + pd.Series(True, index=data.index), + gaps.complete(data, threshold=0) + ) + data[pd.date_range( + start='01/01/2020', freq='1D', end='1/10/2020', closed='left')] = 1.0 + data.dropna() + assert_series_equal( + pd.Series(True, index=data.index), + gaps.complete(data, threshold=0, freq='15T') + ) + data = pd.Series(1.0, index=ten_days) + assert_series_equal( + pd.Series(True, index=data.index), + gaps.complete(data, threshold=0) + ) + + +def test_complete_threshold_one(): + """If threshold=1 then any missing data on a day means all data for + the day is flagged False.""" + ten_days = pd.date_range( + start='01/01/2020', freq='15T', end='01/10/2020', closed='left') + data = pd.Series(index=ten_days, dtype='float64') + assert_series_equal( + pd.Series(False, index=data.index), + gaps.complete(data, threshold=1.0) + ) + data.loc[:] = 1 + assert_series_equal( + pd.Series(True, index=data.index), + gaps.complete(data, threshold=1.0) + ) + # remove one data-point per day + days = pd.date_range( + start='1/1/2020', freq='1D', end='1/10/2020', closed='left') + data.loc[days] = np.nan + assert_series_equal( + pd.Series(False, index=data.index), + gaps.complete(data, threshold=1.0) + ) + # check that dropping the NaNs still gives the same result with + # and without passing `freq`. 
(There should be enough data to infer the + # correct frequency if only one value is missing on each day.) + data.dropna() + assert_series_equal( + pd.Series(False, index=data.index), + gaps.complete(data, threshold=1.0) + ) + assert_series_equal( + gaps.complete(data, threshold=1.0), + gaps.complete(data, threshold=1.0, freq='15T') + ) + + +def test_complete(): + """Test gaps.complete with varying amounts of missing data.""" + ten_days = pd.date_range( + start='1/1/2020', freq='H', end='1/10/2020', closed='left') + data = pd.Series(index=ten_days, dtype='float64') + data.loc['1/1/2020'] = 1.0 + day_two_values = pd.date_range( + start='1/2/2020', freq='2H', end='1/3/2020', closed='left') + data.loc[day_two_values] = 2.0 + day_three_values = pd.date_range( + start='1/3/2020', freq='3H', end='1/4/2020', closed='left') + data.loc[day_three_values] = 3.0 + day_four_values = pd.date_range( + start='1/4/2020', freq='4H', end='1/5/2020', closed='left') + data.loc[day_four_values] = 4.0 + data.loc['1/5/2020':] = 5.0 + + expected = pd.Series(False, index=data.index) + expected.loc['1/1/2020'] = True + expected.loc['1/5/2020':] = True + assert_series_equal( + expected, + gaps.complete(data, threshold=1.0) + ) + + expected.loc['1/2/2020'] = True + assert_series_equal( + expected, + gaps.complete(data, threshold=0.5) + ) + + expected.loc['1/3/2020'] = True + assert_series_equal( + expected, + gaps.complete(data, threshold=0.3) + ) + + assert_series_equal( + pd.Series(True, index=data.index), + gaps.complete(data, threshold=0.2) + ) From f255b5128a450e318e2fe4db6e5834b837d6c9a5 Mon Sep 17 00:00:00 2001 From: Will Vining Date: Thu, 7 May 2020 12:51:08 -0600 Subject: [PATCH 12/30] Initial implementation of gaps.complete --- pvanalytics/quality/gaps.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pvanalytics/quality/gaps.py b/pvanalytics/quality/gaps.py index bfd61d94..d305bfc0 100644 --- a/pvanalytics/quality/gaps.py +++ b/pvanalytics/quality/gaps.py @@ 
-282,7 +282,8 @@ def complete(series, threshold=0.333, freq=None): See :py:func:`daily_completeness`. """ - pass + completeness = daily_completeness(series, freq) + return (completeness >= threshold).reindex(series.index, method='pad') def trim(series, **kwargs): From 0dbc14b5f11f0167faaa892561333d26827b5030 Mon Sep 17 00:00:00 2001 From: Will Vining Date: Thu, 7 May 2020 14:05:54 -0600 Subject: [PATCH 13/30] Improve documentation. Adds clarification to description of gaps.daily_completeness() and gaps.complete(). --- pvanalytics/quality/gaps.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/pvanalytics/quality/gaps.py b/pvanalytics/quality/gaps.py index d305bfc0..79672802 100644 --- a/pvanalytics/quality/gaps.py +++ b/pvanalytics/quality/gaps.py @@ -224,15 +224,21 @@ def daily_completeness(series, freq=None): """Calculate a completeness index for each day in the data. The completeness for a given day is the fraction of time in the - day for which there is data (a value other than NaN). + day for which there is data (a value other than NaN). The amount + of time that a value is attributed is equal to the timestamp + spacing in `series` or `freq` if it is specified. For example, a + day with 24 non-NaN values in a series with 30 minute timestamp + spacing would have 12 hours of data and therefore completeness of + 0.5. Parameters ---------- series : Series A DatetimeIndexed series. freq : string, default None - interval between samples in the series. If None, the frequency - is inferred using :py:func:`pandas.infer_freq`. + Interval between samples in the series, as a pandas frequency + string. If None, the frequency is inferred using + :py:func:`pandas.infer_freq`. Returns ------- @@ -260,6 +266,13 @@ def daily_completeness(series, freq=None): def complete(series, threshold=0.333, freq=None): """Select only data points that are part of a day with complete data. 
+ A day has complete data if the fraction of the day that has + non-NaN values is at least `threshold`. The fraction of the day + assigned to each value is equal to the timestamp spacing of the + series or `freq` if it is provided. For example, a day with 24 + non-NaN values in a series with 30 minute timestamp spacing would + have 12 hours of data and therefore completeness of 0.5. + Parameters ---------- series : Series From 5a4fb543ffaa05a9f23a12e1ab6d69a90d7a2791 Mon Sep 17 00:00:00 2001 From: Will Vining Date: Thu, 7 May 2020 14:55:42 -0600 Subject: [PATCH 14/30] Refactor valid_between to use daily_completeness --- pvanalytics/quality/gaps.py | 144 ++++++++++++++++++------------------ 1 file changed, 71 insertions(+), 73 deletions(-) diff --git a/pvanalytics/quality/gaps.py b/pvanalytics/quality/gaps.py index 79672802..9b1ccfd6 100644 --- a/pvanalytics/quality/gaps.py +++ b/pvanalytics/quality/gaps.py @@ -138,79 +138,6 @@ def interpolation_diff(x, window=3, rtol=1e-5, atol=1e-8): return flags -def valid_between(series, days=10, minimum_hours=7.75, freq=None): - """Get the start and end of valid data. - - The start and end dates returned by this function can be used to - remove large periods of missing data from the begining and end of - the series. The valid data begins when there are `days` - consecutive days with valid data covering at least `minimum_hours` - on each day. Valid data ends on the last day with `days` - consecutive days with data covering at least `minimum_hours` - preceeding it. - - Any data point with a value other than `NaN` is considered valid - data. - - Parameters - ---------- - series : Series - A datetime indexed series. - days : int - The minimum number of consecutive valid days for data to be - considered valid. - minimum_hours : float - The number of hours that must have valid data for a day to be - considered valid. - freq : string or None, default None - The frequency to the series. 
If None, then frequescy is - inferred from the index. - - Returns - ------- - start : Datetime or None - The first valid day. If there are no sufficiently long periods - of valid days then None is returned. - end : Datetime or None - The last valid day. None if start is None. - - Notes - ----- - This function was derived from the pvfleets_qa_analysis project, - Copyright (c) 2020 Alliance for Sustainable Energy, LLC. See the - file LICENSES/PVFLEETS_QA_LICENSE at the top level directory of - this distribution and at ``_ for more - information. - - """ - freq_hours = (pd.Timedelta(freq or pd.infer_freq(series.index)).seconds - / (60.0*60.0)) - daily_hours = (series.dropna().resample('D').count()*freq_hours) - good_days_preceeding = daily_hours[daily_hours >= minimum_hours].rolling( - str(days)+'D', closed='right' - ).count() - good_days_following = good_days_preceeding.shift(periods=-(days-1)) - - following_above_threshold = good_days_following[ - good_days_following >= days - ] - preceeding_above_threshold = good_days_preceeding[ - good_days_preceeding >= days - ] - - start = None - end = None - - if len(following_above_threshold) > 0: - start = following_above_threshold.index[0] - - if len(preceeding_above_threshold) > 0: - end = preceeding_above_threshold.index[-1] - - return start, end - - def _freq_to_seconds(freq): if not freq: return None @@ -299,6 +226,77 @@ def complete(series, threshold=0.333, freq=None): return (completeness >= threshold).reindex(series.index, method='pad') +def valid_between(series, days=10, minimum_completeness=0.333333, freq=None): + """Get the start and end of valid data. + + The start and end dates returned by this function can be used to + remove large periods of missing data from the begining and end of + the series. The valid data begins when there are `days` + consecutive days with valid data covering at least `minimum_hours` + on each day. 
Valid data ends on the last day with `days` + consecutive days with data covering at least `minimum_hours` + preceeding it. + + Any data point with a value other than `NaN` is considered valid + data. + + Parameters + ---------- + series : Series + A datetime indexed series. + days : int + The minimum number of consecutive valid days for data to be + considered valid. + minimum_hours : float + The number of hours that must have valid data for a day to be + considered valid. + freq : string or None, default None + The frequency to the series. If None, then frequency is + inferred from the index. + + Returns + ------- + start : Datetime or None + The first valid day. If there are no sufficiently long periods + of valid days then None is returned. + end : Datetime or None + The last valid day. None if start is None. + + Notes + ----- + This function was derived from the pvfleets_qa_analysis project, + Copyright (c) 2020 Alliance for Sustainable Energy, LLC. See the + file LICENSES/PVFLEETS_QA_LICENSE at the top level directory of + this distribution and at ``_ for more + information. + + """ + completeness = daily_completeness(series, freq) + complete_days = completeness >= minimum_completeness + good_days_preceeding = complete_days.astype('int').rolling( + days, closed='right' + ).sum() + good_days_following = good_days_preceeding.shift(periods=-(days-1)) + following_above_threshold = good_days_following[ + good_days_following >= days + ] + preceeding_above_threshold = good_days_preceeding[ + good_days_preceeding >= days + ] + + start = None + end = None + + if len(following_above_threshold) > 0: + start = following_above_threshold.index[0] + + if len(preceeding_above_threshold) > 0: + end = preceeding_above_threshold.index[-1] + + return start, end + + def trim(series, **kwargs): """Mask out missing data from the begining and end of the data. 
From eedd93e024fe943d6bb4ec34554bd2bb48383aee Mon Sep 17 00:00:00 2001 From: Will Vining Date: Thu, 7 May 2020 15:44:18 -0600 Subject: [PATCH 15/30] Rename valid_between to start_stop_dates. --- pvanalytics/quality/gaps.py | 38 ++++++++++++++------------ pvanalytics/tests/quality/test_gaps.py | 32 +++++++++++----------- 2 files changed, 36 insertions(+), 34 deletions(-) diff --git a/pvanalytics/quality/gaps.py b/pvanalytics/quality/gaps.py index 9b1ccfd6..132e84d8 100644 --- a/pvanalytics/quality/gaps.py +++ b/pvanalytics/quality/gaps.py @@ -226,33 +226,31 @@ def complete(series, threshold=0.333, freq=None): return (completeness >= threshold).reindex(series.index, method='pad') -def valid_between(series, days=10, minimum_completeness=0.333333, freq=None): - """Get the start and end of valid data. +def start_stop_dates(series, days=10, minimum_completeness=0.333333, + freq=None): + """Get the start and end of data excluding leading and trailing gaps. The start and end dates returned by this function can be used to remove large periods of missing data from the begining and end of - the series. The valid data begins when there are `days` - consecutive days with valid data covering at least `minimum_hours` - on each day. Valid data ends on the last day with `days` - consecutive days with data covering at least `minimum_hours` - preceeding it. - - Any data point with a value other than `NaN` is considered valid - data. + the series. The data starts when there are `days` consecutive days + with completeness greater than `minimum_completeness` (see + :py:func:`daily_completeness`) and ends on the last day with + `days` consecutive days with completeness at least + `minimum_completeness` preceeding it. Parameters ---------- series : Series - A datetime indexed series. - days : int + A DatetimeIndexed series. + days : int, default 10 The minimum number of consecutive valid days for data to be considered valid. 
- minimum_hours : float - The number of hours that must have valid data for a day to be - considered valid. + minimum_completeness : float, default 0.333333 + The fraction of a day that must have data for the day to be + considered complete. freq : string or None, default None - The frequency to the series. If None, then frequency is - inferred from the index. + The frequency of data in the series. If None, then frequency + is inferred from the index. Returns ------- @@ -262,6 +260,10 @@ def valid_between(series, days=10, minimum_completeness=0.333333, freq=None): end : Datetime or None The last valid day. None if start is None. + See Also + -------- + :py:func:`daily_completeness` + Notes ----- This function was derived from the pvfleets_qa_analysis project, @@ -319,7 +321,7 @@ def trim(series, **kwargs): last good day, and False from the last good day to the end. """ - start, end = valid_between(series, **kwargs) + start, end = start_stop_dates(series, **kwargs) s = pd.Series(index=series.index, dtype='bool') s.loc[:] = False if start: diff --git a/pvanalytics/tests/quality/test_gaps.py b/pvanalytics/tests/quality/test_gaps.py index 9082101f..09a17c25 100644 --- a/pvanalytics/tests/quality/test_gaps.py +++ b/pvanalytics/tests/quality/test_gaps.py @@ -206,7 +206,7 @@ def test_interpolation_diff_raises_error(interpolated_data): gaps.interpolation_diff(interpolated_data, window=2) -def test_valid_between_no_missing_data(): +def test_start_stop_dates_no_missing_data(): """If there is no missing data firstlastvaliddays should return the start and end of the series. 
@@ -220,7 +220,7 @@ def test_valid_between_no_missing_data(): data=np.full(len(index), 10), index=index ) - firstvalid, lastvalid = gaps.valid_between(series) + firstvalid, lastvalid = gaps.start_stop_dates(series) assert firstvalid.date() == pd.Timestamp('01-01-2020').date() assert lastvalid.date() == pd.Timestamp('08-01-2020').date() @@ -238,7 +238,7 @@ def test_first_day_missing_data(): data = np.full(len(index), 10) series = pd.Series(data=data, index=index) series['01-01-2020 00:00':'01-02-2020 00:00'] = np.nan - firstvalid, lastvalid = gaps.valid_between(series) + firstvalid, lastvalid = gaps.start_stop_dates(series) assert firstvalid.date() == pd.Timestamp('01-02-2020').date() assert lastvalid.date() == pd.Timestamp('08-01-2020').date() @@ -254,7 +254,7 @@ def test_first_and_fifth_days_missing(): series = pd.Series(data=data, index=index) series['01-01-2020 00:00':'01-02-2020 00:00'] = np.nan series['01-05-2020 00:00':'01-06-2020 00:00'] = np.nan - firstvalid, lastvalid = gaps.valid_between(series) + firstvalid, lastvalid = gaps.start_stop_dates(series) assert firstvalid.date() == pd.Timestamp('01-06-2020').date() assert lastvalid.date() == pd.Timestamp('08-01-2020').date() @@ -272,23 +272,23 @@ def test_last_two_days_missing(): data = np.full(len(index), 10) series = pd.Series(data=data, index=index) series['07-31-2020 00:00':'08-01-2020 23:00'] = np.nan - firstvalid, lastvalid = gaps.valid_between(series) + firstvalid, lastvalid = gaps.start_stop_dates(series) assert firstvalid.date() == pd.Timestamp('01-01-2020').date() assert lastvalid.date() == pd.Timestamp('07-30-2020').date() -def test_valid_between_no_data(): - """If the passed to valid_between is empty the returns (None, None).""" +def test_start_stop_dates_no_data(): + """If the passed to start_stop_dates is empty the returns (None, None).""" index = pd.date_range( freq='15T', start='01-01-2020', end='08-01-2020 23:00' ) series = pd.Series(index=index, data=np.full(len(index), np.nan)) - assert 
(None, None) == gaps.valid_between(series) + assert (None, None) == gaps.start_stop_dates(series) -def test_valid_between_sparse_data(): +def test_start_stop_dates_sparse_data(): """Check that days with only a few hours of data aren't considered valid. @@ -302,12 +302,12 @@ def test_valid_between_sparse_data(): series['01-02-2020 00:00':'01-02-2020 06:00'] = np.nan series['01-02-2020 08:00':'01-02-2020 21:00'] = np.nan series['07-31-2020 07:00':] = np.nan - start, end = gaps.valid_between(series) + start, end = gaps.start_stop_dates(series) assert start.date() == pd.Timestamp('01-03-2020').date() assert end.date() == pd.Timestamp('07-30-2020').date() -def test_valid_between_not_enough_data(): +def test_start_stop_dates_not_enough_data(): """Only one day of data is not ehough for any valid days.""" index = pd.date_range( freq='15T', @@ -316,10 +316,10 @@ def test_valid_between_not_enough_data(): ) series = pd.Series(index=index, dtype='float64') series['02-23-2020 08:00':'02-24-2020 08:00'] = 1 - assert (None, None) == gaps.valid_between(series) + assert (None, None) == gaps.start_stop_dates(series) -def test_valid_between_one_day(): +def test_start_stop_dates_one_day(): """Works when there is exactly the minimum number of consecutive days with data. @@ -331,12 +331,12 @@ def test_valid_between_one_day(): ) series = pd.Series(index=index, dtype='float64') series['05-05-2020'] = 2 - start, end = gaps.valid_between(series, days=1) + start, end = gaps.start_stop_dates(series, days=1) assert start.date() == pd.Timestamp('05-05-2020').date() assert end.date() == pd.Timestamp('05-05-2020').date() -def test_valid_between_with_gaps_in_middle(): +def test_start_stop_dates_with_gaps_in_middle(): """When there are gaps in the data longer than `days` valid between should include those gaps, as long as there are `days` consecutive days with enough data some time after the gap. 
@@ -349,7 +349,7 @@ def test_valid_between_with_gaps_in_middle(): ) series = pd.Series(index=index, data=np.full(len(index), 1)) series['03-05-2020':'03-25-2020'] = np.nan - start, end = gaps.valid_between(series, days=5) + start, end = gaps.start_stop_dates(series, days=5) assert start.date() == index[0].date() assert end.date() == index[-1].date() From b70608060bf53a76025681c4eee89e6a0f4ad621 Mon Sep 17 00:00:00 2001 From: Will Vining Date: Tue, 12 May 2020 08:06:30 -0600 Subject: [PATCH 16/30] Add new gaps functions to API documentation. - rename valid_between to start_stop_dates - daily_completeness - complete --- docs/api.rst | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/docs/api.rst b/docs/api.rst index a1cc73f8..cfa77fe2 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -53,13 +53,21 @@ Identify gaps in the data. quality.gaps.interpolation_diff quality.gaps.stale_values_diff +The following functions identify days with incomplete data. + +.. autosummary:: + :toctree: generated/ + + quality.gaps.daily_completeness + quality.gaps.complete + Many data sets may have leading and trailing periods with sparodic or no data. The following functions can be used to remove those periods. .. autosummary:: :toctree: generated/ - quality.gaps.valid_between + quality.gaps.start_stop_dates quality.gaps.trim Outliers From 17dd564ac6e71fdfa779b991663935d41fb42c3f Mon Sep 17 00:00:00 2001 From: Will Vining Date: Tue, 12 May 2020 08:07:38 -0600 Subject: [PATCH 17/30] Update documentation for quality.gaps functions Reword for clarity and consistency. Adds a 'See Also' block to refer to related functions (mostly refers to daily_completeness). 
--- pvanalytics/quality/gaps.py | 82 +++++++++++++++++++++---------------- 1 file changed, 47 insertions(+), 35 deletions(-) diff --git a/pvanalytics/quality/gaps.py b/pvanalytics/quality/gaps.py index 132e84d8..6ee12670 100644 --- a/pvanalytics/quality/gaps.py +++ b/pvanalytics/quality/gaps.py @@ -148,30 +148,29 @@ def _freq_to_seconds(freq): def daily_completeness(series, freq=None): - """Calculate a completeness index for each day in the data. + """Calculate a completeness score for each day. - The completeness for a given day is the fraction of time in the - day for which there is data (a value other than NaN). The amount - of time that a value is attributed is equal to the timestamp - spacing in `series` or `freq` if it is specified. For example, a - day with 24 non-NaN values in a series with 30 minute timestamp - spacing would have 12 hours of data and therefore completeness of - 0.5. + The completeness score for a given day is the fraction of time in + the day for which there is data (a value other than NaN). The time + attributed to each value is equal to the timestamp spacing of + `series` or `freq` if it is specified. For example, a day with 24 + non-NaN values in a series with 30 minute timestamp spacing would + have 12 hours of data and therefore completeness score of 0.5. Parameters ---------- series : Series A DatetimeIndexed series. - freq : string, default None - Interval between samples in the series, as a pandas frequency + freq : str, default None + Interval between samples in the series as a pandas frequency string. If None, the frequency is inferred using :py:func:`pandas.infer_freq`. Returns ------- Series - A series of floats indexed by day giving the completeness of - each day (fraction of hours in the day for which `series` has + A series of floats, indexed by day, giving the completeness + score for each day (fraction of the day for which `series` has data). 
Raises @@ -193,12 +192,12 @@ def daily_completeness(series, freq=None): def complete(series, threshold=0.333, freq=None): """Select only data points that are part of a day with complete data. - A day has complete data if the fraction of the day that has - non-NaN values is at least `threshold`. The fraction of the day - assigned to each value is equal to the timestamp spacing of the - series or `freq` if it is provided. For example, a day with 24 - non-NaN values in a series with 30 minute timestamp spacing would - have 12 hours of data and therefore completeness of 0.5. + A day is complete if its completeness score is greater than or + equal to `threshold`. See :py:func:`daily_completeness` for more + information. For example, a day with 24 non-NaN values in a series + with 30 minute timestamp spacing would have 12 hours of data and + therefore a completeness score of 0.5; with the default + `threshold=0.333` the day would be marked complete. Parameters ---------- @@ -221,6 +220,10 @@ def complete(series, threshold=0.333, freq=None): ValueError See :py:func:`daily_completeness`. + See Also + -------- + :py:func:`daily_completeness` + """ completeness = daily_completeness(series, freq) return (completeness >= threshold).reindex(series.index, method='pad') @@ -231,12 +234,13 @@ def start_stop_dates(series, days=10, minimum_completeness=0.333333, """Get the start and end of data excluding leading and trailing gaps. The start and end dates returned by this function can be used to - remove large periods of missing data from the begining and end of + remove large periods of missing data from the beginning and end of the series. 
The data starts when there are `days` consecutive days - with completeness greater than `minimum_completeness` (see - :py:func:`daily_completeness`) and ends on the last day with + with completeness greater than or equal to `minimum_completeness` + (see :py:func:`daily_completeness`) and ends on the last day with `days` consecutive days with completeness at least - `minimum_completeness` preceeding it. + `minimum_completeness` preceeding it. Periods of incomplete days + between these two dates have no effect on the dates returned. Parameters ---------- @@ -246,18 +250,18 @@ def start_stop_dates(series, days=10, minimum_completeness=0.333333, The minimum number of consecutive valid days for data to be considered valid. minimum_completeness : float, default 0.333333 - The fraction of a day that must have data for the day to be - considered complete. - freq : string or None, default None - The frequency of data in the series. If None, then frequency - is inferred from the index. + The minimum completeness score for a day to be considered + complete. (see :py:func:`daily_completeness`). + freq : str or None, default None + The frequency of data in the series as a pandas frequency + string. If None, then frequency is inferred from the index. Returns ------- start : Datetime or None The first valid day. If there are no sufficiently long periods of valid days then None is returned. - end : Datetime or None + stop : Datetime or None The last valid day. None if start is None. See Also @@ -300,25 +304,33 @@ def start_stop_dates(series, days=10, minimum_completeness=0.333333, def trim(series, **kwargs): - """Mask out missing data from the begining and end of the data. + """Mask out missing data from the beginning and end of the data. - Missing data is determined by the criteria in - :py:func:`valid_between`. + Removes data preceeding the start date and following the stop date + returned by :py:func:`start_stop_dates`. 
If no start and stop + dates are identified then a series of all False is returned. Parameters ---------- series : Series - A DatatimeIndexed series. + A DatetimeIndexed series. kwargs : Any of the keyword arguments that can be passed to - :py:func:`valid_between`. + :py:func:`start_stop_dates`. Returns ------- Series A series of booleans with the same index as `series` with - False up to the first good day, True from the first to the - last good day, and False from the last good day to the end. + False up to the first complete day, True between the first and + the last complete days, and False following the last complete + day. + + See Also + -------- + :py:func:`start_stop_dates` + + :py:func:`daily_completeness` """ start, end = start_stop_dates(series, **kwargs) From eb29ada49e3523d24333c5ba7c42b4f8107f64d6 Mon Sep 17 00:00:00 2001 From: Will Vining Date: Tue, 12 May 2020 08:28:02 -0600 Subject: [PATCH 18/30] Rename threshold parameter to gaps.complete function the new name `minimum_completeness` is much more descriptive. --- pvanalytics/quality/gaps.py | 13 ++++++----- pvanalytics/tests/quality/test_gaps.py | 32 +++++++++++++------------- 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/pvanalytics/quality/gaps.py b/pvanalytics/quality/gaps.py index 6ee12670..185a5c72 100644 --- a/pvanalytics/quality/gaps.py +++ b/pvanalytics/quality/gaps.py @@ -189,21 +189,21 @@ def daily_completeness(series, freq=None): return (daily_counts * seconds_per_sample) / (1440*60) -def complete(series, threshold=0.333, freq=None): +def complete(series, minimum_completeness=0.333, freq=None): """Select only data points that are part of a day with complete data. A day is complete if its completeness score is greater than or - equal to `threshold`. See :py:func:`daily_completeness` for more + equal to `minimum_completeness`. See :py:func:`daily_completeness` for more information. 
For example, a day with 24 non-NaN values in a series with 30 minute timestamp spacing would have 12 hours of data and therefore a completeness score of 0.5; with the default - `threshold=0.333` the day would be marked complete. + `minimum_completeness=0.333` the day would be marked complete. Parameters ---------- series : Series The data to be checked for completeness. - threshold : float, default 0.333 + minimum_completeness : float, default 0.333 Fraction of the day that must have data. freq : str, default None The expected frequency of the data in `series`. If none then @@ -213,7 +213,7 @@ def complete(series, threshold=0.333, freq=None): ------- Series A series of booleans with True for each value that is part of - a day with completeness greater than `threshold`. + a day with completeness greater than `minimum_completeness`. Raises ------ @@ -226,7 +226,8 @@ def complete(series, threshold=0.333, freq=None): """ completeness = daily_completeness(series, freq) - return (completeness >= threshold).reindex(series.index, method='pad') + return ((completeness >= minimum_completeness) + .reindex(series.index, method='pad')) def start_stop_dates(series, days=10, minimum_completeness=0.333333, diff --git a/pvanalytics/tests/quality/test_gaps.py b/pvanalytics/tests/quality/test_gaps.py index 09a17c25..c7c4ba10 100644 --- a/pvanalytics/tests/quality/test_gaps.py +++ b/pvanalytics/tests/quality/test_gaps.py @@ -462,42 +462,42 @@ def test_daily_completeness_freq_too_high(): def test_complete_threshold_zero(): - """threshold of 0 returns all True regardless of data.""" + """minimum_completeness of 0 returns all True regardless of data.""" ten_days = pd.date_range( start='01/01/2020', freq='15T', end='1/10/2020', closed='left') data = pd.Series(index=ten_days, dtype='float64') assert_series_equal( pd.Series(True, index=data.index), - gaps.complete(data, threshold=0) + gaps.complete(data, minimum_completeness=0) ) data[pd.date_range( start='01/01/2020', freq='1D', 
end='1/10/2020', closed='left')] = 1.0 data.dropna() assert_series_equal( pd.Series(True, index=data.index), - gaps.complete(data, threshold=0, freq='15T') + gaps.complete(data, minimum_completeness=0, freq='15T') ) data = pd.Series(1.0, index=ten_days) assert_series_equal( pd.Series(True, index=data.index), - gaps.complete(data, threshold=0) + gaps.complete(data, minimum_completeness=0) ) def test_complete_threshold_one(): - """If threshold=1 then any missing data on a day means all data for - the day is flagged False.""" + """If minimum_completeness=1 then any missing data on a day means all + data for the day is flagged False.""" ten_days = pd.date_range( start='01/01/2020', freq='15T', end='01/10/2020', closed='left') data = pd.Series(index=ten_days, dtype='float64') assert_series_equal( pd.Series(False, index=data.index), - gaps.complete(data, threshold=1.0) + gaps.complete(data, minimum_completeness=1.0) ) data.loc[:] = 1 assert_series_equal( pd.Series(True, index=data.index), - gaps.complete(data, threshold=1.0) + gaps.complete(data, minimum_completeness=1.0) ) # remove one data-point per day days = pd.date_range( @@ -505,7 +505,7 @@ def test_complete_threshold_one(): data.loc[days] = np.nan assert_series_equal( pd.Series(False, index=data.index), - gaps.complete(data, threshold=1.0) + gaps.complete(data, minimum_completeness=1.0) ) # check that dropping the NaNs still gives the same result with # and without passing `freq`. 
(There should be enough data to infer the @@ -513,11 +513,11 @@ def test_complete_threshold_one(): data.dropna() assert_series_equal( pd.Series(False, index=data.index), - gaps.complete(data, threshold=1.0) + gaps.complete(data, minimum_completeness=1.0) ) assert_series_equal( - gaps.complete(data, threshold=1.0), - gaps.complete(data, threshold=1.0, freq='15T') + gaps.complete(data, minimum_completeness=1.0), + gaps.complete(data, minimum_completeness=1.0, freq='15T') ) @@ -543,22 +543,22 @@ def test_complete(): expected.loc['1/5/2020':] = True assert_series_equal( expected, - gaps.complete(data, threshold=1.0) + gaps.complete(data, minimum_completeness=1.0) ) expected.loc['1/2/2020'] = True assert_series_equal( expected, - gaps.complete(data, threshold=0.5) + gaps.complete(data, minimum_completeness=0.5) ) expected.loc['1/3/2020'] = True assert_series_equal( expected, - gaps.complete(data, threshold=0.3) + gaps.complete(data, minimum_completeness=0.3) ) assert_series_equal( pd.Series(True, index=data.index), - gaps.complete(data, threshold=0.2) + gaps.complete(data, minimum_completeness=0.2) ) From e3c91d8ff6966558ae9810ea874be09f688b8bad Mon Sep 17 00:00:00 2001 From: Will Vining Date: Tue, 12 May 2020 08:29:41 -0600 Subject: [PATCH 19/30] Rework gaps.complete for improved readability. Reduces the number of operations taking place in the return statement. Minor change, but somewhat easier to read and understand. 
--- pvanalytics/quality/gaps.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pvanalytics/quality/gaps.py b/pvanalytics/quality/gaps.py index 185a5c72..c4eb1e99 100644 --- a/pvanalytics/quality/gaps.py +++ b/pvanalytics/quality/gaps.py @@ -225,9 +225,8 @@ def complete(series, minimum_completeness=0.333, freq=None): :py:func:`daily_completeness` """ - completeness = daily_completeness(series, freq) - return ((completeness >= minimum_completeness) - .reindex(series.index, method='pad')) + complete_days = daily_completeness(series, freq) >= minimum_completeness + return complete_days.reindex(series.index, method='pad') def start_stop_dates(series, days=10, minimum_completeness=0.333333, From f97093f19a94609edc64500d1b9d13f48be926fe Mon Sep 17 00:00:00 2001 From: Will Vining Date: Thu, 14 May 2020 08:03:36 -0600 Subject: [PATCH 20/30] Apply documentation changes suggested in code review. Co-authored-by: Cliff Hansen --- docs/api.rst | 3 +-- pvanalytics/quality/gaps.py | 24 +++++++++++------------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index cfa77fe2..81aff36d 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -61,7 +61,7 @@ The following functions identify days with incomplete data. quality.gaps.daily_completeness quality.gaps.complete -Many data sets may have leading and trailing periods with sparodic or +Many data sets may have leading and trailing periods of days with sporadic or no data. The following functions can be used to remove those periods. .. autosummary:: @@ -133,4 +133,3 @@ Clearsky .. [1] C. N. Long and Y. Shi, An Automated Quality Assessment and Control Algorithm for Surface Radiation Measurements, The Open Atmospheric Science Journal 2, pp. 23-37, 2008. 
- diff --git a/pvanalytics/quality/gaps.py b/pvanalytics/quality/gaps.py index c4eb1e99..6799abd7 100644 --- a/pvanalytics/quality/gaps.py +++ b/pvanalytics/quality/gaps.py @@ -148,14 +148,15 @@ def _freq_to_seconds(freq): def daily_completeness(series, freq=None): - """Calculate a completeness score for each day. + """Calculate a data completeness score for each day. The completeness score for a given day is the fraction of time in the day for which there is data (a value other than NaN). The time - attributed to each value is equal to the timestamp spacing of - `series` or `freq` if it is specified. For example, a day with 24 - non-NaN values in a series with 30 minute timestamp spacing would - have 12 hours of data and therefore completeness score of 0.5. + duration attributed to each value is equal to the timestamp + spacing of `series` or `freq` if it is specified. For example, a + 24-hour time series with 30 minute timestamp spacing and 24 + non-NaN values would have data for a total of 12 hours and + therefore a completeness score of 0.5. Parameters ---------- @@ -190,14 +191,11 @@ def daily_completeness(series, freq=None): def complete(series, minimum_completeness=0.333, freq=None): - """Select only data points that are part of a day with complete data. - - A day is complete if its completeness score is greater than or - equal to `minimum_completeness`. See :py:func:`daily_completeness` for more - information. For example, a day with 24 non-NaN values in a series - with 30 minute timestamp spacing would have 12 hours of data and - therefore a completeness score of 0.5; with the default - `minimum_completeness=0.333` the day would be marked complete. + """Select data points that are part of days with complete data. + + A day has complete data if its completeness score is greater than + or equal to `minimum_completeness`. The completeness score is + calculated by :py:func:`daily_completeness`. 
Parameters ---------- From 02df37e30d1967e3c704d34c8047e62645824c5a Mon Sep 17 00:00:00 2001 From: Will Vining Date: Thu, 14 May 2020 10:29:37 -0600 Subject: [PATCH 21/30] Add keep_index arg to daily_completeness & rename completeness_score move the reindexing inside the completeness score function. Keeps operations that affect the index in the same function. --- docs/api.rst | 2 +- pvanalytics/quality/gaps.py | 20 ++++++---- pvanalytics/tests/quality/test_gaps.py | 51 +++++++++++++++++++------- 3 files changed, 51 insertions(+), 22 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index 81aff36d..38164600 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -58,7 +58,7 @@ The following functions identify days with incomplete data. .. autosummary:: :toctree: generated/ - quality.gaps.daily_completeness + quality.gaps.completeness_score quality.gaps.complete Many data sets may have leading and trailing periods of days with sporadic or diff --git a/pvanalytics/quality/gaps.py b/pvanalytics/quality/gaps.py index 6799abd7..aa18d4ed 100644 --- a/pvanalytics/quality/gaps.py +++ b/pvanalytics/quality/gaps.py @@ -147,7 +147,7 @@ def _freq_to_seconds(freq): return delta.days * (1440 * 60) + delta.seconds -def daily_completeness(series, freq=None): +def completeness_score(series, freq=None, keep_index=True): """Calculate a data completeness score for each day. The completeness score for a given day is the fraction of time in @@ -166,13 +166,15 @@ def daily_completeness(series, freq=None): Interval between samples in the series as a pandas frequency string. If None, the frequency is inferred using :py:func:`pandas.infer_freq`. + keep_index : boolean, default True + Whether or not the returned series has the same index as + `series`. If False the returned series will be indexed by day. Returns ------- Series - A series of floats, indexed by day, giving the completeness - score for each day (fraction of the day for which `series` has - data). 
+ A series of floats giving the completeness score for each day + (fraction of the day for which `series` has data). Raises ------ @@ -187,7 +189,10 @@ def daily_completeness(series, freq=None): raise ValueError("freq must be less than or equal to the" + " frequency of the series") daily_counts = series.resample('D').count() - return (daily_counts * seconds_per_sample) / (1440*60) + daily_completeness = (daily_counts * seconds_per_sample) / (1440*60) + if keep_index: + return daily_completeness.reindex(series.index, method='pad') + return daily_completeness def complete(series, minimum_completeness=0.333, freq=None): @@ -223,8 +228,7 @@ def complete(series, minimum_completeness=0.333, freq=None): :py:func:`daily_completeness` """ - complete_days = daily_completeness(series, freq) >= minimum_completeness - return complete_days.reindex(series.index, method='pad') + return completeness_score(series, freq=freq) >= minimum_completeness def start_stop_dates(series, days=10, minimum_completeness=0.333333, @@ -276,7 +280,7 @@ def start_stop_dates(series, days=10, minimum_completeness=0.333333, information. 
""" - completeness = daily_completeness(series, freq) + completeness = completeness_score(series, freq=freq, keep_index=False) complete_days = completeness >= minimum_completeness good_days_preceeding = complete_days.astype('int').rolling( days, closed='right' diff --git a/pvanalytics/tests/quality/test_gaps.py b/pvanalytics/tests/quality/test_gaps.py index c7c4ba10..a098c423 100644 --- a/pvanalytics/tests/quality/test_gaps.py +++ b/pvanalytics/tests/quality/test_gaps.py @@ -385,14 +385,15 @@ def test_trim_empty(): assert (~gaps.trim(series, days=3)).all() -def test_daily_completeness_all_nans(): +def test_completeness_score_all_nans(): """A data set with all nans has completeness 0 for each day.""" - completeness = gaps.daily_completeness( + completeness = gaps.completeness_score( pd.Series( np.nan, index=pd.date_range('01/01/2020 00:00', freq='1H', periods=48), dtype='float64' - ) + ), + keep_index=False ) assert_series_equal( pd.Series( @@ -403,12 +404,14 @@ def test_daily_completeness_all_nans(): ) -def test_daily_completeness_no_data(): +def test_completeness_score_no_data(): """A data set with completely missing timestamps and NaNs has completeness 0.""" four_days = pd.date_range(start='01/01/2020', freq='D', periods=4) - completeness = gaps.daily_completeness( - pd.Series(index=four_days, dtype='float64'), freq='15T' + completeness = gaps.completeness_score( + pd.Series(index=four_days, dtype='float64'), + freq='15T', + keep_index=False ) assert_series_equal( pd.Series(0.0, index=four_days), @@ -416,14 +419,14 @@ def test_daily_completeness_no_data(): ) -def test_daily_completeness_incomplete_index(): +def test_completeness_score_incomplete_index(): """A series with one data point per hour has 25% completeness at 15-minute sample frequency""" data = pd.Series( 1, index=pd.date_range(start='01/01/2020', freq='1H', periods=72), ) - completeness = gaps.daily_completeness(data, freq='15T') + completeness = gaps.completeness_score(data, freq='15T', 
keep_index=False) assert_series_equal( pd.Series( 0.25, @@ -433,12 +436,12 @@ def test_daily_completeness_incomplete_index(): ) -def test_daily_completeness_complete(): +def test_completeness_score_complete(): """A series with data at every point has completeness 1.0""" data = pd.Series( 1, index=pd.date_range(start='01/01/2020', freq='15T', periods=24*4*2) ) - completeness = gaps.daily_completeness(data) + completeness = gaps.completeness_score(data, keep_index=False) assert_series_equal( pd.Series( 1.0, @@ -448,7 +451,7 @@ def test_daily_completeness_complete(): ) -def test_daily_completeness_freq_too_high(): +def test_completeness_score_freq_too_high(): """If the infered freq is shorter than the passed freq an exception is raised.""" data = pd.Series( @@ -456,9 +459,31 @@ def test_daily_completeness_freq_too_high(): index=pd.date_range(start='1/1/2020', freq='15T', periods=24*4*4) ) with pytest.raises(ValueError): - gaps.daily_completeness(data, freq='16T') + gaps.completeness_score(data, freq='16T') with pytest.raises(ValueError): - gaps.daily_completeness(data, freq='1H') + gaps.completeness_score(data, freq='1H') + + +def test_completeness_score_reindex(): + """Every timestamp is marked with completeness for the day when + keep_index=True""" + data = pd.Series( + 1, + index=pd.date_range( + start='1/1/2020', freq='15T', end='1/4/2020', closed='left' + ) + ) + data.loc[pd.date_range(start='1/1/2020', freq='30T', periods=48)] = np.nan + data.loc[pd.date_range(start='1/3/2020', freq='1H', periods=24)] = np.nan + + expected = pd.Series(index=data.index, dtype='float64') + expected.loc['1/1/2020'] = 0.5 + expected.loc['1/2/2020'] = 1.0 + expected.loc['1/3/2020'] = 0.75 + assert_series_equal( + expected, + gaps.completeness_score(data, keep_index=True) + ) def test_complete_threshold_zero(): From 6552a8a72f58ca879509967e75d0bc7df6aa6d57 Mon Sep 17 00:00:00 2001 From: Will Vining Date: Thu, 14 May 2020 11:03:05 -0600 Subject: [PATCH 22/30] Improve clarity in 
calculation of seconds per sample. Slightly more verbose, but much more clear what is happening. Also simplifies the _freq_to_seconds function since it no longer needs to handle freq=None. --- pvanalytics/quality/gaps.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pvanalytics/quality/gaps.py b/pvanalytics/quality/gaps.py index aa18d4ed..bc9ac50c 100644 --- a/pvanalytics/quality/gaps.py +++ b/pvanalytics/quality/gaps.py @@ -139,8 +139,6 @@ def interpolation_diff(x, window=3, rtol=1e-5, atol=1e-8): def _freq_to_seconds(freq): - if not freq: - return None if freq.isalpha(): freq = '1' + freq delta = pd.to_timedelta(freq) @@ -183,8 +181,12 @@ def completeness_score(series, freq=None, keep_index=True): """ inferred_seconds = _freq_to_seconds(pd.infer_freq(series.index)) - freq_seconds = _freq_to_seconds(freq) - seconds_per_sample = freq_seconds or inferred_seconds + if freq: + freq_seconds = _freq_to_seconds(freq) + seconds_per_sample = freq_seconds + else: + seconds_per_sample = inferred_seconds + if freq and inferred_seconds < freq_seconds: raise ValueError("freq must be less than or equal to the" + " frequency of the series") From 17fc35f085442ab3f8ae3cee5218f2170774b639 Mon Sep 17 00:00:00 2001 From: Will Vining Date: Thu, 14 May 2020 11:07:31 -0600 Subject: [PATCH 23/30] use a more descriptive variable name in gaps.trim --- pvanalytics/quality/gaps.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pvanalytics/quality/gaps.py b/pvanalytics/quality/gaps.py index bc9ac50c..b2f3a405 100644 --- a/pvanalytics/quality/gaps.py +++ b/pvanalytics/quality/gaps.py @@ -338,8 +338,8 @@ def trim(series, **kwargs): """ start, end = start_stop_dates(series, **kwargs) - s = pd.Series(index=series.index, dtype='bool') - s.loc[:] = False + mask = pd.Series(index=series.index, dtype='bool') + mask.loc[:] = False if start: - s.loc[start.date():end.date()] = True - return s + mask.loc[start.date():end.date()] = True + return 
mask From 82d87ff4cad9b8d1a5f89170d0bb68a577e9f788 Mon Sep 17 00:00:00 2001 From: Will Vining Date: Thu, 14 May 2020 15:02:57 -0600 Subject: [PATCH 24/30] Rework old tests for the new start_stop_dates API New function takes a series of booleans and looks for `days` days long consecutive blocks of data where every value is True. --- pvanalytics/quality/gaps.py | 46 ++++++++++++- pvanalytics/tests/quality/test_gaps.py | 92 +++++++++----------------- 2 files changed, 75 insertions(+), 63 deletions(-) diff --git a/pvanalytics/quality/gaps.py b/pvanalytics/quality/gaps.py index b2f3a405..054d4084 100644 --- a/pvanalytics/quality/gaps.py +++ b/pvanalytics/quality/gaps.py @@ -233,7 +233,51 @@ def complete(series, minimum_completeness=0.333, freq=None): return completeness_score(series, freq=freq) >= minimum_completeness -def start_stop_dates(series, days=10, minimum_completeness=0.333333, +def start_stop_dates(series, days=10): + """Get the start and end of data excluding leading and trailing gaps. + + Parameters + ---------- + series : Series + A DatetimeIndexed series of booleans. + days : int, default 10 + The minimum number of consecutive days where every value in + `series` is True for data to start or stop. + + Returns + ------- + start : Datetime or None + The first valid day. If there are no sufficiently long periods + of valid days then None is returned. + stop : Datetime or None + The last valid day. None if start is None. 
+ + """ + good_days = series.resample('D').apply(all) + good_days_preceeding = good_days.astype('int').rolling( + days, closed='right' + ).sum() + good_days_following = good_days_preceeding.shift(periods=-(days-1)) + following_above_threshold = good_days_following[ + good_days_following >= days + ] + preceeding_above_threshold = good_days_preceeding[ + good_days_preceeding >= days + ] + + start = None + end = None + + if len(following_above_threshold) > 0: + start = following_above_threshold.index[0] + + if len(preceeding_above_threshold) > 0: + end = preceeding_above_threshold.index[-1] + + return start, end + + +def start_stop_complete(series, days=10, minimum_completeness=0.333333, freq=None): """Get the start and end of data excluding leading and trailing gaps. diff --git a/pvanalytics/tests/quality/test_gaps.py b/pvanalytics/tests/quality/test_gaps.py index a098c423..b3be0755 100644 --- a/pvanalytics/tests/quality/test_gaps.py +++ b/pvanalytics/tests/quality/test_gaps.py @@ -206,60 +206,50 @@ def test_interpolation_diff_raises_error(interpolated_data): gaps.interpolation_diff(interpolated_data, window=2) -def test_start_stop_dates_no_missing_data(): - """If there is no missing data firstlastvaliddays should return the - start and end of the series. 
- - """ +def test_start_stop_dates_all_true(): + """If all values are True then start and stop are equal to first and + last day of the series.""" index = pd.date_range( freq='15T', start='01-01-2020', end='08-01-2020 23:00' ) - series = pd.Series( - data=np.full(len(index), 10), - index=index - ) + series = pd.Series(True, index=index) firstvalid, lastvalid = gaps.start_stop_dates(series) - assert firstvalid.date() == pd.Timestamp('01-01-2020').date() - assert lastvalid.date() == pd.Timestamp('08-01-2020').date() - + assert firstvalid.date() == series.index[0].date() + assert lastvalid.date() == series.index[-1].date() -def test_first_day_missing_data(): - """If the first day is missing data, the first valid date should be - the second day. - """ +def test_start_stop_dates_first_day_false(): + """If day one is all False, then start date should be day 2.""" index = pd.date_range( freq='15T', start='01-01-2020', end='08-01-2020 23:00' ) - data = np.full(len(index), 10) - series = pd.Series(data=data, index=index) - series['01-01-2020 00:00':'01-02-2020 00:00'] = np.nan + series = pd.Series(True, index=index) + series.loc['01-01-2020'] = False firstvalid, lastvalid = gaps.start_stop_dates(series) assert firstvalid.date() == pd.Timestamp('01-02-2020').date() assert lastvalid.date() == pd.Timestamp('08-01-2020').date() -def test_first_and_fifth_days_missing(): +def test_start_stop_dates_first_and_fifth_days_missing(): """First valid date should be the sixth of January.""" index = pd.date_range( freq='15T', start='01-01-2020', end='08-01-2020 23:00' ) - data = np.full(len(index), 10) - series = pd.Series(data=data, index=index) - series['01-01-2020 00:00':'01-02-2020 00:00'] = np.nan - series['01-05-2020 00:00':'01-06-2020 00:00'] = np.nan + series = pd.Series(True, index=index) + series.loc['01-01-2020'] = False + series.loc['01-05-2020'] = False firstvalid, lastvalid = gaps.start_stop_dates(series) assert firstvalid.date() == pd.Timestamp('01-06-2020').date() assert 
lastvalid.date() == pd.Timestamp('08-01-2020').date() -def test_last_two_days_missing(): +def test_start_stop_dates_last_two_days_missing(): """If the last two days of data are missing last valid day should be July 30. @@ -269,53 +259,33 @@ def test_last_two_days_missing(): start='01-01-2020', end='08-01-2020 23:00' ) - data = np.full(len(index), 10) - series = pd.Series(data=data, index=index) - series['07-31-2020 00:00':'08-01-2020 23:00'] = np.nan + series = pd.Series(True, index=index) + series.loc['07-31-2020':'08-01-2020'] = False firstvalid, lastvalid = gaps.start_stop_dates(series) assert firstvalid.date() == pd.Timestamp('01-01-2020').date() assert lastvalid.date() == pd.Timestamp('07-30-2020').date() -def test_start_stop_dates_no_data(): +def test_start_stop_dates_all_false(): """If the passed to start_stop_dates is empty the returns (None, None).""" index = pd.date_range( freq='15T', start='01-01-2020', end='08-01-2020 23:00' ) - series = pd.Series(index=index, data=np.full(len(index), np.nan)) + series = pd.Series(False, index=index) assert (None, None) == gaps.start_stop_dates(series) -def test_start_stop_dates_sparse_data(): - """Check that days with only a few hours of data aren't considered - valid. 
- - """ +def test_start_stop_dates_not_enough_days(): + """Fewer than 10 days of True gives no start/stop dates.""" index = pd.date_range( freq='15T', start='01-01-2020', end='08-01-2020 23:00' ) - series = pd.Series(index=index, data=np.full(len(index), 2.3)) - series['01-02-2020 00:00':'01-02-2020 06:00'] = np.nan - series['01-02-2020 08:00':'01-02-2020 21:00'] = np.nan - series['07-31-2020 07:00':] = np.nan - start, end = gaps.start_stop_dates(series) - assert start.date() == pd.Timestamp('01-03-2020').date() - assert end.date() == pd.Timestamp('07-30-2020').date() - - -def test_start_stop_dates_not_enough_data(): - """Only one day of data is not ehough for any valid days.""" - index = pd.date_range( - freq='15T', - start='01-01-2020', - end='08-01-2020 23:00' - ) - series = pd.Series(index=index, dtype='float64') - series['02-23-2020 08:00':'02-24-2020 08:00'] = 1 + series = pd.Series(False, index=index) + series['02-23-2020':'02-24-2020'] = True assert (None, None) == gaps.start_stop_dates(series) @@ -329,26 +299,24 @@ def test_start_stop_dates_one_day(): start='01-01-2020', end='08-01-2020 23:00' ) - series = pd.Series(index=index, dtype='float64') - series['05-05-2020'] = 2 + series = pd.Series(False, index=index) + series['05-05-2020'] = True start, end = gaps.start_stop_dates(series, days=1) assert start.date() == pd.Timestamp('05-05-2020').date() assert end.date() == pd.Timestamp('05-05-2020').date() def test_start_stop_dates_with_gaps_in_middle(): - """When there are gaps in the data longer than `days` valid between - should include those gaps, as long as there are `days` consecutive - days with enough data some time after the gap. 
- - """ + """large gaps between the first and last sufficiently long block of + consecutive 'good' days have no effect on the start and stop + date.""" index = pd.date_range( freq='15T', start='01-01-2020', end='08-01-2020 23:00' ) - series = pd.Series(index=index, data=np.full(len(index), 1)) - series['03-05-2020':'03-25-2020'] = np.nan + series = pd.Series(True, index=index) + series['03-05-2020':'03-25-2020'] = False start, end = gaps.start_stop_dates(series, days=5) assert start.date() == index[0].date() assert end.date() == index[-1].date() From 21cfb3330f7f95b50dd98dbf969de11bbae78126 Mon Sep 17 00:00:00 2001 From: Will Vining Date: Thu, 14 May 2020 15:28:22 -0600 Subject: [PATCH 25/30] Rewrite trim function as trim_incomplete Uses the new interface for start_stop_dates, shifting the daily_completeness calculations into trim_incomplete. --- pvanalytics/quality/gaps.py | 94 ++++---------------------- pvanalytics/tests/quality/test_gaps.py | 12 ++-- 2 files changed, 20 insertions(+), 86 deletions(-) diff --git a/pvanalytics/quality/gaps.py b/pvanalytics/quality/gaps.py index 054d4084..66da25d2 100644 --- a/pvanalytics/quality/gaps.py +++ b/pvanalytics/quality/gaps.py @@ -277,81 +277,7 @@ def start_stop_dates(series, days=10): return start, end -def start_stop_complete(series, days=10, minimum_completeness=0.333333, - freq=None): - """Get the start and end of data excluding leading and trailing gaps. - - The start and end dates returned by this function can be used to - remove large periods of missing data from the beginning and end of - the series. The data starts when there are `days` consecutive days - with completeness greater than or equal to `minimum_completeness` - (see :py:func:`daily_completeness`) and ends on the last day with - `days` consecutive days with completeness at least - `minimum_completeness` preceeding it. Periods of incomplete days - between these two dates have no effect on the dates returned. 
- - Parameters - ---------- - series : Series - A DatetimeIndexed series. - days : int, default 10 - The minimum number of consecutive valid days for data to be - considered valid. - minimum_completeness : float, default 0.333333 - The minimum completeness score for a day to be considered - complete. (see :py:func:`daily_completeness`). - freq : str or None, default None - The frequency of data in the series as a pandas frequency - string. If None, then frequency is inferred from the index. - - Returns - ------- - start : Datetime or None - The first valid day. If there are no sufficiently long periods - of valid days then None is returned. - stop : Datetime or None - The last valid day. None if start is None. - - See Also - -------- - :py:func:`daily_completeness` - - Notes - ----- - This function was derived from the pvfleets_qa_analysis project, - Copyright (c) 2020 Alliance for Sustainable Energy, LLC. See the - file LICENSES/PVFLEETS_QA_LICENSE at the top level directory of - this distribution and at ``_ for more - information. - - """ - completeness = completeness_score(series, freq=freq, keep_index=False) - complete_days = completeness >= minimum_completeness - good_days_preceeding = complete_days.astype('int').rolling( - days, closed='right' - ).sum() - good_days_following = good_days_preceeding.shift(periods=-(days-1)) - following_above_threshold = good_days_following[ - good_days_following >= days - ] - preceeding_above_threshold = good_days_preceeding[ - good_days_preceeding >= days - ] - - start = None - end = None - - if len(following_above_threshold) > 0: - start = following_above_threshold.index[0] - - if len(preceeding_above_threshold) > 0: - end = preceeding_above_threshold.index[-1] - - return start, end - - -def trim(series, **kwargs): +def trim_incomplete(series, minimum_completeness=0.333333, days=10, freq=None): """Mask out missing data from the beginning and end of the data. 
Removes data preceeding the start date and following the stop date @@ -362,9 +288,15 @@ def trim(series, **kwargs): ---------- series : Series A DatetimeIndexed series. - kwargs : - Any of the keyword arguments that can be passed to - :py:func:`start_stop_dates`. + minimum_completeness : float, default 0.333333 + The minimum completeness score for each day. + days : int, default 10 + The number of consecutive days with completeness greater than + `minimum_completeness` for the 'good' data to start or + end. See :py:func:`start_stop_dates` for more information. + freq : str, default None + The expected frequency of the series. See + :py:func:`completeness_score` for more information. Returns ------- @@ -378,10 +310,12 @@ def trim(series, **kwargs): -------- :py:func:`start_stop_dates` - :py:func:`daily_completeness` + :py:func:`completeness_score` """ - start, end = start_stop_dates(series, **kwargs) + completeness = completeness_score(series, freq=freq, keep_index=False) + complete_days = completeness >= minimum_completeness + start, end = start_stop_dates(complete_days, days=days) mask = pd.Series(index=series.index, dtype='bool') mask.loc[:] = False if start: diff --git a/pvanalytics/tests/quality/test_gaps.py b/pvanalytics/tests/quality/test_gaps.py index b3be0755..d4f18fd6 100644 --- a/pvanalytics/tests/quality/test_gaps.py +++ b/pvanalytics/tests/quality/test_gaps.py @@ -322,8 +322,8 @@ def test_start_stop_dates_with_gaps_in_middle(): assert end.date() == index[-1].date() -def test_trim(): - """gaps.trim() should return a boolean mask that selects only the good +def test_trim_incomplete(): + """gaps.trim_incomplete() should return a boolean mask that selects only the good data in the middle of a series. 
""" @@ -336,13 +336,13 @@ def test_trim(): series['01-02-2020':'01-07-2020 13:00'] = np.nan series['01-10-2020':'01-11-2020'] = np.nan assert_series_equal( - series[gaps.trim(series, days=3)], + series[gaps.trim_incomplete(series, days=3)], series['01-07-2020':'08-01-2020 00:00'] ) -def test_trim_empty(): - """gaps.trim() returns all False for series with no valid days.""" +def test_trim_incomplete_empty(): + """gaps.trim_incomplete() returns all False for series with no valid days.""" index = pd.date_range( freq='15T', start='01-01-2020', @@ -350,7 +350,7 @@ def test_trim_empty(): ) series = pd.Series(index=index, dtype='float64') series.iloc[::(24*60)] = 1 - assert (~gaps.trim(series, days=3)).all() + assert (~gaps.trim_incomplete(series, days=3)).all() def test_completeness_score_all_nans(): From 7ef91ea40d3e19cf70e0a8d54a338d16833f568f Mon Sep 17 00:00:00 2001 From: Will Vining Date: Thu, 14 May 2020 15:30:38 -0600 Subject: [PATCH 26/30] Update references to daily_completeness in documentation daily_completeness has been renamed completeness_score --- pvanalytics/quality/gaps.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pvanalytics/quality/gaps.py b/pvanalytics/quality/gaps.py index 66da25d2..f6e2a9c9 100644 --- a/pvanalytics/quality/gaps.py +++ b/pvanalytics/quality/gaps.py @@ -202,7 +202,7 @@ def complete(series, minimum_completeness=0.333, freq=None): A day has complete data if its completeness score is greater than or equal to `minimum_completeness`. The completeness score is - calculated by :py:func:`daily_completeness`. + calculated by :py:func:`completeness_score`. Parameters ---------- @@ -223,11 +223,11 @@ def complete(series, minimum_completeness=0.333, freq=None): Raises ------ ValueError - See :py:func:`daily_completeness`. + See :py:func:`completeness_score`. 
See Also -------- - :py:func:`daily_completeness` + :py:func:`completeness_score` """ return completeness_score(series, freq=freq) >= minimum_completeness From ef706f8c7fac50b8443377babf55c7afa59a4d86 Mon Sep 17 00:00:00 2001 From: Will Vining Date: Thu, 14 May 2020 15:52:12 -0600 Subject: [PATCH 27/30] Documentation for more general 'gaps.trim' function --- pvanalytics/quality/gaps.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/pvanalytics/quality/gaps.py b/pvanalytics/quality/gaps.py index f6e2a9c9..95e9190c 100644 --- a/pvanalytics/quality/gaps.py +++ b/pvanalytics/quality/gaps.py @@ -277,6 +277,28 @@ def start_stop_dates(series, days=10): return start, end +def trim(series, days=10): + """Mask the begining and end of the data if there are gaps. + + Parameters + ---------- + series : Series + A DatetimeIndexed series of booleans + days : int, default 10 + Minimum number of consecutive days that are all True for + 'good' data to start. + + Returns + ------- + Series + A series of booleans with True for all data points between the + first and last block of `days` consecutive days that are all + True in `series` + + """ + pass + + def trim_incomplete(series, minimum_completeness=0.333333, days=10, freq=None): """Mask out missing data from the beginning and end of the data. From 9fc7a59d8aa6cbf1b3fa2ee99428539dbadb7169 Mon Sep 17 00:00:00 2001 From: Will Vining Date: Tue, 19 May 2020 08:11:50 -0600 Subject: [PATCH 28/30] Generic function for trimming begining and end of time series refactored trim_incomplete to use the generic function. 
--- pvanalytics/quality/gaps.py | 15 ++++++------- pvanalytics/tests/quality/test_gaps.py | 30 +++++++++++++++++++++----- 2 files changed, 32 insertions(+), 13 deletions(-) diff --git a/pvanalytics/quality/gaps.py b/pvanalytics/quality/gaps.py index 95e9190c..9e7b7ba3 100644 --- a/pvanalytics/quality/gaps.py +++ b/pvanalytics/quality/gaps.py @@ -296,7 +296,11 @@ def trim(series, days=10): True in `series` """ - pass + start, end = start_stop_dates(series, days=days) + mask = pd.Series(False, index=series.index) + if start: + mask.loc[start.date():end.date()] = True + return mask def trim_incomplete(series, minimum_completeness=0.333333, days=10, freq=None): @@ -335,11 +339,6 @@ def trim_incomplete(series, minimum_completeness=0.333333, days=10, freq=None): :py:func:`completeness_score` """ - completeness = completeness_score(series, freq=freq, keep_index=False) + completeness = completeness_score(series, freq=freq) complete_days = completeness >= minimum_completeness - start, end = start_stop_dates(complete_days, days=days) - mask = pd.Series(index=series.index, dtype='bool') - mask.loc[:] = False - if start: - mask.loc[start.date():end.date()] = True - return mask + return trim(complete_days, days=days) diff --git a/pvanalytics/tests/quality/test_gaps.py b/pvanalytics/tests/quality/test_gaps.py index d4f18fd6..c4ce6452 100644 --- a/pvanalytics/tests/quality/test_gaps.py +++ b/pvanalytics/tests/quality/test_gaps.py @@ -323,10 +323,8 @@ def test_start_stop_dates_with_gaps_in_middle(): def test_trim_incomplete(): - """gaps.trim_incomplete() should return a boolean mask that selects only the good - data in the middle of a series. 
- - """ + """gaps.trim_incomplete() should return a boolean mask that selects + only the good data in the middle of a series.""" index = pd.date_range( freq='15T', start='01-01-2020', @@ -342,7 +340,8 @@ def test_trim_incomplete(): def test_trim_incomplete_empty(): - """gaps.trim_incomplete() returns all False for series with no valid days.""" + """gaps.trim_incomplete() returns all False for series with no valid + days.""" index = pd.date_range( freq='15T', start='01-01-2020', @@ -353,6 +352,27 @@ def test_trim_incomplete_empty(): assert (~gaps.trim_incomplete(series, days=3)).all() +def test_trim_daily_index(): + """trim works when data has a daily index.""" + data = pd.Series(True, index=pd.date_range( + start='1/1/2020', end='3/1/2020', freq='D', closed='left')) + assert gaps.trim(data).all() + data.iloc[0:8] = False + data.iloc[9] = False + expected = data.copy() + expected.iloc[0:10] = False + assert_series_equal( + expected, + gaps.trim(data) + ) + data.iloc[-5:] = False + expected.iloc[-5:] = False + assert_series_equal( + expected, + gaps.trim(data) + ) + + def test_completeness_score_all_nans(): """A data set with all nans has completeness 0 for each day.""" completeness = gaps.completeness_score( From 941bf79820f091deeffcaf625eefe04d0e407f6e Mon Sep 17 00:00:00 2001 From: Will Vining Date: Tue, 19 May 2020 11:29:17 -0600 Subject: [PATCH 29/30] Apply suggestions from code review Co-authored-by: Cliff Hansen --- pvanalytics/quality/gaps.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pvanalytics/quality/gaps.py b/pvanalytics/quality/gaps.py index 9e7b7ba3..fb1de36e 100644 --- a/pvanalytics/quality/gaps.py +++ b/pvanalytics/quality/gaps.py @@ -151,7 +151,7 @@ def completeness_score(series, freq=None, keep_index=True): The completeness score for a given day is the fraction of time in the day for which there is data (a value other than NaN). 
The time duration attributed to each value is equal to the timestamp - spacing of `series` or `freq` if it is specified. For example, a + spacing of `series`, or `freq` if it is specified. For example, a 24-hour time series with 30 minute timestamp spacing and 24 non-NaN values would have data for a total of 12 hours and therefore a completeness score of 0.5. @@ -306,7 +306,7 @@ def trim(series, days=10): def trim_incomplete(series, minimum_completeness=0.333333, days=10, freq=None): """Mask out missing data from the beginning and end of the data. - Removes data preceeding the start date and following the stop date + False for times preceeding the start date and following the stop date returned by :py:func:`start_stop_dates`. If no start and stop dates are identified then a series of all False is returned. From ddf6bb6631e40cad07c2a910dbf7947d78c285c4 Mon Sep 17 00:00:00 2001 From: Will Vining Date: Tue, 19 May 2020 12:21:29 -0600 Subject: [PATCH 30/30] Documentation improvements. --- docs/api.rst | 1 + pvanalytics/quality/gaps.py | 18 +++++++++++------- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index 38164600..c51f3cda 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -69,6 +69,7 @@ no data. The following functions can be used to remove those periods. quality.gaps.start_stop_dates quality.gaps.trim + quality.gaps.trim_incomplete Outliers -------- diff --git a/pvanalytics/quality/gaps.py b/pvanalytics/quality/gaps.py index fb1de36e..a08bf03c 100644 --- a/pvanalytics/quality/gaps.py +++ b/pvanalytics/quality/gaps.py @@ -278,7 +278,7 @@ def start_stop_dates(series, days=10): def trim(series, days=10): - """Mask the begining and end of the data if there are gaps. + """Mask the beginning and end of the data if not all True. 
Parameters ---------- @@ -293,7 +293,13 @@ def trim(series, days=10): Series A series of booleans with True for all data points between the first and last block of `days` consecutive days that are all - True in `series` + True in `series`. If `series` does not contain such a block of + consecutive True values, then the returned series will be + entirely False. + + See Also + -------- + :py:func:`start_stop_dates` """ start, end = start_stop_dates(series, days=days) @@ -304,11 +310,9 @@ def trim(series, days=10): def trim_incomplete(series, minimum_completeness=0.333333, days=10, freq=None): - """Mask out missing data from the beginning and end of the data. + """Trim the series based on the completeness score. - False for times preceeding the start date and following the stop date - returned by :py:func:`start_stop_dates`. If no start and stop - dates are identified then a series of all False is returned. + Combines :py:func:`completeness_score` and :py:func:`trim`. Parameters ---------- @@ -334,7 +338,7 @@ def trim_incomplete(series, minimum_completeness=0.333333, days=10, freq=None): See Also -------- - :py:func:`start_stop_dates` + :py:func:`trim` :py:func:`completeness_score`