Merge 9dea9d6 into aa04b16

pvlib · Sep 29, 2020 · d7d242e · d7d242e
2 parents aa04b16 + 9dea9d6
commit d7d242e
Show file tree

Hide file tree

Showing 6 changed files with 233 additions and 25 deletions.
diff --git a/docs/api.rst b/docs/api.rst
@@ -179,6 +179,7 @@ Functions for identifying inverter clipping
 
    features.clipping.levels
    features.clipping.threshold
+   features.clipping.geometric
 
 Clearsky
 --------

diff --git a/pvanalytics/features/clipping.py b/pvanalytics/features/clipping.py
@@ -1,6 +1,9 @@
 """Functions for identifying clipping."""
 import pandas as pd
 import numpy as np
+from pandas.tseries import frequencies
+
+from pvanalytics.util import _normalize, _group
 
 
 def _detect_levels(x, count=3, num_bins=100):
@@ -223,3 +226,107 @@ def threshold(ac_power, slope_max=0.0035, power_min=0.75,
         freq=freq
     )
     return ac_power >= threshold
+
+
+def _estimate_freq_minutes(index):
+    # Estimate the timestamp frequency in minutes.
+    #
+    # Parameters
+    # ----------
+    # index : DatetimeIndex
+    #
+    # Returns
+    # -------
+    # float
+    #     Timestamp spacing in minutes.
+    freq = pd.infer_freq(index)
+    if freq:
+        return pd.to_timedelta(frequencies.to_offset(freq)).seconds / 60
+    return index.to_series().diff().astype('timedelta64[m]').mode()[0]
+
+
+def geometric(power_ac, clip_min=0.8, daily_fraction_min=0.9,
+              length_min=2, margin=0.01,
+              freq_minutes=None, derivative_max=None):
+    """Identify inverter clipping from the shape of the AC power output.
+
+    `power_ac` is normalized, then on each day, `power_ac` values which appear to be
+     clipped are identified according to the following criteria:
+
+    - the normalized value is greater than `clip_min`
+    - the forward or backward difference is less than `derivative_max`
+    - the value is at least `daily_fraction_min` of the maximum value
+      on the same day
+    - the value is part of a  sequence of `length_min` consecutive values
+      which meet the above criteria
+
+    On each day, the values that meet these criteria are flagged and 
+    are used to identify a daily threshold. The daily threshold is the
+    minimum of the flagged values minus `margin`. Any flagged value
+    on that day that is greater than or equal to the threshold is 
+    considered clipped and the resulting mask is returned.
+
+    Parameters
+    ----------
+    power_ac : Series
+        AC power.
+    clip_min : float, default 0.8
+        After normalization a clipped value must be greater than `clip_min`
+    daily_fraction_min : float, default 0.9
+        A clipped value must be greater than `daily_fraction_min` times the
+        maximum value on the same day.
+    length_min : int, default 2
+        Minimum number of sequential values that satisfy clipping criteria.
+    margin : float, default 0.01
+        Values within +/- `margin` of the daily clipping threshold are flagged
+        as clipped.
+    freq_minutes : float, optional
+        Timestamp spacing in minutes. If not specified timestamp spacing
+        will be inferred from the index of `power_ac`.
+    derivative_max : float, optional
+        Values with a derivative less than `derivative_max` are
+        potentially clipped. If not specified the value is based on
+        the timestamp spacing in `power_ac` according to the following
+        formula:
+
+        .. math::
+
+           0.00005 f_m + 0.0009
+
+        where :math:`f_m` is the timestamp spacing in minutes.
+
+    Returns
+    -------
+    Series
+        Boolean series with True for values that appear to be clipped.
+    """
+    if freq_minutes is None:
+        freq_minutes = _estimate_freq_minutes(power_ac.index)
+    if derivative_max is None:
+        # Experimentally derived formula from the PVFleets project
+        derivative_max = (0.00005 * freq_minutes) + 0.0009
+    power_ac = power_ac.copy()
+    power_ac.loc[power_ac < 0] = 0
+    power_normalized = _normalize.min_max(power_ac)
+    forward_diff = abs(power_normalized - power_normalized.shift(-1))
+    backward_diff = abs(power_normalized - power_normalized.shift(1))
+    daily_max = _group.by_day(power_normalized).transform('max')
+    fraction_of_max = power_normalized / daily_max
+    candidate_clipping = ((power_normalized >= clip_min)
+                          & ((forward_diff <= derivative_max)
+                             | (backward_diff <= derivative_max))
+                          & (fraction_of_max >= daily_fraction_min))
+    # clipped values must be part of a sequence of clipped values
+    # that is longer than `length_min`
+    valid_sequence = _group.run_lengths(candidate_clipping) > length_min
+    clipping = candidate_clipping & valid_sequence
+    # Establish a clipping threshold for each day. The threshold is the
+    # minimum candidate clipping value on that day (or NaN on days with
+    # no clipping).
+    daily_clipping_threshold = _group.by_day(
+        power_normalized[clipping].reindex_like(power_normalized)
+    ).transform('min')
+    # For days with clipping, any value greater than the daily clipping
+    # threshold is flagged as clipped. (Days without clipping are implicitly
+    # flagged False here because the clipping threshold on those days is NaN.)
+    return (daily_clipping_threshold - power_normalized) <= margin
diff --git a/pvanalytics/features/daytime.py b/pvanalytics/features/daytime.py
@@ -2,6 +2,7 @@
 import numpy as np
 import pandas as pd
 from pandas.tseries import frequencies
+from pvanalytics.util import _normalize, _group
 
 
 def _rolling_by_minute(data, days, f):
@@ -18,29 +19,6 @@ def _rolling_by_minute(data, days, f):
     return result.sort_index()
 
 
-def _run_lengths(series):
-    # Count the number of equal values adjacent to each value.
-    #
-    # Examples
-    # --------
-    # >>> _run_lengths(pd.Series([True, True, True]))
-    # 0    3
-    # 1    3
-    # 2    3
-    #
-    # >>> _run_lengths(
-    # ...     pd.Series([True, False, False, True, True, False]
-    # ... ))
-    # 0    1
-    # 1    2
-    # 2    2
-    # 3    2
-    # 4    2
-    # 5    1
-    runs = (series != series.shift(1)).cumsum()
-    return runs.groupby(runs).transform('count')
-
-
 def _correct_if_invalid(series, invalid, correction_window):
     # For every time marked `invalid` replace the truth value in `series`
     # with the truth value at the same time in the majority of the
@@ -58,7 +36,7 @@ def _correct_midday_errors(night, minutes_per_value, hours_min,
                            correction_window):
     # identify periods of time that appear to switch from night to day
     # (or day to night) on too short a time scale to be reasonable.
-    invalid = _run_lengths(night)*minutes_per_value <= hours_min*60
+    invalid = _group.run_lengths(night)*minutes_per_value <= hours_min*60
     return _correct_if_invalid(night, invalid, correction_window)
 
 
@@ -99,7 +77,7 @@ def _filter_and_normalize(series, outliers):
     if outliers is not None:
         series.loc[outliers] = np.nan
     series.loc[series < 0] = 0
-    return (series - series.min()) / (series.max() - series.min())
+    return _normalize.min_max(series)
 
 
 def _freqstr_to_minutes(freqstr):

diff --git a/pvanalytics/tests/features/test_clipping.py b/pvanalytics/tests/features/test_clipping.py
@@ -232,3 +232,62 @@ def test_threshold_no_clipping_four_days(quadratic):
     clipped = clipping.threshold(power)
 
     assert not clipped.any()
+
+
+def test_geometric_no_clipping(albuquerque):
+    clearsky = albuquerque.get_clearsky(
+        pd.date_range(
+            start='7/1/2020',
+            end='8/1/2020',
+            freq='15T'
+        ),
+        model='simplified_solis'
+    )
+    clipped = clipping.geometric(clearsky['ghi'])
+    assert not clipped.any()
+
+
+@pytest.fixture(scope='module',
+                params=['H', '15T', '30T'])
+def simple_clipped(request, albuquerque):
+    clearsky = albuquerque.get_clearsky(
+        pd.date_range(
+            start='7/1/2020',
+            end='8/1/2020',
+            closed='left',
+            freq=request.param
+        ),
+        model='simplified_solis'
+    )
+    ghi = clearsky['ghi'].copy()
+    level = ghi.quantile(0.5)
+    ghi.loc[ghi > level] = level
+    expected = ghi >= level - 0.25
+    return {
+        'data': ghi,
+        'expected': expected,
+        'level': level
+    }
+
+
+def test_geometric_simple_clipping(simple_clipped):
+    assert_series_equal(
+        clipping.geometric(simple_clipped['data'], margin=0),
+        simple_clipped['expected'],
+        check_names=False
+    )
+
+
+def test_geometric_uneven_timestamps(simple_clipped):
+    data = simple_clipped['data'].copy()
+    data.loc['7/3/2020 10:00':'7/3/2020 12:00'] = np.nan
+    data.loc['7/10/2020 01:00':'7/10/2020 03:00'] = np.nan
+    freq = pd.infer_freq(data.index)
+    if freq != 'H':
+        data.loc['7/10/2020 13:00':'7/3/2020 13:30'] = np.nan
+    data.dropna(inplace=True)
+    assert_series_equal(
+        clipping.geometric(data, margin=0),
+        simple_clipped['expected'][data.index],
+        check_names=False
+    )
diff --git a/pvanalytics/util/_group.py b/pvanalytics/util/_group.py
@@ -43,3 +43,38 @@ def by_minute(data):
 
     """
     return data.groupby(data.index.minute + data.index.hour * 60)
+
+
+def run_lengths(series):
+    """
+    Count the number of adjacent values that are equal.
+
+    Paramters
+    ---------
+    series : Series
+
+    Returns
+    -------
+    Series
+        An integer for each value specifying the number of adjacent
+        values that are equal.
+
+    Examples
+    --------
+    >>> run_lengths(pd.Series([True, True, True]))
+    0    3
+    1    3
+    2    3
+
+    >>> run_lengths(
+    ...     pd.Series([True, False, False, True, True, False]
+    ... ))
+    0    1
+    1    2
+    2    2
+    3    2
+    4    2
+    5    1
+    """
+    runs = (series != series.shift(1)).cumsum()
+    return runs.groupby(runs).transform('count')
diff --git a/pvanalytics/util/_normalize.py b/pvanalytics/util/_normalize.py
@@ -0,0 +1,28 @@
+"""Functions for normalizing data."""
+
+
+def min_max(data):
+    """Normalize data by its minimum and maximum.
+
+    The following formula is used for normalization
+
+    .. math::
+
+      (x - min(data)) / (max(data) - min(data))
+
+    Parameters
+    ----------
+    data : Series
+        Series of numeric data
+
+    Returns
+    -------
+    Series
+
+        `data` normalized to the range [0, 1] according to the formula
+        above.
+
+    """
+    minimum = data.min()
+    maximum = data.max()
+    return (data - minimum) / (maximum - minimum)