Skip to content

Commit

Permalink
Merge 9dea9d6 into aa04b16
Browse files Browse the repository at this point in the history
  • Loading branch information
wfvining committed Sep 29, 2020
2 parents aa04b16 + 9dea9d6 commit d7d242e
Show file tree
Hide file tree
Showing 6 changed files with 233 additions and 25 deletions.
1 change: 1 addition & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ Functions for identifying inverter clipping

features.clipping.levels
features.clipping.threshold
features.clipping.geometric

Clearsky
--------
Expand Down
107 changes: 107 additions & 0 deletions pvanalytics/features/clipping.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
"""Functions for identifying clipping."""
import pandas as pd
import numpy as np
from pandas.tseries import frequencies

from pvanalytics.util import _normalize, _group


def _detect_levels(x, count=3, num_bins=100):
Expand Down Expand Up @@ -223,3 +226,107 @@ def threshold(ac_power, slope_max=0.0035, power_min=0.75,
freq=freq
)
return ac_power >= threshold


def _estimate_freq_minutes(index):
# Estimate the timestamp frequency in minutes.
#
# Parameters
# ----------
# index : DatetimeIndex
#
# Returns
# -------
# float
# Timestamp spacing in minutes.
freq = pd.infer_freq(index)
if freq:
return pd.to_timedelta(frequencies.to_offset(freq)).seconds / 60
return index.to_series().diff().astype('timedelta64[m]').mode()[0]


def geometric(power_ac, clip_min=0.8, daily_fraction_min=0.9,
length_min=2, margin=0.01,
freq_minutes=None, derivative_max=None):
"""Identify inverter clipping from the shape of the AC power output.
`power_ac` is normalized, then on each day, `power_ac` values which appear to be
clipped are identified according to the following criteria:
- the normalized value is greater than `clip_min`
- the forward or backward difference is less than `derivative_max`
- the value is at least `daily_fraction_min` of the maximum value
on the same day
- the value is part of a sequence of `length_min` consecutive values
which meet the above criteria
On each day, the values that meet these criteria are flagged and
are used to identify a daily threshold. The daily threshold is the
minimum of the flagged values minus `margin`. Any flagged value
on that day that is greater than or equal to the threshold is
considered clipped and the resulting mask is returned.
Parameters
----------
power_ac : Series
AC power.
clip_min : float, default 0.8
After normalization a clipped value must be greater than `clip_min`
daily_fraction_min : float, default 0.9
A clipped value must be greater than `daily_fraction_min` times the
maximum value on the same day.
length_min : int, default 2
Minimum number of sequential values that satisfy clipping criteria.
margin : float, default 0.01
Values within +/- `margin` of the daily clipping threshold are flagged
as clipped.
freq_minutes : float, optional
Timestamp spacing in minutes. If not specified timestamp spacing
will be inferred from the index of `power_ac`.
derivative_max : float, optional
Values with a derivative less than `derivative_max` are
potentially clipped. If not specified the value is based on
the timestamp spacing in `power_ac` according to the following
formula:
.. math::
0.00005 f_m + 0.0009
where :math:`f_m` is the timestamp spacing in minutes.
Returns
-------
Series
Boolean series with True for values that appear to be clipped.
"""
if freq_minutes is None:
freq_minutes = _estimate_freq_minutes(power_ac.index)
if derivative_max is None:
# Experimentally derived formula from the PVFleets project
derivative_max = (0.00005 * freq_minutes) + 0.0009
power_ac = power_ac.copy()
power_ac.loc[power_ac < 0] = 0
power_normalized = _normalize.min_max(power_ac)
forward_diff = abs(power_normalized - power_normalized.shift(-1))
backward_diff = abs(power_normalized - power_normalized.shift(1))
daily_max = _group.by_day(power_normalized).transform('max')
fraction_of_max = power_normalized / daily_max
candidate_clipping = ((power_normalized >= clip_min)
& ((forward_diff <= derivative_max)
| (backward_diff <= derivative_max))
& (fraction_of_max >= daily_fraction_min))
# clipped values must be part of a sequence of clipped values
# that is longer than `length_min`
valid_sequence = _group.run_lengths(candidate_clipping) > length_min
clipping = candidate_clipping & valid_sequence
# Establish a clipping threshold for each day. The threshold is the
# minimum candidate clipping value on that day (or NaN on days with
# no clipping).
daily_clipping_threshold = _group.by_day(
power_normalized[clipping].reindex_like(power_normalized)
).transform('min')
# For days with clipping, any value greater than the daily clipping
# threshold is flagged as clipped. (Days without clipping are implicitly
# flagged False here because the clipping threshold on those days is NaN.)
return (daily_clipping_threshold - power_normalized) <= margin
28 changes: 3 additions & 25 deletions pvanalytics/features/daytime.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import numpy as np
import pandas as pd
from pandas.tseries import frequencies
from pvanalytics.util import _normalize, _group


def _rolling_by_minute(data, days, f):
Expand All @@ -18,29 +19,6 @@ def _rolling_by_minute(data, days, f):
return result.sort_index()


def _run_lengths(series):
# Count the number of equal values adjacent to each value.
#
# Examples
# --------
# >>> _run_lengths(pd.Series([True, True, True]))
# 0 3
# 1 3
# 2 3
#
# >>> _run_lengths(
# ... pd.Series([True, False, False, True, True, False]
# ... ))
# 0 1
# 1 2
# 2 2
# 3 2
# 4 2
# 5 1
runs = (series != series.shift(1)).cumsum()
return runs.groupby(runs).transform('count')


def _correct_if_invalid(series, invalid, correction_window):
# For every time marked `invalid` replace the truth value in `series`
# with the truth value at the same time in the majority of the
Expand All @@ -58,7 +36,7 @@ def _correct_midday_errors(night, minutes_per_value, hours_min,
correction_window):
# identify periods of time that appear to switch from night to day
# (or day to night) on too short a time scale to be reasonable.
invalid = _run_lengths(night)*minutes_per_value <= hours_min*60
invalid = _group.run_lengths(night)*minutes_per_value <= hours_min*60
return _correct_if_invalid(night, invalid, correction_window)


Expand Down Expand Up @@ -99,7 +77,7 @@ def _filter_and_normalize(series, outliers):
if outliers is not None:
series.loc[outliers] = np.nan
series.loc[series < 0] = 0
return (series - series.min()) / (series.max() - series.min())
return _normalize.min_max(series)


def _freqstr_to_minutes(freqstr):
Expand Down
59 changes: 59 additions & 0 deletions pvanalytics/tests/features/test_clipping.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,3 +232,62 @@ def test_threshold_no_clipping_four_days(quadratic):
clipped = clipping.threshold(power)

assert not clipped.any()


def test_geometric_no_clipping(albuquerque):
clearsky = albuquerque.get_clearsky(
pd.date_range(
start='7/1/2020',
end='8/1/2020',
freq='15T'
),
model='simplified_solis'
)
clipped = clipping.geometric(clearsky['ghi'])
assert not clipped.any()


@pytest.fixture(scope='module',
params=['H', '15T', '30T'])
def simple_clipped(request, albuquerque):
clearsky = albuquerque.get_clearsky(
pd.date_range(
start='7/1/2020',
end='8/1/2020',
closed='left',
freq=request.param
),
model='simplified_solis'
)
ghi = clearsky['ghi'].copy()
level = ghi.quantile(0.5)
ghi.loc[ghi > level] = level
expected = ghi >= level - 0.25
return {
'data': ghi,
'expected': expected,
'level': level
}


def test_geometric_simple_clipping(simple_clipped):
assert_series_equal(
clipping.geometric(simple_clipped['data'], margin=0),
simple_clipped['expected'],
check_names=False
)


def test_geometric_uneven_timestamps(simple_clipped):
data = simple_clipped['data'].copy()
data.loc['7/3/2020 10:00':'7/3/2020 12:00'] = np.nan
data.loc['7/10/2020 01:00':'7/10/2020 03:00'] = np.nan
freq = pd.infer_freq(data.index)
if freq != 'H':
data.loc['7/10/2020 13:00':'7/3/2020 13:30'] = np.nan
data.dropna(inplace=True)
assert_series_equal(
clipping.geometric(data, margin=0),
simple_clipped['expected'][data.index],
check_names=False
)
35 changes: 35 additions & 0 deletions pvanalytics/util/_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,38 @@ def by_minute(data):
"""
return data.groupby(data.index.minute + data.index.hour * 60)


def run_lengths(series):
"""
Count the number of adjacent values that are equal.
Paramters
---------
series : Series
Returns
-------
Series
An integer for each value specifying the number of adjacent
values that are equal.
Examples
--------
>>> run_lengths(pd.Series([True, True, True]))
0 3
1 3
2 3
>>> run_lengths(
... pd.Series([True, False, False, True, True, False]
... ))
0 1
1 2
2 2
3 2
4 2
5 1
"""
runs = (series != series.shift(1)).cumsum()
return runs.groupby(runs).transform('count')
28 changes: 28 additions & 0 deletions pvanalytics/util/_normalize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""Functions for normalizing data."""


def min_max(data):
"""Normalize data by its minimum and maximum.
The following formula is used for normalization
.. math::
(x - min(data)) / (max(data) - min(data))
Parameters
----------
data : Series
Series of numeric data
Returns
-------
Series
`data` normalized to the range [0, 1] according to the formula
above.
"""
minimum = data.min()
maximum = data.max()
return (data - minimum) / (maximum - minimum)

0 comments on commit d7d242e

Please sign in to comment.