Fix histogram statistics calculation
Due to the loss of precision involved in recalculating the wx and wx2
values, as well as the fact that ROOT does not scale the stored weights
when the histogram is scaled, we use a simpler implementation that just
takes the values from ROOT. However, recalculation is also implemented
for potential future use.
raymondEhlers committed Aug 18, 2019
1 parent c5b23a7 commit 6ee9f7b
Showing 2 changed files with 146 additions and 45 deletions.
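As background (this sketch is not part of the commit), the precision issue described in the commit message comes from recalculating the wx and wx2 sums from bin centers rather than from the exact x values used at fill time. A minimal numpy illustration with made-up fills:

import numpy as np

# Hypothetical fills: the exact x values and weights that ROOT sees at fill time.
rng = np.random.default_rng(42)
x_fills = rng.normal(loc=5.0, scale=1.5, size=10_000)
weights = np.ones_like(x_fills)

# Sums as ROOT stores them while filling (fTsumwx, fTsumwx2).
exact_sum_wx = np.sum(weights * x_fills)
exact_sum_wx2 = np.sum(weights * x_fills ** 2)

# Bin the data, then recalculate the same sums using only the bin centers.
bin_edges = np.linspace(0, 10, 11)
y, _ = np.histogram(x_fills, bins=bin_edges, weights=weights)
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
recalc_sum_wx = np.sum(y * bin_centers)
recalc_sum_wx2 = np.sum(y * bin_centers ** 2)

# The bin-center approximation drifts from the exact sums (most visibly for wx2),
# which is why the stored ROOT values are preferred when they are available.
print(exact_sum_wx, recalc_sum_wx)
print(exact_sum_wx2, recalc_sum_wx2)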
170 changes: 125 additions & 45 deletions pachyderm/histogram.py
@@ -247,15 +247,7 @@ def mean(self) -> float:
Returns:
Mean of the histogram.
"""
# Validation
if "total_sum_wx" not in self.metadata:
raise ValueError("Sum of weights * x is not available, so we cannot calculate the mean.")
# Calculate the mean.
total_sum_w = np.sum(self.y)
if total_sum_w > 0:
return cast(float, self.metadata["total_sum_wx"] / total_sum_w)
# Can't divide, so return NaN
return np.nan # type: ignore
return binned_mean(self.metadata)

@property
def std_dev(self) -> float:
@@ -268,7 +260,7 @@ def std_dev(self) -> float:
Returns:
Standard deviation of the histogram.
"""
return cast(float, np.sqrt(self.variance))
return binned_standard_deviation(self.metadata)

@property
def variance(self) -> float:
@@ -281,21 +273,7 @@ def variance(self) -> float:
Returns:
Variance of the histogram.
"""
# Validation
if "total_sum_wx" not in self.metadata:
raise ValueError("Sum of weights * x is not available, so we cannot calculate the variance.")
# Validation
if "total_sum_wx2" not in self.metadata:
raise ValueError("Sum of weights * x * x is not available, so we cannot calculate the variance.")
# Calculate the variance.
total_sum_w = np.sum(self.y)
if total_sum_w > 0:
return cast(
float,
(self.metadata["total_sum_wx2"] - (self.metadata["total_sum_wx"] ** 2 / total_sum_w)) / total_sum_w
)
# Can't divide, so return NaN
return np.nan # type: ignore
return binned_variance(self.metadata)

def find_bin(self, value: float) -> int:
""" Find the bin corresponding to the specified value.
@@ -591,15 +569,10 @@ def _from_uproot(hist: Any) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Dict[st

# Extract stats information. It will be stored in the metadata.
metadata = {}
# We extract the values directly from the members.
total_sum_w = hist._fTsumw
total_sum_w2 = hist._fTsumw2
total_sum_wx = hist._fTsumwx
total_sum_wx2 = hist._fTsumwx2
# Cross check.
assert np.isclose(total_sum_w, np.sum(y))
assert np.isclose(total_sum_w2, np.sum(errors))
metadata.update({"total_sum_wx": total_sum_wx, "total_sum_wx2": total_sum_wx2})
# We extract the values directly from the members.
metadata.update(_create_stats_dict_from_values(
hist._fTsumw, hist._fTsumw2, hist._fTsumwx, hist._fTsumwx2
))

return (bin_edges, y, errors, metadata)

@@ -645,18 +618,13 @@ def _from_th1(hist: Hist) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Dict[str,
# They are useful for calculating histogram properties (mean, variance, etc).
stats = np.array([0, 0, 0, 0], dtype = np.float64)
hist.GetStats(np.ctypeslib.as_ctypes(stats))
# Return values are (each one a single value):
# Return values are (each one is a single float):
# [1], [2], [3], [4]
# [1]: Sum of weights (equal to np.sum(y))
# [2]: Sum of weights squared (equal to np.sum(errors_squared))
# [3]: Sum of w*x
# [4]: Sum of w*x*x
total_sum_w, total_sum_w2, total_sum_wx, total_sum_wx2 = stats
# Cross checks.
assert np.isclose(np.sum(y), total_sum_w)
assert np.isclose(np.sum(errors), total_sum_w2)

metadata.update({"total_sum_wx": total_sum_wx, "total_sum_wx2": total_sum_wx2})
# [1]: total_sum_w: Sum of weights (equal to np.sum(y) if unscaled)
# [2]: total_sum_w2: Sum of weights squared (equal to np.sum(errors_squared) if unscaled)
# [3]: total_sum_wx: Sum of w*x
# [4]: total_sum_wx2: Sum of w*x*x
metadata.update(_create_stats_dict_from_values(*stats))

return (bin_edges, y, errors, metadata)

@@ -809,3 +777,115 @@ def find_bin(bin_edges: np.ndarray, value: float) -> int:
int,
np.searchsorted(bin_edges, value, side = "right") - 1
)

def _create_stats_dict_from_values(total_sum_w: float, total_sum_w2: float,
total_sum_wx: float, total_sum_wx2: float) -> Dict[str, float]:
""" Create a statistics dictionary from the provided set of values.
This is particularly useful for ensuring that the dictionary values are created uniformly.
Args:
total_sum_w: Total sum of the weights (ie. the frequencies).
total_sum_w2: Total sum of the weights squared (ie. sum of Sumw2 array).
total_sum_wx: Total sum of weights * x.
total_sum_wx2: Total sum of weights * x * x.
Returns:
Statistics dict suitable for storing in the metadata.
"""
return {
"_total_sum_w": total_sum_w, "_total_sum_w2": total_sum_w2,
"_total_sum_wx": total_sum_wx, "_total_sum_wx2": total_sum_wx2,
}

def calculate_binned_stats(bin_edges: np.ndarray, y: np.ndarray, weights_squared: np.ndarray) -> Dict[str, float]:
""" Calculate the stats needed to fully determine histogram properties.
The values are calculated the same way as in ``ROOT.TH1.GetStats(...)``. Recalculating the statistics
is not ideal because information is lost compared to the information available when filling the histogram.
In particular, the actual x value passed when filling is used to calculate these values, but we can
only approximate it with the bin center when recalculating them later. Calculating them here is
equivalent to calling ``hist.ResetStats()`` before retrieving the stats.
These results are accessible from the ROOT hist using ``ctypes`` via:
>>> stats = np.array([0, 0, 0, 0], dtype = np.float64)
>>> hist.GetStats(np.ctypeslib.as_ctypes(stats))
Note:
``sum_w`` and ``sum_w2`` calculated here are _not_ equal to the ROOT values if the histogram
has been scaled, because the stored fill weights don't change when the histogram is scaled.
If the hist stats are reset, that information is lost and the stats have to be reconstructed
from the current frequencies, at which point they agree with this function.
Args:
bin_edges: Histogram bin edges.
y: Histogram bin frequencies.
weights_squared: Filling weights squared (ie. this is the Sumw2 array).
Returns:
Stats dict containing the newly calculated statistics.
"""
x = bin_edges[:-1] + (bin_edges[1:] - bin_edges[:-1]) / 2
total_sum_w = np.sum(y)
total_sum_w2 = np.sum(weights_squared)
total_sum_wx = np.sum(y * x)
total_sum_wx2 = np.sum(y * x ** 2)
return _create_stats_dict_from_values(total_sum_w, total_sum_w2, total_sum_wx, total_sum_wx2)

def binned_mean(stats: Dict[str, float]) -> float:
""" Mean of values stored in the histogram.
Calculated in the same way as ROOT and physt.
Args:
stats: The histogram statistical properties.
Returns:
Mean of the histogram.
"""
# Validation
if "_total_sum_wx" not in stats:
raise ValueError("Sum of weights * x is not available, so we cannot calculate the mean.")
if "_total_sum_w" not in stats:
raise ValueError("Sum of weights is not available, so we cannot calculate the mean.")
# Calculate the mean.
total_sum_w = stats["_total_sum_w"]
if total_sum_w > 0:
return stats["_total_sum_wx"] / total_sum_w
# Can't divide, so return NaN
return np.nan # type: ignore

def binned_standard_deviation(stats: Dict[str, float]) -> float:
""" Standard deviation of the values filled into the histogram.
Calculated in the same way as ROOT and physt.
Args:
stats: The histogram statistical properties.
Returns:
Standard deviation of the histogram.
"""
return cast(float, np.sqrt(binned_variance(stats)))

def binned_variance(stats: Dict[str, float]) -> float:
""" Variance of the values filled into the histogram.
Calculated in the same way as ROOT and physt.
Args:
stats: The histogram statistical properties.
Returns:
Variance of the histogram.
"""
# Validation
if "_total_sum_wx" not in stats:
raise ValueError("Sum of weights * x is not available, so we cannot calculate the variance.")
if "_total_sum_wx2" not in stats:
raise ValueError("Sum of weights * x * x is not available, so we cannot calculate the variance.")
if "_total_sum_w" not in stats:
raise ValueError("Sum of weights is not available, so we cannot calculate the mean.")

# Calculate the variance.
total_sum_w = stats["_total_sum_w"]
if total_sum_w > 0:
return (stats["_total_sum_wx2"] - (stats["_total_sum_wx"] ** 2 / total_sum_w)) / total_sum_w
# Can't divide, so return NaN
return np.nan # type: ignore
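
For reference (not part of the diff), a short usage sketch of the new helpers, assuming the module is importable as in the tests (``from pachyderm import histogram``) and using made-up binned data. The derived moments follow the same formulas ROOT uses: mean = sum(w*x) / sum(w) and variance = sum(w*x^2) / sum(w) - mean^2.

import numpy as np
from pachyderm import histogram

# Hypothetical binned data: edges, bin frequencies, and the Sumw2-style array.
bin_edges = np.array([0.0, 1.0, 2.0, 3.0, 4.0])
y = np.array([2.0, 5.0, 7.0, 3.0])
errors_squared = np.array([2.0, 5.0, 7.0, 3.0])  # unweighted fills, so w2 == w

# Recalculate the stats from bin centers (equivalent to ROOT's ResetStats() + GetStats()).
stats = histogram.calculate_binned_stats(bin_edges, y, errors_squared)

mean = histogram.binned_mean(stats)                   # sum(w*x) / sum(w)
variance = histogram.binned_variance(stats)           # sum(w*x^2) / sum(w) - mean^2
std_dev = histogram.binned_standard_deviation(stats)  # sqrt(variance)
print(mean, variance, std_dev)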
21 changes: 21 additions & 0 deletions tests/test_histogram.py
@@ -301,6 +301,7 @@ def test_uproot_hist_to_histogram(setup_histogram_conversion: Any) -> None:
@pytest.mark.ROOT # type: ignore
def test_derived_properties(logging_mixin: Any, test_root_hists: Any) -> None:
""" Test derived histogram properties (mean, std. dev, variance, etc). """
# Setup
h_root = test_root_hists.hist1D
h = histogram.Histogram1D.from_existing_hist(h_root)

@@ -311,6 +312,26 @@ def test_derived_properties(logging_mixin: Any, test_root_hists: Any) -> None:
# Variance
assert np.isclose(h.variance, h_root.GetStdDev() ** 2)

@pytest.mark.ROOT # type: ignore
def test_recalculated_derived_properties(logging_mixin: Any, test_root_hists: Any) -> None:
""" Test derived histogram properties (mean, std. dev, variance, etc). """
# Setup
h_root = test_root_hists.hist1D
h = histogram.Histogram1D.from_existing_hist(h_root)
stats = histogram.calculate_binned_stats(h.bin_edges, h.y, h.errors_squared)

# Need to reset the stats to force ROOT to recalculate them.
# We do it after converting just to be clear that we're not taking advantage of the ROOT
# calculation (although we couldn't anyway given the current way that histogram is built).
h_root.ResetStats()
# Now check the results.
# Mean
assert np.isclose(histogram.binned_mean(stats), h_root.GetMean())
# Standard deviation
assert np.isclose(histogram.binned_standard_deviation(stats), h_root.GetStdDev())
# Variance
assert np.isclose(histogram.binned_variance(stats), h_root.GetStdDev() ** 2)

@pytest.mark.parametrize("bin_edges, y, errors_squared", [ # type: ignore
(np.array([1, 2, 3]), np.array([1, 2, 3]), np.array([1, 2, 3])),
(np.array([1, 2, 3]), np.array([1, 2, 3]), np.array([1, 2])),

