Fix histogram statistics calculation
Due to the loss of precision involved in recalculating the wx and wx2
values, as well as the fact that ROOT does not scale the stored weights
when the histogram is scaled, we use a simpler implementation that just
takes the values from ROOT. However, recalculation is also implemented
for potential future use.
raymondEhlers committed Aug 18, 2019
1 parent c5b23a7 commit 6ee9f7b
Showing 2 changed files with 146 additions and 45 deletions.
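As background (this sketch is not part of the commit), the precision issue described in the commit message comes from recalculating the wx and wx2 sums from bin centers rather than from the exact x values used at fill time. A minimal numpy illustration with made-up fills:

import numpy as np

# Hypothetical fills: the exact x values and weights that ROOT sees at fill time.
rng = np.random.default_rng(42)
x_fills = rng.normal(loc=5.0, scale=1.5, size=10_000)
weights = np.ones_like(x_fills)

# Sums as ROOT stores them while filling (fTsumwx, fTsumwx2).
exact_sum_wx = np.sum(weights * x_fills)
exact_sum_wx2 = np.sum(weights * x_fills ** 2)

# Bin the data, then recalculate the same sums using only the bin centers.
bin_edges = np.linspace(0, 10, 11)
y, _ = np.histogram(x_fills, bins=bin_edges, weights=weights)
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
recalc_sum_wx = np.sum(y * bin_centers)
recalc_sum_wx2 = np.sum(y * bin_centers ** 2)

# The bin-center approximation drifts from the exact sums (most visibly for wx2),
# which is why the stored ROOT values are preferred when they are available.
print(exact_sum_wx, recalc_sum_wx)
print(exact_sum_wx2, recalc_sum_wx2)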
170 changes: 125 additions & 45 deletions pachyderm/histogram.py
@@ -247,15 +247,7 @@ def mean(self) -> float:
Returns:
Mean of the histogram.
"""
# Validation
if "total_sum_wx" not in self.metadata:
raise ValueError("Sum of weights * x is not available, so we cannot calculate the mean.")
# Calculate the mean.
total_sum_w = np.sum(self.y)
if total_sum_w > 0:
return cast(float, self.metadata["total_sum_wx"] / total_sum_w)
# Can't divide, so return NaN
return np.nan # type: ignore
return binned_mean(self.metadata)

@property
def std_dev(self) -> float:
@@ -268,7 +260,7 @@ def std_dev(self) -> float:
Returns:
Standard deviation of the histogram.
"""
return cast(float, np.sqrt(self.variance))
return binned_standard_deviation(self.metadata)

@property
def variance(self) -> float:
@@ -281,21 +273,7 @@ def variance(self) -> float:
Returns:
Variance of the histogram.
"""
# Validation
if "total_sum_wx" not in self.metadata:
raise ValueError("Sum of weights * x is not available, so we cannot calculate the variance.")
# Validation
if "total_sum_wx2" not in self.metadata:
raise ValueError("Sum of weights * x * x is not available, so we cannot calculate the variance.")
# Calculate the variance.
total_sum_w = np.sum(self.y)
if total_sum_w > 0:
return cast(
float,
(self.metadata["total_sum_wx2"] - (self.metadata["total_sum_wx"] ** 2 / total_sum_w)) / total_sum_w
)
# Can't divide, so return NaN
return np.nan # type: ignore
return binned_variance(self.metadata)

def find_bin(self, value: float) -> int:
""" Find the bin corresponding to the specified value.
@@ -591,15 +569,10 @@ def _from_uproot(hist: Any) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Dict[st

# Extract stats information. It will be stored in the metadata.
metadata = {}
# We extract the values directly from the members.
total_sum_w = hist._fTsumw
total_sum_w2 = hist._fTsumw2
total_sum_wx = hist._fTsumwx
total_sum_wx2 = hist._fTsumwx2
# Cross check.
assert np.isclose(total_sum_w, np.sum(y))
assert np.isclose(total_sum_w2, np.sum(errors))
metadata.update({"total_sum_wx": total_sum_wx, "total_sum_wx2": total_sum_wx2})
# We extract the values directly from the members.
metadata.update(_create_stats_dict_from_values(
hist._fTsumw, hist._fTsumw2, hist._fTsumwx, hist._fTsumwx2
))

return (bin_edges, y, errors, metadata)

@@ -645,18 +618,13 @@ def _from_th1(hist: Hist) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Dict[str,
# They are useful for calculating histogram properties (mean, variance, etc).
stats = np.array([0, 0, 0, 0], dtype = np.float64)
hist.GetStats(np.ctypeslib.as_ctypes(stats))
# Return values are (each one a single value):
# Return values are (each one is a single float):
# [1], [2], [3], [4]
# [1]: Sum of weights (equal to np.sum(y))
# [2]: Sum of weights squared (equal to np.sum(errors_squared))
# [3]: Sum of w*x
# [4]: Sum of w*x*x
total_sum_w, total_sum_w2, total_sum_wx, total_sum_wx2 = stats
# Cross checks.
assert np.isclose(np.sum(y), total_sum_w)
assert np.isclose(np.sum(errors), total_sum_w2)

metadata.update({"total_sum_wx": total_sum_wx, "total_sum_wx2": total_sum_wx2})
# [1]: total_sum_w: Sum of weights (equal to np.sum(y) if unscaled)
# [2]: total_sum_w2: Sum of weights squared (equal to np.sum(errors_squared) if unscaled)
# [3]: total_sum_wx: Sum of w*x
# [4]: total_sum_wx2: Sum of w*x*x
metadata.update(_create_stats_dict_from_values(*stats))

return (bin_edges, y, errors, metadata)

@@ -809,3 +777,115 @@ def find_bin(bin_edges: np.ndarray, value: float) -> int:
int,
np.searchsorted(bin_edges, value, side = "right") - 1
)

def _create_stats_dict_from_values(total_sum_w: float, total_sum_w2: float,
total_sum_wx: float, total_sum_wx2: float) -> Dict[str, float]:
""" Create a statistics dictionary from the provided set of values.
This is particularly useful for ensuring that the dictionary values are created uniformly.
Args:
total_sum_w: Total sum of the weights (ie. the frequencies).
total_sum_w2: Total sum of the weights squared (ie. sum of Sumw2 array).
total_sum_wx: Total sum of weights * x.
total_sum_wx2: Total sum of weights * x * x.
Returns:
Statistics dict suitable for storing in the metadata.
"""
return {
"_total_sum_w": total_sum_w, "_total_sum_w2": total_sum_w2,
"_total_sum_wx": total_sum_wx, "_total_sum_wx2": total_sum_wx2,
}

def calculate_binned_stats(bin_edges: np.ndarray, y: np.ndarray, weights_squared: np.ndarray) -> Dict[str, float]:
""" Calculate the stats needed to fully determine histogram properties.
The values are calculated the same way as in ``ROOT.TH1.GetStats(...)``. Recalculating the statistics
is not ideal because information is lost compared to the information available when filling the histogram.
In particular, the actual x value passed when filling is used to calculate these values, but we can
only approximate it with the bin center when recalculating them later. Calculating them here is
equivalent to calling ``hist.ResetStats()`` before retrieving the stats.
These results are accessible from the ROOT hist using ``ctypes`` via:
>>> stats = np.array([0, 0, 0, 0], dtype = np.float64)
>>> hist.GetStats(np.ctypeslib.as_ctypes(stats))
Note:
``sum_w`` and ``sum_w2`` calculated here are _not_ equal to the ROOT values if the histogram
has been scaled, because the stored fill weights don't change when the histogram is scaled.
If the hist stats are reset, that information is lost and the stats have to be reconstructed
from the current frequencies, at which point they agree with this function.
Args:
bin_edges: Histogram bin edges.
y: Histogram bin frequencies.
weights_squared: Filling weights squared (ie. this is the Sumw2 array).
Returns:
Stats dict containing the newly calculated statistics.
"""
x = bin_edges[:-1] + (bin_edges[1:] - bin_edges[:-1]) / 2
total_sum_w = np.sum(y)
total_sum_w2 = np.sum(weights_squared)
total_sum_wx = np.sum(y * x)
total_sum_wx2 = np.sum(y * x ** 2)
return _create_stats_dict_from_values(total_sum_w, total_sum_w2, total_sum_wx, total_sum_wx2)

def binned_mean(stats: Dict[str, float]) -> float:
""" Mean of values stored in the histogram.
Calculated in the same way as ROOT and physt.
Args:
stats: The histogram statistical properties.
Returns:
Mean of the histogram.
"""
# Validation
if "_total_sum_wx" not in stats:
raise ValueError("Sum of weights * x is not available, so we cannot calculate the mean.")
if "_total_sum_w" not in stats:
raise ValueError("Sum of weights is not available, so we cannot calculate the mean.")
# Calculate the mean.
total_sum_w = stats["_total_sum_w"]
if total_sum_w > 0:
return stats["_total_sum_wx"] / total_sum_w
# Can't divide, so return NaN
return np.nan # type: ignore

def binned_standard_deviation(stats: Dict[str, float]) -> float:
""" Standard deviation of the values filled into the histogram.
Calculated in the same way as ROOT and physt.
Args:
stats: The histogram statistical properties.
Returns:
Standard deviation of the histogram.
"""
return cast(float, np.sqrt(binned_variance(stats)))

def binned_variance(stats: Dict[str, float]) -> float:
""" Variance of the values filled into the histogram.
Calculated in the same way as ROOT and physt.
Args:
stats: The histogram statistical properties.
Returns:
Variance of the histogram.
"""
# Validation
if "_total_sum_wx" not in stats:
raise ValueError("Sum of weights * x is not available, so we cannot calculate the variance.")
if "_total_sum_wx2" not in stats:
raise ValueError("Sum of weights * x * x is not available, so we cannot calculate the variance.")
if "_total_sum_w" not in stats:
raise ValueError("Sum of weights is not available, so we cannot calculate the mean.")

# Calculate the variance.
total_sum_w = stats["_total_sum_w"]
if total_sum_w > 0:
return (stats["_total_sum_wx2"] - (stats["_total_sum_wx"] ** 2 / total_sum_w)) / total_sum_w
# Can't divide, so return NaN
return np.nan # type: ignore
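
For reference (not part of the diff), a short usage sketch of the new helpers, assuming the module is importable as in the tests (``from pachyderm import histogram``) and using made-up binned data. The derived moments follow the same formulas ROOT uses: mean = sum(w*x) / sum(w) and variance = sum(w*x^2) / sum(w) - mean^2.

import numpy as np
from pachyderm import histogram

# Hypothetical binned data: edges, bin frequencies, and the Sumw2-style array.
bin_edges = np.array([0.0, 1.0, 2.0, 3.0, 4.0])
y = np.array([2.0, 5.0, 7.0, 3.0])
errors_squared = np.array([2.0, 5.0, 7.0, 3.0])  # unweighted fills, so w2 == w

# Recalculate the stats from bin centers (equivalent to ROOT's ResetStats() + GetStats()).
stats = histogram.calculate_binned_stats(bin_edges, y, errors_squared)

mean = histogram.binned_mean(stats)                   # sum(w*x) / sum(w)
variance = histogram.binned_variance(stats)           # sum(w*x^2) / sum(w) - mean^2
std_dev = histogram.binned_standard_deviation(stats)  # sqrt(variance)
print(mean, variance, std_dev)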
21 changes: 21 additions & 0 deletions tests/test_histogram.py
@@ -301,6 +301,7 @@ def test_uproot_hist_to_histogram(setup_histogram_conversion: Any) -> None:
@pytest.mark.ROOT # type: ignore
def test_derived_properties(logging_mixin: Any, test_root_hists: Any) -> None:
""" Test derived histogram properties (mean, std. dev, variance, etc). """
# Setup
h_root = test_root_hists.hist1D
h = histogram.Histogram1D.from_existing_hist(h_root)

@@ -311,6 +312,26 @@ def test_derived_properties(logging_mixin: Any, test_root_hists: Any) -> None:
# Variance
assert np.isclose(h.variance, h_root.GetStdDev() ** 2)

@pytest.mark.ROOT # type: ignore
def test_recalculated_derived_properties(logging_mixin: Any, test_root_hists: Any) -> None:
""" Test derived histogram properties (mean, std. dev, variance, etc). """
# Setup
h_root = test_root_hists.hist1D
h = histogram.Histogram1D.from_existing_hist(h_root)
stats = histogram.calculate_binned_stats(h.bin_edges, h.y, h.errors_squared)

# Need to reset the stats to force ROOT to recalculate them.
# We do it after converting just to be clear that we're not taking advantage of the ROOT
# calculation (although we couldn't anyway given the current way that histogram is built).
h_root.ResetStats()
# Now check the results.
# Mean
assert np.isclose(histogram.binned_mean(stats), h_root.GetMean())
# Standard deviation
assert np.isclose(histogram.binned_standard_deviation(stats), h_root.GetStdDev())
# Variance
assert np.isclose(histogram.binned_variance(stats), h_root.GetStdDev() ** 2)

@pytest.mark.parametrize("bin_edges, y, errors_squared", [ # type: ignore
(np.array([1, 2, 3]), np.array([1, 2, 3]), np.array([1, 2, 3])),
(np.array([1, 2, 3]), np.array([1, 2, 3]), np.array([1, 2])),

