Skip to content

Commit

Permalink
Extract additional stats information from TH1s
Browse files Browse the repository at this point in the history
This allows for the calculation of additional properties (which will be
added soon)
  • Loading branch information
raymondEhlers committed Aug 18, 2019
1 parent 9589bd9 commit aa4cb32
Showing 1 changed file with 55 additions and 16 deletions.
71 changes: 55 additions & 16 deletions pachyderm/histogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,16 @@ class Histogram1D:
Note:
Underflow and overflow bins are excluded!
When converting from a TH1 (either from ROOT or uproot), additional statistical information will be extracted
from the hist to enable the calculation of additional properties. The information available is:
- Total sum of weights (equal to np.sum(self.y), which we store)
- Total sum of weights squared (equal to np.sum(self.errors_squared), which we store)
- Total sum of weights * x
- Total sum of weights * x * x
Each is a single float value. Since the later two values are unique, they are stored in the metadata.
Args:
bin_edges (np.ndarray): The histogram bin edges.
y (np.ndarray): The histogram bin values.
Expand All @@ -169,14 +179,13 @@ def __post_init__(self) -> None:

# Ensure that they're numpy arrays.
for name, arr in arrays.items():
if not isinstance(arr, np.ndarray):
try:
setattr(self, name, np.array(arr))
except TypeError as e:
raise ValueError(
f"Arrays must be numpy arrays, but could not convert object {name} of"
f" type {type(arr)} to numpy array."
) from e
try:
setattr(self, name, np.array(arr))
except TypeError as e:
raise ValueError(
f"Arrays must be numpy arrays, but could not convert object {name} of"
f" type {type(arr)} to numpy array."
) from e

# Ensure that they're the appropriate length
if not (len(self.bin_edges) - 1 == len(self.y) == len(self.errors_squared)):
Expand All @@ -191,7 +200,7 @@ def __post_init__(self) -> None:
# operations in place).
for (a_name, a), (b_name, b) in itertools.combinations(arrays.items(), 2):
if np.may_share_memory(a, b):
logger.warning(f"Object {b_name} shares memory with object {a_name}. Copying object {b_name}!")
logger.warning(f"Object '{b_name}' shares memory with object '{a_name}'. Copying object '{b_name}'!")
setattr(self, b_name, b.copy())

@property
Expand Down Expand Up @@ -499,7 +508,7 @@ def __eq__(self, other: Any) -> bool:
return all(agreement) and metadata_agree

@staticmethod
def _from_uproot(hist: Any) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
def _from_uproot(hist: Any) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Dict[str, Any]]:
""" Convert a uproot histogram to a set of array for creating a Histogram.
Note:
Expand All @@ -519,10 +528,22 @@ def _from_uproot(hist: Any) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
# calculating bin errors from TH1::GetBinError()
errors = hist.variances

return (bin_edges, y, errors)
# Extract stats information. It will be stored in the metadata.
metadata = {}
# We extract the values directly from the members.
total_sum_w = hist._fTsumw
total_sum_w2 = hist._fTsumw2
total_sum_wx = hist._fTsumwx
total_sum_wx2 = hist._fTsumwx2
# Cross check.
assert np.isclose(total_sum_w, np.sum(y))
assert np.isclose(total_sum_w2, np.sum(errors))
metadata.update({"total_sum_wx": total_sum_wx, "total_sum_wx2": total_sum_wx2})

return (bin_edges, y, errors, metadata)

@staticmethod
def _from_th1(hist: Hist) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
def _from_th1(hist: Hist) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Dict[str, Any]]:
""" Convert a TH1 histogram to a Histogram.
Note:
Expand All @@ -545,6 +566,7 @@ def _from_th1(hist: Hist) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
errors = np.array(hist.GetSumw2())
# Exclude the under/overflow bins
errors = errors[1:-1]
metadata = {}

# Check for a TProfile.
# In that case we need to retrieve the errors manually because the Sumw2() errors are
Expand All @@ -558,7 +580,24 @@ def _from_th1(hist: Hist) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
if not np.isclose(errors[0], hist.GetBinError(1) ** 2):
raise ValueError("Sumw2 errors don't seem to represent bin errors!")

return (bin_edges, y, errors)
# Retrieve the stats and store them in the metadata.
# They are useful for calculating histogram properties (mean, variance, etc).
stats = np.array([0, 0, 0, 0], dtype = np.float64)
hist.GetStats(np.ctypeslib.as_ctypes(stats))
# Return values are (each one a single value):
# [1], [2], [3], [4]
# [1]: Sum of weights (equal to np.sum(y))
# [2]: Sum of weights squared (equal to np.sum(errors_squared))
# [3]: Sum of w*x
# [4}: Sum of w*x*x
total_sum_w, total_sum_w2, total_sum_wx, total_sum_wx2 = stats
# Cross checks.
assert np.isclose(np.sum(y), total_sum_w)
assert np.isclose(np.sum(errors), total_sum_w2)

metadata.update({"total_sum_wx": total_sum_wx, "total_sum_wx2": total_sum_wx2})

return (bin_edges, y, errors, metadata)

@classmethod
def from_existing_hist(cls: Type[_T], hist: Union[Hist, Any]) -> _T:
Expand All @@ -584,12 +623,12 @@ def from_existing_hist(cls: Type[_T], hist: Union[Hist, Any]) -> _T:
# "values" is a proxy for if we have an uproot hist.
logger.debug(f"{hist}, {type(hist)}")
if hasattr(hist, "values"):
(bin_edges, y, errors_squared) = cls._from_uproot(hist)
(bin_edges, y, errors_squared, metadata) = cls._from_uproot(hist)
else:
# Handle traditional ROOT hists
(bin_edges, y, errors_squared) = cls._from_th1(hist)
(bin_edges, y, errors_squared, metadata) = cls._from_th1(hist)

return cls(bin_edges = bin_edges, y = y, errors_squared = errors_squared)
return cls(bin_edges = bin_edges, y = y, errors_squared = errors_squared, metadata = metadata)

def get_array_from_hist2D(hist: Hist, set_zero_to_NaN: bool = True, return_bin_edges: bool = False) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
""" Extract x, y, and bin values from a 2D ROOT histogram.
Expand Down

0 comments on commit aa4cb32

Please sign in to comment.