feat: make POI optional in config and allow POI customization for inf…

…erence (#348) * POI is now an optional configuration argument * added ability to set POI via poi_name keyword arguments in fit.ranking and fit.limit * fixed pytest deprecation warning in a test
scikit-hep · Jul 30, 2022 · 0c75aad · 0c75aad
1 parent a237589
commit 0c75aad
Show file tree

Hide file tree

Showing 10 changed files with 176 additions and 45 deletions.
diff --git a/setup.cfg b/setup.cfg
@@ -56,7 +56,7 @@ filterwarnings =
     ignore:no code associated:UserWarning:typeguard:
 
 [flake8]
-max-complexity = 16
+max-complexity = 18
 max-line-length = 88
 exclude = docs/conf.py
 count = True

diff --git a/src/cabinetry/fit/__init__.py b/src/cabinetry/fit/__init__.py
@@ -425,6 +425,7 @@ def ranking(
     data: List[float],
     *,
     fit_results: Optional[FitResults] = None,
+    poi_name: Optional[str] = None,
     init_pars: Optional[List[float]] = None,
     fix_pars: Optional[List[bool]] = None,
     par_bounds: Optional[List[Tuple[float, float]]] = None,
@@ -442,6 +443,8 @@ def ranking(
         data (List[float]): data (including auxdata) the model is fit to
         fit_results (Optional[FitResults], optional): nominal fit results to use for
             ranking, if not specified will repeat nominal fit, defaults to None
+        poi_name (Optional[str], optional): impact is calculated with respect to this
+            parameter, defaults to None (use POI specified in workspace)
         init_pars (Optional[List[float]], optional): list of initial parameter settings,
             defaults to None (use ``pyhf`` suggested inits)
         fix_pars (Optional[List[bool]], optional): list of booleans specifying which
@@ -451,6 +454,9 @@ def ranking(
         custom_fit (bool, optional): whether to use the ``pyhf.infer`` API or
             ``iminuit``, defaults to False (using ``pyhf.infer``)
 
+    Raises:
+        ValueError: if no POI is found
+
     Returns:
         RankingResults: fit results for parameters, and pre- and post-fit impacts
     """
@@ -466,7 +472,13 @@ def ranking(
 
     labels = model.config.par_names()
     prefit_unc = model_utils.prefit_uncertainties(model)
-    nominal_poi = fit_results.bestfit[model.config.poi_index]
+
+    # use POI given by kwarg, fall back to POI specified in model
+    poi_index = model_utils._poi_index(model, poi_name=poi_name)
+    if poi_index is None:
+        raise ValueError("no POI specified, cannot calculate ranking")
+
+    nominal_poi = fit_results.bestfit[poi_index]
 
     # need to get values for parameter settings, as they will be partially changed
     # during the ranking (init/fix changes)
@@ -476,9 +488,9 @@ def ranking(
 
     all_impacts = []
     for i_par, label in enumerate(labels):
-        if label == model.config.poi_name:
+        if i_par == poi_index:
             continue  # do not calculate impact of POI on itself
-        log.info(f"calculating impact of {label} on {labels[model.config.poi_index]}")
+        log.info(f"calculating impact of {label} on {labels[poi_index]}")
 
         # hold current parameter constant
         fix_pars_ranking = fix_pars.copy()
@@ -509,7 +521,7 @@ def ranking(
                     par_bounds=par_bounds,
                     custom_fit=custom_fit,
                 )
-                poi_val = fit_results_ranking.bestfit[model.config.poi_index]
+                poi_val = fit_results_ranking.bestfit[poi_index]
                 parameter_impact = poi_val - nominal_poi
                 log.debug(
                     f"POI is {poi_val:.6f}, difference to nominal is "
@@ -526,9 +538,9 @@ def ranking(
 
     # remove parameter of interest from bestfit / uncertainty / labels
     # such that their entries match the entries of the impacts
-    bestfit = np.delete(fit_results.bestfit, model.config.poi_index)
-    uncertainty = np.delete(fit_results.uncertainty, model.config.poi_index)
-    labels = np.delete(fit_results.labels, model.config.poi_index).tolist()
+    bestfit = np.delete(fit_results.bestfit, poi_index)
+    uncertainty = np.delete(fit_results.uncertainty, poi_index)
+    labels = np.delete(fit_results.labels, poi_index).tolist()
 
     ranking_results = RankingResults(
         bestfit, uncertainty, labels, prefit_up, prefit_down, postfit_up, postfit_down
@@ -582,7 +594,7 @@ def scan(
 
     # get index of parameter with name par_name
     par_index = model_utils._parameter_index(par_name, labels)
-    if par_index == -1:
+    if par_index is None:
         raise ValueError(f"parameter {par_name} not found in model")
 
     # run a fit with the parameter not held constant, to find the best-fit point
@@ -644,6 +656,7 @@ def limit(
     tolerance: float = 0.01,
     maxiter: int = 100,
     confidence_level: float = 0.95,
+    poi_name: Optional[str] = None,
     init_pars: Optional[List[float]] = None,
     fix_pars: Optional[List[bool]] = None,
     par_bounds: Optional[List[Tuple[float, float]]] = None,
@@ -652,7 +665,9 @@ def limit(
 
     Limits are calculated for the parameter of interest (POI) defined in the model.
     Brent's algorithm is used to automatically determine POI values to be tested. The
-    desired confidence level can be configured, and defaults to 95%.
+    desired confidence level can be configured, and defaults to 95%. In order to support
+    setting the POI directly without model recompilation, this temporarily changes the
+    POI index and name in the model configuration.
 
     Args:
         model (pyhf.pdf.Model): model to use in fits
@@ -668,6 +683,8 @@ def limit(
             100
         confidence_level (float, optional): confidence level for calculation, defaults
             to 0.95 (95%)
+        poi_name (Optional[str], optional): limit is calculated for this parameter,
+            defaults to None (use POI specified in workspace)
         init_pars (Optional[List[float]], optional): list of initial parameter settings,
             defaults to None (use ``pyhf`` suggested inits)
         fix_pars (Optional[List[bool]], optional): list of booleans specifying which
@@ -676,13 +693,27 @@ def limit(
             parameter bounds for fit, defaults to None (use ``pyhf`` suggested bounds)
 
     Raises:
+        ValueError: if no POI is found
         ValueError: if lower and upper bracket value are the same
+        ValueError: if starting brackets do not enclose the limit
 
     Returns:
         LimitResults: observed and expected limits, CLs values, and scanned points
     """
     pyhf.set_backend(pyhf.tensorlib, pyhf.optimize.minuit_optimizer(verbose=1))
 
+    # use POI given by kwarg, fall back to POI specified in model
+    poi_index = model_utils._poi_index(model, poi_name=poi_name)
+    if poi_index is None:
+        raise ValueError("no POI specified, cannot calculate limit")
+
+    # set POI index / name in model config to desired value, hypotest will pick this up
+    # save original values to reset model later
+    original_model_poi_index = model.config.poi_index
+    original_model_poi_name = model.config.poi_name
+    model.config.poi_index = poi_index
+    model.config.poi_name = model.config.par_names()[poi_index]
+
     # show two decimals only if confidence level in percent is not an integer
     cl_label = (
         f"{confidence_level:.{0 if (confidence_level * 100).is_integer() else 2}%}"
@@ -708,12 +739,15 @@ def limit(
     if bracket is None:
         bracket = (bracket_left_default, bracket_right_default)
     elif bracket[0] == bracket[1]:
+        # set POI index / name in model back to their original values
+        model.config.poi_index = original_model_poi_index
+        model.config.poi_name = original_model_poi_name
         raise ValueError(f"the two bracket values must not be the same: {bracket}")
 
     cache_CLs: Dict[float, tuple] = {}  # cache storing all relevant results
 
     def _cls_minus_threshold(
-        poi: float,
+        poi_val: float,
         model: pyhf.pdf.Model,
         data: List[float],
         cls_target: float,
@@ -726,7 +760,7 @@ def _cls_minus_threshold(
         cache to avoid re-fitting known POI values and to store all relevant values.
 
         Args:
-            poi (float): value for parameter of interest
+            poi_val (float): value for parameter of interest
             model (pyhf.pdf.Model): model to use in fits
             data (List[float]): data (including auxdata) the model is fit to
             cls_target (float): target CLs value to find by varying POI
@@ -738,19 +772,20 @@ def _cls_minus_threshold(
         Returns:
             float: absolute value of difference to CLs=``cls_target``
         """
-        if poi <= 0:
+        if poi_val <= 0:
             # no fit needed for negative POI value, return a default value
             log.debug(
-                f"skipping fit for {model.config.poi_name} = {poi:.4f}, setting CLs = 1"
+                f"skipping fit for {model.config.poi_name} = {poi_val:.4f}, setting "
+                "CLs = 1"
             )
             return 1 - cls_target  # distance of CLs = 1 to target CLs
-        cache = cache_CLs.get(poi)
+        cache = cache_CLs.get(poi_val)
         if cache:
             observed, expected = cache  # use result from cache
         else:
             # calculate CLs
             results = pyhf.infer.hypotest(
-                poi,
+                poi_val,
                 data,
                 model,
                 init_pars=init_pars,
@@ -761,10 +796,10 @@ def _cls_minus_threshold(
             )
             observed = float(results[0])  # 1 value per scan point
             expected = np.asarray(results[1])  # 5 per point (with 1 and 2 sigma bands)
-            cache_CLs.update({poi: (observed, expected)})
+            cache_CLs.update({poi_val: (observed, expected)})
         current_CLs = np.hstack((observed, expected))[which_limit]
         log.debug(
-            f"{model.config.poi_name} = {poi:.4f}, {limit_label} CLs = "
+            f"{model.config.poi_name} = {poi_val:.4f}, {limit_label} CLs = "
             f"{current_CLs:.4f}{' (cached)' if cache else ''}"
         )
         return current_CLs - cls_target
@@ -801,6 +836,9 @@ def _cls_minus_threshold(
                 f"CLs values at {bracket[0]:.4f} and {bracket[1]:.4f} do not bracket "
                 f"CLs={cls_target:.4f}, try a different starting bracket"
             )
+            # set POI index / name in model back to their original values
+            model.config.poi_index = original_model_poi_index
+            model.config.poi_name = original_model_poi_name
             raise
 
         if not res.converged:
@@ -842,6 +880,10 @@ def _cls_minus_threshold(
 
             bracket = (bracket_left, bracket_right)
 
+    # set POI index / name in model back to their original values
+    model.config.poi_index = original_model_poi_index
+    model.config.poi_name = original_model_poi_name
+
     # report all results
     log.info(f"total of {steps_total} steps to calculate all limits")
     if not all_converged:

diff --git a/src/cabinetry/model_utils.py b/src/cabinetry/model_utils.py
@@ -424,27 +424,63 @@ def unconstrained_parameter_count(model: pyhf.pdf.Model) -> int:
     return n_pars
 
 
-def _parameter_index(par_name: str, labels: Union[List[str], Tuple[str, ...]]) -> int:
+def _parameter_index(
+    par_name: str, labels: Union[List[str], Tuple[str, ...]]
+) -> Optional[int]:
     """Returns the position of a parameter with a given name in the list of parameters.
 
     Useful together with ``pyhf.pdf._ModelConfig.par_names`` to find the position of a
     parameter when the name is known. If the parameter is not found, logs an error and
-    returns a default value of -1.
+    returns a default value of None.
 
     Args:
         par_name (str): name of parameter to find in list
         labels (Union[List[str], Tuple[str, ...]]): list or tuple with all parameter
             names in the model
 
     Returns:
-        int: index of parameter
+        Optional[int]: index of parameter, or None if parameter was not found
     """
-    par_index = next((i for i, label in enumerate(labels) if label == par_name), -1)
-    if par_index == -1:
+    par_index = next((i for i, label in enumerate(labels) if label == par_name), None)
+    if par_index is None:
         log.error(f"parameter {par_name} not found in model")
     return par_index
 
 
+def _poi_index(
+    model: pyhf.pdf.Model, *, poi_name: Optional[str] = None
+) -> Optional[int]:
+    """Returns the index of the POI specified in the argument or the model default.
+
+    If a string is given as argument, this takes priority. Otherwise the POI from the
+    model is used. If no POI is specified there either, logs an error and returns a
+    default value of None.
+
+    Args:
+        model (pyhf.pdf.Model): model for which to find the POI index
+        poi_name (Optional[str], optional): name of the POI, defaults to None
+
+    Raises:
+        ValueError: if the specified POI name cannot be found in the model
+
+    Returns:
+        Optional[int]: POI index, or None if no POI could be found
+    """
+    if poi_name is not None:
+        # use POI given by kwarg if specified
+        poi_index = _parameter_index(poi_name, model.config.par_names())
+        if poi_index is None:
+            raise ValueError(f"parameter {poi_name} not found in model")
+    elif model.config.poi_index is not None:
+        # use POI specified in model
+        poi_index = model.config.poi_index
+    else:
+        log.error("could not find POI for model")
+        poi_index = None
+
+    return poi_index
+
+
 def _strip_auxdata(model: pyhf.pdf.Model, data: List[float]) -> List[float]:
     """Always returns observed yields, no matter whether data includes auxdata.
 

diff --git a/src/cabinetry/schemas/config.json b/src/cabinetry/schemas/config.json
@@ -62,14 +62,14 @@
             "$$target": "#/definitions/general",
             "description": "general settings",
             "type": "object",
-            "required": ["Measurement", "POI", "InputPath", "HistogramFolder"],
+            "required": ["Measurement", "InputPath", "HistogramFolder"],
             "properties": {
                 "Measurement": {
                     "description": "name of measurement",
                     "type": "string"
                 },
                 "POI": {
-                    "description": "name of parameter of interest",
+                    "description": "name of parameter of interest, defaults to empty string",
                     "type": "string"
                 },
                 "InputPath": {

diff --git a/src/cabinetry/workspace.py b/src/cabinetry/workspace.py
@@ -358,7 +358,8 @@ def measurements(self) -> List[Dict[str, Any]]:
 
         parameters = {"parameters": parameters_list}
         config_dict.update(parameters)
-        config_dict.update({"poi": self.config["General"]["POI"]})
+        # POI defaults to "" (interpreted as "no POI" by pyhf) if not specified
+        config_dict.update({"poi": self.config["General"].get("POI", "")})
         measurement.update({"config": config_dict})
         measurements.append(measurement)
         return measurements