Creating Bayesian power analysis method #292
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -29,7 +29,11 @@ | |
FormulaException, | ||
) | ||
from causalpy.plot_utils import plot_xY | ||
from causalpy.utils import _is_variable_dummy_coded, round_num | ||
from causalpy.utils import ( | ||
_is_variable_dummy_coded, | ||
round_num, | ||
compute_bayesian_tail_probability, | ||
) | ||
|
||
LEGEND_FONT_SIZE = 12 | ||
az.style.use("arviz-darkgrid") | ||
|
@@ -334,18 +338,217 @@ | |
|
||
return fig, ax | ||
|
||
def summary(self, round_to=None) -> None: | ||
def summary( | ||
self, round_to=None, version: str = "coefficients", **kwargs | ||
) -> Union[None, pd.DataFrame]: | ||
""" | ||
Print text output summarising the results | ||
|
||
:param round_to: | ||
Number of decimals used to round results. Defaults to 2. Use "None" to return raw numbers. | ||
:param version: | ||
Either "coefficients" (default), which prints the coefficient summary, or "intervention", which returns the intervention summary as a DataFrame. | ||
""" | ||
|
||
print(f"{self.expt_type:=^80}") | ||
print(f"Formula: {self.formula}") | ||
# TODO: extra experiment specific outputs here | ||
self.print_coefficients(round_to) | ||
if version == "coefficients": | ||
print(f"{self.expt_type:=^80}") | ||
print(f"Formula: {self.formula}") | ||
# TODO: extra experiment specific outputs here | ||
self.print_coefficients(round_to) | ||
elif version == "intervention": | ||
return self._summary_intervention(**kwargs) | ||
|
||
|
||
def _summary_intervention(self, alpha: float = 0.05, **kwargs) -> pd.DataFrame: | ||
""" | ||
Calculate and summarize the intervention analysis results in a DataFrame format. | ||
|
||
This function performs cumulative and mean calculations on the posterior predictive distributions, | ||
Reviewer comment: Again the doc-string is overloaded and unclear. Not clear why the Pre-Post design is relevant. Is this a summary of the intervention, or just the intervention with respect to the power analysis we're running? But now we also have references to Bayesian tail probabilities, posterior estimations, and causal effects. Too much going on. If we're using particular Bayesian tail probabilities because we have assumed a particular likelihood distribution, we should clarify this here. |
||
computes Bayesian tail probabilities, posterior estimations, causal effects, and confidence intervals. | ||
It optionally applies corrections to the cumulative and mean calculations. | ||
|
||
Parameters | ||
---------- | ||
- alpha (float, optional): The significance level for confidence interval calculations. Default is 0.05. | ||
- kwargs (Dict[str, Any], optional): Additional keyword arguments. | ||
- "correction" (bool or Dict[str, float]): If True, applies predefined corrections to cumulative and mean results. | ||
If a dictionary, the corrections for 'cumulative' and 'mean' should be provided. Default is False. | ||
|
||
Returns | ||
------- | ||
- pd.DataFrame: A DataFrame where each row represents different statistical measures such as | ||
Bayesian tail probability, posterior estimation, causal effect, and confidence intervals for cumulative and mean results. | ||
""" | ||
correction = kwargs.get("correction", False) | ||
|
||
results = {} | ||
ci = (alpha * 100) / 2 | ||
|
||
|
||
# Cumulative calculations | ||
cumulative_results = self.post_y.sum() | ||
_mu_samples_cumulative = ( | ||
self.post_pred["posterior_predictive"] | ||
.mu.stack(sample=("chain", "draw")) | ||
.sum("obs_ind") | ||
) | ||
|
||
# Mean calculations | ||
mean_results = self.post_y.mean() | ||
_mu_samples_mean = ( | ||
self.post_pred["posterior_predictive"] | ||
.mu.stack(sample=("chain", "draw")) | ||
.mean("obs_ind") | ||
) | ||
|
||
if not isinstance(correction, bool): | ||
|
||
_mu_samples_cumulative += correction["cumulative"] | ||
_mu_samples_mean += correction["mean"] | ||
|
||
# Bayesian Tail Probability | ||
results["bayesian_tail_probability"] = { | ||
"cumulative": compute_bayesian_tail_probability( | ||
posterior=_mu_samples_cumulative, x=cumulative_results | ||
), | ||
"mean": compute_bayesian_tail_probability( | ||
posterior=_mu_samples_mean, x=mean_results | ||
), | ||
} | ||
|
||
# Posterior Mean | ||
results["posterior_estimation"] = { | ||
"cumulative": np.mean(_mu_samples_cumulative.values), | ||
"mean": np.mean(_mu_samples_mean.values), | ||
} | ||
|
||
results["results"] = { | ||
"cumulative": cumulative_results, | ||
"mean": mean_results, | ||
} | ||
|
||
# Causal Effect | ||
results["causal_effect"] = { | ||
"cumulative": cumulative_results | ||
- results["posterior_estimation"]["cumulative"], | ||
"mean": mean_results - results["posterior_estimation"]["mean"], | ||
} | ||
|
||
# Confidence Intervals | ||
results["ci"] = { | ||
"cumulative": [ | ||
np.percentile(_mu_samples_cumulative, ci), | ||
np.percentile(_mu_samples_cumulative, 100 - ci), | ||
], | ||
"mean": [ | ||
np.percentile(_mu_samples_mean, ci), | ||
np.percentile(_mu_samples_mean, 100 - ci), | ||
], | ||
} | ||
|
||
# Convert to DataFrame | ||
results_df = pd.DataFrame(results) | ||
|
||
return results_df | ||
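Stripped of the class context, the arithmetic in this method can be sketched in isolation. This is a sketch only: the draws below are synthetic stand-ins for `self.post_pred["posterior_predictive"].mu`, and the variable names are illustrative rather than CausalPy's API:

```python
import numpy as np

rng = np.random.default_rng(0)
# Synthetic stand-in for the stacked posterior predictive draws of mu:
# 2000 draws over 50 observations ("obs_ind").
mu_draws = rng.normal(loc=10.0, scale=1.0, size=(2000, 50))

# Per-draw reductions over obs_ind, mirroring .sum("obs_ind") / .mean("obs_ind")
mu_samples_cumulative = mu_draws.sum(axis=1)
mu_samples_mean = mu_draws.mean(axis=1)

# Optional additive correction, as passed via the `correction` kwarg
correction = {"cumulative": 5.0, "mean": 0.1}
mu_samples_cumulative = mu_samples_cumulative + correction["cumulative"]
mu_samples_mean = mu_samples_mean + correction["mean"]

# Percentile interval: alpha = 0.05 gives the central 95% interval
alpha = 0.05
ci = (alpha * 100) / 2  # 2.5
interval_mean = [
    np.percentile(mu_samples_mean, ci),
    np.percentile(mu_samples_mean, 100 - ci),
]
```

The same percentile logic is applied to `mu_samples_cumulative` in the method above to fill the "cumulative" entry of `results["ci"]`.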
|
||
def power_summary( | ||
|
||
self, alpha: float = 0.05, correction: bool = False | ||
) -> pd.DataFrame: | ||
""" | ||
Summarize the power estimation results in a DataFrame format. | ||
|
||
This function performs a power estimation and then | ||
converts the resulting dictionary into a pandas DataFrame for easier analysis and visualization. | ||
|
||
Parameters | ||
---------- | ||
- alpha (float, optional): The significance level for confidence interval calculations used in power estimation. Default is 0.05. | ||
- correction (bool, optional): Indicates whether to apply corrections in the power estimation process. Default is False. | ||
|
||
Returns | ||
------- | ||
- pd.DataFrame: A DataFrame representing the power estimation results, including posterior estimations, | ||
Reviewer comment: Again too vague, since we're not clarifying beforehand what the goal of Bayesian power analysis is in this Pre-Post design context.
Reviewer comment: Here, I believe the description of the power can be a reference to the document, but not added directly. What do you think? |
||
systematic differences, confidence intervals, and posterior MDE for cumulative and mean results. | ||
""" | ||
return pd.DataFrame(self._power_estimation(alpha=alpha, correction=correction)) | ||
|
||
def plot_power(self, alpha: float = 0.05, correction: bool = False) -> plt.Figure: | ||
""" | ||
Generate and return a figure containing plots that visualize power estimation results. | ||
|
||
This function creates a two-panel plot (for mean and cumulative measures) to visualize the posterior distributions | ||
along with the confidence intervals, real mean, and posterior mean values. It allows for adjustments based on | ||
systematic differences if the correction is applied. | ||
|
||
Parameters | ||
---------- | ||
- alpha (float, optional): The significance level for confidence interval calculations used in power estimation. Default is 0.05. | ||
- correction (bool, optional): Indicates whether to apply corrections for systematic differences in the plotting process. Default is False. | ||
|
||
Returns | ||
------- | ||
- plt.Figure: A matplotlib figure object containing the plots. | ||
""" | ||
_estimates = self._power_estimation(alpha=alpha, correction=correction) | ||
|
||
fig, axs = plt.subplots(1, 2, figsize=(20, 6)) # Two subplots side by side | ||
|
||
# Adjustments for Mean and Cumulative plots | ||
for i, key in enumerate(["mean", "cumulative"]): | ||
_mu_samples = self.post_pred["posterior_predictive"].mu.stack( | ||
sample=("chain", "draw") | ||
) | ||
if key == "mean": | ||
_mu_samples = _mu_samples.mean("obs_ind") | ||
elif key == "cumulative": | ||
_mu_samples = _mu_samples.sum("obs_ind") | ||
|
||
if correction: | ||
_mu_samples += _estimates["_systematic_differences"][key] | ||
|
||
# Histogram and KDE | ||
sns.histplot( | ||
_mu_samples, | ||
bins=30, | ||
kde=True, | ||
ax=axs[i], | ||
color="C0", | ||
stat="density", | ||
alpha=0.6, | ||
) | ||
kde_x, kde_y = ( | ||
sns.kdeplot(_mu_samples, color="C1", fill=True, ax=axs[i]) | ||
.get_lines()[0] | ||
.get_data() | ||
) | ||
|
||
# Adjust y-limits based on max density | ||
max_density = max(kde_y) | ||
axs[i].set_ylim(0, max_density + 0.05 * max_density) # Adding 5% buffer | ||
|
||
# Fill between for the percentile interval | ||
axs[i].fill_betweenx( | ||
y=np.linspace(0, max_density + 0.05 * max_density, 100), | ||
x1=_estimates["ci"][key][0], | ||
x2=_estimates["ci"][key][1], | ||
color="C0", | ||
alpha=0.3, | ||
label="C.I", | ||
) | ||
|
||
# Vertical lines for the means | ||
axs[i].axvline( | ||
_estimates["results"][key], color="C3", linestyle="-", label="Real Mean" | ||
) | ||
if not correction: | ||
axs[i].axvline( | ||
_estimates["posterior_estimation"][key], | ||
color="C4", | ||
linestyle="--", | ||
label="Posterior Mean", | ||
) | ||
|
||
axs[i].set_title(f"Posterior of mu ({key.capitalize()})") | ||
axs[i].set_xlabel("mu") | ||
axs[i].set_ylabel("Density") | ||
axs[i].legend() | ||
|
||
return fig, axs | ||
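A minimal stand-alone version of this two-panel figure, using plain matplotlib histograms in place of seaborn and synthetic draws in place of the model's posterior predictive (all names here are illustrative assumptions, not CausalPy's API):

```python
import matplotlib
matplotlib.use("Agg")  # non-interactive backend so the sketch runs headless
import matplotlib.pyplot as plt
import numpy as np

rng = np.random.default_rng(0)
samples = {
    "mean": rng.normal(10.0, 1.0, 4000),         # stand-in for per-draw means
    "cumulative": rng.normal(500.0, 20.0, 4000)  # stand-in for per-draw sums
}
observed = {"mean": 10.4, "cumulative": 540.0}   # stand-in for post_y statistics
alpha = 0.05

fig, axs = plt.subplots(1, 2, figsize=(20, 6))  # two subplots side by side
for i, key in enumerate(["mean", "cumulative"]):
    mu = samples[key]
    lo, hi = np.percentile(mu, [100 * alpha / 2, 100 * (1 - alpha / 2)])
    axs[i].hist(mu, bins=30, density=True, color="C0", alpha=0.6)
    ymax = axs[i].get_ylim()[1]
    # Shade the central 95% percentile interval
    axs[i].fill_betweenx(
        np.linspace(0, ymax, 100), lo, hi, color="C0", alpha=0.3, label="C.I"
    )
    axs[i].axvline(observed[key], color="C3", linestyle="-", label="Real Mean")
    axs[i].axvline(mu.mean(), color="C4", linestyle="--", label="Posterior Mean")
    axs[i].set_title(f"Posterior of mu ({key.capitalize()})")
    axs[i].set_xlabel("mu")
    axs[i].set_ylabel("Density")
    axs[i].legend()
```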
|
||
|
||
class InterruptedTimeSeries(PrePostFit): | ||
|
@@ -2,9 +2,15 @@ | |
Tests for utility functions | ||
""" | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
||
from causalpy.utils import _is_variable_dummy_coded, _series_has_2_levels, round_num | ||
from causalpy.utils import ( | ||
_is_variable_dummy_coded, | ||
_series_has_2_levels, | ||
compute_bayesian_tail_probability, | ||
round_num, | ||
) | ||
|
||
|
||
def test_dummy_coding(): | ||
|
@@ -44,3 +50,45 @@ def test_round_num(): | |
assert round_num(123.456, 5) == "123.46" | ||
assert round_num(123.456, 6) == "123.456" | ||
assert round_num(123.456, 7) == "123.456" | ||
|
||
|
||
def test_compute_bayesian_tail_probability(): | ||
Reviewer comment: We've got multiple different scenarios here, each of which should be a different test. But for each test, we can have multiple asserts, like the output is a scalar float, between 0 and 1, etc. |
||
""" | ||
Test compute_bayesian_tail_probability across several posterior/x scenarios. | ||
""" | ||
# Test 1: Posterior is a standard normal distribution, x = mean = 0 | ||
posterior_standard_normal = np.random.normal(0, 1, 10000) | ||
x_at_mean = 0 | ||
prob_at_mean = compute_bayesian_tail_probability( | ||
posterior_standard_normal, x_at_mean | ||
) | ||
assert np.isclose(prob_at_mean, 1, atol=0.05), f"Expected 1, got {prob_at_mean}" | ||
|
||
# Test 2: Posterior is a standard normal distribution, x = 1 | ||
x_one_std_above = 1 | ||
prob_one_std_above = compute_bayesian_tail_probability( | ||
posterior_standard_normal, x_one_std_above | ||
) | ||
assert ( | ||
0 < prob_one_std_above < 1 | ||
), "Probability should decrease from 1 as x moves away from mean" | ||
|
||
# Test 3: Posterior is a standard normal distribution, x well outside the distribution | ||
x_far_out = 5 | ||
prob_far_out = compute_bayesian_tail_probability( | ||
posterior_standard_normal, x_far_out | ||
) | ||
# Expect a very low probability for a value far outside the distribution | ||
assert prob_far_out < 0.01, f"Expected a value < 0.01, got {prob_far_out}" | ||
|
||
# Test 4: Posterior is a normal distribution with mean=5, std=2, x = mean | ||
posterior_shifted = np.random.normal(5, 2, 10000) | ||
x_at_shifted_mean = 5 | ||
prob_at_shifted_mean = compute_bayesian_tail_probability( | ||
posterior_shifted, x_at_shifted_mean | ||
) | ||
# Expect the probability at the mean of a shifted distribution to be close to 1 | ||
assert np.isclose( | ||
prob_at_shifted_mean, 1, atol=0.05 | ||
), f"Expected 1, got {prob_at_shifted_mean}" |
@@ -3,6 +3,7 @@ | |
""" | ||
import numpy as np | ||
import pandas as pd | ||
from scipy.stats import norm | ||
|
||
|
||
def _is_variable_dummy_coded(series: pd.Series) -> bool: | ||
|
@@ -48,3 +49,29 @@ def _format_sig_figs(value, default=None): | |
if value == 0: | ||
return 1 | ||
return max(int(np.log10(np.abs(value))) + 1, default) | ||
|
||
|
||
def compute_bayesian_tail_probability(posterior, x) -> float: | ||
Reviewer comment: Can you add type hints for the function inputs? |
||
""" | ||
Calculate the probability of a given value being in a distribution defined by the posterior. | ||
|
||
Args: | ||
- posterior: a list or array-like object containing the samples that define the distribution | ||
- x: a numeric value for which to calculate the probability of being in the distribution | ||
|
||
Returns: | ||
- prob: a numeric value representing the probability of x being in the distribution | ||
""" | ||
lower_bound, upper_bound = min(posterior), max(posterior) | ||
mean, std = np.mean(posterior), np.std(posterior) | ||
|
||
cdf_lower = norm.cdf(lower_bound, mean, std) | ||
cdf_upper = 1 - norm.cdf(upper_bound, mean, std) | ||
cdf_x = norm.cdf(x, mean, std) | ||
|
||
if cdf_x <= 0.5: | ||
probability = 2 * (cdf_x - cdf_lower) / (1 - cdf_lower - cdf_upper) | ||
else: | ||
probability = 2 * (1 - cdf_x + cdf_lower) / (1 - cdf_lower - cdf_upper) | ||
|
||
return abs(probability) | ||
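Per the review comment asking for input type hints, one possible annotated version. This is a sketch: `npt.ArrayLike` is just one reasonable choice for the samples argument, and the body is otherwise the same as above:

```python
import numpy as np
import numpy.typing as npt
from scipy.stats import norm


def compute_bayesian_tail_probability(posterior: npt.ArrayLike, x: float) -> float:
    """Type-annotated sketch of the function above (body unchanged)."""
    posterior = np.asarray(posterior)
    lower_bound, upper_bound = posterior.min(), posterior.max()
    mean, std = np.mean(posterior), np.std(posterior)

    cdf_lower = norm.cdf(lower_bound, mean, std)
    cdf_upper = 1 - norm.cdf(upper_bound, mean, std)
    cdf_x = norm.cdf(x, mean, std)

    if cdf_x <= 0.5:
        probability = 2 * (cdf_x - cdf_lower) / (1 - cdf_lower - cdf_upper)
    else:
        probability = 2 * (1 - cdf_x + cdf_lower) / (1 - cdf_lower - cdf_upper)
    # Cast to a plain float so the return annotation holds exactly
    return float(abs(probability))
```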
Reviewer comment: I'm getting some type hint issues with |
Reviewer comment: So I'm not 100% sure about this. At the moment we have a `summary` method for all classes which provides an estimate of the model coefficients. The approach used here is to also call the `summary` method but change what it does completely by changing a kwarg. I'm thinking that it might be cleaner to completely separate this and just have a new method called `power_analysis_summary`, or `intervention_summary`, whatever.