In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# reloaded before each cell executes (edits to module code take effect without
# restarting the kernel).
%load_ext autoreload
%autoreload 2

In [2]:
import statsmodels.formula.api as smf
from statsmodels.iolib.smpickle import load_pickle
from scipy.stats import chi2, norm

from saturation.utils import *

In [3]:
# Number of local worker threads requested from Spark.
n_cores = 26

# Session settings, applied to the builder in the order listed.
spark_conf = {
    "spark.sql.shuffle.partitions": "500",
    "spark.driver.memory": "40g",
    "spark.driver.maxResultSize": "16g",
}

spark_builder = SparkSession.builder.master(f"local[{n_cores}]").appName("Saturation")
for conf_key, conf_value in spark_conf.items():
    spark_builder = spark_builder.config(conf_key, conf_value)

spark = spark_builder.getOrCreate()

25/03/02 11:33:29 WARN Utils: Your hostname, muninn resolves to a loopback address: 127.0.1.1; using 192.168.50.14 instead (on interface enp8s0)
25/03/02 11:33:29 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/02 11:33:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/03/02 11:33:30 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# Root directory of the simulation run read by this notebook.
base_path = "/data/saturation/thesis_run_20250223/"

# Presumably plot styling; none of these are used in the cells visible here.
dpi = 400
font_size = 24
line_styles = ["-"]
colors = ["blue", "black", "r", "orange", "g"]

# Simulation configurations, in the three forms returned by get_configs.
configs_pdf, configs_df, configs_dict = get_configs(spark=spark, base_path=base_path)

25/03/02 11:33:32 WARN CacheManager: Asked to cache already cached data.


# Configuration variables

In [5]:
# Upper bound on nstat when retrieving statistics
MAX_NSTAT_FOR_STATISTICS = 1_000_000

# States dataset: number of nstat sample points and the nstat range they cover
N_NSTATS = 1000
MIN_NSTAT = 2_000_000
MAX_NSTAT = 2_500_000

# Inclusive range of simulation IDs used for fitting
MIN_SIMULATION_ID = 1
MAX_SIMULATION_ID = 150

# Only simulations with slope <= MAX_SLOPE are used for fitting
MAX_SLOPE = -2.5

# Retrieve simulation metadata for simulations with slope <= -2.5

In [6]:
# Row masks: simulation ID inside the inclusive fitting range, and slope at or below MAX_SLOPE.
in_id_range = configs_pdf["simulation_id"].between(MIN_SIMULATION_ID, MAX_SIMULATION_ID)
is_steep = configs_pdf["slope"] <= MAX_SLOPE

steep_slope_simulation_ids = list(configs_pdf[in_id_range & is_steep]["simulation_id"])

# Independent copy of the configuration rows for the selected simulations.
is_selected = configs_pdf["simulation_id"].isin(steep_slope_simulation_ids)
steep_slope_configs_pdf = configs_pdf[is_selected].copy()

# Prepare and write out data for the pooled model

## Write out states

In [7]:
# N_NSTATS evenly spaced nstat values starting at MIN_NSTAT; the last value is
# MIN_NSTAT + (N_NSTATS - 1) * step, so MAX_NSTAT itself is not included.
nstat_span = MAX_NSTAT - MIN_NSTAT
step = int(nstat_span / N_NSTATS)

nstats = []
for sample_index in range(N_NSTATS):
    nstats.append(MIN_NSTAT + sample_index * step)

In [None]:
# Study-region geometry is read from the first selected simulation and reused
# for every simulation in the loop below.
first_sim_id = steep_slope_simulation_ids[0]
first_sim_config = configs_dict[first_sim_id]
study_region_size = first_sim_config["study_region_size"]
study_region_padding = first_sim_config["study_region_padding"]

for simulation_id in steep_slope_simulation_ids:
    simulation_path = f"{base_path}/{simulation_id}"
    stats_df = spark.read.parquet(f"{simulation_path}/statistics_*.parquet")
    craters_df = spark.read.parquet(f"{simulation_path}/craters_*.parquet")
    removals_df = spark.read.parquet(f"{simulation_path}/crater_removals_*.parquet")

    # Tag the rows with their simulation and index by it before writing, so
    # the per-simulation files can later be concatenated and sliced by ID.
    states = (
        get_states(
            stats_df=stats_df,
            craters_df=craters_df,
            removals_df=removals_df,
            nstats=nstats,
            study_region_size=study_region_size,
            study_region_padding=study_region_padding,
            spark=spark,
            result_columns=["crater_id", "radius", "nstat"],
        )
        .assign(simulation_id=simulation_id)
        .set_index("simulation_id")
        .sort_index()
    )
    states.to_parquet(f"data/states_{simulation_id}_{N_NSTATS}_{MAX_SLOPE:.2f}.parquet")



## Write out statistics

In [None]:
result_columns = [
    "radius",
    "lifespan",
    "simulation_id"
]
statistics = get_statistics_with_lifespans_for_simulations(
    simulation_ids=steep_slope_simulation_ids,
    base_path=base_path,
    configs_df=configs_df,
    spark=spark,
    result_columns=result_columns,
    max_nstat=MAX_NSTAT_FOR_STATISTICS,
)
statistics = statistics.set_index("simulation_id").sort_index()
statistics.to_parquet("data/statistics_{MAX_SLOPE:.2f}.parquet")

# Fit the pooled model

## Load the statistics data back from disk

In [None]:
statistics = pd.read_parquet("data/statistics_{MAX_SLOPE:.2f}.parquet")

## Fit the model, write out to disk

In [None]:
# Sample the data; 10% is nearly the max that can fit into memory
statistics_sample = statistics.sample(frac=0.1)

model_formula = (
    "lifespan ~ 1 "
    "+ slope:rim_erasure_exponent:np.log(radius) "
    "+ rim_erasure_exponent:np.log(radius) "
    "+ np.log(-slope)"
)

lifespan_model = smf.negativebinomial(
    data=statistics_sample,
    formula=model_formula
).fit(
    maxiter=1000,
    method="BFGS"
)
lifespan_model.summary()

In [None]:
lifespan_model.save("data/pooled_lifespan_model_{MAX_SLOPE:.2f}.pkl")

# Create the prediction dataset

## Reload the model from disk

In [None]:
lifespan_model = load_pickle("data/pooled_lifespan_model_{MAX_SLOPE:.2f}.pkl")

In [None]:
lifespan_model.summary()

## Reload statistics from disk

In [None]:
statistics = pd.read_parquet("data/statistics_{MAX_SLOPE:.2f}.parquet")

## Predict using Little's Law for all simulations

In [None]:
# Extract coefficients
slope_exponent_radius_coeff = lifespan_model.params["slope:rim_erasure_exponent:np.log(radius)"]
exponent_radius_coeff = lifespan_model.params["rim_erasure_exponent:np.log(radius)"]

In [None]:
# Construct Little's Law model predictions for each simulation
predictions = pd.DataFrame(steep_slope_simulation_ids, columns=["simulation_id"])
predictions["slope"] = [configs_dict[x]["slope"] for x in steep_slope_simulation_ids]
predictions["rim_erasure_exponent"] = [configs_dict[x]["rim_erasure_method"]["exponent"] for x in steep_slope_simulation_ids]
predictions["lifespan_model_exponent"] = (
    slope_exponent_radius_coeff * predictions["slope"] * predictions["rim_erasure_exponent"]
    + exponent_radius_coeff * predictions["rim_erasure_exponent"]
)
predictions["littles_law_model_slope_prediction"] = predictions.slope + predictions.lifespan_model_exponent
predictions.set_index("simulation_id", inplace=True)

## Estimate slopes for each simulation using MLE

### Load states data

In [None]:
# Read the per-simulation states files written earlier and stack them row-wise
# into a single frame.
states = pd.concat(
    [
        pd.read_parquet(f"data/states_{simulation_id}_{N_NSTATS}_{MAX_SLOPE:.2f}.parquet")
        for simulation_id in steep_slope_simulation_ids
    ],
    axis=0,
)

### Estimate MLE slope and sigma for each simulation

In [None]:
# For every selected simulation, estimate the cumulative slope from the radii
# of its rows in `states` and store the estimate and its sigma on `predictions`.
for simulation_id in steep_slope_simulation_ids:
    sim_config = configs_dict[simulation_id]
    slope_estimate = estimate_cumulative_slope(
        radii=states.loc[simulation_id].radius,
        rmin=sim_config["rstat"],
        rmax=sim_config["rmax"],
        min_search_slope=-10.0,
        max_search_slope=-1,
    )
    mle_slope, sigma = slope_estimate
    predictions.loc[simulation_id, "mle_slope"] = mle_slope
    predictions.loc[simulation_id, "mle_slope_sigma"] = sigma

In [None]:
# Display the predictions table (one row per simulation).
predictions

## Save off predictions

In [None]:
predictions.to_parquet("data/predictions_{MAX_SLOPE:.2f}.parquet")

# Perform hypothesis testing

## Reload predictions from disk

In [None]:
predictions = pd.read_parquet("data/predictions_{MAX_SLOPE:.2f}.parquet")

## Perform the TOST

In [None]:
def tost_equivalence_test(
    *,
    mle_slope: float,
    mle_slope_sigma: float,
    predicted_slope: float,
    margin: float
) -> float:
    """
    Perform a TOST (Two One-Sided Tests) equivalence check for a single simulation.

    Null hypothesis (H0): The true slope is outside ±margin of predicted_slope.
    Alternative (H1): The true slope is within ±margin of predicted_slope.

    The slope estimate is treated as normally distributed with standard
    deviation `mle_slope_sigma`. The returned value is the larger of the two
    one-sided p-values (p_equiv = max(p1, p2)), so equivalence is concluded
    only when both one-sided tests reject.

    Parameters:
    -----------
    mle_slope : float
        MLE estimate of the slope for this simulation.
    mle_slope_sigma : float
        Standard error of the MLE slope estimate.
    predicted_slope : float
        The predicted slope from the model (Little's Law).
    margin : float
        Equivalence margin (e.g., ±0.05).

    Returns:
    --------
    p_equiv : float
        A single p-value for the TOST equivalence test.
        Typically compared to alpha (e.g., 0.05).
        A smaller value indicates stronger evidence of equivalence.
    """
    # Two one-sided tests:
    #  1) slope > predicted_slope - margin
    #  2) slope < predicted_slope + margin
    lower_bound = predicted_slope - margin
    upper_bound = predicted_slope + margin

    # norm.sf is the upper-tail probability. It is used instead of
    # 1 - norm.cdf, which rounds to exactly 0.0 once z exceeds roughly 8 and
    # so discards the strongest evidence of equivalence.
    z1 = (mle_slope - lower_bound) / mle_slope_sigma
    p1 = norm.sf(z1)

    z2 = (upper_bound - mle_slope) / mle_slope_sigma
    p2 = norm.sf(z2)

    # The TOST p-value is the max of the two one-sided p-values.
    p_equiv = float(max(p1, p2))

    return p_equiv


def fishers_method(
    p_values: list[float]
) -> tuple[float, float]:
    """
    Combine a list of p-values using Fisher's method.

    The statistic is -2 * sum(log(p)), referred to a chi-square distribution
    with 2 degrees of freedom per p-value used.

    Handling of edge values:
      - NaN and values outside [0, 1] are ignored.
      - p == 1 is kept: it adds nothing to the statistic but still counts
        towards the degrees of freedom.
      - p == 0 (typically floating-point underflow of a very small p-value) is
        replaced by the smallest positive normal float so that it counts as
        very strong evidence instead of being dropped.

    Parameters:
    -----------
    p_values : list[float]
        The individual p-values to combine.

    Returns:
    --------
    chi2_stat : float
        The combined chi-square statistic.
    combined_pval : float
        p-value for the combined test.
        (0.0, 1.0) is returned when no usable p-value is supplied.
    """
    pvals = np.asarray(list(p_values), dtype=float)

    # Comparisons with NaN are False, so NaNs are filtered out here as well.
    usable_pvals = pvals[(pvals >= 0.0) & (pvals <= 1.0)]

    if usable_pvals.size == 0:
        # If no valid p-values, return defaults
        return 0.0, 1.0

    # Keep log() finite for exact zeros.
    clipped_pvals = np.clip(usable_pvals, np.finfo(float).tiny, 1.0)

    chi2_stat = float(-2.0 * np.sum(np.log(clipped_pvals)))
    df = 2 * int(usable_pvals.size)

    # Survival function keeps precision in the upper tail, where
    # 1 - chi2.cdf would round to 0.
    combined_pval = float(chi2.sf(chi2_stat, df))

    return chi2_stat, combined_pval


def run_equivalence_testing(
    *,
    df: pd.DataFrame,
    margin: float,
    alpha: float
) -> pd.DataFrame:
    """
    Main routine to run TOST per simulation, then apply Fisher's method
    to combine p-values for an overall conclusion.

    The DataFrame `df` must contain columns:
      - "littles_law_model_slope_prediction"
      - "mle_slope"
      - "mle_slope_sigma"

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame with required columns, indexed by simulation_id or similar.
        May be empty, in which case an empty result frame is returned.
    margin : float
        Equivalence margin (e.g., ±0.05).
    alpha : float
        Significance level for TOST (e.g., 0.05).

    Returns:
    --------
    results_df : pd.DataFrame
        Indexed by "simulation_id" (the index values of `df`), with columns:
          - "predicted_slope"
          - "predicted_lower_ci"
          - "predicted_upper_ci"
          - "mle_slope"
          - "mle_slope_sigma"
          - "mle_lower_ci"
          - "mle_upper_ci"
          - "p_tost"
          - "equivalent"
        The function prints a Fisher combined p-value for the entire set.
    """
    output_columns = [
        "simulation_id",
        "predicted_slope",
        "predicted_lower_ci",
        "predicted_upper_ci",
        "mle_slope",
        "mle_slope_sigma",
        "mle_lower_ci",
        "mle_upper_ci",
        "p_tost",
        "equivalent",
    ]

    # z-critical value for the (1 - 2*alpha)% CI
    # e.g., alpha=0.05 => 1 - 2*0.05=0.90 => z ~1.645
    z_crit = norm.ppf(1.0 - alpha)

    p_values = []
    records = []

    for idx, row in df.iterrows():
        predicted_slope = row["littles_law_model_slope_prediction"]
        est_slope = row["mle_slope"]
        est_sigma = row["mle_slope_sigma"]

        # 1) TOST p-value
        p_tost = tost_equivalence_test(
            mle_slope=est_slope,
            mle_slope_sigma=est_sigma,
            predicted_slope=predicted_slope,
            margin=margin
        )

        # 2) Equivalence pass/fail
        equivalent = (p_tost < alpha)

        # 3) Interval around the predicted slope: simply ± margin
        predicted_lower_ci = predicted_slope - margin
        predicted_upper_ci = predicted_slope + margin

        # 4) (1 - 2*alpha)% CI for MLE slope
        # e.g. for alpha=0.05 => 90% CI
        mle_lower_ci = est_slope - z_crit * est_sigma
        mle_upper_ci = est_slope + z_crit * est_sigma

        p_values.append(p_tost)
        records.append({
            "simulation_id": idx,
            "predicted_slope": predicted_slope,
            "predicted_lower_ci": predicted_lower_ci,
            "predicted_upper_ci": predicted_upper_ci,
            "mle_slope": est_slope,
            "mle_slope_sigma": est_sigma,
            "mle_lower_ci": mle_lower_ci,
            "mle_upper_ci": mle_upper_ci,
            "p_tost": p_tost,
            "equivalent": equivalent
        })

    # Passing the column list explicitly keeps the frame well-formed when `df`
    # has no rows; otherwise set_index would raise KeyError on an empty frame.
    results_df = pd.DataFrame(records, columns=output_columns).set_index("simulation_id")

    # Combine p-values via Fisher
    chi2_stat, combined_pval = fishers_method(p_values)

    # Derived from the number of p-values passed in; fishers_method may use
    # fewer if it discards any of them as invalid.
    dof = 2 * len(p_values)
    print(f"Fisher Combined chi-square = {chi2_stat:.4f} with dof={dof}")
    print(f"Fisher Combined p-value = {combined_pval:.6g}")
    print("Conclusion: If combined_p-value < alpha, the model's predicted slope is strongly supported across all simulations.")

    return results_df

In [None]:
# Columns produced by the equivalence test that are copied onto `predictions`.
result_cols = [
    "mle_lower_ci",
    "mle_upper_ci",
    "predicted_lower_ci",
    "predicted_upper_ci",
    "p_tost",
    "equivalent",
]

# Per-simulation TOST with a ±0.05 margin at the 5% significance level.
equivalence_test_results = run_equivalence_testing(df=predictions, margin=0.05, alpha=0.05)

predictions[result_cols] = equivalence_test_results[result_cols]

In [None]:
# Display the per-simulation equivalence test results.
equivalence_test_results

In [None]:
# Simulations for which equivalence was NOT concluded, ordered by slope and
# then by rim-erasure exponent.
not_equivalent = ~predictions["equivalent"]
predictions[not_equivalent].sort_values(by=["slope", "rim_erasure_exponent"])