In [3]:
import sys

import pandas as pd
import numpy as np
from pathlib import Path
import yaml
from typing import *

import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import matplotlib.pyplot as plt

import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, DataFrame, Window

In [5]:
n_cores = 30

spark = (SparkSession.builder
         .master(f"local[{n_cores}]")
         .appName("Saturation")
         .config("spark.sql.shuffle.partitions", "1000")
         .config("spark.driver.memory", "64g")
         .config("spark.driver.maxResultSize", "8g")
         .getOrCreate())

23/10/31 19:12:14 WARN Utils: Your hostname, muninn resolves to a loopback address: 127.0.1.1; using 192.168.86.20 instead (on interface enp8s0)
23/10/31 19:12:14 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/31 19:12:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
def read_config(path: Path) -> Dict:
    with path.open("r") as config_file:
        config = yaml.safe_load(config_file)
    return config


def read_configs(base_path: str, spark_session: SparkSession) -> pyspark.RDD:
    completed_filenames = list(Path(base_path).glob("*/completed.txt"))
    configs = map(lambda x: x.parent / "config.yaml", completed_filenames)
    configs = map(read_config, configs)
    return spark_session.sparkContext.parallelize(configs)


def create_configs_df(configs: pyspark.RDD) -> DataFrame:
    config_columns = [
        "simulation_id",
        "slope",
        "r_stat_multiplier",
        "effective_radius_multiplier",
        "min_rim_percentage"
    ]
    return configs.map(lambda x: {k: v for k, v in x.items() if k in config_columns}).toDF().cache()
    

def quantize_value(value: float, delta: float) -> float:
    return np.round(int(np.round(value / delta, decimals=5)) * delta, decimals=5)


def get_min_max_n_buckets(column: str, delta: float, df) -> Tuple[float, float, int]:
    min_max_df = df.select(F.min(column), F.max(column)).toPandas()
    
    min_val = quantize_value(min_max_df.iloc[0, 0], delta)
    max_val = quantize_value(min_max_df.iloc[0, 1], delta)
    n_buckets = int(np.round((max_val - min_val) / delta, decimals=5)) + 1

    return min_val, max_val, n_buckets

In [7]:
base_path = "/data/saturation/n_craters_stop_condition_20230918"

In [8]:
r_stat = 5

configs_df = create_configs_df(read_configs(base_path, spark))
data = spark.read.parquet(f"{base_path}/*/statistics_*.parquet")

# State c2c nn dist in terms of r_stat and log-scale
data = data.select("*", F.log10(F.col("center_to_center_nearest_neighbor_distance_mean") / F.lit(r_stat)).alias("log_mean_c2c_nn_dist"))

                                                                                

In [9]:
def get_confidence_interval(data: DataFrame,
                            configs: DataFrame,
                            target: str,
                            observation: Dict[str, float],
                            bandwidths: Dict[str, float],
                            simulation_id: Optional[int] = None) -> pd.DataFrame:
    data.createOrReplaceTempView("data")
    configs.createOrReplaceTempView("configs")
    
    simulation_id_clause = f"AND data.simulation_id <> {simulation_id}\n" if simulation_id else ""
    bandwidth_clauses = ""
    for var, bandwidth in bandwidths.items():
        observation_value = observation[var]
        bandwidth_clauses += f"AND {var} BETWEEN {observation_value - bandwidth} AND {observation_value + bandwidth}\n"
    
    result = spark.sql(f"""
    SELECT
        mean({target}) as target_mean,
        count({target}) as n_obs,
        count(distinct data.simulation_id) as n_unique_sims,
        approx_percentile({target}, array(0.025, 0.05, 0.10, 0.5, 0.90, 0.95, 0.975), 10000) as percentiles
    FROM
        data
        INNER JOIN configs
            ON data.simulation_id = configs.simulation_id
    WHERE
        1=1
        {simulation_id_clause}
        {bandwidth_clauses}
    """).toPandas()

    return result

In [11]:
def get_confidence_intervals(data: DataFrame,
                             configs: DataFrame,
                             target: str,
                             observations: pd.DataFrame,
                             bandwidths: pd.DataFrame,
                             spark_session: SparkSession,
                             simulation_id: Optional[int] = None) -> pd.DataFrame:
    data.createOrReplaceTempView("data")
    configs.createOrReplaceTempView("configs")

    observations_df = spark_session.createDataFrame(observations).cache()
    observations_df.createOrReplaceTempView("observations")
    bandwidths_df = spark_session.createDataFrame(bandwidths).cache()
    bandwidths_df.createOrReplaceTempView("bandwidths")

    # Trick it into caching and broadcasting
    observations_df.count()
    bandwidths_df.count()
    
    simulation_id_clause = f"AND d.simulation_id <> {simulation_id}\n" if simulation_id else ""
    group_by_clause = ",\n".join([f"o.{x},\nb.{x}" for x in observations.columns])
    
    bandwidth_clauses = ""
    observations_select_clause = ""
    bandwidths_select_clause = ""
    for var in observations.columns:
        bandwidth_clauses += f"AND d.{var} BETWEEN (o.{var} - b.{var}) AND (o.{var} + b.{var})\n"
        observations_select_clause += f"o.{var} as observation_{var},\n"
        bandwidths_select_clause += f"b.{var} as bandwith_{var},\n"

    query = f"""
    SELECT
        {observations_select_clause}
        {bandwidths_select_clause}
        mean({target}) as target_mean,
        count({target}) as n_obs,
        count(distinct d.simulation_id) as n_unique_sims,
        approx_percentile({target}, array(0.025, 0.10, 0.90, 0.975), 1000) as percentiles
    FROM
        (
            SELECT
                configs.slope,
                configs.r_stat_multiplier,
                configs.effective_radius_multiplier,
                configs.min_rim_percentage,
                data.*
            FROM
                data
                INNER JOIN configs
                    ON data.simulation_id = configs.simulation_id
        ) AS d
        INNER JOIN observations o
        INNER JOIN bandwidths b
    WHERE
        1=1
        {simulation_id_clause}
        {bandwidth_clauses}
    GROUP BY
        {group_by_clause}
    """
    result = spark.sql(query)

    return result.toPandas()

In [None]:
# DSP
target = "n_craters_added_in_study_region"
target_slope = 2.10
slope_ci_width = 0.08
target_log_mean_c2c_nn_dist = 0.8136322010294104

observations_data = [
    [target_slope - slope_ci_width, target_log_mean_c2c_nn_dist],
    [target_slope, target_log_mean_c2c_nn_dist],
    [target_slope + slope_ci_width, target_log_mean_c2c_nn_dist],
]
observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

bandwidths_data = [
    [0.01, 0.001],
    [0.01, 0.005],
    [0.005, 0.001],
    [0.005, 0.005],
]
bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
result

In [None]:
# DSP
target = "n_craters_added_in_study_region"
target_slope = 2.10
slope_ci_width = 0.08
target_log_mean_c2c_nn_dist = 0.8136322010294104

observations_data = [
    [target_slope - slope_ci_width, target_log_mean_c2c_nn_dist],
    [target_slope, target_log_mean_c2c_nn_dist],
    [target_slope + slope_ci_width, target_log_mean_c2c_nn_dist],
]
observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

bandwidths_data = [
    [0.01, 0.0005],
    [0.01, 0.001],
    [0.01, 0.005],
    [0.025, 0.0005],
    [0.025, 0.001],
    [0.025, 0.005],
    [0.05, 0.0005],
    [0.05, 0.001],
    [0.05, 0.005],
    [0.1, 0.0001],
    [0.1, 0.00025],
    [0.1, 0.0005],
    [0.1, 0.001],
]
bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
result.sort_values([f"observation_{x}" for x in observations.columns])

In [None]:
# DEE

target = "n_craters_added_in_study_region"
target_slope = 2.10
slope_ci_width = 0.08
target_log_mean_c2c_nn_dist = 0.8136322010294104

observations_data = [
    [target_slope - slope_ci_width, target_log_mean_c2c_nn_dist],
    [target_slope, target_log_mean_c2c_nn_dist],
    [target_slope + slope_ci_width, target_log_mean_c2c_nn_dist],
]
observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

bandwidths_data = [
    [0.01, 0.0005],
    [0.01, 0.001],
    [0.01, 0.005],
    [0.025, 0.0005],
    [0.025, 0.001],
    [0.025, 0.005],
    [0.05, 0.0005],
    [0.05, 0.001],
    [0.05, 0.005],
    [0.1, 0.0001],
    [0.1, 0.00025],
    [0.1, 0.0005],
    [0.1, 0.001],
]
bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
result.sort_values([f"observation_{x}" for x in observations.columns])

In [None]:
# Dione 3 (DICP)
target = "n_craters_added_in_study_region"
target_slope = 2.08
slope_ci_width = 0.09
target_log_mean_c2c_nn_dist = 0.9338405903389017

observations_data = [
    [target_slope - slope_ci_width, target_log_mean_c2c_nn_dist],
    [target_slope, target_log_mean_c2c_nn_dist],
    [target_slope + slope_ci_width, target_log_mean_c2c_nn_dist],
]
observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

bandwidths_data = [
    [0.01, 0.0005],
    [0.01, 0.001],
    [0.01, 0.005],
    [0.025, 0.0005],
    [0.025, 0.001],
    [0.025, 0.005],
    [0.05, 0.0005],
    [0.05, 0.001],
    [0.05, 0.005],
    [0.1, 0.0001],
    [0.1, 0.00025],
    [0.1, 0.0005],
    [0.1, 0.001],
]
bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
result.sort_values([f"observation_{x}" for x in observations.columns])

In [None]:
# Dione 4 (DDCP)
target = "n_craters_added_in_study_region"
target_slope = 2.15
slope_ci_width = 0.05
target_log_mean_c2c_nn_dist = 0.5480628421533491

observations_data = [
    [target_slope - slope_ci_width, target_log_mean_c2c_nn_dist],
    [target_slope, target_log_mean_c2c_nn_dist],
    [target_slope + slope_ci_width, target_log_mean_c2c_nn_dist],
]
observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

bandwidths_data = [
    [0.01, 0.0005],
    [0.01, 0.001],
    [0.01, 0.005],
    [0.025, 0.0005],
    [0.025, 0.001],
    [0.025, 0.005],
    [0.05, 0.0005],
    [0.05, 0.001],
    [0.05, 0.005],
    [0.1, 0.0001],
    [0.1, 0.00025],
    [0.1, 0.0005],
    [0.1, 0.001],
]
bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
result.sort_values([f"observation_{x}" for x in observations.columns])

In [11]:
# The Grid
target = "n_craters_added_in_study_region"

log_mean_c2c_nn_dist_delta = 0.05
slope_delta = 0.05

min_slope = configs_df.select(F.min("slope")).toPandas().iloc[0,0]
max_slope = configs_df.select(F.max("slope")).toPandas().iloc[0,0]

min_log_mean_c2c_nn_dist = data.select(F.min("log_mean_c2c_nn_dist")).toPandas().iloc[0,0]
max_log_mean_c2c_nn_dist = data.select(F.max("log_mean_c2c_nn_dist")).toPandas().iloc[0,0]

min_log_mean_c2c_nn_dist = quantize_value(min_log_mean_c2c_nn_dist + log_mean_c2c_nn_dist_delta * 2, log_mean_c2c_nn_dist_delta)
max_log_mean_c2c_nn_dist = quantize_value(max_log_mean_c2c_nn_dist - log_mean_c2c_nn_dist_delta * 2, log_mean_c2c_nn_dist_delta)

log_c2c_nn_dist_choices = [round(min_log_mean_c2c_nn_dist + x * log_mean_c2c_nn_dist_delta, 4) for x in range(int((max_log_mean_c2c_nn_dist - min_log_mean_c2c_nn_dist) / log_mean_c2c_nn_dist_delta))]

min_slope = quantize_value(min_slope + 2 * slope_delta, slope_delta)
max_slope = quantize_value(max_slope - 2 * slope_delta, slope_delta)
slope_choices = [round(min_slope + x * slope_delta, 4) for x in range(int((max_slope - min_slope) / slope_delta))]

observations_data = [
    [x, y]
    for x in slope_choices
    for y in log_c2c_nn_dist_choices
]
observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

bandwidths_data = [
    [0.01, 0.0005],
    [0.01, 0.001],
    [0.01, 0.005],
    [0.025, 0.0005],
    [0.025, 0.001],
    [0.025, 0.005],
    [0.05, 0.0005],
    [0.05, 0.001],
    [0.05, 0.005],
    [0.1, 0.0001],
    [0.1, 0.00025],
    [0.1, 0.0005],
]
bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
result.sort_values([f"observation_{x}" for x in observations.columns])

                                                                                

Unnamed: 0,observation_slope,observation_log_mean_c2c_nn_dist,bandwith_slope,bandwith_log_mean_c2c_nn_dist,target_mean,n_obs,n_unique_sims,percentiles
2398,1.1,0.9,0.100,0.00025,138130.879420,1103,16,"[7931, 9416, 16397, 152707, 231611, 287825, 28..."
3211,1.1,0.9,0.025,0.00500,41371.122449,49,1,"[41348, 41349, 41351, 41371, 41391, 41395, 41396]"
6490,1.1,0.9,0.050,0.00100,123207.154412,136,2,"[41350, 41353, 41364, 134573, 135344, 135365, ..."
7650,1.1,0.9,0.050,0.00050,127333.932432,74,2,"[41352, 41354, 134387, 135160, 135362, 135366,..."
9717,1.1,0.9,0.100,0.00050,136294.286171,2191,17,"[7830, 8375, 16291, 140544, 231633, 287830, 28..."
...,...,...,...,...,...,...,...,...
6366,2.8,2.8,0.050,0.00050,2.000000,1,1,"[2, 2, 2, 2, 2, 2, 2]"
7788,2.8,2.8,0.025,0.00050,2.000000,1,1,"[2, 2, 2, 2, 2, 2, 2]"
8140,2.8,2.8,0.025,0.00100,2.000000,1,1,"[2, 2, 2, 2, 2, 2, 2]"
13782,2.8,2.8,0.100,0.00050,2.000000,3,3,"[2, 2, 2, 2, 2, 2, 2]"


In [29]:
result.to_parquet("quantile_kernel_regression_grid_multiple_bandwidths.parquet")

In [65]:
result = result.sort_values([f"bandwith_{x}" for x in bandwidths.columns])

In [72]:
# Dione region 1, DEE
target_slope = 2.10
target_log_mean_c2c_nn_dist = 0.8136322010294104
r = result[(result.observation_slope == 2.1) & (result.observation_log_mean_c2c_nn_dist == 0.80)].copy()
r["95_percentile_absolute_range"] = r.percentiles.apply(lambda x: x[-1]) - r.percentiles.apply(lambda x: x[0])
r["95_percentile_ratio"] = r.percentiles.apply(lambda x: x[-1]) / r.percentiles.apply(lambda x: x[0])
r

Unnamed: 0,observation_slope,observation_log_mean_c2c_nn_dist,bandwith_slope,bandwith_log_mean_c2c_nn_dist,target_mean,n_obs,n_unique_sims,percentiles,95_percentile_absolute_range,95_percentile_ratio
8867,2.1,0.8,0.01,0.0005,98620.790572,12601,98,"[4751, 4992, 5488, 86395, 223221, 235944, 312353]",307602,65.744685
10342,2.1,0.8,0.01,0.001,101786.584058,25881,98,"[4769, 5006, 5521, 97981, 223202, 235889, 260985]",256216,54.725309
9130,2.1,0.8,0.01,0.005,100420.7869,130568,98,"[4760, 5014, 5525, 87671, 221856, 234620, 261193]",256433,54.872479
1194,2.1,0.8,0.025,0.0005,112935.159524,49541,271,"[4885, 5169, 5929, 113247, 227645, 247673, 324...",319342,66.371955
12946,2.1,0.8,0.025,0.001,112632.230295,99303,271,"[4907, 5187, 5915, 109543, 227589, 246739, 324...",319146,66.038924
4412,2.1,0.8,0.025,0.005,110181.841509,482948,271,"[4891, 5170, 5879, 107130, 225682, 242412, 250...",245109,51.114292
6601,2.1,0.8,0.05,0.0005,107754.179889,85703,535,"[4853, 5114, 5731, 98404, 227945, 249306, 340720]",335867,70.208119
11636,2.1,0.8,0.05,0.001,107720.240132,169669,535,"[4866, 5109, 5679, 98445, 228115, 249757, 340587]",335721,69.993218
12147,2.1,0.8,0.05,0.005,107792.196954,837052,535,"[4841, 5099, 5701, 98268, 230338, 249869, 326514]",321673,67.447635
12765,2.1,0.8,0.1,0.0001,104653.344087,29664,1030,"[4797, 5056, 5594, 88323, 230894, 246801, 325567]",320770,67.868876


In [67]:
# Dione region 2, DSP
target_slope = 2.66
target_log_mean_c2c_nn_dist = 0.6685316230432883
r = result[(result.observation_slope == 2.65) & (result.observation_log_mean_c2c_nn_dist == 0.65)].copy()
r["95_percentile_absolute_range"] = r.percentiles.apply(lambda x: x[-1]) - r.percentiles.apply(lambda x: x[0])
r["95_percentile_ratio"] = r.percentiles.apply(lambda x: x[-1]) / r.percentiles.apply(lambda x: x[0])
r

Unnamed: 0,observation_slope,observation_log_mean_c2c_nn_dist,bandwith_slope,bandwith_log_mean_c2c_nn_dist,target_mean,n_obs,n_unique_sims,percentiles,95_percentile_absolute_range,95_percentile_ratio
11360,2.65,0.65,0.01,0.0005,84718.356397,24902,86,"[9387, 10037, 11863, 38821, 181715, 318780, 32...",317152,34.7863
3963,2.65,0.65,0.01,0.001,85031.553982,46886,86,"[9362, 9981, 11656, 31155, 182071, 324492, 327...",318527,35.023392
10929,2.65,0.65,0.01,0.005,99068.376,346226,89,"[9789, 11044, 12714, 81340, 203999, 268766, 32...",312739,32.948003
2746,2.65,0.65,0.025,0.0005,67478.913578,63479,224,"[9431, 10131, 11497, 62512, 160820, 181629, 24...",234145,25.827166
1744,2.65,0.65,0.025,0.001,67617.379438,119287,224,"[9396, 10040, 11396, 50853, 166495, 203484, 31...",309012,33.887612
5916,2.65,0.65,0.025,0.005,82089.259363,710511,227,"[9592, 10359, 11881, 61222, 200787, 223996, 26...",258652,27.965388
11710,2.65,0.65,0.05,0.0005,93768.767623,184958,457,"[9828, 10733, 12598, 75989, 224156, 243582, 24...",239691,25.388584
14073,2.65,0.65,0.05,0.001,92260.895959,346440,457,"[9788, 10687, 12457, 74997, 222675, 239760, 24...",239310,25.449326
10781,2.65,0.65,0.05,0.005,96902.47209,1786499,464,"[9853, 10789, 12599, 81374, 215433, 236357, 24...",237788,25.133563
7636,2.65,0.65,0.1,0.0001,115275.525529,82966,959,"[9991, 11072, 13108, 88408, 239432, 251835, 28...",272963,28.320889


In [68]:
# Dione region 3, DICP
target_slope = 2.08
target_log_mean_c2c_nn_dist = 0.9338405903389017
r = result[(result.observation_slope == 2.05) & (result.observation_log_mean_c2c_nn_dist == 0.95)].copy()
r["95_percentile_absolute_range"] = r.percentiles.apply(lambda x: x[-1]) - r.percentiles.apply(lambda x: x[0])
r["95_percentile_ratio"] = r.percentiles.apply(lambda x: x[-1]) / r.percentiles.apply(lambda x: x[0])
r

Unnamed: 0,observation_slope,observation_log_mean_c2c_nn_dist,bandwith_slope,bandwith_log_mean_c2c_nn_dist,target_mean,n_obs,n_unique_sims,percentiles,95_percentile_absolute_range,95_percentile_ratio
9310,2.05,0.95,0.01,0.0005,2497.74751,1707,123,"[2142, 2183, 2217, 2410, 2897, 3168, 3260]",1118,1.521942
12590,2.05,0.95,0.01,0.001,2495.182776,3321,123,"[2155, 2185, 2216, 2402, 2894, 3171, 3273]",1118,1.518794
408,2.05,0.95,0.01,0.005,2507.175999,17324,123,"[2150, 2181, 2216, 2411, 2914, 3168, 3320]",1170,1.544186
1190,2.05,0.95,0.025,0.0005,2529.70426,3662,263,"[2158, 2186, 2222, 2451, 2924, 3229, 3523]",1365,1.63253
5685,2.05,0.95,0.025,0.001,2523.773881,7443,263,"[2159, 2189, 2224, 2448, 2907, 3219, 3533]",1374,1.636406
8719,2.05,0.95,0.025,0.005,2520.419237,37344,263,"[2151, 2181, 2216, 2440, 2925, 3193, 3341]",1190,1.553231
11016,2.05,0.95,0.05,0.0005,2507.091965,7318,533,"[2143, 2174, 2210, 2431, 2864, 3169, 3278]",1135,1.529631
1690,2.05,0.95,0.05,0.001,2506.182119,14798,533,"[2142, 2176, 2212, 2437, 2851, 3160, 3266]",1124,1.524743
3293,2.05,0.95,0.05,0.005,2522.796576,73821,533,"[2134, 2168, 2208, 2434, 2892, 3153, 3285]",1151,1.539363
11454,2.05,0.95,0.1,0.0001,2513.04436,2908,895,"[2148, 2168, 2202, 2430, 2914, 3195, 3305]",1157,1.538641


In [69]:
# DICP with lower nn dist:
r = result[(result.observation_slope == 2.05) & (result.observation_log_mean_c2c_nn_dist == 0.90)].copy()
r["95_percentile_absolute_range"] = r.percentiles.apply(lambda x: x[-1]) - r.percentiles.apply(lambda x: x[0])
r["95_percentile_ratio"] = r.percentiles.apply(lambda x: x[-1]) / r.percentiles.apply(lambda x: x[0])
r

Unnamed: 0,observation_slope,observation_log_mean_c2c_nn_dist,bandwith_slope,bandwith_log_mean_c2c_nn_dist,target_mean,n_obs,n_unique_sims,percentiles,95_percentile_absolute_range,95_percentile_ratio
3133,2.05,0.9,0.01,0.0005,5558.645288,2292,122,"[2790, 2826, 2884, 3240, 4169, 5315, 80789]",77999,28.956631
16679,2.05,0.9,0.01,0.001,5535.827368,4889,123,"[2794, 2833, 2893, 3257, 4204, 5340, 80790]",77996,28.915533
5846,2.05,0.9,0.01,0.005,6106.412804,24462,123,"[2757, 2815, 2887, 3256, 4236, 5391, 80951]",78194,29.361988
16195,2.05,0.9,0.025,0.0005,6152.130631,5305,262,"[2773, 2804, 2863, 3274, 4767, 5692, 80955]",78182,29.194014
6771,2.05,0.9,0.025,0.001,6233.177368,10949,263,"[2767, 2805, 2868, 3275, 4752, 5658, 81426]",78659,29.427539
14516,2.05,0.9,0.025,0.005,7172.927962,53805,263,"[2748, 2802, 2860, 3276, 4728, 31248, 81507]",78759,29.66048
12755,2.05,0.9,0.05,0.0005,7252.036802,10706,532,"[2765, 2802, 2866, 3262, 4669, 5770, 77644]",74879,28.081013
16059,2.05,0.9,0.05,0.001,7624.758909,21805,533,"[2762, 2803, 2870, 3268, 4716, 9999, 77750]",74988,28.149891
8218,2.05,0.9,0.05,0.005,8882.293881,107962,533,"[2745, 2801, 2863, 3271, 4839, 33872, 93615]",90870,34.103825
2311,2.05,0.9,0.1,0.0001,10793.751491,4193,961,"[2761, 2805, 2868, 3260, 4894, 34373, 147959]",145198,53.588917


In [70]:
# Dione 4 (DDCP)
target_slope = 2.15
target_log_mean_c2c_nn_dist = 0.5480628421533491
r = result[(result.observation_slope == 2.15) & (result.observation_log_mean_c2c_nn_dist == 0.55)].copy()
r["95_percentile_absolute_range"] = r.percentiles.apply(lambda x: x[-1]) - r.percentiles.apply(lambda x: x[0])
r["95_percentile_ratio"] = r.percentiles.apply(lambda x: x[-1]) / r.percentiles.apply(lambda x: x[0])
r

Unnamed: 0,observation_slope,observation_log_mean_c2c_nn_dist,bandwith_slope,bandwith_log_mean_c2c_nn_dist,target_mean,n_obs,n_unique_sims,percentiles,95_percentile_absolute_range,95_percentile_ratio
10525,2.15,0.55,0.01,0.0005,146838.945604,66365,30,"[22005, 26671, 40553, 141892, 236505, 280562, ...",295131,14.411997
2132,2.15,0.55,0.01,0.001,144494.004027,134830,30,"[21917, 26752, 40280, 126402, 241492, 299917, ...",307986,15.052379
15530,2.15,0.55,0.01,0.005,142077.620269,655154,33,"[23156, 27369, 39920, 126124, 240700, 297301, ...",306761,14.247582
12410,2.15,0.55,0.025,0.0005,143743.774451,184789,77,"[26180, 30535, 43041, 128076, 238961, 278807, ...",281084,11.736593
436,2.15,0.55,0.025,0.001,143593.662402,368506,78,"[26095, 30222, 42609, 128125, 241503, 279736, ...",284138,11.888599
13061,2.15,0.55,0.025,0.005,143265.792687,1929730,82,"[25949, 30447, 43295, 129558, 243050, 282838, ...",284361,11.958457
6784,2.15,0.55,0.05,0.0005,145645.205144,447173,175,"[26124, 32703, 45410, 142167, 242537, 273319, ...",273049,11.452036
16372,2.15,0.55,0.05,0.001,145290.742448,899779,176,"[25835, 32328, 44780, 141885, 244132, 274934, ...",273881,11.601161
15104,2.15,0.55,0.05,0.005,143950.274693,4490156,185,"[25480, 31675, 44601, 139829, 244856, 274241, ...",276120,11.836735
15291,2.15,0.55,0.1,0.0001,148852.16854,173715,360,"[25392, 32054, 42690, 142931, 247842, 291776, ...",295831,12.650559


In [23]:
# Dione region 1: DEE - extended
target = "n_craters_added_in_study_region"
target_slope = 2.10
slope_ci_width = 0.08
slope_step = 0.05
target_log_mean_c2c_nn_dist = 0.8136
target_log_mean_c2c_nn_dist_step = 0.05


observations_data = [
    [target_slope + x * slope_step, target_log_mean_c2c_nn_dist + y * target_log_mean_c2c_nn_dist_step]
    for x in range(-5, 6)
    for y in range(-5, 6)
]
observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

bandwidths_data = [
    [slope_ci_width / 2, 0.0025],
    [slope_ci_width / 2, 0.005],
    [slope_ci_width, 0.0025],
]
bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
result["95_ci_ratio"] = result.percentiles.map(lambda x: np.log10(x[-1] / x[0]))
result["95_ci_range"] = result.percentiles.map(lambda x: [x[0], x[1]])
dee_extended = result

                                                                                

In [26]:
result.sort_values("95_ci_ratio")[:25]

Unnamed: 0,observation_slope,observation_log_mean_c2c_nn_dist,bandwith_slope,bandwith_log_mean_c2c_nn_dist,target_mean,n_obs,n_unique_sims,percentiles,95_ci_ratio,95_ci_range
96,2.35,1.0636,0.04,0.0025,1294.466283,12338,391,"[1187, 1218, 1377, 1432]",0.081492,"[1187, 1218]"
53,2.35,1.0636,0.04,0.005,1295.895664,24795,391,"[1185, 1215, 1382, 1444]",0.085849,"[1185, 1215]"
103,2.25,1.0636,0.04,0.0025,1310.396422,11851,377,"[1196, 1230, 1407, 1479]",0.092237,"[1196, 1230]"
204,2.35,1.0636,0.08,0.0025,1300.770574,24169,760,"[1187, 1220, 1391, 1470]",0.092867,"[1187, 1220]"
172,2.3,1.0636,0.08,0.0025,1304.754516,24360,770,"[1188, 1223, 1397, 1476]",0.09427,"[1188, 1223]"
301,2.25,1.0636,0.04,0.005,1312.351263,24392,377,"[1191, 1226, 1409, 1491]",0.097566,"[1191, 1226]"
345,2.25,1.0636,0.08,0.0025,1312.406629,24590,780,"[1189, 1226, 1412, 1490]",0.098004,"[1189, 1226]"
180,2.2,1.0636,0.08,0.0025,1318.605638,24759,771,"[1197, 1231, 1422, 1501]",0.098287,"[1197, 1231]"
356,2.2,1.0636,0.04,0.0025,1313.205832,12860,405,"[1194, 1226, 1419, 1499]",0.098797,"[1194, 1226]"
305,2.3,1.0636,0.04,0.0025,1308.545651,11555,368,"[1182, 1224, 1406, 1484]",0.098816,"[1182, 1224]"


In [27]:
# Dione region 1: DEE
target = "n_craters_added_in_study_region"
target_slope = 2.10
slope_ci_width = 0.08
target_log_mean_c2c_nn_dist = 0.8136

# observations_data = [
#     [target_slope - slope_ci_width, target_log_mean_c2c_nn_dist],
#     [target_slope, target_log_mean_c2c_nn_dist],
#     [target_slope + slope_ci_width, target_log_mean_c2c_nn_dist],
# ]
# observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

# bandwidths_data = [
#     [slope_ci_width, 0.005],
# ]
# bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

# result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
# result[result.observation_slope == target_slope]
target_log_mean_c2c_nn_dist_step = 0.05


observations_data = [
    [target_slope + x * slope_step, target_log_mean_c2c_nn_dist + y * target_log_mean_c2c_nn_dist_step]
    for x in range(-5, 6)
    for y in range(-5, 6)
]
observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

bandwidths_data = [
    [slope_ci_width / 2, 0.0025],
    [slope_ci_width / 2, 0.005],
    [slope_ci_width, 0.0025],
]
bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
result["95_ci_ratio"] = result.percentiles.map(lambda x: np.log10(x[-1] / x[0]))
result["95_ci_range"] = result.percentiles.map(lambda x: [x[0], x[1]])
dee = result

                                                                                

In [13]:
percentiles = result[result.observation_slope == target_slope].percentiles.iloc[0]
print(f"Orders of magnitude: {np.log10(percentiles[-1] / percentiles[0])}")
print(f"95% CI: ({percentiles[0]}, {percentiles[-1]})")

Orders of magnitude: 1.7848574272443358
95% CI: (4388, 267377)


In [28]:
dee.sort_values("95_ci_ratio")

Unnamed: 0,observation_slope,observation_log_mean_c2c_nn_dist,bandwith_slope,bandwith_log_mean_c2c_nn_dist,target_mean,n_obs,n_unique_sims,percentiles,95_ci_ratio,95_ci_range
96,2.35,1.0636,0.04,0.0025,1294.466283,12338,391,"[1187, 1218, 1377, 1432]",0.081492,"[1187, 1218]"
53,2.35,1.0636,0.04,0.0050,1295.895664,24795,391,"[1185, 1215, 1382, 1444]",0.085849,"[1185, 1215]"
103,2.25,1.0636,0.04,0.0025,1310.396422,11851,377,"[1196, 1230, 1407, 1479]",0.092237,"[1196, 1230]"
204,2.35,1.0636,0.08,0.0025,1300.770574,24169,760,"[1187, 1220, 1391, 1470]",0.092867,"[1187, 1220]"
172,2.30,1.0636,0.08,0.0025,1304.754516,24360,770,"[1188, 1223, 1397, 1476]",0.094270,"[1188, 1223]"
...,...,...,...,...,...,...,...,...,...,...
32,1.85,0.9636,0.08,0.0025,42872.731775,87643,866,"[2072, 2179, 167152, 288950]",2.144433,"[2072, 2179]"
241,1.85,0.9136,0.08,0.0025,114128.441139,309923,866,"[2770, 3138, 260677, 438093]",2.199087,"[2770, 3138]"
112,1.85,0.9136,0.04,0.0050,100338.184777,243872,435,"[2732, 3053, 245278, 441880]",2.208824,"[2732, 3053]"
217,1.85,0.9136,0.04,0.0025,101761.061886,122241,435,"[2725, 3048, 249510, 442007]",2.210063,"[2725, 3048]"


In [29]:
# Dione region 2: DSP
target = "n_craters_added_in_study_region"
target_slope = 2.66
slope_plus_minus = 0.05
target_log_mean_c2c_nn_dist = 0.6685

# observations_data = [
#     [target_slope - slope_ci_width, target_log_mean_c2c_nn_dist],
#     [target_slope, target_log_mean_c2c_nn_dist],
#     [target_slope + slope_ci_width, target_log_mean_c2c_nn_dist],
# ]
# observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

# bandwidths_data = [
#     [slope_ci_width, 0.005],
# ]
# bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

# result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
# result[result.observation_slope == target_slope]

target_log_mean_c2c_nn_dist_step = 0.05


observations_data = [
    [target_slope + x * slope_step, target_log_mean_c2c_nn_dist + y * target_log_mean_c2c_nn_dist_step]
    for x in range(-5, 6)
    for y in range(-5, 6)
]
observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

bandwidths_data = [
    [slope_ci_width / 2, 0.0025],
    [slope_ci_width / 2, 0.005],
    [slope_ci_width, 0.0025],
]
bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
result["95_ci_ratio"] = result.percentiles.map(lambda x: np.log10(x[-1] / x[0]))
result["95_ci_range"] = result.percentiles.map(lambda x: [x[0], x[1]])
dsp_extended = result

                                                                                

In [93]:
percentiles = result[result.observation_slope == target_slope].percentiles.iloc[0]
print(f"Orders of magnitude: {np.log10(percentiles[-1] / percentiles[0])}")
print(f"95% CI: ({percentiles[0]}, {percentiles[-1]})")

Orders of magnitude: 1.482484902499199
95% CI: (8884, 269832)


In [None]:
dsp_extended.sort_values("95_ci_ratio")

In [30]:
# Dione region 3: DICP
target = "n_craters_added_in_study_region"
target_slope = 2.08
slope_ci_width = 0.09
target_log_mean_c2c_nn_dist = 0.9338

# observations_data = [
#     [target_slope - slope_ci_width, target_log_mean_c2c_nn_dist],
#     [target_slope, target_log_mean_c2c_nn_dist],
#     [target_slope + slope_ci_width, target_log_mean_c2c_nn_dist],
# ]
# observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

# bandwidths_data = [
#     [slope_ci_width, 0.005],
# ]
# bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

# result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
# result[result.observation_slope == target_slope]

target_log_mean_c2c_nn_dist_step = 0.05


observations_data = [
    [target_slope + x * slope_step, target_log_mean_c2c_nn_dist + y * target_log_mean_c2c_nn_dist_step]
    for x in range(-5, 6)
    for y in range(-5, 6)
]
observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

bandwidths_data = [
    [slope_ci_width / 2, 0.0025],
    [slope_ci_width / 2, 0.005],
    [slope_ci_width, 0.0025],
]
bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
result["95_ci_ratio"] = result.percentiles.map(lambda x: np.log10(x[-1] / x[0]))
result["95_ci_range"] = result.percentiles.map(lambda x: [x[0], x[1]])
dicp = result

                                                                                

In [95]:
percentiles = result[result.observation_slope == target_slope].percentiles.iloc[0]
print(f"Orders of magnitude: {np.log10(percentiles[-1] / percentiles[0])}")
print(f"95% CI: ({percentiles[0]}, {percentiles[-1]})")

Orders of magnitude: 0.2314474274605097
95% CI: (2300, 3919)


In [31]:
dicp.sort_values("95_ci_ratio")

Unnamed: 0,observation_slope,observation_log_mean_c2c_nn_dist,bandwith_slope,bandwith_log_mean_c2c_nn_dist,target_mean,n_obs,n_unique_sims,percentiles,95_ci_ratio,95_ci_range
191,2.33,1.1338,0.045,0.0025,922.114301,9650,435,"[846, 867, 979, 1014]",0.078668,"[846, 867]"
225,2.33,1.1338,0.090,0.0025,921.848111,18553,837,"[842, 866, 980, 1014]",0.080726,"[842, 866]"
141,2.28,1.1338,0.045,0.0025,926.402327,9368,420,"[844, 868, 985, 1017]",0.080979,"[844, 868]"
275,2.28,1.1338,0.090,0.0025,925.193104,19518,873,"[844, 868, 985, 1020]",0.082258,"[844, 868]"
329,2.08,1.1838,0.045,0.0025,736.665876,8850,491,"[669, 692, 786, 809]",0.082522,"[669, 692]"
...,...,...,...,...,...,...,...,...,...,...
64,1.88,0.9338,0.090,0.0025,64911.517317,139344,951,"[2413, 2558, 227742, 294552]",2.086605,"[2413, 2558]"
233,1.83,0.9838,0.090,0.0025,29163.051332,76969,952,"[1869, 1959, 128745, 230659]",2.091361,"[1869, 1959]"
259,1.83,0.9338,0.090,0.0025,96765.483216,239274,952,"[2488, 2733, 240147, 348497]",2.146349,"[2488, 2733]"
43,1.83,0.9338,0.045,0.0025,86652.304923,97195,470,"[2466, 2674, 237247, 350335]",2.152490,"[2466, 2674]"


In [32]:
# Dione region 3: DICP, D > 4000m
target = "n_craters_added_in_study_region"
target_slope = 2.08
slope_ci_width = 0.09
target_log_mean_c2c_nn_dist = 0.6304

# observations_data = [
#     [target_slope - slope_ci_width, target_log_mean_c2c_nn_dist],
#     [target_slope, target_log_mean_c2c_nn_dist],
#     [target_slope + slope_ci_width, target_log_mean_c2c_nn_dist],
# ]
# observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

# bandwidths_data = [
#     [slope_ci_width, 0.005],
# ]
# bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

# result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
# result[result.observation_slope == target_slope]

target_log_mean_c2c_nn_dist_step = 0.05


observations_data = [
    [target_slope + x * slope_step, target_log_mean_c2c_nn_dist + y * target_log_mean_c2c_nn_dist_step]
    for x in range(-5, 6)
    for y in range(-5, 6)
]
observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

bandwidths_data = [
    [slope_ci_width / 2, 0.0025],
    [slope_ci_width / 2, 0.005],
    [slope_ci_width, 0.0025],
]
bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
result["95_ci_ratio"] = result.percentiles.map(lambda x: np.log10(x[-1] / x[0]))
result["95_ci_range"] = result.percentiles.map(lambda x: [x[0], x[1]])
dicp_4k_plus = result

                                                                                

In [97]:
percentiles = result[result.observation_slope == target_slope].percentiles.iloc[0]
print(f"Orders of magnitude: {np.log10(percentiles[-1] / percentiles[0])}")
print(f"95% CI: ({percentiles[0]}, {percentiles[-1]})")

Orders of magnitude: 1.2782469662193756
95% CI: (16748, 317841)


In [34]:
dicp_4k_plus.sort_values("95_ci_ratio")

Unnamed: 0,observation_slope,observation_log_mean_c2c_nn_dist,bandwith_slope,bandwith_log_mean_c2c_nn_dist,target_mean,n_obs,n_unique_sims,percentiles,95_ci_ratio,95_ci_range
191,2.08,0.3804,0.045,0.0050,207343.027027,74,1,"[207276, 207283, 207497, 207507]",0.000484,"[207276, 207283]"
111,2.13,0.3804,0.045,0.0050,207343.027027,74,1,"[207276, 207283, 207497, 207507]",0.000484,"[207276, 207283]"
237,1.98,0.4304,0.045,0.0050,82615.570324,2716,1,"[80979, 81541, 83711, 83915]",0.015467,"[80979, 81541]"
148,1.83,0.4804,0.090,0.0025,167993.102452,5954,1,"[164898, 165359, 170928, 171375]",0.016732,"[164898, 165359]"
51,1.88,0.4804,0.045,0.0025,167993.102452,5954,1,"[164898, 165359, 170928, 171375]",0.016732,"[164898, 165359]"
...,...,...,...,...,...,...,...,...,...,...
283,1.93,0.8804,0.045,0.0025,92958.557430,146413,466,"[3202, 3505, 242454, 310643]",1.986840,"[3202, 3505]"
317,1.88,0.8804,0.090,0.0025,114805.089448,597742,951,"[3347, 4018, 242543, 325084]",1.987340,"[3347, 4018]"
239,1.93,0.8804,0.045,0.0050,94367.362940,291368,466,"[3200, 3501, 240092, 311697]",1.988583,"[3200, 3501]"
203,1.98,0.8804,0.045,0.0025,51980.157414,80495,452,"[3105, 3266, 195436, 340776]",2.040407,"[3105, 3266]"


In [33]:
# Dione region 4: DDCP
target = "n_craters_added_in_study_region"
target_slope = 2.15
slope_ci_width = 0.05
target_log_mean_c2c_nn_dist = 0.5481

# observations_data = [
#     [target_slope - slope_ci_width, target_log_mean_c2c_nn_dist],
#     [target_slope, target_log_mean_c2c_nn_dist],
#     [target_slope + slope_ci_width, target_log_mean_c2c_nn_dist],
# ]
# observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

# bandwidths_data = [
#     [slope_ci_width, 0.005],
# ]
# bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

# result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
# result[result.observation_slope == target_slope]
target_log_mean_c2c_nn_dist_step = 0.05


observations_data = [
    [target_slope + x * slope_step, target_log_mean_c2c_nn_dist + y * target_log_mean_c2c_nn_dist_step]
    for x in range(-5, 6)
    for y in range(-5, 6)
]
observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

bandwidths_data = [
    [slope_ci_width / 2, 0.0025],
    [slope_ci_width / 2, 0.005],
    [slope_ci_width, 0.0025],
]
bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
result["95_ci_ratio"] = result.percentiles.map(lambda x: np.log10(x[-1] / x[0]))
result["95_ci_range"] = result.percentiles.map(lambda x: [x[0], x[1]])
ddcp = result

                                                                                

In [101]:
percentiles = result[result.observation_slope == target_slope].percentiles.iloc[0]
print(f"Orders of magnitude: {np.log10(percentiles[-1] / percentiles[0])}")
print(f"95% CI: ({percentiles[0]}, {percentiles[-1]})")

Orders of magnitude: 1.0797471370653988
95% CI: (25629, 307949)


In [35]:
ddcp.sort_values("95_ci_ratio")

Unnamed: 0,observation_slope,observation_log_mean_c2c_nn_dist,bandwith_slope,bandwith_log_mean_c2c_nn_dist,target_mean,n_obs,n_unique_sims,percentiles,95_ci_ratio,95_ci_range
90,1.95,0.4481,0.025,0.0050,241242.500000,20,1,"[241233, 241234, 241250, 241252]",0.000034,"[241233, 241234]"
201,2.40,0.2981,0.050,0.0025,182919.861728,2025,1,"[182025, 182177, 183796, 184114]",0.004956,"[182025, 182177]"
127,2.10,0.3981,0.025,0.0025,220034.474878,15783,1,"[188035, 192444, 257981, 259161]",0.139331,"[188035, 192444]"
81,2.10,0.3981,0.025,0.0050,229260.445277,43373,2,"[187582, 191446, 258429, 261658]",0.144543,"[187582, 191446]"
208,2.25,0.3481,0.025,0.0025,158557.811948,53230,1,"[114226, 118296, 187882, 191905]",0.225321,"[114226, 118296]"
...,...,...,...,...,...,...,...,...,...,...
192,2.00,0.7981,0.025,0.0025,131306.018712,448581,274,"[5256, 8172, 237407, 373340]",1.851449,"[5256, 8172]"
287,2.30,0.7481,0.050,0.0025,95582.147390,313969,459,"[5902, 6707, 226313, 475692]",1.906327,"[5902, 6707]"
103,2.35,0.7481,0.050,0.0025,83357.532059,236331,481,"[5774, 6290, 246790, 477838]",1.917804,"[5774, 6290]"
63,2.35,0.7481,0.025,0.0025,110857.833328,167665,237,"[5872, 6547, 267375, 541837]",1.965083,"[5872, 6547]"


In [None]:
# DEE w/AD

target = "n_craters_added_in_study_region"
target_slope = 2.10
slope_ci_width = 0.08
target_log_mean_c2c_nn_dist = 0.8136322010294104
target_ad = 0.06

observations_data = [
    [target_slope - slope_ci_width, target_log_mean_c2c_nn_dist, target_ad],
    [target_slope, target_log_mean_c2c_nn_dist, target_ad],
    [target_slope + slope_ci_width, target_log_mean_c2c_nn_dist, target_ad],
]
observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist", "areal_density"])

bandwidths_data = [
    [0.01, 0.001, 0.1],
    [0.01, 0.005, 0.1],
    [0.005, 0.001, 0.1],
    [0.005, 0.005, 0.1],
]
bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist", "areal_density"])

result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
result

In [None]:
# DEE w/Za

target = "n_craters_added_in_study_region"
target_slope = 2.10
slope_ci_width = 0.08
target_log_mean_c2c_nn_dist = 0.8136322010294104
target_za = -0.23

observations_data = [
    [target_slope - slope_ci_width, target_log_mean_c2c_nn_dist, target_za],
    [target_slope, target_log_mean_c2c_nn_dist, target_za],
    [target_slope + slope_ci_width, target_log_mean_c2c_nn_dist, target_za],
]
observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist", "za"])

bandwidths_data = [
    [0.01, 0.001, 3],
    [0.01, 0.005, 3],
    [0.005, 0.001, 3],
    [0.005, 0.005, 3],
]
bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist", "za"])

result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
result

In [None]:
target = "n_craters_added_in_study_region"

observation = {
    "slope": 1.5,
    "log_mean_c2c_nn_dist": 1.8
}
bandwidths = {
    "slope": 0.1,
    "log_mean_c2c_nn_dist": 0.05
}

result = get_confidence_interval(data, configs_df, target, observation, bandwidths)

In [None]:
result

In [None]:
simulation_id = None
target = "n_craters_added_in_study_region"

bandwidths = {
    "slope": 0.01,
    "log_mean_c2c_nn_dist": 0.001
}

In [None]:
# Dione 1 (DEE)
target_slope = 2.10
slope_ci_width = 0.08
target_log_mean_c2c_nn_dist = 0.8136322010294104

observation = {
    "slope": target_slope,
    "log_mean_c2c_nn_dist": target_log_mean_c2c_nn_dist
}
result = get_confidence_interval(data, configs_df, target, observation, bandwidths)
dione1_result = result
result

In [None]:
# Dione 2 (DSP)
target_slope = 2.66
slope_plus_minus = 0.05
target_log_mean_c2c_nn_dist = 0.6685316230432883
target_effective_radius_multiplier = 1.3

observation = {
    "slope": target_slope,
    "log_mean_c2c_nn_dist": target_log_mean_c2c_nn_dist
}
result = get_confidence_interval(data, configs_df, target, observation, bandwidths)
dione2_result = result
result

In [None]:
# Dione 2 (DSP)
target_slope = 2.66
slope_plus_minus = 0.05
target_log_mean_c2c_nn_dist = 0.6685316230432883
target_effective_radius_multiplier = 1.3

observation = {
    "slope": target_slope,
    "log_mean_c2c_nn_dist": target_log_mean_c2c_nn_dist
}
result = get_confidence_interval(data, configs_df, target, observation, bandwidths)
dione2_result = result
result

In [None]:
# Dione 2 (DSP)
# Wider slope bandwidth
target_slope = 2.8
slope_plus_minus = 0.05
target_log_mean_c2c_nn_dist = 0.6685316230432883
target_effective_radius_multiplier = 1.3

observation = {
    "slope": target_slope,
    "log_mean_c2c_nn_dist": target_log_mean_c2c_nn_dist
}
result = get_confidence_interval(data, configs_df, target, observation, bandwidths)
dione2_result = result
result

In [None]:
# Dione 3 (DICP)
target_slope = 2.08
slope_ci_width = 0.09
target_log_mean_c2c_nn_dist = 0.9338405903389017

observation = {
    "slope": target_slope,
    "log_mean_c2c_nn_dist": target_log_mean_c2c_nn_dist
}
result = get_confidence_interval(data, configs_df, target, observation, bandwidths)
dione3_result = result
result

In [None]:
# Dione 4 (DDCP)
target_slope = 2.15
slope_ci_width = 0.05
target_log_mean_c2c_nn_dist = 0.5480628421533491

observation = {
    "slope": target_slope,
    "log_mean_c2c_nn_dist": target_log_mean_c2c_nn_dist
}
result = get_confidence_interval(data, configs_df, target, observation, bandwidths)
dione4_result = result
result

In [None]:
# Dione 2 (DSP)
# With eff radius multiplier
target_slope = 2.66
slope_plus_minus = 0.05
target_log_mean_c2c_nn_dist = 0.6685316230432883
target_effective_radius_multiplier = 1.3

observation = {
    "slope": target_slope,
    "log_mean_c2c_nn_dist": target_log_mean_c2c_nn_dist,
    "effective_radius_multiplier": target_effective_radius_multiplier
}
bandwidths_with_eff = bandwidths.copy()
bandwidths_with_eff.update({"effective_radius_multiplier": 0.05})
result = get_confidence_interval(data, configs_df, target, observation, bandwidths_with_eff)
dione2_with_eff_result = result
result

In [None]:
# Try using SQL
simulation_id = None
target = "n_craters_added_in_study_region"
bandwidths = {
    "slope": 0.1,
    "log_mean_c2c_nn_dist": 0.05
}

observation = {
    "slope": 1.5,
    "log_mean_c2c_nn_dist": 1.8
}


data.createOrReplaceTempView("data")
configs_df.createOrReplaceTempView("configs")

simulation_id_clause = f"AND data.simulation_id <> {simulation_id}\n" if simulation_id else ""
bandwidth_clauses = ""
for var, bandwidth in bandwidths.items():
    observation_value = observation[var]
    bandwidth_clauses += f"AND {var} BETWEEN {observation_value - bandwidth} AND {observation_value + bandwidth}\n"

result = spark.sql(f"""
SELECT
    *
    -- approx_percentile({target}, array(0.025, 0.5, 0.975), 10000) as percentiles
FROM
    data
    INNER JOIN configs
        ON data.simulation_id = configs.simulation_id
WHERE
    1=1
    {simulation_id_clause}
    {bandwidth_clauses}
""").toPandas()

In [None]:
data.columns

In [None]:
slope_delta = 0.1
log_mean_c2c_nn_dist_delta = 0.1
effective_radius_multiplier_delta = 0.1

min_slope, max_slope, n_slope_buckets = get_min_max_n_buckets("slope", slope_delta, configs_df)
min_log_mean_c2c_nn_dist, max_log_mean_c2c_nn_dist, n_log_mean_c2c_nn_dist_buckets = get_min_max_n_buckets("log_mean_c2c_nn_dist", log_mean_c2c_nn_dist_delta, data)
min_effective_radius_multiplier, max_effective_radius_multiplier, n_effective_radius_multiplier_buckets = get_min_max_n_buckets("effective_radius_multiplier", effective_radius_multiplier_delta, configs_df)

In [None]:
targets_array = [
    [np.round(x, decimals=5), np.round(y, decimals=5), np.round(z, decimals=5)]
    for x in np.linspace(min_slope, max_slope, n_slope_buckets)
    for y in np.linspace(min_log_mean_c2c_nn_dist, max_log_mean_c2c_nn_dist, n_log_mean_c2c_nn_dist_buckets)
    for z in np.linspace(min_effective_radius_multiplier, max_effective_radius_multiplier, n_effective_radius_multiplier_buckets)
]

targets = pd.DataFrame(targets_array, columns=["target_slope", "target_log_mean_c2c_nn_dist", "target_effective_radius_multiplier"])
targets_spark_df = spark.createDataFrame(targets)
data_with_targets = data.join(F.broadcast(targets_spark_df), how="cross")

In [None]:
data_with_configs = data_with_targets.join(F.broadcast(configs_df), on="simulation_id")

In [None]:
# Parameters
simulation_id = None
slope_ci_width = 0.05
log_mean_c2c_nn_dist_width = 0.01
effective_radius_multiplier_width = 0.1

# Select where not equal to a simulation ID
data_to_query = data_with_configs
if simulation_id is not None:
    data_to_query = data_to_query.where(F.col("simulation_id") != simulation_id)

# Select within a range of slope and log_c2c_nn_dist_mean
result = (
    data_to_query
        .where((F.col("slope") > F.col("target_slope") - F.lit(slope_width))
               & (F.col("slope") <= F.col("target_slope") + F.lit(slope_width))
               & (F.col("log_mean_c2c_nn_dist") > F.col("target_log_mean_c2c_nn_dist") - F.lit(log_mean_c2c_nn_dist_width))
               & (F.col("log_mean_c2c_nn_dist") <= F.col("target_log_mean_c2c_nn_dist") + F.lit(log_mean_c2c_nn_dist_width))
               & (F.col("effective_radius_multiplier") > F.col("target_effective_radius_multiplier") - F.lit(effective_radius_multiplier_width))
               & (F.col("effective_radius_multiplier") <= F.col("target_effective_radius_multiplier") + F.lit(effective_radius_multiplier_width)))
        .groupby("target_slope", "target_log_mean_c2c_nn_dist", "target_effective_radius_multiplier")
        .agg(
            F.percentile_approx("n_craters_added_in_study_region", F.array(*[F.lit((x) / 40.0) for x in range(40)]), 10000).alias("N_percentiles"),
            F.mean("n_craters_added_in_study_region").alias("N_mean"),
            F.stddev("n_craters_added_in_study_region").alias("N_stdev"),
            F.count("n_craters_added_in_study_region").alias("count")
        )
)

In [None]:
df = result.toPandas()
df

In [None]:
df[df.target_slope == 2.1].sort_values(["target_log_mean_c2c_nn_dist", "target_effective_radius_multiplier"])

In [None]:
def get_single_confidence_interval(target_slope: float,
                                   slope_delta: float,
                                   target_log_mean_c2c_nn_dist: float,
                                   log_mean_c2c_nn_dist_delta: float,
                                   target_effective_radius_multiplier: float,
                                   effective_radius_multiplier_delta: float,
                                   data: pd.DataFrame) -> Tuple[float, float, float]:
    target_slope = quantize_value(target_slope, slope_delta)
    target_log_mean_c2c_nn_dist = quantize_value(target_log_mean_c2c_nn_dist, log_mean_c2c_nn_dist_delta)
    target_effective_radius_multiplier = quantize_value(target_effective_radius_multiplier, effective_radius_multiplier_delta)
    
    row = data[(np.abs(data.target_slope - target_slope) < 0.001)
                & (np.abs(data.target_log_mean_c2c_nn_dist - target_log_mean_c2c_nn_dist) < 0.001)
                & (np.abs(data.target_effective_radius_multiplier - target_effective_radius_multiplier) < 0.001)
            ].iloc[0]

    # 2.5th and 97.5th percentiles
    return row.N_percentiles[1], row.N_percentiles[-2]

def get_confidence_intervals(target_slope: float,
                             slope_delta: float,
                             slope_ci_width: float,
                             target_log_mean_c2c_nn_dist: float,
                             log_mean_c2c_nn_dist_delta: float,
                             target_effective_radius_multiplier: float,
                             effective_radius_multiplier_delta: float,
                             data: pd.DataFrame) -> Tuple[float, float]:
    lower_ci = get_single_confidence_interval(target_slope - slope_ci_width, slope_delta, target_log_mean_c2c_nn_dist, log_mean_c2c_nn_dist_delta, target_effective_radius_multiplier, effective_radius_multiplier_delta, data)
    mean_ci = get_single_confidence_interval(target_slope, slope_delta, target_log_mean_c2c_nn_dist, log_mean_c2c_nn_dist_delta, target_effective_radius_multiplier, effective_radius_multiplier_delta, data)
    upper_ci = get_single_confidence_interval(target_slope + slope_ci_width, slope_delta, target_log_mean_c2c_nn_dist, log_mean_c2c_nn_dist_delta, target_effective_radius_multiplier, effective_radius_multiplier_delta, data)

    return [lower_ci, mean_ci, upper_ci]

In [None]:
# Dione 1 (DEE)
# target_slope = 2.10
# slope_ci_width = 0.08
# target_log_mean_c2c_nn_dist = 0.8136322010294104

# Dione 2 (DSP)
target_slope = 2.66
slope_plus_minus = 0.05
target_log_mean_c2c_nn_dist = 0.6685316230432883
target_effective_radius_multiplier = 1.3

# Dione 3 (DICP)
# target_slope = 2.08
# slope_ci_width = 0.09
# target_log_mean_c2c_nn_dist = 0.9338405903389017

# Dione 4 (DDCP)
# target_slope = 2.15
# slope_ci_width = 0.05
# target_log_mean_c2c_nn_dist = 0.5480628421533491


get_confidence_intervals(target_slope, slope_delta, slope_ci_width, target_log_mean_c2c_nn_dist, log_mean_c2c_nn_dist_delta, target_effective_radius_multiplier, effective_radius_multiplier_delta, df)