In [1]:
import sys

import pandas as pd
import numpy as np
from pathlib import Path
import yaml
from typing import *

import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import matplotlib.pyplot as plt

import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, DataFrame, Window
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials, space_eval
import itertools

from pyspark.ml.feature import VectorAssembler, MinMaxScaler, BucketedRandomProjectionLSH
from pyspark.ml import Pipeline
from pyspark.ml.functions import vector_to_array

In [2]:
n_cores = 30

spark = (SparkSession.builder
         .master(f"local[{n_cores}]")
         .appName("Saturation")
         .config("spark.sql.shuffle.partitions", "500")
         .config("spark.driver.memory", "64g")
         .config("spark.driver.maxResultSize", "8g")
         .getOrCreate())

23/11/24 14:51:52 WARN Utils: Your hostname, muninn resolves to a loopback address: 127.0.1.1; using 192.168.86.20 instead (on interface enp8s0)
23/11/24 14:51:52 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/24 14:51:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/11/24 14:51:53 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
def read_config(path: Path) -> Dict:
    with path.open("r") as config_file:
        config = yaml.safe_load(config_file)
    return config


def read_configs(base_path: str, spark_session: SparkSession) -> pyspark.RDD:
    completed_filenames = list(Path(base_path).glob("*/completed.txt"))
    configs = map(lambda x: x.parent / "config.yaml", completed_filenames)
    configs = map(read_config, configs)
    return spark_session.sparkContext.parallelize(configs)


def create_configs_df(configs: pyspark.RDD) -> DataFrame:
    config_columns = [
        "simulation_id",
        "slope",
        "r_stat_multiplier",
        "effective_radius_multiplier",
        "min_rim_percentage"
    ]
    return configs.map(lambda x: {k: v for k, v in x.items() if k in config_columns}).toDF().cache()
    

def quantize_value(value: float, delta: float) -> float:
    return np.round(int(np.round(value / delta, decimals=5)) * delta, decimals=5)


def get_min_max_n_buckets(column: str, delta: float, df) -> Tuple[float, float, int]:
    min_max_df = df.select(F.min(column), F.max(column)).toPandas()
    
    min_val = quantize_value(min_max_df.iloc[0, 0], delta)
    max_val = quantize_value(min_max_df.iloc[0, 1], delta)
    n_buckets = int(np.round((max_val - min_val) / delta, decimals=5)) + 1

    return min_val, max_val, n_buckets

In [4]:
base_path = "/data/saturation/n_craters_stop_condition_20230918"

In [5]:
r_stat = 5
study_region_size = 4000 * 4000 / r_stat ** 2

configs_df = create_configs_df(read_configs(base_path, spark))
data = spark.read.parquet(f"{base_path}/*/statistics_*.parquet").coalesce(500)

# State c2c nn dist in terms of r_stat and log-scale
data = data.select("*",
                   F.log10(F.col("center_to_center_nearest_neighbor_distance_mean") / F.lit(r_stat)).alias("log_mean_c2c_nn_dist"),
                   F.log10(F.col("n_craters_in_study_region") / study_region_size).alias("log_intensity"),
                   F.log10("areal_density").alias("log_ad")
                  )

                                                                                

In [None]:
# def get_confidence_interval(data: DataFrame,
#                             configs: DataFrame,
#                             target: str,
#                             observation: Dict[str, float],
#                             bandwidths: Dict[str, float],
#                             simulation_id: Optional[int] = None) -> pd.DataFrame:
#     data.createOrReplaceTempView("data")
#     configs.createOrReplaceTempView("configs")
    
#     simulation_id_clause = f"AND data.simulation_id <> {simulation_id}\n" if simulation_id else ""
#     bandwidth_clauses = ""
#     for var, bandwidth in bandwidths.items():
#         observation_value = observation[var]
#         bandwidth_clauses += f"AND {var} BETWEEN {observation_value - bandwidth} AND {observation_value + bandwidth}\n"
    
#     result = spark.sql(f"""
#     SELECT
#         mean({target}) as target_mean,
#         count({target}) as n_obs,
#         count(distinct data.simulation_id) as n_unique_sims,
#         approx_percentile({target}, array(0.025, 0.05, 0.10, 0.5, 0.90, 0.95, 0.975), 10000) as percentiles
#     FROM
#         data
#         INNER JOIN configs
#             ON data.simulation_id = configs.simulation_id
#     WHERE
#         1=1
#         {simulation_id_clause}
#         {bandwidth_clauses}
#     """).toPandas()

#     return result

In [None]:
# def get_confidence_intervals(data: DataFrame,
#                              configs: DataFrame,
#                              target: str,
#                              observations: pd.DataFrame,
#                              bandwidths: pd.DataFrame,
#                              spark_session: SparkSession) -> pd.DataFrame:
#     data.createOrReplaceTempView("data")
#     configs.createOrReplaceTempView("configs")

#     observations_df = spark_session.createDataFrame(observations).cache()
#     observations_df.createOrReplaceTempView("observations")
#     bandwidths_df = spark_session.createDataFrame(bandwidths).cache()
#     bandwidths_df.createOrReplaceTempView("bandwidths")

#     # Trick it into caching and broadcasting
#     observations_df.count()
#     bandwidths_df.count()

#     simulation_id_present = "simulation_id" in observations.columns
    
#     group_by_clause = ",\n".join([f"o.{x}" for x in observations.columns] + [f"b.{x}" for x in bandwidths.columns])
#     if simulation_id_present:
#         simulation_id_clause = "AND d.simulation_id <> o.simulation_id\n"
#         target_select_clause = "o.target AS target,\n"
#         bandwidths_select_clause = "o.simulation_id AS simulation_id, o.crater_id AS crater_id,\n"
#     else:
#         simulation_id_clause = ""
#         target_select_clause = ""
#         bandwidths_select_clause = ""
       
#     observations_select_clause = ""
#     bandwidth_clauses = ""
#     for var in observations.columns:
#         observations_select_clause += f"o.{var} as observation_{var},\n"
#         if var in bandwidths.columns:
#             bandwidth_clauses += f"AND d.{var} BETWEEN (o.{var} - b.{var}) AND (o.{var} + b.{var})\n"

#     for var in bandwidths.columns:
#         bandwidths_select_clause += f"b.{var} as bandwith_{var},\n"

#     query = f"""
#     SELECT
#         {observations_select_clause}
#         {bandwidths_select_clause}
#         {target_select_clause}
#         mean({target}) as target_mean,
#         count({target}) as n_obs,
#         count(distinct d.simulation_id) as n_unique_sims,
#         approx_percentile({target}, array(0.025, 0.10, 0.90, 0.975), 1000) as percentiles
#     FROM
#         (
#             SELECT
#                 configs.slope,
#                 configs.r_stat_multiplier,
#                 configs.effective_radius_multiplier,
#                 configs.min_rim_percentage,
#                 data.*
#             FROM
#                 data
#                 INNER JOIN configs
#                     ON data.simulation_id = configs.simulation_id
#         ) AS d
#         INNER JOIN observations o
#         INNER JOIN bandwidths b
#     WHERE
#         1=1
#         {simulation_id_clause}
#         {bandwidth_clauses}
#     GROUP BY
#         {group_by_clause}
#     """
#     result = spark.sql(query)

#     return result.toPandas()

In [6]:
# Using HyperOpt to optimize
def get_confidence_intervals(data: DataFrame,
                             observations: DataFrame,
                             predictor_variables: List[str],
                             bandwidths: pd.DataFrame,
                             spark_session: SparkSession) -> pd.DataFrame:
    data.createOrReplaceTempView("data")
    
    bandwidths_df = spark_session.createDataFrame(bandwidths).cache()
    bandwidths_df.createOrReplaceTempView("bandwidths")

    # Trick it into caching and broadcasting
    bandwidths_df.count()

    simulation_id_present = "simulation_id" in observations.columns
    
    group_by_clause = ",\n ".join([f"o.{x}" for x in predictor_variables] + [f"b.{x}" for x in bandwidths.columns])

    if "target" in observations.columns:
        group_by_clause += "\n, o.target"
        
    if simulation_id_present:
        simulation_id_clause = "AND d.simulation_id <> o.simulation_id\n"
        target_select_clause = "o.target AS target,"
        bandwidths_select_clause = "o.simulation_id AS simulation_id, o.crater_id AS crater_id,\n"
        group_by_clause += "\n, o.simulation_id, o.crater_id"
    else:
        simulation_id_clause = ""
        target_select_clause = ""
        bandwidths_select_clause = ""
        

    observations_select_clause = ""
    bandwidth_clauses = "1=1 "
    for var in predictor_variables:
        observations_select_clause += f"o.{var} as observation_{var},\n"
        if var in bandwidths.columns:
            bandwidth_clauses += f"AND d.{var} BETWEEN (o.{var} - b.{var}) AND (o.{var} + b.{var})\n"

    for var in predictor_variables:
        bandwidths_select_clause += f"b.{var} as bandwidth_{var},\n"

    query = f"""
    SELECT
        {observations_select_clause}
        {bandwidths_select_clause}
        {target_select_clause}
        mean(d.target) as target_mean,
        count(d.target) as n_obs,
        count(distinct d.simulation_id) as n_unique_sims,
        approx_percentile(d.target, array(0.025, 0.975), 500) as percentiles
    FROM
        observations o
        INNER JOIN bandwidths b
        LEFT JOIN data d
            ON {bandwidth_clauses}
    WHERE
        1=1
        {simulation_id_clause}
    GROUP BY
        {group_by_clause}
    """
    result = spark.sql(query)

    return result.toPandas()


def setup_dataset(data: DataFrame,
                  configs: DataFrame,
                  predictor_variables: List[str],
                  target: str,
                  spark: SparkSession):
    data.createOrReplaceTempView("data")
    configs.createOrReplaceTempView("configs")
    
    # Join data and config
    data_and_config_select_clause = ",\n".join(["data.simulation_id", "crater_id"] + predictor_variables)
    data_and_config_select_clause += f",\n {target} as target"
    query = f"""
    SELECT
        {data_and_config_select_clause}
    FROM
        data
        INNER JOIN configs
            ON data.simulation_id = configs.simulation_id
    """
    data_and_configs = spark.sql(query)

    return data_and_configs


def setup_datasets_for_optimization(data: DataFrame,
                                    configs: DataFrame,
                                    predictor_variables: List[str],
                                    target: str,
                                    data_sample_fraction: float,
                                    test_sample_fraction: float,
                                    spark: SparkSession,
                                    cache_data: bool = True,
                                    cache_sample: bool = True):
    data_and_configs = setup_dataset(data, configs, predictor_variables, target, spark)

    if cache_data:
        data_sample = data_and_configs.sample(data_sample_fraction).cache()
        data_sample.createOrReplaceTempView("data")
        data_sample.count()
    else:
        data_sample = data_and_configs.sample(data_sample_fraction)
        data_sample.createOrReplaceTempView("data")

    if cache_sample:
        test_sample = data_and_configs.sample(test_sample_fraction).cache()
        test_sample.createOrReplaceTempView("observations")
        test_sample.count()
    else:
        test_sample = data_and_configs.sample(test_sample_fraction)
        test_sample.createOrReplaceTempView("observations")
    
    return data_sample, test_sample
    

def create_objective_function(data_sample: DataFrame,
                              observations: DataFrame,
                              predictor_variables: List[str],
                              spark: SparkSession):
    def objective_function(args):
        min_obs = 100
        n_too_many_obs = 500000
        
        bandwidths_data = [[args[f"bandwidth_{x}"] for x in predictor_variables]]
        bandwidths = pd.DataFrame(bandwidths_data, columns=predictor_variables)
        
        result = get_confidence_intervals(data_sample, observations, predictor_variables, bandwidths, spark)

        ci_high = result.percentiles.map(lambda x: x[-1])
        ci_low = result.percentiles.map(lambda x: x[0])
        orders_of_magnitude_rms = np.sqrt((np.log10(ci_high / ci_low) ** 2).mean())
        orders_of_magnitude = np.log10(ci_high / ci_low).mean()
        orders_of_magnitude_stdev = np.log10(ci_high / ci_low).std()
        mean_n_unique_sims = result.n_unique_sims.mean()
        mean_n_obs = result.n_obs.mean()
        
        percent_inside_ci = ((result.target >= ci_low) & (result.target <= ci_high)).mean()

        # Penalize if more or less than 5% are outside of the CI
        deviance_from_95 = np.abs(0.95 - percent_inside_ci)
        deviance_loss_multiplier = 1.0 if deviance_from_95 < 0.02 else 1 + deviance_from_95 * 10.0
        
        # Penalize for the fraction with too few observations
        too_few = result.n_obs[(result.n_obs < min_obs) | (result.n_obs.isna())]
        n_obs_loss_multiplier = 1 + too_few.mean() * 100 if too_few.shape[0] > 0 else 1.0

        # Penalize for the fraction with too many observations
        too_many = result.n_obs > n_too_many_obs
        n_too_many_obs_loss_multiplier = 1 + too_many.mean() * 10 if too_many.shape[0] > 0 else 1.0

        loss = orders_of_magnitude_rms * deviance_loss_multiplier * n_obs_loss_multiplier * n_too_many_obs_loss_multiplier

        print(f"{args}, {loss:.3f}, {orders_of_magnitude:.4f}, {orders_of_magnitude_stdev:.4f}, {orders_of_magnitude_rms:.4f}, {n_obs_loss_multiplier:.4f}, {n_too_many_obs_loss_multiplier:.4f}, {deviance_loss_multiplier:.4f}, {mean_n_unique_sims}, {mean_n_obs:.4f}, ")
        
        return {
            "loss": loss,
            "params": args,
            "orders_of_magnitude": orders_of_magnitude,
            "orders_of_magnitude_rms": orders_of_magnitude_rms,
            "n_obs_loss_multiplier": n_obs_loss_multiplier,
            "deviance_loss_multiplier": deviance_loss_multiplier,
            "mean_n_unique_sims": mean_n_unique_sims,
            "mean_n_obs": mean_n_obs,
            "status": STATUS_OK
        }
        
    return objective_function

In [7]:
def make_updates_and_report(data: pd.DataFrame) -> DataFrame:
    data["95_ci_ratio"] = data.percentiles.map(lambda x: np.log10(x[-1] / x[0]))
    data["95_ci_low"] = data.percentiles.map(lambda x: x[0])
    data["95_ci_high"] = data.percentiles.map(lambda x: x[-1])
    
    data["80_ci_low"] = data.percentiles.map(lambda x: x[1])
    data["80_ci_high"] = data.percentiles.map(lambda x: x[-2])

    ci_95_hits = ((data.target >= data["95_ci_low"]) & (data.target <= data["95_ci_high"])).mean()
    ci_80_hits = ((data.target >= data["80_ci_low"]) & (data.target <= data["80_ci_high"])).mean()

    data["95_percentile_absolute_range"] = data.percentiles.apply(lambda x: x[-1]) - data.percentiles.apply(lambda x: x[0])
    data["95_percentile_ratio"] = data.percentiles.apply(lambda x: x[-1]) / data.percentiles.apply(lambda x: x[0])
    perc_ratio = np.mean(np.log10(data["95_percentile_ratio"]))

    mean_n_obs = data.n_obs.mean()
    n_obs_05 = data.n_obs.quantile(0.05)
    n_obs_25 = data.n_obs.quantile(0.25)
    n_obs_75 = data.n_obs.quantile(0.75)

    print(f"{ci_95_hits:.3f}, {ci_80_hits:.3f}, {perc_ratio:.3f}, {mean_n_obs:.3f}, {n_obs_05:.3f}, {n_obs_25:.3f}, {n_obs_75:.3f}")

In [8]:
# New hyperopt run

In [9]:
n_samples = 5000
n_obs_total = 2754157738
data_sample_fraction = 0.25
test_sample_fraction = n_samples / n_obs_total
target = "n_craters_added_in_study_region"
predictor_variables = [
    "slope",
    "log_mean_c2c_nn_dist",
    "log_ad"
]

data_sample, test_sample = setup_datasets_for_optimization(data, configs_df, predictor_variables, target, data_sample_fraction, test_sample_fraction, spark, cache_data=False, cache_sample=True)

trials = Trials()
space = {
    # "bandwidth_slope": hp.uniform("bandwidth_slope", 0.1499, 0.15),
    "bandwidth_slope": hp.choice("bandwidth_slope", [0.1]),
    "bandwidth_log_mean_c2c_nn_dist": hp.uniform("bandwidth_log_mean_c2c_nn_dist", 0.0005, 0.02),
    "bandwidth_log_ad": hp.uniform("bandwidth_log_ad", 0.001, 0.15),
}

best = fmin(create_objective_function(data_sample, test_sample, predictor_variables, spark),
            space=space,
            algo=tpe.suggest,
            max_evals=100)



  0%|             | 0/100 [00:00<?, ?trial/s, best loss=?]



{'bandwidth_log_ad': 0.10783117230901547, 'bandwidth_log_mean_c2c_nn_dist': 0.010874825931929153, 'bandwidth_slope': 0.1}, 14.919, 1.3742, 0.3377, 1.4150, 1.0000, 10.5430, 1.0000, 583.7412772903489, 3144004.0396, 
  1%| | 1/100 [1:00:09<99:16:09, 3609.79s/trial, best loss



{'bandwidth_log_ad': 0.06615375780316522, 'bandwidth_log_mean_c2c_nn_dist': 0.013230994044638926, 'bandwidth_slope': 0.1}, 14.602, 1.3657, 0.3445, 1.4085, 1.0000, 10.3675, 1.0000, 529.9022648439094, 2994619.1871, 
  2%| | 2/100 [1:57:28<95:31:21, 3509.00s/trial, best loss



{'bandwidth_log_ad': 0.06360108789894536, 'bandwidth_log_mean_c2c_nn_dist': 0.002056107778387604, 'bandwidth_slope': 0.1}, 39308.072, 1.3625, 0.3512, 1.4071, 5826.0000, 4.7951, 1.0000, 448.19893899204243, 460388.5885, 
  3%| | 3/100 [2:38:10<81:25:25, 3021.91s/trial, best loss



{'bandwidth_log_ad': 0.0156654683446627, 'bandwidth_log_mean_c2c_nn_dist': 0.008741557197830166, 'bandwidth_slope': 0.1}, 40234.747, 1.3442, 0.3706, 1.3944, 5201.0000, 5.5481, 1.0000, 346.75943684962255, 692079.5134, 
  4%| | 4/100 [3:21:21<76:03:03, 2851.91s/trial, best loss



{'bandwidth_log_ad': 0.08511018017933902, 'bandwidth_log_mean_c2c_nn_dist': 0.009675676169845722, 'bandwidth_slope': 0.1}, 14.598, 1.3700, 0.3418, 1.4119, 1.0000, 10.3389, 1.0000, 545.0903897163844, 2508126.7492, 
  5%| | 5/100 [4:15:23<78:57:49, 2992.31s/trial, best loss



{'bandwidth_log_ad': 0.11868081963471469, 'bandwidth_log_mean_c2c_nn_dist': 0.01760474595550032, 'bandwidth_slope': 0.1}, 15.194, 1.3764, 0.3344, 1.4164, 1.0000, 10.7266, 1.0000, 622.6306876147725, 5273270.8604, 
  6%| | 6/100 [5:31:31<92:07:21, 3528.10s/trial, best loss



{'bandwidth_log_ad': 0.10883839782273212, 'bandwidth_log_mean_c2c_nn_dist': 0.019544262742744484, 'bandwidth_slope': 0.1}, 15.192, 1.3758, 0.3343, 1.4158, 1.0000, 10.7307, 1.0000, 620.539073658437, 5627336.8286, 
  7%| | 7/100 [6:53:10<102:43:28, 3976.43s/trial, best los



{'bandwidth_log_ad': 0.009549974608668601, 'bandwidth_log_mean_c2c_nn_dist': 0.0063654696445903935, 'bandwidth_slope': 0.1}, 17977.362, 1.3395, 0.3773, 1.3916, 4201.0000, 3.0751, 1.0000, 292.4176698632932, 323022.1806, 
  8%| | 8/100 [7:32:03<88:14:58, 3453.25s/trial, best loss



{'bandwidth_log_ad': 0.14921682891780272, 'bandwidth_log_mean_c2c_nn_dist': 0.015290208762783522, 'bandwidth_slope': 0.1}, 15.217, 1.3779, 0.3342, 1.4178, 1.0000, 10.7327, 1.0000, 637.800448887982, 5015784.1375, 
  9%| | 9/100 [8:45:50<94:59:08, 3757.68s/trial, best loss



{'bandwidth_log_ad': 0.0737209298148961, 'bandwidth_log_mean_c2c_nn_dist': 0.0013120611849859968, 'bandwidth_slope': 0.1}, 19334.439, 1.3655, 0.3498, 1.4095, 4576.0000, 2.9976, 1.0000, 465.72189349112426, 317869.5862, 
 10%| | 10/100 [9:25:16<83:12:09, 3328.10s/trial, best los



{'bandwidth_log_ad': 0.00374984137991682, 'bandwidth_log_mean_c2c_nn_dist': 0.018596954922119845, 'bandwidth_slope': 0.1}, 27201.200, 1.3482, 0.3673, 1.3973, 6061.0000, 3.2118, 1.0000, 358.8545194858192, 321986.2228, 
 11%| | 11/100 [10:06:07<75:38:13, 3059.48s/trial, best lo



{'bandwidth_log_ad': 0.09228785219927313, 'bandwidth_log_mean_c2c_nn_dist': 0.013368137236164089, 'bandwidth_slope': 0.1}, 14.920, 1.3723, 0.3388, 1.4135, 1.0000, 10.5552, 1.0000, 574.8732911650684, 3591863.8635, 
 12%| | 12/100 [11:09:30<80:19:09, 3285.79s/trial, best lo



{'bandwidth_log_ad': 0.12130208951063115, 'bandwidth_log_mean_c2c_nn_dist': 0.019207627710791736, 'bandwidth_slope': 0.1}, 15.226, 1.3769, 0.3337, 1.4168, 1.0000, 10.7470, 1.0000, 630.7784125688635, 5797033.4371, 
 13%|▏| 13/100 [12:32:36<91:51:07, 3800.78s/trial, best lo



{'bandwidth_log_ad': 0.14133639144397367, 'bandwidth_log_mean_c2c_nn_dist': 0.009560591084935992, 'bandwidth_slope': 0.1}, 14.999, 1.3762, 0.3370, 1.4169, 1.0000, 10.5858, 1.0000, 611.2732095490716, 3084594.8519, 
 14%|▏| 14/100 [13:31:15<88:45:58, 3715.79s/trial, best lo



{'bandwidth_log_ad': 0.10985108985194812, 'bandwidth_log_mean_c2c_nn_dist': 0.01592531578037195, 'bandwidth_slope': 0.1}, 15.129, 1.3752, 0.3357, 1.4156, 1.0000, 10.6878, 1.0000, 607.436849622526, 4622294.4169, 
 15%|▏| 15/100 [14:44:34<92:35:28, 3921.51s/trial, best lo



{'bandwidth_log_ad': 0.006758347301801427, 'bandwidth_log_mean_c2c_nn_dist': 0.007921047365593493, 'bandwidth_slope': 0.1}, 18106.087, 1.3397, 0.3771, 1.3918, 4701.0000, 2.7673, 1.0000, 291.02244897959184, 283379.7473, 
 16%|▏| 16/100 [15:24:53<80:56:51, 3469.19s/trial, best lo



{'bandwidth_log_ad': 0.05901596639429092, 'bandwidth_log_mean_c2c_nn_dist': 0.01689167389106631, 'bandwidth_slope': 0.1}, 14.700, 1.3641, 0.3452, 1.4071, 1.0000, 10.4471, 1.0000, 535.2615792695368, 3550885.4018, 
 17%|▏| 17/100 [16:26:53<81:43:29, 3544.69s/trial, best lo



{'bandwidth_log_ad': 0.01045038532458865, 'bandwidth_log_mean_c2c_nn_dist': 0.0072583649091522845, 'bandwidth_slope': 0.1}, 25853.214, 1.3408, 0.3759, 1.3925, 5041.0000, 3.6831, 1.0000, 306.6292593348296, 398688.3063, 
 18%|▏| 18/100 [17:06:24<72:42:36, 3192.15s/trial, best lo



{'bandwidth_log_ad': 0.10989929159726461, 'bandwidth_log_mean_c2c_nn_dist': 0.006834937097390827, 'bandwidth_slope': 0.1}, 14.499, 1.3732, 0.3397, 1.4146, 1.0000, 10.2491, 1.0000, 567.1997551520097, 1996734.2910, 
 19%|▏| 19/100 [17:59:20<71:42:37, 3187.13s/trial, best lo



{'bandwidth_log_ad': 0.025078130168792873, 'bandwidth_log_mean_c2c_nn_dist': 0.005821042192015836, 'bandwidth_slope': 0.1}, 42420.825, 1.3470, 0.3670, 1.3961, 5234.3333, 5.8051, 1.0000, 360.08202407671905, 701809.2120, 
 20%|▏| 20/100 [18:40:35<66:04:28, 2973.36s/trial, best lo



{'bandwidth_log_ad': 0.045513529631349095, 'bandwidth_log_mean_c2c_nn_dist': 0.004406270125141629, 'bandwidth_slope': 0.1}, 68070.535, 1.3561, 0.3570, 1.4023, 6751.0000, 7.1906, 1.0000, 418.14588859416443, 810574.9423, 
 21%|▏| 21/100 [19:24:57<63:11:59, 2879.99s/trial, best lo



{'bandwidth_log_ad': 0.08735125717049712, 'bandwidth_log_mean_c2c_nn_dist': 0.011158547526482545, 'bandwidth_slope': 0.1}, 14.753, 1.3708, 0.3404, 1.4124, 1.0000, 10.4450, 1.0000, 556.6329320546827, 2926000.7299, 
 22%|▏| 22/100 [20:23:39<66:34:28, 3072.67s/trial, best lo



{'bandwidth_log_ad': 0.1355232282779234, 'bandwidth_log_mean_c2c_nn_dist': 0.0038876454683425517, 'bandwidth_slope': 0.1}, 13.781, 1.3751, 0.3399, 1.4165, 1.0000, 9.7288, 1.0000, 581.9363395225464, 1236927.3540, 
 23%|▏| 23/100 [21:11:04<64:15:38, 3004.39s/trial, best lo



{'bandwidth_log_ad': 0.13820288938712605, 'bandwidth_log_mean_c2c_nn_dist': 0.004215318652011179, 'bandwidth_slope': 0.1}, 14.054, 1.3753, 0.3395, 1.4166, 1.0000, 9.9206, 1.0000, 586.0689655172414, 1350998.0869, 
 24%|▏| 24/100 [21:58:10<62:17:44, 2950.85s/trial, best lo



{'bandwidth_log_ad': 0.13382078855488183, 'bandwidth_log_mean_c2c_nn_dist': 0.0034266949933472544, 'bandwidth_slope': 0.1}, 13.383, 1.3751, 0.3402, 1.4166, 1.0000, 9.4473, 1.0000, 578.0426443582942, 1085109.0916, 
 25%|▎| 25/100 [22:44:23<60:21:45, 2897.40s/trial, best lo



{'bandwidth_log_ad': 0.1312393060016259, 'bandwidth_log_mean_c2c_nn_dist': 0.0005246049768738124, 'bandwidth_slope': 0.1}, 6125.979, 1.3754, 0.3439, 1.4177, 4321.0000, 1.0000, 1.0000, 551.4188941032443, 164996.6227, 
 26%|▎| 26/100 [23:23:35<56:11:45, 2733.85s/trial, best lo



{'bandwidth_log_ad': 0.14981396158263807, 'bandwidth_log_mean_c2c_nn_dist': 0.003504047249404048, 'bandwidth_slope': 0.1}, 13.665, 1.3758, 0.3397, 1.4171, 1.0000, 9.6431, 1.0000, 593.0785553968578, 1156409.6350, 
 27%|▎| 27/100 [24:11:35<56:19:23, 2777.58s/trial, best lo



{'bandwidth_log_ad': 0.14622174373724575, 'bandwidth_log_mean_c2c_nn_dist': 0.00250020021913336, 'bandwidth_slope': 0.1}, 105779.721, 1.3753, 0.3407, 1.4169, 8601.0000, 8.6801, 1.0000, 585.1238522750459, 818191.5952, 
 28%|▎| 28/100 [24:58:19<55:42:43, 2785.60s/trial, best lo



{'bandwidth_log_ad': 0.12836252833084247, 'bandwidth_log_mean_c2c_nn_dist': 0.0030970781469214287, 'bandwidth_slope': 0.1}, 118692.896, 1.3747, 0.3406, 1.4163, 9201.0000, 9.1085, 1.0000, 570.4590899816365, 965196.9506, 
 29%|▎| 29/100 [25:46:26<55:32:20, 2816.06s/trial, best lo



{'bandwidth_log_ad': 0.14994877738599266, 'bandwidth_log_mean_c2c_nn_dist': 0.004991426771077515, 'bandwidth_slope': 0.1}, 14.444, 1.3758, 0.3389, 1.4169, 1.0000, 10.1940, 1.0000, 599.6927157722914, 1647436.1771, 
 30%|▎| 30/100 [26:37:53<56:20:01, 2897.17s/trial, best lo



{'bandwidth_log_ad': 0.09988609841478757, 'bandwidth_log_mean_c2c_nn_dist': 0.011711049211123096, 'bandwidth_slope': 0.1}, 14.905, 1.3732, 0.3385, 1.4143, 1.0000, 10.5389, 1.0000, 577.57824933687, 3269895.5836, 
 31%|▎| 31/100 [27:39:17<60:03:17, 3133.29s/trial, best lo



{'bandwidth_log_ad': 0.12216894266923946, 'bandwidth_log_mean_c2c_nn_dist': 0.000610362893330922, 'bandwidth_slope': 0.1}, 6633.148, 1.3747, 0.3440, 1.4170, 4681.0000, 1.0000, 1.0000, 542.4197102632116, 186637.4791, 
 32%|▎| 32/100 [28:19:32<55:06:58, 2917.91s/trial, best lo



{'bandwidth_log_ad': 0.03262986139887513, 'bandwidth_log_mean_c2c_nn_dist': 0.0018401977125061843, 'bandwidth_slope': 0.1}, 13460.893, 1.3493, 0.3657, 1.3980, 3641.0000, 2.6446, 1.0000, 348.31360946745565, 272295.0945, 
 33%|▎| 33/100 [28:58:28<51:03:24, 2743.35s/trial, best lo



{'bandwidth_log_ad': 0.05547261506238846, 'bandwidth_log_mean_c2c_nn_dist': 0.005405557834022934, 'bandwidth_slope': 0.1}, 84151.031, 1.3598, 0.3515, 1.4045, 6901.0000, 8.6821, 1.0000, 454.5027545398898, 1118714.5831, 
 34%|▎| 34/100 [29:45:25<50:41:44, 2765.22s/trial, best lo



{'bandwidth_log_ad': 0.09902142179439247, 'bandwidth_log_mean_c2c_nn_dist': 0.008314309859896259, 'bandwidth_slope': 0.1}, 14.624, 1.3724, 0.3404, 1.4139, 1.0000, 10.3430, 1.0000, 559.7573964497042, 2317687.2497, 
 35%|▎| 35/100 [30:39:13<52:26:16, 2904.25s/trial, best lo



{'bandwidth_log_ad': 0.07502270220192445, 'bandwidth_log_mean_c2c_nn_dist': 0.0009357037136288999, 'bandwidth_slope': 0.1}, 7951.633, 1.3666, 0.3503, 1.4107, 4141.0000, 1.3612, 1.0000, 464.1030401958784, 228766.3722, 
 36%|▎| 36/100 [31:18:09<48:35:58, 2733.73s/trial, best lo



{'bandwidth_log_ad': 0.11547314897732239, 'bandwidth_log_mean_c2c_nn_dist': 0.0029711114648258152, 'bandwidth_slope': 0.1}, 102477.479, 1.3735, 0.3414, 1.4153, 8301.0000, 8.7229, 1.0000, 553.9971434401143, 887201.9590, 
 37%|▎| 37/100 [32:04:17<48:01:11, 2743.99s/trial, best lo



{'bandwidth_log_ad': 0.1274874397059936, 'bandwidth_log_mean_c2c_nn_dist': 0.012901872474258743, 'bandwidth_slope': 0.1}, 15.110, 1.3764, 0.3358, 1.4168, 1.0000, 10.6654, 1.0000, 612.6776168128953, 3994617.7519, 
 38%|▍| 38/100 [33:10:24<53:34:42, 3111.02s/trial, best lo



{'bandwidth_log_ad': 0.14340565414024375, 'bandwidth_log_mean_c2c_nn_dist': 0.009647786073193887, 'bandwidth_slope': 0.1}, 15.015, 1.3764, 0.3368, 1.4170, 1.0000, 10.5960, 1.0000, 613.2379106304836, 3129236.1541, 
 39%|▍| 39/100 [34:12:16<55:45:58, 3291.13s/trial, best lo



{'bandwidth_log_ad': 0.10166278301625527, 'bandwidth_log_mean_c2c_nn_dist': 0.003300788617712265, 'bandwidth_slope': 0.1}, 101581.025, 1.3716, 0.3426, 1.4137, 8201.0000, 8.7617, 1.0000, 535.8704346051826, 932805.2098, 
 40%|▍| 40/100 [34:56:25<51:38:27, 3098.46s/trial, best lo



{'bandwidth_log_ad': 0.08781218579631586, 'bandwidth_log_mean_c2c_nn_dist': 0.0019336836300717615, 'bandwidth_slope': 0.1}, 61218.280, 1.3690, 0.3458, 1.4120, 7676.0000, 5.6480, 1.0000, 501.82574984697, 510791.9378, 
 41%|▍| 41/100 [35:37:50<47:46:01, 2914.60s/trial, best lo



{'bandwidth_log_ad': 0.08012170464286641, 'bandwidth_log_mean_c2c_nn_dist': 0.007159623542589547, 'bandwidth_slope': 0.1}, 14.095, 1.3684, 0.3438, 1.4109, 1.0000, 9.9900, 1.0000, 522.0073454397062, 1804597.3234, 
 42%|▍| 42/100 [36:29:13<47:46:11, 2965.03s/trial, best lo



{'bandwidth_log_ad': 0.148498674772893, 'bandwidth_log_mean_c2c_nn_dist': 0.006232018429822617, 'bandwidth_slope': 0.1}, 14.705, 1.3760, 0.3385, 1.4170, 1.0000, 10.3777, 1.0000, 603.6955723321771, 2049335.2242, 
 43%|▍| 43/100 [37:24:05<48:29:57, 3063.11s/trial, best lo



{'bandwidth_log_ad': 0.1348727116159063, 'bandwidth_log_mean_c2c_nn_dist': 0.008708851778778306, 'bandwidth_slope': 0.1}, 14.930, 1.3758, 0.3376, 1.4166, 1.0000, 10.5389, 1.0000, 602.457049581718, 2762025.3793, 
 44%|▍| 44/100 [38:21:58<49:33:47, 3186.21s/trial, best lo



{'bandwidth_log_ad': 0.11559635507780952, 'bandwidth_log_mean_c2c_nn_dist': 0.004758374576633318, 'bandwidth_slope': 0.1}, 13.982, 1.3737, 0.3402, 1.4152, 1.0000, 9.8798, 1.0000, 563.9294021628239, 1421042.3522, 
 45%|▍| 45/100 [39:09:27<47:07:46, 3084.85s/trial, best lo



{'bandwidth_log_ad': 0.06482153214697922, 'bandwidth_log_mean_c2c_nn_dist': 0.010475672897127206, 'bandwidth_slope': 0.1}, 14.344, 1.3648, 0.3460, 1.4079, 1.0000, 10.1879, 1.0000, 511.1348704346052, 2354153.9580, 
 46%|▍| 46/100 [40:03:41<47:02:01, 3135.58s/trial, best lo



{'bandwidth_log_ad': 0.04438222701808957, 'bandwidth_log_mean_c2c_nn_dist': 0.01242235176139509, 'bandwidth_slope': 0.1}, 13.959, 1.3578, 0.3523, 1.4028, 1.0000, 9.9512, 1.0000, 474.4225668230973, 2210435.7205, 
 47%|▍| 47/100 [40:56:40<46:21:21, 3148.70s/trial, best lo

                                                                                

{'bandwidth_log_ad': 0.09360908124758116, 'bandwidth_log_mean_c2c_nn_dist': 0.014063072033365863, 'bandwidth_slope': 0.1}, 14.944, 1.3726, 0.3381, 1.4136, 1.0000, 10.5715, 1.0000, 579.9251173229953, 3801520.4330, 
 48%|▍| 48/100 [42:00:53<48:31:57, 3359.95s/trial, best lo

                                                                                

{'bandwidth_log_ad': 0.12606506324405822, 'bandwidth_log_mean_c2c_nn_dist': 0.00771876871757168, 'bandwidth_slope': 0.1}, 14.791, 1.3751, 0.3384, 1.4161, 1.0000, 10.4450, 1.0000, 589.9604162415834, 2385723.9080, 
 49%|▍| 49/100 [42:54:16<46:55:52, 3312.80s/trial, best lo



{'bandwidth_log_ad': 0.14023817528611468, 'bandwidth_log_mean_c2c_nn_dist': 0.0036058803118839363, 'bandwidth_slope': 0.1}, 13.626, 1.3752, 0.3399, 1.4166, 1.0000, 9.6186, 1.0000, 585.1395633544174, 1162061.5285, 
 50%|▌| 50/100 [43:40:20<43:43:30, 3148.22s/trial, best lo



{'bandwidth_log_ad': 0.10636020820582846, 'bandwidth_log_mean_c2c_nn_dist': 0.014527043709719373, 'bandwidth_slope': 0.1}, 15.072, 1.3745, 0.3365, 1.4151, 1.0000, 10.6511, 1.0000, 597.9345031626199, 4162306.6440, 
 51%|▌| 51/100 [44:49:16<46:53:05, 3444.60s/trial, best lo



{'bandwidth_log_ad': 0.0696715319754357, 'bandwidth_log_mean_c2c_nn_dist': 0.0017657825703389237, 'bandwidth_slope': 0.1}, 35299.931, 1.3644, 0.3503, 1.4086, 5626.0000, 4.4544, 1.0000, 460.6686390532544, 415260.9465, 
 52%|▌| 52/100 [45:30:04<41:56:26, 3145.55s/trial, best lo



{'bandwidth_log_ad': 0.14160523304714517, 'bandwidth_log_mean_c2c_nn_dist': 0.005654584017554483, 'bandwidth_slope': 0.1}, 14.548, 1.3756, 0.3387, 1.4167, 1.0000, 10.2695, 1.0000, 595.5727402570903, 1828154.8278, 
 53%|▌| 53/100 [46:23:01<41:11:24, 3155.00s/trial, best lo



{'bandwidth_log_ad': 0.11834711270244802, 'bandwidth_log_mean_c2c_nn_dist': 0.0067318740654504295, 'bandwidth_slope': 0.1}, 14.566, 1.3743, 0.3392, 1.4156, 1.0000, 10.2899, 1.0000, 577.059375637625, 2028926.0663, 
 54%|▌| 54/100 [47:14:30<40:03:34, 3135.11s/trial, best lo



{'bandwidth_log_ad': 0.08252294990538746, 'bandwidth_log_mean_c2c_nn_dist': 0.001157134207420277, 'bandwidth_slope': 0.1}, 19455.908, 1.3676, 0.3477, 1.4111, 5561.0000, 2.4793, 1.0000, 483.5913078963477, 296703.6607, 
 55%|▌| 55/100 [47:54:05<36:20:17, 2907.05s/trial, best lo



{'bandwidth_log_ad': 0.09445901596581333, 'bandwidth_log_mean_c2c_nn_dist': 0.016801237617045287, 'bandwidth_slope': 0.1}, 15.064, 1.3731, 0.3370, 1.4138, 1.0000, 10.6552, 1.0000, 593.1964905121404, 4546940.3883, 
 56%|▌| 56/100 [49:05:42<40:37:47, 3324.26s/trial, best lo



{'bandwidth_log_ad': 0.10687281231466149, 'bandwidth_log_mean_c2c_nn_dist': 0.009183992175887858, 'bandwidth_slope': 0.1}, 14.778, 1.3736, 0.3390, 1.4148, 1.0000, 10.4450, 1.0000, 574.6900632523975, 2647439.3879, 
 57%|▌| 57/100 [50:00:37<39:35:57, 3315.29s/trial, best lo



{'bandwidth_log_ad': 0.01833177637994657, 'bandwidth_log_mean_c2c_nn_dist': 0.004108352295023933, 'bandwidth_slope': 0.1}, 25866.040, 1.3429, 0.3730, 1.3937, 5101.0000, 3.6382, 1.0000, 313.58702305651906, 384743.0506, 
 58%|▌| 58/100 [50:39:46<35:17:55, 3025.60s/trial, best lo



{'bandwidth_log_ad': 0.13214183114376682, 'bandwidth_log_mean_c2c_nn_dist': 0.010152831931586134, 'bandwidth_slope': 0.1}, 14.994, 1.3761, 0.3370, 1.4167, 1.0000, 10.5838, 1.0000, 605.9606202815752, 3192769.7470, 
 59%|▌| 59/100 [51:42:14<36:55:30, 3242.22s/trial, best lo



{'bandwidth_log_ad': 0.12414980980023119, 'bandwidth_log_mean_c2c_nn_dist': 0.0028364466478184025, 'bandwidth_slope': 0.1}, 102802.499, 1.3742, 0.3408, 1.4158, 8301.0000, 8.7474, 1.0000, 564.154050193838, 872426.4193, 
 60%|▌| 60/100 [52:26:35<34:05:21, 3068.03s/trial, best lo



{'bandwidth_log_ad': 0.11234983935422471, 'bandwidth_log_mean_c2c_nn_dist': 0.011511448660796234, 'bandwidth_slope': 0.1}, 14.957, 1.3747, 0.3370, 1.4154, 1.0000, 10.5674, 1.0000, 591.8481942460722, 3386158.4099, 
 61%|▌| 61/100 [53:28:30<35:20:19, 3262.03s/trial, best lo



{'bandwidth_log_ad': 0.13821431523016678, 'bandwidth_log_mean_c2c_nn_dist': 0.0050337503380672045, 'bandwidth_slope': 0.1}, 14.367, 1.3753, 0.3391, 1.4165, 1.0000, 10.1430, 1.0000, 589.7924913283003, 1613109.9906, 
 62%|▌| 62/100 [54:20:04<33:54:01, 3211.62s/trial, best lo



{'bandwidth_log_ad': 0.05151352498395999, 'bandwidth_log_mean_c2c_nn_dist': 0.0037690214639357874, 'bandwidth_slope': 0.1}, 60265.990, 1.3585, 0.3545, 1.4040, 6151.0000, 6.9784, 1.0000, 430.37359722505613, 747890.2903, 
 63%|▋| 63/100 [55:01:55<30:50:56, 3001.52s/trial, best lo



{'bandwidth_log_ad': 0.034082207149426696, 'bandwidth_log_mean_c2c_nn_dist': 0.002570517098587811, 'bandwidth_slope': 0.1}, 30110.490, 1.3503, 0.3644, 1.3986, 5461.0000, 3.9423, 1.0000, 362.1964905121404, 391750.5003, 
 64%|▋| 64/100 [55:41:40<28:09:55, 2816.54s/trial, best lo



{'bandwidth_log_ad': 0.1443435671887846, 'bandwidth_log_mean_c2c_nn_dist': 0.019855778771515, 'bandwidth_slope': 0.1}, 15.263, 1.3785, 0.3326, 1.4180, 1.0000, 10.7633, 1.0000, 650.0534584778617, 6413168.8641, 
 65%|▋| 65/100 [57:13:12<35:11:06, 3619.04s/trial, best lo



{'bandwidth_log_ad': 0.1382173895131304, 'bandwidth_log_mean_c2c_nn_dist': 0.00625278610360084, 'bandwidth_slope': 0.1}, 14.649, 1.3755, 0.3386, 1.4166, 1.0000, 10.3410, 1.0000, 595.1481330340747, 2003227.5136, 
 66%|▋| 66/100 [58:06:04<32:54:45, 3484.88s/trial, best lo



{'bandwidth_log_ad': 0.13121985889428367, 'bandwidth_log_mean_c2c_nn_dist': 0.007721635218225483, 'bandwidth_slope': 0.1}, 14.822, 1.3754, 0.3382, 1.4163, 1.0000, 10.4654, 1.0000, 595.026525198939, 2424359.3269, 
 67%|▋| 67/100 [58:59:58<31:15:19, 3409.70s/trial, best lo



{'bandwidth_log_ad': 0.14889653902827343, 'bandwidth_log_mean_c2c_nn_dist': 0.0037032622398334826, 'bandwidth_slope': 0.1}, 13.838, 1.3757, 0.3396, 1.4170, 1.0000, 9.7656, 1.0000, 593.21975107121, 1219495.6348, 
 68%|▋| 68/100 [59:46:40<28:41:19, 3227.48s/trial, best lo



{'bandwidth_log_ad': 0.12039831471758733, 'bandwidth_log_mean_c2c_nn_dist': 0.0023536291590597336, 'bandwidth_slope': 0.1}, 81140.987, 1.3738, 0.3416, 1.4156, 7201.0000, 7.9598, 1.0000, 556.7025096918996, 715100.9227, 
 69%|▋| 69/100 [60:29:39<26:06:57, 3032.81s/trial, best lo



{'bandwidth_log_ad': 0.14685520866124935, 'bandwidth_log_mean_c2c_nn_dist': 0.004674678548341524, 'bandwidth_slope': 0.1}, 14.326, 1.3758, 0.3390, 1.4169, 1.0000, 10.1104, 1.0000, 595.826974086921, 1531646.8519, 
 70%|▋| 70/100 [61:17:40<24:53:40, 2987.36s/trial, best lo



{'bandwidth_log_ad': 0.1349272455520414, 'bandwidth_log_mean_c2c_nn_dist': 0.0013934442451398084, 'bandwidth_slope': 0.1}, 52314.864, 1.3749, 0.3422, 1.4168, 7201.0000, 5.1277, 1.0000, 567.2230157110794, 442880.1610, 
 71%|▋| 71/100 [62:00:16<23:01:19, 2857.90s/trial, best lo



{'bandwidth_log_ad': 0.11295852713191408, 'bandwidth_log_mean_c2c_nn_dist': 0.0006215631466130293, 'bandwidth_slope': 0.1}, 6258.970, 1.3733, 0.3441, 1.4157, 4421.0000, 1.0000, 1.0000, 529.9106304835748, 184009.3336, 
 72%|▋| 72/100 [62:39:17<21:01:14, 2702.66s/trial, best lo



{'bandwidth_log_ad': 0.1389866418549084, 'bandwidth_log_mean_c2c_nn_dist': 0.003321496644250492, 'bandwidth_slope': 0.1}, 13.371, 1.3752, 0.3400, 1.4166, 1.0000, 9.4391, 1.0000, 582.5945725362171, 1066905.2750, 
 73%|▋| 73/100 [63:25:51<20:28:31, 2730.07s/trial, best lo



{'bandwidth_log_ad': 0.10372872904840727, 'bandwidth_log_mean_c2c_nn_dist': 0.005464694131923442, 'bandwidth_slope': 0.1}, 14.059, 1.3722, 0.3411, 1.4140, 1.0000, 9.9431, 1.0000, 551.5535604978576, 1557424.9853, 
 74%|▋| 74/100 [64:15:46<20:17:32, 2809.70s/trial, best lo



{'bandwidth_log_ad': 0.1300168442903225, 'bandwidth_log_mean_c2c_nn_dist': 0.008566398834898542, 'bandwidth_slope': 0.1}, 14.894, 1.3756, 0.3379, 1.4165, 1.0000, 10.5144, 1.0000, 597.430524382779, 2678982.6554, 
 75%|▊| 75/100 [65:13:06<20:49:27, 2998.71s/trial, best lo



{'bandwidth_log_ad': 0.14013244245669096, 'bandwidth_log_mean_c2c_nn_dist': 0.006766448671329027, 'bandwidth_slope': 0.1}, 14.750, 1.3757, 0.3382, 1.4166, 1.0000, 10.4124, 1.0000, 599.0565190777393, 2178589.0426, 
 76%|▊| 76/100 [66:05:31<20:17:03, 3042.65s/trial, best lo



{'bandwidth_log_ad': 0.1237891932593295, 'bandwidth_log_mean_c2c_nn_dist': 0.004372652643417492, 'bandwidth_slope': 0.1}, 13.900, 1.3746, 0.3397, 1.4159, 1.0000, 9.8166, 1.0000, 571.8190165272393, 1342997.9498, 
 77%|▊| 77/100 [66:52:37<19:01:28, 2977.75s/trial, best lo



{'bandwidth_log_ad': 0.11733361523126343, 'bandwidth_log_mean_c2c_nn_dist': 0.007361028640904271, 'bandwidth_slope': 0.1}, 14.661, 1.3743, 0.3390, 1.4155, 1.0000, 10.3573, 1.0000, 578.8773719649051, 2210179.8072, 
 78%|▊| 78/100 [67:47:01<18:43:19, 3063.59s/trial, best lo



{'bandwidth_log_ad': 0.09842990115797981, 'bandwidth_log_mean_c2c_nn_dist': 0.005866916875302863, 'bandwidth_slope': 0.1}, 14.121, 1.3716, 0.3414, 1.4135, 1.0000, 9.9900, 1.0000, 545.8026933278923, 1632893.8898, 
 79%|▊| 79/100 [68:36:14<17:40:39, 3030.46s/trial, best lo



{'bandwidth_log_ad': 0.08750270557031527, 'bandwidth_log_mean_c2c_nn_dist': 0.0015453908778118549, 'bandwidth_slope': 0.1}, 37085.894, 1.3689, 0.3462, 1.4119, 5976.0000, 4.3952, 1.0000, 497.82085288716587, 407589.2246, 
 80%|▊| 80/100 [69:17:19<15:53:32, 2860.65s/trial, best lo



{'bandwidth_log_ad': 0.07847283081340523, 'bandwidth_log_mean_c2c_nn_dist': 0.0023193004522297435, 'bandwidth_slope': 0.1}, 59817.505, 1.3671, 0.3474, 1.4105, 6801.0000, 6.2357, 1.0000, 485.78147316874106, 579692.1547, 
 81%|▊| 81/100 [69:58:59<14:31:40, 2752.64s/trial, best lo



{'bandwidth_log_ad': 0.0689261221531188, 'bandwidth_log_mean_c2c_nn_dist': 0.0005025792822514791, 'bandwidth_slope': 0.1}, 6362.194, 1.3649, 0.3522, 1.4096, 4513.5000, 1.0000, 1.0000, 440.8498265660069, 117568.4093, 
 82%|▊| 82/100 [70:37:17<13:04:50, 2616.12s/trial, best lo



{'bandwidth_log_ad': 0.1444697531335721, 'bandwidth_log_mean_c2c_nn_dist': 0.002882692443760932, 'bandwidth_slope': 0.1}, 121640.532, 1.3753, 0.3402, 1.4167, 9401.0000, 9.1330, 1.0000, 585.4882677004692, 939236.0220, 
 83%|▊| 83/100 [71:23:08<12:32:41, 2656.54s/trial, best lo



{'bandwidth_log_ad': 0.11091143842330187, 'bandwidth_log_mean_c2c_nn_dist': 0.00814341451555501, 'bandwidth_slope': 0.1}, 14.731, 1.3738, 0.3391, 1.4151, 1.0000, 10.4103, 1.0000, 574.8551316057948, 2386964.5207, 
 84%|▊| 84/100 [72:18:22<12:41:02, 2853.90s/trial, best lo



{'bandwidth_log_ad': 0.12662193288796253, 'bandwidth_log_mean_c2c_nn_dist': 0.006110254830460793, 'bandwidth_slope': 0.1}, 14.511, 1.3748, 0.3395, 1.4161, 1.0000, 10.2471, 1.0000, 583.2836155886554, 1892872.7041, 
 85%|▊| 85/100 [73:08:21<12:04:20, 2897.33s/trial, best lo



{'bandwidth_log_ad': 0.09573505726230133, 'bandwidth_log_mean_c2c_nn_dist': 0.005269400944514953, 'bandwidth_slope': 0.1}, 13.831, 1.3711, 0.3422, 1.4131, 1.0000, 9.7880, 1.0000, 538.0983472760661, 1448353.3440, 
 86%|▊| 86/100 [73:55:54<11:12:55, 2883.94s/trial, best lo



{'bandwidth_log_ad': 0.0020298870623864373, 'bandwidth_log_mean_c2c_nn_dist': 0.009038627536462366, 'bandwidth_slope': 0.1}, 6696.203, 1.3391, 0.3797, 1.3919, 4801.0000, 1.0020, 1.0000, 261.0804081632653, 96780.1014, 
 87%|▊| 87/100 [74:34:24<9:47:34, 2711.90s/trial, best los



{'bandwidth_log_ad': 0.060498738497553335, 'bandwidth_log_mean_c2c_nn_dist': 0.003329789897726883, 'bandwidth_slope': 0.1}, 59646.974, 1.3616, 0.3515, 1.4062, 6001.0000, 7.0681, 1.0000, 451.2438277902469, 724862.1883, 
 88%|▉| 88/100 [75:17:42<8:55:32, 2677.71s/trial, best los

                                                                                

{'bandwidth_log_ad': 0.1333882744122675, 'bandwidth_log_mean_c2c_nn_dist': 0.0009494482732010762, 'bandwidth_slope': 0.1}, 17226.319, 1.3751, 0.3425, 1.4171, 5951.0000, 2.0426, 1.0000, 561.3189145072434, 300494.5317, 
 89%|▉| 89/100 [75:59:58<8:03:05, 2635.06s/trial, best los



{'bandwidth_log_ad': 0.12007676806914487, 'bandwidth_log_mean_c2c_nn_dist': 0.018950773193070446, 'bandwidth_slope': 0.1}, 15.216, 1.3768, 0.3337, 1.4166, 1.0000, 10.7409, 1.0000, 628.7926953682921, 5696791.9857, 
 90%|▉| 90/100 [77:23:09<9:16:59, 3341.94s/trial, best los



{'bandwidth_log_ad': 0.10475532075269942, 'bandwidth_log_mean_c2c_nn_dist': 0.010066025911722069, 'bandwidth_slope': 0.1}, 14.826, 1.3733, 0.3387, 1.4145, 1.0000, 10.4817, 1.0000, 576.1968985921241, 2874672.8529, 
 91%|▉| 91/100 [78:21:26<8:28:16, 3388.55s/trial, best los



{'bandwidth_log_ad': 0.1423450513980269, 'bandwidth_log_mean_c2c_nn_dist': 0.0019475367000840573, 'bandwidth_slope': 0.1}, 75134.876, 1.3751, 0.3414, 1.4168, 7101.0000, 7.4681, 1.0000, 578.4652111813915, 631292.5717, 
 92%|▉| 92/100 [79:04:17<6:59:04, 3143.12s/trial, best los



{'bandwidth_log_ad': 0.0911810421725779, 'bandwidth_log_mean_c2c_nn_dist': 0.004094547085383622, 'bandwidth_slope': 0.1}, 109134.306, 1.3702, 0.3435, 1.4126, 8401.0000, 9.1963, 1.0000, 523.2454601101816, 1100493.7625, 
 93%|▉| 93/100 [79:49:07<5:50:51, 3007.39s/trial, best los

                                                                                

{'bandwidth_log_ad': 0.11502342727155893, 'bandwidth_log_mean_c2c_nn_dist': 0.010781808781713802, 'bandwidth_slope': 0.1}, 14.951, 1.3749, 0.3374, 1.4157, 1.0000, 10.5613, 1.0000, 591.65149969394, 3204883.8745, 
 94%|▉| 94/100 [80:50:47<5:21:31, 3215.20s/trial, best los



{'bandwidth_log_ad': 0.10887192742521631, 'bandwidth_log_mean_c2c_nn_dist': 0.01209478162951825, 'bandwidth_slope': 0.1}, 14.968, 1.3746, 0.3372, 1.4153, 1.0000, 10.5756, 1.0000, 590.4199143032034, 3508377.2881, 
 95%|▉| 95/100 [81:51:43<4:38:56, 3347.25s/trial, best los



{'bandwidth_log_ad': 0.14975914675362692, 'bandwidth_log_mean_c2c_nn_dist': 0.017906017620450507, 'bandwidth_slope': 0.1}, 15.262, 1.3786, 0.3332, 1.4182, 1.0000, 10.7613, 1.0000, 646.9889818404407, 5870368.3381, 
 96%|▉| 96/100 [83:16:25<4:17:50, 3867.73s/trial, best los



{'bandwidth_log_ad': 0.13647093526980336, 'bandwidth_log_mean_c2c_nn_dist': 0.006513609398309092, 'bandwidth_slope': 0.1}, 14.680, 1.3755, 0.3387, 1.4166, 1.0000, 10.3634, 1.0000, 594.7123036115079, 2076707.9208, 
 97%|▉| 97/100 [84:08:58<3:02:39, 3653.26s/trial, best los



{'bandwidth_log_ad': 0.12479438262642509, 'bandwidth_log_mean_c2c_nn_dist': 0.013368955257382648, 'bandwidth_slope': 0.1}, 15.109, 1.3763, 0.3358, 1.4166, 1.0000, 10.6654, 1.0000, 612.0740665170373, 4102669.4338, 
 98%|▉| 98/100 [85:16:56<2:06:01, 3780.64s/trial, best los



{'bandwidth_log_ad': 0.12948053557376377, 'bandwidth_log_mean_c2c_nn_dist': 0.007280090142058937, 'bandwidth_slope': 0.1}, 14.762, 1.3753, 0.3385, 1.4163, 1.0000, 10.4226, 1.0000, 591.4690879412365, 2274276.1947, 
 99%|▉| 99/100 [86:09:23<59:50, 3590.70s/trial, best loss:



{'bandwidth_log_ad': 0.08399743575303709, 'bandwidth_log_mean_c2c_nn_dist': 0.0047082212495189076, 'bandwidth_slope': 0.1}, 121777.152, 1.3686, 0.3446, 1.4113, 9201.0000, 9.3779, 1.0000, 514.0748826770047, 1216214.0837, 
100%|█| 100/100 [86:56:27<00:00, 3129.88s/trial, best loss


23/12/03 11:11:30 WARN JavaUtils: Attempt to delete using native Unix OS command failed for path = /tmp/blockmgr-7d5212aa-ac3e-45b0-8ac4-d6d667929773. Falling back to Java IO way
java.io.IOException: Failed to delete: /tmp/blockmgr-7d5212aa-ac3e-45b0-8ac4-d6d667929773
	at org.apache.spark.network.util.JavaUtils.deleteRecursivelyUsingUnixNative(JavaUtils.java:177)
	at org.apache.spark.network.util.JavaUtils.deleteRecursively(JavaUtils.java:113)
	at org.apache.spark.network.util.JavaUtils.deleteRecursively(JavaUtils.java:94)
	at org.apache.spark.util.Utils$.deleteRecursively(Utils.scala:1231)
	at org.apache.spark.storage.DiskBlockManager.$anonfun$doStop$1(DiskBlockManager.scala:368)
	at org.apache.spark.storage.DiskBlockManager.$anonfun$doStop$1$adapted(DiskBlockManager.scala:364)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach

In [None]:
"""
{'bandwidth_log_ad': 0.09573505726230133, 'bandwidth_log_mean_c2c_nn_dist': 0.005269400944514953, 'bandwidth_slope': 0.1}, 13.831, 1.3711, 0.3422, 1.4131, 1.0000, 9.7880, 1.0000, 538.0983472760661, 1448353.3440, 

n_samples = 5000
n_obs_total = 2754157738
data_sample_fraction = 0.25
test_sample_fraction = n_samples / n_obs_total
target = "n_craters_added_in_study_region"
predictor_variables = [
    "slope",
    "log_mean_c2c_nn_dist",
    "log_ad"
]

data_sample, test_sample = setup_datasets_for_optimization(data, configs_df, predictor_variables, target, data_sample_fraction, test_sample_fraction, spark, cache_data=False, cache_sample=True)

trials = Trials()
space = {
    # "bandwidth_slope": hp.uniform("bandwidth_slope", 0.1499, 0.15),
    "bandwidth_slope": hp.choice("bandwidth_slope", [0.1]),
    "bandwidth_log_mean_c2c_nn_dist": hp.uniform("bandwidth_log_mean_c2c_nn_dist", 0.0005, 0.02),
    "bandwidth_log_ad": hp.uniform("bandwidth_log_ad", 0.001, 0.15),
}

best = fmin(create_objective_function(data_sample, test_sample, predictor_variables, spark),
            space=space,
            algo=tpe.suggest,
            max_evals=100)
"""

In [None]:
"""
{'bandwidth_log_ad': 0.03499639653600532, 'bandwidth_log_mean_c2c_nn_dist': 0.009940201998102223, 'bandwidth_slope': 0.1}, 1.396, 1.3507, 0.3539, 1.3962, 1.0000, 1.0000, 431.40763052208837, 615367.3594, 

n_samples = 1000
n_obs_total = 2754157738
data_sample_fraction = 0.1
test_sample_fraction = n_samples / n_obs_total
target = "n_craters_added_in_study_region"
predictor_variables = [
    "slope",
    "log_mean_c2c_nn_dist",
    "log_ad"
]

data_sample, test_sample = setup_datasets_for_optimization(data, configs_df, predictor_variables, target, data_sample_fraction, test_sample_fraction, spark, cache_data=True)

trials = Trials()
space = {
    # "bandwidth_slope": hp.uniform("bandwidth_slope", 0.1499, 0.15),
    "bandwidth_slope": hp.choice("bandwidth_slope", [0.1]),
    "bandwidth_log_mean_c2c_nn_dist": hp.uniform("bandwidth_log_mean_c2c_nn_dist", 0.0005, 0.01),
    "bandwidth_log_ad": hp.uniform("bandwidth_log_ad", 0.025, 0.2),
}

best = fmin(create_objective_function(data_sample, test_sample, predictor_variables, spark),
            space=space,
            algo=tpe.suggest,
            max_evals=100)
"""

In [None]:
"""
{'bandwidth_log_ad': 0.05003817530737366, 'bandwidth_log_mean_c2c_nn_dist': 0.019032186174695872, 'bandwidth_slope': 0.15}, 1.405, 0.3343, 1.4049, 1.0000, 1.0000, 753.8094758064516    

n_samples = 1000
n_obs_total = 2754157738
data_sample_fraction = 0.1
test_sample_fraction = n_samples / n_obs_total
target = "n_craters_added_in_study_region"
predictor_variables = [
    "slope",
    "log_mean_c2c_nn_dist",
    "log_ad"
]

data_sample, test_sample = setup_datasets_for_optimization(data, configs_df, predictor_variables, target, data_sample_fraction, test_sample_fraction, spark, cache_data=True)

trials = Trials()
space = {
    # "bandwidth_slope": hp.uniform("bandwidth_slope", 0.1499, 0.15),
    "bandwidth_slope": hp.choice("bandwidth_slope", [0.15]),
    "bandwidth_log_mean_c2c_nn_dist": hp.uniform("bandwidth_log_mean_c2c_nn_dist", 0.005, 0.1),
    "bandwidth_log_ad": hp.uniform("bandwidth_log_ad", 0.05, 0.3),
}

best = fmin(create_objective_function(data_sample, test_sample, predictor_variables, spark),
            space=space,
            algo=tpe.suggest,
            max_evals=100)
"""

In [None]:
"""
Best configuration with settings:
n_samples = 1000
n_obs_total = 2754157738
data_sample_fraction = 0.05
test_sample_fraction = n_samples / n_obs_total
target = "n_craters_added_in_study_region"
predictor_variables = [
    "slope",
    "log_mean_c2c_nn_dist",
    "log_ad"
]

data_sample, test_sample = setup_datasets_for_optimization(data, configs_df, predictor_variables, target, data_sample_fraction, test_sample_fraction, spark)

trials = Trials()
space = {
    "bandwidth_slope": hp.uniform("bandwidth_slope", 0.05, 0.2),
    "bandwidth_log_mean_c2c_nn_dist": hp.uniform("bandwidth_log_mean_c2c_nn_dist", 0.05, 0.15),
    "bandwidth_log_ad": hp.uniform("bandwidth_log_ad", 0.05, 0.3),
}

{'bandwidth_log_ad': 0.07543308379510501, 'bandwidth_log_mean_c2c_nn_dist': 0.07542596980056125, 'bandwidth_slope': 0.051337832245347034}, 1.426, 0.3178, 1.4260, 1.0000, 1.0000, 384.2230695900858   
"""

In [None]:
# Pulled apart for debugging

# observations = test_sample

# min_obs = 300

# # args = {
# #     "bandwidth_slope": 0.1,
# #     "bandwidth_log_mean_c2c_nn_dist": 0.1,
# #     "bandwidth_log_ad": 0.1
# # }

# # bandwidths_data = [[args[f"bandwidth_{x}"] for x in predictor_variables]]
# # bandwidths = pd.DataFrame(bandwidths_data, columns=predictor_variables)

# # result = get_confidence_intervals(data_sample, observations, predictor_variables, bandwidths, spark)

# ci_high = result.percentiles.map(lambda x: x[-1])
# ci_low = result.percentiles.map(lambda x: x[0])
# orders_of_magnitude = np.log10(ci_high / ci_low).mean()
# orders_of_magnitude_stdev = np.log10(ci_high / ci_low).std()

# percent_inside_ci = ((result.target >= ci_low) & (result.target <= ci_high)).mean()

# # Penalize if more or less than 5% are outside of the CI
# deviance_from_95 = np.abs(0.95 - percent_inside_ci)
# deviance_loss_multiplier = 1.0 if deviance_from_95 < 0.02 else 1 + deviance_from_95 * 10.0

# # Penalize for the fraction with too few observations
# too_few = result.n_obs[result.n_obs < min_obs]
# n_obs_loss_multiplier = 1 + too_few.mean() * 100 if too_few.shape[0] > 0 else 1.0

# loss = orders_of_magnitude * deviance_loss_multiplier * n_obs_loss_multiplier

# print(f"{args}, {loss:.3f}, {orders_of_magnitude_stdev:.4f}")

# {
#     "loss": loss,
#     "orders_of_magnitude": orders_of_magnitude,
#     "n_obs_loss_multiplier": n_obs_loss_multiplier,
#     "deviance_loss_multiplier": deviance_loss_multiplier,
#     "status": STATUS_OK
# }

In [None]:
result.n_obs[result.n_obs < 300].shape

In [None]:
# Using only slope
n_samples = 1000
n_obs_total = 2754157738
data_sample_fraction = 0.01
test_sample_fraction = n_samples / n_obs_total
target = "n_craters_added_in_study_region"
predictor_variables = ["slope"]

data_sample, test_sample = setup_datasets(data, configs_df, predictor_variables, target, data_sample_fraction, test_sample_fraction, spark)

# 0.007, 0.011, 0.019, 1.247, 0.4383
# 0.055, 0.004, 0.009, 1.268, 0.4417  


trials = Trials()
space = {
    "bandwidth_slope": hp.uniform("bandwidth_slope", 0.001, 0.1),
}

best = fmin(create_objective_function(data_sample, test_sample, predictor_variables, spark),
            space=space,
            algo=tpe.suggest,
            max_evals=30)

In [None]:
# Using only areal density
n_samples = 1000
n_obs_total = 2754157738
data_sample_fraction = 0.01
test_sample_fraction = n_samples / n_obs_total
target = "n_craters_added_in_study_region"
predictor_variables = ["areal_density"]

data_sample, test_sample = setup_datasets(data, configs_df, predictor_variables, target, data_sample_fraction, test_sample_fraction, spark)

# 0.007, 0.011, 0.019, 1.247, 0.4383
# 0.055, 0.004, 0.009, 1.268, 0.4417  


trials = Trials()
space = {
    "bandwidth_areal_density": hp.uniform("bandwidth_areal_density", 0.001, 0.05),
}

best = fmin(create_objective_function(data_sample, test_sample, predictor_variables, spark),
            space=space,
            algo=tpe.suggest,
            max_evals=30)

In [None]:
# Using areal density
n_samples = 1000
n_obs_total = 2754157738
data_sample_fraction = 0.01
test_sample_fraction = n_samples / n_obs_total
target = "n_craters_added_in_study_region"
predictor_variables = ["slope", "areal_density"]

data_sample, test_sample = setup_datasets(data, configs_df, predictor_variables, target, data_sample_fraction, test_sample_fraction, spark)

# 0.007, 0.011, 0.019, 1.247, 0.4383
# 0.055, 0.004, 0.009, 1.268, 0.4417  


trials = Trials()
space = {
    "bandwidth_slope": hp.uniform("bandwidth_slope", 0.001, 0.1),
    "bandwidth_areal_density": hp.uniform("bandwidth_areal_density", 0.001, 0.05),
}

best = fmin(create_objective_function(data_sample, test_sample, predictor_variables, spark),
            space=space,
            algo=tpe.suggest,
            max_evals=30)

In [None]:
# Using just c2c
n_samples = 1000
n_obs_total = 2754157738
data_sample_fraction = 0.01
test_sample_fraction = n_samples / n_obs_total
target = "n_craters_added_in_study_region"
predictor_variables = ["log_mean_c2c_nn_dist"]

data_sample, test_sample = setup_datasets(data, configs_df, predictor_variables, target, data_sample_fraction, test_sample_fraction, spark)

# 0.007, 0.011, 0.019, 1.247, 0.4383
# 0.055, 0.004, 0.009, 1.268, 0.4417  


trials = Trials()
space = {
    "bandwidth_log_mean_c2c_nn_dist": hp.uniform("bandwith_log_mean_c2c_nn_dist", 0.000001, 0.05),
}

best = fmin(create_objective_function(data_sample, test_sample, predictor_variables, spark),
            space=space,
            algo=tpe.suggest,
            max_evals=30)

In [None]:
# Using all three w/areal density
# best: {'bandwidth_areal_density': 0.01698628794548847, 'bandwidth_log_mean_c2c_nn_dist': 0.017780010374043788, 'bandwidth_slope': 0.006946936017484633}, 1.324, 0.3964   

n_samples = 1000
n_obs_total = 2754157738
data_sample_fraction = 0.01
test_sample_fraction = n_samples / n_obs_total
target = "n_craters_added_in_study_region"
predictor_variables = ["slope", "log_mean_c2c_nn_dist", "areal_density"]

data_sample, test_sample = setup_datasets(data, configs_df, predictor_variables, target, data_sample_fraction, test_sample_fraction, spark)

# 0.007, 0.011, 0.019, 1.247, 0.4383
# 0.055, 0.004, 0.009, 1.268, 0.4417  


trials = Trials()
space = {
    "bandwidth_slope": hp.uniform("bandwidth_slope", 0.001, 0.1),
    "bandwidth_log_mean_c2c_nn_dist": hp.uniform("bandwith_log_mean_c2c_nn_dist", 0.000001, 0.05),
    "bandwidth_areal_density": hp.uniform("bandwidth_areal_density", 0.001, 0.05),
}

best = fmin(create_objective_function(data_sample, test_sample, predictor_variables, spark),
            space=space,
            algo=tpe.suggest,
            max_evals=500)

In [None]:
# DSP
data_sample_fraction = 0.1
n_steps = 5
target = "n_craters_added_in_study_region"

observation = {
    "slope": 2.66,
    "log_mean_c2c_nn_dist": 0.6685316230432883,
    "log_intensity": -3.71673468421674
    # "areal_density": 0.13
}

steps = {
    "slope": 0.1,
    "log_mean_c2c_nn_dist": 0.1,
    "log_intensity": 1.0
    # "areal_density": 0.01
}

bandwidths = {
    "slope": 0.2,
    "log_mean_c2c_nn_dist": 0.1,
    "log_intensity": 1.0
    # "areal_density": 0.1
}

predictor_variables = list(observation.keys())

data_and_configs = setup_dataset(data, configs_df, predictor_variables, target, spark)
data_and_configs = data_and_configs.sample(data_sample_fraction)
data_and_configs.createOrReplaceTempView("data")

# Construct the observations grid
observations_df = pd.DataFrame(
    map(dict, itertools.product(*[[(k, observation[k] + x * v) for x in range(-n_steps, n_steps + 1)]  for k, v in observation.items()]))
)
observations_df = spark.createDataFrame(observations_df).cache()
observations_df.count()
observations_df.createOrReplaceTempView("observations")

bandwidths_df = pd.DataFrame([bandwidths])

result = get_confidence_intervals(data_and_configs, observations_df, predictor_variables, bandwidths_df, spark)
result

In [None]:
# DEE
data_sample_fraction = 1.0
n_steps = 5
target = "n_craters_added_in_study_region"

observation = {
    "slope": 2.10,
    "log_mean_c2c_nn_dist": 0.8136,
    "log_intensity": -4.402922308048596
    # "areal_density": 0.13
}

steps = {
    "slope": 0.1,
    "log_mean_c2c_nn_dist": 0.1,
    "log_intensity": 1.0
    # "areal_density": 0.01
}

bandwidths = {
    "slope": 0.1,
    "log_mean_c2c_nn_dist": 0.1,
    "log_intensity": 1.0
    # "areal_density": 0.1
}

predictor_variables = list(observation.keys())

data_and_configs = setup_dataset(data, configs_df, predictor_variables, target, spark)
data_and_configs = data_and_configs.sample(data_sample_fraction)
data_and_configs.createOrReplaceTempView("data")

# Construct the observations grid
observations_df = pd.DataFrame(
    map(dict, itertools.product(*[[(k, observation[k] + x * v) for x in range(-n_steps, n_steps + 1)]  for k, v in observation.items()]))
)
observations_df = spark.createDataFrame(observations_df).cache()
observations_df.count()
observations_df.createOrReplaceTempView("observations")

bandwidths_df = pd.DataFrame([bandwidths])

result = get_confidence_intervals(data_and_configs, observations_df, predictor_variables, bandwidths_df, spark)
result

In [None]:
result1 = result

In [None]:
# DEE, AD
data_sample_fraction = 1.0
n_steps = 3
target = "n_craters_added_in_study_region"

observation = {
    "slope": 2.10,
    "log_mean_c2c_nn_dist": 0.8136,
    # "log_intensity": -4.402922308048596
    "areal_density": 0.06
}

steps = {
    "slope": 0.1,
    "log_mean_c2c_nn_dist": 0.2,
    # "log_intensity": 1.0
    "areal_density": 0.05
}

bandwidths = {
    "slope": 0.25,
    "log_mean_c2c_nn_dist": 0.2,
    # "log_intensity": 1.0
    "areal_density": 0.05
}

predictor_variables = list(observation.keys())

data_and_configs = setup_dataset(data, configs_df, predictor_variables, target, spark)
data_and_configs = data_and_configs.sample(data_sample_fraction)
data_and_configs.createOrReplaceTempView("data")

# Construct the observations grid
observations_df = pd.DataFrame(
    map(dict, itertools.product(*[[(k, observation[k] + x * v) for x in range(-n_steps, n_steps + 1)]  for k, v in observation.items()]))
)
observations_df = spark.createDataFrame(observations_df).cache()
observations_df.count()
observations_df.createOrReplaceTempView("observations")

bandwidths_df = pd.DataFrame([bandwidths])

result = get_confidence_intervals(data_and_configs, observations_df, predictor_variables, bandwidths_df, spark)
result

In [None]:
result.n_unique_sims.mean()

In [None]:
# Searching for bandwidths in c2c_nn

In [None]:
def get_nearest_neighbors(data: DataFrame,
                          observation: Dict[str, float],
                          predictor_variables: List[str],
                          scaling_factors: Dict[str, float],
                          n_neighbors: int) -> pd.DataFrame:
    assemblers = [VectorAssembler(inputCols=[col], outputCol=col + "_vec") for col in predictor_variables]
    scalers = [MinMaxScaler(inputCol=col + "_vec", outputCol=col + "_scaled") for col in predictor_variables]
    pipeline = Pipeline(stages=assemblers + scalers)
    scaler = pipeline.fit(data_without_nulls)
    scaled = scaler.transform(data_without_nulls)

    assembler = VectorAssembler(inputCols=[f"{x}_scaled" for x in predictor_variables], outputCol="features")
    ready_to_model = assembler.transform(scaled_fixed)
    ready_to_model = ready_to_model.select(*[x for x in ready_to_model.columns if "scaled_vec" not in x])

    brp = BucketedRandomProjectionLSH(inputCol="features", outputCol="hashes", seed=12345, bucketLength=1.0)
    model = brp.fit(ready_to_model)

    vec = pd.DataFrame([[observation[x] for x in predictor_variables]], columns=predictor_variables)
    vec_scaled = assembler.transform(scaler.transform(spark.createDataFrame(vec)))

    return model.approxNearestNeighbors(ready_to_model, vec_scaled.select("features").collect()[0].features, n_neighbors).toPandas()

In [None]:
data_sample_fraction = 1.0
n_steps = 3
target = "n_craters_added_in_study_region"

observation = {
    "slope": 2.10,
    "log_mean_c2c_nn_dist": 0.8136,
    # "log_intensity": -4.402922308048596
    "areal_density": 0.06
}

steps = {
    "slope": 0.1,
    "log_mean_c2c_nn_dist": 0.2,
    # "log_intensity": 1.0
    "areal_density": 0.05
}

bandwidths = {
    "slope": 0.25,
    "log_mean_c2c_nn_dist": 0.2,
    # "log_intensity": 1.0
    "areal_density": 0.05
}

predictor_variables = list(observation.keys())
data_and_configs = setup_dataset(data, configs_df, predictor_variables, target, spark)

In [None]:
# data_without_nulls = data_and_configs.sample(0.05).dropna().cache()
data_without_nulls = data_and_configs.dropna()

In [None]:
scaling_factors = {
    "slope": 1,
    "log_mean_c2c_nn_dist": 1,
    "areal_density": 0.25
}
res1 = get_nearest_neighbors(data_without_nulls, observation, predictor_variables, scaling_factors, 1000)
res1

In [None]:
scaling_factors = {
    "slope": 1.5,
    "log_mean_c2c_nn_dist": 1,
    "areal_density": 0.25
}
res2 = get_nearest_neighbors(data_without_nulls, observation, predictor_variables, scaling_factors, 1000)
res2

In [None]:
scaling_factors = {
    "slope": 2,
    "log_mean_c2c_nn_dist": 1,
    "areal_density": 0.25
}
res3 = get_nearest_neighbors(data_without_nulls, observation, predictor_variables, scaling_factors, 100)
res3

In [None]:
scaling_factors = {
    "slope": 1,
    "log_mean_c2c_nn_dist": 1,
    "areal_density": 0.1
}
res4 = get_nearest_neighbors(data_without_nulls, observation, predictor_variables, scaling_factors, 100)
res4

In [None]:
plt.hist(res4.slope)

In [None]:
nnd_filtered = data_without_nulls.where(
    F.col("log_mean_c2c_nn_dist").between(0.7, 0.9)
).cache()

In [None]:
nnd_filtered.count()

In [None]:
observation

In [None]:
df = nnd_filtered.select(F.percentile_approx("areal_density", [0.025, 0.50, 0.975], 1000), F.percentile_approx("slope", [0.025, 0.50, 0.975], 1000)).toPandas()
df

In [None]:
np.log10(0.12)

In [None]:
# 1, DEE
d = data_without_nulls.withColumn("log_ad", F.log10("areal_density"))

observation = {
    "slope": 2.10,
    "log_mean_c2c_nn_dist": 0.8136,
    # "log_intensity": -4.402922308048596
    "log_ad": np.log10(0.06)
}

bandwidths = {
    "slope": 0.2,
    "log_mean_c2c_nn_dist": 0.1,
    # "log_intensity": -4.402922308048596
    "log_ad": 0.15
}

res = d
for k, v in observation.items():
    res = res.where(F.col(k).between(v - bandwidths[k], v + bandwidths[k]))

res_df = res.toPandas()
dee_df = res_df

In [None]:
n_orders_mag = np.log10(res_df.target.quantile(0.975) / res_df.target.quantile(0.025))
n_orders_mag

In [None]:
# 2, DSP
target_slope = 2.66
slope_plus_minus = 0.05
target_log_mean_c2c_nn_dist = 0.6685

observation = {
    "slope": 2.66,
    "log_mean_c2c_nn_dist": 0.6685,
    "log_ad": np.log10(0.13)
}

bandwidths = {
    "slope": 0.2,
    "log_mean_c2c_nn_dist": 0.1,
    "log_ad": 0.15
}

res = d
for k, v in observation.items():
    res = res.where(F.col(k).between(v - bandwidths[k], v + bandwidths[k]))

res_df = res.toPandas()
dsp_df = res_df

In [None]:
n_orders_mag = np.log10(res_df.target.quantile(0.975) / res_df.target.quantile(0.025))
n_orders_mag

In [None]:
# 3, DICP
observation = {
    "slope": 2.08,
    "log_mean_c2c_nn_dist": 0.9338405903389017,
    # "log_intensity": -4.402922308048596
    "log_ad": np.log10(0.31)
}

bandwidths = {
    "slope": 0.2,
    "log_mean_c2c_nn_dist": 0.1,
    # "log_intensity": -4.402922308048596
    "log_ad": 0.15
}

res = d
for k, v in observation.items():
    res = res.where(F.col(k).between(v - bandwidths[k], v + bandwidths[k]))

res_df = res.toPandas()
dicp_df = res_df

In [None]:
n_orders_mag = np.log10(res_df.target.quantile(0.975) / res_df.target.quantile(0.025))
n_orders_mag

In [None]:
# 4, DDCP
d = data_without_nulls.withColumn("log_ad", F.log10("areal_density"))

observation = {
    "slope": 2.15,
    "log_mean_c2c_nn_dist": 0.6685,
    "log_ad": np.log10(0.35)
}

bandwidths = {
    "slope": 0.2,
    "log_mean_c2c_nn_dist": 0.1,
    "log_ad": 0.15
}

res = d
for k, v in observation.items():
    res = res.where(F.col(k).between(v - bandwidths[k], v + bandwidths[k]))

In [None]:
percentiles = res.select(F.percentile_approx("target", [0.025, 0.975], 1000)).toPandas()

In [None]:
np.log10(percentiles.iloc[0, 0][1] / percentiles.iloc[0, 0][0])

In [None]:
n_orders_mag = np.log10(res_df.target.quantile(0.975) / res_df.target.quantile(0.025))
n_orders_mag

In [None]:
columns_to_scale = ["slope", "log_mean_c2c_nn_dist", "areal_density"]

assemblers = [VectorAssembler(inputCols=[col], outputCol=col + "_vec") for col in columns_to_scale]
scalers = [MinMaxScaler(inputCol=col + "_vec", outputCol=col + "_scaled") for col in columns_to_scale]
pipeline = Pipeline(stages=assemblers + scalers)
scaler = pipeline.fit(data_without_nulls)
scaled = scaler.transform(data_without_nulls)

In [None]:
scaling_factors = {
    "slope": 0.25,
    "log_mean_c2c_nn_dist": 1,
    "areal_density": 0.25
}
scaled_fixed = scaled.select(
    *data_without_nulls.columns,
    *[(vector_to_array(F.col(f"{x}_scaled")).getItem(0) * F.lit(scaling_factors[x])).alias(f"{x}_scaled") for x in columns_to_scale]
)

assembler = VectorAssembler(inputCols=[f"{x}_scaled" for x in columns_to_scale], outputCol="features")
ready_to_model = assembler.transform(scaled_fixed)
ready_to_model = ready_to_model.select(*[x for x in ready_to_model.columns if "scaled_vec" not in x])

In [None]:
brp = BucketedRandomProjectionLSH(inputCol="features", outputCol="hashes", seed=12345, bucketLength=1.0)
model = brp.fit(ready_to_model)

In [None]:
vec = pd.DataFrame([[2.10, 0.8136, 0.06]], columns=columns_to_scale)
vec_scaled = assembler.transform(scaler.transform(spark.createDataFrame(vec)))

In [None]:
res = model.approxNearestNeighbors(ready_to_model, vec_scaled.select("features").collect()[0].features, 50).toPandas()

In [None]:
res

In [None]:
res2 = model.approxNearestNeighbors(ready_to_model, Vectors.dense 50).collect()

In [None]:
assembler = VectorAssembler(outputCol="features")

assemblers = 
assembler.setInputCols(["slope", "log_mean_c2c_nn_dist", "areal_density"])

scaler = MinMaxScaler(outputCol="scaled_features")
scaler.setInputCol("features")
scaler.fit(assembler.transform(data_and_configs))

In [None]:
assembler.transform(data_and_configs)

In [None]:
df = data_and_configs.where((F.col("areal_density") < 0.08) & (F.col("log_mean_c2c_nn_dist").between(0.7, 0.8))).toPandas()

In [None]:
df.slope.describe()

In [None]:
total: 2.5b
ad limit: 23,500,945
ad + log_c2c: 1,056,227

In [None]:
data.select(F.percentile_approx("log_intensity", [0.05, 0.25, 0.5, 0.75, 0.95], 1000), F.mean("log_intensity"), F.median("log_intensity"), F.stddev_samp("log_intensity")).toPandas()

In [None]:
data_sample_fraction = 0.01
n_steps = 5
target = "n_craters_added_in_study_region"
predictor_variables = ["slope", "log_mean_c2c_nn_dist", "areal_density"]

# data_and_configs = setup_datasets(data, configs_df, predictor_variables, target, spark)
# data_and_configs = data_and_configs.sample(data_sample_fraction)

observation = {
    "slope": 2.10,
    "target_log_mean_c2c_nn_dist": 0.8136322010294104,
    "areal_density": 0.13
}

steps = {
    "slope": 0.05,
    "target_log_mean_c2c_nn_dist": 0.05,
    "areal_density": 0.01
}





In [None]:
steps_df

In [None]:
pd.DataFrame([dict(x) for x in steps])

In [None]:
# DSP
target = "n_craters_added_in_study_region"
target_slope = 2.10
slope_ci_width = 0.08
target_log_mean_c2c_nn_dist = 0.8136322010294104

observations_data = [
    [target_slope - slope_ci_width, target_log_mean_c2c_nn_dist],
    [target_slope, target_log_mean_c2c_nn_dist],
    [target_slope + slope_ci_width, target_log_mean_c2c_nn_dist],
]
observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

bandwidths_data = [
    [0.01, 0.001],
    [0.01, 0.005],
    [0.005, 0.001],
    [0.005, 0.005],
]
bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
result

In [None]:
# DEE
target = "n_craters_added_in_study_region"
target_slope = 2.10
slope_ci_width = 0.08
target_log_mean_c2c_nn_dist = 0.8136322010294104

observations_data = [
    [target_slope - slope_ci_width, target_log_mean_c2c_nn_dist],
    [target_slope, target_log_mean_c2c_nn_dist],
    [target_slope + slope_ci_width, target_log_mean_c2c_nn_dist],
]
observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

bandwidths_data = [
    [0.01, 0.0005],
    [0.01, 0.001],
    [0.01, 0.005],
    [0.025, 0.0005],
    [0.025, 0.001],
    [0.025, 0.005],
    [0.05, 0.0005],
    [0.05, 0.001],
    [0.05, 0.005],
    [0.1, 0.0001],
    [0.1, 0.00025],
    [0.1, 0.0005],
    [0.1, 0.001],
]
bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
result.sort_values([f"observation_{x}" for x in observations.columns])

In [None]:
# DEE

target = "n_craters_added_in_study_region"
target_slope = 2.10
slope_ci_width = 0.08
target_log_mean_c2c_nn_dist = 0.8136322010294104

observations_data = [
    [target_slope - slope_ci_width, target_log_mean_c2c_nn_dist],
    [target_slope, target_log_mean_c2c_nn_dist],
    [target_slope + slope_ci_width, target_log_mean_c2c_nn_dist],
]
observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

bandwidths_data = [
    [0.01, 0.0005],
    [0.01, 0.001],
    [0.01, 0.005],
    [0.025, 0.0005],
    [0.025, 0.001],
    [0.025, 0.005],
    [0.05, 0.0005],
    [0.05, 0.001],
    [0.05, 0.005],
    [0.1, 0.0001],
    [0.1, 0.00025],
    [0.1, 0.0005],
    [0.1, 0.001],
]
bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
result.sort_values([f"observation_{x}" for x in observations.columns])

In [None]:
# Dione 3 (DICP)
target = "n_craters_added_in_study_region"
target_slope = 2.08
slope_ci_width = 0.09
target_log_mean_c2c_nn_dist = 0.9338405903389017

observations_data = [
    [target_slope - slope_ci_width, target_log_mean_c2c_nn_dist],
    [target_slope, target_log_mean_c2c_nn_dist],
    [target_slope + slope_ci_width, target_log_mean_c2c_nn_dist],
]
observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

bandwidths_data = [
    [0.01, 0.0005],
    [0.01, 0.001],
    [0.01, 0.005],
    [0.025, 0.0005],
    [0.025, 0.001],
    [0.025, 0.005],
    [0.05, 0.0005],
    [0.05, 0.001],
    [0.05, 0.005],
    [0.1, 0.0001],
    [0.1, 0.00025],
    [0.1, 0.0005],
    [0.1, 0.001],
]
bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
result.sort_values([f"observation_{x}" for x in observations.columns])

In [None]:
# Dione 4 (DDCP)
target = "n_craters_added_in_study_region"
target_slope = 2.15
slope_ci_width = 0.05
target_log_mean_c2c_nn_dist = 0.5480628421533491

observations_data = [
    [target_slope - slope_ci_width, target_log_mean_c2c_nn_dist],
    [target_slope, target_log_mean_c2c_nn_dist],
    [target_slope + slope_ci_width, target_log_mean_c2c_nn_dist],
]
observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

bandwidths_data = [
    [0.01, 0.0005],
    [0.01, 0.001],
    [0.01, 0.005],
    [0.025, 0.0005],
    [0.025, 0.001],
    [0.025, 0.005],
    [0.05, 0.0005],
    [0.05, 0.001],
    [0.05, 0.005],
    [0.1, 0.0001],
    [0.1, 0.00025],
    [0.1, 0.0005],
    [0.1, 0.001],
]
bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
result.sort_values([f"observation_{x}" for x in observations.columns])

In [None]:
# The Grid
target = "n_craters_added_in_study_region"

log_mean_c2c_nn_dist_delta = 0.05
slope_delta = 0.05

min_slope = configs_df.select(F.min("slope")).toPandas().iloc[0,0]
max_slope = configs_df.select(F.max("slope")).toPandas().iloc[0,0]

min_log_mean_c2c_nn_dist = data.select(F.min("log_mean_c2c_nn_dist")).toPandas().iloc[0,0]
max_log_mean_c2c_nn_dist = data.select(F.max("log_mean_c2c_nn_dist")).toPandas().iloc[0,0]

min_log_mean_c2c_nn_dist = quantize_value(min_log_mean_c2c_nn_dist + log_mean_c2c_nn_dist_delta * 2, log_mean_c2c_nn_dist_delta)
max_log_mean_c2c_nn_dist = quantize_value(max_log_mean_c2c_nn_dist - log_mean_c2c_nn_dist_delta * 2, log_mean_c2c_nn_dist_delta)

log_c2c_nn_dist_choices = [round(min_log_mean_c2c_nn_dist + x * log_mean_c2c_nn_dist_delta, 4) for x in range(int((max_log_mean_c2c_nn_dist - min_log_mean_c2c_nn_dist) / log_mean_c2c_nn_dist_delta))]

min_slope = quantize_value(min_slope + 2 * slope_delta, slope_delta)
max_slope = quantize_value(max_slope - 2 * slope_delta, slope_delta)
slope_choices = [round(min_slope + x * slope_delta, 4) for x in range(int((max_slope - min_slope) / slope_delta))]

observations_data = [
    [x, y]
    for x in slope_choices
    for y in log_c2c_nn_dist_choices
]
observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

bandwidths_data = [
    [0.01, 0.0005],
    [0.01, 0.001],
    [0.01, 0.005],
    [0.025, 0.0005],
    [0.025, 0.001],
    [0.025, 0.005],
    [0.05, 0.0005],
    [0.05, 0.001],
    [0.05, 0.005],
    [0.1, 0.0001],
    [0.1, 0.00025],
    [0.1, 0.0005],
]
bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
result.sort_values([f"observation_{x}" for x in observations.columns])

In [None]:
result.to_parquet("quantile_kernel_regression_grid_multiple_bandwidths.parquet")

In [None]:
result = result.sort_values([f"bandwith_{x}" for x in bandwidths.columns])

In [None]:
# Dione region 1, DEE
target_slope = 2.10
target_log_mean_c2c_nn_dist = 0.8136322010294104
r = result[(result.observation_slope == 2.1) & (result.observation_log_mean_c2c_nn_dist == 0.80)].copy()
r["95_percentile_absolute_range"] = r.percentiles.apply(lambda x: x[-1]) - r.percentiles.apply(lambda x: x[0])
r["95_percentile_ratio"] = r.percentiles.apply(lambda x: x[-1]) / r.percentiles.apply(lambda x: x[0])
r

In [None]:
# Dione region 2, DSP
target_slope = 2.66
target_log_mean_c2c_nn_dist = 0.6685316230432883
r = result[(result.observation_slope == 2.65) & (result.observation_log_mean_c2c_nn_dist == 0.65)].copy()
r["95_percentile_absolute_range"] = r.percentiles.apply(lambda x: x[-1]) - r.percentiles.apply(lambda x: x[0])
r["95_percentile_ratio"] = r.percentiles.apply(lambda x: x[-1]) / r.percentiles.apply(lambda x: x[0])
r

In [None]:
# Dione region 3, DICP
target_slope = 2.08
target_log_mean_c2c_nn_dist = 0.9338405903389017
r = result[(result.observation_slope == 2.05) & (result.observation_log_mean_c2c_nn_dist == 0.95)].copy()
r["95_percentile_absolute_range"] = r.percentiles.apply(lambda x: x[-1]) - r.percentiles.apply(lambda x: x[0])
r["95_percentile_ratio"] = r.percentiles.apply(lambda x: x[-1]) / r.percentiles.apply(lambda x: x[0])
r

In [None]:
# DICP with lower nn dist:
r = result[(result.observation_slope == 2.05) & (result.observation_log_mean_c2c_nn_dist == 0.90)].copy()
r["95_percentile_absolute_range"] = r.percentiles.apply(lambda x: x[-1]) - r.percentiles.apply(lambda x: x[0])
r["95_percentile_ratio"] = r.percentiles.apply(lambda x: x[-1]) / r.percentiles.apply(lambda x: x[0])
r

In [None]:
# Dione 4 (DDCP)
target_slope = 2.15
target_log_mean_c2c_nn_dist = 0.5480628421533491
r = result[(result.observation_slope == 2.15) & (result.observation_log_mean_c2c_nn_dist == 0.55)].copy()
r["95_percentile_absolute_range"] = r.percentiles.apply(lambda x: x[-1]) - r.percentiles.apply(lambda x: x[0])
r["95_percentile_ratio"] = r.percentiles.apply(lambda x: x[-1]) / r.percentiles.apply(lambda x: x[0])
r

In [None]:
# Dione region 1: DEE - extended
target = "n_craters_added_in_study_region"
target_slope = 2.10
slope_ci_width = 0.08
slope_step = 0.05
target_log_mean_c2c_nn_dist = 0.8136
target_log_mean_c2c_nn_dist_step = 0.05


observations_data = [
    [target_slope + x * slope_step, target_log_mean_c2c_nn_dist + y * target_log_mean_c2c_nn_dist_step]
    for x in range(-5, 6)
    for y in range(-5, 6)
]
observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

bandwidths_data = [
    [slope_ci_width / 2, 0.0025],
    [slope_ci_width / 2, 0.005],
    [slope_ci_width, 0.0025],
]
bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
result["95_ci_ratio"] = result.percentiles.map(lambda x: np.log10(x[-1] / x[0]))
result["95_ci_range"] = result.percentiles.map(lambda x: [x[0], x[1]])
dee_extended = result

In [None]:
result.sort_values("95_ci_ratio")[:25]

In [None]:
# Dione region 1: DEE
target = "n_craters_added_in_study_region"
target_slope = 2.10
slope_ci_width = 0.08
target_log_mean_c2c_nn_dist = 0.8136

# observations_data = [
#     [target_slope - slope_ci_width, target_log_mean_c2c_nn_dist],
#     [target_slope, target_log_mean_c2c_nn_dist],
#     [target_slope + slope_ci_width, target_log_mean_c2c_nn_dist],
# ]
# observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

# bandwidths_data = [
#     [slope_ci_width, 0.005],
# ]
# bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

# result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
# result[result.observation_slope == target_slope]
target_log_mean_c2c_nn_dist_step = 0.05


observations_data = [
    [target_slope + x * slope_step, target_log_mean_c2c_nn_dist + y * target_log_mean_c2c_nn_dist_step]
    for x in range(-5, 6)
    for y in range(-5, 6)
]
observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

bandwidths_data = [
    [slope_ci_width / 2, 0.0025],
    [slope_ci_width / 2, 0.005],
    [slope_ci_width, 0.0025],
]
bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
result["95_ci_ratio"] = result.percentiles.map(lambda x: np.log10(x[-1] / x[0]))
result["95_ci_range"] = result.percentiles.map(lambda x: [x[0], x[1]])
dee = result

In [None]:
percentiles = result[result.observation_slope == target_slope].percentiles.iloc[0]
print(f"Orders of magnitude: {np.log10(percentiles[-1] / percentiles[0])}")
print(f"95% CI: ({percentiles[0]}, {percentiles[-1]})")

In [None]:
dee.sort_values("95_ci_ratio")

In [None]:
dee[(dee.observation_slope == 2.10) & (dee.observation_log_mean_c2c_nn_dist == 0.8136)]

In [None]:
# Dione region 2: DSP
target = "n_craters_added_in_study_region"
target_slope = 2.66
slope_plus_minus = 0.05
target_log_mean_c2c_nn_dist = 0.6685

# observations_data = [
#     [target_slope - slope_ci_width, target_log_mean_c2c_nn_dist],
#     [target_slope, target_log_mean_c2c_nn_dist],
#     [target_slope + slope_ci_width, target_log_mean_c2c_nn_dist],
# ]
# observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

# bandwidths_data = [
#     [slope_ci_width, 0.005],
# ]
# bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

# result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
# result[result.observation_slope == target_slope]

target_log_mean_c2c_nn_dist_step = 0.05


observations_data = [
    [target_slope + x * slope_step, target_log_mean_c2c_nn_dist + y * target_log_mean_c2c_nn_dist_step]
    for x in range(-5, 6)
    for y in range(-5, 6)
]
observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

bandwidths_data = [
    [slope_ci_width / 2, 0.0025],
    [slope_ci_width / 2, 0.005],
    [slope_ci_width, 0.0025],
]
bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
result["95_ci_ratio"] = result.percentiles.map(lambda x: np.log10(x[-1] / x[0]))
result["95_ci_range"] = result.percentiles.map(lambda x: [x[0], x[1]])
dsp_extended = result

In [None]:
percentiles = result[result.observation_slope == target_slope].percentiles.iloc[0]
print(f"Orders of magnitude: {np.log10(percentiles[-1] / percentiles[0])}")
print(f"95% CI: ({percentiles[0]}, {percentiles[-1]})")

In [None]:
dsp_extended.sort_values("95_ci_ratio")

In [None]:
# Dione region 3: DICP
target = "n_craters_added_in_study_region"
target_slope = 2.08
slope_ci_width = 0.09
target_log_mean_c2c_nn_dist = 0.9338

# observations_data = [
#     [target_slope - slope_ci_width, target_log_mean_c2c_nn_dist],
#     [target_slope, target_log_mean_c2c_nn_dist],
#     [target_slope + slope_ci_width, target_log_mean_c2c_nn_dist],
# ]
# observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

# bandwidths_data = [
#     [slope_ci_width, 0.005],
# ]
# bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

# result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
# result[result.observation_slope == target_slope]

target_log_mean_c2c_nn_dist_step = 0.05


observations_data = [
    [target_slope + x * slope_step, target_log_mean_c2c_nn_dist + y * target_log_mean_c2c_nn_dist_step]
    for x in range(-5, 6)
    for y in range(-5, 6)
]
observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

bandwidths_data = [
    [slope_ci_width / 2, 0.0025],
    [slope_ci_width / 2, 0.005],
    [slope_ci_width, 0.0025],
]
bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
result["95_ci_ratio"] = result.percentiles.map(lambda x: np.log10(x[-1] / x[0]))
result["95_ci_range"] = result.percentiles.map(lambda x: [x[0], x[1]])
dicp = result

In [None]:
percentiles = result[result.observation_slope == target_slope].percentiles.iloc[0]
print(f"Orders of magnitude: {np.log10(percentiles[-1] / percentiles[0])}")
print(f"95% CI: ({percentiles[0]}, {percentiles[-1]})")

In [None]:
dicp.sort_values("95_ci_ratio")

In [None]:
# Dione region 3: DICP, D > 4000m
target = "n_craters_added_in_study_region"
target_slope = 2.08
slope_ci_width = 0.09
target_log_mean_c2c_nn_dist = 0.6304

# observations_data = [
#     [target_slope - slope_ci_width, target_log_mean_c2c_nn_dist],
#     [target_slope, target_log_mean_c2c_nn_dist],
#     [target_slope + slope_ci_width, target_log_mean_c2c_nn_dist],
# ]
# observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

# bandwidths_data = [
#     [slope_ci_width, 0.005],
# ]
# bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

# result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
# result[result.observation_slope == target_slope]

target_log_mean_c2c_nn_dist_step = 0.05


observations_data = [
    [target_slope + x * slope_step, target_log_mean_c2c_nn_dist + y * target_log_mean_c2c_nn_dist_step]
    for x in range(-5, 6)
    for y in range(-5, 6)
]
observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

bandwidths_data = [
    [slope_ci_width / 2, 0.0025],
    [slope_ci_width / 2, 0.005],
    [slope_ci_width, 0.0025],
]
bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
result["95_ci_ratio"] = result.percentiles.map(lambda x: np.log10(x[-1] / x[0]))
result["95_ci_range"] = result.percentiles.map(lambda x: [x[0], x[1]])
dicp_4k_plus = result

In [None]:
percentiles = result[result.observation_slope == target_slope].percentiles.iloc[0]
print(f"Orders of magnitude: {np.log10(percentiles[-1] / percentiles[0])}")
print(f"95% CI: ({percentiles[0]}, {percentiles[-1]})")

In [None]:
dicp_4k_plus.sort_values("95_ci_ratio")

In [None]:
# Dione region 4: DDCP
target = "n_craters_added_in_study_region"
target_slope = 2.15
slope_ci_width = 0.05
target_log_mean_c2c_nn_dist = 0.5481

# observations_data = [
#     [target_slope - slope_ci_width, target_log_mean_c2c_nn_dist],
#     [target_slope, target_log_mean_c2c_nn_dist],
#     [target_slope + slope_ci_width, target_log_mean_c2c_nn_dist],
# ]
# observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

# bandwidths_data = [
#     [slope_ci_width, 0.005],
# ]
# bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

# result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
# result[result.observation_slope == target_slope]
target_log_mean_c2c_nn_dist_step = 0.05


observations_data = [
    [target_slope + x * slope_step, target_log_mean_c2c_nn_dist + y * target_log_mean_c2c_nn_dist_step]
    for x in range(-5, 6)
    for y in range(-5, 6)
]
observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist"])

bandwidths_data = [
    [slope_ci_width / 2, 0.0025],
    [slope_ci_width / 2, 0.005],
    [slope_ci_width, 0.0025],
]
bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
result["95_ci_ratio"] = result.percentiles.map(lambda x: np.log10(x[-1] / x[0]))
result["95_ci_range"] = result.percentiles.map(lambda x: [x[0], x[1]])
ddcp = result

In [None]:
percentiles = result[result.observation_slope == target_slope].percentiles.iloc[0]
print(f"Orders of magnitude: {np.log10(percentiles[-1] / percentiles[0])}")
print(f"95% CI: ({percentiles[0]}, {percentiles[-1]})")

In [None]:
ddcp.sort_values("95_ci_ratio")

In [None]:
# DEE w/AD

target = "n_craters_added_in_study_region"
target_slope = 2.10
slope_ci_width = 0.08
target_log_mean_c2c_nn_dist = 0.8136322010294104
target_ad = 0.06

observations_data = [
    [target_slope - slope_ci_width, target_log_mean_c2c_nn_dist, target_ad],
    [target_slope, target_log_mean_c2c_nn_dist, target_ad],
    [target_slope + slope_ci_width, target_log_mean_c2c_nn_dist, target_ad],
]
observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist", "areal_density"])

bandwidths_data = [
    [0.01, 0.001, 0.1],
    [0.01, 0.005, 0.1],
    [0.005, 0.001, 0.1],
    [0.005, 0.005, 0.1],
]
bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist", "areal_density"])

result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
result

In [None]:
# DEE w/Za

target = "n_craters_added_in_study_region"
target_slope = 2.10
slope_ci_width = 0.08
target_log_mean_c2c_nn_dist = 0.8136322010294104
target_za = -0.23

observations_data = [
    [target_slope - slope_ci_width, target_log_mean_c2c_nn_dist, target_za],
    [target_slope, target_log_mean_c2c_nn_dist, target_za],
    [target_slope + slope_ci_width, target_log_mean_c2c_nn_dist, target_za],
]
observations = pd.DataFrame(observations_data, columns=["slope", "log_mean_c2c_nn_dist", "za"])

bandwidths_data = [
    [0.01, 0.001, 3],
    [0.01, 0.005, 3],
    [0.005, 0.001, 3],
    [0.005, 0.005, 3],
]
bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist", "za"])

result = get_confidence_intervals(data, configs_df, target, observations, bandwidths, spark)
result

In [None]:
target = "n_craters_added_in_study_region"

observation = {
    "slope": 1.5,
    "log_mean_c2c_nn_dist": 1.8
}
bandwidths = {
    "slope": 0.1,
    "log_mean_c2c_nn_dist": 0.05
}

result = get_confidence_interval(data, configs_df, target, observation, bandwidths)

In [None]:
result

In [None]:
simulation_id = None
target = "n_craters_added_in_study_region"

bandwidths = {
    "slope": 0.01,
    "log_mean_c2c_nn_dist": 0.001
}

In [None]:
# Dione 1 (DEE)
target_slope = 2.10
slope_ci_width = 0.08
target_log_mean_c2c_nn_dist = 0.8136322010294104

observation = {
    "slope": target_slope,
    "log_mean_c2c_nn_dist": target_log_mean_c2c_nn_dist
}
result = get_confidence_interval(data, configs_df, target, observation, bandwidths)
dione1_result = result
result

In [None]:
# Dione 2 (DSP)
target_slope = 2.66
slope_plus_minus = 0.05
target_log_mean_c2c_nn_dist = 0.6685316230432883
target_effective_radius_multiplier = 1.3

observation = {
    "slope": target_slope,
    "log_mean_c2c_nn_dist": target_log_mean_c2c_nn_dist
}
result = get_confidence_interval(data, configs_df, target, observation, bandwidths)
dione2_result = result
result

In [None]:
# Dione 2 (DSP)
target_slope = 2.66
slope_plus_minus = 0.05
target_log_mean_c2c_nn_dist = 0.6685316230432883
target_effective_radius_multiplier = 1.3

observation = {
    "slope": target_slope,
    "log_mean_c2c_nn_dist": target_log_mean_c2c_nn_dist
}
result = get_confidence_interval(data, configs_df, target, observation, bandwidths)
dione2_result = result
result

In [None]:
# Dione 2 (DSP)
# Wider slope bandwidth
target_slope = 2.8
slope_plus_minus = 0.05
target_log_mean_c2c_nn_dist = 0.6685316230432883
target_effective_radius_multiplier = 1.3

observation = {
    "slope": target_slope,
    "log_mean_c2c_nn_dist": target_log_mean_c2c_nn_dist
}
result = get_confidence_interval(data, configs_df, target, observation, bandwidths)
dione2_result = result
result

In [None]:
# Dione 3 (DICP)
target_slope = 2.08
slope_ci_width = 0.09
target_log_mean_c2c_nn_dist = 0.9338405903389017

observation = {
    "slope": target_slope,
    "log_mean_c2c_nn_dist": target_log_mean_c2c_nn_dist
}
result = get_confidence_interval(data, configs_df, target, observation, bandwidths)
dione3_result = result
result

In [None]:
# Dione 4 (DDCP)
target_slope = 2.15
slope_ci_width = 0.05
target_log_mean_c2c_nn_dist = 0.5480628421533491

observation = {
    "slope": target_slope,
    "log_mean_c2c_nn_dist": target_log_mean_c2c_nn_dist
}
result = get_confidence_interval(data, configs_df, target, observation, bandwidths)
dione4_result = result
result

In [None]:
# Dione 2 (DSP)
# With eff radius multiplier
target_slope = 2.66
slope_plus_minus = 0.05
target_log_mean_c2c_nn_dist = 0.6685316230432883
target_effective_radius_multiplier = 1.3

observation = {
    "slope": target_slope,
    "log_mean_c2c_nn_dist": target_log_mean_c2c_nn_dist,
    "effective_radius_multiplier": target_effective_radius_multiplier
}
bandwidths_with_eff = bandwidths.copy()
bandwidths_with_eff.update({"effective_radius_multiplier": 0.05})
result = get_confidence_interval(data, configs_df, target, observation, bandwidths_with_eff)
dione2_with_eff_result = result
result

In [None]:
# Try using SQL
simulation_id = None
target = "n_craters_added_in_study_region"
bandwidths = {
    "slope": 0.1,
    "log_mean_c2c_nn_dist": 0.05
}

observation = {
    "slope": 1.5,
    "log_mean_c2c_nn_dist": 1.8
}


data.createOrReplaceTempView("data")
configs_df.createOrReplaceTempView("configs")

simulation_id_clause = f"AND data.simulation_id <> {simulation_id}\n" if simulation_id else ""
bandwidth_clauses = ""
for var, bandwidth in bandwidths.items():
    observation_value = observation[var]
    bandwidth_clauses += f"AND {var} BETWEEN {observation_value - bandwidth} AND {observation_value + bandwidth}\n"

result = spark.sql(f"""
SELECT
    *
    -- approx_percentile({target}, array(0.025, 0.5, 0.975), 10000) as percentiles
FROM
    data
    INNER JOIN configs
        ON data.simulation_id = configs.simulation_id
WHERE
    1=1
    {simulation_id_clause}
    {bandwidth_clauses}
""").toPandas()

In [None]:
data.columns

In [None]:
slope_delta = 0.1
log_mean_c2c_nn_dist_delta = 0.1
effective_radius_multiplier_delta = 0.1

min_slope, max_slope, n_slope_buckets = get_min_max_n_buckets("slope", slope_delta, configs_df)
min_log_mean_c2c_nn_dist, max_log_mean_c2c_nn_dist, n_log_mean_c2c_nn_dist_buckets = get_min_max_n_buckets("log_mean_c2c_nn_dist", log_mean_c2c_nn_dist_delta, data)
min_effective_radius_multiplier, max_effective_radius_multiplier, n_effective_radius_multiplier_buckets = get_min_max_n_buckets("effective_radius_multiplier", effective_radius_multiplier_delta, configs_df)

In [None]:
targets_array = [
    [np.round(x, decimals=5), np.round(y, decimals=5), np.round(z, decimals=5)]
    for x in np.linspace(min_slope, max_slope, n_slope_buckets)
    for y in np.linspace(min_log_mean_c2c_nn_dist, max_log_mean_c2c_nn_dist, n_log_mean_c2c_nn_dist_buckets)
    for z in np.linspace(min_effective_radius_multiplier, max_effective_radius_multiplier, n_effective_radius_multiplier_buckets)
]

targets = pd.DataFrame(targets_array, columns=["target_slope", "target_log_mean_c2c_nn_dist", "target_effective_radius_multiplier"])
targets_spark_df = spark.createDataFrame(targets)
data_with_targets = data.join(F.broadcast(targets_spark_df), how="cross")

In [None]:
data_with_configs = data_with_targets.join(F.broadcast(configs_df), on="simulation_id")

In [None]:
# Parameters
simulation_id = None
slope_ci_width = 0.05
log_mean_c2c_nn_dist_width = 0.01
effective_radius_multiplier_width = 0.1

# Select where not equal to a simulation ID
data_to_query = data_with_configs
if simulation_id is not None:
    data_to_query = data_to_query.where(F.col("simulation_id") != simulation_id)

# Select within a range of slope and log_c2c_nn_dist_mean
result = (
    data_to_query
        .where((F.col("slope") > F.col("target_slope") - F.lit(slope_width))
               & (F.col("slope") <= F.col("target_slope") + F.lit(slope_width))
               & (F.col("log_mean_c2c_nn_dist") > F.col("target_log_mean_c2c_nn_dist") - F.lit(log_mean_c2c_nn_dist_width))
               & (F.col("log_mean_c2c_nn_dist") <= F.col("target_log_mean_c2c_nn_dist") + F.lit(log_mean_c2c_nn_dist_width))
               & (F.col("effective_radius_multiplier") > F.col("target_effective_radius_multiplier") - F.lit(effective_radius_multiplier_width))
               & (F.col("effective_radius_multiplier") <= F.col("target_effective_radius_multiplier") + F.lit(effective_radius_multiplier_width)))
        .groupby("target_slope", "target_log_mean_c2c_nn_dist", "target_effective_radius_multiplier")
        .agg(
            F.percentile_approx("n_craters_added_in_study_region", F.array(*[F.lit((x) / 40.0) for x in range(40)]), 10000).alias("N_percentiles"),
            F.mean("n_craters_added_in_study_region").alias("N_mean"),
            F.stddev("n_craters_added_in_study_region").alias("N_stdev"),
            F.count("n_craters_added_in_study_region").alias("count")
        )
)

In [None]:
df = result.toPandas()
df

In [None]:
df[df.target_slope == 2.1].sort_values(["target_log_mean_c2c_nn_dist", "target_effective_radius_multiplier"])

In [None]:
def get_single_confidence_interval(target_slope: float,
                                   slope_delta: float,
                                   target_log_mean_c2c_nn_dist: float,
                                   log_mean_c2c_nn_dist_delta: float,
                                   target_effective_radius_multiplier: float,
                                   effective_radius_multiplier_delta: float,
                                   data: pd.DataFrame) -> Tuple[float, float, float]:
    target_slope = quantize_value(target_slope, slope_delta)
    target_log_mean_c2c_nn_dist = quantize_value(target_log_mean_c2c_nn_dist, log_mean_c2c_nn_dist_delta)
    target_effective_radius_multiplier = quantize_value(target_effective_radius_multiplier, effective_radius_multiplier_delta)
    
    row = data[(np.abs(data.target_slope - target_slope) < 0.001)
                & (np.abs(data.target_log_mean_c2c_nn_dist - target_log_mean_c2c_nn_dist) < 0.001)
                & (np.abs(data.target_effective_radius_multiplier - target_effective_radius_multiplier) < 0.001)
            ].iloc[0]

    # 2.5th and 97.5th percentiles
    return row.N_percentiles[1], row.N_percentiles[-2]

def get_confidence_intervals(target_slope: float,
                             slope_delta: float,
                             slope_ci_width: float,
                             target_log_mean_c2c_nn_dist: float,
                             log_mean_c2c_nn_dist_delta: float,
                             target_effective_radius_multiplier: float,
                             effective_radius_multiplier_delta: float,
                             data: pd.DataFrame) -> Tuple[float, float]:
    lower_ci = get_single_confidence_interval(target_slope - slope_ci_width, slope_delta, target_log_mean_c2c_nn_dist, log_mean_c2c_nn_dist_delta, target_effective_radius_multiplier, effective_radius_multiplier_delta, data)
    mean_ci = get_single_confidence_interval(target_slope, slope_delta, target_log_mean_c2c_nn_dist, log_mean_c2c_nn_dist_delta, target_effective_radius_multiplier, effective_radius_multiplier_delta, data)
    upper_ci = get_single_confidence_interval(target_slope + slope_ci_width, slope_delta, target_log_mean_c2c_nn_dist, log_mean_c2c_nn_dist_delta, target_effective_radius_multiplier, effective_radius_multiplier_delta, data)

    return [lower_ci, mean_ci, upper_ci]

In [None]:
# Dione 1 (DEE)
# target_slope = 2.10
# slope_ci_width = 0.08
# target_log_mean_c2c_nn_dist = 0.8136322010294104

# Dione 2 (DSP)
target_slope = 2.66
slope_plus_minus = 0.05
target_log_mean_c2c_nn_dist = 0.6685316230432883
target_effective_radius_multiplier = 1.3

# Dione 3 (DICP)
# target_slope = 2.08
# slope_ci_width = 0.09
# target_log_mean_c2c_nn_dist = 0.9338405903389017

# Dione 4 (DDCP)
# target_slope = 2.15
# slope_ci_width = 0.05
# target_log_mean_c2c_nn_dist = 0.5480628421533491


get_confidence_intervals(target_slope, slope_delta, slope_ci_width, target_log_mean_c2c_nn_dist, log_mean_c2c_nn_dist_delta, target_effective_radius_multiplier, effective_radius_multiplier_delta, df)