In [1]:
import sys

import pandas as pd
import numpy as np
from pathlib import Path
import yaml
from typing import *

import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import matplotlib.pyplot as plt

import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, DataFrame, Window
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials, space_eval
import itertools

from pyspark.ml.feature import VectorAssembler, MinMaxScaler, BucketedRandomProjectionLSH
from pyspark.ml import Pipeline
from pyspark.ml.functions import vector_to_array

from utils import *

pio.renderers.default = "iframe"

In [2]:
def quantize_value(value: float, delta: float) -> float:
    """
    Snaps `value` onto the grid of integer multiples of `delta`.

    The ratio value / delta is rounded to 5 decimals and then truncated toward
    zero, so the result is the grid point at or below `value` in magnitude.
    """
    # Rounding first keeps ratios that are whole up to floating-point noise
    # (e.g. 0.3 / 0.1) from being truncated one step short.
    step_ratio = np.round(value / delta, decimals=5)
    whole_steps = int(step_ratio)
    return np.round(whole_steps * delta, decimals=5)


def get_min_max_n_buckets(column: str, delta: float, df) -> Tuple[float, float, int]:
    """
    Computes the quantized range of `column` and how many `delta`-wide buckets span it.

    :param column: Name of the column to aggregate.
    :param delta: Bucket width, passed to quantize_value.
    :param df: Spark DataFrame containing `column`.
    :return: (quantized minimum, quantized maximum, number of buckets), with both
             end points counted as buckets.
    """
    # One aggregation job yields a single row holding the minimum and maximum.
    extremes = df.select(F.min(column), F.max(column)).toPandas()

    lower, upper = (quantize_value(extremes.iloc[0, pos], delta) for pos in (0, 1))
    span_in_steps = np.round((upper - lower) / delta, decimals=5)

    return lower, upper, int(span_in_steps) + 1

def get_confidence_intervals(train: DataFrame,
                             test: DataFrame,
                             predictor_variables: List[str],
                             bandwidths: pd.DataFrame,
                             spark_session: SparkSession) -> pd.DataFrame:
    """
    Summarises the train targets found in a box around each test point.

    Every test row is paired with every row of `bandwidths`. For each pair, the
    train rows whose predictors lie within +/- the bandwidth of the test row's
    values are aggregated into the mean of `target`, the number of rows, the
    number of distinct simulations, and the approximate 2.5% / 97.5%
    percentiles of `target`.

    Side effect: (re)registers the temp views "train", "test" and "bandwidths".

    :param train: Spark DataFrame with the predictor columns, `target` and
                  `simulation_id`.
    :param test: Spark DataFrame with the predictor columns. If it has a
                 `target` column it is carried through to the output. If it has
                 a `simulation_id` column, train rows from the same simulation
                 are excluded and `simulation_id` / `crater_id` are carried
                 through.
    :param predictor_variables: Predictor column names.
    :param bandwidths: One row per bandwidth setting; columns are named after
                       the predictors they apply to. Predictors without a
                       bandwidth column are not constrained.
    :param spark_session: Session used to create the bandwidths frame and run
                          the query.
    :return: pandas DataFrame with columns test_<var>, (simulation_id,
             crater_id), bandwidth_<var>, (target), target_mean, n_obs,
             n_unique_sims, percentiles.
    """
    train.createOrReplaceTempView("train")
    test.createOrReplaceTempView("test")

    bandwidths_df = spark_session.createDataFrame(bandwidths).cache()
    bandwidths_df.createOrReplaceTempView("bandwidths")

    # Trick it into caching and broadcasting
    bandwidths_df.count()

    bandwidth_columns = list(bandwidths.columns)
    # Only predictors that have a bandwidth column can be constrained or reported.
    banded_variables = [var for var in predictor_variables if var in bandwidth_columns]

    select_items = [f"te.{var} as test_{var}" for var in predictor_variables]
    group_by_items = [f"te.{var}" for var in predictor_variables] + [f"b.{col}" for col in bandwidth_columns]
    join_conditions = ["1=1"] + [
        f"tr.{var} BETWEEN (te.{var} - b.{var}) AND (te.{var} + b.{var})"
        for var in banded_variables
    ]
    where_conditions = ["1=1"]

    if "simulation_id" in test.columns:
        # NOTE: this filter sits in WHERE rather than ON, so test rows with no
        # matching train row (NULL tr.simulation_id) are dropped from the output.
        where_conditions.append("tr.simulation_id <> te.simulation_id")
        select_items += ["te.simulation_id AS simulation_id", "te.crater_id AS crater_id"]
        group_by_items += ["te.simulation_id", "te.crater_id"]

    select_items += [f"b.{var} as bandwidth_{var}" for var in banded_variables]

    if "target" in test.columns:
        select_items.append("te.target AS target")
        group_by_items.append("te.target")

    select_clause = ",\n        ".join(select_items)
    join_clause = "\n            AND ".join(join_conditions)
    where_clause = "\n        AND ".join(where_conditions)
    group_by_clause = ",\n        ".join(group_by_items)

    query = f"""
    SELECT
        {select_clause},
        mean(tr.target) as target_mean,
        count(tr.target) as n_obs,
        count(distinct tr.simulation_id) as n_unique_sims,
        approx_percentile(tr.target, array(0.025, 0.975), 5000) as percentiles
    FROM
        test te
        INNER JOIN bandwidths b
        LEFT JOIN train tr
            ON {join_clause}
    WHERE
        {where_clause}
    GROUP BY
        {group_by_clause}
    """

    try:
        # Use the session that was passed in, not a notebook-level global.
        return spark_session.sql(query).toPandas()
    finally:
        # The result is already materialised in pandas; release the cached
        # bandwidths so repeated calls (e.g. from hyperopt) do not pile up.
        bandwidths_df.unpersist()


def create_objective_function(train: DataFrame,
                              test: DataFrame,
                              predictor_variables: List[str],
                              spark: SparkSession):
    """
    Creates an objective function for HyperOpt optimization

    :param train: Training data passed to get_confidence_intervals.
    :param test: Test data passed to get_confidence_intervals; must contain
                 `target` so interval coverage can be measured.
    :param predictor_variables: Predictor names. The sampled arguments are
                                expected under the keys "bandwidth_<name>".
    :param spark: Session passed to get_confidence_intervals.
    :return: A callable mapping a dict of sampled bandwidths to a HyperOpt
             result dict ("loss", "status" and diagnostics).
    """
    def objective_function(args):
        """
        Scores one bandwidth setting.

        The loss is the RMS width of the interval in orders of magnitude,
        multiplied by penalties for coverage away from 95%, for test points
        with too few matching observations, and for test points with too many.
        """
        min_obs = 100
        n_too_many_obs = 500000

        bandwidths = pd.DataFrame(
            [[args[f"bandwidth_{x}"] for x in predictor_variables]],
            columns=predictor_variables
        )

        result = get_confidence_intervals(train, test, predictor_variables, bandwidths, spark)

        ci_high = result.percentiles.map(lambda x: x[-1])
        ci_low = result.percentiles.map(lambda x: x[0])

        # Interval width in orders of magnitude, computed once and reused.
        log_ci_width = np.log10(ci_high / ci_low)
        orders_of_magnitude_rms = np.sqrt((log_ci_width ** 2).mean())
        orders_of_magnitude = log_ci_width.mean()
        orders_of_magnitude_stdev = log_ci_width.std()
        mean_n_unique_sims = result.n_unique_sims.mean()
        mean_n_obs = result.n_obs.mean()

        percent_inside_ci = ((result.target >= ci_low) & (result.target <= ci_high)).mean()

        # Penalize if more or less than 5% are outside of the CI
        deviance_from_95 = np.abs(0.95 - percent_inside_ci)
        deviance_loss_multiplier = 1.0 if deviance_from_95 < 0.01 else 1 + deviance_from_95 * 10.0

        # Penalize for the fraction with too few observations.
        # The mask is boolean so that .mean() is a fraction of test points,
        # matching the too-many penalty below (previously this averaged the
        # n_obs values themselves).
        too_few = (result.n_obs < min_obs) | (result.n_obs.isna())
        n_obs_loss_multiplier = 1 + too_few.mean() * 100 if too_few.shape[0] > 0 else 1.0

        # Penalize for the fraction with too many observations
        too_many = result.n_obs > n_too_many_obs
        n_too_many_obs_loss_multiplier = 1 + too_many.mean() * 10 if too_many.shape[0] > 0 else 1.0

        loss = orders_of_magnitude_rms * deviance_loss_multiplier * n_obs_loss_multiplier * n_too_many_obs_loss_multiplier

        print(f"{args}, {loss:.3f}, {orders_of_magnitude:.4f}, {orders_of_magnitude_stdev:.4f}, {orders_of_magnitude_rms:.4f}, {n_obs_loss_multiplier:.4f}, {n_too_many_obs_loss_multiplier:.4f}, {deviance_loss_multiplier:.4f}, {mean_n_unique_sims}, {mean_n_obs:.4f}, ")

        return {
            "loss": loss,
            "params": args,
            "orders_of_magnitude": orders_of_magnitude,
            "orders_of_magnitude_rms": orders_of_magnitude_rms,
            "n_obs_loss_multiplier": n_obs_loss_multiplier,
            "n_too_many_obs_loss_multiplier": n_too_many_obs_loss_multiplier,
            "deviance_loss_multiplier": deviance_loss_multiplier,
            "mean_n_unique_sims": mean_n_unique_sims,
            "mean_n_obs": mean_n_obs,
            "status": STATUS_OK
        }

    return objective_function

In [3]:
n_cores = 26

# Session settings, applied one by one to the builder below.
spark_settings = {
    "spark.sql.shuffle.partitions": "500",
    "spark.driver.memory": "60g",
    "spark.driver.maxResultSize": "8g",
}

session_builder = SparkSession.builder.master(f"local[{n_cores}]").appName("Saturation")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

24/02/01 15:05:38 WARN Utils: Your hostname, muninn resolves to a loopback address: 127.0.1.1; using 192.168.86.20 instead (on interface enp8s0)
24/02/01 15:05:38 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/01 15:05:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/02/01 15:05:38 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/02/01 15:05:38 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


## Prepare and save the dataset to disk

In [4]:
# base_path = "/data/saturation/n_craters_stop_condition_20230918"
base_path = "/data/saturation/thesis_run_20240130"

In [5]:
r_stat = 5
# study_region_size = 4000 * 4000 / r_stat ** 2
study_region_size = 2000 ** 2 / r_stat ** 2

raw_configs = read_configs(base_path, spark)
configs_df = create_configs_df(raw_configs)

raw_statistics = spark.read.parquet(f"{base_path}/*/statistics_*.parquet")

# Derived columns appended after the original ones:
#  - mean c2c nearest-neighbour distance stated in units of r_stat, log10-scaled
#  - log10 of areal density
#  - craters still in the study region divided by craters added to it
derived_columns = [
    F.log10(F.col("center_to_center_nearest_neighbor_distance_mean") / F.lit(r_stat)).alias("log_mean_c2c_nn_dist"),
    F.log10("areal_density").alias("log_ad"),
    (F.col("n_craters_in_study_region") / F.col("n_craters_added_in_study_region")).alias("information_remaining"),
]
data = raw_statistics.select("*", *derived_columns)

                                                                                

In [6]:
target = "n_craters_added_in_study_region"
predictor_variables = [
    "slope",
    "log_mean_c2c_nn_dist",
    "log_ad"
]

In [7]:
# One entry per dataset that a later section of this notebook reads back from
# disk (file names are derived from these two values in the loop below).
# Other combinations were tried during exploration: n_test_samples in
# {1000, 5000, 10000, 1e9} with train_sample_fraction in {0.05, 0.1, 0.25, 1.0}.
params_sets = [
    # Read by "Calibrating of bandwidths"
    {
        "n_test_samples": 1000,
        "train_sample_fraction": 0.05,
    },
    # Read by "Scoring the selected parameters"
    {
        "n_test_samples": 10000,
        "train_sample_fraction": 0.25,
    },
    # Read by "Evaluating the model by ranges of N"
    {
        "n_test_samples": int(1e9),
        "train_sample_fraction": 1.00,
    },
    # Read by "Scoring Dione Surfaces"
    {
        "n_test_samples": int(1e9),
        "train_sample_fraction": 0.25,
    },
]

# Set to True to rebuild datasets that already exist on disk.
force_rebuild = False

data_subset = data.where(
    (F.col("n_craters_added_in_study_region") > F.lit(50))
)

for params in params_sets:
    train_sample_fraction = params["train_sample_fraction"]
    n_test_samples = params["n_test_samples"]

    train_path = f"data/train_{train_sample_fraction:.3f}_{n_test_samples}.parquet"
    test_path = f"data/test_{train_sample_fraction:.3f}_{n_test_samples}.parquet"

    # Spark's default output committer writes a _SUCCESS marker when a write
    # job completes; use it to skip datasets that were already fully written.
    already_written = all(Path(path, "_SUCCESS").exists() for path in (train_path, test_path))
    if already_written and not force_rebuild:
        print(f"Skipping existing dataset: {train_path}, {test_path}")
        continue

    train, test = setup_datasets_for_model(
        data_subset,
        configs_df,
        0.1,
        predictor_variables,
        target,
        train_sample_fraction,
        n_test_samples,
        spark,
        cache_train=False,
        cache_test=False
    )

    train.coalesce(500).write.format("parquet").mode("overwrite").save(train_path)
    test.coalesce(50).write.format("parquet").mode("overwrite").save(test_path)

                                                                                

## Calibrating the bandwidths

In [None]:
target = "n_craters_added_in_study_region"
predictor_variables = [
    "slope",
    "log_mean_c2c_nn_dist",
    "log_ad"
]

# Force caching of train and test data, as they are small
train = spark.read.parquet("data/train_0.050_1000.parquet").cache()
train.count()

test = spark.read.parquet("data/test_0.050_1000.parquet").cache()
test.count()

# Search range (low, high) of each bandwidth; the keys double as the HyperOpt
# labels and as the argument names read by the objective function.
bandwidth_bounds = {
    "bandwidth_slope": (0.05, 0.3),
    "bandwidth_log_mean_c2c_nn_dist": (0.0005, 0.2),
    "bandwidth_log_ad": (0.001, 0.3),
}
space = {
    label: hp.uniform(label, low, high)
    for label, (low, high) in bandwidth_bounds.items()
}

trials = Trials()
objective = create_objective_function(train, test, predictor_variables, spark)

best = fmin(objective,
            space=space,
            algo=tpe.suggest,
            trials=trials,
            max_evals=500)

In [None]:
trials

In [None]:
"""
Oddly good losses for 0.050_1000:
{'bandwidth_log_ad': 0.09507393547881537, 'bandwidth_log_mean_c2c_nn_dist': 0.004143332056813422, 'bandwidth_slope': 0.2}, 5.547, 1.4063, 0.3337, 1.4453, 1.0000, 3.8381, 1.0000, 972.1454918032787, 380108.4395, 
{'bandwidth_log_ad': 0.0734955411086102, 'bandwidth_log_mean_c2c_nn_dist': 0.0038103947728301826, 'bandwidth_slope': 0.2}, 3.836, 1.4028, 0.3352, 1.4422, 1.0000, 2.6598, 1.0000, 878.1967213114754, 299167.2951,     

For 0.100_1000:
{'bandwidth_log_ad': 0.10846260454421022, 'bandwidth_log_mean_c2c_nn_dist': 0.0018803208882686748, 'bandwidth_slope': 0.2}, 6.136, 1.3785, 0.3344, 1.4185, 1.0000, 4.3260, 1.0000, 991.5437636761488, 398640.2495,           
"""

## Scoring Dione Surfaces

In [8]:
# train = spark.read.parquet("data/train_0.250_1000.parquet")
# train = spark.read.parquet("data/train_1.000_1000.parquet")
# train = spark.read.parquet(f"data/train_1.000_{int(1e9)}.parquet")
train = spark.read.parquet(f"data/train_0.250_{int(1e9)}.parquet")

In [9]:
target = "n_craters_added_in_study_region"
predictor_variables = [
    "slope",
    "log_mean_c2c_nn_dist",
    "log_ad"
]

observation_dee = {
    "slope": 2.10,
    "log_mean_c2c_nn_dist": 0.8136,
    "log_ad": np.log10(0.06)
}

observation_dsp = {
    "slope": 2.66,
    "log_mean_c2c_nn_dist": 0.6685,
    "log_ad": np.log10(0.13)
}

observation_dicp = {
    "slope": 2.08,
    "log_mean_c2c_nn_dist": 0.9338405903389017,
    "log_ad": np.log10(0.31)
}

observation_ddcp = {
    "slope": 2.15,
    "log_mean_c2c_nn_dist": 0.6685,
    "log_ad": np.log10(0.35)
}

observation_test1 = {
    "slope": 2.15,
    "log_mean_c2c_nn_dist": 0.5585,
    "log_ad": np.log10(0.50)
}

# bandwidths_data = [
#     # [0.1, 0.002, 0.30],
#     # [0.1, 0.001, 0.30],
#     # [0.1, 0.0005, 0.30],
#     # [0.1, 0.0001, 0.30],
    
#     # [0.1, 0.005, 0.30],
#     # [0.1, 0.0025, 0.30],
#     # [0.1, 0.005, 0.15],
#     # [0.1, 0.0025, 0.15],

#     # [0.1, 0.01, 0.15],
#     # [0.1, 0.05, 0.15],
#     # [0.1, 0.02, 0.30],

#     # This set relatively successful
#     # Wide seems to be good
#     # [0.15, 0.05, 0.25],
#     # [0.15, 0.075, 0.25],
#     # [0.15, 0.1, 0.25],

#     [0.1, 0.05, 0.2],
#     [0.1, 0.1, 0.2],
# ]
# bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist", "log_ad"])

bandwidths_data = [
    [0.1, 0.05, 0.2],
]
bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist", "log_ad"])

sensitivity_steps = {
    "slope": 0.1,
    "log_mean_c2c_nn_dist": 0.05,
    "log_ad": 0.2
}

n_steps = 2

In [10]:
# Dione Region 1: DEE
observation = observation_dee

test = pd.DataFrame(
    map(dict, itertools.product(*[
        [
            (k, observation[k] + x * sensitivity_steps[k])
            for x in range(-n_steps, n_steps + 1)
        ]
        for k, v in observation.items()
    ]))
)

test = spark.createDataFrame(test).cache()
test.count()

result = get_confidence_intervals(train, test, predictor_variables, bandwidths, spark)
dee_result = result
result

                                                                                

Unnamed: 0,test_slope,test_log_mean_c2c_nn_dist,test_log_ad,bandwidth_slope,bandwidth_log_mean_c2c_nn_dist,bandwidth_log_ad,target_mean,n_obs,n_unique_sims,percentiles
0,2.3,0.7136,-1.621849,0.1,0.05,0.2,,0,0,
1,2.0,0.8136,-1.421849,0.1,0.05,0.2,,0,0,
2,2.1,0.8136,-1.621849,0.1,0.05,0.2,,0,0,
3,2.1,0.9136,-0.821849,0.1,0.05,0.2,11249.281114,631017,869,"[2574, 31623]"
4,2.1,0.9136,-1.621849,0.1,0.05,0.2,,0,0,
...,...,...,...,...,...,...,...,...,...,...
120,1.9,0.7136,-1.421849,0.1,0.05,0.2,,0,0,
121,2.3,0.7136,-1.421849,0.1,0.05,0.2,,0,0,
122,1.9,0.8636,-1.621849,0.1,0.05,0.2,,0,0,
123,2.3,0.8636,-1.621849,0.1,0.05,0.2,,0,0,


In [11]:
result = dee_result
observation = observation_dee
result.to_parquet("data/dee.parquet")

result[
    (result.test_slope == observation["slope"])
    & (result.test_log_mean_c2c_nn_dist == observation["log_mean_c2c_nn_dist"])
    & (result.test_log_ad == observation["log_ad"])
]

Unnamed: 0,test_slope,test_log_mean_c2c_nn_dist,test_log_ad,bandwidth_slope,bandwidth_log_mean_c2c_nn_dist,bandwidth_log_ad,target_mean,n_obs,n_unique_sims,percentiles
88,2.1,0.8136,-1.221849,0.1,0.05,0.2,3689.881951,3219,47,"[3269, 4405]"


In [12]:
np.log10(
    result[
        (result.test_slope == observation["slope"])
        & (result.test_log_mean_c2c_nn_dist == observation["log_mean_c2c_nn_dist"])
        & (result.test_log_ad == observation["log_ad"])
    ].percentiles.iloc[0]
)

array([3.51441492, 3.64394591])

In [13]:
magnitudes = np.log10(
    result[
        (result.test_slope == observation["slope"])
        & (result.test_log_mean_c2c_nn_dist == observation["log_mean_c2c_nn_dist"])
        & (result.test_log_ad == observation["log_ad"])
    ].percentiles.iloc[0]
)
magnitudes[1] - magnitudes[0]

0.1295309921676977

In [14]:
# Dione region 2: DSP
observation = observation_dsp

test = pd.DataFrame(
    map(dict, itertools.product(*[
        [
            (k, observation[k] + x * sensitivity_steps[k])
            for x in range(-n_steps, n_steps + 1)
        ]
        for k, v in observation.items()
    ]))
)

test = spark.createDataFrame(test).cache()
test.count()

result = get_confidence_intervals(train, test, predictor_variables, bandwidths, spark)
dsp_result = result
result

                                                                                

Unnamed: 0,test_slope,test_log_mean_c2c_nn_dist,test_log_ad,bandwidth_slope,bandwidth_log_mean_c2c_nn_dist,bandwidth_log_ad,target_mean,n_obs,n_unique_sims,percentiles
0,2.76,0.6685,-0.886057,0.1,0.05,0.2,16660.765661,570081,208,"[7799, 32845]"
1,2.56,0.7185,-1.086057,0.1,0.05,0.2,6799.546719,111626,308,"[5327, 9159]"
2,2.66,0.6685,-0.486057,0.1,0.05,0.2,52845.705449,3185290,229,"[16767, 96428]"
3,2.46,0.7685,-1.286057,0.1,0.05,0.2,4337.163399,5814,83,"[3940, 5181]"
4,2.76,0.7185,-1.286057,0.1,0.05,0.2,5446.275021,3527,47,"[4937, 6204]"
...,...,...,...,...,...,...,...,...,...,...
120,2.46,0.7685,-1.086057,0.1,0.05,0.2,6067.519954,289516,565,"[4275, 9229]"
121,2.86,0.6685,-0.486057,0.1,0.05,0.2,56402.598293,2091260,172,"[22785, 94303]"
122,2.56,0.7185,-0.886057,0.1,0.05,0.2,14136.854893,912280,442,"[5946, 37213]"
123,2.46,0.6185,-1.086057,0.1,0.05,0.2,,0,0,


In [15]:
result = dsp_result
observation = observation_dsp
result.to_parquet("data/dsp.parquet")

result[
    (result.test_slope == observation["slope"])
    & (result.test_log_mean_c2c_nn_dist == observation["log_mean_c2c_nn_dist"])
    & (result.test_log_ad == observation["log_ad"])
]

Unnamed: 0,test_slope,test_log_mean_c2c_nn_dist,test_log_ad,bandwidth_slope,bandwidth_log_mean_c2c_nn_dist,bandwidth_log_ad,target_mean,n_obs,n_unique_sims,percentiles
80,2.66,0.6685,-0.886057,0.1,0.05,0.2,13536.430665,402817,222,"[7520, 23801]"


In [16]:
np.log10(
    result[
        (result.test_slope == observation["slope"])
        & (result.test_log_mean_c2c_nn_dist == observation["log_mean_c2c_nn_dist"])
        & (result.test_log_ad == observation["log_ad"])
    ].percentiles.iloc[0]
)

array([3.87621784, 4.3765952 ])

In [17]:
magnitudes = np.log10(
    result[
        (result.test_slope == observation["slope"])
        & (result.test_log_mean_c2c_nn_dist == observation["log_mean_c2c_nn_dist"])
        & (result.test_log_ad == observation["log_ad"])
    ].percentiles.iloc[0]
)
magnitudes[1] - magnitudes[0]

0.5003773637488331

In [18]:
# Dione region 3: DICP
observation = observation_dicp

test = pd.DataFrame(
    map(dict, itertools.product(*[
        [
            (k, observation[k] + x * sensitivity_steps[k])
            for x in range(-n_steps, n_steps + 1)
        ]
        for k, v in observation.items()
    ]))
)

test = spark.createDataFrame(test).cache()
test.count()

result = get_confidence_intervals(train, test, predictor_variables, bandwidths, spark)
dicp_result = result
result

                                                                                

Unnamed: 0,test_slope,test_log_mean_c2c_nn_dist,test_log_ad,bandwidth_slope,bandwidth_log_mean_c2c_nn_dist,bandwidth_log_ad,target_mean,n_obs,n_unique_sims,percentiles
0,1.98,0.933841,-0.708638,0.1,0.05,0.2,9507.845943,433436,695,"[2387, 27070]"
1,1.98,0.933841,-0.108638,0.1,0.05,0.2,,0,0,
2,1.88,0.833841,-0.708638,0.1,0.05,0.2,17111.216676,1650287,847,"[3838, 41659]"
3,2.08,1.033841,-0.708638,0.1,0.05,0.2,2235.601426,13606,96,"[1370, 4976]"
4,1.88,0.933841,-0.508638,0.1,0.05,0.2,10594.824722,363315,508,"[2651, 25100]"
...,...,...,...,...,...,...,...,...,...,...
120,1.98,0.833841,-0.708638,0.1,0.05,0.2,18200.073482,1978131,872,"[3903, 44052]"
121,1.98,0.933841,-0.308638,0.1,0.05,0.2,12660.859887,17807,49,"[3316, 24582]"
122,2.28,0.883841,-0.308638,0.1,0.05,0.2,25562.101586,14756,17,"[5191, 36008]"
123,2.28,1.033841,-0.708638,0.1,0.05,0.2,5590.167619,5775,29,"[1362, 19977]"


In [19]:
result = dicp_result
observation = observation_dicp
result.to_parquet("data/dicp.parquet")

result[
    (result.test_slope == observation["slope"])
    & (result.test_log_mean_c2c_nn_dist == observation["log_mean_c2c_nn_dist"])
    & (result.test_log_ad == observation["log_ad"])
]

Unnamed: 0,test_slope,test_log_mean_c2c_nn_dist,test_log_ad,bandwidth_slope,bandwidth_log_mean_c2c_nn_dist,bandwidth_log_ad,target_mean,n_obs,n_unique_sims,percentiles
101,2.08,0.933841,-0.508638,0.1,0.05,0.2,13856.210884,103635,142,"[2891, 27336]"


In [20]:
np.log10(
    result[
        (result.test_slope == observation["slope"])
        & (result.test_log_mean_c2c_nn_dist == observation["log_mean_c2c_nn_dist"])
        & (result.test_log_ad == observation["log_ad"])
    ].percentiles.iloc[0]
)

array([3.46104809, 4.43673497])

In [21]:
magnitudes = np.log10(
    result[
        (result.test_slope == observation["slope"])
        & (result.test_log_mean_c2c_nn_dist == observation["log_mean_c2c_nn_dist"])
        & (result.test_log_ad == observation["log_ad"])
    ].percentiles.iloc[0]
)
magnitudes[1] - magnitudes[0]

0.9756868741200488

In [22]:
# Dione region 4: DDCP
observation = observation_ddcp

test = pd.DataFrame(
    map(dict, itertools.product(*[
        [
            (k, observation[k] + x * sensitivity_steps[k])
            for x in range(-n_steps, n_steps + 1)
        ]
        for k, v in observation.items()
    ]))
)

test = spark.createDataFrame(test).cache()
test.count()

result = get_confidence_intervals(train, test, predictor_variables, bandwidths, spark)
ddcp_result = result
result

                                                                                

Unnamed: 0,test_slope,test_log_mean_c2c_nn_dist,test_log_ad,bandwidth_slope,bandwidth_log_mean_c2c_nn_dist,bandwidth_log_ad,target_mean,n_obs,n_unique_sims,percentiles
0,2.25,0.7685,-0.255932,0.1,0.05,0.2,41207.646334,699057,216,"[15031, 66104]"
1,2.15,0.6185,-0.255932,0.1,0.05,0.2,58497.585641,4412934,305,"[15491, 112096]"
2,2.25,0.7185,-0.855932,0.1,0.05,0.2,8961.182452,414636,523,"[5748, 15369]"
3,2.35,0.7685,-0.455932,0.1,0.05,0.2,36926.204441,3332035,470,"[11426, 64885]"
4,2.15,0.7185,-0.455932,0.1,0.05,0.2,37890.026736,5402010,662,"[8934, 74706]"
...,...,...,...,...,...,...,...,...,...,...
120,2.05,0.5685,-0.855932,0.1,0.05,0.2,,0,0,
121,2.05,0.7685,-0.255932,0.1,0.05,0.2,33962.303169,1520423,434,"[9767, 61051]"
122,2.25,0.6185,-0.855932,0.1,0.05,0.2,9942.858333,4320,28,"[8706, 11483]"
123,2.25,0.5685,-0.055932,0.1,0.05,0.2,87914.685538,2002324,133,"[32745, 147049]"


In [23]:
result = ddcp_result
observation = observation_ddcp
result.to_parquet("data/ddcp.parquet")

result[
    (result.test_slope == observation["slope"])
    & (result.test_log_mean_c2c_nn_dist == observation["log_mean_c2c_nn_dist"])
    & (result.test_log_ad == observation["log_ad"])
]

Unnamed: 0,test_slope,test_log_mean_c2c_nn_dist,test_log_ad,bandwidth_slope,bandwidth_log_mean_c2c_nn_dist,bandwidth_log_ad,target_mean,n_obs,n_unique_sims,percentiles
61,2.15,0.6685,-0.455932,0.1,0.05,0.2,45055.170166,5104284,491,"[10331, 89955]"


In [24]:
np.log10(
    result[
        (result.test_slope == observation["slope"])
        & (result.test_log_mean_c2c_nn_dist == observation["log_mean_c2c_nn_dist"])
        & (result.test_log_ad == observation["log_ad"])
    ].percentiles.iloc[0]
)

array([4.01414236, 4.95402531])

In [25]:
magnitudes = np.log10(
    result[
        (result.test_slope == observation["slope"])
        & (result.test_log_mean_c2c_nn_dist == observation["log_mean_c2c_nn_dist"])
        & (result.test_log_ad == observation["log_ad"])
    ].percentiles.iloc[0]
)
magnitudes[1] - magnitudes[0]

0.9398829463484546

In [30]:
# Test hypothetical region 1
target = "n_craters_added_in_study_region"
predictor_variables = [
    "slope",
    "log_mean_c2c_nn_dist",
    # "log_ad"
]

observation_test1 = {
    "slope": 2.15,
    "log_mean_c2c_nn_dist": 0.90,
    # "log_ad": np.log10(0.50)
}

bandwidths_data = [
    [0.1, 0.025],
]
bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist"])

sensitivity_steps = {
    "slope": 0.1,
    "log_mean_c2c_nn_dist": 0.01,
}
n_steps = 3


observation = observation_test1

test = pd.DataFrame(
    map(dict, itertools.product(*[
        [
            (k, observation[k] + x * sensitivity_steps[k])
            for x in range(-n_steps, n_steps + 1)
        ]
        for k, v in observation.items()
    ]))
)

test = spark.createDataFrame(test).cache()
test.count()

result = get_confidence_intervals(train, test, predictor_variables, bandwidths, spark)
test1_result = result
result.sort_values(["test_slope", "test_log_mean_c2c_nn_dist"])

                                                                                

Unnamed: 0,test_slope,test_log_mean_c2c_nn_dist,bandwidth_slope,bandwidth_log_mean_c2c_nn_dist,target_mean,n_obs,n_unique_sims,percentiles
33,1.85,0.87,0.1,0.025,14028.249113,1027742,903,"[3480, 32243]"
47,1.85,0.88,0.1,0.025,13107.534601,919938,909,"[3275, 30206]"
13,1.85,0.89,0.1,0.025,12143.076156,808162,912,"[3079, 28469]"
6,1.85,0.9,0.1,0.025,11149.820396,698148,915,"[2894, 27201]"
12,1.85,0.91,0.1,0.025,10106.314557,598172,916,"[2724, 25552]"
17,1.85,0.92,0.1,0.025,9012.532412,505107,917,"[2561, 24445]"
24,1.85,0.93,0.1,0.025,7817.618727,420166,917,"[2409, 22526]"
46,1.95,0.87,0.1,0.025,14224.638606,900762,902,"[3327, 33722]"
44,1.95,0.88,0.1,0.025,13185.927923,792780,909,"[3130, 31756]"
29,1.95,0.89,0.1,0.025,11974.640805,684280,912,"[2947, 29920]"


In [31]:
magnitudes = np.log10(
    result[
        (result.test_slope == observation["slope"])
        & (result.test_log_mean_c2c_nn_dist == observation["log_mean_c2c_nn_dist"])
        # & (result.test_log_ad == observation["log_ad"])
    ].percentiles.iloc[0]
)
magnitudes[1] - magnitudes[0]

1.0756664237346723

In [None]:
# Reload results
dee_result = pd.read_parquet("data/dee.parquet")
ddcp_result = pd.read_parquet("data/ddcp.parquet")
dsp_result = pd.read_parquet("data/dsp.parquet")
dicp_result = pd.read_parquet("data/dicp.parquet")

In [None]:
def _percentiles_to_list(value):
    """
    Normalises one `percentiles` cell to a list of ints, or None when missing.

    Handles the text form produced by a CSV round trip ("[3269, 4405]"), a
    missing value (None or float NaN), and the sequence form that a parquet
    round trip returns (e.g. a numpy array).
    """
    if value is None or (isinstance(value, float) and np.isnan(value)):
        return None
    if isinstance(value, str):
        stripped = value.strip("[]").strip()
        return [int(y) for y in stripped.split(",")] if stripped else None
    as_list = [int(y) for y in value]
    return as_list or None


# Downstream cells test `if x` on each entry, which needs plain lists / None.
for _reloaded in (dee_result, ddcp_result, dsp_result, dicp_result):
    _reloaded["percentiles"] = [_percentiles_to_list(x) for x in _reloaded.percentiles]

In [None]:
# Plots of orders of magnitude
observation = observation_dee
result = dee_result
result["orders_of_magnitude"] = [np.log10(x[1] / x[0]) if x else None for x in result.loc[:, "percentiles"]] 

r = result[
    (result.bandwidth_log_mean_c2c_nn_dist == 0.1)
    & (~result.orders_of_magnitude.isna())
].copy()
fig = px.scatter_3d(
    r,
    x="test_slope",
    y="test_log_mean_c2c_nn_dist",
    z="test_log_ad",
    color="orders_of_magnitude",
    size_max=1,
    width=1000,
    height=800
)
fig.update_traces(marker={"size":7})
fig.show()

In [None]:
observation = observation_dsp
result = dsp_result
result["orders_of_magnitude"] = [np.log10(x[1] / x[0]) if x else None for x in result.loc[:, "percentiles"]] 

In [None]:
result[
    #(result.bandwidth_log_mean_c2c_nn_dist == 0.1)
    (result.test_slope == observation["slope"])
    & (result.test_log_mean_c2c_nn_dist == observation["log_mean_c2c_nn_dist"])
    & (result.test_log_ad == observation["log_ad"])
]

## Scoring the selected parameters

In [None]:
train = spark.read.parquet("data/train_0.250_10000.parquet")
test = spark.read.parquet("data/test_0.250_10000.parquet").limit(5000).cache()
test.count()

In [None]:
target = "n_craters_added_in_study_region"
predictor_variables = [
    "slope",
    "log_mean_c2c_nn_dist",
    "log_ad"
]

bandwidths_data = [
    [0.1, 0.1, 0.2],
]
bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist", "log_ad"])

result = get_confidence_intervals(train, test, predictor_variables, bandwidths, spark)

In [None]:
# result = pd.read_csv("data/scored.csv")
# # result["percentiles"] = [[int(y) for y in x.strip("[]").split(", ")] if type(x) != float else None for x in result.percentiles]
# result

In [None]:
# result.to_csv("data/scored.csv")
result.to_parquet("data/scored.parquet")

In [None]:
result = pd.read_parquet("data/scored.parquet")
result

In [None]:
result["orders_of_magnitude"] = [np.log10(x[1] / x[0]) if x else None for x in result.loc[:, "percentiles"]] 

In [None]:
plt.hist(result.target, bins=50)
plt.show()

In [None]:
ci_high = result.percentiles.map(lambda x: x[-1])
ci_low = result.percentiles.map(lambda x: x[0])
inside_ci = ((result.target >= ci_low) & (result.target <= ci_high))
percent_inside_ci = inside_ci.mean()

In [None]:
# FIXME: unfinished cell - plt.scatter is called without the x and y data it
# requires, so running this cell raises. Supply the series to plot or delete
# the cell.
plt.scatter(
    
)

## Evaluating the model by ranges of N

In [None]:
train = spark.read.parquet(f"data/train_1.000_{int(1e9)}.parquet")
test = spark.read.parquet(f"data/test_1.000_{int(1e9)}.parquet").cache()
test.count()

In [None]:
target = "n_craters_added_in_study_region"
predictor_variables = [
    "slope",
    "log_mean_c2c_nn_dist",
    "log_ad"
]

# Test points are binned by target on a log10 grid from 100 to 200000.
log_n_start = np.log10(100)
log_n_end = np.log10(200000)
n_steps = 25
n_tests_per_step = 1000
step_size = (log_n_end - log_n_start) / (n_steps - 1)


# bandwidths_data = [
#     [0.05, 0.01, 0.2],
# ]

# Result: CI's cover way more than 95%
# bandwidths_data = [
#     [0.1, 0.05, 0.2],
# ]

# Defined explicitly so the cell does not depend on whichever earlier cell last
# set `bandwidths_data`. The values are those of the preceding "Scoring the
# selected parameters" section, which is what a top-to-bottom run used here.
bandwidths_data = [
    [0.1, 0.1, 0.2],
]
bandwidths = pd.DataFrame(bandwidths_data, columns=["slope", "log_mean_c2c_nn_dist", "log_ad"])

results = dict()
for step in range(n_steps):
    print(f"Step {step}...")

    # Bin centred on log_n_target and one step wide in log10 space.
    log_n_target = log_n_start + step_size * step
    test_subset = test.where(
        test.target.between(10**(log_n_target - step_size / 2), 10**(log_n_target + step_size / 2))
    ).orderBy(F.rand()).limit(n_tests_per_step).cache()
    test_subset.count()

    try:
        r = get_confidence_intervals(train, test_subset, predictor_variables, bandwidths, spark)
    finally:
        # The result is a pandas frame; release this step's cached subset so
        # the cache does not accumulate one entry per step.
        test_subset.unpersist()

    r["orders_of_magnitude"] = [np.log10(x[1] / x[0]) if x else None for x in r.loc[:, "percentiles"]]
    ci_high = r.percentiles.map(lambda x: x[-1] if x else None)
    ci_low = r.percentiles.map(lambda x: x[0] if x else None)
    r["inside_ci"] = ((r.target >= ci_low) & (r.target <= ci_high))

    print(f"% inside CI: {r.inside_ci.mean()}")
    print(f"Mean orders of magnitude: {r.orders_of_magnitude.mean()}")

    results[step] = r

In [None]:
# old_results = results
# old_results_90_big = results
# results_only_two_vars = results

In [None]:
# FIXME: `old_results` is only assigned in commented-out code in this notebook,
# so on a fresh kernel this line raises NameError. Leftover from switching
# between saved result sets by hand.
results = old_results

In [None]:
results[0]

In [None]:
all = pd.concat([x for x in results.values()], axis=0)
all.inside_ci.mean()

In [None]:
all.orders_of_magnitude.mean()

In [None]:
[results[x].inside_ci.mean() for x in range(len(results))]

In [None]:
n_steps = 25
plt.scatter(
    [log_n_start + x * step_size for x in range(n_steps)],
    [results[x].inside_ci.mean() for x in range(n_steps)],
)

In [None]:
[results[x].orders_of_magnitude.mean() for x in range(len(results))]

### Visualization of log_N vs NN_d

In [None]:
def join_configs(data: DataFrame, configs: DataFrame) -> DataFrame:
    """
    Inner-joins simulation data to its configuration on `simulation_id`.

    Side effect: (re)registers the temp views "data" and "configs". The query
    runs on the notebook-level `spark` session and selects every column of
    both views.
    """
    for view_name, frame in (("data", data), ("configs", configs)):
        frame.createOrReplaceTempView(view_name)

    # Join data and config
    join_sql = """
    SELECT
        *
    FROM
        data
        INNER JOIN configs
            ON data.simulation_id = configs.simulation_id
    """
    return spark.sql(join_sql)

In [None]:
data_subset = data.where(
    (F.col("information_remaining") > F.lit(0.25))
    & (F.col("n_craters_added_in_study_region") > F.lit(50))
).sample(0.00025)

df = join_configs(data_subset, configs_df).toPandas()

In [None]:
df["log_center_to_center_nearest_neighbor_distance_mean"] = np.log10(df.center_to_center_nearest_neighbor_distance_mean)
df["log_n_craters_added_in_study_region"] = np.log10(df.n_craters_added_in_study_region)

In [None]:
# log10 of total craters added against log10 mean nearest-neighbour distance,
# coloured by slope.
fig = px.scatter(
    df,
    x="log_mean_c2c_nn_dist",
    y="log_n_craters_added_in_study_region",
    color="slope",
    hover_data=["slope", "n_craters_added_in_study_region", "areal_density", "n_craters_in_study_region"],
    size_max=1,
    width=1600,
    height=600,
)
fig.update_layout(
    xaxis_title=dict(
        # Raw string: "\o" is not a valid escape sequence in a normal string.
        text=r"$log_{10}(\overline{NN_d})$",
        font=dict(size=18)
    ),
    yaxis_title=dict(
        text="$log_{10}(N_{tot})$",
        font=dict(size=18)
    ),
)
fig.update_traces(marker={"size":3})
fig.show()

In [None]:
# Trying a range selector
to_show = df.copy()
to_show["slope_selector"] = ((to_show.slope - 1) * 10).astype("int") / 10 + 1
to_show = to_show.sort_values("slope_selector")
to_show["index"] = range(to_show.shape[0])

range_x = [to_show.log_mean_c2c_nn_dist.min(), to_show.log_mean_c2c_nn_dist.max()]
range_y = [to_show.log_n_craters_added_in_study_region.min(), to_show.log_n_craters_added_in_study_region.max()]

fig = px.scatter(
    to_show,
    x="log_mean_c2c_nn_dist",
    y="log_n_craters_added_in_study_region",
    hover_data=["slope", "n_craters_added_in_study_region"],
    size_max=1,
    animation_frame="slope_selector",
    animation_group="index",
    range_x=range_x,
    range_y=range_y,
    width=1000,
    height=500
)
fig.update_layout(
    xaxis_title=dict(
        text="$log_{10}(\overline{NN_d})$",
        font=dict(size=18)
    ),
    yaxis_title=dict(
        text="$log(N_{tot})$",
        font=dict(size=18)
    ),
)
fig.update_traces(marker={"size":3})
fig.show()

## Plotting single simulations

In [None]:
cdf = configs_df.toPandas()

In [None]:
cdf

In [None]:
# Select one simulation's rows from the Spark `data` frame. The condition uses
# F.col rather than `df.simulation_id`: at this point `df` is still the pandas
# frame from the visualization section, not a Spark DataFrame.
df = data.where(
    (F.col("information_remaining") > F.lit(0.05))
    & (F.col("n_craters_added_in_study_region") > F.lit(50))
    & (F.col("simulation_id") == F.lit(1742))
    # & (F.col("simulation_id") == F.lit(7230))
)

In [None]:
n_pts = 10000
n_obs = df.count()

if n_obs > n_pts:
    sample_fraction = n_pts / n_obs
    df = df.sample(sample_fraction)

pandas_df = df.toPandas()

In [None]:
pandas_df

In [None]:
# plt.plot joins points in row order, so order by crater count first to draw
# a single trajectory.
ordered_df = pandas_df.sort_values("n_craters_added_in_study_region")

plt.figure(figsize=(10, 4))
plt.plot(
    np.log10(ordered_df.n_craters_added_in_study_region),
    ordered_df.log_mean_c2c_nn_dist,
)
plt.title("Single Simulation, b=1.65")
plt.xlabel("$log_{10}(N_{tot})$", size=14)
# Raw string: "\o" is not a valid escape sequence in a normal string.
plt.ylabel(r"$log_{10}(\overline{NN_d})$", size=14)
plt.show()

In [None]:
n_pts = 10000

# Simulation 1742 plus up to four simulations drawn from the slope range.
# The draw is seeded so the figure is reproducible.
candidate_sims = cdf[cdf.slope.between(1.66, 1.68)]
simulation_ids = list(
    candidate_sims.sample(min(4, len(candidate_sims)), random_state=0).simulation_id
)
simulation_ids = [1742] + simulation_ids

plt.figure(figsize=(10, 4))

for idx, simulation_id in enumerate(simulation_ids):
    # Filter by column name so the condition does not depend on whatever `df`
    # held in the previous iteration or cell.
    df = data.where(
        (F.col("information_remaining") > F.lit(0.05))
        & (F.col("n_craters_added_in_study_region") > F.lit(50))
        & (F.col("simulation_id") == F.lit(int(simulation_id)))
    )

    n_obs = df.count()
    if n_obs == 0:
        # Nothing to plot for this simulation (also avoids dividing by zero).
        continue
    if n_obs > n_pts:
        # Only downsample when there are more rows than wanted, so the
        # sampling fraction stays within [0, 1].
        df = df.sample(n_pts / n_obs)

    pandas_df = df.toPandas()
    pandas_df = pandas_df.sort_values("n_craters_added_in_study_region")

    plt.plot(
        np.log10(pandas_df.n_craters_added_in_study_region),
        pandas_df.log_mean_c2c_nn_dist,
        label = f"Sim {idx + 1}"
    )

# Raw strings: "\i" and "\o" are not valid escape sequences in normal strings.
plt.title(r"Simulations With $b \in (1.65, 1.70)$")
plt.xlabel("$log_{10}(N_{tot})$", size=14)
plt.ylabel(r"$log_{10}(\overline{NN_d})$", size=14)
plt.legend()
plt.show()