In [1]:
import sys

import pandas as pd
import numpy as np
from pathlib import Path
import yaml
from typing import *

import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, DataFrame, Window

In [2]:
# Number of local threads requested from Spark; interpolated into the master URL below.
n_cores = 30

# Build the notebook-wide SparkSession (getOrCreate reuses an existing session if one is active).
spark = (SparkSession.builder
         .master(f"local[{n_cores}]")
         .appName("Saturation")
         .config("spark.sql.shuffle.partitions", "1000")
         .config("spark.driver.memory", "64g")
         .config("spark.driver.maxResultSize", "8g")
         .getOrCreate())

23/10/22 17:13:40 WARN Utils: Your hostname, muninn resolves to a loopback address: 127.0.1.1; using 192.168.86.20 instead (on interface enp8s0)
23/10/22 17:13:40 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/22 17:13:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/10/22 17:13:41 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/10/22 17:13:41 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [3]:
def read_config(path: Path) -> Dict:
    """Parse the YAML file at ``path`` with ``yaml.safe_load`` and return the result."""
    with path.open("r") as handle:
        return yaml.safe_load(handle)


def read_configs(base_path: str, spark_session: SparkSession) -> pyspark.RDD:
    """
    Load the config of every subdirectory of ``base_path`` that contains a ``completed.txt``.

    For each ``<base_path>/*/completed.txt`` found, the sibling ``config.yaml`` is parsed
    with ``read_config``; the parsed configs are distributed as an RDD through the
    SparkContext of ``spark_session``.
    """
    marker_files = list(Path(base_path).glob("*/completed.txt"))
    parsed_configs = (
        read_config(marker.parent / "config.yaml")
        for marker in marker_files
    )
    return spark_session.sparkContext.parallelize(parsed_configs)


def create_configs_df(configs: pyspark.RDD) -> DataFrame:
    """
    Convert an RDD of config mappings into a DataFrame.

    Only the keys listed below are kept from each config; all other entries are dropped
    before the RDD is converted with ``toDF()``.
    """
    wanted_keys = [
        "simulation_id",
        "slope",
        "r_stat_multiplier",
        "effective_radius_multiplier",
        "min_rim_percentage"
    ]

    def keep_wanted(config):
        # Project a single config down to the wanted keys.
        return {key: value for key, value in config.items() if key in wanted_keys}

    return configs.map(keep_wanted).toDF()

def add_post_saturation_percentiles(data: DataFrame, column: str) -> DataFrame:
    """
    Label every row of ``data`` with the post-saturation percentile bucket of ``column``.

    For each ``simulation_id`` the rows are ordered by ``n_craters_added_in_study_region``
    and the last third (rows at or after position ``int(n_rows / 3 * 2)``) is treated as
    post-saturation. The approximate 1st..99th percentiles of ``column`` over those rows
    define 100 half-open intervals ``[lower, upper)``; each row of ``data`` is joined to
    the interval of its own simulation that contains its ``column`` value.

    Parameters
    ----------
    data:
        DataFrame containing at least ``simulation_id``,
        ``n_craters_added_in_study_region`` and ``column``.
    column:
        Name of the column whose percentiles are computed.

    Returns
    -------
    ``data`` with an added ``post_saturation_{column}_percentile`` column holding the
    bucket label (0.00, 0.01, ..., 0.99). Rows whose value falls in none of the
    intervals (e.g. outside the +/- 2**33 sentinels) are dropped by the filters.
    """
    # Spark type name of the target column; used to cast the sentinel bounds below.
    col_dtype = dict(data.dtypes)[column]

    # Select all points post-saturation - last 1/3 of each simulation
    window = Window.partitionBy("simulation_id").orderBy(F.col("n_craters_added_in_study_region"))
    with_row_number = data.withColumn("row_number", F.row_number().over(window))

    # n_rows is the highest row number per simulation; saturation_point marks the 2/3 position.
    saturation_points = with_row_number.groupby("simulation_id").agg(F.max("row_number").alias("n_rows"))
    saturation_points = saturation_points.withColumn("saturation_point", (F.col("n_rows") / 3 * 2).cast("int"))

    # Keep only rows at or after the saturation point, then drop the helper columns.
    with_saturation_points = with_row_number.join(saturation_points, on="simulation_id", how="inner")
    post_saturation = (
        with_saturation_points
        .filter(F.col("row_number") - F.col("saturation_point") >= 0)
        .drop("row_number")
        .drop("saturation_point")
        .drop("n_rows")
    )

    # Calculate post-saturation percentiles for each simulation
    # Create a "lookup table" of percentiles by simulation to join to
    # "percentiles" holds 99 values (p01..p99). Zipping three 100-element arrays yields one
    # struct per bucket: (label, lower bound, upper bound).
    #   labels: 0.00 .. 0.99
    #   lower:  [-2**33, p01, ..., p99]  (sentinel inserted at 1-based position 1)
    #   upper:  [p01, ..., p99, 2**33]   (sentinel inserted at 1-based position 100)
    percentile_lookup = (
        post_saturation
        .groupby("simulation_id")
        .agg(
            F.percentile_approx(column, F.array(*[F.lit(x / 100.0) for x in range(1, 100)]), 10000).alias("percentiles")
        )
        .select(
            "simulation_id",
            F.explode(
                F.arrays_zip(
                    F.array(*[F.lit(x / 100) for x in range(0, 100)]),
                    F.array_insert("percentiles", 1, F.lit(-2 ** 33).cast(col_dtype)),
                    F.array_insert("percentiles", 100, F.lit(2 ** 33).cast(col_dtype)),
                )
            ).alias("percentile_array")
        )
        .select(
            "simulation_id",
            # Struct fields of the zipped arrays are addressed by position: "0", "1", "2".
            F.col("percentile_array")["0"].alias(f"post_saturation_{column}_percentile"),
            F.col("percentile_array")["1"].alias("lower"),
            F.col("percentile_array")["2"].alias("upper"),
        )
    ).cache()

    # Join back to the full dataframe to add percentiles to each observation
    # The join pairs every row with all buckets of its simulation; the two filters keep
    # only the bucket whose [lower, upper) interval contains the row's value.
    result = (
        data
        .join(percentile_lookup, on="simulation_id")
        .filter(data[column] >= percentile_lookup.lower)
        .filter(data[column] < percentile_lookup.upper)
        .drop("lower", "upper")
    )

    return result


def quantize_value(value: float, delta: float) -> float:
    """
    Snap ``value`` onto the grid of integer multiples of ``delta``.

    The ratio ``value / delta`` is rounded to 5 decimals (absorbing floating-point noise
    such as 2.9999999 -> 3.0) and then truncated toward zero with ``int``. The resulting
    multiple of ``delta`` is rounded to 5 decimals before being returned.
    """
    n_steps = int(np.round(value / delta, decimals=5))
    return np.round(n_steps * delta, decimals=5)


def get_min_max_n_buckets(column: str, delta: float, df) -> Tuple[float, float, int]:
    """
    Compute the quantized range of ``column`` in ``df`` and the number of ``delta`` buckets.

    The minimum and maximum of ``column`` are collected to the driver, each is quantized
    with ``quantize_value``, and the bucket count is the number of ``delta`` steps between
    the two quantized bounds plus one.

    Returns ``(min_val, max_val, n_buckets)``.
    """
    extremes = df.select(F.min(column), F.max(column)).toPandas()

    lower = quantize_value(extremes.iloc[0, 0], delta)
    upper = quantize_value(extremes.iloc[0, 1], delta)
    span_in_steps = np.round((upper - lower) / delta, decimals=5)

    return lower, upper, int(span_in_steps) + 1

In [4]:
# Root directory of the simulation outputs: read_configs globs it for */completed.txt, and
# the statistics parquet files are read from its immediate subdirectories.
base_path = "/data/saturation/n_craters_stop_condition_20230918"

In [9]:
# Divisor applied to the mean nearest-neighbor distance before log10 when features are built.
r_stat = 5

# One row of selected config values per subdirectory that contains a completed.txt marker.
configs_df = create_configs_df(read_configs(base_path, spark))
# Statistics rows from every <base_path>/*/statistics_*.parquet file, read as one DataFrame.
data = spark.read.parquet(f"{base_path}/*/statistics_*.parquet")
# Attach the config values to each statistics row; configs_df is marked for broadcast to the join.
data_with_configs = data.join(F.broadcast(configs_df), on="simulation_id")

                                                                                

In [57]:
# Predictor variables
features = [
    "log_mean_c2c_nn_dist",
    "slope"
]
# Non-predictor columns carried along to identify each row.
additional_variables = [
    "simulation_id",
    "crater_id",
    "n_craters_added_in_study_region"
]

# Add log mean c2c nn distance
# (log10 of the mean center-to-center nearest-neighbor distance divided by r_stat)
data_with_features = data_with_configs.select("*",
                                              F.log10(F.col("center_to_center_nearest_neighbor_distance_mean") / F.lit(r_stat)).alias("log_mean_c2c_nn_dist"))
# Keep only the predictor and identifier columns.
data_with_features = data_with_features.select(*(features + additional_variables))

# Get means/standard deviations in order to standardize features
# A single aggregation produces one row with "<feature>_mean" and "<feature>_stdev" columns,
# which is collected to the driver and turned into a plain dict keyed by those names.
means = [F.mean(x).alias(f"{x}_mean") for x in features]
stdevs = [F.stddev_samp(x).alias(f"{x}_stdev") for x in features]
mean_and_stdevs_df = data_with_features.select(*(means + stdevs)).toPandas()
mean_and_stdevs_dict = mean_and_stdevs_df.iloc[0].to_dict()

                                                                                

In [58]:
# Sanity check: list the columns kept after feature selection.
data_with_features.columns

['log_mean_c2c_nn_dist',
 'slope',
 'simulation_id',
 'crater_id',
 'n_craters_added_in_study_region']

In [78]:
# Standardize predictor variables
data_with_standardized_features = data_with_features
for features in features:
    data_with_standardized_features = data_with_standardized_features.withColumn(
        feature,
        (F.col(feature) - F.lit(mean_and_stdevs_dict[f"{feature}_mean"])) / F.lit(mean_and_stdevs_dict[f"{feature}_stdev"])
    )

In [90]:
# Get the nearest neighbors to a specified feature vector
feature_vector = {
    "log_mean_c2c_nn_dist": 1.8,
    "slope": 2.2
}

# Standardize the feature vector
feature_vector_standardized = {
    k: (v - mean_and_stdevs_dict[f"{k}_mean"]) / mean_and_stdevs_dict[f"{k}_stdev"]
    for k, v in feature_vector.items()
}

# Filter to values within 0.5 * stdev for performance reasons
filtered = data_with_standardized_features.withColumn("distance", F.lit(0.0))

sum_col = F.lit("distance")
for feature, value in feature_vector_standardized.items():
    filtered = filtered.where(F.abs(F.lit(value) - F.col(feature)) < 1)
    sum_col += F.pow(F.col(feature) - F.lit(value), 2)

filtered = filtered.withColumn("distance", sum_col)

In [92]:
# Collect the n_nearest_neighbors rows closest to the target feature vector.
# Ascending order is required: the smallest distances are the nearest neighbors
# (a descending sort would return the farthest rows that survived the pre-filter).
n_nearest_neighbors = 1000
nearest_neighbors = filtered.sort(F.col("distance").asc()).limit(n_nearest_neighbors).toPandas()

                                                                                

In [93]:
# Display the collected neighbors (pandas DataFrame) for inspection.
nearest_neighbors

Unnamed: 0,log_mean_c2c_nn_dist,slope,simulation_id,crater_id,n_craters_added_in_study_region,distance
0,5.174134,1.342782,8789,200,11,
1,5.145896,1.342782,8789,234,12,
2,4.929405,1.342782,8789,249,13,
3,4.889420,1.342782,8789,259,14,
4,4.726162,1.342782,8789,267,15,
...,...,...,...,...,...,...
995,4.321786,1.337579,1218,261,37,
996,4.316844,1.337579,1218,263,38,
997,4.314902,1.337579,1218,269,39,
998,4.274624,1.337579,1218,277,40,


----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 35792)
Traceback (most recent call last):
  File "/usr/lib/python3.11/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib/python3.11/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib/python3.11/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib/python3.11/socketserver.py", line 755, in __init__
    self.handle()
  File "/home/mason/code/saturation/venv_311/lib/python3.11/site-packages/pyspark/accumulators.py", line 281, in handle
    poll(accum_updates)
  File "/home/mason/code/saturation/venv_311/lib/python3.11/site-packages/pyspark/accumulators.py", line 253, in poll
    if func():
       ^^^^^^
  File "/home/mason/code/saturation/venv_311/lib/python3.11/site-packages/pyspark/

In [87]:
# Debug: show the Spark column expression accumulated for "distance".
sum_col

Column<'((distance + POWER((log_mean_c2c_nn_dist - 4.3369333798880945), 2)) + POWER((slope - 0.35396230932620254), 2))'>