In [1]:
import pandas as pd
from pathlib import Path
import glob
import yaml
from typing import *
from functools import reduce

import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, DataFrame, Row, Window

In [2]:
# Locations of the simulation output sets. The cells below glob
# "<base_path>/*/statistics_*.parquet" and "<base_path>/*/completed.txt",
# i.e. they read one sub-directory level beneath each path.
# Alternative (ccd6) data set locations, currently disabled:
# base_path = "/data/saturation/central_composite_design/ccd6"
# base_path_holdout = "/data/saturation/central_composite_design/ccd6_holdout_larger"
base_path = "/data/saturation/central_composite_design/ccd9"
base_path_holdout = "/data/saturation/central_composite_design/ccd9_holdout"
# Number of local Spark worker threads; used to build the "local[n_cores]" master URL.
n_cores = 28

In [3]:
def read_config(path: Path) -> Dict:
    """Parse the YAML file at ``path`` and return the loaded object.

    The file is opened in text mode and parsed with ``yaml.safe_load``.
    """
    with path.open("r") as handle:
        return yaml.safe_load(handle)


def get_aggregations_for_column(col: str) -> Iterable:
    """Lazily yield the aggregation expressions computed for one column.

    Yields, in order: the approximate 5th to 95th percentiles in steps of 5,
    the approximate 99th percentile, the approximate median, then min, max,
    mean and sample standard deviation. Every expression is aliased as
    ``<col>_<statistic>``.
    """
    accuracy = int(1e6)

    def approx_percentile(fraction: float, suffix: str):
        # All percentile expressions share the same accuracy setting
        return F.percentile_approx(col, fraction, accuracy=accuracy).alias(f"{col}_{suffix}")

    # Percentiles 5, 10, ..., 95
    for percent in range(5, 100, 5):
        yield approx_percentile(percent / 100, f"{percent:.0f}_percentile")

    yield approx_percentile(.99, "99_percentile")
    yield approx_percentile(.50, "median")

    # Plain (exact) aggregates
    yield F.min(col).alias(f"{col}_min")
    yield F.max(col).alias(f"{col}_max")
    yield F.mean(col).alias(f"{col}_mean")
    yield F.stddev_samp(col).alias(f"{col}_stdev")
    

def calculate_stats(data: DataFrame, n_craters_in_saturation: int = 50000) -> DataFrame:
    """Compute per-simulation summary statistics over the tail of each simulation.

    For every ``simulation_id`` the rows are ranked by
    ``n_craters_added_in_study_region`` (descending) and only the top
    ``n_craters_in_saturation`` rows are kept. The aggregations produced by
    ``get_aggregations_for_column`` are then evaluated for each column of
    interest.

    Parameters
    ----------
    data:
        Statistics rows; must contain ``simulation_id`` and the columns listed
        in ``columns_to_calculate_stats`` below.
    n_craters_in_saturation:
        Number of rows, counted from the end of each simulation, that are
        treated as being in saturation. Defaults to 50000.

    Returns
    -------
    DataFrame
        One row per ``simulation_id`` with one column per aggregation.
    """
    # Number of craters from the end of the simulation to consider as in saturation
    N_CRATERS_IN_SATURATION = n_craters_in_saturation

    columns_to_calculate_stats = [
        "areal_density",
        "z",
        "za",
        "n_craters_in_study_region",
        "n_craters_added_in_study_region"
    ]

    # Grab the last N_CRATERS_IN_SATURATION craters from each simulation.
    # Descending order puts the most recently added craters at row_number 1.
    window = Window.partitionBy("simulation_id").orderBy(F.col("n_craters_added_in_study_region").desc())
    last_n_craters_by_sim = data.withColumn("row_number", F.row_number().over(window)) \
        .filter(F.col("row_number") <= N_CRATERS_IN_SATURATION) \
        .drop("row_number")

    # Set up the aggregations for each column of interest
    aggregations = [
        aggregation
        for col in columns_to_calculate_stats
        for aggregation in get_aggregations_for_column(col)
    ]

    return last_n_craters_by_sim.groupBy("simulation_id").agg(*aggregations)

In [4]:
# Create (or reuse) a local Spark session with n_cores worker threads and a
# 48g driver heap; `sc` is the underlying SparkContext used to build RDDs.
spark = (
    SparkSession.builder
    .master(f"local[{n_cores}]")
    .appName("Saturation")
    .config("spark.driver.memory", "48g")
    .getOrCreate()
)
sc = spark.sparkContext

23/04/28 12:25:56 WARN Utils: Your hostname, muninn resolves to a loopback address: 127.0.1.1; using 192.168.86.20 instead (on interface enp8s0)
23/04/28 12:25:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/28 12:25:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Load the statistics of every simulation under base_path and summarise them
stats_df = spark.read.parquet(f"{base_path}/*/statistics_*.parquet").cache()
data = calculate_stats(stats_df)

# A config is loaded for each simulation directory that holds a completed.txt marker
completed_filenames = list(Path(base_path).glob("*/completed.txt"))
configs = sc.parallelize(
    [read_config(marker.parent / "config.yaml") for marker in completed_filenames]
)

# Configuration fields carried into the output
config_columns = [
    "simulation_id",
    "slope",
    "r_stat_multiplier",
    "effective_radius_multiplier",
    "min_rim_percentage"
]
configs_df = configs.toDF().select(config_columns)

# Attach the configuration values to the per-simulation statistics
joined = configs_df.join(data, on="simulation_id")

# Persist the combined table next to the simulation outputs
joined.toPandas().to_csv(f"{base_path}/post_saturation_statistics.csv", index=False)

                                                                                

23/04/28 12:26:21 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

In [6]:
# Same summary as for the training set, computed for the holdout simulations
stats_df = spark.read.parquet(f"{base_path_holdout}/*/statistics_*.parquet").cache()
data = calculate_stats(stats_df)

# A config is loaded for each holdout simulation directory that holds a completed.txt marker
completed_filenames = list(Path(base_path_holdout).glob("*/completed.txt"))
configs = sc.parallelize(
    [read_config(marker.parent / "config.yaml") for marker in completed_filenames]
)

# Configuration fields carried into the output
config_columns = [
    "simulation_id",
    "slope",
    "r_stat_multiplier",
    "effective_radius_multiplier",
    "min_rim_percentage"
]
configs_df = configs.toDF().select(config_columns)

# Attach the configuration values to the per-simulation statistics
joined = configs_df.join(data, on="simulation_id")

# Persist the combined table next to the holdout simulation outputs
joined.toPandas().to_csv(f"{base_path_holdout}/post_saturation_statistics.csv", index=False)

                                                                                

In [7]:
# Prepare a post-saturation sample from each simulation

In [8]:
def read_config(path: Path) -> Dict:
    """Parse the YAML file at ``path`` and return the loaded object.

    NOTE: this repeats the ``read_config`` definition from the earlier cell
    (In [3]); the later definition is the one in effect after this cell runs.
    """
    with path.open("r") as handle:
        parsed = yaml.safe_load(handle)
    return parsed

def read_configs(base_path: str) -> pyspark.RDD:
    """Load the configs of all completed simulations under ``base_path`` into an RDD.

    A simulation directory is included when it contains a ``completed.txt``
    file; its ``config.yaml`` (same directory) is parsed with ``read_config``.
    """
    config_paths = [
        marker.parent / "config.yaml"
        for marker in Path(base_path).glob("*/completed.txt")
    ]
    loaded_configs = [read_config(config_path) for config_path in config_paths]
    return sc.parallelize(loaded_configs)

def create_configs_df(configs: pyspark.RDD) -> DataFrame:
    """Convert an RDD of configs to a DataFrame restricted to the fields of interest.

    The result keeps, in this order: ``simulation_id``, ``slope``,
    ``r_stat_multiplier``, ``effective_radius_multiplier`` and
    ``min_rim_percentage``.
    """
    all_fields_df = configs.toDF()
    return all_fields_df.select(
        "simulation_id",
        "slope",
        "r_stat_multiplier",
        "effective_radius_multiplier",
        "min_rim_percentage",
    )

def sample_by_simulation(data: DataFrame,
                         configs: pyspark.RDD,
                         n_craters_to_sample: int) -> DataFrame:
    """Take an evenly spaced sample of the final third of every simulation.

    Rows are numbered per ``simulation_id`` in ascending order of
    ``n_craters_added_in_study_region``. The "saturation point" of a simulation
    is two thirds of its row count (truncated to an int); rows from that point
    onwards are sampled at a fixed step of
    ``int((n_rows - saturation_point) / n_craters_to_sample)``.

    Because the step is truncated, the number of rows returned per simulation
    is approximately, and can be somewhat more than, ``n_craters_to_sample``.
    When a simulation has fewer post-saturation rows than requested, the step
    is clamped to 1 and all of its post-saturation rows are returned (an
    unclamped step of 0 would make the modulo undefined and drop the
    simulation from the sample).

    Parameters
    ----------
    data:
        Statistics rows; must contain ``simulation_id`` and
        ``n_craters_added_in_study_region``.
    configs:
        RDD of simulation configs, as produced by ``read_configs``.
    n_craters_to_sample:
        Target number of rows to sample per simulation; must be positive.

    Returns
    -------
    DataFrame
        The sampled rows joined with the selected config columns on
        ``simulation_id``.

    Raises
    ------
    ValueError
        If ``n_craters_to_sample`` is not positive.
    """
    if n_craters_to_sample <= 0:
        raise ValueError(f"n_craters_to_sample must be positive, got {n_craters_to_sample}")

    configs_df = create_configs_df(configs)

    window = Window.partitionBy("simulation_id").orderBy(F.col("n_craters_added_in_study_region"))
    craters_with_row_number = data.withColumn("row_number", F.row_number().over(window))

    # The largest row number in a simulation equals its number of rows
    saturation_points = craters_with_row_number.groupby("simulation_id").agg(F.max("row_number").alias("n_craters_max"))
    saturation_points = saturation_points.withColumn("saturation_point", (F.col("n_craters_max") / 3 * 2).cast("int"))

    with_saturation_points = craters_with_row_number.join(saturation_points, on="simulation_id", how="inner")

    # Position of a row relative to the saturation point of its simulation
    offset = F.col("row_number") - F.col("saturation_point")
    # Sampling stride; clamp to 1 so short simulations are kept rather than
    # being filtered out by a modulo-by-zero
    raw_step = ((F.col("n_craters_max") - F.col("saturation_point")) / n_craters_to_sample).cast("int")
    step = F.greatest(raw_step, F.lit(1))

    filtered = with_saturation_points \
        .filter(offset >= 0) \
        .cache() \
        .filter(offset % step == 0) \
        .drop("row_number") \
        .drop("saturation_point") \
        .drop("n_craters_max")

    return configs_df.join(filtered, on="simulation_id")

In [9]:
# Statistics and configs of the training simulations
train_df = spark.read.parquet(f"{base_path}/*/statistics_*.parquet").cache()
train_configs = read_configs(base_path)

# Statistics and configs of the holdout simulations
holdout_df = spark.read.parquet(f"{base_path_holdout}/*/statistics_*.parquet").cache()
holdout_configs = read_configs(base_path_holdout)

# Sample sizes to export; one CSV per size and per data set
n_craters_to_sample = [50, 100, 500, 1000, 5000]

for n in n_craters_to_sample:
    # Training set first, then holdout, each written beneath its own base path
    for source_df, source_configs, output_dir in (
        (train_df, train_configs, base_path),
        (holdout_df, holdout_configs, base_path_holdout),
    ):
        sample = sample_by_simulation(source_df, source_configs, n)
        sample.toPandas().to_csv(f"{output_dir}/post_saturation_sample_{n}.csv", index=False)

                                                                                

23/04/28 12:29:34 WARN CacheManager: Asked to cache already cached data.


                                                                                

23/04/28 12:29:42 WARN CacheManager: Asked to cache already cached data.


[Stage 28:>                                                      (0 + 28) / 200]

23/04/28 12:30:01 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:30:01 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:30:01 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:30:01 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:30:01 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:30:01 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:30:01 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:30:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:30:02 WARN RowBasedKeyValueBatch: Calling spill() on

[Stage 28:>                                                      (2 + 28) / 200]

23/04/28 12:30:06 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:30:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 28:==>                                                    (9 + 28) / 200]

23/04/28 12:30:10 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 28:===>                                                  (13 + 28) / 200]

23/04/28 12:30:11 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 28:====>                                                 (15 + 28) / 200]

23/04/28 12:30:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:30:15 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:30:15 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:30:16 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:30:16 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:30:17 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:30:17 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:30:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:30:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:30:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:30:23 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:30:24 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:30:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:30:29 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:30:29 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:30:30 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:30:30 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:30:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:30:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:30:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:30:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:30:38 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:30:39 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:30:40 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:30:40 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:30:41 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:30:43 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:30:48 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:30:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:30:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:30:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:30:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:30:53 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:30:54 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:30:54 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:30:56 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:31:00 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:31:01 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:31:01 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:31:01 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:31:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:31:06 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:31:06 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:31:08 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:31:10 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:31:10 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:31:11 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:31:12 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:31:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:31:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:31:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 35:>                                                      (0 + 28) / 200]

23/04/28 12:31:39 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:31:39 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:31:39 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:31:39 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:31:39 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:31:41 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:31:41 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 35:=>                                                     (5 + 28) / 200]

23/04/28 12:31:43 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:31:44 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:31:44 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 35:===>                                                  (13 + 28) / 200]

23/04/28 12:31:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 35:====>                                                 (15 + 28) / 200]

23/04/28 12:31:47 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:31:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:31:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:31:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:31:53 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:31:56 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:31:57 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:32:01 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:32:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:32:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:32:08 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:32:08 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:32:10 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:32:10 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:32:12 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:32:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:32:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:32:16 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:32:17 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:32:17 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:32:17 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:32:18 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:32:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:32:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:32:23 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:32:24 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:32:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:32:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:32:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:32:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:32:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:32:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:32:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:32:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/28 12:32:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




23/04/28 12:32:38 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


                                                                                

23/04/28 12:32:45 WARN CacheManager: Asked to cache already cached data.


                                                                                

23/04/28 12:32:47 WARN CacheManager: Asked to cache already cached data.


                                                                                

23/04/28 12:32:49 WARN CacheManager: Asked to cache already cached data.


                                                                                

23/04/28 12:32:56 WARN CacheManager: Asked to cache already cached data.


                                                                                

23/04/28 12:33:00 WARN CacheManager: Asked to cache already cached data.


                                                                                

23/04/28 12:33:13 WARN CacheManager: Asked to cache already cached data.


                                                                                

23/04/28 12:33:19 WARN CacheManager: Asked to cache already cached data.


                                                                                

23/04/28 12:34:27 WARN CacheManager: Asked to cache already cached data.


                                                                                

In [10]:
# Write the complete, config-annotated statistics of each holdout simulation
# to its own parquet file, ordered by n_craters_added_in_study_region.
#
# The configs DataFrame is built here from holdout_configs rather than taken
# from a `configs_df` global left behind by an earlier cell, so this cell does
# not depend on which cell last assigned that name.
holdout_configs_df = create_configs_df(holdout_configs)
holdout_simulation_ids = [x["simulation_id"] for x in holdout_configs.collect()]

for holdout_simulation_id in holdout_simulation_ids:
    filtered = holdout_df.filter(F.col("simulation_id") == holdout_simulation_id)
    joined = holdout_configs_df.join(filtered, on="simulation_id").sort(F.col("n_craters_added_in_study_region"))
    joined.toPandas().to_parquet(f"{base_path_holdout}/simulation_{holdout_simulation_id}.parquet")

                                                                                

23/04/28 13:18:46 WARN JavaUtils: Attempt to delete using native Unix OS command failed for path = /tmp/blockmgr-62b0aa3f-c20c-4d5b-9802-fbbf5317e991. Falling back to Java IO way
java.io.IOException: Failed to delete: /tmp/blockmgr-62b0aa3f-c20c-4d5b-9802-fbbf5317e991
	at org.apache.spark.network.util.JavaUtils.deleteRecursivelyUsingUnixNative(JavaUtils.java:171)
	at org.apache.spark.network.util.JavaUtils.deleteRecursively(JavaUtils.java:110)
	at org.apache.spark.network.util.JavaUtils.deleteRecursively(JavaUtils.java:91)
	at org.apache.spark.util.Utils$.deleteRecursively(Utils.scala:1206)
	at org.apache.spark.storage.DiskBlockManager.$anonfun$doStop$1(DiskBlockManager.scala:374)
	at org.apache.spark.storage.DiskBlockManager.$anonfun$doStop$1$adapted(DiskBlockManager.scala:370)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach