In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import glob
import yaml
from typing import *
from functools import reduce

import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, DataFrame, Row, Window

In [3]:
base_path = "/data/saturation/random_runs_20230812/"

n_cores = 28

In [4]:
spark = SparkSession.builder \
                    .master(f"local[{n_cores}]") \
                    .appName("Saturation") \
                    .config("spark.driver.memory", "64g") \
                    .getOrCreate()
sc = spark.sparkContext

23/08/18 16:21:31 WARN Utils: Your hostname, muninn resolves to a loopback address: 127.0.1.1; using 192.168.86.20 instead (on interface enp8s0)
23/08/18 16:21:31 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/18 16:21:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Prepare samples from each simulation, overall and post-saturation

In [7]:
def read_config(path: Path) -> Dict:
    with path.open("r") as config_file:
        config = yaml.safe_load(config_file)
    return config

def read_configs(base_path: str) -> pyspark.RDD:
    completed_filenames = list(Path(base_path).glob("*/completed.txt"))
    configs = map(lambda x: x.parent / "config.yaml", completed_filenames)
    configs = map(read_config, configs)
    return sc.parallelize(configs)

def create_configs_df(configs: pyspark.RDD) -> DataFrame:
    config_columns = [
        "simulation_id",
        "slope",
        "r_stat_multiplier",
        "effective_radius_multiplier",
        "min_rim_percentage"
    ]
    return configs.map(lambda x: {k: v for k, v in x.items() if k in config_columns}).toDF()

def sample_post_saturation_by_simulation(data: DataFrame,
                                         configs: pyspark.RDD,
                                         n_craters_to_sample: int) -> DataFrame:
    configs_df = create_configs_df(configs)

    window = Window.partitionBy("simulation_id").orderBy(F.col("n_craters_added_in_study_region"))
    craters_with_row_number = data.withColumn("row_number", F.row_number().over(window))
    
    saturation_points = craters_with_row_number.groupby("simulation_id").agg(F.max("row_number").alias("n_craters_max"))
    saturation_points = saturation_points.withColumn("saturation_point", (F.col("n_craters_max") / 3 * 2).cast("int"))
    
    with_saturation_points = craters_with_row_number.join(saturation_points, on="simulation_id", how="inner")
    
    filtered = with_saturation_points \
        .filter(F.col("row_number") - F.col("saturation_point") >= 0) \
        .cache() \
        .filter((F.col("row_number") - F.col("saturation_point")) % ((F.col("n_craters_max") - F.col("saturation_point")) / n_craters_to_sample).cast("int") == 0) \
        .drop("row_number") \
        .drop("saturation_point") \
        .drop("n_craters_max")
    
    return configs_df.join(filtered, on="simulation_id")

def sample_by_simulation(data: DataFrame,
                         configs: pyspark.RDD,
                         n_craters_to_sample: int) -> DataFrame:
    configs_df = create_configs_df(configs)

    window = Window.partitionBy("simulation_id").orderBy(F.col("n_craters_added_in_study_region"))
    craters_with_row_number = data.withColumn("row_number", F.row_number().over(window))
    
    saturation_points = craters_with_row_number.groupby("simulation_id").agg(F.max("row_number").alias("n_craters_max"))
    with_saturation_points = craters_with_row_number.join(saturation_points, on="simulation_id", how="inner")
    
    filtered = with_saturation_points \
        .filter(
            ((F.col("row_number") < F.lit(50000)) & (F.col("row_number") % (F.lit(50000) / n_craters_to_sample).cast("int") == 0))
            | ((F.col("row_number") % (F.col("n_craters_max") / n_craters_to_sample)).cast("int") == 0)
        ) \
        .drop("row_number") \
        .drop("n_craters_max")
    
    return configs_df.join(filtered, on="simulation_id")

def stratified_sample_by_simulation(data: DataFrame,
                                    n_craters_to_sample_per_bin: int,
                                    n_bins: int) -> DataFrame:    
    w = Window.partitionBy(F.col("sampling_key")).orderBy(F.col("rnd_"))

    filtered = (data
                   .withColumn("sampling_key", F.col("simulation_id") * 10000 + (F.col("information_remaining") * n_bins + 1).cast("int"))
                   .withColumn("rnd_", F.rand())
                   .withColumn("rn_", F.row_number().over(w))
                   .where(F.col("rn_") <= n_craters_to_sample_per_bin)
                   .drop("rn_")
                   .drop("rnd_")
                   .drop("sampling_key")
    )
    return filtered

In [8]:
all_data_df = spark.read.parquet(f"{base_path}/*/statistics_*.parquet").repartition("simulation_id")
all_data_df = all_data_df.withColumn("information_remaining", F.col("n_craters_in_study_region") / F.col("n_craters_added_in_study_region"))

configs_df = create_configs_df(read_configs(base_path))
all_data_df = all_data_df.join(F.broadcast(configs_df), on="simulation_id").cache()

                                                                                

In [9]:
simulation_ids = list(configs_df.select("simulation_id").toPandas()["simulation_id"])
train_simulation_ids = set(np.random.choice(simulation_ids, replace=False, size=int(len(simulation_ids) * 0.8)))
train_df = all_data_df.filter(all_data_df.simulation_id.isin(train_simulation_ids)).cache()
test_df = all_data_df.filter(~all_data_df.simulation_id.isin(train_simulation_ids)).cache()
all_data_df.unpersist()

DataFrame[simulation_id: bigint, crater_id: bigint, n_craters_added_in_study_region: bigint, n_craters_in_study_region: bigint, areal_density: float, areal_density_overlap_2: float, areal_density_overlap_3: float, center_to_center_nearest_neighbor_distance_mean: float, center_to_center_nearest_neighbor_distance_stdev: float, center_to_center_nearest_neighbor_distance_min: float, center_to_center_nearest_neighbor_distance_max: float, rim_to_rim_nearest_neighbor_distance_mean: float, rim_to_rim_nearest_neighbor_distance_stdev: float, rim_to_rim_nearest_neighbor_distance_max: float, n_non_zero_rim_to_rim_nearest_neighbor_distances: bigint, z: float, za: float, information_remaining: double, effective_radius_multiplier: double, min_rim_percentage: double, r_stat_multiplier: double, slope: double]

In [10]:
n_craters_to_sample_per_bin = [
    10,
    50,
    100,
]
n_bins = 20

for n in n_craters_to_sample_per_bin:
    print(n)
    
    sample = stratified_sample_by_simulation(train_df, n, n_bins)
    sample.toPandas().to_parquet(f"{base_path}/train_{n}.parquet")
    
    sample = stratified_sample_by_simulation(test_df, n, n_bins)
    sample.toPandas().to_parquet(f"{base_path}/test_{n}.parquet")

10


23/08/18 16:24:12 WARN MemoryStore: Not enough space to cache rdd_22_194 in memory! (computed 292.9 MiB so far)
23/08/18 16:24:12 WARN BlockManager: Persisting block rdd_22_194 to disk instead.
23/08/18 16:24:13 WARN MemoryStore: Not enough space to cache rdd_22_194 in memory! (computed 292.9 MiB so far)
                                                                                

50


23/08/18 16:27:34 WARN MemoryStore: Not enough space to cache rdd_22_194 in memory! (computed 292.9 MiB so far)
                                                                                

100


23/08/18 16:29:59 WARN MemoryStore: Not enough space to cache rdd_22_194 in memory! (computed 292.9 MiB so far)
                                                                                

500


23/08/18 16:32:36 WARN MemoryStore: Not enough space to cache rdd_22_194 in memory! (computed 292.9 MiB so far)
23/08/18 16:33:37 ERROR TaskSetManager: Total size of serialized results of 169 tasks (1025.5 MiB) is bigger than spark.driver.maxResultSize (1024.0 MiB)
23/08/18 16:33:37 WARN TaskSetManager: Lost task 174.0 in stage 40.0 (TID 10341) (muninn.lan executor driver): TaskKilled (Tasks result size has exceeded maxResultSize)
23/08/18 16:33:37 WARN TaskSetManager: Lost task 178.0 in stage 40.0 (TID 10345) (muninn.lan executor driver): TaskKilled (Stage cancelled)
23/08/18 16:33:37 WARN TaskSetManager: Lost task 176.0 in stage 40.0 (TID 10343) (muninn.lan executor driver): TaskKilled (Stage cancelled)
23/08/18 16:33:37 WARN TaskSetManager: Lost task 161.0 in stage 40.0 (TID 10328) (muninn.lan executor driver): TaskKilled (Stage cancelled)
23/08/18 16:33:37 WARN TaskSetManager: Lost task 158.0 in stage 40.0 (TID 10325) (muninn.lan executor driver): TaskKilled (Stage cancelled)
23/08

Py4JJavaError: An error occurred while calling o4355.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Total size of serialized results of 169 tasks (1025.5 MiB) is bigger than spark.driver.maxResultSize (1024.0 MiB)
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2785)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2721)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2720)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2720)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1206)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2984)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2923)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2912)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:971)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2263)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2284)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2303)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2328)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1019)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:405)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1018)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:448)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$executeCollect$1(AdaptiveSparkPlanExec.scala:354)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.withFinalPlanUpdate(AdaptiveSparkPlanExec.scala:382)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.executeCollect(AdaptiveSparkPlanExec.scala:354)
	at org.apache.spark.sql.Dataset.$anonfun$collectToPython$1(Dataset.scala:3997)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4167)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:526)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4165)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:118)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:195)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:103)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4165)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3994)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)


23/08/18 16:33:38 WARN TaskSetManager: Lost task 185.0 in stage 40.0 (TID 10352) (muninn.lan executor driver): TaskKilled (Stage cancelled)
23/08/18 16:33:38 WARN TaskSetManager: Lost task 179.0 in stage 40.0 (TID 10346) (muninn.lan executor driver): TaskKilled (Stage cancelled)
23/08/18 16:33:38 WARN TaskSetManager: Lost task 182.0 in stage 40.0 (TID 10349) (muninn.lan executor driver): TaskKilled (Stage cancelled)
23/08/18 16:33:38 WARN TaskSetManager: Lost task 187.0 in stage 40.0 (TID 10354) (muninn.lan executor driver): TaskKilled (Stage cancelled)
23/08/18 16:33:38 WARN TaskSetManager: Lost task 183.0 in stage 40.0 (TID 10350) (muninn.lan executor driver): TaskKilled (Stage cancelled)
23/08/18 16:33:38 WARN TaskSetManager: Lost task 181.0 in stage 40.0 (TID 10348) (muninn.lan executor driver): TaskKilled (Stage cancelled)
23/08/18 16:33:39 WARN TaskSetManager: Lost task 188.0 in stage 40.0 (TID 10355) (muninn.lan executor driver): TaskKilled (Stage cancelled)
23/08/18 16:33:39 WA

In [12]:
configs_df.show()

+---------------------------+-------------------+------------------+-------------+------------------+
|effective_radius_multiplier| min_rim_percentage| r_stat_multiplier|simulation_id|             slope|
+---------------------------+-------------------+------------------+-------------+------------------+
|         1.5720289373324587| 0.7420289256812818|6.2409269696352165|         1742| 1.670690634676432|
|         1.6967911971431182| 0.6917147956304781| 8.746842174402397|         2442|1.0274706363665504|
|         1.3417438720222559| 0.6608971907564327| 4.757405687085767|         2127|2.4396714290305175|
|          1.565964322050054| 0.4087464541246743| 3.282286438481391|          760|2.7273478754480296|
|         1.4016706394635967| 0.6055612704439735| 8.611877609107639|         2241| 2.469363317002159|
|         1.7281046486739586|0.38432545785826916| 4.536620648680108|         2232|1.4878623044134602|
|          1.657998561315478|0.45368734003822697| 8.005115581059153|          396|

                                                                                

In [14]:
configs_df_repartitioned = configs_df.repartition("simulation_id").cache()

for simulation_id in simulation_ids:
    filtered = all_data_df.filter(F.col("simulation_id") == simulation_id).sort(F.col("n_craters_added_in_study_region"))
    filtered.toPandas().to_parquet(f"{base_path}/simulation_{simulation_id}.parquet")

23/08/18 16:42:35 WARN CacheManager: Asked to cache already cached data.
23/08/19 21:55:40 WARN JavaUtils: Attempt to delete using native Unix OS command failed for path = /tmp/blockmgr-970a4c6c-577a-41c5-89e4-1285f97bc752. Falling back to Java IO way
java.io.IOException: Failed to delete: /tmp/blockmgr-970a4c6c-577a-41c5-89e4-1285f97bc752
	at org.apache.spark.network.util.JavaUtils.deleteRecursivelyUsingUnixNative(JavaUtils.java:177)
	at org.apache.spark.network.util.JavaUtils.deleteRecursively(JavaUtils.java:113)
	at org.apache.spark.network.util.JavaUtils.deleteRecursively(JavaUtils.java:94)
	at org.apache.spark.util.Utils$.deleteRecursively(Utils.scala:1231)
	at org.apache.spark.storage.DiskBlockManager.$anonfun$doStop$1(DiskBlockManager.scala:368)
	at org.apache.spark.storage.DiskBlockManager.$anonfun$doStop$1$adapted(DiskBlockManager.scala:364)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedS