In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import glob
import yaml
from typing import *
from functools import reduce

import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, DataFrame, Row, Window

In [2]:
# Number of worker threads requested from the local Spark master.
n_cores = 28

# Root directory of the simulation runs: inputs are read from its
# sub-directories and sampled outputs are written back into it.
base_path = "/data/saturation/random_runs_20230812/"

In [3]:
# Create (or reuse) a local Spark session with `n_cores` worker threads and a
# 64g driver heap, and keep a handle on its SparkContext for RDD work.
spark = (
    SparkSession.builder
    .master(f"local[{n_cores}]")
    .appName("Saturation")
    .config("spark.driver.memory", "64g")
    .getOrCreate()
)
sc = spark.sparkContext

23/08/20 16:58:40 WARN Utils: Your hostname, muninn resolves to a loopback address: 127.0.1.1; using 192.168.86.20 instead (on interface enp8s0)
23/08/20 16:58:40 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/20 16:58:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Prepare samples from each simulation, overall and post-saturation

In [4]:
def read_config(path: Path) -> Dict:
    """Parse the YAML file at ``path`` and return its contents.

    Uses ``yaml.safe_load``, so only plain YAML types are constructed.
    """
    with path.open("r") as handle:
        return yaml.safe_load(handle)

def read_configs(base_path: str) -> pyspark.RDD:
    """Load the config of every completed simulation under ``base_path`` into an RDD.

    A simulation directory is considered completed when it contains a
    ``completed.txt`` marker; the ``config.yaml`` sitting next to each marker
    is parsed with ``read_config``.
    """
    run_dirs = [marker.parent for marker in Path(base_path).glob("*/completed.txt")]
    parsed_configs = [read_config(run_dir / "config.yaml") for run_dir in run_dirs]
    return sc.parallelize(parsed_configs)

def create_configs_df(configs: pyspark.RDD) -> DataFrame:
    """Build a DataFrame from an RDD of config dicts, keeping only tracked keys.

    Every config dict is reduced to the entries listed below before the RDD is
    converted with ``toDF()``; any other key in the config is discarded.
    """
    tracked_keys = [
        "simulation_id",
        "slope",
        "r_stat_multiplier",
        "effective_radius_multiplier",
        "min_rim_percentage"
    ]

    def keep_tracked(config):
        # Restrict one config dict to the tracked keys it actually contains.
        return {key: config[key] for key in config if key in tracked_keys}

    return configs.map(keep_tracked).toDF()

def sample_post_saturation_by_simulation(data: DataFrame,
                                         configs: pyspark.RDD,
                                         n_craters_to_sample: int) -> DataFrame:
    """Take evenly spaced rows from the post-saturation part of each simulation.

    Rows of every simulation are numbered in order of
    ``n_craters_added_in_study_region``. The saturation point of a simulation
    is taken to be two thirds of its highest row number; only rows at or after
    that point are candidates. Candidates are kept when their offset from the
    saturation point is a multiple of
    ``step = int(post_saturation_rows / n_craters_to_sample)``.

    Because the step is truncated to an integer, a simulation may contribute
    slightly more than ``n_craters_to_sample`` rows. When a simulation has
    fewer post-saturation rows than requested, all of them are kept.

    Args:
        data: Rows with at least ``simulation_id`` and
            ``n_craters_added_in_study_region`` columns.
        configs: RDD of config dicts, converted with ``create_configs_df`` and
            joined onto the sampled rows.
        n_craters_to_sample: Approximate number of rows to keep per simulation;
            must be positive.

    Returns:
        The sampled rows joined with the config columns on ``simulation_id``.

    Raises:
        ValueError: If ``n_craters_to_sample`` is not positive.
    """
    if n_craters_to_sample <= 0:
        raise ValueError(f"n_craters_to_sample must be positive, got {n_craters_to_sample}")

    configs_df = create_configs_df(configs)

    window = Window.partitionBy("simulation_id").orderBy(F.col("n_craters_added_in_study_region"))
    craters_with_row_number = data.withColumn("row_number", F.row_number().over(window))

    saturation_points = (
        craters_with_row_number
        .groupby("simulation_id")
        .agg(F.max("row_number").alias("n_craters_max"))
        .withColumn("saturation_point", (F.col("n_craters_max") / 3 * 2).cast("int"))
    )

    with_saturation_points = craters_with_row_number.join(saturation_points, on="simulation_id", how="inner")

    offset = F.col("row_number") - F.col("saturation_point")
    # Clamp the step to at least 1: with fewer post-saturation rows than
    # requested the truncated step is 0, and `x % 0` evaluates to NULL in
    # Spark's default (non-ANSI) mode, which would silently drop every row of
    # that simulation.
    step = F.greatest(
        F.lit(1),
        ((F.col("n_craters_max") - F.col("saturation_point")) / n_craters_to_sample).cast("int"),
    )

    # No intermediate cache here: the filtered frame is consumed exactly once,
    # and a cache that is never unpersisted only holds on to executor memory.
    filtered = (
        with_saturation_points
        .filter(offset >= 0)
        .filter(offset % step == 0)
        .drop("row_number", "saturation_point", "n_craters_max")
    )

    return configs_df.join(filtered, on="simulation_id")

# def sample_by_simulation(data: DataFrame,
#                          configs: pyspark.RDD,
#                          n_craters_to_sample: int) -> DataFrame:
#     configs_df = create_configs_df(configs)

#     window = Window.partitionBy("simulation_id").orderBy(F.col("n_craters_added_in_study_region"))
#     craters_with_row_number = data.withColumn("row_number", F.row_number().over(window))
    
#     saturation_points = craters_with_row_number.groupby("simulation_id").agg(F.max("row_number").alias("n_craters_max"))
#     with_saturation_points = craters_with_row_number.join(saturation_points, on="simulation_id", how="inner")
    
#     filtered = with_saturation_points \
#         .filter(
#             ((F.col("row_number") < F.lit(50000)) & (F.col("row_number") % (F.lit(50000) / n_craters_to_sample).cast("int") == 0))
#             | ((F.col("row_number") % (F.col("n_craters_max") / n_craters_to_sample)).cast("int") == 0)
#         ) \
#         .drop("row_number") \
#         .drop("n_craters_max")
    
#     return configs_df.join(filtered, on="simulation_id")

def sample_by_simulation(data: DataFrame,
                         n_craters_to_sample: int,
                         seed: Optional[int] = None) -> DataFrame:
    """Draw up to ``n_craters_to_sample`` random rows from every simulation.

    Each row is assigned a random number, rows are ranked by that number
    within their ``simulation_id``, and the first ``n_craters_to_sample`` rows
    of each simulation are kept. Simulations with fewer rows contribute all of
    them. The helper columns ``rnd_`` and ``rn_`` are removed before returning.

    Args:
        data: Rows with at least a ``simulation_id`` column.
        n_craters_to_sample: Maximum number of rows to keep per simulation.
        seed: Optional seed for the random column. When omitted the sample
            differs between runs. NOTE(review): even with a seed the result is
            presumably only repeatable if the partitioning and row order of
            ``data`` are stable between runs — confirm before relying on it.

    Returns:
        The sampled rows with the same columns as ``data``.
    """
    by_random_rank = Window.partitionBy(F.col("simulation_id")).orderBy(F.col("rnd_"))
    random_key = F.rand() if seed is None else F.rand(seed)

    filtered = (data
                   .withColumn("rnd_", random_key)
                   .withColumn("rn_", F.row_number().over(by_random_rank))
                   .where(F.col("rn_") <= n_craters_to_sample)
                   .drop("rn_")
                   .drop("rnd_")
    )
    return filtered

In [5]:
# Parameters of every completed simulation.
configs_df = create_configs_df(read_configs(base_path))

# Statistics of all runs, repartitioned by simulation, extended with
# information_remaining = n_craters_in_study_region / n_craters_added_in_study_region
# and with the (broadcast) simulation parameters joined on.
all_data_df = (
    spark.read.parquet(f"{base_path}/*/statistics_*.parquet")
    .repartition("simulation_id")
    .withColumn(
        "information_remaining",
        F.col("n_craters_in_study_region") / F.col("n_craters_added_in_study_region"),
    )
    .join(F.broadcast(configs_df), on="simulation_id")
    .cache()
)

                                                                                

In [6]:
# Split whole simulations (not individual rows) into train and test sets, so
# that all rows of one simulation end up on the same side of the split.
TRAIN_FRACTION = 0.8
SPLIT_SEED = 0  # fixed so that a fresh "Restart & Run All" reproduces the split

simulation_ids = list(configs_df.select("simulation_id").toPandas()["simulation_id"])

# Draw from the sorted ids: the order in which Spark returns them is not
# guaranteed, and a seeded draw is only repeatable over a stable ordering.
split_rng = np.random.default_rng(SPLIT_SEED)
train_simulation_ids = set(
    split_rng.choice(
        sorted(simulation_ids),
        replace=False,
        size=int(len(simulation_ids) * TRAIN_FRACTION),
    ).tolist()  # plain Python ints; numpy scalars are not accepted as literals by older PySpark
)
train_df = all_data_df.filter(all_data_df.simulation_id.isin(train_simulation_ids)).cache()
test_df = all_data_df.filter(~all_data_df.simulation_id.isin(train_simulation_ids)).cache()
all_data_df.unpersist()

DataFrame[simulation_id: bigint, crater_id: bigint, n_craters_added_in_study_region: bigint, n_craters_in_study_region: bigint, areal_density: float, areal_density_overlap_2: float, areal_density_overlap_3: float, center_to_center_nearest_neighbor_distance_mean: float, center_to_center_nearest_neighbor_distance_stdev: float, center_to_center_nearest_neighbor_distance_min: float, center_to_center_nearest_neighbor_distance_max: float, rim_to_rim_nearest_neighbor_distance_mean: float, rim_to_rim_nearest_neighbor_distance_stdev: float, rim_to_rim_nearest_neighbor_distance_max: float, n_non_zero_rim_to_rim_nearest_neighbor_distances: bigint, z: float, za: float, information_remaining: double, effective_radius_multiplier: double, min_rim_percentage: double, r_stat_multiplier: double, slope: double]

In [7]:
# Per-simulation sample sizes to export.
n_craters_to_sample = [1000, 2500, 5000]

# For every sample size, draw a random per-simulation sample from the train
# and the test split and write each one to its own parquet file under base_path.
for n in n_craters_to_sample:
    print(n)

    outputs = (
        (train_df, f"{base_path}/train_{n}.parquet"),
        (test_df, f"{base_path}/test_{n}.parquet"),
    )
    for split_df, destination in outputs:
        sample = sample_by_simulation(split_df, n)
        sample.toPandas().to_parquet(destination)

1000


23/08/20 17:05:35 WARN MemoryStore: Not enough space to cache rdd_22_174 in memory! (computed 106.0 MiB so far)
23/08/20 17:05:35 WARN BlockManager: Persisting block rdd_22_174 to disk instead.
23/08/20 17:05:48 WARN MemoryStore: Not enough space to cache rdd_22_191 in memory! (computed 59.7 MiB so far)
23/08/20 17:05:48 WARN BlockManager: Persisting block rdd_22_191 to disk instead.
23/08/20 17:05:49 WARN MemoryStore: Not enough space to cache rdd_22_190 in memory! (computed 60.1 MiB so far)
23/08/20 17:05:49 WARN BlockManager: Persisting block rdd_22_190 to disk instead.
23/08/20 17:05:50 WARN MemoryStore: Not enough space to cache rdd_22_189 in memory! (computed 59.6 MiB so far)
23/08/20 17:05:50 WARN MemoryStore: Not enough space to cache rdd_22_182 in memory! (computed 104.8 MiB so far)
23/08/20 17:05:51 WARN MemoryStore: Not enough space to cache rdd_22_185 in memory! (computed 103.0 MiB so far)
23/08/20 17:05:51 WARN MemoryStore: Not enough space to cache rdd_22_186 in memory! (

2500


23/08/20 17:09:36 WARN MemoryStore: Not enough space to cache rdd_22_150 in memory! (computed 162.7 MiB so far)
23/08/20 17:09:36 WARN MemoryStore: Not enough space to cache rdd_22_152 in memory! (computed 103.1 MiB so far)
23/08/20 17:09:36 WARN MemoryStore: Not enough space to cache rdd_22_154 in memory! (computed 160.1 MiB so far)
23/08/20 17:09:37 WARN MemoryStore: Not enough space to cache rdd_22_156 in memory! (computed 102.9 MiB so far)
23/08/20 17:09:45 WARN MemoryStore: Not enough space to cache rdd_22_169 in memory! (computed 208.3 MiB so far)
23/08/20 17:09:45 WARN MemoryStore: Not enough space to cache rdd_22_171 in memory! (computed 159.3 MiB so far)
23/08/20 17:09:47 WARN MemoryStore: Not enough space to cache rdd_22_177 in memory! (computed 161.5 MiB so far)
23/08/20 17:09:51 WARN MemoryStore: Not enough space to cache rdd_22_186 in memory! (computed 58.3 MiB so far)
23/08/20 17:09:51 WARN MemoryStore: Not enough space to cache rdd_22_185 in memory! (computed 160.8 MiB s

5000


23/08/20 17:12:17 WARN MemoryStore: Not enough space to cache rdd_22_146 in memory! (computed 58.8 MiB so far)
23/08/20 17:12:17 WARN MemoryStore: Not enough space to cache rdd_22_140 in memory! (computed 337.5 MiB so far)
23/08/20 17:12:32 WARN MemoryStore: Not enough space to cache rdd_22_170 in memory! (computed 100.9 MiB so far)
23/08/20 17:12:41 WARN MemoryStore: Not enough space to cache rdd_22_190 in memory! (computed 104.4 MiB so far)
23/08/20 17:12:42 WARN MemoryStore: Not enough space to cache rdd_22_189 in memory! (computed 161.6 MiB so far)
23/08/20 17:12:43 WARN MemoryStore: Not enough space to cache rdd_22_195 in memory! (computed 103.6 MiB so far)
23/08/20 17:12:45 WARN MemoryStore: Not enough space to cache rdd_22_196 in memory! (computed 100.1 MiB so far)
                                                                                

In [12]:
# Print the first rows of the per-simulation configuration table as a sanity check.
configs_df.show()

+---------------------------+-------------------+------------------+-------------+------------------+
|effective_radius_multiplier| min_rim_percentage| r_stat_multiplier|simulation_id|             slope|
+---------------------------+-------------------+------------------+-------------+------------------+
|         1.5720289373324587| 0.7420289256812818|6.2409269696352165|         1742| 1.670690634676432|
|         1.6967911971431182| 0.6917147956304781| 8.746842174402397|         2442|1.0274706363665504|
|         1.3417438720222559| 0.6608971907564327| 4.757405687085767|         2127|2.4396714290305175|
|          1.565964322050054| 0.4087464541246743| 3.282286438481391|          760|2.7273478754480296|
|         1.4016706394635967| 0.6055612704439735| 8.611877609107639|         2241| 2.469363317002159|
|         1.7281046486739586|0.38432545785826916| 4.536620648680108|         2232|1.4878623044134602|
|          1.657998561315478|0.45368734003822697| 8.005115581059153|          396|

                                                                                

In [14]:
# NOTE(review): this frame is not referenced again in the visible cells —
# confirm whether a later cell still needs it.
configs_df_repartitioned = configs_df.repartition("simulation_id").cache()

# Write each simulation's rows, ordered by n_craters_added_in_study_region,
# to its own parquet file under base_path.
# NOTE(review): every iteration filters the full all_data_df; presumably this
# is expensive for many simulations — confirm and consider a single pass.
for simulation_id in simulation_ids:
    filtered = (
        all_data_df
        .where(F.col("simulation_id") == simulation_id)
        .orderBy(F.col("n_craters_added_in_study_region"))
    )
    filtered.toPandas().to_parquet(f"{base_path}/simulation_{simulation_id}.parquet")

23/08/18 16:42:35 WARN CacheManager: Asked to cache already cached data.
23/08/19 21:55:40 WARN JavaUtils: Attempt to delete using native Unix OS command failed for path = /tmp/blockmgr-970a4c6c-577a-41c5-89e4-1285f97bc752. Falling back to Java IO way
java.io.IOException: Failed to delete: /tmp/blockmgr-970a4c6c-577a-41c5-89e4-1285f97bc752
	at org.apache.spark.network.util.JavaUtils.deleteRecursivelyUsingUnixNative(JavaUtils.java:177)
	at org.apache.spark.network.util.JavaUtils.deleteRecursively(JavaUtils.java:113)
	at org.apache.spark.network.util.JavaUtils.deleteRecursively(JavaUtils.java:94)
	at org.apache.spark.util.Utils$.deleteRecursively(Utils.scala:1231)
	at org.apache.spark.storage.DiskBlockManager.$anonfun$doStop$1(DiskBlockManager.scala:368)
	at org.apache.spark.storage.DiskBlockManager.$anonfun$doStop$1$adapted(DiskBlockManager.scala:364)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedS