In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import glob
import yaml
from typing import *
from functools import reduce

import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, DataFrame, Row, Window

In [2]:
# Number of local worker threads; interpolated into the Spark master URL.
n_cores = 14

# Root directory of the simulation run (the trailing slash is part of the value).
base_path = "/data/saturation/ir_random_1/"

In [3]:
# Create (or reuse) a local Spark session named "Saturation" that runs on
# `n_cores` threads with 32g of driver memory, and keep its SparkContext handy.
spark = (
    SparkSession.builder
    .master(f"local[{n_cores}]")
    .appName("Saturation")
    .config("spark.driver.memory", "32g")
    .getOrCreate()
)
sc = spark.sparkContext

23/09/17 01:10:02 WARN Utils: Your hostname, muninn resolves to a loopback address: 127.0.1.1; using 192.168.86.20 instead (on interface enp8s0)
23/09/17 01:10:02 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/17 01:10:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/09/17 01:10:03 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# Prepare samples from each simulation, overall and post-saturation

In [5]:
def read_config(path: Path) -> Dict:
    """Open the YAML file at ``path`` and return the document parsed by ``yaml.safe_load``."""
    with path.open("r") as handle:
        return yaml.safe_load(handle)

def read_configs(base_path: str) -> pyspark.RDD:
    """
    Load the config of every simulation directory that contains a completion marker.

    A directory directly under ``base_path`` is included when it holds a
    ``completed.txt`` file; its sibling ``config.yaml`` is parsed with
    ``read_config``. The parsed configs are distributed as an RDD through the
    module-level SparkContext ``sc``.
    """
    completed_markers = list(Path(base_path).glob("*/completed.txt"))
    loaded_configs = [
        read_config(marker.parent / "config.yaml")
        for marker in completed_markers
    ]
    return sc.parallelize(loaded_configs)

def create_configs_df(configs: pyspark.RDD) -> DataFrame:
    """
    Turn an RDD of config mappings into a DataFrame of selected parameters.

    Only the keys listed in ``wanted_keys`` are kept from each config; every
    other entry is discarded before the RDD is converted with ``toDF()``.
    """
    wanted_keys = (
        "simulation_id",
        "slope",
        "r_stat_multiplier",
        "effective_radius_multiplier",
        "min_rim_percentage",
    )

    def keep_wanted(config):
        # Preserve the config's own key order, dropping anything not requested.
        return {key: value for key, value in config.items() if key in wanted_keys}

    return configs.map(keep_wanted).toDF()

def sample_post_saturation_by_simulation(data: DataFrame,
                                         configs: pyspark.RDD,
                                         n_craters_to_sample: int) -> DataFrame:
    """
    Sample evenly spaced rows from the last third of each simulation.

    Rows are numbered per ``simulation_id`` in order of
    ``n_craters_added_in_study_region``. The "saturation point" of a simulation
    is taken to be two thirds of its row count (truncated to an int); only rows
    at or after that point are candidates. From those, every ``step``-th row is
    kept, where ``step`` is the number of post-saturation rows divided by
    ``n_craters_to_sample`` (truncated to an int).

    :param data: statistics rows; must contain ``simulation_id`` and
        ``n_craters_added_in_study_region``.
    :param configs: RDD of config mappings, converted with ``create_configs_df``.
    :param n_craters_to_sample: approximate number of rows to keep per simulation.
    :return: the sampled rows joined to the config columns on ``simulation_id``;
        the helper columns used for sampling are removed.
    """
    configs_df = create_configs_df(configs)

    window = Window.partitionBy("simulation_id").orderBy(F.col("n_craters_added_in_study_region"))
    craters_with_row_number = data.withColumn("row_number", F.row_number().over(window))

    saturation_points = (
        craters_with_row_number
        .groupby("simulation_id")
        .agg(F.max("row_number").alias("n_craters_max"))
        .withColumn("saturation_point", (F.col("n_craters_max") / 3 * 2).cast("int"))
    )

    with_saturation_points = craters_with_row_number.join(saturation_points, on="simulation_id", how="inner")

    # Position of each row relative to its simulation's saturation point.
    offset = F.col("row_number") - F.col("saturation_point")

    # The truncated step is 0 when a simulation has fewer post-saturation rows
    # than requested samples. A modulo by 0 would then match nothing, dropping
    # the whole simulation, so clamp the step to 1 (keep every row) instead.
    raw_step = ((F.col("n_craters_max") - F.col("saturation_point")) / n_craters_to_sample).cast("int")
    step = F.when(raw_step < 1, F.lit(1)).otherwise(raw_step)

    # NOTE(review): the intermediate cache() is kept from the original; nothing
    # in this function holds a reference that could unpersist it later.
    filtered = (
        with_saturation_points
        .filter(offset >= 0)
        .cache()
        .filter(offset % step == 0)
        .drop("row_number", "saturation_point", "n_craters_max")
    )

    return configs_df.join(filtered, on="simulation_id")

def sample_by_simulation(data: DataFrame,
                         configs: pyspark.RDD,
                         n_craters_to_sample: int) -> DataFrame:
    """
    Sample rows across each whole simulation, with extra samples from its start.

    Rows are numbered per ``simulation_id`` in order of
    ``n_craters_added_in_study_region``. A row is kept when either

    * its row number is below ``early_cutoff`` (50000) and is a multiple of the
      early step ``int(early_cutoff / n_craters_to_sample)``, or
    * its row number modulo ``n_craters_max / n_craters_to_sample`` truncates to 0,
      where ``n_craters_max`` is the simulation's highest row number.

    :param data: statistics rows; must contain ``simulation_id`` and
        ``n_craters_added_in_study_region``.
    :param configs: RDD of config mappings, converted with ``create_configs_df``.
    :param n_craters_to_sample: approximate number of rows targeted by each rule.
    :return: the sampled rows joined to the config columns on ``simulation_id``;
        the helper columns used for sampling are removed.
    """
    configs_df = create_configs_df(configs)

    window = Window.partitionBy("simulation_id").orderBy(F.col("n_craters_added_in_study_region"))
    craters_with_row_number = data.withColumn("row_number", F.row_number().over(window))

    max_row_numbers = craters_with_row_number.groupby("simulation_id").agg(F.max("row_number").alias("n_craters_max"))
    with_max_row_number = craters_with_row_number.join(max_row_numbers, on="simulation_id", how="inner")

    early_cutoff = 50000
    row_number = F.col("row_number")

    # The truncated early step is 0 when more than `early_cutoff` samples are
    # requested. A modulo by 0 would make the early rule match nothing, so
    # clamp the step to 1 (keep every early row) instead.
    raw_early_step = (F.lit(early_cutoff) / n_craters_to_sample).cast("int")
    early_step = F.when(raw_early_step < 1, F.lit(1)).otherwise(raw_early_step)
    in_early_sample = (row_number < F.lit(early_cutoff)) & (row_number % early_step == 0)

    # NOTE(review): here the cast is applied to the modulo *result*, not to the
    # divisor as in the early rule, so the modulus is fractional and any
    # remainder below 1 counts as a hit. Kept as in the original; confirm intent.
    in_overall_sample = (row_number % (F.col("n_craters_max") / n_craters_to_sample)).cast("int") == 0

    filtered = (
        with_max_row_number
        .filter(in_early_sample | in_overall_sample)
        .drop("row_number", "n_craters_max")
    )

    return configs_df.join(filtered, on="simulation_id")

def stratified_sample_by_simulation(data: DataFrame,
                                    n_craters_to_sample_per_bin: int,
                                    n_bins: int,
                                    seed: Optional[int] = None) -> DataFrame:
    """
    Draw a random sample of rows per (simulation, information_remaining bin).

    Each row is assigned the bin ``int(information_remaining * n_bins + 1)``,
    so a value of exactly 1.0 lands in its own bin ``n_bins + 1``. Within each
    simulation/bin group the rows are shuffled by a random number and at most
    ``n_craters_to_sample_per_bin`` of them are kept.

    :param data: rows containing ``simulation_id`` and ``information_remaining``.
    :param n_craters_to_sample_per_bin: maximum number of rows kept per group.
    :param n_bins: multiplier applied to ``information_remaining`` to form bins.
    :param seed: optional seed for the random ordering. ``None`` (the default)
        keeps the previous behaviour of an unseeded ``F.rand()``. With a seed
        the draw is repeatable only as long as the partitioning and row order
        of ``data`` are themselves stable.
    :return: the sampled rows with the temporary helper columns removed.
    """
    bin_index = (F.col("information_remaining") * n_bins + 1).cast("int")
    # Simulation id and bin are packed into one integer key; this stays
    # collision-free only while the bin index is below 10000.
    sampling_key = F.col("simulation_id") * 10000 + bin_index

    random_order = F.rand() if seed is None else F.rand(seed)

    w = Window.partitionBy(F.col("sampling_key")).orderBy(F.col("rnd_"))

    filtered = (data
                   .withColumn("sampling_key", sampling_key)
                   .withColumn("rnd_", random_order)
                   .withColumn("rn_", F.row_number().over(w))
                   .where(F.col("rn_") <= n_craters_to_sample_per_bin)
                   .drop("rn_")
                   .drop("rnd_")
                   .drop("sampling_key")
    )
    return filtered

In [6]:
# Per-simulation parameters, taken from the configs of completed simulations.
configs_df = create_configs_df(read_configs(base_path))

# Read every statistics file, co-locate rows of the same simulation, derive
# information_remaining as (craters currently in the study region) divided by
# (craters added to it), then attach the config columns via a broadcast join.
all_data_df = (
    spark.read.parquet(f"{base_path}/*/statistics_*.parquet")
    .repartition("simulation_id")
    .withColumn(
        "information_remaining",
        F.col("n_craters_in_study_region") / F.col("n_craters_added_in_study_region"),
    )
    .join(F.broadcast(configs_df), on="simulation_id")
    .cache()
)

                                                                                

In [7]:
# Fixed seed so the 80/20 split of simulations is the same on every run.
split_seed = 42
split_rng = np.random.default_rng(split_seed)

simulation_ids = list(configs_df.select("simulation_id").toPandas()["simulation_id"])
n_train_simulations = int(len(simulation_ids) * 0.8)

# Draw from the sorted ids so the result does not depend on the order in which
# Spark happened to return them. .tolist() yields plain Python ints, so the
# isin() filters below do not rely on NumPy scalars being accepted as literals.
train_simulation_ids = set(
    split_rng.choice(sorted(simulation_ids), size=n_train_simulations, replace=False).tolist()
)

# Whole simulations go to either train or test; rows of one simulation are never split.
train_df = all_data_df.filter(all_data_df.simulation_id.isin(train_simulation_ids)).cache()
test_df = all_data_df.filter(~all_data_df.simulation_id.isin(train_simulation_ids)).cache()
all_data_df.unpersist()

DataFrame[simulation_id: bigint, crater_id: bigint, n_craters_added_in_study_region: bigint, n_craters_in_study_region: bigint, areal_density: double, areal_density_overlap_2: double, areal_density_overlap_3: double, center_to_center_nearest_neighbor_distance_mean: double, center_to_center_nearest_neighbor_distance_stdev: double, center_to_center_nearest_neighbor_distance_min: double, center_to_center_nearest_neighbor_distance_max: double, radius_mean: double, radius_stdev: double, z: double, za: double, information_remaining: double, effective_radius_multiplier: double, min_rim_percentage: double, r_stat_multiplier: double, slope: double]

In [8]:
n_bins = 20
n_craters_to_sample_per_bin = [10, 50, 100]

# For every per-bin sample size, draw a stratified sample from the train and
# test splits, collect each to pandas and write it under base_path.
for n in n_craters_to_sample_per_bin:
    print(n)

    train_sample_pd = stratified_sample_by_simulation(train_df, n, n_bins).toPandas()
    train_sample_pd.to_parquet(f"{base_path}/train_{n}.parquet")

    test_sample_pd = stratified_sample_by_simulation(test_df, n, n_bins).toPandas()
    test_sample_pd.to_parquet(f"{base_path}/test_{n}.parquet")

10


23/09/17 01:11:24 WARN MemoryStore: Not enough space to cache rdd_22_95 in memory! (computed 225.4 MiB so far)
23/09/17 01:11:24 WARN BlockManager: Persisting block rdd_22_95 to disk instead.
23/09/17 01:11:24 WARN MemoryStore: Not enough space to cache rdd_22_95 in memory! (computed 144.3 MiB so far)
23/09/17 01:11:24 WARN MemoryStore: Not enough space to cache rdd_22_103 in memory! (computed 1327.0 KiB so far)
23/09/17 01:11:24 WARN BlockManager: Persisting block rdd_22_103 to disk instead.
23/09/17 01:11:24 WARN MemoryStore: Not enough space to cache rdd_22_99 in memory! (computed 146.5 MiB so far)
23/09/17 01:11:24 WARN BlockManager: Persisting block rdd_22_99 to disk instead.
23/09/17 01:11:24 WARN MemoryStore: Not enough space to cache rdd_22_83 in memory! (computed 358.8 MiB so far)
23/09/17 01:11:24 WARN BlockManager: Persisting block rdd_22_83 to disk instead.
23/09/17 01:11:24 WARN MemoryStore: Not enough space to cache rdd_22_101 in memory! (computed 22.2 MiB so far)
23/09/1

50


23/09/17 01:14:41 WARN MemoryStore: Not enough space to cache rdd_22_71 in memory! (computed 189.5 MiB so far)
23/09/17 01:14:41 WARN MemoryStore: Not enough space to cache rdd_22_78 in memory! (computed 84.0 MiB so far)
23/09/17 01:14:43 WARN MemoryStore: Not enough space to cache rdd_22_83 in memory! (computed 149.1 MiB so far)
23/09/17 01:14:43 WARN MemoryStore: Not enough space to cache rdd_22_101 in memory! (computed 42.6 MiB so far)
23/09/17 01:14:43 WARN MemoryStore: Not enough space to cache rdd_22_95 in memory! (computed 82.9 MiB so far)
23/09/17 01:14:43 WARN MemoryStore: Not enough space to cache rdd_22_102 in memory! (computed 22.0 MiB so far)
23/09/17 01:14:44 WARN MemoryStore: Not enough space to cache rdd_22_103 in memory! (computed 22.0 MiB so far)
23/09/17 01:14:44 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_22_107 in memory.
23/09/17 01:14:44 WARN MemoryStore: Not enough space to cache rdd_22_106 in memory! (compu

100


23/09/17 01:16:34 WARN MemoryStore: Not enough space to cache rdd_22_64 in memory! (computed 355.3 MiB so far)
23/09/17 01:16:34 WARN MemoryStore: Not enough space to cache rdd_22_76 in memory! (computed 355.2 MiB so far)
23/09/17 01:16:35 WARN MemoryStore: Not enough space to cache rdd_22_78 in memory! (computed 42.4 MiB so far)
23/09/17 01:16:35 WARN MemoryStore: Not enough space to cache rdd_22_83 in memory! (computed 1352.0 KiB so far)
23/09/17 01:16:38 WARN MemoryStore: Not enough space to cache rdd_22_101 in memory! (computed 83.8 MiB so far)
23/09/17 01:16:38 WARN MemoryStore: Not enough space to cache rdd_22_95 in memory! (computed 144.3 MiB so far)
23/09/17 01:16:38 WARN MemoryStore: Not enough space to cache rdd_22_102 in memory! (computed 84.2 MiB so far)
23/09/17 01:16:38 WARN MemoryStore: Not enough space to cache rdd_22_107 in memory! (computed 42.7 MiB so far)
23/09/17 01:16:38 WARN MemoryStore: Not enough space to cache rdd_22_109 in memory! (computed 41.9 MiB so far)
2

In [None]:
# Print a tabular preview of the per-simulation configuration parameters.
configs_df.show()

In [None]:
# NOTE(review): configs_df_repartitioned is not read anywhere in this cell.
configs_df_repartitioned = configs_df.repartition("simulation_id").cache()

# Write one parquet file per simulation, with rows ordered by the number of
# craters added to the study region. Each iteration filters the full dataset.
for simulation_id in simulation_ids:
    simulation_pd = (
        all_data_df
        .where(F.col("simulation_id") == simulation_id)
        .orderBy(F.col("n_craters_added_in_study_region"))
        .toPandas()
    )
    simulation_pd.to_parquet(f"{base_path}/simulation_{simulation_id}.parquet")