In [1]:
import pandas as pd
from pathlib import Path
import glob
import yaml
from typing import *
from functools import reduce

import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, DataFrame, Row, Window

In [2]:
# Root directory of the experiment: simulation sub-directories are globbed
# from here, and the result files produced below are written back into it.
base_path = "/data/saturation/central_composite_design/ccd5"
# Number of worker threads for the local Spark master (used as local[n_cores]).
n_cores = 30

In [3]:
def read_config(path: Path) -> Dict:
    """Parse the YAML file at ``path`` and return its contents.

    Args:
        path: Location of the YAML configuration file.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file.
    """
    with path.open("r") as handle:
        return yaml.safe_load(handle)


def get_aggregations_for_column(col: str) -> Iterable:
    """Yield the aliased aggregation expressions computed for one column.

    In order, the generator produces:
      * the 5th to 95th percentiles in steps of 5 (``{col}_{p}_percentile``),
      * the 99th percentile (``{col}_99_percentile``),
      * the median (``{col}_median``),
      * min, max, mean and sample standard deviation
        (``{col}_min``, ``{col}_max``, ``{col}_mean``, ``{col}_stdev``).

    Args:
        col: Name of the column to aggregate.
    """
    def percentile(fraction, suffix):
        # Every percentile uses the same approximation accuracy
        return F.percentile_approx(col, fraction, accuracy=int(1e6)).alias(f"{col}_{suffix}")

    # Percentiles 5 to 95 step 5
    yield from (
        percentile(step / 20, f"{step * 5}_percentile")
        for step in range(1, 20)
    )

    yield percentile(.99, "99_percentile")
    yield percentile(.50, "median")

    yield F.min(col).alias(f"{col}_min")
    yield F.max(col).alias(f"{col}_max")
    yield F.mean(col).alias(f"{col}_mean")
    yield F.stddev_samp(col).alias(f"{col}_stdev")
    

def calculate_stats(data: DataFrame, n_craters_in_saturation: int = 50000) -> DataFrame:
    """Compute per-simulation summary statistics over the tail of each simulation.

    Rows are ranked within each ``simulation_id`` by
    ``n_craters_added_in_study_region`` in descending order. Only the first
    ``n_craters_in_saturation`` rows of each simulation are kept, and the
    aggregations from ``get_aggregations_for_column`` are then applied to
    every column of interest.

    Args:
        data: Statistics rows containing ``simulation_id`` and the columns
            listed in ``columns_to_calculate_stats``.
        n_craters_in_saturation: Number of rows from the end of each
            simulation to treat as being in saturation.

    Returns:
        A DataFrame with one row per ``simulation_id`` and one column per
        (column, aggregation) pair.
    """
    columns_to_calculate_stats = [
        "areal_density",
        "z",
        "za",
        "n_craters_in_study_region",
        "n_craters_added_in_study_region"
    ]

    # Grab the last n_craters_in_saturation rows from each simulation.
    # The previous code filtered on a literal 50 here, silently ignoring the
    # intended saturation window of 50000.
    window = Window.partitionBy("simulation_id").orderBy(F.col("n_craters_added_in_study_region").desc())
    last_n_craters_by_sim = (
        data.withColumn("row_number", F.row_number().over(window))
        .filter(F.col("row_number") <= n_craters_in_saturation)
        .drop("row_number")
    )

    # Set up the aggregations for each column of interest
    aggregations = [
        aggregation
        for col in columns_to_calculate_stats
        for aggregation in get_aggregations_for_column(col)
    ]

    return last_n_craters_by_sim.groupBy("simulation_id").agg(*aggregations)

In [4]:
# Start (or reuse) a local Spark session with n_cores worker threads
# and a 48g driver.
spark = (
    SparkSession.builder
    .master(f"local[{n_cores}]")
    .appName("Saturation")
    .config("spark.driver.memory", "48g")
    .getOrCreate()
)
sc = spark.sparkContext

23/01/31 19:24:22 WARN Utils: Your hostname, muninn resolves to a loopback address: 127.0.1.1; using 192.168.86.20 instead (on interface enp8s0)
23/01/31 19:24:22 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/01/31 19:24:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/01/31 19:24:22 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [5]:
# Read the config of every simulation directory that contains a completed.txt
# marker, and turn the parsed configs into a Spark DataFrame
completed_filenames = list(Path(base_path).glob("*/completed.txt"))
configs = sc.parallelize(
    [read_config(marker.parent / "config.yaml") for marker in completed_filenames]
).toDF()

# Preview a handful of configs
configs.limit(5).toPandas()

Unnamed: 0,effective_radius_multiplier,max_crater_radius,min_crater_radius,min_rim_percentage,output_path,r_stat_multiplier,simulation_id,simulation_name,slope,spatial_hash_cell_size,stop_condition,study_region_padding,study_region_size,write_crater_removals_cadence,write_craters_cadence,write_image_cadence,write_state_cadence,write_statistics_cadence
0,1.901652,2500,10,0.504785,/data/saturation/central_composite_design/ccd5...,2.999645,18,ccd_1.902_0.505_3.000_1.103,1.103217,50,"{'name': None, 'min_craters': 500000, 'percent...",1250,10000,100000,100000,10000,0,100000
1,1.102979,2500,10,0.498455,/data/saturation/central_composite_design/ccd5...,9.003282,146,ccd_1.103_0.498_9.003_2.047,2.046712,50,"{'name': None, 'min_craters': 500000, 'percent...",1250,10000,100000,100000,10000,0,100000
2,1.901115,2500,10,0.502025,/data/saturation/central_composite_design/ccd5...,6.015037,44,ccd_1.901_0.502_6.015_1.093,1.093015,50,"{'name': None, 'min_craters': 500000, 'percent...",1250,10000,100000,100000,10000,0,100000
3,1.094694,2500,10,0.697573,/data/saturation/central_composite_design/ccd5...,3.012605,102,ccd_1.095_0.698_3.013_2.062,2.06158,50,"{'name': None, 'min_craters': 500000, 'percent...",1250,10000,100000,100000,10000,0,100000
4,1.1,2500,10,0.5,/data/saturation/central_composite_design/ccd5...,9.0,145,ccd_1.100_0.500_9.000_2.050,2.05,50,"{'name': None, 'min_craters': 500000, 'percent...",1250,10000,100000,100000,10000,0,100000


In [6]:
# Read statistics: load the statistics_*.parquet files found in every
# sub-directory of base_path into a single DataFrame and mark it for caching.
stats_df = spark.read.parquet(f"{base_path}/*/statistics_*.parquet").cache()

                                                                                

In [8]:
# Aggregate the statistics rows into one row of summary statistics per simulation_id
data = calculate_stats(stats_df)

In [9]:
# Re-read the configs of all completed simulations, this time keeping them
# as an RDD of parsed config dictionaries
completed_filenames = list(Path(base_path).glob("*/completed.txt"))
configs = sc.parallelize(
    [read_config(marker.parent / "config.yaml") for marker in completed_filenames]
)

In [10]:
# Config fields carried into the output: the join key (simulation_id) plus
# the four per-simulation parameters.
config_columns = [
    "simulation_id",
    "slope",
    "r_stat_multiplier",
    "effective_radius_multiplier",
    "min_rim_percentage"
]
# Convert the config RDD to a DataFrame and keep only the columns above
configs_df = configs.toDF().select(config_columns)

In [11]:
# Attach each simulation's config parameters to its summary statistics
joined = configs_df.join(data, on="simulation_id")

In [12]:
# Write out the result: collect the joined frame to the driver as a pandas
# DataFrame and save it as CSV (without the index) under base_path.
joined.toPandas().to_csv(f"{base_path}/post_saturation_statistics.csv", index=False)

                                                                                

In [46]:
# Prepare a dataframe with all data points

In [None]:
def read_config(path: Path) -> Dict:
    """Load a YAML configuration file.

    Args:
        path: Path of the YAML file to read.

    Returns:
        The object produced by ``yaml.safe_load`` for that file.
    """
    with path.open("r") as yaml_file:
        parsed = yaml.safe_load(yaml_file)
        return parsed


def calculate_stats(data: DataFrame, n_craters_in_saturation: int = 50000) -> DataFrame:
    """Compute per-simulation summary statistics over the tail of each simulation.

    Rows are ranked within each ``simulation_id`` by
    ``n_craters_added_in_study_region`` in descending order. Only the first
    ``n_craters_in_saturation`` rows of each simulation are kept, and the
    aggregations from ``get_aggregations_for_column`` are then applied to
    every column of interest.

    Args:
        data: Statistics rows containing ``simulation_id`` and the columns
            listed in ``columns_to_calculate_stats``.
        n_craters_in_saturation: Number of rows from the end of each
            simulation to treat as being in saturation. This used to be read
            from a module-level ``N_CRATERS_IN_SATURATION`` that is never
            defined, so every call raised ``NameError``.

    Returns:
        A DataFrame with one row per ``simulation_id`` and one column per
        (column, aggregation) pair.
    """
    columns_to_calculate_stats = [
        "areal_density",
        "z",
        "za",
        "n_craters_in_study_region",
        "n_craters_added_in_study_region"
    ]

    # Grab the last n_craters_in_saturation rows from each simulation
    window = Window.partitionBy("simulation_id").orderBy(F.col("n_craters_added_in_study_region").desc())
    last_n_craters_by_sim = (
        data.withColumn("row_number", F.row_number().over(window))
        .filter(F.col("row_number") <= n_craters_in_saturation)
        .drop("row_number")
    )

    # Set up the aggregations for each column of interest
    aggregations = [
        aggregation
        for col in columns_to_calculate_stats
        for aggregation in get_aggregations_for_column(col)
    ]

    return last_n_craters_by_sim.groupBy("simulation_id").agg(*aggregations)

In [13]:
# Prepare a post-saturation sample from each simulation

In [75]:
def read_config(path: Path) -> Dict:
    """Read one YAML config file and return its parsed contents.

    Args:
        path: Path of the YAML file to read.

    Returns:
        The result of ``yaml.safe_load`` on the opened file.
    """
    with path.open("r") as stream:
        return yaml.safe_load(stream)

def read_configs(base_path: str) -> pyspark.RDD:
    """Load the config of every completed simulation under ``base_path``.

    A simulation directory counts as completed when it contains a
    ``completed.txt`` file; its ``config.yaml`` is parsed with ``read_config``.

    Args:
        base_path: Directory whose immediate sub-directories hold simulations.

    Returns:
        An RDD holding one parsed config per completed simulation.
    """
    config_paths = [
        marker.parent / "config.yaml"
        for marker in Path(base_path).glob("*/completed.txt")
    ]
    return sc.parallelize([read_config(config_path) for config_path in config_paths])

def create_configs_df(configs: pyspark.RDD) -> DataFrame:
    """Convert the config RDD to a DataFrame restricted to the columns of interest.

    Args:
        configs: RDD of parsed simulation configs.

    Returns:
        A DataFrame with ``simulation_id`` and the four per-simulation
        parameters selected below.
    """
    all_config_fields = configs.toDF()
    return all_config_fields.select(
        [
            "simulation_id",
            "slope",
            "r_stat_multiplier",
            "effective_radius_multiplier",
            "min_rim_percentage",
        ]
    )

def sample_by_simulation(data: DataFrame,
                         configs: pyspark.RDD,
                         n_craters_in_saturation: int,
                         n_craters_to_sample: int) -> DataFrame:
    """Draw a stratified sample from the tail of every simulation.

    For each ``simulation_id`` the first ``n_craters_in_saturation`` rows,
    ordered by ``n_craters_added_in_study_region`` descending, are kept.
    Those rows are then sampled per simulation with ``sampleBy`` at a
    fraction of ``n_craters_to_sample / n_craters_in_saturation`` (seed 0),
    and the sample is joined with the selected config columns.

    Args:
        data: Statistics rows for the simulations to sample from.
        configs: RDD of parsed simulation configs.
        n_craters_in_saturation: Number of trailing rows per simulation to
            sample from.
        n_craters_to_sample: Numerator of the per-simulation sampling fraction.

    Returns:
        The sampled rows joined with their config columns on ``simulation_id``.
    """
    configs_df = create_configs_df(configs)

    # Rank rows within each simulation, largest crater count first
    newest_first = Window.partitionBy("simulation_id").orderBy(
        F.col("n_craters_added_in_study_region").desc()
    )
    ranked = data.withColumn("row_number", F.row_number().over(newest_first))
    saturated = (
        ranked
        .filter(F.col("row_number") <= n_craters_in_saturation)
        .drop("row_number")
        .cache()
    )

    # Same sampling fraction for every simulation present in the tail
    simulation_rows = saturated.select("simulation_id").distinct().collect()
    fractions = {
        row.simulation_id: n_craters_to_sample / n_craters_in_saturation
        for row in simulation_rows
    }
    sampled = saturated.sampleBy("simulation_id", fractions=fractions, seed=0)

    return configs_df.join(sampled, on="simulation_id")

def save_holdout_simulations(holdout_simulations: DataFrame,
                             holdout_simulation_ids: Iterable[int],
                             configs: pyspark.RDD) -> None:
    """Write each holdout simulation to its own parquet file.

    For every id, the matching rows are joined with the selected config
    columns, collected to pandas and saved as
    ``{base_path}/simulation_{id}.parquet`` (``base_path`` is the
    notebook-level global).

    Args:
        holdout_simulations: Statistics rows of the held-out simulations.
        holdout_simulation_ids: Ids of the simulations to write.
        configs: RDD of parsed simulation configs.
    """
    configs_df = create_configs_df(configs)

    for simulation_id in holdout_simulation_ids:
        rows_for_simulation = holdout_simulations.filter(
            F.col("simulation_id") == simulation_id
        )
        (
            configs_df
            .join(rows_for_simulation, on="simulation_id")
            .toPandas()
            .to_parquet(f"{base_path}/simulation_{simulation_id}.parquet")
        )

In [67]:
# Hold out every 9th simulation id starting at 4 (4, 13, ..., 238): 27 ids in total
holdout_simulation_ids = {x * 9 + 4 for x in range(27)}

# Load all statistics, then split them into holdout simulations and the
# remaining ("model") simulations by simulation_id
stats_df = spark.read.parquet(f"{base_path}/*/statistics_*.parquet").cache()
holdout_simulations = stats_df.filter(F.col("simulation_id").isin(holdout_simulation_ids))
model_simulations = stats_df.filter(~F.col("simulation_id").isin(holdout_simulation_ids))
# RDD of parsed configs for every completed simulation
configs = read_configs(base_path)

                                                                                

23/02/07 18:02:54 WARN CacheManager: Asked to cache already cached data.


In [68]:
# Sample the last 50000 rows of each non-holdout simulation at a per-simulation
# fraction of 5000/50000, then collect to pandas and write the sample as CSV
sample = sample_by_simulation(model_simulations, configs, 50000, 5000)
sample.toPandas().to_csv(f"{base_path}/post_saturation_sample_5000.csv", index=False)

                                                                                

In [76]:
# Write one parquet file per holdout simulation, joined with its config columns
save_holdout_simulations(holdout_simulations, holdout_simulation_ids, configs)