In [17]:
# Root directory that holds the simulation runs to summarise.
# Other roots that have been used with this notebook (swap in as needed):
#   "/data/saturation/random_runs_large"
#   "/data/saturation/random_runs_large_bungled"
#   "/data/saturation/r_stat_changes"
#   "/data/saturation/r_stat_changes_smaller_min"
#   "/data/saturation/tests/test_timings_cell_sizes_buffer_31_boundaries"
base_path = "/data/saturation/central_composite_design/ccd4"

# Thread count N used in the Spark master URL "local[N]".
n_cores = 15

### Calculating post-saturation statistics

In [18]:
import pandas as pd
from pathlib import Path
import glob
import yaml
from typing import Dict
from functools import reduce

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame, Row

In [19]:
def read_config(path: Path) -> Dict:
    """Parse the YAML file at ``path`` with ``yaml.safe_load`` and return the result."""
    with path.open("r") as handle:
        return yaml.safe_load(handle)

def normalize_config_output_path(config: Dict) -> Dict:
    """Replace ``config["output_path"]`` with a ``Path`` (in place) and return the same dict."""
    config.update(output_path=Path(config["output_path"]))
    return config

def get_statistics(series: pd.Series) -> Dict[str, float]:
    """Return summary statistics of ``series`` keyed by name.

    Keys, in order: "1_percentile" ... "99_percentile", "99.9_percentile",
    "max", "min", "median", "mean" and "stdev".
    """
    result = {}

    # Whole-number percentiles 1 through 99.
    for quantile in range(1, 100):
        result[f"{quantile:.0f}_percentile"] = series.quantile(quantile / 100)

    # Tail percentile and the usual location/spread summaries.
    result["99.9_percentile"] = series.quantile(.999)
    result["max"] = series.max()
    result["min"] = series.min()
    result["median"] = series.median()
    result["mean"] = series.mean()
    result["stdev"] = series.std()
    return result

def read_parquet_and_calculate_statistics(config: Dict) -> Row:
    """Summarise one completed run as a Spark ``Row``.

    Reads ``completed.txt`` and every ``statistics*.parquet`` file under
    ``config["output_path"]``, orders the combined rows by ``crater_id`` and
    computes summary statistics for the rows from the saturation point onwards.

    Args:
        config: Run configuration. ``config["output_path"]`` must be a ``Path``;
            the keys "slope", "effective_radius_multiplier",
            "r_stat_multiplier" and "min_rim_percentage" are copied to the result.

    Returns:
        A ``Row`` holding, in this order: ``<column>_<statistic>`` for each
        analysed column, the copied config fields, the
        ``areal_density_index_*_percentile_max`` ratios, ``path``,
        ``n_craters_at_completion``, ``n_craters_saturation`` and
        ``run_duration_seconds``.

    Raises:
        ValueError: If no ``statistics*.parquet`` file exists in the output path.
    """
    fields_to_add_from_config = [
        "slope",
        "effective_radius_multiplier",
        "r_stat_multiplier",
        "min_rim_percentage"
    ]
    columns_to_calculate_stats = [
        "areal_density",
        "z",
        "za",
        "n_craters_in_study_region",
        "n_craters_added_in_study_region"
    ]
    # Output-key label -> quantile of the whole-run areal density.
    areal_density_index_quantiles = {
        "90": .9,
        "95": .95,
        "97": .97,
        "99": .99,
        "995": .995,
    }

    output_path = config["output_path"]

    # The second space-separated token on the first line is parsed as the run duration.
    with (output_path / "completed.txt").open("r") as f:
        duration = float(f.readline().split(" ")[1])

    # Read all statistics parquet files; sorted so the read order does not
    # depend on the order in which the filesystem lists them.
    filenames = sorted(output_path.glob("statistics*.parquet"))
    if not filenames:
        raise ValueError(f"No statistics*.parquet files found in {output_path}")

    df = pd.concat([pd.read_parquet(filename) for filename in filenames])
    # After the reset, index labels equal row positions in crater_id order.
    df = df.sort_values(by=["crater_id"]).reset_index(drop=True)
    n_craters_at_completion = df.shape[0]

    # Keep the whole-run areal density; df is truncated below.
    ad = df["areal_density"]
    n_rows = ad.shape[0]

    # Saturation point: the later of the rows at which areal density and the
    # crater count in the study region reach their maxima.
    saturation_point = max(df["areal_density"].idxmax(), df["n_craters_in_study_region"].idxmax())
    df = df.iloc[saturation_point:]

    # Calculate post-saturation statistics
    stats = {
        f"{column}_{name}": float(value)
        for column in columns_to_calculate_stats
        for name, value in get_statistics(df[column]).items()
    }
    stats.update({field: config[field] for field in fields_to_add_from_config})

    # First row whose areal density reaches each whole-run quantile, expressed
    # as a fraction of the total number of rows.
    for label, quantile in areal_density_index_quantiles.items():
        first_index = ad[ad >= ad.quantile(quantile)].index[0]
        stats[f"areal_density_index_{label}_percentile_max"] = float(first_index / n_rows)

    stats["path"] = str(output_path)
    stats["n_craters_at_completion"] = n_craters_at_completion
    stats["n_craters_saturation"] = saturation_point
    stats["run_duration_seconds"] = duration
    return Row(**stats)

In [20]:
# Build (or reuse) a local Spark session that runs on n_cores threads.
spark = (
    SparkSession.builder
    .master(f"local[{n_cores}]")
    .appName("Saturation")
    .config("spark.driver.memory", "48g")
    .getOrCreate()
)
sc = spark.sparkContext

In [21]:
# Every run that wrote a completed.txt marker two directory levels below base_path.
completed_filenames = list(Path(base_path).glob("*/*/completed.txt"))
# completed_filenames = completed_filenames + list(Path(base_path).glob("center*/*/completed.txt"))

# Load the config.yaml stored next to each marker and turn its output_path into a Path.
configs = [
    normalize_config_output_path(read_config(marker.parent / "config.yaml"))
    for marker in completed_filenames
]
# configs is rebound to the RDD; later cells call configs.take(...) on it.
configs = sc.parallelize(configs)

# One Row of statistics per run, cached as a Spark DataFrame.
statistics = configs.map(read_parquet_and_calculate_statistics)
statistics = statistics.toDF().cache()

In [22]:
# Collect the per-run statistics to the driver as a pandas DataFrame.
stats_df = (
    statistics
    .toPandas()
    .reset_index(drop=True)
    .copy()
)

                                                                                

In [23]:
# Write the collected statistics to a CSV file directly under base_path (no index column).
# NOTE(review): the lookup section further down reads "post_saturation_statistics.csv",
# not this "_corners" file -- confirm which file is intended.
stats_df.to_csv(f"{base_path}/post_saturation_statistics_corners.csv", index=False)

In [24]:
# Largest value of the 99th-percentile index ratio across all runs.
stats_df["areal_density_index_99_percentile_max"].max()

0.977988

In [None]:
# Take one config from the configs RDD so the loading steps can be run by hand below.
config = configs.take(1)[0]

In [None]:
# Inline copy of the loading steps of read_parquet_and_calculate_statistics,
# run on a single config so the intermediate objects can be inspected.
fields_to_add_from_config = ["slope", "effective_radius_multiplier", "r_stat_multiplier", "min_rim_percentage"]
columns_to_calculate_stats = [
    "areal_density", "z", "za",
    "n_craters_in_study_region", "n_craters_added_in_study_region",
]

# The second space-separated token on the first line is parsed as the run duration.
with (config["output_path"] / "completed.txt").open("r") as f:
    duration = float(f.readlines()[0].split(" ")[1])

# Gather every statistics parquet file of the run.
filenames = config["output_path"].glob("statistics*.parquet")

dataframes = []
for filename in filenames:
    dataframes.append(pd.read_parquet(filename))

# One frame ordered by crater_id; after the reset, index labels equal row positions.
df = pd.concat(dataframes).sort_values(by=["crater_id"]).reset_index(drop=True)
n_craters_at_completion = len(df)

ad = df["areal_density"]

In [None]:
# Index label of the smallest areal density at or above the 0.9 quantile, divided by the row count.
float(ad.loc[ad >= ad.quantile(0.9)].idxmin() / len(ad))

In [None]:
# Index label of the smallest areal density at or above the 0.95 quantile, divided by the row count.
float(ad.loc[ad >= ad.quantile(0.95)].idxmin() / len(ad))

In [None]:
# Index label of the smallest areal density at or above the 0.97 quantile, divided by the row count.
float(ad.loc[ad >= ad.quantile(0.97)].idxmin() / len(ad))

In [None]:
# Index label of the smallest areal density at or above the 0.99 quantile, divided by the row count.
float(ad.loc[ad >= ad.quantile(0.99)].idxmin() / len(ad))

In [None]:
# Index label of the smallest areal density at or above the 0.9 quantile, divided by the row count.
# NOTE(review): an earlier cell already evaluates the 0.9 quantile -- confirm whether another quantile was meant here.
float(ad.loc[ad >= ad.quantile(0.9)].idxmin() / len(ad))

In [None]:
# Areal density value at the 0.97 quantile of the whole run.
ad.quantile(0.97)

In [None]:
# Areal density value at the 0.99 quantile of the whole run.
ad.quantile(0.99)

In [None]:
# First index label at which areal density is at or above its 0.97 quantile.
ad.loc[ad >= ad.quantile(0.97)].index[0]

In [None]:
# First index label at which areal density is at or above its 0.99 quantile.
ad.loc[ad >= ad.quantile(0.99)].index[0]

### Creating a lookup function

In [None]:
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
from typing import *

In [None]:
# Make "iframe" the default plotly renderer for figures shown in this notebook.
pio.renderers.default = "iframe"

In [None]:
# Reload the per-run statistics from CSV; this rebinds stats_df.
# NOTE(review): the export cell above writes "post_saturation_statistics_corners.csv",
# while this reads "post_saturation_statistics.csv" -- confirm this is the intended file.
stats_df = pd.read_csv(f"{base_path}/post_saturation_statistics.csv")

In [None]:
def lookup_statistics(*,
                      data: pd.DataFrame,
                      slope_range: tuple[float, float],
                      effective_radius_multiplier_range: tuple[float, float],
                      r_stat_multiplier_range: tuple[float, float],
                      min_rim_percentage_range: tuple[float, float]) -> pd.DataFrame:
    """Return the rows of ``data`` whose four run parameters lie within the given ranges.

    Each range is unpacked into ``Series.between`` for the column of the same
    name. The result is a re-indexed copy; ``data`` itself is not modified.
    """
    bounds_by_column = {
        "slope": slope_range,
        "effective_radius_multiplier": effective_radius_multiplier_range,
        "r_stat_multiplier": r_stat_multiplier_range,
        "min_rim_percentage": min_rim_percentage_range,
    }

    # Narrow the frame one column at a time, in the order listed above.
    selected = data
    for column, bounds in bounds_by_column.items():
        selected = selected[selected[column].between(*bounds)]

    return selected.reset_index(drop=True).copy()

In [None]:
# Select the runs whose four parameters fall inside these ranges
# (lookup_statistics filters each column with Series.between).
lookup_result = lookup_statistics(
    data=stats_df,
    slope_range=(1.8, 2.0),
    effective_radius_multiplier_range=(1.3, 1.7),
    r_stat_multiplier_range=(5.5, 6.5),
    min_rim_percentage_range=(0.4, 0.5)
)

In [None]:
# Mean, standard deviation and interquartile range of areal_density_max over the selected runs.
series = lookup_result["areal_density_max"]
print(f"Mean={series.mean():.3f}, Stdev={series.std():.3f}, IQR=({series.quantile(0.25):.3f}, {series.quantile(0.75):.3f})")

In [None]:
# Mean, standard deviation and interquartile range of areal_density_95_percentile over the selected runs.
series = lookup_result["areal_density_95_percentile"]
print(f"Mean={series.mean():.3f}, Stdev={series.std():.3f}, IQR=({series.quantile(0.25):.3f}, {series.quantile(0.75):.3f})")

In [None]:
# Mean, standard deviation and interquartile range of areal_density_median over the selected runs.
series = lookup_result["areal_density_median"]
print(f"Mean={series.mean():.3f}, Stdev={series.std():.3f}, IQR=({series.quantile(0.25):.3f}, {series.quantile(0.75):.3f})")

In [None]:
# Mean, standard deviation and interquartile range of n_craters_in_study_region_median over the selected runs.
series = lookup_result["n_craters_in_study_region_median"]
print(f"Mean={series.mean():.3f}, Stdev={series.std():.3f}, IQR=({series.quantile(0.25):.3f}, {series.quantile(0.75):.3f})")

In [None]:
# Smallest areal_density_median among the selected runs.
lookup_result["areal_density_median"].min()

In [None]:
# Largest areal_density_median among the selected runs.
lookup_result["areal_density_median"].max()