In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys

import pandas as pd
import numpy as np
from pathlib import Path
import yaml
from typing import *
import glob

import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import matplotlib
import matplotlib.pyplot as plt

import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, DataFrame, Window
import itertools

from saturation.utils import *

# Make "iframe" the default plotly renderer for every figure shown below.
pio.renderers.default = "iframe"

In [3]:
# Number of local worker threads handed to Spark's local[N] master.
n_cores = 28

# Spark settings applied to the session, in the order they are set.
spark_settings = (
    ("spark.sql.shuffle.partitions", "500"),
    ("spark.driver.memory", "60g"),
    ("spark.driver.maxResultSize", "8g"),
)

# Build the session step by step instead of as one long method chain.
session_builder = SparkSession.builder.master(f"local[{n_cores}]").appName("Saturation")
for setting_name, setting_value in spark_settings:
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

In [29]:
# Root directory of the simulation run analysed in this notebook.
# Alternative run:
# base_path = "/data/saturation/thesis_run_20240130"
base_path = "/data/saturation/thesis_run_shallow_slopes_20240420"

# Alternative parameter set (presumably paired with the alternative run above):
# study_region_size = 12000
# r_stat = 3
# study_region_padding = int(study_region_size * 0.125)

# r_stat is used further down as the lower radius bound of a slope fit.
r_stat = 5
study_region_size = 4000
# Padding is 12.5% of the study region size, truncated to an integer.
study_region_padding = int(0.125 * study_region_size)

In [30]:
# Read the run configurations found under base_path, turn them into a Spark
# DataFrame, and collect that DataFrame to the driver as a pandas DataFrame.
raw_configs = read_configs(base_path, spark)
configs_df = create_configs_df(raw_configs)
configs_pd = configs_df.toPandas()

In [31]:
# Configurations whose slope lies between 1.1 and 1.2, ordered by slope.
slope_in_range = configs_pd.slope.between(1.1, 1.2)
configs_pd.loc[slope_in_range].sort_values(by="slope")

In [32]:
# Merge the "run_configurations" entries of every config_*.yaml file into one
# dict. The cells below index it by simulation_id and read "simulation_name".
configs_dict = {}
config_file_paths = glob.glob(f"{base_path}/config/config_*.yaml")
for config_file in config_file_paths:
    run_configurations = read_config(Path(config_file))["run_configurations"]
    for run_config in run_configurations:
        configs_dict.update(run_config)

## Example cumulative size-frequency distribution (CSFD)

In [10]:
# Steep slope, 2.7
# NOTE(review): base_path currently points at the shallow-slopes run; confirm
# that simulation_id 6930 exists there (this cell may last have been executed
# against a different base_path).
simulation_id = 6930
target_n_craters_added_in_study_region = 3000
est_saturation_n = 40

sim_name = configs_dict[simulation_id]["simulation_name"]
sim_path = f"{base_path}/{sim_name}"

# Load the three parquet datasets written for this simulation.
stats_df, craters_df, removals_df = (
    spark.read.parquet(f"{sim_path}/statistics_*.parquet"),
    spark.read.parquet(f"{sim_path}/craters_*.parquet"),
    spark.read.parquet(f"{sim_path}/crater_removals_*.parquet"),
)

# State for the target number of craters added in the study region.
state = get_state_at_time(
    stats_df, craters_df, removals_df,
    simulation_id,
    target_n_craters_added_in_study_region,
    study_region_size, study_region_padding,
    spark
)

# Fit the cumulative slope from the smallest radius in the state up to
# est_saturation_n, then fit the intercept on the radii below est_saturation_n.
crater_radii = state.radius
smallest_radius = crater_radii.min()
estimated_slope, sigma = estimate_cumulative_slope(crater_radii, smallest_radius, est_saturation_n)

radii_below_saturation = crater_radii[crater_radii < est_saturation_n]
intercept = estimate_intercept(radii_below_saturation, estimated_slope)

print(f"estimated slope={estimated_slope:.3f}, sigma={sigma:.3f}")
plot_sfds(state, -estimated_slope, intercept=intercept)

In [12]:
# Plot just the cumulative size-frequency distribution (CSFD) of `state`
# on log-log axes and save it to figures/example_csfd.png.
radii = state.radius.sort_values()

# Track min and max radii
min_radius = radii.min()
max_radius = radii.max()

# With radii sorted ascending, the k-th smallest radius (1-based) has
# n - k + 1 craters at least as large, so the counts run from n down to 1.
# (The previous range(len(radii) + 1, 1, -1) ran from n + 1 down to 2.)
cumulative_counts = range(len(radii), 0, -1)

plt.plot(radii, cumulative_counts, label="Observed")
plt.xlabel("$R$")
# Raw string: "\g" is not a valid escape sequence in a normal string literal.
plt.ylabel(r"$N(\geq R)$")

plt.subplots_adjust(right=0.7)
plt.tight_layout(rect=[0, 0, 0.9, 1])

plt.xscale('log')
plt.yscale('log')

# savefig does not create missing directories, so make sure the target exists.
Path("figures").mkdir(parents=True, exist_ok=True)
plt.savefig("figures/example_csfd.png")

plt.show()

## Shallow slope

In [34]:
# Display the table of run configurations.
# Alternative view, restricted to slopes between 1.0 and 1.1 and ordered by
# min_rim_percentage:
# configs_pd[configs_pd.slope.between(1.0, 1.1)].sort_values("min_rim_percentage")
configs_pd

In [35]:
# Shallow slope, 1.065
simulation_id = 5
est_saturation_n = 100
target_n_craters_added_in_study_region = [100, 500, 1000, 5000, 10000]

# Resolve the simulation's output directory from its configuration entry.
sim_name = configs_dict[simulation_id]["simulation_name"]
sim_path = f"{base_path}/{sim_name}"

# Load the three parquet datasets written for this simulation; the cells
# below pass them to get_state_at_time.
stats_df, craters_df, removals_df = (
    spark.read.parquet(f"{sim_path}/statistics_*.parquet"),
    spark.read.parquet(f"{sim_path}/craters_*.parquet"),
    spark.read.parquet(f"{sim_path}/crater_removals_*.parquet"),
)

In [36]:
target_n_craters_added_in_study_region = [100, 500, 1000, 5000, 10000]

# One state per target crater count, keyed by that count.
states = dict(
    (
        n_added,
        get_state_at_time(
            stats_df, craters_df, removals_df,
            simulation_id,
            n_added,
            study_region_size, study_region_padding,
            spark
        ),
    )
    for n_added in target_n_craters_added_in_study_region
)

# Overlay the CSFDs of all states against a fixed reference line.
fig = plot_csfds_for_multiple_n_tot(
    states,
    reference_slope=1.149,
    reference_intercept=7e3,
)
fig.show()

In [37]:
target_n_craters_added_in_study_region = [100000, 500000, 1000000, 5000000]

# One state per target crater count, keyed by that count.
states = dict(
    (
        n_added,
        get_state_at_time(
            stats_df, craters_df, removals_df,
            simulation_id,
            n_added,
            study_region_size, study_region_padding,
            spark
        ),
    )
    for n_added in target_n_craters_added_in_study_region
)

# Overlay the CSFDs of all states against a fixed reference line.
fig = plot_csfds_for_multiple_n_tot(
    states,
    reference_slope=1.149,
    reference_intercept=7e3,
)
fig.show()

In [43]:
target_n_craters_added_in_study_region = [10000000, 25000000, 50000000]

# One state per target crater count, keyed by that count.
states = dict(
    (
        n_added,
        get_state_at_time(
            stats_df, craters_df, removals_df,
            simulation_id,
            n_added,
            study_region_size, study_region_padding,
            spark
        ),
    )
    for n_added in target_n_craters_added_in_study_region
)

# Overlay the CSFDs of all states against a fixed reference line.
fig = plot_csfds_for_multiple_n_tot(
    states,
    reference_slope=1.149,
    reference_intercept=7e3,
)
fig.show()

In [44]:
target_n_craters_added_in_study_region = [20000000, 30000000, 40000000]

# One state per target crater count, keyed by that count.
states = dict(
    (
        n_added,
        get_state_at_time(
            stats_df, craters_df, removals_df,
            simulation_id,
            n_added,
            study_region_size, study_region_padding,
            spark
        ),
    )
    for n_added in target_n_craters_added_in_study_region
)

# Overlay the CSFDs of all states against a fixed reference line.
fig = plot_csfds_for_multiple_n_tot(
    states,
    reference_slope=1.149,
    reference_intercept=7e3,
)
fig.show()

In [19]:
# Fetch the state for 1000 craters added directly instead of reading
# states[1000]. `states` is rebuilt by each of the preceding cells, and the
# most recent version only holds the 20M/30M/40M entries, so the lookup raised
# KeyError whenever the notebook was run top to bottom.
state = get_state_at_time(
    stats_df,
    craters_df,
    removals_df,
    simulation_id,
    1000,
    study_region_size,
    study_region_padding,
    spark
)

# Fit the cumulative slope for radii between r_stat and a quarter of the
# study region size, then fit the intercept on all radii in the state.
estimated_slope, sigma = estimate_cumulative_slope(state.radius,
                                                   r_stat,
                                                   study_region_size // 4)
intercept = estimate_intercept(state.radius, estimated_slope)

print(f"estimated slope={estimated_slope:.3f}, sigma={sigma:.3f}")
plot_csfd_with_slope(state, -estimated_slope, intercept=intercept)

In [26]:
# Re-fit the cumulative slope over an explicit radius window [min_r, max_r].
min_r = 200
max_r = 3000

estimated_slope, sigma = estimate_cumulative_slope(state.radius, min_r, max_r)

# NOTE(review): the intercept is fitted on radii above est_saturation_n, not
# on the [min_r, max_r] window used for the slope -- confirm this is intended.
radii_above_saturation = state.radius[state.radius > est_saturation_n]
intercept = estimate_intercept(radii_above_saturation, estimated_slope)

print(f"estimated slope={estimated_slope:.3f}, sigma={sigma:.3f}")
plot_sfds(state, -estimated_slope, intercept=intercept)

In [15]:
# Rows of `state` whose radius exceeds est_saturation_n, ordered by radius.
radius_exceeds_saturation = state.radius > est_saturation_n
state.loc[radius_exceeds_saturation].sort_values(by="radius")

In [69]:
# Dimensions of the current `state` (presumably a pandas DataFrame, so
# (n_rows, n_columns) -- TODO confirm).
state.shape