In [None]:
import sys

import pandas as pd
import numpy as np
from pathlib import Path
import yaml
from typing import *

import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import matplotlib.pyplot as plt

import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, DataFrame, Window
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials, space_eval
import itertools

from pyspark.ml.feature import VectorAssembler, MinMaxScaler, BucketedRandomProjectionLSH
from pyspark.ml import Pipeline
from pyspark.ml.functions import vector_to_array

from utils import *

# Make "iframe" the default plotly renderer for every figure shown below.
pio.renderers.default = "iframe"

In [None]:
# Number of local worker threads handed to the Spark master URL.
n_cores = 28

# Session-level Spark settings, applied in the order listed.
_spark_settings = {
    "spark.sql.shuffle.partitions": "500",
    "spark.driver.memory": "60g",
    "spark.driver.maxResultSize": "8g",
}

_session_builder = SparkSession.builder.master(f"local[{n_cores}]").appName("Saturation")
for _setting_name, _setting_value in _spark_settings.items():
    _session_builder = _session_builder.config(_setting_name, _setting_value)

# Reuses an already-running session if one exists, otherwise starts a new one.
spark = _session_builder.getOrCreate()

In [None]:
base_path = "/data/saturation/thesis_run_20240129"
r_stat = 5
# 2000 x 2000 study region, expressed in units of r_stat squared.
study_region_size = (2000 * 2000) / (r_stat ** 2)

configs_df = create_configs_df(read_configs(base_path, spark))
data = spark.read.parquet(f"{base_path}/*/statistics_*.parquet")

# Derived columns appended after all of the raw statistics columns.
_derived_columns = [
    # Mean center-to-center nearest-neighbor distance, in units of r_stat, log10-scaled.
    F.log10(
        F.col("center_to_center_nearest_neighbor_distance_mean") / F.lit(r_stat)
    ).alias("log_mean_c2c_nn_dist"),
    # log10 of the areal density.
    F.log10(F.col("areal_density")).alias("log_ad"),
    # log10 of the number of craters added in the study region.
    F.log10(F.col("n_craters_added_in_study_region")).alias("log_n_craters_added_in_study_region"),
    # Ratio of craters currently in the study region to craters added to it.
    (
        F.col("n_craters_in_study_region") / F.col("n_craters_added_in_study_region")
    ).alias("information_remaining"),
]
data = data.select("*", *_derived_columns)

In [None]:
# Keep only rows recorded after more than 50 craters were added, then draw a
# 0.05% random sample of them before collecting to pandas.
_min_craters_filter = F.col("n_craters_added_in_study_region") > F.lit(50)
data_subset = data.filter(_min_craters_filter).sample(fraction=0.0005)

# Attach the simulation configuration columns and pull the sample to the driver.
joined_subset = join_configs(data_subset, configs_df, spark)
df = joined_subset.toPandas()

In [None]:
# Display every simulation configuration, ordered by ascending slope.
configs_df.toPandas().sort_values("slope")

In [None]:
def plot_log_N_and_log_NNd_for_simulation(simulation_id: int, stats_df: pd.DataFrame) -> None:
    """Plot log mean NN distance and log observed crater count for one simulation.

    Selects the rows of ``stats_df`` belonging to ``simulation_id``, orders them
    by ``n_craters_added_in_study_region``, prints the simulation's parameters
    (taken from the first selected row) and shows a figure with two y axes that
    share the "N Craters Added" x axis:

    * left axis, red: ``log_mean_c2c_nn_dist``
    * right axis, blue: ``log10(n_craters_in_study_region)``

    Parameters
    ----------
    simulation_id
        Value of the ``simulation_id`` column to plot.
    stats_df
        Frame containing at least ``simulation_id``,
        ``n_craters_added_in_study_region``, ``n_craters_in_study_region``,
        ``log_mean_c2c_nn_dist``, ``slope``, ``effective_radius_multiplier``,
        ``r_stat_multiplier`` and ``min_rim_percentage``.

    Raises
    ------
    IndexError
        If no row of ``stats_df`` matches ``simulation_id``.
    """
    sim_stats = stats_df[stats_df.simulation_id == simulation_id].sort_values(
        "n_craters_added_in_study_region"
    )

    # The parameters are read from the first row of the selection.
    parameter_columns = ["slope", "effective_radius_multiplier", "r_stat_multiplier", "min_rim_percentage"]
    print(sim_stats[parameter_columns].iloc[0])

    # Raw strings: "\o" is not a valid Python escape sequence, and the backslash
    # has to reach mathtext unchanged for \overline to render.
    nn_dist_label = r"$log(\overline{NN_{d}})$"
    n_obs_label = "$log(N_{obs})$"

    fig = plt.figure(figsize=(12, 3))
    ax_nn_dist = fig.add_subplot(111)

    nn_dist_lines = ax_nn_dist.plot(
        sim_stats.n_craters_added_in_study_region,
        sim_stats.log_mean_c2c_nn_dist,
        color="red",
        label=nn_dist_label,
    )
    ax_nn_dist.set_ylabel(nn_dist_label)
    ax_nn_dist.set_xlabel("N Craters Added")

    # Second y axis sharing the same x axis for the observed crater count.
    ax_n_obs = ax_nn_dist.twinx()
    n_obs_lines = ax_n_obs.plot(
        sim_stats.n_craters_added_in_study_region,
        np.log10(sim_stats.n_craters_in_study_region),
        color="blue",
        label=n_obs_label,
    )
    ax_n_obs.set_ylabel(n_obs_label)

    # One combined legend covering the lines drawn on both axes.
    all_lines = nn_dist_lines + n_obs_lines
    ax_nn_dist.legend(all_lines, [line.get_label() for line in all_lines], loc=1)

    ax_nn_dist.set_title(f"Simulation {simulation_id}")

    plt.show()

In [None]:
# Plot a random selection of up to 20 distinct simulations. Sampling without
# replacement keeps the same simulation from being drawn (and plotted) twice;
# the size is capped so the draw also works with fewer than 20 simulations.
simulation_ids = df.simulation_id.drop_duplicates()
n_to_plot = min(20, len(simulation_ids))
for sim_id in np.random.choice(simulation_ids, size=n_to_plot, replace=False):
    plot_log_N_and_log_NNd_for_simulation(sim_id, df)

In [None]:
# Shuffle the rows by sorting on a uniform random key.
# Row order decides which markers are drawn "on top" in the scatter plots below.
df = df.assign(rnd=np.random.rand(len(df))).sort_values("rnd")

In [None]:
# Sampled statistics from all simulations: log mean NN distance (x) against
# log craters added (y), colored by slope.
fig = px.scatter(
    df,
    x="log_mean_c2c_nn_dist",
    y="log_n_craters_added_in_study_region",
    color="slope",
    hover_data=["simulation_id", "slope", "n_craters_added_in_study_region", "areal_density", "n_craters_in_study_region"],
    size_max=1,
    width=1600,
    height=600,
)
fig.update_layout(
    xaxis_title=dict(
        # Raw string: "\o" is not a valid Python escape sequence, and the
        # backslash must reach the LaTeX renderer unchanged.
        text=r"$log_{10}(\overline{NN_d})$",
        font=dict(size=18)
    ),
    yaxis_title=dict(
        text="$log_{10}(N_{tot})$",
        font=dict(size=18)
    ),
)
# Small fixed-size markers.
fig.update_traces(marker={"size":3})
fig.show()

In [None]:
# Trying a range selector: the same scatter as above, animated over slope bins.
to_show = df.copy()
# Truncate each slope to a 0.1-wide bin measured from 1; one animation frame per bin.
to_show["slope_selector"] = ((to_show.slope - 1) * 10).astype("int") / 10 + 1
to_show = to_show.sort_values("slope_selector")
# Unique per-row id used as the animation group.
to_show["index"] = range(to_show.shape[0])
to_show["log_n_craters_in_study_region"] = np.log10(to_show.n_craters_in_study_region)
to_show["log_areal_density_overlap_2"] = np.log10(to_show.areal_density_overlap_2)

# Fix the axis ranges to the full data extent so they stay constant across frames.
range_x = [to_show.log_mean_c2c_nn_dist.min(), to_show.log_mean_c2c_nn_dist.max()]
range_y = [to_show.log_n_craters_added_in_study_region.min(), to_show.log_n_craters_added_in_study_region.max()]

fig = px.scatter(
    to_show,
    x="log_mean_c2c_nn_dist",
    y="log_n_craters_added_in_study_region",
    color="log_ad",
    hover_data=["slope", "n_craters_added_in_study_region"],
    size_max=1,
    animation_frame="slope_selector",
    animation_group="index",
    range_x=range_x,
    range_y=range_y,
    width=1000,
    height=500
)
fig.update_layout(
    xaxis_title=dict(
        # Raw string: "\o" is not a valid Python escape sequence, and the
        # backslash must reach the LaTeX renderer unchanged.
        text=r"$log_{10}(\overline{NN_d})$",
        font=dict(size=18)
    ),
    yaxis_title=dict(
        # The plotted column is a base-10 log, so the label states the base.
        text="$log_{10}(N_{tot})$",
        font=dict(size=18)
    ),
)
fig.update_traces(marker={"size":3})
fig.show()

## Plotting single simulations

In [None]:
# Statistics of simulation 4935, restricted to rows recorded after more than
# 25 craters were added. Cached because it is counted and then read again.
df = (
    data
    .where(F.col("n_craters_added_in_study_region") > F.lit(25))
    .where(F.col("simulation_id") == 4935)
    .cache()
)

# Target number of points to keep for plotting.
n_pts = 10000
n_obs = df.count()

# Randomly thin the rows when there are more than the target number of points.
if n_obs > n_pts:
    sample_fraction = n_pts / n_obs
    df = df.sample(fraction=sample_fraction)

# Collect to pandas, add the log10 observed crater count and order the rows by it.
pandas_df = (
    df.toPandas()
    .assign(log_n_craters_in_study_region=lambda collected: np.log10(collected.n_craters_in_study_region))
    .sort_values("log_n_craters_in_study_region")
)

In [None]:
# Log mean NN distance against the log observed crater count for the single
# simulation. `log_n_craters_in_study_region` is already log10-scaled, so it is
# plotted directly; taking log10 of it again would plot log10(log10(N_obs)).
plt.figure(figsize=(10, 4))
plt.plot(
    pandas_df.log_n_craters_in_study_region,
    pandas_df.log_mean_c2c_nn_dist,
)
plt.title("Single Simulation, b=3")
# The x data is the observed crater count, so the axis is labelled N_obs.
plt.xlabel("$log_{10}(N_{obs})$", size=14)
# Raw string: "\o" is not a valid Python escape sequence, and the backslash
# must reach mathtext unchanged for \overline to render.
plt.ylabel(r"$log_{10}(\overline{NN_d})$", size=14)
plt.show()

In [None]:
# Log observed crater count against log craters added for the single simulation.
# `pandas_df` is ordered by the observed count, so re-order by the x variable
# first; otherwise the connecting line can double back on itself.
ordered_by_added = pandas_df.sort_values("n_craters_added_in_study_region")

plt.figure(figsize=(10, 4))
plt.plot(
    np.log10(ordered_by_added.n_craters_added_in_study_region),
    np.log10(ordered_by_added.n_craters_in_study_region),
)
plt.title("Single Simulation, b=3")
plt.xlabel("$log_{10}(N_{tot})$", size=14)
# Braces are required so the whole of "obs" is set as the subscript.
plt.ylabel("$log_{10}(N_{obs})$", size=14)
plt.show()