#### Data preparation
- 243 simulations, 3 per point in the CCD design
- Hold out 1 simulation per CCD point for testing
- From the final 50,000 steps of each simulation, take a random sample of 5,000

#### Models
$\gamma = \beta * X + \epsilon$

$Z = \beta * X + \epsilon$

$Za = \beta * X + \epsilon$

where $\gamma$ is areal density and X is the design matrix.

Predictor variables (PVs) in the design matrix are:
- slope
- R_stat multiplier
- Min rim percentage
- Effective radius multiplier
- Number of craters in the study region

In [22]:
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import numpy as np

from pathlib import Path
import pandas as pd

import statsmodels.api as sm

import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

from synapse.ml.lightgbm import *

pio.renderers.default = "iframe"

In [132]:
def get_holdout_sim(index: int,
                    sample_cadence: int,
                    metric: str,
                    n_obs_from_end: int,
                    holdout_sim_dfs: List[pd.DataFrame],
                    min_max_scaler: MinMaxScaler,
                    poly_transform: sklearn.preprocessing.PolynomialFeatures) -> pd.DataFrame:
    holdout_sim = holdout_sim_dfs[holdout_index]
    holdout_sim = holdout_sim.iloc[[x * sample_cadence for x in range(holdout_sim.shape[0] // sample_cadence)]].copy()

    holdout_X = holdout_sim[ivs]
    holdout_X = min_max_scaler.transform(holdout_X)
    holdout_X = pd.DataFrame(holdout_X, columns=min_max_scaler.get_feature_names_out())
    holdout_X = poly_transform.transform(holdout_X)
    holdout_X = pd.DataFrame(holdout_X, columns=poly_transform.get_feature_names_out())
    holdout_X[metric] = list(holdout_sim[metric])
    
    return holdout_X.iloc[-n_obs_from_end:].copy()

In [48]:
base_path = "/data/saturation/central_composite_design/ccd5"
ivs = ["slope", "effective_radius_multiplier", "r_stat_multiplier", "min_rim_percentage", "n_craters_in_study_region"]

In [232]:
stats_df = pd.read_csv(f"{base_path}/post_saturation_sample_500.csv")
# stats_df = pd.read_csv(f"{base_path}/post_saturation_sample_5000.csv")

min_max_scaler = MinMaxScaler()
stats_df[ivs] = min_max_scaler.fit_transform(stats_df[ivs])

paths = list(Path(base_path).glob("simulation_*.parquet"))
holdout_sim_dfs = [pd.read_parquet(x) for x in paths]

# metric = "areal_density"
metric = "za"
# metric = "z"

In [233]:
poly_transform = sklearn.preprocessing.PolynomialFeatures(degree=2)
X = poly_transform.fit_transform(stats_df[ivs])
X = pd.DataFrame(X, columns=poly_transform.get_feature_names_out())

In [234]:
# Using RF CIs

In [235]:
from quantile_forest import RandomForestQuantileRegressor

In [236]:
model = RandomForestQuantileRegressor(n_estimators=500, random_state=42, n_jobs=24)
model_fit = model.fit(X, stats_df[metric])

In [237]:
holdout_index = 5
sample_cadence = 250

holdout_sim = holdout_sim_dfs[holdout_index]
holdout_sim = holdout_sim.iloc[[x * sample_cadence for x in range(holdout_sim.shape[0] // sample_cadence)]].copy()

holdout_X = holdout_sim[ivs]
holdout_X = min_max_scaler.transform(holdout_X)
holdout_X = pd.DataFrame(holdout_X, columns=min_max_scaler.get_feature_names_out())
holdout_X = poly_transform.transform(holdout_X)
holdout_X = pd.DataFrame(holdout_X, columns=poly_transform.get_feature_names_out())

In [238]:
predictions = model_fit.predict(holdout_X, quantiles=[0.05])
predictions = pd.DataFrame(predictions, columns=["predicted"])
pred_df = pd.concat([holdout_X, predictions], axis=1)
pred_df["actual"] = list(holdout_sim[metric])
pred_df["n_craters"] = list(holdout_sim.index)

In [239]:
fig = go.Figure()

fig.add_scatter(x=pred_df["n_craters"],
                y=pred_df["actual"],
                mode="lines",
                name="Actual")
fig.add_scatter(x=pred_df["n_craters"],
                y=pred_df["predicted"],
                mode="lines",
                name="Lower 5%")

fig.update_layout(
    xaxis_title="N Craters",
    yaxis_title=metric
)

fig.show()

In [240]:
sample_cadence = 250
n_obs_from_end = 500
holdout_dfs = []

for index in range(len(holdout_sim_dfs)):
    df = get_holdout_sim(index=index,
                         sample_cadence=sample_cadence,
                         metric=metric,
                         n_obs_from_end=n_obs_from_end,
                         holdout_sim_dfs=holdout_sim_dfs,
                         min_max_scaler=min_max_scaler,
                         poly_transform=poly_transform)
    holdout_dfs.append(df)

all_holdout_df = pd.concat(holdout_dfs, axis=0).reset_index(drop=True)

predictions = model_fit.predict(all_holdout_df[[x for x in all_holdout_df.columns if x != metric]], quantiles=[0.05])
predictions = pd.DataFrame(predictions, columns=["prediction"])
pred_df = pd.concat([all_holdout_df, predictions], axis=1)

In [241]:
# Score all holdout sims; check how many observations are below the 5% threshold late into simulation
pred_df[pred_df[metric] < pred_df.prediction].shape[0] / pred_df.shape[0]

0.232

In [242]:
predictions = model_fit.predict(X)
predictions = pd.DataFrame(predictions, columns=["prediction"])
pred_df = pd.concat([X, stats_df[metric], predictions], axis=1)

NameError: name 'y' is not defined

In [244]:
# Score all training sims; check how many observations are below the 5% threshold late into simulation
pred_df[pred_df[metric] < pred_df.prediction].shape[0] / pred_df.shape[0]

0.12499896953959029

In [245]:
import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, DataFrame, Row, Window
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

n_cores = 24

spark = SparkSession.builder \
                    .master(f"local[{n_cores}]") \
                    .appName("Saturation") \
                    .config("spark.driver.memory", "48g") \
                    .config("spark.jars.packages", "com.microsoft.azure:synapseml_2.12:0.10.2") \
                    .config("spark.jars.repositories", "https://mmlspark.azureedge.net/maven") \
                    .getOrCreate()
sc = spark.sparkContext

In [246]:
train_df = spark.createDataFrame(pd.concat([stats_df[metric], X], axis=1))

In [247]:
assembler = VectorAssembler(
    inputCols=poly_transform.get_feature_names_out(),
    outputCol="features")

train_df_featurized = assembler.transform(train_df)[metric, "features"]

In [248]:
model = (LightGBMRegressor()
    .setObjective('quantile')
    .setLabelCol(metric)
    .setAlpha(0.05)
    .setLearningRate(0.3)
    .setNumIterations(500)
    .setNumLeaves(1000)
    .fit(train_df_featurized))

23/03/05 14:42:15 WARN TaskSetManager: Stage 42 contains a task of very large size (1009 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

In [249]:
holdout_index = 5
sample_cadence = 250

holdout_sim = holdout_sim_dfs[holdout_index]
holdout_sim = holdout_sim.iloc[[x * sample_cadence for x in range(holdout_sim.shape[0] // sample_cadence)]].copy()

holdout_X = holdout_sim[ivs]
holdout_X = min_max_scaler.transform(holdout_X)
holdout_X = pd.DataFrame(holdout_X, columns=min_max_scaler.get_feature_names_out())
holdout_X = poly_transform.transform(holdout_X)
holdout_X = pd.DataFrame(holdout_X, columns=poly_transform.get_feature_names_out())

In [250]:
holdout_df = spark.createDataFrame(holdout_X)
holdout_df_featurized = assembler.transform(holdout_df)[["features"]]

In [251]:
predictions = model.transform(holdout_df_featurized).toPandas()["prediction"]

23/03/05 14:42:39 WARN DAGScheduler: Broadcasting large task binary with size 44.3 MiB


                                                                                

In [252]:
pred_df = pd.concat([holdout_X, predictions], axis=1)
pred_df["actual"] = list(holdout_sim[metric])
pred_df["n_craters"] = list(holdout_sim.index)

In [253]:
fig = go.Figure()

fig.add_scatter(x=pred_df["n_craters"],
                y=pred_df["actual"],
                mode="lines",
                name="Actual")
fig.add_scatter(x=pred_df["n_craters"],
                y=pred_df["prediction"],
                mode="lines",
                name="Lower 5%")

fig.update_layout(
    xaxis_title="N Craters",
    yaxis_title=metric
)

fig.show()

In [254]:
sample_cadence = 250
n_obs_from_end = 500
holdout_dfs = []

for index in range(len(holdout_sim_dfs)):
    df = get_holdout_sim(index=index,
                         sample_cadence=sample_cadence,
                         metric=metric,
                         n_obs_from_end=n_obs_from_end,
                         holdout_sim_dfs=holdout_sim_dfs,
                         min_max_scaler=min_max_scaler,
                         poly_transform=poly_transform)
    holdout_dfs.append(df)

all_holdout_df = pd.concat(holdout_dfs, axis=0).reset_index(drop=True)

In [255]:
holdout_df = spark.createDataFrame(all_holdout_df[[x for x in all_holdout_df.columns if x != metric]])
holdout_df_featurized = assembler.transform(holdout_df)[["features"]]

In [256]:
predictions = model.transform(holdout_df_featurized).toPandas()["prediction"]

23/03/05 14:42:41 WARN DAGScheduler: Broadcasting large task binary with size 44.3 MiB


                                                                                

In [257]:
pred_df = pd.concat([holdout_df.toPandas(), all_holdout_df[metric], predictions], axis=1)

In [258]:
# Score all holdout sims; check how many observations are below the 5% threshold late into simulation
pred_df[pred_df[metric] < pred_df.prediction].shape[0] / pred_df.shape[0]

0.09

In [259]:
predictions = model.transform(train_df_featurized).toPandas()["prediction"]
pred_df = pd.concat([stats_df[metric], X, predictions], axis=1)

23/03/05 14:42:43 WARN DAGScheduler: Broadcasting large task binary with size 44.3 MiB
23/03/05 14:42:43 WARN TaskSetManager: Stage 46 contains a task of very large size (1009 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

In [260]:
# Score all training sims; check how many observations are below the 5% threshold late into simulation
pred_df[pred_df[metric] < pred_df.prediction].shape[0] / pred_df.shape[0]

0.054004369152137174