In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import yaml
from typing import *

import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import matplotlib.pyplot as plt

import scipy.stats as stats
from scipy.optimize import minimize_scalar
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials, space_eval
from statsmodels.nonparametric.smoothers_lowess import lowess
import statsmodels.api as sm

import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, DataFrame, Window
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

from utils import *

pio.renderers.default = "iframe"

In [2]:
n_cores = 28

spark = (SparkSession.builder
         .master(f"local[{n_cores}]")
         .appName("Saturation")
         .config("spark.sql.shuffle.partitions", "500")
         .config("spark.driver.memory", "60g")
         .config("spark.driver.maxResultSize", "8g")
         .getOrCreate())

24/03/02 13:56:03 WARN Utils: Your hostname, muninn resolves to a loopback address: 127.0.1.1; using 192.168.86.20 instead (on interface enp8s0)
24/03/02 13:56:03 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/02 13:56:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
base_path = "/data/saturation/thesis_run_20240130"
r_stat = 5
study_region_size = 2000 * 2000 / r_stat ** 2

configs_df = create_configs_df(read_configs(base_path, spark))
data = spark.read.parquet(f"{base_path}/*/statistics_*.parquet")

# State c2c nn dist in terms of r_stat and log-scale
data = data.where(
    (F.col("n_craters_added_in_study_region") > 100)
).select(
    "*",
    F.log10(F.col("center_to_center_nearest_neighbor_distance_mean") / F.lit(r_stat)).alias("log_mean_c2c_nn_dist"),
    F.log10("n_craters_added_in_study_region").alias("log_n_craters_added_in_study_region"),
    F.log10("n_craters_in_study_region").alias("log_n_craters_in_study_region"),
    (F.col("n_craters_in_study_region") / F.col("n_craters_added_in_study_region")).alias("information_remaining")
)

                                                                                

## Lowess regression on a narrow band of slopes

Using log-transformed values

In [None]:
data_with_configs = join_configs(data, configs_df, spark).where(
    (F.col("n_craters_added_in_study_region") > 250)
    & (F.col("slope").between(2.5, 2.6))
).cache()

In [None]:
df = data_with_configs.sample(0.01).toPandas()

In [None]:
df.to_parquet("data/df.parquet")

In [None]:
df = pd.read_parquet("data/df.parquet")

In [None]:
# Peek at the plot
plt.figure(figsize=(12,5))
plt.scatter(
    df.log_mean_c2c_nn_dist,
    df.log_n_craters_added_in_study_region,
    s=1
)

In [7]:
def fit_lowess_smoothed_ols(
    x: pd.Series,
    y: pd.Series
) -> Tuple[float, float]:
    """
    Smooths the supplied data using a Lowess smoother then fits an OLS regression on the result.
    Returns a tuple of (intercept, coefficient) of the resulting OLS regression.
    """
    predictions = lowess(y,
                         x,
                         frac=0.5,
                         return_sorted=False)

    # Fit an OLS regression
    model = sm.OLS(predictions, sm.add_constant(x)).fit()

    return model.params[0], model.params[1]

In [None]:
threshold = 1.25
model_df = df[df.log_mean_c2c_nn_dist > threshold].sort_values("log_mean_c2c_nn_dist")
intercept, coefficient = fit_lowess_smoothed_ols(model_df.log_mean_c2c_nn_dist,
                                                 model_df.log_n_craters_added_in_study_region)

In [None]:
thresholds = {
    1.05: 1.5,
    1.15: 1.4,
    1.25: 1.35,
    1.35: 1.3,
    1.45: 1.25,
    1.55: 1.2,
    1.65: 1.15,
    1.75: 1.1,
    1.85: 1.05,
    1.95: 1.05,
    2.05: 1.05,
    2.15: 1.05,
    2.25: 1.05,
    2.35: 1.05,
    2.45: 1.05,
    2.55: 1.05,
    2.65: 1.1,
    2.75: 1.15,
    2.85: 1.15,
    2.95: 1.2,
}

In [None]:
sample_percent = 0.5

slope_width = 0.05
comparison_offsets = [0.45, 0.35, 0.25, 0.15, 0.1, 0.05, -0.05]
comparison_width = 0.05

data_sample = join_configs(data, configs_df, spark).where(
        (F.col("n_craters_added_in_study_region") > 100)
).select(
        "log_mean_c2c_nn_dist",
        "log_n_craters_added_in_study_region"
)

for slope, threshold in thresholds.items():
    filtered_to_slope = data_sample.where(
        (F.col("slope").between(slope - slope_width, slope + slope_width))
    )
    model_df = filtered_to_slope.where(
        (F.col("log_mean_c2c_nn_dist") > threshold)
    ).sample(sample_percent).orderBy("log_mean_c2c_nn_dist").toPandas()
    intercept, coefficient = fit_lowess_smoothed_ols(model_df.log_mean_c2c_nn_dist,
                                                     model_df.log_n_craters_added_in_study_region)
    detrended_pd = filtered_to_slope.select(
        "*",
        (F.col("log_n_craters_added_in_study_region") - (F.col("log_mean_c2c_nn_dist") * coefficient + intercept)).alias("detrended"),
    ).toPandas()
    
    for comparison_offset in comparison_offsets:
        comparison_low = threshold - comparison_offset - comparison_width
        comparison_high = threshold - comparison_offset + comparison_width
    
        s = detrended_pd.detrended
        reference = s[detrended_pd.log_mean_c2c_nn_dist > threshold]
        # reference_var = reference.var()
        reference_var = reference.quantile(0.975) - reference.quantile(0.025) 

        comparison = s[detrended_pd.log_mean_c2c_nn_dist.between(comparison_low, comparison_high)]
        # comparison_var = comparison.var()
        comparison_var = comparison.quantile(0.975) - comparison.quantile(0.025) 
        ratio = comparison_var / reference_var

        print(f"slope={slope}, threshold={threshold}, comp offset={comparison_offset}, ref n={len(reference)}, comp n={len(comparison)}, comp var={comparison_var:.3f}, ref var={reference_var:.3f}, ratio={ratio:.3f}")

## N_obs vs N_tot instead

In [4]:
thresholds = {
    # 1.05: 2.15,
    # 1.15: 1.4,
    # 1.25: 1.35,
    # 1.35: 1.3,
    1.45: 2.6,
    # 1.55: 1.2,
    # 1.65: 1.15,
    # 1.75: 1.1,
    # 1.85: 1.05,
    1.95: 3.15,
    # 2.05: 1.05,
    # 2.15: 1.05,
    # 2.25: 1.05,
    # 2.35: 1.05,
    2.45: 3.15,
    # 2.55: 1.05,
    2.65: 3.0,
    # 2.75: 1.15,
    # 2.85: 1.15,
    2.95: 2.8,
}

In [25]:
sample_percent = 0.05

slope_width = 0.05
comparison_offsets = [1.0, 0.75, 0.45, 0.15, 0.1, 0.05, -0.05]
comparison_width = 0.1

data_sample = join_configs(data, configs_df, spark).select(
        "log_n_craters_in_study_region",
        "log_n_craters_added_in_study_region",
        (F.col("log_n_craters_added_in_study_region") - (F.col("log_n_craters_in_study_region"))).alias("detrended")
).sample(sample_percent).cache()

for slope, threshold in thresholds.items():
    filtered_to_slope = data_sample.where(
        (F.col("slope").between(slope - slope_width, slope + slope_width))
    ).toPandas()
    
    for comparison_offset in comparison_offsets:
        comparison_low = threshold + comparison_offset - comparison_width
        comparison_high = threshold + comparison_offset + comparison_width
    
        s = detrended_pd.detrended
        reference = s[detrended_pd.log_n_craters_in_study_region < threshold]
        # reference_var = reference.var()
        reference_var = reference.quantile(0.975) - reference.quantile(0.025) 

        comparison = s[detrended_pd.log_n_craters_in_study_region.between(comparison_low, comparison_high)]
        # comparison_var = comparison.var()
        comparison_var = comparison.quantile(0.975) - comparison.quantile(0.025) 
        ratio = comparison_var / reference_var

        print(f"slope={slope}, threshold={threshold}, comp offset={comparison_offset}, ref n={len(reference)}, comp n={len(comparison)}, comp var={comparison_var:.3f}, ref var={reference_var:.3f}, ratio={ratio:.3f}")

In [37]:
# Compare some F scores
slope_min = 2.6
slope_max = 2.7
sample_percent = 0.01

data_sample = join_configs(data, configs_df, spark).select(
        "log_n_craters_in_study_region",
        "log_n_craters_added_in_study_region",
        (F.col("log_n_craters_added_in_study_region") - (F.col("log_n_craters_in_study_region"))).alias("detrended")
).where(
    (F.col("slope").between(slope_min, slope_max))
).sample(sample_percent).toPandas()

                                                                                

In [49]:
threshold = 3.0
offset = 0.75
window_width = 0.2

s = detrended_pd.detrended[detrended_pd.log_n_craters_in_study_region < threshold]
var1 = s.var()
df1 = s.shape[0]

s = detrended_pd.detrended[detrended_pd.log_n_craters_in_study_region.between(threshold + offset - window_width / 2, threshold + offset + window_width / 2)]
var2 = s.var()
df2 = s.shape[0]

numer, numer_df, denom, denom_df = (var1, df1, var2, df2) if var1 > var2 else (var2, df2, var1, df1)
p_value = 1 - stats.f.cdf(numer / denom, numer_df, denom_df)
print(f"Var: {numer:.4f} / {denom:.4f}, score = {numer/denom:.4f}, p = {p_value:.4f}")

Var: 0.0710 / 0.0512, score = 1.3872, p = 0.0000


In [53]:
threshold = 2.9
offset = 1
window_width = 0.2

s = detrended_pd.detrended[detrended_pd.log_n_craters_in_study_region < threshold]
disp1 = s.quantile(0.975) - s.quantile(0.025)

s = detrended_pd.detrended[detrended_pd.log_n_craters_in_study_region.between(threshold + offset - window_width / 2, threshold + offset + window_width / 2)]
disp2 = s.quantile(0.975) - s.quantile(0.025)

print(f"disp1={disp1:.3f}, disp2={disp2:.3f}, ratio={disp1/disp2:.3f}")

disp1=0.588, disp2=0.882, ratio=0.667


In [None]:
# Use the model to detrend all data
detrended = data_with_configs.select(
    (F.col("log_n_craters_added_in_study_region") - (F.col("log_mean_c2c_nn_dist") * -2)).alias("detrended"),
    "log_mean_c2c_nn_dist",
    "log_n_craters_added_in_study_region",
    F.log10("n_craters_in_study_region").alias("log_n_craters_in_study_region")
)

In [None]:
detrended_pd = detrended.sample(0.001).toPandas()

In [None]:
plt.scatter(
    detrended_pd.log_mean_c2c_nn_dist,
    detrended_pd.detrended,
    s=1
)

In [None]:
plt.scatter(
    detrended_pd.log_mean_c2c_nn_dist,
    detrended_pd.detrended,
    s=1
)

In [None]:
reference_low = 1.25
reference_high = 2
comparison_low = 1.15
comparison_high = 1.25

reference_var = detrended_pd.log_n_craters_added_in_study_region[detrended_pd.log_mean_c2c_nn_dist.between(reference_low, reference_high)].var()
comparison_var = detrended_pd.log_n_craters_added_in_study_region[detrended_pd.log_mean_c2c_nn_dist.between(comparison_low, comparison_high)].var()
comparison_var / reference_var

In [None]:
reference_low = 1.25
reference_high = 1.35
comparison_low = 1.45
comparison_high = 1.55

reference_var = detrended_pd.log_n_craters_added_in_study_region[detrended_pd.log_mean_c2c_nn_dist.between(reference_low, reference_high)].var()
comparison_var = detrended_pd.log_n_craters_added_in_study_region[detrended_pd.log_mean_c2c_nn_dist.between(comparison_low, comparison_high)].var()
comparison_var / reference_var

In [None]:
reference_low = 1.25
reference_high = 2
comparison_low = 1.05
comparison_high = 1.25

reference_var = detrended_pd.log_n_craters_added_in_study_region[detrended_pd.log_mean_c2c_nn_dist.between(reference_low, reference_high)].var()
comparison_var = detrended_pd.log_n_craters_added_in_study_region[detrended_pd.log_mean_c2c_nn_dist.between(comparison_low, comparison_high)].var()
comparison_var / reference_var

In [None]:
reference_low = 1.25
reference_high = 2
comparison_low = .95
comparison_high = 1.05

reference_var = detrended_pd.log_n_craters_added_in_study_region[detrended_pd.log_mean_c2c_nn_dist.between(reference_low, reference_high)].var()
comparison_var = detrended_pd.log_n_craters_added_in_study_region[detrended_pd.log_mean_c2c_nn_dist.between(comparison_low, comparison_high)].var()
comparison_var / reference_var

In [None]:
reference_var, comparison_var

In [None]:
detrended_pd.log_n_craters_added_in_study_region[detrended_pd.log_mean_c2c_nn_dist.between(reference_low, reference_high)]

In [None]:
comparison_var

In [None]:
# Check 95% CI of detrended data
q95 = detrended_pd.detrended[detrended_pd.log_mean_c2c_nn_dist > threshold].quantile(0.95)

In [None]:
start = detrended_pd.log_mean_c2c_nn_dist.min()
end = 1.4 #detrended_pd.log_mean_c2c_nn_dist.max()
n_steps = 50
step_size = (end - start) / n_steps

for x in range(n_steps):
    left = start + x * step_size
    right = left + step_size
    s = detrended_pd.detrended[detrended_pd.log_mean_c2c_nn_dist.between(left, right)]
    percent = (s > q95).mean()
    print(f"Left = {left:.3f}, right = {right:.3f}, % = {percent}")

In [None]:
threshold = 1.1
lowess_model_df = df[df.log_mean_c2c_nn_dist > threshold].sort_values("log_mean_c2c_nn_dist")

In [None]:
# Check a plot to see if we cut off appropriately
x = lowess_model_df.log_mean_c2c_nn_dist
y = lowess_model_df.log_n_craters_added_in_study_region

plt.figure(figsize=(12,5))
plt.scatter(
    x,
    y,
    s=1
)


In [None]:
x = lowess_model_df.log_mean_c2c_nn_dist
y = lowess_model_df.log_n_craters_added_in_study_region
lowess_predictions = lowess(y,
                            x,
                            frac=0.2,
                            is_sorted=True,
                            return_sorted=False)
lowess_model_df["lowess_predictions"] = lowess_predictions
lowess_model_df["lowess_residuals"] = lowess_model_df.log_n_craters_added_in_study_region - lowess_model_df.lowess_predictions

In [None]:
# Plot results with lowess line
plt.plot(
    x,
    pred,
    color="green"
)

plt.scatter(
    x,
    y,
    s=1
)
plt.show()

In [None]:
# Plot lowess residuals
plt.scatter(
    lowess_model_df.log_mean_c2c_nn_dist,
    lowess_model_df.lowess_residuals,
    s=1
)
plt.show()

In [None]:
# Fit an OLS regression
ols_model = sm.OLS(lowess_model_df.lowess_predictions, sm.add_constant(lowess_model_df.log_mean_c2c_nn_dist)).fit()

In [None]:
lowess_model_df["ols_predictions"] = ols_model.predict(sm.add_constant(lowess_model_df.log_mean_c2c_nn_dist))

In [None]:
# Plot results with OLS line
plt.plot(
    lowess_model_df.log_mean_c2c_nn_dist,
    lowess_model_df.ols_predictions,
    color="green"
)

plt.scatter(
    lowess_model_df.log_mean_c2c_nn_dist,
    lowess_model_df.log_n_craters_added_in_study_region,
    s=1
)
plt.show()

In [None]:
ols_model.summary()

In [None]:
to_predict = df.sample(1000).copy()
interpolated = []
for point in to_predict.log_mean_c2c_nn_dist:
    f = interp1d(model_df.log_mean_c2c_nn_dist, model_df.pred)
    interpolated.append(f(point))
to_predict["pred"] = interpolated
to_predict["resid"] = to_predict.pred - to_predict.log_n_craters_added_in_study_region
to_predict["outside_ci"] = ~to_predict.resid.between(ref_5q, ref_95q)

In [None]:
plt.scatter(
    to_predict.log_mean_c2c_nn_dist,
    to_predict.resid,
    c=to_predict.outside_ci,
    s=1
)

In [None]:
to_predict.outside_ci.mean()

In [None]:
interpolated = []
for point in to_predict:
    f = interp1d(df.log_mean_c2c_nn_dist, df.pred)
    interpolated.append(f(point))

In [None]:
comparison_df = pd.DataFrame([], columns=["log_mean_c2c_nn_dist", "pred"])
comparison_df["log_mean_c2c_nn_dist"] = to_predict
comparison_df["pred"] = interpolated

In [None]:
comparison_df