In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.transforms as mtransforms
import numba
import scipy.stats as stats

import warnings
warnings.filterwarnings('ignore')

from typing import List, Tuple

### Plotting Mean Curves

In [2]:
def percentile(n):
    """
    Build an aggregation callable that returns the n-th percentile of its input.

    The returned function is a thin wrapper around ``np.percentile`` so it can
    be passed to ``DataFrame.agg`` / ``groupby(...).agg`` like any other reducer.
    It is intentionally a plain Python function: an object-mode numba JIT
    (``forceobj=True``) still runs through the interpreter, so it only added
    compilation overhead, and ``fastmath`` has no effect in object mode.

    Parameters
    ----------
    n : float
        Percentile to compute, in the range [0, 100].

    Returns
    -------
    callable
        Function mapping an array-like ``x`` to ``np.percentile(x, n)``.
    """
    def _percentile(x):
        return np.percentile(x, n)

    # A distinct name per percentile lets pandas label the output column when
    # the reducer is passed to ``agg`` without an explicit name.
    _percentile.__name__ = f'percentile_{n}'
    return _percentile

def aggregate_simulations(stats: pd.DataFrame, expected_count: int):
    """
    Summarise per-simulation statistics across simulations.

    Rows are grouped by ``n_craters_added_in_observed_area`` and, for each of
    the metric columns, the min, 25th percentile, mean, 75th percentile and max
    are computed. Output columns are named ``<metric>_min``,
    ``<metric>_25_percentile``, ``<metric>_mean``, ``<metric>_75_percentile``
    and ``<metric>_max``.

    Parameters
    ----------
    stats : pd.DataFrame
        Concatenated statistics of all simulations. Must contain
        ``n_craters_added_in_observed_area`` and the four metric columns.
    expected_count : int
        Number of rows a group must have to be kept; groups with any other
        count are dropped.

    Returns
    -------
    pd.DataFrame
        One row per retained group, with the grouping key as a regular column,
        the summary columns described above, and ``group_count``.
    """
    group_key = 'n_craters_added_in_observed_area'
    columns_to_aggregate = [
        'n_craters_in_observed_area',
        'areal_density',
        'z',
        'za'
    ]
    # Explicit (label, function) pairs: every output gets a unique label, so
    # this does not depend on pandas tolerating duplicate function names.
    summary_funcs = [
        ('min', 'min'),
        ('25_percentile', percentile(25)),
        ('mean', 'mean'),
        ('75_percentile', percentile(75)),
        ('max', 'max'),
    ]

    by_step = stats.groupby([group_key])
    grouped = by_step.agg({column: summary_funcs for column in columns_to_aggregate})

    # Flatten the (metric, label) column MultiIndex into "<metric>_<label>".
    grouped.columns = [f'{column}_{label}' for column, label in grouped.columns]
    grouped['group_count'] = by_step.n_craters_in_observed_area.agg('count')

    # Make sure all samples have full data
    complete = grouped[grouped.group_count == expected_count]
    return complete.reset_index()


def plot_statistics(stats: pd.DataFrame):
    """
    Show a 2x2 grid of summary curves for one set of aggregated statistics.

    Each panel plots the min / 25th percentile / mean / 75th percentile / max
    columns of one metric against ``n_craters_added_in_observed_area``. The
    Z and Za panels additionally get dashed horizontal lines at +/-1.96 (green)
    and +/-2.58 (red). Only the first panel carries a legend.

    Parameters
    ----------
    stats : pd.DataFrame
        Frame with the ``<metric>_min``, ``<metric>_25_percentile``,
        ``<metric>_mean``, ``<metric>_75_percentile`` and ``<metric>_max``
        columns for the four metrics plotted below.
    """
    # (column suffix, legend label) for the five curves drawn in every panel.
    summary_curves = [
        ('min', 'Min'),
        ('25_percentile', '25th'),
        ('mean', 'Mean'),
        ('75_percentile', '75th'),
        ('max', 'Max'),
    ]
    # (level, colour) of the dashed reference lines on the Z / Za panels.
    reference_lines = [(-1.96, 'g'), (1.96, 'g'), (-2.58, 'r'), (2.58, 'r')]
    # (row, col, column prefix, y-axis label, draw reference lines?)
    panels = [
        (0, 0, 'n_craters_in_observed_area', 'Craters in Study Region', False),
        (0, 1, 'areal_density', 'Areal Density', False),
        (1, 0, 'z', 'Z Statistic', True),
        (1, 1, 'za', 'Za Statistic', True),
    ]

    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
    craters_generated = stats['n_craters_added_in_observed_area']

    for row, col, prefix, y_label, with_reference_lines in panels:
        panel = axes[row][col]
        for suffix, legend_label in summary_curves:
            panel.plot(craters_generated, stats[f'{prefix}_{suffix}'], label=legend_label)
        if with_reference_lines:
            for level, colour in reference_lines:
                panel.axhline(level, color=colour, ls='--')
        panel.set_xlabel('Craters Generated')
        panel.set_ylabel(y_label)
        if (row, col) == (0, 0):
            panel.legend(loc='upper left')

    plt.show()

def plot_four_metric_plots(python_stats: pd.DataFrame,
                           idl_stats: pd.DataFrame,
                           n_craters_column: str,
                           areal_density_column: str,
                           z_column: str,
                           za_column: str,
                           output_filename: str):
    """
    Draw a 2x2 Python-vs-IDL comparison figure, save it, and show it.

    Panels (a)-(d) plot, against ``n_craters_added_in_observed_area``, the
    crater count, areal density, Z and Za columns named by the caller. Python
    curves are blue and IDL curves red. Panels (c) and (d) also get dashed
    reference lines at +/-1.96 (green) and +/-2.58 (red).

    Parameters
    ----------
    python_stats, idl_stats : pd.DataFrame
        Frames holding ``n_craters_added_in_observed_area`` plus the four
        columns named below.
    n_craters_column, areal_density_column, z_column, za_column : str
        Column names to plot in panels (a), (b), (c) and (d) respectively.
    output_filename : str
        Where to save the figure. Missing parent directories are created so
        saving does not fail on a fresh checkout.
    """
    import os  # local import: the notebook's import cell does not provide it

    fig, axes = plt.subplot_mosaic([['(a)', '(b)'], ['(c)', '(d)']],
                                   figsize=(12, 7),
                                   constrained_layout=True)

    for label, ax in axes.items():
        # label physical distance in and down:
        trans = mtransforms.ScaledTranslation(10/72, -5/72, fig.dpi_scale_trans)
        ax.text(0.0,
                1.0,
                label,
                transform=ax.transAxes + trans,
                fontsize='large',
                verticalalignment='top',
                fontfamily='serif',
                bbox=dict(facecolor='1.0', edgecolor='none', pad=3.0))

    # (panel key, column to plot, y-axis label, draw reference lines?)
    panels = [
        ('(a)', n_craters_column, 'Craters in Study Region', False),
        ('(b)', areal_density_column, 'Areal Density', False),
        ('(c)', z_column, 'Z Statistic', True),
        ('(d)', za_column, 'Za Statistic', True),
    ]
    reference_lines = [(-1.96, 'g'), (1.96, 'g'), (-2.58, 'r'), (2.58, 'r')]

    for key, column, y_label, with_reference_lines in panels:
        ax = axes[key]
        ax.plot(python_stats.n_craters_added_in_observed_area, python_stats[column], color='b', label='Python')
        ax.plot(idl_stats.n_craters_added_in_observed_area, idl_stats[column], color='r', label='IDL')
        if with_reference_lines:
            for level, colour in reference_lines:
                ax.axhline(level, color=colour, ls='--')
        ax.set_xlabel('Craters Generated')
        ax.set_ylabel(y_label)

    # Only the first panel carries the legend.
    axes['(a)'].legend(loc='lower right')

    # Create the target directory if needed (skipped for file-like targets).
    if isinstance(output_filename, (str, os.PathLike)):
        target_dir = os.path.dirname(os.fspath(output_filename))
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

    # Save through the figure handle rather than pyplot's "current figure".
    fig.savefig(output_filename)
    plt.show()

In [None]:
# Simulation configuration to compare (these values also build the file paths).
python_base_path = '/home/mason/full_runs/python'
idl_base_path = '/home/mason/full_runs/idl'
slope = 1.00
r_stat_multiplier = 3.00
min_rim_percentage = 0.40
effective_radius_multiplier = 1.50

# --- Python simulations: one statistics.csv per simulation directory (1..55) ---
filenames = [
    f'{python_base_path}/{slope:.2f}_{r_stat_multiplier:.2f}_{min_rim_percentage:.2f}_{effective_radius_multiplier:.2f}/{x}/statistics.csv'
    for x in range(1, 56)
]

python_sim_stats = []
for index, filename in enumerate(filenames):
    # Do not name this `stats`: that would overwrite the `scipy.stats` import
    # that later cells rely on.
    sim_frame = pd.read_csv(filename)
    sim_frame['simulation_id'] = index
    python_sim_stats.append(sim_frame)

python_stats_combined = pd.concat(python_sim_stats, axis=0).dropna()
# Keep only steps present in every loaded Python simulation.
python_stats_aggregated = aggregate_simulations(python_stats_combined, len(python_sim_stats))


# --- IDL simulations: one run_*_1.csv per simulation (1..55) ---
filenames = [
    f'{idl_base_path}/{slope:.2f}_{r_stat_multiplier:.2f}_{min_rim_percentage:.2f}_{effective_radius_multiplier:.2f}/run_{-slope:.2f}_{effective_radius_multiplier:.2f}_{min_rim_percentage:.2f}_{r_stat_multiplier:.2f}_{x}_1.csv'
    for x in range(1, 56)
]

idl_sim_stats = []
for index, filename in enumerate(filenames):
    sim_frame = pd.read_csv(filename, skiprows=1)
    sim_frame.columns = ['n_craters_added_in_observed_area', 'n_craters_in_observed_area', 'areal_density', 'z', 'za']
    # Rescale IDL areal density by 10000**2 (presumably a unit conversion to
    # match the Python output -- TODO confirm).
    sim_frame.areal_density = sim_frame.areal_density / 10000**2
    sim_frame['simulation_id'] = index
    idl_sim_stats.append(sim_frame)

idl_stats_combined = pd.concat(idl_sim_stats, axis=0).dropna()
# Keep only steps present in every loaded IDL simulation.
idl_stats_aggregated = aggregate_simulations(idl_stats_combined, len(idl_sim_stats))

plot_statistics(python_stats_aggregated)
plot_statistics(idl_stats_aggregated)

In [None]:
# Python vs IDL: mean curve of each metric across all simulations.
plot_four_metric_plots(
    python_stats=python_stats_aggregated,
    idl_stats=idl_stats_aggregated,
    n_craters_column='n_craters_in_observed_area_mean',
    areal_density_column='areal_density_mean',
    z_column='z_mean',
    za_column='za_mean',
    output_filename=f'figures/mean_curves_{slope:.2f}_{r_stat_multiplier:.2f}_{min_rim_percentage:.2f}_{effective_radius_multiplier:.2f}.png',
)

In [None]:
# Python vs IDL: raw curves of the first simulation only (simulation_id == 0).
plot_four_metric_plots(
    python_stats=python_stats_combined.loc[python_stats_combined['simulation_id'] == 0],
    idl_stats=idl_stats_combined.loc[idl_stats_combined['simulation_id'] == 0],
    n_craters_column='n_craters_in_observed_area',
    areal_density_column='areal_density',
    z_column='z',
    za_column='za',
    output_filename=f'figures/single_sim_curves_{slope:.2f}_{r_stat_multiplier:.2f}_{min_rim_percentage:.2f}_{effective_radius_multiplier:.2f}.png',
)

In [None]:
# Another configuration
python_base_path = '/home/mason/full_runs/python'
idl_base_path = '/home/mason/full_runs/idl'
slope = 2.00
r_stat_multiplier = 3.00
min_rim_percentage = 0.40
effective_radius_multiplier = 1.70

# --- Python simulations (1..55) ---
filenames = [
    f'{python_base_path}/{slope:.2f}_{r_stat_multiplier:.2f}_{min_rim_percentage:.2f}_{effective_radius_multiplier:.2f}/{x}/statistics.csv'
    for x in range(1, 56)
]

sim_stats = []
for index, filename in enumerate(filenames):
    # Do not name this `stats`: that would overwrite the `scipy.stats` import
    # that later cells rely on.
    sim_frame = pd.read_csv(filename)
    sim_frame['simulation_id'] = index
    sim_stats.append(sim_frame)

combined_stats = pd.concat(sim_stats, axis=0).dropna()
python_stats = aggregate_simulations(combined_stats, len(sim_stats))


# --- IDL simulations (1..55) ---
filenames = [
    f'{idl_base_path}/{slope:.2f}_{r_stat_multiplier:.2f}_{min_rim_percentage:.2f}_{effective_radius_multiplier:.2f}/run_{-slope:.2f}_{effective_radius_multiplier:.2f}_{min_rim_percentage:.2f}_{r_stat_multiplier:.2f}_{x}_1.csv'
    for x in range(1, 56)
]

sim_stats = []
for index, filename in enumerate(filenames):
    sim_frame = pd.read_csv(filename, skiprows=1)
    sim_frame.columns = ['n_craters_added_in_observed_area', 'n_craters_in_observed_area', 'areal_density', 'z', 'za']
    # Rescale IDL areal density by 10000**2 (presumably a unit conversion to
    # match the Python output -- TODO confirm).
    sim_frame.areal_density = sim_frame.areal_density / 10000**2
    sim_frame['simulation_id'] = index
    sim_stats.append(sim_frame)

combined_stats = pd.concat(sim_stats, axis=0).dropna()
idl_stats = aggregate_simulations(combined_stats, len(sim_stats))

plot_statistics(python_stats)
plot_statistics(idl_stats)

In [None]:
# Plot just mean curves
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))

# (axes, mean column, y-axis label, draw +/-1.96 and +/-2.58 reference lines?)
mean_panels = [
    (axes[0][0], 'n_craters_in_observed_area_mean', 'Craters in Study Region', False),
    (axes[0][1], 'areal_density_mean', 'Areal Density', False),
    (axes[1][0], 'z_mean', 'Z Statistic', True),
    (axes[1][1], 'za_mean', 'Za Statistic', True),
]

for ax, mean_column, y_label, with_reference_lines in mean_panels:
    ax.plot(python_stats['n_craters_added_in_observed_area'], python_stats[mean_column], color='b', label='Python')
    ax.plot(idl_stats['n_craters_added_in_observed_area'], idl_stats[mean_column], color='r', label='IDL')
    if with_reference_lines:
        for level, colour in [(-1.96, 'g'), (1.96, 'g'), (-2.58, 'r'), (2.58, 'r')]:
            ax.axhline(level, color=colour, ls='--')
    ax.set_xlabel('Craters Generated')
    ax.set_ylabel(y_label)

# Only the crater-count panel carries the legend.
axes[0][0].legend(loc='upper left')

plt.show()

### Comparing mean curves using bootstrapping and KS/AD tests

In [3]:
def read_idl_stats(sim_number: int,
                   base_path: str,
                   slope: float,
                   r_stat_multiplier: float,
                   min_rim_percentage: float,
                   effective_radius_multiplier: float) -> pd.DataFrame:
    """
    Load the statistics CSV written by one IDL simulation run.

    The file path is derived from the configuration values and the simulation
    number. The first line of the file is skipped, the next line is consumed
    as the header, and the columns are then renamed to the names used in this
    notebook. ``areal_density`` is divided by ``10000**2`` (presumably a unit
    conversion -- TODO confirm).

    Returns
    -------
    pd.DataFrame
        Columns: ``n_craters_added_in_observed_area``,
        ``n_craters_in_observed_area``, ``areal_density``, ``z``, ``za``.
    """
    column_names = [
        'n_craters_added_in_observed_area',
        'n_craters_in_observed_area',
        'areal_density',
        'z',
        'za',
    ]
    filename = f'{base_path}/{slope:.2f}_{r_stat_multiplier:.2f}_{min_rim_percentage:.2f}_{effective_radius_multiplier:.2f}/run_{-slope:.2f}_{effective_radius_multiplier:.2f}_{min_rim_percentage:.2f}_{r_stat_multiplier:.2f}_{sim_number}_1.csv'

    frame = pd.read_csv(filename, skiprows=1)
    frame.columns = column_names
    frame['areal_density'] = frame['areal_density'] / 10000**2
    return frame

def read_python_stats(sim_number: int,
                      base_path: str,
                      slope: float,
                      r_stat_multiplier: float,
                      min_rim_percentage: float,
                      effective_radius_multiplier: float) -> pd.DataFrame:
    """
    Load the ``statistics.csv`` written by one Python simulation run.

    The file lives in a per-simulation directory under the configuration
    directory derived from the arguments. The first line is read as the
    header and the six columns are then renamed to the names used in this
    notebook.

    Returns
    -------
    pd.DataFrame
        Columns: ``crater_id``, ``n_craters_added_in_observed_area``,
        ``n_craters_in_observed_area``, ``areal_density``, ``z``, ``za``.
    """
    column_names = [
        'crater_id',
        'n_craters_added_in_observed_area',
        'n_craters_in_observed_area',
        'areal_density',
        'z',
        'za',
    ]
    filename = f'{base_path}/{slope:.2f}_{r_stat_multiplier:.2f}_{min_rim_percentage:.2f}_{effective_radius_multiplier:.2f}/{sim_number}/statistics.csv'

    frame = pd.read_csv(filename)
    frame.columns = column_names
    return frame

def get_all_pairwise_ks_and_ad_stats(stats1: List[pd.DataFrame],
                                     stats2: List[pd.DataFrame],
                                     metric: str) -> List[Tuple[float, float]]:
    """
    Compute KS and Anderson-Darling statistics for every cross pair of frames.

    For each frame in ``stats1`` and each frame in ``stats2`` the ``metric``
    column of the two frames is compared with ``stats.kstest`` and
    ``stats.anderson_ksamp``.

    Returns
    -------
    List[Tuple[float, float]]
        ``(ks_statistic, ad_statistic)`` per pair, ordered with ``stats1`` as
        the outer loop and ``stats2`` as the inner loop.
    """
    def _compare(left: pd.Series, right: pd.Series) -> Tuple[float, float]:
        # Only the test statistics are kept; the p-values are discarded.
        ks_value = stats.kstest(left, right).statistic
        ad_value = stats.anderson_ksamp([left, right]).statistic
        return (ks_value, ad_value)

    return [
        _compare(first[metric], second[metric])
        for first in stats1
        for second in stats2
    ]

def get_boostrapped_mean_ci(samples: np.ndarray,
                            n_resamples: int = 5000,
                            random_state=None) -> Tuple[float, float]:
    """
    Bootstrap a 95% confidence interval on the mean of ``samples``.

    Uses the BCa (bias-corrected and accelerated) bootstrap from
    ``scipy.stats.bootstrap``.

    Parameters
    ----------
    samples : array-like
        One-dimensional collection of observations.
    n_resamples : int
        Number of bootstrap resamples to draw.
    random_state : optional
        Seed or random generator forwarded to ``scipy.stats.bootstrap``. Pass a
        fixed value to make the interval reproducible. When left as ``None``
        the argument is not forwarded, so scipy's default behaviour applies.

    Returns
    -------
    Tuple[float, float]
        ``(low, high)`` bounds of the confidence interval.
    """
    # Only forward the seed when the caller supplied one.
    rng_options = {}
    if random_state is not None:
        rng_options['random_state'] = random_state

    # scipy expects a sequence of samples; this is a one-sample statistic, so
    # wrap the observations in a 1-tuple.
    result = stats.bootstrap((np.asarray(samples),),
                             np.mean,
                             confidence_level=0.95,
                             method='BCa',
                             n_resamples=n_resamples,
                             **rng_options)
    interval = result.confidence_interval
    return interval.low, interval.high

In [None]:
# Root directories of the Python and IDL simulation outputs. These are plain
# string literals: the f-prefix was dropped because nothing is interpolated.
python_base_path = '/home/mason/full_runs/python/'
idl_base_path = '/home/mason/full_runs/idl/'

# Simulation configuration compared in the cells below.
slope = 1.00
r_stat_multiplier = 3.00
min_rim_percentage = 0.40
effective_radius_multiplier = 1.50

In [None]:
# Comparing one half of the IDL simulations to the other
# A seeded generator keeps the random split reproducible across kernel
# restarts; the seed value itself is arbitrary.
split_rng = np.random.default_rng(0)

# NOTE(review): range(1, 55) covers simulations 1-54 only, which gives two
# halves of 27; simulation 55 is never used in this cell -- confirm intended.
first_choices = split_rng.choice(range(1, 55), 27, replace=False)
first_choice_set = set(first_choices.tolist())
second_choices = [x for x in range(1, 55) if x not in first_choice_set]

idl_stats1 = [
    read_idl_stats(x,
                   idl_base_path,
                   slope,
                   r_stat_multiplier,
                   min_rim_percentage,
                   effective_radius_multiplier)
    for x in first_choices
]

idl_stats2 = [
    read_idl_stats(x,
                   idl_base_path,
                   slope,
                   r_stat_multiplier,
                   min_rim_percentage,
                   effective_radius_multiplier)
    for x in second_choices
]

# For each metric: KS/AD statistics of every (half 1, half 2) pair, then a
# bootstrapped 95% CI on the mean of those statistics.
metrics = ['n_craters_in_observed_area', 'areal_density', 'z', 'za']
for metric in metrics:
    pairwise_stats = get_all_pairwise_ks_and_ad_stats(idl_stats1, idl_stats2, metric)
    ks_mean_ci = get_boostrapped_mean_ci([x[0] for x in pairwise_stats])
    ad_mean_ci = get_boostrapped_mean_ci([x[1] for x in pairwise_stats])

    print(f'{metric} 95% CI on the mean:')
    print(f'  KS: {ks_mean_ci}')
    print(f'  AD: {ad_mean_ci}')

In [None]:
python_base_path = f'/home/mason/full_runs/python/'
idl_base_path = f'/home/mason/full_runs/idl/'

slope = 1.00
r_stat_multiplier = 3.00
min_rim_percentage = 0.40
effective_radius_multiplier = 1.50
n_sims = 55

# Load the statistics of simulations 1..n_sims for both implementations.
idl_stats = [
    read_idl_stats(sim_number=sim_number,
                   base_path=idl_base_path,
                   slope=slope,
                   r_stat_multiplier=r_stat_multiplier,
                   min_rim_percentage=min_rim_percentage,
                   effective_radius_multiplier=effective_radius_multiplier)
    for sim_number in range(1, n_sims + 1)
]

python_stats = [
    read_python_stats(sim_number=sim_number,
                      base_path=python_base_path,
                      slope=slope,
                      r_stat_multiplier=r_stat_multiplier,
                      min_rim_percentage=min_rim_percentage,
                      effective_radius_multiplier=effective_radius_multiplier)
    for sim_number in range(1, n_sims + 1)
]

# For each metric: KS/AD statistics of every (IDL, Python) pair, then a
# bootstrapped 95% CI on the mean of those statistics.
metrics = ['n_craters_in_observed_area', 'areal_density', 'z', 'za']
for metric in metrics:
    pairwise_stats = get_all_pairwise_ks_and_ad_stats(idl_stats, python_stats, metric)
    ks_values = [ks_value for ks_value, _ in pairwise_stats]
    ad_values = [ad_value for _, ad_value in pairwise_stats]
    ks_mean_ci = get_boostrapped_mean_ci(ks_values)
    ad_mean_ci = get_boostrapped_mean_ci(ad_values)

    print(f'{metric} 95% CI on the mean:')
    print(f'  KS: {ks_mean_ci}')
    print(f'  AD: {ad_mean_ci}')

In [None]:
python_base_path = f'/home/mason/full_runs/python/'
idl_base_path = f'/home/mason/full_runs/idl/'

slope = 2.00
r_stat_multiplier = 3.00
min_rim_percentage = 0.40
effective_radius_multiplier = 1.70
n_sims = 55

# Load the statistics of simulations 1..n_sims for both implementations.
idl_stats = [
    read_idl_stats(sim_number=sim_number,
                   base_path=idl_base_path,
                   slope=slope,
                   r_stat_multiplier=r_stat_multiplier,
                   min_rim_percentage=min_rim_percentage,
                   effective_radius_multiplier=effective_radius_multiplier)
    for sim_number in range(1, n_sims + 1)
]

python_stats = [
    read_python_stats(sim_number=sim_number,
                      base_path=python_base_path,
                      slope=slope,
                      r_stat_multiplier=r_stat_multiplier,
                      min_rim_percentage=min_rim_percentage,
                      effective_radius_multiplier=effective_radius_multiplier)
    for sim_number in range(1, n_sims + 1)
]

# For each metric: KS/AD statistics of every (IDL, Python) pair, then a
# bootstrapped 95% CI on the mean of those statistics.
metrics = ['n_craters_in_observed_area', 'areal_density', 'z', 'za']
for metric in metrics:
    pairwise_stats = get_all_pairwise_ks_and_ad_stats(idl_stats, python_stats, metric)
    ks_values = [ks_value for ks_value, _ in pairwise_stats]
    ad_values = [ad_value for _, ad_value in pairwise_stats]
    ks_mean_ci = get_boostrapped_mean_ci(ks_values)
    ad_mean_ci = get_boostrapped_mean_ci(ad_values)

    print(f'{metric} 95% CI on the mean:')
    print(f'  KS: {ks_mean_ci}')
    print(f'  AD: {ad_mean_ci}')

### Comparing SFDs using bootstrapped CIs on KS and AD statistics

The final SFDs of all Python simulations are compared against those of all IDL simulations. For each pair (Python sim, IDL sim), KS and AD statistics are computed. A 95% confidence interval is bootstrapped from the resulting KS and AD statistics.

In [None]:
def read_idl_radii(sim_number: int,
                   base_path: str,
                   slope: float,
                   r_stat_multiplier: float,
                   min_rim_percentage: float,
                   effective_radius_multiplier: float) -> pd.Series:
    """
    Load crater radii from the ``*_3.csv`` file of one IDL simulation run.

    The first three lines of the file are skipped and the rest is read
    without a header as the columns ``x``, ``y``, ``radius`` and
    ``order_removed``. Only rows with ``order_removed == 0`` are kept
    (presumably craters that were never removed -- TODO confirm).

    Returns
    -------
    pd.Series
        The ``radius`` values of the retained rows. The return annotation
        previously said ``pd.DataFrame``, but a Series has always been returned.
    """
    filename = f'{base_path}/{slope:.2f}_{r_stat_multiplier:.2f}_{min_rim_percentage:.2f}_{effective_radius_multiplier:.2f}/run_-{slope:.2f}_{effective_radius_multiplier:.2f}_{min_rim_percentage:.2f}_{r_stat_multiplier:.2f}_{sim_number}_3.csv'

    craters = pd.read_csv(filename, skiprows=3, header=None)
    craters.columns = ['x', 'y', 'radius', 'order_removed']

    retained = craters['order_removed'] == 0
    return craters.loc[retained, 'radius']

def read_python_radii(sim_number: int,
                      base_path: str,
                      slope: float,
                      r_stat_multiplier: float,
                      min_rim_percentage: float,
                      effective_radius_multiplier: float,
                      step_number: int = 5000) -> pd.Series:
    """
    Load crater radii from one ``state_<step_number>.csv`` of a Python run.

    Parameters
    ----------
    sim_number : int
        Simulation directory to read from.
    base_path, slope, r_stat_multiplier, min_rim_percentage, effective_radius_multiplier
        Values used to build the configuration directory name.
    step_number : int
        Which state file to read; defaults to 5000.

    Returns
    -------
    pd.Series
        The ``radius`` column of the state file. The return annotation
        previously said ``pd.DataFrame``, but a Series has always been returned.
    """
    filename = f'{base_path}/{slope:.2f}_{r_stat_multiplier:.2f}_{min_rim_percentage:.2f}_{effective_radius_multiplier:.2f}/{sim_number}/state_{step_number}.csv'

    state = pd.read_csv(filename)
    return state['radius']

In [None]:
slope = 1.00
r_stat_multiplier = 3.00
min_rim_percentage = 0.40
effective_radius_multiplier = 1.50
n_sims = 55

# One single-column ('radius') frame per simulation, for each implementation.
idl_radii = [
    pd.DataFrame(
        read_idl_radii(sim_number=sim_number,
                       base_path=idl_base_path,
                       slope=slope,
                       r_stat_multiplier=r_stat_multiplier,
                       min_rim_percentage=min_rim_percentage,
                       effective_radius_multiplier=effective_radius_multiplier),
        columns=['radius'],
    )
    for sim_number in range(1, n_sims + 1)
]

# The Python radii are read from state file 4999, not the default 5000.
python_radii = [
    pd.DataFrame(
        read_python_radii(sim_number=sim_number,
                          base_path=python_base_path,
                          slope=slope,
                          r_stat_multiplier=r_stat_multiplier,
                          min_rim_percentage=min_rim_percentage,
                          effective_radius_multiplier=effective_radius_multiplier,
                          step_number=4999),
        columns=['radius'],
    )
    for sim_number in range(1, n_sims + 1)
]

# KS/AD statistics of every (IDL, Python) pair, then bootstrapped CIs on
# their means.
pairwise_stats = get_all_pairwise_ks_and_ad_stats(idl_radii, python_radii, 'radius')
ks_mean_ci = get_boostrapped_mean_ci([ks_value for ks_value, _ in pairwise_stats], n_resamples=10000)
ad_mean_ci = get_boostrapped_mean_ci([ad_value for _, ad_value in pairwise_stats], n_resamples=10000)

print(f'Radius 95% CI on the mean:')
print(f'  KS statistic: {ks_mean_ci}')
print(f'  AD statistic: {ad_mean_ci}')

In [None]:
slope = 2.00
r_stat_multiplier = 3.00
min_rim_percentage = 0.40
effective_radius_multiplier = 1.70
n_sims = 55

# One single-column ('radius') frame per simulation, for each implementation.
idl_radii = [
    pd.DataFrame(
        read_idl_radii(sim_number=sim_number,
                       base_path=idl_base_path,
                       slope=slope,
                       r_stat_multiplier=r_stat_multiplier,
                       min_rim_percentage=min_rim_percentage,
                       effective_radius_multiplier=effective_radius_multiplier),
        columns=['radius'],
    )
    for sim_number in range(1, n_sims + 1)
]

# The Python radii are read from state file 4999, not the default 5000.
python_radii = [
    pd.DataFrame(
        read_python_radii(sim_number=sim_number,
                          base_path=python_base_path,
                          slope=slope,
                          r_stat_multiplier=r_stat_multiplier,
                          min_rim_percentage=min_rim_percentage,
                          effective_radius_multiplier=effective_radius_multiplier,
                          step_number=4999),
        columns=['radius'],
    )
    for sim_number in range(1, n_sims + 1)
]

# KS/AD statistics of every (IDL, Python) pair, then bootstrapped CIs on
# their means.
pairwise_stats = get_all_pairwise_ks_and_ad_stats(idl_radii, python_radii, 'radius')
ks_mean_ci = get_boostrapped_mean_ci([ks_value for ks_value, _ in pairwise_stats], n_resamples=10000)
ad_mean_ci = get_boostrapped_mean_ci([ad_value for _, ad_value in pairwise_stats], n_resamples=10000)

print(f'Radius 95% CI on the mean:')
print(f'  KS statistic: {ks_mean_ci}')
print(f'  AD statistic: {ad_mean_ci}')

### Comparing using bootstrap T

In [71]:
python_base_path = f'/home/mason/full_runs/python/'
idl_base_path = f'/home/mason/full_runs/idl/'

slope = 1.00
r_stat_multiplier = 3.00
min_rim_percentage = 0.40
effective_radius_multiplier = 1.50
# Only the first 5 simulations of each implementation are loaded here.
n_sims = 5

idl_stats = [
    read_idl_stats(sim_number=sim_number,
                   base_path=idl_base_path,
                   slope=slope,
                   r_stat_multiplier=r_stat_multiplier,
                   min_rim_percentage=min_rim_percentage,
                   effective_radius_multiplier=effective_radius_multiplier)
    for sim_number in range(1, n_sims + 1)
]

python_stats = [
    read_python_stats(sim_number=sim_number,
                      base_path=python_base_path,
                      slope=slope,
                      r_stat_multiplier=r_stat_multiplier,
                      min_rim_percentage=min_rim_percentage,
                      effective_radius_multiplier=effective_radius_multiplier)
    for sim_number in range(1, n_sims + 1)
]

In [72]:
def get_all_pairwise_samples(python_series: List[pd.Series],
                             idl_series: List[pd.Series]) -> pd.DataFrame:
    """
    Stack every (Python, IDL) pair of series into one long two-column frame.

    Each pair is placed side by side, aligned on index, as columns ``python``
    and ``idl``. Pairs whose element-wise difference has no non-zero entry
    (for example a series paired with itself) are skipped. The per-pair frames
    are concatenated row-wise and the original index is moved into a regular
    column by ``reset_index()``.

    Returns
    -------
    pd.DataFrame
        The former index as the first column, followed by ``python`` and
        ``idl``.
    """
    pair_frames = [
        pd.concat([py_values.rename("python"), idl_values.rename("idl")], axis=1)
        for py_values in python_series
        for idl_values in idl_series
        if (idl_values - py_values).any()
    ]

    stacked = pd.concat(pair_frames, axis=0)
    return stacked.reset_index()

In [64]:
metric_name = "areal_density"
# Control comparison: both sides are taken from the IDL runs.
python_series = [sim_frame[metric_name] for sim_frame in idl_stats]
idl_series = [sim_frame[metric_name] for sim_frame in idl_stats]
pairwise_samples = get_all_pairwise_samples(python_series, idl_series)
difference = pairwise_samples["python"] - pairwise_samples["idl"]
# One-sample t-test of the pairwise differences against a mean of zero.
stats.ttest_1samp(a=difference.dropna(), popmean=0.0)

Ttest_1sampResult(statistic=4.4263987815523627e-13, pvalue=0.9999999999996468)

In [65]:
metric_name = "z"
# Control comparison: both sides are taken from the IDL runs.
python_series = [sim_frame[metric_name] for sim_frame in idl_stats]
idl_series = [sim_frame[metric_name] for sim_frame in idl_stats]
pairwise_samples = get_all_pairwise_samples(python_series, idl_series)
difference = pairwise_samples["python"] - pairwise_samples["idl"]
# One-sample t-test of the pairwise differences against a mean of zero.
stats.ttest_1samp(a=difference.dropna(), popmean=0.0)

Ttest_1sampResult(statistic=1.632774660839682e-13, pvalue=0.9999999999998697)

In [66]:
metric_name = "za"
# Control comparison: both sides are taken from the IDL runs.
python_series = [sim_frame[metric_name] for sim_frame in idl_stats]
idl_series = [sim_frame[metric_name] for sim_frame in idl_stats]
pairwise_samples = get_all_pairwise_samples(python_series, idl_series)
difference = pairwise_samples["python"] - pairwise_samples["idl"]
# One-sample t-test of the pairwise differences against a mean of zero.
stats.ttest_1samp(a=difference.dropna(), popmean=0.0)

Ttest_1sampResult(statistic=2.363041035925222e-14, pvalue=0.9999999999999811)

In [67]:
metric_name = "areal_density"
# Control comparison: both sides are taken from the Python runs.
python_series = [sim_frame[metric_name] for sim_frame in python_stats]
idl_series = [sim_frame[metric_name] for sim_frame in python_stats]
pairwise_samples = get_all_pairwise_samples(python_series, idl_series)
difference = pairwise_samples["python"] - pairwise_samples["idl"]
# One-sample t-test of the pairwise differences against a mean of zero.
stats.ttest_1samp(a=difference.dropna(), popmean=0.0)

Ttest_1sampResult(statistic=-8.40596436559658e-14, pvalue=0.9999999999999329)

In [68]:
metric_name = "areal_density"
# Python runs against IDL runs.
python_series = [sim_frame[metric_name] for sim_frame in python_stats]
idl_series = [sim_frame[metric_name] for sim_frame in idl_stats]
pairwise_samples = get_all_pairwise_samples(python_series, idl_series)
difference = pairwise_samples["python"] - pairwise_samples["idl"]
# One-sample t-test of the pairwise differences against a mean of zero.
stats.ttest_1samp(a=difference.dropna(), popmean=0.0)

Ttest_1sampResult(statistic=330.70275727787225, pvalue=0.0)

In [69]:
metric_name = "z"
# Python runs against IDL runs.
python_series = [sim_frame[metric_name] for sim_frame in python_stats]
idl_series = [sim_frame[metric_name] for sim_frame in idl_stats]
pairwise_samples = get_all_pairwise_samples(python_series, idl_series)
difference = pairwise_samples["python"] - pairwise_samples["idl"]
# One-sample t-test of the pairwise differences against a mean of zero.
stats.ttest_1samp(a=difference.dropna(), popmean=0.0)

Ttest_1sampResult(statistic=-194.0164009638684, pvalue=0.0)

In [70]:
metric_name = "za"
# Python runs against IDL runs.
python_series = [sim_frame[metric_name] for sim_frame in python_stats]
idl_series = [sim_frame[metric_name] for sim_frame in idl_stats]
pairwise_samples = get_all_pairwise_samples(python_series, idl_series)
difference = pairwise_samples["python"] - pairwise_samples["idl"]
# One-sample t-test of the pairwise differences against a mean of zero.
stats.ttest_1samp(a=difference.dropna(), popmean=0.0)

Ttest_1sampResult(statistic=124.97157817429037, pvalue=0.0)

In [None]:
# Bootstrap the mean of the current `difference` values: 10 resamples,
# processed in batches of 2.
bootstrap_result = stats.bootstrap(
    data=(difference.dropna().to_numpy(),),
    statistic=np.mean,
    batch=2,
    n_resamples=10,
)
bootstrap_result

In [None]:
# Bootstrap the mean of the current `difference` values: 100 resamples,
# processed in batches of 5.
bootstrap_result = stats.bootstrap(
    data=(difference.dropna().to_numpy(),),
    statistic=np.mean,
    batch=5,
    n_resamples=100,
)
bootstrap_result

In [None]:
# Bootstrap the mean of the current `difference` values: 1000 resamples,
# processed in batches of 5.
bootstrap_result = stats.bootstrap(
    data=(difference.dropna().to_numpy(),),
    statistic=np.mean,
    batch=5,
    n_resamples=1000,
)
bootstrap_result

In [None]:
metric_name = "z"
# Python runs against IDL runs.
python_series = [sim_frame[metric_name] for sim_frame in python_stats]
idl_series = [sim_frame[metric_name] for sim_frame in idl_stats]
pairwise_samples = get_all_pairwise_samples(python_series, idl_series)
difference = pairwise_samples["python"] - pairwise_samples["idl"]

# Bootstrap the mean pairwise difference: 10 resamples, in batches of 2.
bootstrap_result = stats.bootstrap(
    data=(difference.dropna().to_numpy(),),
    statistic=np.mean,
    batch=2,
    n_resamples=10,
)
bootstrap_result

In [None]:
metric_name = "za"
# Python runs against IDL runs.
python_series = [sim_frame[metric_name] for sim_frame in python_stats]
idl_series = [sim_frame[metric_name] for sim_frame in idl_stats]
pairwise_samples = get_all_pairwise_samples(python_series, idl_series)
difference = pairwise_samples["python"] - pairwise_samples["idl"]

# Bootstrap the mean pairwise difference: 10 resamples, in batches of 2.
bootstrap_result = stats.bootstrap(
    data=(difference.dropna().to_numpy(),),
    statistic=np.mean,
    batch=2,
    n_resamples=10,
)
bootstrap_result