In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
import polars as pl
import polars.selectors as cs

# Species columns of the summary table; the single source of truth for both the
# column selector below and any per-species reporting.
SPECIES = ["spruce", "pine", "beech", "oak"]

# Load the performance summary CSV file
PERF_CSV = "./cache/performance_summary.csv"
perf = pl.read_csv(PERF_CSV)

# Build the selector from SPECIES instead of repeating the names, so the two
# lists cannot drift apart.
cols = cs.by_name(*SPECIES)
# Identifier columns that are kept as-is when the species columns are unpivoted.
keys = ["ablation", "model", "group_by", "split"]

# Reshape the wide table (one column per species) into long form, parse the
# "<mean> ± <std>" strings into two float columns, and keep only the rows with
# model == "lgbm", group_by == "tree_id" and split == "test".
perf = (
    perf.unpivot(
        index=keys,
        on=cols,
        variable_name="species",
        value_name="r2_score",
    )
    .with_columns(
        # Left half of the split is the mean, right half the standard deviation.
        pl.col("r2_score").str.split(" ± ").list.get(0).cast(pl.Float64).alias("r2_mean"),
        pl.col("r2_score").str.split(" ± ").list.get(1).cast(pl.Float64).alias("r2_std"),
    )
    .drop("r2_score")
    .filter(
        # Multiple predicates passed to filter() are combined with AND.
        pl.col("model") == "lgbm",
        pl.col("group_by") == "tree_id",
        pl.col("split") == "test",
    )
    .select("ablation", "species", "r2_mean", "r2_std")
)

perf

ablation,species,r2_mean,r2_std
str,str,f64,f64
"""all""","""spruce""",0.5,0.01
"""all""","""pine""",0.44,0.02
"""all""","""beech""",0.46,0.01
"""all""","""oak""",0.38,0.06


In [3]:
from scipy.stats import ttest_ind_from_stats
from functools import partial

# Sample size assumed behind every "mean ± std" entry, used for both samples.
# NOTE(review): presumably the number of CV folds / repeated runs per score —
# confirm against whatever produced the performance summary.
N_OBS = 5

# Two-sample t-test on summary statistics with both sample sizes fixed. Built
# once here rather than once per row inside the lambda.
# NOTE(review): ttest_ind_from_stats expects corrected (ddof=1) standard
# deviations — confirm the summary's std values follow that convention.
_ttest_from_summary = partial(ttest_ind_from_stats, nobs1=N_OBS, nobs2=N_OBS)

# Pair every ablation with every ablation (itself included) within each species
# and attach the t-test p-value computed from the two (mean, std) summaries.
perf_pvalue = (
    perf.join(perf, on=["species"], suffix="_2")
    .select(
        pl.col("species"),
        pl.col("ablation").alias("ablation1"),
        pl.col("ablation_2").alias("ablation2"),
        pl.col("r2_mean").alias("mean1"),
        pl.col("r2_std").alias("std1"),
        pl.col("r2_mean_2").alias("mean2"),
        pl.col("r2_std_2").alias("std2"),
    )
    .with_columns(
        pl.struct(["mean1", "std1", "mean2", "std2"])
        .map_elements(
            # The struct's field names match the t-test's keyword arguments,
            # so each row dict can be unpacked straight into the call.
            lambda row: _ttest_from_summary(**row).pvalue,
            return_dtype=pl.Float64,
        )
        .alias("p-value")
    )
)

# Print one ablation-by-ablation p-value matrix per species as a Markdown table.
# The display options are identical for every species, so the Config context is
# entered once around the whole loop instead of once per iteration.
with pl.Config() as cfg:
    cfg.set_tbl_formatting("ASCII_MARKDOWN")
    cfg.set_float_precision(3)
    cfg.set_tbl_rows(100)
    cfg.set_tbl_hide_column_data_types(True)

    for species in SPECIES:
        print()
        print(f"Species: {species}")
        # print() rather than rich display, so the table is emitted as
        # plain Markdown text.
        print(
            perf_pvalue.filter(pl.col("species") == species).pivot(
                index="ablation1", on="ablation2", values="p-value"
            )
        )


Species: spruce
shape: (1, 2)
| ablation1 | all   |
|-----------|-------|
| all       | 1.000 |

Species: pine
shape: (1, 2)
| ablation1 | all   |
|-----------|-------|
| all       | 1.000 |

Species: beech
shape: (1, 2)
| ablation1 | all   |
|-----------|-------|
| all       | 1.000 |

Species: oak
shape: (1, 2)
| ablation1 | all   |
|-----------|-------|
| all       | 1.000 |


In [4]:
# Ablation-by-ablation matrix of p-values; rows that share the same
# (ablation1, ablation2) pair are combined by taking their mean.
perf_pvalue.pivot(
    on="ablation2",
    index="ablation1",
    values="p-value",
    aggregate_function="mean",
)

ablation1,all
str,f64
"""all""",1.0
