# Change–Learn Trade-Off Sweep Plots (SciencePlots)

This notebook reproduces the Jobseeker sweep plots from `performative-sweep.zip` using:
- $D_Z = 5.29$
- $L_{\ell} = \sqrt{28}$

It generates:
- `change_learn_tradeoff_thm313_science_DZ5p29_LellSqrt28.png`
- `change_learn_tradeoff_thm315_science_DZ5p29_LellSqrt28.png`


In [None]:
# If needed, install SciencePlots:
# !pip -q install SciencePlots

import os
import math
import zipfile
import shutil

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss

import scienceplots  # noqa: F401

# SciencePlots styling (no LaTeX dependency).
# NOTE(review): the "science" and "no-latex" style names are presumably
# registered as a side effect of `import scienceplots` above, which is why that
# otherwise-unused import carries a `noqa: F401` marker.
plt.style.use(["science", "no-latex"])

In [None]:
# --------------------------
# Inputs / outputs
# --------------------------
# Archive holding the sweep runs, and the directory it is unpacked into.
# EXTRACT_DIR is passed to unzip(), which removes any existing directory of
# that name before extracting.
ZIP_PATH = "performative-sweep.zip"  # path to the zip file
EXTRACT_DIR = "performative_sweep_extracted"

# Output image paths, one per plotted bound (written to the working directory).
OUT_PLOT_313 = "change_learn_tradeoff_thm313_science_DZ5p29_LellSqrt28.png"
OUT_PLOT_315 = "change_learn_tradeoff_thm315_science_DZ5p29_LellSqrt28.png"

# --------------------------
# Constants
# --------------------------
# Confidence parameter; enters the sampling term as sqrt(2 * log(2 / delta) / n).
delta = 0.05
# NOTE: q is a hard-coded quantile matching delta = 0.05 (0.975 = 1 - delta / 2);
# it is not recomputed, so it must be updated by hand if delta changes.
q = 1.959963984540054  # Phi^{-1}(0.975) for delta=0.05
# Used as the root (1 / p_bound) in compute_R and as the power in components_315.
p_bound = 2            # theorem exponent/order

# D_Z scales the radius R in compute_R and appears in both complexity terms.
D_Z = 5.29
# L_ell * L_f multiplies the R-dependent part of the Thm 3.13 complexity term;
# L_ell alone scales the performative term 2 * L_ell * R.
L_f = 0.25             # since ||theta||<=1 and sigma'(t) <= 1/4
L_ell = math.sqrt(28)
# F multiplies the sampling term.
F = 1.0

# C_infty is the R-independent part of the complexity term in both bounds.
C_infty = 7.3855
# Default value of the B argument of components_315.
B_const = 0.001

# --------------------------
# Plot labels / sizing
# --------------------------
# Figure size handed to plt.subplots (matplotlib interprets it in inches).
FIGSIZE = (4.2, 4.2)
# Title and axis labels shared by both plots.
TITLE = "Change-Learn Trade-Off on Jobseeker Data"
XLABEL = "Share of Jobseekers Receiving Job Trainings"
YLABEL = "Generalization Gap Bound (Blue) and Its Components"

In [None]:
# --------------------------
# Helpers
# --------------------------
def unzip(zip_path: str, extract_dir: str) -> None:
    """Extract *zip_path* into a freshly created *extract_dir*.

    Any existing *extract_dir* is removed first so stale files from an earlier
    run cannot leak into the results.

    The archive is opened *before* the old directory is deleted: if the zip is
    missing or corrupt, the exception propagates and the previous extraction is
    left untouched.

    Args:
        zip_path: Path of the zip archive to read.
        extract_dir: Directory to (re)create and extract into.

    Raises:
        FileNotFoundError: If *zip_path* does not exist.
        zipfile.BadZipFile: If *zip_path* is not a valid zip archive.
    """
    with zipfile.ZipFile(zip_path, "r") as archive:
        # Only wipe the previous extraction once the archive is known to open.
        if os.path.exists(extract_dir):
            shutil.rmtree(extract_dir)
        os.makedirs(extract_dir, exist_ok=True)
        archive.extractall(extract_dir)


def find_run_folders(extract_dir: str):
    """List every directory under *extract_dir* that holds both prediction CSVs.

    A directory qualifies when it directly contains ``train_predictions.csv``
    and ``test_predictions.csv``. Results are returned in ``os.walk`` order.
    """
    required = ("train_predictions.csv", "test_predictions.csv")
    return [
        dirpath
        for dirpath, _subdirs, filenames in os.walk(extract_dir)
        if all(name in filenames for name in required)
    ]


def parse_r_from_folder(folder: str):
    """
    Parse r=m/n from a folder name of the form "..._p_0.2500".
    In this sweep, folder "p" means r (NOT p_bound).

    Only the last path component is inspected. The path is normalised first so
    that a trailing separator ("runs/x_p_0.25/") does not hide the folder name.

    Args:
        folder: Path to a run folder.

    Returns:
        The value after the final "_p_" as a float, or None when the folder
        name has no "_p_" marker or the text after it is not a valid float.
    """
    # normpath strips trailing separators; without it basename() would be "".
    base = os.path.basename(os.path.normpath(folder))
    if "_p_" not in base:
        return None
    try:
        return float(base.split("_p_")[-1])
    except ValueError:
        return None


def compute_R(r: float, n: int) -> float:
    """
    Radius for share r and sample size n:
        R = max( (q * sqrt(r(1-r)/n))^{1/p_bound} * D_Z,  r^{1/p_bound} * D_Z )

    Reads the module-level constants q, p_bound and D_Z.
    """
    root = 1 / p_bound
    prop_sd = math.sqrt(r * (1 - r) / n)
    candidates = (
        (q * prop_sd) ** root * D_Z,
        (r ** root) * D_Z,
    )
    return max(candidates)


def components_313(r: float, n: int):
    """
    Components for the bound labeled 'Thm 3.13' in the plots.

    Theorem term: (48/sqrt(n)) * ( C_infty + L_ell*L_f*R^{1-p}*D_Z^p )
    With p_bound=2: R^{1-p}*D_Z^p = D_Z^2 / R

    The exponent is taken from p_bound (rather than hard-coding 2) so this
    stays consistent with compute_R and components_315 if p_bound is changed.
    For p_bound=2 the arithmetic is identical to D_Z**2 / R.

    When R^{p-1} is 0 (e.g. r == 0 with p_bound > 1) the R-dependent factor is
    reported as +inf instead of raising ZeroDivisionError, so the complexity
    term and the total are infinite for that point.

    Args:
        r: Share passed to compute_R.
        n: Sample size.

    Returns:
        Tuple (total, complexity, performative, sampling).
    """
    R = compute_R(r, n)

    # D_Z^p / R^{p-1}, i.e. R^{1-p} * D_Z^p.
    r_power = R ** (p_bound - 1)
    if r_power == 0:
        radius_factor = math.inf
    else:
        radius_factor = D_Z ** p_bound / r_power

    complexity = (48 / math.sqrt(n)) * (C_infty + (L_ell * L_f) * radius_factor)
    sampling = F * math.sqrt(2 * math.log(2 / delta) / n)
    performative = 2 * L_ell * R
    total = complexity + sampling + performative
    return total, complexity, performative, sampling


def components_315(r: float, n: int, B: float = B_const):
    """
    Components for the bound labeled 'Thm. 3.15' in the plots.

    Replacement complexity add-on:
        B * 2^{p-1} * (1 + D_Z/R)^p
    with p_bound=2.

    When R == 0 (e.g. r == 0) the ratio D_Z/R is undefined; instead of raising
    ZeroDivisionError the add-on is reported as its limiting value: infinite
    (with the sign of B) for B != 0, and 0 for B == 0.

    Args:
        r: Share passed to compute_R.
        n: Sample size.
        B: Multiplier of the replacement add-on; defaults to B_const.

    Returns:
        Tuple (total, complexity, performative, sampling).
    """
    R = compute_R(r, n)

    if R == 0:
        # Avoid 0 * inf = nan when B is exactly zero.
        replacement = 0.0 if B == 0 else math.copysign(math.inf, B)
    else:
        replacement = B * (2 ** (p_bound - 1)) * (1 + D_Z / R) ** p_bound

    complexity = (48 / math.sqrt(n)) * (C_infty + replacement)
    sampling = F * math.sqrt(2 * math.log(2 / delta) / n)
    performative = 2 * L_ell * R
    total = complexity + sampling + performative
    return total, complexity, performative, sampling


def compute_empirical_logloss(csv_path: str) -> float:
    """Return the log loss of the predictions stored in *csv_path*.

    The CSV must provide the columns ``y_true`` and ``y_pred_proba``.
    Probabilities are clipped to [1e-15, 1 - 1e-15] before being scored with
    ``log_loss`` using the fixed label set [0, 1].
    """
    frame = pd.read_csv(csv_path)
    eps = 1e-15
    targets = frame["y_true"].to_numpy()
    raw_probs = frame["y_pred_proba"].to_numpy()
    clipped_probs = np.clip(raw_probs, eps, 1 - eps)
    loss = log_loss(targets, clipped_probs, labels=[0, 1])
    return float(loss)


def make_tradeoff_plot(df: pd.DataFrame,
                       total_col: str,
                       comp_col: str,
                       perf_col: str,
                       samp_col: str,
                       total_label: str,
                       outpath: str) -> None:
    """Plot a total bound and its three components against r and save the figure.

    Args:
        df: Frame with an "r" column plus the four value columns named below.
        total_col: Column holding the total bound.
        comp_col: Column holding the complexity term.
        perf_col: Column holding the performative term.
        samp_col: Column holding the sampling term.
        total_label: Legend label for the total-bound line.
        outpath: Destination image path (saved at 300 dpi).

    The figure is closed even if plotting or saving raises, so a failed call
    does not leave an open figure behind in pyplot's registry.
    """
    fig, ax = plt.subplots(figsize=FIGSIZE)
    try:
        # Lines WITH point markers so legend shows markers (points) too.
        series = (
            (total_col, total_label),
            (comp_col, "Complexity term"),
            (perf_col, "Performative term"),
            (samp_col, "Sampling term"),
        )
        for column, label in series:
            ax.plot(df["r"], df[column], marker="o", markersize=3, linewidth=1.2, label=label)

        ax.set_title(TITLE)
        ax.set_xlabel(XLABEL)
        ax.set_ylabel(YLABEL)
        ax.legend(fontsize=8)

        fig.tight_layout()
        fig.savefig(outpath, dpi=300)
    finally:
        plt.close(fig)

In [None]:
# --------------------------
# Run: unzip, compute bounds, and plot
# --------------------------
# Fail early, with an actionable message, when the archive is missing.
if not os.path.exists(ZIP_PATH):
    raise FileNotFoundError(f"Could not find {ZIP_PATH}. Put the zip next to this notebook or update ZIP_PATH.")

unzip(ZIP_PATH, EXTRACT_DIR)
run_folders = find_run_folders(EXTRACT_DIR)
if not run_folders:
    raise RuntimeError("No run folders found (expected train_predictions.csv and test_predictions.csv).")

# Order in which components_313 / components_315 return their values.
component_names = ("total", "complexity", "performative", "sampling")

records = []
for folder in run_folders:
    r = parse_r_from_folder(folder)
    if r is None:
        raise RuntimeError(f"Could not parse r from folder name: {folder}")

    train_csv = os.path.join(folder, "train_predictions.csv")
    test_csv = os.path.join(folder, "test_predictions.csv")

    # Optional: empirical risks (not used in plots)
    loss_train = compute_empirical_logloss(train_csv)
    loss_test = compute_empirical_logloss(test_csv)

    # The sample size n is the number of rows in the training predictions.
    n_train = len(pd.read_csv(train_csv))

    row = {
        "r": r,
        "n_train": n_train,
        "train_logloss": loss_train,
        "test_logloss": loss_test,
    }
    bounds = (
        ("313", components_313(r, n_train)),
        ("315", components_315(r, n_train, B=B_const)),
    )
    for tag, values in bounds:
        for name, value in zip(component_names, values):
            row[f"{name}_{tag}"] = value
    records.append(row)

# One row per run, ordered by r so the plotted lines run left to right.
df = pd.DataFrame(records)
df = df.sort_values("r").reset_index(drop=True)
df.head()

In [None]:
# One plot per bound: (column suffix, legend label for the total, output path).
plot_specs = (
    ("313", "Total bound (Thm 3.13)", OUT_PLOT_313),
    ("315", "Total bound (Thm. 3.15)", OUT_PLOT_315),
)

for suffix, label, target in plot_specs:
    make_tradeoff_plot(
        df,
        total_col=f"total_{suffix}",
        comp_col=f"complexity_{suffix}",
        perf_col=f"performative_{suffix}",
        samp_col=f"sampling_{suffix}",
        total_label=label,
        outpath=target,
    )

print("Wrote:")
for _, _, target in plot_specs:
    print(" -", target)

## Display plots

In [None]:
from PIL import Image
from IPython.display import display

# Show the two saved figures inline, in the order they were written.
for saved_plot in (OUT_PLOT_313, OUT_PLOT_315):
    display(Image.open(saved_plot))