In [1]:
# ==============================================================================
# V5 CELL 1 — THE SHADOWMAP ORACLE CHASER (144-D WITH L2 PRIOR & MEDIAN LOCK)
# ==============================================================================

import sys, subprocess, os, time, math
import numpy as np
import scipy.sparse as sp
import scipy.stats
import concurrent.futures

try:
    import cma
    import pymatching
except ImportError:
    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "cma", "pymatching", "scipy"], check=True)
    import cma
    import pymatching

print("="*75)
print("V5 SHADOWMAP PHYSICS ENGINE: 144-D REGULARIZED CMA-ES (THE ORACLE CHASER)")
print("="*75)

d = 9
p_good = 0.04
p_bad = 0.20
trials = 80000

print(f"Fabricating d={d} chip with exactly 144 independent physical edges...")
print(f" -> Injecting 'Dead Quadrant' (20% error) in the Top-Right corner...")

H_rows, H_cols, L_cols, true_probs = [], [], [], []
edge_types, edge_coords = [], []
edge_idx = 0
j_cut = d // 2

for r in range(d):
    for c in range(d):
        u = r * d + c

        # Horizontal
        if c + 1 < d:
            H_rows.extend([u, r * d + (c + 1)]); H_cols.extend([edge_idx, edge_idx])
            is_defect = (r <= 3) and (c >= 5)
            true_probs.append(p_bad if is_defect else p_good)
            edge_types.append('H'); edge_coords.append((r, c))
            if c == j_cut: L_cols.append(edge_idx)
            edge_idx += 1

        # Vertical
        if r + 1 < d:
            H_rows.extend([u, (r + 1) * d + c]); H_cols.extend([edge_idx, edge_idx])
            is_defect = (r <= 3) and (c >= 5)
            true_probs.append(p_bad if is_defect else p_good)
            edge_types.append('V'); edge_coords.append((r, c))
            edge_idx += 1

num_physical_edges = edge_idx

# Virtual Boundaries
for r in range(d):
    for c in (0, d - 1):
        u = r * d + c
        H_rows.append(u); H_cols.append(edge_idx)
        true_probs.append(0.0)
        edge_types.append('B'); edge_coords.append((r, c))
        edge_idx += 1

num_total_edges = edge_idx

H = sp.csc_matrix(([1]*len(H_rows), (H_rows, H_cols)), shape=(d*d, num_total_edges), dtype=np.uint8)
L = sp.csc_matrix(([1]*len(L_cols), ([0]*len(L_cols), L_cols)), shape=(1, num_total_edges), dtype=np.uint8)
true_probs = np.array(true_probs)

print(f"Generating {trials:,} CRN Training & Testing Trials...")
H_csr, L_csr = H.tocsr(), L.tocsr()

def make_crn(seed):
    rng = np.random.default_rng(seed)
    noise = (rng.random((trials, num_total_edges)) < true_probs).astype(np.uint8)
    syn = np.asarray((H_csr @ noise.T).T % 2, dtype=np.uint8)
    obs = np.asarray((L_csr @ noise.T).T % 2, dtype=np.uint8)[:, 0]
    return syn, obs

syn_train, obs_train = make_crn(777)
syn_test,  obs_test  = make_crn(888)

initial_weight = math.log((1 - p_good) / p_good)

# ------------------------------------------------------------------------------
# THE PATCHES: Median Normalization & L2 Prior
# ------------------------------------------------------------------------------
def normalize_to_median(w, target_median):
    w = np.maximum(0.01, np.asarray(w, dtype=np.float32))
    med = np.median(w)
    if med <= 0: return w
    return w * (target_median / med)

def evaluate_cma(weights_phys, dataset="train", return_arr=False):
    # PATCH 1: Fix Scale Degeneracy
    weights_phys = normalize_to_median(weights_phys, initial_weight)

    full_weights = np.zeros(num_total_edges, dtype=np.float32)
    full_weights[:num_physical_edges] = weights_phys
    full_weights[num_physical_edges:] = 0.0

    matcher = pymatching.Matching.from_check_matrix(H, weights=full_weights, faults_matrix=L)

    if dataset == "train":
        pred = matcher.decode_batch(syn_train)[:, 0]
        fails = int(np.sum(pred != obs_train))

        # PATCH 2: L2 Regularization (Occam's Razor)
        lambda_l2 = 1e-3
        reg = lambda_l2 * float(np.sum((weights_phys - initial_weight)**2))
        return fails + reg
    else:
        pred = matcher.decode_batch(syn_test)[:, 0]
        arr = (pred != obs_test)
        fails = int(np.sum(arr))
        if return_arr:
            return fails, arr
        return fails

# ------------------------------------------------------------------------------
print(f"\nEvaluating Naive Baseline (Uniform weight = {initial_weight:.3f})...")
base_fails_train = evaluate_cma(np.ones(num_physical_edges) * initial_weight, "train")
print(f" -> Baseline Train Rate: {int(base_fails_train)/trials:.5f}\n")

print(f"Igniting L2-Regularized CMA-ES across 144 dimensions...")
t0 = time.time()

# PATCH 3: CMA_diagonal & Popsize
options = {
    'bounds': [0.01, 10.0],
    'popsize': 28,
    'CMA_diagonal': 20,
    'verbose': -9
}
es = cma.CMAEvolutionStrategy(np.ones(num_physical_edges) * initial_weight, 0.5, options)

max_gens = 60

# MATCH CORES TO POPSIZE MAX
num_cores = min(os.cpu_count() or 1, 28)

with concurrent.futures.ThreadPoolExecutor(max_workers=num_cores) as executor:
    for gen in range(max_gens):
        solutions = es.ask()
        fitnesses = list(executor.map(lambda x: evaluate_cma(x, "train"), solutions))
        es.tell(solutions, fitnesses)

        best_fit = np.min(fitnesses)
        if (gen+1) % 5 == 0 or gen == 0:
            print(f" -> CMA-ES GEN {gen+1:02d}/{max_gens} | Best Train Fitness (Fails+L2): {best_fit:.2f}")

opt_weights = normalize_to_median(es.result.xbest, initial_weight)
elapsed = time.time() - t0

# ------------------------------------------------------------------------------
# HOLDOUT TEST & ORACLE COMPARISON
# ------------------------------------------------------------------------------
opt_fails_test, opt_arr = evaluate_cma(opt_weights, "test", return_arr=True)
base_fails_test, base_arr = evaluate_cma(np.ones(num_physical_edges) * initial_weight, "test", return_arr=True)

th_weights = np.zeros(num_physical_edges)
for i in range(num_physical_edges):
    p = true_probs[i]
    th_weights[i] = math.log((1 - p) / p) if p > 0 else 1.0

th_fails_test, th_arr = evaluate_cma(th_weights, "test", return_arr=True)

# McNemar: AI vs Baseline
n01_base = int(np.sum((base_arr == True) & (opt_arr == False)))
n10_base = int(np.sum((base_arr == False) & (opt_arr == True)))
chi2_base = (abs(n01_base - n10_base) - 1)**2 / (n01_base + n10_base) if (n01_base + n10_base) > 0 else 0
p_val_base = scipy.stats.chi2.sf(chi2_base, 1) if (n01_base + n10_base) > 0 else 1.0

# PATCH 4: McNemar SHADOWMAP() vs ORACLE
n01_ora = int(np.sum((th_arr == True) & (opt_arr == False)))
n10_ora = int(np.sum((th_arr == False) & (opt_arr == True)))
chi2_ora = (abs(n01_ora - n10_ora) - 1)**2 / (n01_ora + n10_ora) if (n01_ora + n10_ora) > 0 else 0
p_val_ora = scipy.stats.chi2.sf(chi2_ora, 1) if (n01_ora + n10_ora) > 0 else 1.0

print("\n" + "="*75)
print(f"V5 REGULARIZED OPTIMIZATION COMPLETE IN {elapsed:.2f}s")
print("="*75)
print(f"TEST Baseline (Uniform):  {base_fails_test/trials:.5f} ({base_fails_test}/{trials})")
print(f"TEST CMA-ES Optimized:    {opt_fails_test/trials:.5f} ({opt_fails_test}/{trials})")
print(f"TEST Log-Odds ORACLE:     {th_fails_test/trials:.5f} ({th_fails_test}/{trials})")
print("-" * 75)
print("McNemar (SHADOWMAP() vs Baseline):")
print(f"  AI saved: {n01_base:,} | AI broke: {n10_base:,} | p-value: {p_val_base:.2e}")
print("McNemar (AI vs ORACLE) - THE GAP:")
print(f"  Gap Size: {opt_fails_test - th_fails_test} extra failures vs Oracle")
print(f"  Oracle saved, AI failed: {n10_ora:,} | AI saved, Oracle failed: {n01_ora:,}")
print(f"  p-value: {p_val_ora:.2e}")
print("="*75)

print("\nSHADOWMAP() REGULARIZED ASCII RENDER:")
print("Phantom defects should be suppressed. The map should be surgically clean.\n")

h_grid = np.zeros((d, d-1))
v_grid = np.zeros((d-1, d))

idx = 0
for t, (r, c) in zip(edge_types[:num_physical_edges], edge_coords[:num_physical_edges]):
    if t == 'H': h_grid[r, c] = opt_weights[idx]
    elif t == 'V': v_grid[r, c] = opt_weights[idx]
    idx += 1

print("HORIZONTAL EDGE WEIGHTS (Expected Defect: Rows 0-3, Cols 5-7):")
for r in range(d):
    row_str = " ".join([f"*{w:4.1f}*" if w < 2.3 else f" {w:4.1f} " for w in h_grid[r]])
    print(f"Row {r}: {row_str}")

print("\nVERTICAL EDGE WEIGHTS (Expected Defect: Rows 0-3, Cols 5-8):")
for r in range(d-1):
    row_str = " ".join([f"*{w:4.1f}*" if w < 2.3 else f" {w:4.1f} " for w in v_grid[r]])
    print(f"Row {r}: {row_str}")
print("="*75)

V5 SHADOWMAP PHYSICS ENGINE: 144-D REGULARIZED CMA-ES (THE ORACLE CHASER)
Fabricating d=9 chip with exactly 144 independent physical edges...
 -> Injecting 'Dead Quadrant' (20% error) in the Top-Right corner...
Generating 80,000 CRN Training & Testing Trials...

Evaluating Naive Baseline (Uniform weight = 3.178)...
 -> Baseline Train Rate: 0.02701

Igniting L2-Regularized CMA-ES across 144 dimensions...
 -> CMA-ES GEN 01/60 | Best Train Fitness (Fails+L2): 2044.03
 -> CMA-ES GEN 05/60 | Best Train Fitness (Fails+L2): 1722.04
 -> CMA-ES GEN 10/60 | Best Train Fitness (Fails+L2): 1609.06
 -> CMA-ES GEN 15/60 | Best Train Fitness (Fails+L2): 1495.08
 -> CMA-ES GEN 20/60 | Best Train Fitness (Fails+L2): 1438.09
 -> CMA-ES GEN 25/60 | Best Train Fitness (Fails+L2): 1386.09
 -> CMA-ES GEN 30/60 | Best Train Fitness (Fails+L2): 1373.11
 -> CMA-ES GEN 35/60 | Best Train Fitness (Fails+L2): 1352.11
 -> CMA-ES GEN 40/60 | Best Train Fitness (Fails+L2): 1333.12
 -> CMA-ES GEN 45/60 | Best Train F

In [2]:
# ==============================================================================
# V6 PHYSICS ENGINE: LLM-ENSEMBLE EDITION (THE ORACLE CHASER)
# Architecture by Gemini | ML Regularization by ChatGPT 5.2 Pro
# ==============================================================================

import sys, subprocess, os, time, math
import numpy as np
import scipy.sparse as sp
import scipy.stats
import concurrent.futures

try:
    import cma
    import pymatching
except ImportError:
    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "cma", "pymatching", "scipy"], check=True)
    import cma
    import pymatching

print("="*75)
print("V6 PHYSICS ENGINE: 144-D LLM-ENSEMBLE REGULARIZED CMA-ES")
print("="*75)

d = 9
p_good = 0.04
p_bad = 0.20
trials = 80000

print(f"Fabricating d={d} chip with 144 independent physical edges...")
print(f" -> Injecting 'Dead Quadrant' (20% error) in Top-Right corner...")

H_rows, H_cols, L_cols, true_probs = [], [], [], []
edge_types, edge_coords = [], []
edge_idx = 0
j_cut = d // 2

for r in range(d):
    for c in range(d):
        u = r * d + c
        if c + 1 < d:  # Horizontal
            H_rows.extend([u, r * d + (c + 1)]); H_cols.extend([edge_idx, edge_idx])
            is_defect = (r <= 3) and (c >= 5)
            true_probs.append(p_bad if is_defect else p_good)
            edge_types.append('H'); edge_coords.append((r, c))
            if c == j_cut: L_cols.append(edge_idx)
            edge_idx += 1
        if r + 1 < d:  # Vertical
            H_rows.extend([u, (r + 1) * d + c]); H_cols.extend([edge_idx, edge_idx])
            is_defect = (r <= 3) and (c >= 5)
            true_probs.append(p_bad if is_defect else p_good)
            edge_types.append('V'); edge_coords.append((r, c))
            edge_idx += 1

num_physical_edges = edge_idx

# Virtual Boundaries
for r in range(d):
    for c in (0, d - 1):
        u = r * d + c
        H_rows.append(u); H_cols.append(edge_idx)
        true_probs.append(0.0)
        edge_types.append('B'); edge_coords.append((r, c))
        edge_idx += 1

num_total_edges = edge_idx

H = sp.csc_matrix(([1]*len(H_rows), (H_rows, H_cols)), shape=(d*d, num_total_edges), dtype=np.uint8)
L = sp.csc_matrix(([1]*len(L_cols), ([0]*len(L_cols), L_cols)), shape=(1, num_total_edges), dtype=np.uint8)
true_probs = np.array(true_probs)

print(f"Generating {trials:,} CRN Training & Testing Trials...")
H_csr, L_csr = H.tocsr(), L.tocsr()

def make_crn(seed):
    rng = np.random.default_rng(seed)
    noise = (rng.random((trials, num_total_edges)) < true_probs).astype(np.uint8)
    syn = np.asarray((H_csr @ noise.T).T % 2, dtype=np.uint8)
    obs = np.asarray((L_csr @ noise.T).T % 2, dtype=np.uint8)[:, 0]
    return syn, obs

syn_train, obs_train = make_crn(777)
syn_test,  obs_test  = make_crn(888)

# ------------------------------------------------------------------------------
# CHATGPT PATCHES: Plausible Bounds, Post-Clip, Rate-Based L2
# ------------------------------------------------------------------------------
initial_weight = math.log((1 - p_good) / p_good)
W_MIN = math.log((1 - 0.30) / 0.30)  # ~0.847
W_MAX = math.log((1 - 0.01) / 0.01)  # ~4.595
lambda_l2 = 3e-3

def normalize_to_median(w, target_median):
    w = np.maximum(0.01, np.asarray(w, dtype=np.float32))
    med = np.median(w)
    if med <= 0: return w
    return w * (target_median / med)

def evaluate_cma(weights_phys, dataset="train", return_arr=False):
    w = normalize_to_median(weights_phys, initial_weight)
    w = np.clip(w, W_MIN, W_MAX) # ChatGPT Patch: Safe Bounds!

    full_weights = np.zeros(num_total_edges, dtype=np.float32)
    full_weights[:num_physical_edges] = w
    full_weights[num_physical_edges:] = 0.0

    matcher = pymatching.Matching.from_check_matrix(H, weights=full_weights, faults_matrix=L)

    if dataset == "train":
        pred = matcher.decode_batch(syn_train)[:, 0]
        fails = int(np.sum(pred != obs_train))
        fail_rate = fails / trials
        # ChatGPT Patch: Regularize on RATE, not COUNT.
        reg = lambda_l2 * float(np.mean((w - initial_weight)**2))
        return float(fail_rate + reg)

    pred = matcher.decode_batch(syn_test)[:, 0]
    arr = (pred != obs_test)
    fails = int(np.sum(arr))
    if return_arr:
        return fails, arr
    return fails

# ------------------------------------------------------------------------------
print(f"\nEvaluating Naive Baseline (Uniform weight = {initial_weight:.3f})...")
base_fails_train = evaluate_cma(np.ones(num_physical_edges) * initial_weight, "train")
print(f" -> Baseline Train Rate (with L2): {base_fails_train:.5f}\n")

print(f"Igniting Rate-Regularized CMA-ES across 144 dimensions...")
t0 = time.time()

options = {
    'bounds': [W_MIN, W_MAX],
    'popsize': 28,
    'CMA_diagonal': 20,
    'verbose': -9
}
# Adjusted initial sigma to 0.3 to match the tighter physical bounds
es = cma.CMAEvolutionStrategy(np.ones(num_physical_edges) * initial_weight, 0.3, options)

max_gens = 60
num_cores = min(os.cpu_count() or 1, 28)

with concurrent.futures.ThreadPoolExecutor(max_workers=num_cores) as executor:
    for gen in range(max_gens):
        solutions = es.ask()
        fitnesses = list(executor.map(lambda x: evaluate_cma(x, "train"), solutions))
        es.tell(solutions, fitnesses)

        best_fit = np.min(fitnesses)
        if (gen+1) % 5 == 0 or gen == 0:
            print(f" -> CMA-ES GEN {gen+1:02d}/{max_gens} | Best Train Fitness (Rate+L2): {best_fit:.5f}")

opt_weights = normalize_to_median(es.result.xbest, initial_weight)
opt_weights = np.clip(opt_weights, W_MIN, W_MAX)
elapsed = time.time() - t0

# ------------------------------------------------------------------------------
# HOLDOUT TEST & ORACLE COMPARISON
# ------------------------------------------------------------------------------
opt_fails_test, opt_arr = evaluate_cma(opt_weights, "test", return_arr=True)
base_fails_test, base_arr = evaluate_cma(np.ones(num_physical_edges) * initial_weight, "test", return_arr=True)

th_weights = np.zeros(num_physical_edges)
for i in range(num_physical_edges):
    p = true_probs[i]
    th_weights[i] = math.log((1 - p) / p) if p > 0 else 1.0

th_fails_test, th_arr = evaluate_cma(th_weights, "test", return_arr=True)

n01_base = int(np.sum((base_arr == True) & (opt_arr == False)))
n10_base = int(np.sum((base_arr == False) & (opt_arr == True)))
p_val_base = scipy.stats.chi2.sf((abs(n01_base - n10_base) - 1)**2 / (n01_base + n10_base), 1) if (n01_base + n10_base) > 0 else 1.0

n01_ora = int(np.sum((th_arr == True) & (opt_arr == False)))
n10_ora = int(np.sum((th_arr == False) & (opt_arr == True)))
p_val_ora = scipy.stats.chi2.sf((abs(n01_ora - n10_ora) - 1)**2 / (n01_ora + n10_ora), 1) if (n01_ora + n10_ora) > 0 else 1.0

print("\n" + "="*75)
print(f"V6 LLM-ENSEMBLE OPTIMIZATION COMPLETE IN {elapsed:.2f}s")
print("="*75)
print(f"TEST Baseline (Uniform):  {base_fails_test/trials:.5f} ({base_fails_test}/{trials})")
print(f"TEST CMA-ES Optimized:    {opt_fails_test/trials:.5f} ({opt_fails_test}/{trials})")
print(f"TEST Log-Odds ORACLE:     {th_fails_test/trials:.5f} ({th_fails_test}/{trials})")
print("-" * 75)
print("McNemar (AI vs ORACLE) - THE GAP:")
print(f"  Gap Size: {opt_fails_test - th_fails_test} extra failures vs Oracle")
print(f"  Oracle saved, AI failed: {n10_ora:,} | AI saved, Oracle failed: {n01_ora:,}")
print(f"  p-value: {p_val_ora:.2e}")
print("="*75)

# ------------------------------------------------------------------------------
# MAP QUALITY DIAGNOSTICS (AUC)
# ------------------------------------------------------------------------------
is_defect = np.array([(r <= 3) and (c >= 5) for t, (r, c) in zip(edge_types[:num_physical_edges], edge_coords[:num_physical_edges])], dtype=bool)

w_def = opt_weights[is_defect]
w_ok  = opt_weights[~is_defect]

scores = -opt_weights
labels = is_defect.astype(int)
order = np.argsort(scores)
ranks = np.empty_like(order)
ranks[order] = np.arange(len(scores)) + 1
n_pos = labels.sum()
n_neg = len(labels) - n_pos
auc = (ranks[labels == 1].sum() - n_pos*(n_pos+1)/2) / (n_pos*n_neg) if n_pos*n_neg>0 else float('nan')

print("DEFECT DETECTION AUDIT (AUC & SEPARATION)")
print(f"   Mean Weight   -> Defect Zone: {float(w_def.mean()):.4f} | Healthy Zone: {float(w_ok.mean()):.4f}")
print(f"   AUC Score     -> {float(auc):.4f} (1.0 is perfect separation)")
print("="*75)

print("\nSHADOWMAP() V6 ASCII RENDER:")
h_grid, v_grid = np.zeros((d, d-1)), np.zeros((d-1, d))
idx = 0
for t, (r, c) in zip(edge_types[:num_physical_edges], edge_coords[:num_physical_edges]):
    if t == 'H': h_grid[r, c] = opt_weights[idx]
    elif t == 'V': v_grid[r, c] = opt_weights[idx]
    idx += 1

print("HORIZONTAL EDGE WEIGHTS (Expected Defect: Rows 0-3, Cols 5-7):")
for r in range(d):
    row_str = " ".join([f"*{w:4.1f}*" if w < 2.3 else f" {w:4.1f} " for w in h_grid[r]])
    print(f"Row {r}: {row_str}")

print("\nVERTICAL EDGE WEIGHTS (Expected Defect: Rows 0-3, Cols 5-8):")
for r in range(d-1):
    row_str = " ".join([f"*{w:4.1f}*" if w < 2.3 else f" {w:4.1f} " for w in v_grid[r]])
    print(f"Row {r}: {row_str}")
print("="*75)

V6 PHYSICS ENGINE: 144-D LLM-ENSEMBLE REGULARIZED CMA-ES
Fabricating d=9 chip with 144 independent physical edges...
 -> Injecting 'Dead Quadrant' (20% error) in Top-Right corner...
Generating 80,000 CRN Training & Testing Trials...

Evaluating Naive Baseline (Uniform weight = 3.178)...
 -> Baseline Train Rate (with L2): 0.02701

Igniting Rate-Regularized CMA-ES across 144 dimensions...
 -> CMA-ES GEN 01/60 | Best Train Fitness (Rate+L2): 0.02503
 -> CMA-ES GEN 05/60 | Best Train Fitness (Rate+L2): 0.02246
 -> CMA-ES GEN 10/60 | Best Train Fitness (Rate+L2): 0.02055
 -> CMA-ES GEN 15/60 | Best Train Fitness (Rate+L2): 0.01945
 -> CMA-ES GEN 20/60 | Best Train Fitness (Rate+L2): 0.01872
 -> CMA-ES GEN 25/60 | Best Train Fitness (Rate+L2): 0.01811
 -> CMA-ES GEN 30/60 | Best Train Fitness (Rate+L2): 0.01770
 -> CMA-ES GEN 35/60 | Best Train Fitness (Rate+L2): 0.01783
 -> CMA-ES GEN 40/60 | Best Train Fitness (Rate+L2): 0.01790
 -> CMA-ES GEN 45/60 | Best Train Fitness (Rate+L2): 0.01777


In [3]:
import numpy as np, math
import pymatching

# log-odds weight for the healthy region
w_good = math.log((1-p_good)/p_good)

def normalize_to_median(w, target):
    w = np.maximum(0.01, np.asarray(w, dtype=np.float32))
    med = float(np.median(w))
    return w if med <= 0 else w * (target / med)

def eval_phys_weights(phys_w, syn, obs):
    full = np.zeros(num_total_edges, dtype=np.float32)
    full[:num_physical_edges] = phys_w
    full[num_physical_edges:] = 0.0
    m = pymatching.Matching.from_check_matrix(H, weights=full, faults_matrix=L)
    pred = m.decode_batch(syn)[:,0]
    arr = (pred != obs)
    return int(arr.sum()), arr

# Start from your learned weights, scale-lock to w_good
w = normalize_to_median(opt_weights, w_good)

# Use a subsample for fast model selection
subN = 10000
syn_sub = syn_train[:subN]
obs_sub = obs_train[:subN]

# Candidate thresholds: choose by quantiles of learned weights
qs = np.linspace(0.05, 0.30, 14)  # defect fraction candidates
thr_vals = np.quantile(w, qs)

# Candidate defect weights: search a plausible range (centered around the true regime)
# If you know p_bad ~0.2, w_bad~1.386, but we keep it generic.
w_bad_grid = np.linspace(0.85, 2.4, 16)

best = None
for thr in thr_vals:
    mask = (w < thr)
    for w_bad in w_bad_grid:
        w_snap = np.where(mask, w_bad, w_good).astype(np.float32)
        k_sub, _ = eval_phys_weights(w_snap, syn_sub, obs_sub)
        rate_sub = k_sub / subN
        if best is None or rate_sub < best[0]:
            best = (rate_sub, float(thr), float(w_bad), int(mask.sum()))

print("Best snap on subsample:")
print("  rate_sub =", best[0], "thr =", best[1], "w_bad =", best[2], "defect_edges =", best[3])

# Evaluate best snap on full train + test
_, thr, w_bad, _ = best
mask = (w < thr)
w_snap = np.where(mask, w_bad, w_good).astype(np.float32)

k_train, train_arr = eval_phys_weights(w_snap, syn_train, obs_train)
k_test,  test_arr  = eval_phys_weights(w_snap, syn_test,  obs_test)

print("\nSNAP RESULTS (two-level weights):")
print(f"  Train rate: {k_train/len(obs_train):.5f} ({k_train}/{len(obs_train)})")
print(f"  Test  rate: {k_test/len(obs_test):.5f} ({k_test}/{len(obs_test)})")
print(f"  Learned w_bad: {w_bad:.3f}  (w_good: {w_good:.3f})")
print(f"  Flagged defect edges: {mask.sum()} / {num_physical_edges}")


Best snap on subsample:
  rate_sub = 0.0152 thr = 2.2339259028434753 w_bad = 0.85 defect_edges = 19

SNAP RESULTS (two-level weights):
  Train rate: 0.01649 (1319/80000)
  Test  rate: 0.01692 (1354/80000)
  Learned w_bad: 0.850  (w_good: 3.178)
  Flagged defect edges: 19 / 144


In [4]:
# ==============================================================================
# V6.1 CELL — THE CLASSIFIER SNAP (Bridging the Final Oracle Gap)
# Architecture by ChatGPT 5.2 Pro
# ==============================================================================
import numpy as np
import math
import scipy.stats
import pymatching

print("="*75)
print("V6.1: SNAP-TO-TWO-LEVEL POST-PROCESSING")
print("="*75)

# log-odds weight for the healthy region
w_good = math.log((1 - p_good) / p_good)

def normalize_to_median(w, target):
    w = np.maximum(0.01, np.asarray(w, dtype=np.float32))
    med = float(np.median(w))
    return w if med <= 0 else w * (target / med)

def eval_phys_weights(phys_w, syn, obs, return_arr=False):
    full = np.zeros(num_total_edges, dtype=np.float32)
    full[:num_physical_edges] = phys_w
    full[num_physical_edges:] = 0.0
    m = pymatching.Matching.from_check_matrix(H, weights=full, faults_matrix=L)
    pred = m.decode_batch(syn)[:,0]
    arr = (pred != obs)
    if return_arr:
        return int(arr.sum()), arr
    return int(arr.sum())

# Start from your learned weights, scale-lock to w_good
w = normalize_to_median(opt_weights, w_good)

# Use a subsample for fast model selection (10,000 trials)
subN = 10000
syn_sub = syn_train[:subN]
obs_sub = obs_train[:subN]

print("Executing 2D Grid Search on Subsample (Threshold vs w_bad)...")

# Candidate thresholds: choose by quantiles of learned weights
qs = np.linspace(0.05, 0.30, 14)
thr_vals = np.quantile(w, qs)

# Candidate defect weights: search a plausible range
w_bad_grid = np.linspace(0.85, 2.4, 16)

best = None
for thr in thr_vals:
    mask = (w < thr)
    for w_bad in w_bad_grid:
        w_snap = np.where(mask, w_bad, w_good).astype(np.float32)
        k_sub = eval_phys_weights(w_snap, syn_sub, obs_sub)
        rate_sub = k_sub / subN
        if best is None or rate_sub < best[0]:
            best = (rate_sub, float(thr), float(w_bad), int(mask.sum()))

print(f" -> Best Subsample Rate: {best[0]:.5f}")
print(f" -> Optimal Threshold:   w < {best[1]:.3f}")
print(f" -> Optimal w_bad:       {best[2]:.3f}")
print(f" -> Atoms Flagged Dead:  {best[3]} / {num_physical_edges}\n")

# Evaluate best snap on full train + test
_, thr, w_bad, flagged_count = best
mask = (w < thr)
w_snap = np.where(mask, w_bad, w_good).astype(np.float32)

print(f"Deploying Snapped Weights to full {trials:,} Testing set...")
k_train = eval_phys_weights(w_snap, syn_train, obs_train)
k_test, test_arr_snap = eval_phys_weights(w_snap, syn_test, obs_test, return_arr=True)

# ------------------------------------------------------------------------------
# THE FINAL MCNEMAR VS ORACLE
# ------------------------------------------------------------------------------
# We still have th_arr and th_fails_test in memory from V6!
n01_ora = int(np.sum((th_arr == True) & (test_arr_snap == False)))
n10_ora = int(np.sum((th_arr == False) & (test_arr_snap == True)))
chi2_ora = (abs(n01_ora - n10_ora) - 1)**2 / (n01_ora + n10_ora) if (n01_ora + n10_ora) > 0 else 0
p_val_ora = scipy.stats.chi2.sf(chi2_ora, 1) if (n01_ora + n10_ora) > 0 else 1.0

print("\n" + "="*75)
print("SNAP RESULTS (TWO-LEVEL WEIGHTS):")
print("="*75)
print(f"  Train rate:            {k_train/trials:.5f} ({k_train}/{trials})")
print(f"  Test rate (SNAPPED):   {k_test/trials:.5f} ({k_test}/{trials})")
print(f"  Test rate (ORACLE):    {th_fails_test/trials:.5f} ({th_fails_test}/{trials})")
print("-" * 75)
print(f"  Learned w_bad:         {w_bad:.3f} (Theoretical Truth: 1.386)")
print(f"  Learned w_good:        {w_good:.3f} (Theoretical Truth: 3.178)")
print(f"  Flagged defect edges:  {flagged_count} / {num_physical_edges} (True defect count is 36)")
print("-" * 75)
print("FINAL MCNEMAR (SNAPPED AI vs ORACLE) - THE GAP:")
print(f"  Gap Size: {k_test - th_fails_test} extra failures vs Oracle")
print(f"  Oracle saved, AI failed: {n10_ora:,} | AI saved, Oracle failed: {n01_ora:,}")
print(f"  p-value: {p_val_ora:.2e}")
print("="*75)

V6.1: SNAP-TO-TWO-LEVEL POST-PROCESSING
Executing 2D Grid Search on Subsample (Threshold vs w_bad)...
 -> Best Subsample Rate: 0.01520
 -> Optimal Threshold:   w < 2.234
 -> Optimal w_bad:       0.850
 -> Atoms Flagged Dead:  19 / 144

Deploying Snapped Weights to full 80,000 Testing set...

SNAP RESULTS (TWO-LEVEL WEIGHTS):
  Train rate:            0.01649 (1319/80000)
  Test rate (SNAPPED):   0.01692 (1354/80000)
  Test rate (ORACLE):    0.01574 (1259/80000)
---------------------------------------------------------------------------
  Learned w_bad:         0.850 (Theoretical Truth: 1.386)
  Learned w_good:        3.178 (Theoretical Truth: 3.178)
  Flagged defect edges:  19 / 144 (True defect count is 36)
---------------------------------------------------------------------------
FINAL MCNEMAR (SNAPPED AI vs ORACLE) - THE GAP:
  Gap Size: 95 extra failures vs Oracle
  Oracle saved, AI failed: 291 | AI saved, Oracle failed: 196
  p-value: 2.05e-05
