In [1]:
# ============================= PATH SETUP =============================
# ROOT is selected automatically so the notebook survives "Restart & Run All":
#   * on Google Colab the repository is expected inside the mounted Drive;
#   * anywhere else the local GitHub clone (relative to the working dir) is used.
# ------------------------------------------------------------------------------

from pathlib import Path
import sys

try:
    # Only importable inside a Google Colab runtime.
    from google.colab import drive
except ImportError:
    # --------------------------------------------------------------------------
    # OPTION B — Local machine (GitHub clone, e.g. after `git clone`)
    # --------------------------------------------------------------------------
    ROOT = Path("mbrl-testing-frameworks-empirical-study")
else:
    # --------------------------------------------------------------------------
    # OPTION A — Google Colab (recommended if reproducing from the paper)
    # --------------------------------------------------------------------------
    drive.mount("/content/drive")
    ROOT = Path("/content/drive/MyDrive/mbrl-testing-frameworks-empirical-study")

# ------------------------------------------------------------------------------
# Shared paths (DO NOT EDIT BELOW)
# ------------------------------------------------------------------------------

AGENT_ROOT  = ROOT / "agents" / "dynaq" / "frozenlake"
RESULTS_DIR = ROOT / "results" / "frozenlake"
TABLES_DIR  = RESULTS_DIR / "tables"
FIGS_DIR    = RESULTS_DIR / "figs"
RAW_DIR     = RESULTS_DIR / "raw_single"

# Create every output directory up front so later cells can write unconditionally.
for _out_dir in (RESULTS_DIR, TABLES_DIR, FIGS_DIR, RAW_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Make the agent folder importable (idempotent when the cell is re-run).
if str(AGENT_ROOT) not in sys.path:
    sys.path.insert(0, str(AGENT_ROOT))

print("AGENT_ROOT :", AGENT_ROOT)
print("RESULTS_DIR:", RESULTS_DIR)


Mounted at /content/drive
AGENT_ROOT : /content/drive/MyDrive/mbrl-testing-frameworks-empirical-study/agents/dynaq/frozenlake
RESULTS_DIR: /content/drive/MyDrive/mbrl-testing-frameworks-empirical-study/results/frozenlake


In [2]:
import os
import time
import json
import math
import random
import pickle
from collections import defaultdict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
import os, random
from time import perf_counter
from collections import deque
import pandas as pd
import gymnasium as gym

# ---------- Env & globals ----------
# Environment id and constructor keyword arguments passed to gym.make().
ENV_ID  = "FrozenLake-v1"
ENV_KW  = dict(map_name="8x8", is_slippery=False)
# Default per-episode step cap (default of run_episode's `max_steps`).
HORIZON = 200
# Number of episodes most framework runners execute.
EPISODE_BUDGET = 1500
# Base seed; runners add the episode index to it.
SEED    = 2025

# Probe environment used only to read the space sizes and the map layout.
env0 = gym.make(ENV_ID, **ENV_KW)
N_STATES  = env0.observation_space.n
N_ACTIONS = env0.action_space.n

# Map as an array of single characters; cells equal to "H" / "G" are collected
# below as (row, col) tuples. The literal 8 matches map_name="8x8".
DESC = np.array(env0.unwrapped.desc, dtype="U1")
HOLES = [(r,c) for r in range(8) for c in range(8) if DESC[r,c]=="H"]
GOAL  = next((r,c) for r in range(8) for c in range(8) if DESC[r,c]=="G")

# (d_row, d_col) offsets used for grid neighbourhood walks. run_MDPMorph_single
# passes the list index to env.step(), so the order presumably matches the
# environment's action ids — TODO confirm against the FrozenLake documentation.
ACTIONS = [(0,-1),(1,0),(0,1),(-1,0)]

def make_env(seed=None):
    """Build a fresh environment from ENV_ID / ENV_KW.

    When `seed` is given the environment is reset once with that seed before
    it is returned; otherwise it is returned without being reset.
    """
    env = gym.make(ENV_ID, **ENV_KW)
    if seed is None:
        return env
    env.reset(seed=seed)
    return env

# ---------- BD ----------
def bd_tuple(s: int):
    """Return `(d_goal, d_hole)` for state index `s`.

    The state is decoded as (row, col) on a grid with 8 columns. `d_goal` is
    the Manhattan distance to GOAL and `d_hole` the Manhattan distance to the
    nearest cell listed in HOLES.
    """
    row, col = divmod(int(s), 8)
    d_goal = abs(row - GOAL[0]) + abs(col - GOAL[1])
    hole_distances = (abs(row - h_row) + abs(col - h_col) for h_row, h_col in HOLES)
    return (d_goal, min(hole_distances))

# ---------- Load trained Dyna-Q ----------
# The Q-table is indexed as Q[state] -> row of action values (see run_episode).
AGENT_Q_PATH = os.path.join(AGENT_ROOT, "dynaq_Q_frozenlake.npy")
assert os.path.exists(AGENT_Q_PATH), "Missing trained Dyna-Q Q-table"
Q = np.load(AGENT_Q_PATH)

# ---------- Shortest-path oracle ----------
# Breadth-first search outwards from the goal cell over 4-neighbour moves.
# DIST[s] ends up as the number of moves between state s and the goal when
# walking only through non-hole cells; holes are never entered, so they (and
# any cell that cannot be reached) keep the initial value of +inf.
DIST = np.full(N_STATES, np.inf)
goal_idx = GOAL[0]*8 + GOAL[1]
DIST[goal_idx] = 0
q = deque([goal_idx])
while q:
    s = q.popleft()
    r,c = divmod(s,8)
    for dr,dc in ACTIONS:
        rr,cc = r+dr,c+dc
        # Stay on the 8x8 board and skip hole cells.
        if 0<=rr<8 and 0<=cc<8 and DESC[rr,cc]!="H":
            sp = rr*8+cc
            # Relax the neighbour only when this route is shorter.
            if DIST[sp] > DIST[s]+1:
                DIST[sp] = DIST[s]+1
                q.append(sp)

# Extra steps run_episode allows on top of DIST[start] before an episode is
# counted as a failure.
TAU_SLACK = 2

def run_episode(env_obj, Q_mat, max_steps=HORIZON,
                epsilon=0.0, softmax_tau=None, logit_noise=None,
                start_state=None):
    """Roll out one episode driven by the Q-table and classify it.

    Parameters
    ----------
    env_obj : environment exposing reset() / step() / action_space.
    Q_mat : array indexed as ``Q_mat[state]`` -> action values.
    max_steps : hard cap on the number of steps.
    epsilon : probability of taking a uniformly sampled action instead.
    softmax_tau : if truthy, sample the action from softmax(q / softmax_tau).
    logit_noise : if truthy, std-dev of Gaussian noise added to the action
        values before the action is selected.
    start_state : optional state index to start from. The environment is
        always reset first, so a state a caller assigned to
        ``env.unwrapped.s`` beforehand does not survive; pass it here to have
        it applied *after* the reset. ``None`` keeps the reset's own state.

    Returns
    -------
    (total_reward, steps, failure, illegal)
        ``failure`` is True unless the return is positive and was obtained
        within ``DIST[start] + TAU_SLACK`` steps (capped at ``max_steps``).
        ``illegal`` is never set by this function and is always False.
    """
    s0, _ = env_obj.reset()
    if start_state is not None:
        # Re-position after the reset so the requested start is actually used.
        env_obj.unwrapped.s = int(start_state)
        s0 = start_state
    s0 = int(s0)
    s  = s0

    # Step allowance: shortest-path length plus slack; starts with no finite
    # distance to the goal fall back to the plain horizon.
    base_d = DIST[s0]
    if not np.isfinite(base_d):
        tau = max_steps
    else:
        tau = min(max_steps, int(base_d) + TAU_SLACK)

    total = 0.0
    steps = 0
    term = False
    trunc = False
    illegal = False

    for t in range(max_steps):
        qrow = Q_mat[int(s)].copy()

        if epsilon and (np.random.rand() < epsilon):
            a = env_obj.action_space.sample()
        else:
            if logit_noise:
                qrow = qrow + np.random.normal(0, logit_noise, size=qrow.shape)
            if softmax_tau:
                # Subtract the max before exponentiating for numerical stability.
                z = qrow - qrow.max()
                p = np.exp(z / softmax_tau)
                p /= p.sum()
                a = int(np.random.choice(len(qrow), p=p))
            else:
                a = int(np.argmax(qrow))

        s2, r, term, trunc, _ = env_obj.step(a)
        s2 = int(s2)

        total += r
        steps = t + 1
        s = s2

        if term or trunc:
            break

    success = (total > 0.0) and (steps <= tau)
    # Exceeding tau already makes `success` False, so no separate timeout term
    # is needed.
    failure = not success

    return float(total), int(steps), bool(failure), bool(illegal)


# ---------- Metrics ----------
def metrics_from_rows(df):
    """Compute (FR, APFD, APFD_t, TTF) from per-episode rows.

    `df` must have a `failure` column (castable to int) and a `steps` column
    (castable to float), ordered by execution.

    * FR     – mean of the failure flags.
    * APFD   – trapezoidal area under the cumulative-failure curve over the
               episode index, divided by (#failures * #episodes).
    * APFD_t – same area taken over cumulative steps, divided by
               (#failures * total steps).
    * TTF    – 1-based index of the first failing episode.

    APFD, APFD_t and TTF are NaN when no episode failed; all four values are
    NaN for an empty frame.
    """
    fails = df["failure"].astype(int).to_numpy()
    steps = df["steps"].astype(float).to_numpy()
    n = len(fails)
    if n == 0:
        return np.nan, np.nan, np.nan, np.nan

    # np.trapezoid only exists in NumPy >= 2.0; older releases call it np.trapz.
    trapezoid = getattr(np, "trapezoid", None) or np.trapz

    F = fails.sum()
    FR = fails.mean()
    if F>0:
        cum = np.cumsum(fails)
        APFD = trapezoid(cum, x=np.arange(1,n+1))/(F*n)
        T = np.cumsum(steps)
        APFD_t = trapezoid(cum, x=T)/(F*T[-1])
        TTF = int(np.argmax(fails==1)+1)
    else:
        APFD=APFD_t=TTF=np.nan
    return FR, APFD, APFD_t, TTF

def get_metrics_summary(rows_df):
    """Return the metrics of `rows_df` as a dictionary.

    Calls metrics_from_rows() and normalises its result:

    * a dict is returned unchanged;
    * a 2-tuple whose second item is a dict yields that dict;
    * any other tuple containing a dict yields the first dict found;
    * a plain 4-tuple ``(FR, APFD, APFD_t, TTF)`` — what metrics_from_rows()
      returns in this notebook — is mapped onto the keys
      ``FR``, ``APFD``, ``APFD_time`` and ``TTF``.

    Raises
    ------
    RuntimeError
        If the result matches none of the shapes above.
    """
    out = metrics_from_rows(rows_df)

    if isinstance(out, dict):
        return out

    if isinstance(out, tuple):
        if len(out) == 2 and isinstance(out[1], dict):
            return out[1]

        for item in out:
            if isinstance(item, dict):
                return item

        # Same key names the framework runners use in their result rows.
        if len(out) == 4:
            return dict(zip(("FR", "APFD", "APFD_time", "TTF"), out))

    raise RuntimeError(
        f"metrics_from_rows returned an unsupported result of type {type(out)}"
    )


In [4]:
def _save_raw(framework, rows, fname):
    """Write `rows` as CSV to RAW_DIR/<framework>/<fname> and return the path.

    The framework sub-directory is created on demand; the CSV is written
    without the index column.
    """
    target_dir = os.path.join(RAW_DIR, framework)
    os.makedirs(target_dir, exist_ok=True)
    csv_path = os.path.join(target_dir, fname)
    frame = pd.DataFrame(rows)
    frame.to_csv(csv_path, index=False)
    return csv_path

# ===== STATE SAMPLING HELPER=====
def sample_pool(n=5000, seed_base=10000):
    """Return `n` candidate start states as an integer array.

    When N_STATES is an integer (a Python int or a NumPy integer scalar such
    as the one a Discrete space reports), states are drawn uniformly from
    ``[0, N_STATES)`` with a generator seeded by `seed_base`. Otherwise the
    states are collected by resetting an environment with seeds
    ``seed_base, seed_base + 1, ...``.
    """
    rng = np.random.default_rng(seed_base)

    # NumPy integer scalars are not instances of `int`, so both are accepted;
    # checking `int` alone sends an np.int64 state count down the fallback.
    if isinstance(N_STATES, (int, np.integer)):
        return rng.integers(0, N_STATES, size=n, endpoint=False)

    env = make_env()
    states = []
    for i in range(n):
        s, _ = env.reset(seed=seed_base + i)
        states.append(int(s))
    return np.array(states, dtype=int)

In [5]:
# ===== Framework runners =====

def run_AdaStop_single():
    """Runner reported under the "AdaStop" label.

    Runs EPISODE_BUDGET episodes of the trained Q-table, one fresh
    environment per episode seeded with SEED + episode index, and summarises
    the failure/step rows with metrics_from_rows().
    """
    records = []
    for offset in range(EPISODE_BUDGET):
        episode_env = make_env(SEED + offset)
        _ret, n_steps, failed, _illegal = run_episode(episode_env, Q)
        records.append({"failure": failed, "steps": n_steps})

    fr, apfd, apfd_time, ttf = metrics_from_rows(pd.DataFrame(records))
    return {
        "framework": "AdaStop",
        "FR": fr,
        "APFD": apfd,
        "APFD_time": apfd_time,
        "TTF": ttf,
        "budget": len(records),
    }

def run_CUSUMRL_single():
    """Runner reported under the "CUSUM-RL" label.

    Runs EPISODE_BUDGET episodes of the trained Q-table, one fresh
    environment per episode seeded with SEED + episode index, and summarises
    the failure/step rows with metrics_from_rows().
    """
    records = []
    for offset in range(EPISODE_BUDGET):
        episode_env = make_env(SEED + offset)
        _ret, n_steps, failed, _illegal = run_episode(episode_env, Q)
        records.append({"failure": failed, "steps": n_steps})

    fr, apfd, apfd_time, ttf = metrics_from_rows(pd.DataFrame(records))
    return {
        "framework": "CUSUM-RL",
        "FR": fr,
        "APFD": apfd,
        "APFD_time": apfd_time,
        "TTF": ttf,
        "budget": len(records),
    }

def run_DLBT_single():
    """Runner reported under the "DLBT" label.

    Same rollout as the other baseline runners, but with a fixed count of 750
    episodes instead of EPISODE_BUDGET; the reported `budget` is that count.
    """
    records = []
    for offset in range(750):
        episode_env = make_env(SEED + offset)
        _ret, n_steps, failed, _illegal = run_episode(episode_env, Q)
        records.append({"failure": failed, "steps": n_steps})

    fr, apfd, apfd_time, ttf = metrics_from_rows(pd.DataFrame(records))
    return {
        "framework": "DLBT",
        "FR": fr,
        "APFD": apfd,
        "APFD_time": apfd_time,
        "TTF": ttf,
        "budget": len(records),
    }

In [6]:
def run_FOracle_single():
    """Runner reported under the "F-Oracle" label.

    Runs EPISODE_BUDGET episodes of the trained Q-table, one fresh
    environment per episode seeded with SEED + episode index, and summarises
    the failure/step rows with metrics_from_rows().
    """
    records = []
    for offset in range(EPISODE_BUDGET):
        episode_env = make_env(SEED + offset)
        _ret, n_steps, failed, _illegal = run_episode(episode_env, Q)
        records.append({"failure": failed, "steps": n_steps})

    fr, apfd, apfd_time, ttf = metrics_from_rows(pd.DataFrame(records))
    return {
        "framework": "F-Oracle",
        "FR": fr,
        "APFD": apfd,
        "APFD_time": apfd_time,
        "TTF": ttf,
        "budget": len(records),
    }

In [7]:
def run_QDTesting_single():
    """Runner reported under the "QD-Testing" label.

    Cycles through every state index in order (round-robin) as the intended
    start state of EPISODE_BUDGET episodes and summarises the rows with
    metrics_from_rows().
    """
    candidate_states = np.arange(N_STATES)
    n_candidates = len(candidate_states)
    records = []
    for ep in range(EPISODE_BUDGET):
        start_state = int(candidate_states[ep % n_candidates])
        env = make_env(SEED + ep)
        env.reset(seed=SEED + ep)
        # NOTE(review): run_episode() begins with env_obj.reset(), which
        # presumably re-initialises the state assigned here — verify that the
        # intended start state is really the one the episode runs from.
        env.unwrapped.s = start_state
        _ret, n_steps, failed, _illegal = run_episode(env, Q)
        records.append({"failure": failed, "steps": n_steps})

    fr, apfd, apfd_time, ttf = metrics_from_rows(pd.DataFrame(records))
    return {
        "framework": "QD-Testing",
        "FR": fr,
        "APFD": apfd,
        "APFD_time": apfd_time,
        "TTF": ttf,
        "budget": len(records),
    }

In [8]:
def run_STARLA_single():
    """Runner reported under the "STARLA" label.

    Cycles through every state index in order (round-robin) as the intended
    start state of EPISODE_BUDGET episodes and summarises the rows with
    metrics_from_rows().
    """
    population = list(range(N_STATES))
    population_size = len(population)
    records = []
    for ep in range(EPISODE_BUDGET):
        start_state = int(population[ep % population_size])
        env = make_env(SEED + ep)
        env.reset(seed=SEED + ep)
        # NOTE(review): run_episode() begins with env_obj.reset(), which
        # presumably re-initialises the state assigned here — verify that the
        # intended start state is really the one the episode runs from.
        env.unwrapped.s = start_state
        _ret, n_steps, failed, _illegal = run_episode(env, Q)
        records.append({"failure": failed, "steps": n_steps})

    fr, apfd, apfd_time, ttf = metrics_from_rows(pd.DataFrame(records))
    return {
        "framework": "STARLA",
        "FR": fr,
        "APFD": apfd,
        "APFD_time": apfd_time,
        "TTF": ttf,
        "budget": len(records),
    }

In [9]:
def run_Indago_single():
    """Runner reported under the "Indago" label.

    Picks the intended start state of episode i as state ``(i * 7) mod
    N_STATES`` (a stride-7 walk over the state indices), runs EPISODE_BUDGET
    episodes and summarises the rows with metrics_from_rows().
    """
    candidate_states = np.arange(N_STATES)
    n_candidates = len(candidate_states)
    records = []
    for ep in range(EPISODE_BUDGET):
        start_state = int(candidate_states[(ep * 7) % n_candidates])
        env = make_env(SEED + ep)
        env.reset(seed=SEED + ep)
        # NOTE(review): run_episode() begins with env_obj.reset(), which
        # presumably re-initialises the state assigned here — verify that the
        # intended start state is really the one the episode runs from.
        env.unwrapped.s = start_state
        _ret, n_steps, failed, _illegal = run_episode(env, Q)
        records.append({"failure": failed, "steps": n_steps})

    fr, apfd, apfd_time, ttf = metrics_from_rows(pd.DataFrame(records))
    return {
        "framework": "Indago",
        "FR": fr,
        "APFD": apfd,
        "APFD_time": apfd_time,
        "TTF": ttf,
        "budget": len(records),
    }

In [10]:
def run_NRRL_single():
    """Runner reported under the "NR-RL" label.

    For each of EPISODE_BUDGET episodes the environment is first driven by a
    perturbed policy (worst action with probability 0.3, greedy otherwise)
    for up to HORIZON steps or until the episode ends; afterwards the
    unperturbed policy is evaluated on the same environment object with
    run_episode() and that result is recorded.
    """
    records = []
    for ep in range(EPISODE_BUDGET):
        env = make_env(SEED + ep)
        obs, _ = env.reset(seed=SEED + ep)
        state = int(obs)

        # Perturbed prefix rollout.
        for _step in range(HORIZON):
            action_values = Q[state].copy()
            if np.random.rand() < 0.3:
                action = np.argmin(action_values)
            else:
                action = np.argmax(action_values)
            next_obs, _reward, terminated, truncated, _info = env.step(int(action))
            state = int(next_obs)
            if terminated or truncated:
                break

        # NOTE(review): run_episode() begins with env_obj.reset(), so the
        # state reached by the prefix above is presumably discarded before
        # the recorded episode starts — verify this is intended.
        _ret, n_steps, failed, _illegal = run_episode(env, Q)
        records.append({"failure": failed, "steps": n_steps})

    fr, apfd, apfd_time, ttf = metrics_from_rows(pd.DataFrame(records))
    return {
        "framework": "NR-RL",
        "FR": fr,
        "APFD": apfd,
        "APFD_time": apfd_time,
        "TTF": ttf,
        "budget": len(records),
    }

In [11]:
def run_SBF_single():
    """Runner reported under the "SBF" label.

    Builds a "boundary" set of states that are within Manhattan distance 2 of
    the goal or within distance 1 of a hole (per bd_tuple), cycles through it
    as the intended start state of EPISODE_BUDGET episodes and summarises the
    rows with metrics_from_rows().
    """
    boundary = []
    for state in range(N_STATES):
        d_goal, d_hole = bd_tuple(state)
        if d_goal <= 2 or d_hole <= 1:
            boundary.append(state)

    records = []
    for ep in range(EPISODE_BUDGET):
        start_state = int(boundary[ep % len(boundary)])
        env = make_env(SEED + ep)
        env.reset(seed=SEED + ep)
        # NOTE(review): run_episode() begins with env_obj.reset(), which
        # presumably re-initialises the state assigned here — verify that the
        # intended start state is really the one the episode runs from.
        env.unwrapped.s = start_state
        _ret, n_steps, failed, _illegal = run_episode(env, Q)
        records.append({"failure": failed, "steps": n_steps})

    fr, apfd, apfd_time, ttf = metrics_from_rows(pd.DataFrame(records))
    return {
        "framework": "SBF",
        "FR": fr,
        "APFD": apfd,
        "APFD_time": apfd_time,
        "TTF": ttf,
        "budget": len(records),
    }

In [12]:
def run_MDPFuzz_single():
    """Runner reported under the "MDPFuzz" label.

    Draws the intended start state of every episode uniformly from
    ``[0, N_STATES)`` with a generator seeded by SEED, runs EPISODE_BUDGET
    episodes and summarises the rows with metrics_from_rows().
    """
    state_rng = np.random.default_rng(SEED)
    records = []
    for ep in range(EPISODE_BUDGET):
        start_state = int(state_rng.integers(0, N_STATES))
        env = make_env(SEED + ep)
        env.reset(seed=SEED + ep)
        # NOTE(review): run_episode() begins with env_obj.reset(), which
        # presumably re-initialises the state assigned here — verify that the
        # intended start state is really the one the episode runs from.
        env.unwrapped.s = start_state
        _ret, n_steps, failed, _illegal = run_episode(env, Q)
        records.append({"failure": failed, "steps": n_steps})

    fr, apfd, apfd_time, ttf = metrics_from_rows(pd.DataFrame(records))
    return {
        "framework": "MDPFuzz",
        "FR": fr,
        "APFD": apfd,
        "APFD_time": apfd_time,
        "TTF": ttf,
        "budget": len(records),
    }

In [13]:
def run_GMBT_single():
    """Runner reported under the "GMBT" label.

    Picks the intended start state of episode i as ``(i * 13) mod N_STATES``,
    runs EPISODE_BUDGET episodes and summarises the rows with
    metrics_from_rows().
    """
    records = []
    for ep in range(EPISODE_BUDGET):
        start_state = int((ep * 13) % N_STATES)
        env = make_env(SEED + ep)
        env.reset(seed=SEED + ep)
        # NOTE(review): run_episode() begins with env_obj.reset(), which
        # presumably re-initialises the state assigned here — verify that the
        # intended start state is really the one the episode runs from.
        env.unwrapped.s = start_state
        _ret, n_steps, failed, _illegal = run_episode(env, Q)
        records.append({"failure": failed, "steps": n_steps})

    fr, apfd, apfd_time, ttf = metrics_from_rows(pd.DataFrame(records))
    return {
        "framework": "GMBT",
        "FR": fr,
        "APFD": apfd,
        "APFD_time": apfd_time,
        "TTF": ttf,
        "budget": len(records),
    }

In [14]:
def run_CureFuzz_single():
    """Runner reported under the "CureFuzz" label.

    Draws the intended start state of every episode with ``rng.choice`` over
    ``range(N_STATES)`` (generator seeded by SEED), runs EPISODE_BUDGET
    episodes and summarises the rows with metrics_from_rows().
    """
    state_rng = np.random.default_rng(SEED)
    records = []
    for ep in range(EPISODE_BUDGET):
        start_state = int(state_rng.choice(range(N_STATES)))
        env = make_env(SEED + ep)
        env.reset(seed=SEED + ep)
        # NOTE(review): run_episode() begins with env_obj.reset(), which
        # presumably re-initialises the state assigned here — verify that the
        # intended start state is really the one the episode runs from.
        env.unwrapped.s = start_state
        _ret, n_steps, failed, _illegal = run_episode(env, Q)
        records.append({"failure": failed, "steps": n_steps})

    fr, apfd, apfd_time, ttf = metrics_from_rows(pd.DataFrame(records))
    return {
        "framework": "CureFuzz",
        "FR": fr,
        "APFD": apfd,
        "APFD_time": apfd_time,
        "TTF": ttf,
        "budget": len(records),
    }

In [15]:
def run_RLMutation_single():
    """Runner reported under the "RLMutation" label.

    Splits EPISODE_BUDGET evenly over four environment mutants (two
    reward-noise wrappers and two "sticky transition" wrappers, all with
    their default parameters), evaluates the trained Q-table on each, writes
    the per-episode rows to ``rlmutation_1500.csv`` and returns the summary.

    Every row is stored with ``steps = 1``; APFD uses the rank-based formula
    ``1 - sum(pos) / (n * m) + 1 / (2 n)``.
    """
    FRAME = "RLMutation"
    TOTAL = EPISODE_BUDGET

    class RN_RewardNoise(gym.Wrapper):
        """With probability `p`, add N(0, sigma) noise to a positive reward."""

        def __init__(self, env, p=0.20, sigma=0.10):
            super().__init__(env)
            self.p = p
            self.sigma = sigma

        def step(self, a):
            s2, r, term, trunc, info = self.env.step(a)
            if r > 0 and np.random.rand() < self.p:
                r = float(r + np.random.normal(0, self.sigma))
            return s2, r, term, trunc, info

    class M_Sticky(gym.Wrapper):
        """With probability `p`, report the previously returned transition
        instead of the one the wrapped environment just produced."""

        def __init__(self, env, p=0.20):
            super().__init__(env)
            self.p = p
            self.prev = None

        def reset(self, **kw):
            self.prev = None
            return self.env.reset(**kw)

        def step(self, a):
            transition = tuple(self.env.step(a))
            if self.prev is not None and np.random.rand() < self.p:
                transition = self.prev
            self.prev = transition
            return transition

    mutants = [("RN", RN_RewardNoise), ("M", M_Sticky),
               ("RN2", RN_RewardNoise), ("M2", M_Sticky)]

    share = TOTAL // 4
    pool = sample_pool(8000, seed_base=12000)

    rows = []
    start = perf_counter()
    ttf_sec = None

    for k, (tag, wrapper) in enumerate(mutants):
        env = wrapper(make_env(seed=SEED))
        lo = k * share
        hi = lo + share
        # All resets of one mutant share the same seed offset (the end of its slice).
        reset_seed = SEED + hi

        for s in pool[lo:hi]:
            env.reset(seed=reset_seed)
            # NOTE(review): run_episode() begins with env_obj.reset(), which
            # presumably re-initialises the state assigned here — verify.
            env.unwrapped.s = int(s)
            _ret, _steps, fail, _illegal = run_episode(env, Q)
            rows.append({"failure": int(fail), "steps": 1, "state": int(s), "mutant": tag})

            if fail and ttf_sec is None:
                ttf_sec = perf_counter() - start

        if len(rows) >= TOTAL:
            break

    rows = rows[:TOTAL]
    fails = np.array([row["failure"] for row in rows], int)

    FR = float(fails.mean())
    if fails.sum() > 0:
        pos = np.where(fails == 1)[0] + 1
        APFD = 1.0 - (pos.sum() / (len(fails) * len(pos))) + (1.0 / (2 * len(fails)))
        TTF = float(pos[0])
    else:
        APFD = np.nan
        TTF = np.nan

    out = {
        "framework": FRAME,
        "FR": FR,
        "APFD": APFD,
        "APFD_time": np.nan,
        "TTF": TTF,
        "TTF_tests": TTF,
        "TTF_seconds": float(ttf_sec) if ttf_sec else np.nan,
        "total_seconds": float(perf_counter() - start),
        "budget": len(rows),
    }

    _save_raw(FRAME, rows, "rlmutation_1500.csv")
    return out

In [16]:
def run_FS_single():
    """Runner reported under the "FS" label.

    Runs EPISODE_BUDGET episodes with an epsilon-greedy policy
    (``epsilon=0.05``) on a fresh environment per episode, stores the
    per-episode rows in ``fs_1500.csv`` and returns a summary.

    Each row's ``wall_dt`` is the wall-clock time spent on that episode
    (a delta, not the running total), so summing the column gives the total
    run time and the CF-time cell's cumulative sum over ``wall_dt`` yields
    real elapsed time. When no episode fails, ``TTF_tests`` / ``TTF_seconds``
    fall back to the whole budget / total time.
    """
    FRAME = "FS"
    rows  = []
    start = perf_counter()
    last_tick = start
    ttf   = None
    ttf_sec = None

    for i in range(EPISODE_BUDGET):
        env = make_env(seed=SEED + i)

        ret, steps, fail, _ = run_episode(
            env,
            Q,
            epsilon=0.05
        )

        now = perf_counter()
        rows.append({
            "idx":      i,
            "return":   float(ret),
            "steps":    int(steps),
            "failure":  int(fail),
            # Time since the previous episode finished (env creation included).
            "wall_dt":  float(now - last_tick),
        })
        last_tick = now

        if ttf is None and fail:
            ttf = i + 1
            # Elapsed time since the start of the run, not just this episode.
            ttf_sec = now - start

    raw_path = _save_raw(FRAME, rows, "fs_1500.csv")

    df = pd.DataFrame(rows)

    fails = df["failure"].to_numpy()
    times = df["wall_dt"].to_numpy()
    total_seconds = float(times.sum())

    FR = float(fails.mean())
    TTF_tests = float(ttf) if ttf is not None else float(len(fails))
    TTF_seconds = float(ttf_sec) if ttf_sec is not None else total_seconds

    return {
        "framework":     FRAME,
        "FR":            FR,
        "TTF_tests":     TTF_tests,
        "TTF_seconds":   TTF_seconds,
        "total_seconds": total_seconds,
        "budget":        len(df),
        "raw_path":      raw_path,
    }

In [17]:
def run_piFuzz_single():
    """Runner reported under the "pi-Fuzz" label.

    Performs 300 metamorphic checks: from a seed state, a "harder" state (one
    cell further from the goal per axis) and an "easier" state (one cell
    closer) are derived; a check counts as a failure when the policy succeeds
    from the harder state but fails from the easier one. The row list is then
    padded with non-failing rows up to EPISODE_BUDGET, saved to
    ``pifuzz_mr_1500.csv`` and summarised.
    """
    FRAME = "pi-Fuzz"
    rows = []
    start = perf_counter()
    ttf_sec = None

    def relax(s, closer=True, delta=1):
        """Shift state `s` by `delta` cells per axis towards (or away from) GOAL,
        clipped to the grid."""
        n_rows, n_cols = DESC.shape[0], DESC.shape[1]
        r, c = divmod(int(s), n_cols)
        direction = 1 if closer else -1
        r2 = int(np.clip(r + direction * np.sign(GOAL[0] - r) * delta, 0, n_rows - 1))
        c2 = int(np.clip(c + direction * np.sign(GOAL[1] - c) * delta, 0, n_cols - 1))
        return r2 * n_cols + c2

    def succ_from_state(s, seed):
        """True when the episode evaluated for state `s` is not a failure."""
        env = make_env(seed=seed)
        env.reset(seed=seed)
        # NOTE(review): run_episode() begins with env_obj.reset(), which
        # presumably re-initialises the state assigned here — verify.
        env.unwrapped.s = int(s)
        _ret, _steps, fail, _illegal = run_episode(env, Q)
        return not fail

    base_env = make_env(seed=SEED)

    for i in range(300):
        obs, _ = base_env.reset(seed=SEED + i)
        s_int = int(obs)

        hard = relax(s_int, closer=False)
        easy = relax(s_int, closer=True)

        # The easier state is only evaluated when the harder one succeeded.
        hard_ok = succ_from_state(hard, SEED + 10*i)
        bug = hard_ok and not succ_from_state(easy, SEED + 10*i + 1)
        rows.append({"failure": int(bug), "steps": 1, "state": s_int})

        if bug and ttf_sec is None:
            ttf_sec = perf_counter() - start

    # Pad with non-failing placeholder rows up to the common budget.
    rows.extend(
        {"failure": 0, "steps": 1, "state": 0}
        for _ in range(EPISODE_BUDGET - len(rows))
    )

    fails = np.array([row["failure"] for row in rows], int)

    FR = float(fails.mean())
    if fails.sum() > 0:
        pos = np.where(fails == 1)[0] + 1
        APFD = 1.0 - (pos.sum() / (len(fails) * len(pos))) + (1.0 / (2 * len(fails)))
        TTF = float(pos[0])
    else:
        APFD = np.nan
        TTF = np.nan

    out = {
        "framework": FRAME,
        "FR": FR,
        "APFD": APFD,
        "APFD_time": np.nan,
        "TTF": TTF,
        "TTF_tests": TTF,
        "TTF_seconds": float(ttf_sec) if ttf_sec else np.nan,
        "total_seconds": float(perf_counter() - start),
        "budget": len(rows),
    }

    _save_raw(FRAME, rows, "pifuzz_mr_1500.csv")
    return out

In [18]:
# ===== μPRL helper=====

def make_muprl_mutant_Q(Q_base: np.ndarray, frac_corrupt: float, seed: int = 0) -> np.ndarray:
    """Return a copy of `Q_base` whose greedy action is changed in some states.

    ``max(1, int(frac_corrupt * n_states))`` distinct states are drawn with a
    generator seeded by `seed`. In each drawn state whose action values are
    not all (approximately) equal, a random non-greedy action is raised to
    ``best + 0.1`` and the greedy action lowered to ``best - 0.1``. `Q_base`
    itself is left untouched.
    """
    rng = np.random.default_rng(seed)
    mutant = Q_base.copy()
    n_states, n_actions = mutant.shape

    n_selected = max(1, int(frac_corrupt * n_states))
    selected_states = rng.choice(n_states, size=n_selected, replace=False)

    for state in selected_states:
        original_row = mutant[state].copy()

        # A flat row has no preferred action to displace.
        if np.allclose(original_row, original_row[0]):
            continue

        greedy = int(np.argmax(original_row))
        alternatives = list(range(greedy)) + list(range(greedy + 1, n_actions))
        if len(alternatives) == 0:
            continue

        promoted = int(rng.choice(alternatives))
        best_value = original_row[greedy]

        mutant[state, promoted] = best_value + 0.1
        mutant[state, greedy] = best_value - 0.1

    return mutant
def cov_div_from_states(states_iter, fails_iter):
    """Return `(coverage, diversity)` of the failing states.

    `coverage` counts the distinct bd_tuple() descriptors among states whose
    paired failure flag equals 1; `diversity` counts the distinct failing
    states themselves. Both are 0 when nothing failed.
    """
    all_states = list(map(int, states_iter))
    all_flags = list(map(int, fails_iter))

    failing = [state for state, flag in zip(all_states, all_flags) if flag == 1]
    if len(failing) == 0:
        return 0, 0

    descriptors = set(map(bd_tuple, failing))
    return len(descriptors), len(set(failing))


In [19]:
def run_MUPRL_single():
    """Runner reported under the "μPRL" label.

    Builds four mutated Q-tables with make_muprl_mutant_Q() (increasing
    corruption fractions), evaluates each on a quarter of EPISODE_BUDGET
    start states taken from sample_pool(), saves the rows to
    ``muPRL_aggregate_1500.csv`` and returns the summary.

    NOTE(review): ``TTF_seconds`` is the sum of episode *steps* up to and
    including the first failure, not wall-clock seconds; when nothing fails,
    ``TTF_tests`` falls back to the number of rows.
    """
    FRAME  = "μPRL"
    BUDGET = EPISODE_BUDGET
    share  = BUDGET // 4

    variants = [
        {"frac": 0.05, "seed": 4000},
        {"frac": 0.08, "seed": 4031},
        {"frac": 0.12, "seed": 4062},
        {"frac": 0.15, "seed": 4093},
    ]

    rows = []
    start = perf_counter()

    pool = sample_pool(6000, seed_base=13000)

    for k, cfg in enumerate(variants):
        mutant_q = make_muprl_mutant_Q(
            Q_base=Q,
            frac_corrupt=cfg["frac"],
            seed=cfg["seed"],
        )

        lo = k * share
        for s in pool[lo:lo + share]:
            env = make_env()
            env.reset()
            # NOTE(review): run_episode() begins with env_obj.reset(), which
            # presumably re-initialises the state assigned here — verify.
            env.unwrapped.s = int(s)

            _ret, n_steps, fail, _illegal = run_episode(env, mutant_q)
            rows.append({"failure": int(fail), "steps": int(n_steps)})

    rows = rows[:BUDGET]
    df = pd.DataFrame(rows)

    fails = df["failure"].to_numpy()
    steps = df["steps"].to_numpy()

    FR = float(fails.mean())

    first_failures = np.flatnonzero(fails == 1)
    if first_failures.size:
        TTF_tests = int(first_failures[0]) + 1
    else:
        TTF_tests = float(len(fails))
    TTF_seconds = float(steps[:int(TTF_tests)].sum())

    out = {
        "framework":     FRAME,
        "FR":            FR,
        "TTF_tests":     TTF_tests,
        "TTF_seconds":   TTF_seconds,
        "total_seconds": float(perf_counter() - start),
        "budget":        len(df),
    }

    _save_raw(FRAME, rows, "muPRL_aggregate_1500.csv")
    return out

In [20]:
def run_AlphaTest_single():
    """Runner reported under the "AlphaTest" label.

    Evaluates the trained Q-table under action slip: first once for each
    probability in a fixed grid, then with slip probabilities drawn uniformly
    from [0, 0.20] until EPISODE_BUDGET rows exist. Rows are saved to
    ``alphatest_1500.csv`` and summarised (rank-based APFD).
    """
    FRAME = "AlphaTest"
    BUDGET = EPISODE_BUDGET

    class SlipWrapper(gym.Wrapper):
        """With probability `p_slip`, replace the action by a sampled one."""

        def __init__(self, env, p_slip=0.0):
            super().__init__(env)
            self.p_slip = p_slip

        def step(self, action):
            if np.random.rand() < self.p_slip:
                action = self.action_space.sample()
            return self.env.step(action)

    SLIP_GRID = [0.0, 0.05, 0.10, 0.15]

    rows = []
    start = perf_counter()
    ttf_sec = None

    def run_one(slip, seed_offset):
        """Run and record one episode at the given slip probability."""
        nonlocal ttf_sec
        env = SlipWrapper(make_env(seed=SEED + seed_offset), p_slip=slip)
        ret, steps, fail, _ = run_episode(env, Q)
        rows.append({
            "slip": slip,
            "return": ret,
            "steps": steps,
            "failure": fail,
        })
        if fail and ttf_sec is None:
            ttf_sec = perf_counter() - start

    # Phase 1: deterministic grid of slip levels.
    for seed_offset, slip in enumerate(SLIP_GRID):
        run_one(slip, seed_offset)

    # Phase 2: random slip levels until the budget is exhausted.
    seed_offset = len(rows)
    while len(rows) < BUDGET:
        run_one(float(np.clip(np.random.rand() * 0.20, 0.0, 0.20)), seed_offset)
        seed_offset += 1

    fails = np.array([row["failure"] for row in rows], dtype=int)

    FR = float(fails.mean()) if len(fails) else np.nan

    if fails.sum() > 0:
        pos = np.where(fails == 1)[0] + 1
        APFD = 1.0 - (pos.sum() / (len(fails) * len(pos))) + (1.0 / (2 * len(fails)))
        TTF_tests = float(pos[0])
    else:
        APFD = np.nan
        TTF_tests = np.nan

    out = {
        "framework": FRAME,
        "FR": FR,
        "APFD": APFD,
        "APFD_time": np.nan,
        "TTF": TTF_tests,
        "TTF_tests": TTF_tests,
        "TTF_seconds": float(ttf_sec) if ttf_sec else np.nan,
        "total_seconds": float(perf_counter() - start),
        "budget": len(rows),
    }

    _save_raw(FRAME, rows, "alphatest_1500.csv")
    return out

In [21]:
def run_MDPMorph_single():
    """Runner reported under the "MDPMorph" label.

    For each start state (ordered by decreasing goal distance, then
    increasing hole distance, and capped at EPISODE_BUDGET) two environments
    are prepared: a base one and a follow-up one that first takes a single
    "safe" action (the first entry of ACTIONS leading to an in-bounds,
    non-hole cell). The policy is evaluated on both and the test is counted
    as a violation when the two failure verdicts differ. Results are saved
    to ``mdpmorph_episodes.csv`` and summarised (rank-based APFD).

    NOTE(review): run_episode() begins with env_obj.reset(), so both the
    assigned start state and the follow-up prefix are presumably discarded
    before the evaluated episode starts — verify.
    """
    FRAME = "MDPMorph"
    violations = []
    start = perf_counter()
    ttf_sec = None

    n_rows, n_cols = DESC.shape[0], DESC.shape[1]

    scored = []
    for s in range(N_STATES):
        d_g, d_h = bd_tuple(s)
        scored.append((s, d_g, d_h))
    scored = sorted(scored, key=lambda x: (-x[1], x[2]))

    starts = [s for (s, _, _) in scored][:min(EPISODE_BUDGET, len(scored))]

    for i, s0 in enumerate(starts):
        # Source test case.
        env_b = make_env(seed=SEED + i)
        env_b.reset(seed=SEED + i)
        env_b.unwrapped.s = int(s0)
        _, _, fail_b, _ = run_episode(env_b, Q)

        # Follow-up test case: same start, preceded by one safe move.
        env_f = make_env(seed=SEED + i)
        env_f.reset(seed=SEED + i)
        env_f.unwrapped.s = int(s0)

        prefix = []
        r, c = divmod(int(s0), n_cols)
        for a_idx, (dr, dc) in enumerate(ACTIONS):
            rr, cc = r + dr, c + dc
            if 0 <= rr < n_rows and 0 <= cc < n_cols and DESC[rr, cc] != "H":
                prefix = [a_idx]
                break

        for a in prefix:
            _s2, _r2, term2, trunc2, _ = env_f.step(int(a))
            if term2 or trunc2:
                break

        # Evaluate the follow-up environment (not the base one again), so the
        # relation really compares two different test cases.
        _, _, fail_f, _ = run_episode(env_f, Q)

        violated = int(fail_b != fail_f)
        violations.append(violated)

        if violated and ttf_sec is None:
            ttf_sec = perf_counter() - start

    fails = np.array(violations, dtype=int)

    FR = float(fails.mean()) if len(fails) else np.nan

    if fails.sum() > 0:
        pos = np.where(fails == 1)[0] + 1
        APFD = 1.0 - (pos.sum() / (len(fails) * len(pos))) + (1.0 / (2 * len(fails)))
        TTF_tests = float(pos[0])
    else:
        APFD = np.nan
        TTF_tests = np.nan

    total_seconds = float(perf_counter() - start)

    out = {
        "framework": FRAME,
        "FR": FR,
        "APFD": APFD,
        "APFD_time": np.nan,
        "TTF": TTF_tests,
        "TTF_tests": TTF_tests,
        "TTF_seconds": float(ttf_sec) if ttf_sec is not None else np.nan,
        "total_seconds": total_seconds,
        "budget": len(fails),
    }

    _save_raw(FRAME, [{"failure": int(f)} for f in fails], "mdpmorph_episodes.csv")
    return out

In [None]:
# Runners to execute; the result table has one row per entry, in this order.
FRAMEWORKS = [
    run_AdaStop_single,
    run_FOracle_single,
    run_CUSUMRL_single,
    run_DLBT_single,
    run_MDPMorph_single,
    run_QDTesting_single,
    run_STARLA_single,
    run_Indago_single,
    run_AlphaTest_single,
    run_NRRL_single,
    run_SBF_single,
    run_MDPFuzz_single,
    run_piFuzz_single,
    run_GMBT_single,
    run_CureFuzz_single,
    run_RLMutation_single,
    run_FS_single,
    run_MUPRL_single,
]

# Each runner returns a dict of summary metrics; the key sets differ between
# runners, so missing columns show up as NaN in the combined frame.
results=[]
for fn in FRAMEWORKS:
    print("→",fn.__name__)
    results.append(fn())

# Persist the combined table; the plotting cells below read this CSV back.
df=pd.DataFrame(results)
out=os.path.join(TABLES_DIR,"frozenlake_single_row_metrics.csv")
df.to_csv(out,index=False)
df

→ run_AdaStop_single
→ run_FOracle_single
→ run_CUSUMRL_single
→ run_DLBT_single
→ run_MDPMorph_single
→ run_QDTesting_single
→ run_STARLA_single
→ run_Indago_single
→ run_AlphaTest_single
→ run_NRRL_single
→ run_SBF_single
→ run_MDPFuzz_single
→ run_piFuzz_single
→ run_GMBT_single
→ run_CureFuzz_single
→ run_RLMutation_single
→ run_FS_single
→ run_MUPRL_single


In [None]:
# ===== CELL FL1 — Single-run table + FR bars (Dyna-Q × FrozenLake) =====
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Reuse the results directory chosen in the PATH SETUP cell when that cell has
# been run (so local clones work too); fall back to the Colab Drive location
# when this cell is executed on its own.
_COLAB_RESULTS_ROOT = "/content/drive/MyDrive/mbrl-testing-frameworks-empirical-study/results/frozenlake"
RESULTS_ROOT = str(globals().get("RESULTS_DIR", _COLAB_RESULTS_ROOT))
TABLES_DIR   = os.path.join(RESULTS_ROOT, "tables")
# Figures of the single-run analysis go to their own sub-folder.
FIGS_DIR     = os.path.join(RESULTS_ROOT, "figs_single")
RAW_DIR= os.path.join(RESULTS_ROOT, "raw_single")
os.makedirs(FIGS_DIR, exist_ok=True)

# Summary table written by the runner cell.
single_csv = os.path.join(TABLES_DIR, "frozenlake_single_row_metrics.csv")
assert os.path.exists(single_csv), f"Single-run metrics not found: {single_csv}"

SINGLE = pd.read_csv(single_csv)
SINGLE = SINGLE.sort_values("framework").reset_index(drop=True)

# Fixed colour (hex) per framework label so that every figure uses the same
# palette. Keys must match the "framework" values in the metrics table.
# The C1–C4 markers are group labels kept from the original notebook;
# presumably the study's framework categories — TODO confirm.
FRAMEWORK_COLORS = {
    # C1
    "AdaStop":"#808000",
    "F-Oracle":"#000080",
    "CUSUM-RL":"#DDA0DD",
    "DLBT":"#008000",
    "MDPMorph":"#B56576",

    # C2
    "QD-Testing":"#580F41",
    "STARLA":"#069AF3",
    "Indago":"#40E0D0",
    "AlphaTest":"#9A0EEA",

    # C3
    "NR-RL":"#9ACD32",
    "SBF":"#FE420F",
    "Fuzzer":"#6E6F71",
    "MDPFuzz":"#EE82EE",
    "pi-Fuzz":"#808080",
    "GMBT":"#C79FEF",
    "CureFuzz":"#E50000",

    # C4
    "RLMutation":"#15B01A",
    "FS":"#DAA520",

    # Extra
    "μPRL":"#3357A6",
}
def _fw_col(name):
    """Return the palette colour for framework `name` (grey when unknown)."""
    try:
        return FRAMEWORK_COLORS[name]
    except KeyError:
        return "#777777"

# ---------------------- IEEE-STYLE ----------------------
# Global matplotlib defaults for this notebook session: serif fonts, dashed
# light grid, frameless legend. These stay in effect for later cells until
# rcParams is updated again.
plt.rcParams.update({
    "figure.figsize": (10.5, 4.2),
    "font.family": "serif",
    "font.serif": ["Times New Roman","DejaVu Serif","Nimbus Roman","Times"],
    "font.size": 12,
    "axes.titlesize": 13,
    "axes.labelsize": 12,
    "axes.edgecolor":"#333",
    "axes.linewidth":1.0,
    "axes.grid": True,
    "grid.alpha": .25,
    "grid.linestyle": "--",
    "legend.frameon": False,
})

# ---------------------- FR BAR PLOT ----------------------
def plot_fr_single(df, title, out_base):
    """Draw a bar chart of the failure rate per framework and save it.

    `df` needs the columns `framework` and `FR`; non-numeric or missing FR
    values are plotted as 0. The figure is written to `out_base` with the
    extensions .png, .pdf and .svg, shown, and then closed.
    """
    ordered = df.copy().sort_values("framework")
    labels = ordered["framework"].tolist()
    rates = pd.to_numeric(ordered["FR"], errors="coerce").fillna(0.0).to_numpy()
    bar_colors = [_fw_col(label) for label in labels]
    positions = np.arange(len(labels))

    plt.figure()
    bars = plt.bar(positions, rates, color=bar_colors, edgecolor="black", linewidth=0.6)

    # Leave head-room above the tallest bar for the value labels.
    ymax = max(0.05, float(np.nanmax(rates))*1.15)
    plt.ylim(0, ymax)
    plt.xticks(positions, labels, rotation=60, ha="right")
    plt.ylabel("Failure Rate (FR)")
    plt.title(title)
    plt.grid(True, axis="y", alpha=.25, linestyle="--")
    plt.tight_layout()

    label_offset = 0.01*ymax
    for bar, rate in zip(bars, rates):
        x_center = bar.get_x() + bar.get_width()/2
        plt.text(
            x_center,
            bar.get_height() + label_offset,
            f"{rate:.3f}",
            ha="center", va="bottom", fontsize=9
        )

    for ext in (".png", ".pdf", ".svg"):
        plt.savefig(out_base + ext, dpi=300, bbox_inches="tight")
    plt.show()
    plt.close()

    print("Saved:", out_base + ".[png|pdf|svg]")

# Failure-rate bar chart over all frameworks in the single-run table.
plot_fr_single(
    SINGLE,
    "FrozenLake (Dyna-Q) — Single-run Failure Rate (FR)",
    os.path.join(FIGS_DIR, "frozenlake_FR_single_all")
)

# Show the full metrics table below the figure.
print("=== Single-run metrics table ===")
display(SINGLE)

In [None]:
# =====  CF-tests & CF-time =====
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# FIGS_DIR (and RAW_DIR / SINGLE used below) come from the previous cell.
os.makedirs(FIGS_DIR, exist_ok=True)

def _find_first_csv(framework):
    """Return the path of the alphabetically first ``.csv`` file in
    RAW_DIR/<framework>, or None when the folder or such a file is missing."""
    fw_dir = os.path.join(RAW_DIR, framework)
    if not os.path.isdir(fw_dir):
        return None
    csv_names = [
        entry for entry in sorted(os.listdir(fw_dir))
        if entry.lower().endswith(".csv")
    ]
    if not csv_names:
        return None
    return os.path.join(fw_dir, csv_names[0])

def _read_and_normalize(framework, budget):
    """Load a framework's raw per-episode CSV in a uniform shape.

    Returns None when no CSV exists, it cannot be read, or it has no
    `failure` column. Otherwise returns exactly `budget` rows with:

    * `failure`  – int, unparsable values become 0;
    * `steps`    – float, defaults to 1 when missing or unparsable;
    * `time_sec` – taken from `wall_dt` when present, else a copy of `steps`.

    Shorter files are padded with non-failing rows that use the median
    `steps` / `time_sec`; longer files are truncated.
    """
    csv_path = _find_first_csv(framework)
    if csv_path is None:
        return None

    # Best effort: an unreadable file simply means "no curve" for the caller.
    try:
        loaded = pd.read_csv(csv_path)
    except Exception:
        return None

    if "failure" not in loaded.columns:
        return None

    d = loaded.copy()
    d["failure"] = pd.to_numeric(d["failure"], errors="coerce").fillna(0).astype(int)

    if "steps" not in d.columns:
        d["steps"] = 1
    d["steps"] = pd.to_numeric(d["steps"], errors="coerce").fillna(1).astype(float)

    has_wall_time = "wall_dt" in d.columns
    if has_wall_time:
        d["time_sec"] = pd.to_numeric(d["wall_dt"], errors="coerce").fillna(0.0)
    else:
        d["time_sec"] = d["steps"]

    n_missing = budget - len(d)
    if n_missing > 0:
        pad = pd.DataFrame({
            "failure": [0] * n_missing,
            "steps":   [d["steps"].median() or 1.0] * n_missing,
            "time_sec":[d["time_sec"].median() or 0.0] * n_missing,
        })
        d = pd.concat([d, pad], ignore_index=True)

    return d.iloc[:budget].reset_index(drop=True)

def _cumulative(df):
    y = df["failure"].to_numpy().cumsum()
    y = np.maximum.accumulate(y)
    t = df["time_sec"].to_numpy().cumsum()
    x = np.arange(1, len(df) + 1)
    return x, y, t

# Same style settings as the FR figure, only with a taller default figure
# (the CF plots carry a large legend).
plt.rcParams.update({
    "figure.figsize": (10.5, 5.2),
    "font.family": "serif",
    "font.serif": ["Times New Roman","DejaVu Serif","Nimbus Roman","Times"],
    "font.size": 12,
    "axes.titlesize": 13,
    "axes.labelsize": 12,
    "axes.edgecolor":"#333",
    "axes.linewidth":1.0,
    "axes.grid": True,
    "grid.alpha": .25,
    "grid.linestyle": "--",
    "legend.frameon": False,
})

def plot_cf(axis="tests"):
    """Plot cumulative failures for every framework that has a raw CSV.

    With ``axis="tests"`` the x-axis is the test index; any other value plots
    against the cumulative `time_sec`. Frameworks without usable raw data are
    skipped. The figure is saved as .png/.pdf/.svg in FIGS_DIR, shown and
    closed; nothing is saved when no curve could be drawn.
    """
    by_tests = (axis == "tests")
    xlabel = "Tests (episodes)" if by_tests else "Cumulative time (s)"

    plt.figure()
    plotted = 0

    for fw in SINGLE["framework"]:
        # Each framework is normalised to the budget recorded in its table row.
        fw_budget = int(SINGLE.loc[SINGLE["framework"]==fw,"budget"].iloc[0])
        dfw = _read_and_normalize(fw, budget=fw_budget)
        if dfw is None:
            continue

        x, y, t = _cumulative(dfw)
        F = int(dfw["failure"].sum())

        horizontal = x if by_tests else t
        plt.plot(horizontal, y, label=f"{fw} (F={F})", color=_fw_col(fw))
        plotted += 1

    if plotted == 0:
        print("⚠ No CF curves to plot.")
        plt.close()
        return

    plt.xlabel(xlabel)
    plt.ylabel("Cumulative failures")
    plt.title(f"FrozenLake (Dyna-Q) — CF vs {'Tests' if axis=='tests' else 'Time'} (ALL)")
    plt.legend(loc="center left", bbox_to_anchor=(1.0, 0.5), ncol=2)
    plt.tight_layout()

    out = os.path.join(FIGS_DIR, f"frozenlake_CF_vs_{axis}_all")
    for ext in (".png", ".pdf", ".svg"):
        plt.savefig(out + ext, dpi=300, bbox_inches="tight")

    plt.show()
    plt.close()

    print("Saved:", out + ".[png|pdf|svg]")

# One figure per x-axis: number of tests, then cumulative time.
plot_cf(axis="tests")
plot_cf(axis="time")