In [18]:
# Notebook configuration.
#
# RESULT_PATHS maps an experiment directory to the legend label used for its curve.
# Each key is searched recursively for `history.jsonl` files in the loading cell, and
# only experiment directories whose path string equals a key exactly are plotted.
# NOTE(review): every path below points at an "opinions" run while TITLE says
# "Stories - Genre and Ending"; none of them contains "stories_genre_ending", which is
# the substring the loading cell uses to select the 2D evaluation — confirm which
# domain this configuration is meant for.
RESULT_PATHS = {
    "/nfs/scratch_2/adai/qdhf_sandbox/results/gen_runs_paper_story_fixed/rouge_filter/opinions": "LMX, ROUGE-L",
    "/nfs/scratch_2/adai/qdhf_sandbox/results/gen_runs_paper_story_fixed/rouge_quality_si/opinions": "LMX, ROUGE-L (w/ QAIF)",
    "/nfs/scratch_2/adai/qdhf_sandbox/results/gen_runs_paper_story_fixed/nsaif/opinions": "LMX, NSAIF",
    "/nfs/scratch_2/adai/qdhf_sandbox/results/gen_runs_paper_story_fixed/nsaif_q_filter/opinions": "LMX, NSAIF (w/ QAIF)",
    "/nfs/scratch_2/adai/openelm_no_install/results/data/histories_opinions_stories/qdaif/opinions/lmx_near_seeded_init": "QDAIF (ours)",
}
# Passed as `n_bins` to MAPElitesEvaluation (the non-2D branch of the loading cell).
N_BINS = 20 # number of bin intervals for 1D
# Passed as `elites_intervals` to both evaluation classes.
ELITES_INTERVALS = 100 # elites at a given archive state saved every N intervals
# Figure title applied to the shared axis.
TITLE = "Stories - Genre and Ending"
# Y-axis label; the loading cell also uses it to choose which metric is collected.
Y_LABEL = "QD Score" # "QD Score", "Coverage", "Best Solution Quality"

In [19]:
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from typing import Optional, Tuple, Union, List
import json
from map_elites import MAPElitesEvaluation, MAPElites2DEvaluation

def bootstrap_samples(data, n_bootstrap_samples=1000, rng=None):
    """
    Generates bootstrap resamples by drawing rows of `data` with replacement.

    Parameters:
    - data: array-like whose first axis indexes the units being resampled
      (callers in this notebook pass shape (num_seeds, timesteps))
    - n_bootstrap_samples: int, number of resamples to draw
    - rng: None, int seed, or np.random.Generator. With None the legacy global
      NumPy random state is used, so an earlier `np.random.seed(...)` still
      controls the draw; pass a seed or Generator for a self-contained,
      reproducible resampling.

    Returns:
    - float array of shape (n_bootstrap_samples, *data.shape); every resample has
      as many rows as `data`.

    Raises:
    - ValueError: if `data` has no rows to resample from.
    """
    data = np.asarray(data)
    if data.ndim == 0 or data.shape[0] == 0:
        raise ValueError("bootstrap_samples needs at least one row in `data`")

    n_rows = data.shape[0]
    index_shape = (n_bootstrap_samples, n_rows)

    # Draw every row index in one call instead of looping once per resample.
    if rng is None:
        row_indices = np.random.randint(0, n_rows, size=index_shape)
    else:
        row_indices = np.random.default_rng(rng).integers(0, n_rows, size=index_shape)

    # Fancy indexing on the first axis yields (n_bootstrap_samples, n_rows, ...).
    # The result is always float, as before; copy=False avoids a second copy
    # when the data is already float.
    return data[row_indices].astype(float, copy=False)

def plot_with_bootstrap_ci(data, run_name, ax, stat_func):
    """
    Draws one run's statistic curve together with a shaded 95% bootstrap CI band.

    Parameters:
    - data: 2D array, shape (num_seeds, timesteps), values to summarise
    - run_name: str, legend label for this run
    - ax: matplotlib axis that receives the curve and the band
    - stat_func: callable invoked as stat_func(array, axis=0), e.g. np.mean

    Side effects: prints the last-timestep statistic with its lower and upper CI
    bounds, and (re)applies the axis labels, title and legend on `ax` using the
    module-level Y_LABEL and TITLE.
    """
    resamples = bootstrap_samples(data, 1000)

    # One statistic curve per resample, stacked along the first axis.
    per_resample_stat = np.array(
        [stat_func(resample, axis=0) for resample in resamples]
    )

    # The 2.5th and 97.5th percentiles across resamples bound the 95% interval.
    ci_low, ci_high = np.percentile(per_resample_stat, [2.5, 97.5], axis=0)
    point_estimate = stat_func(data, axis=0)
    print(point_estimate[-1], ci_low[-1], ci_high[-1])

    x_values = np.arange(data.shape[1])
    ax.plot(x_values, point_estimate, label=run_name)
    ax.fill_between(x_values, ci_low, ci_high, alpha=0.2)

    # Axis decoration is repeated on every call; the last call's settings win.
    ax.set_xlabel("Iterations", fontsize=14)
    ax.set_ylabel(Y_LABEL, fontsize=14)
    ax.set_title(TITLE, fontsize=16)
    ax.legend(loc="lower right", fontsize=8)

def plot_mean(data, run_name, ax):
    """Plots the mean of `data` over its first axis, with bootstrap CI, on `ax`."""
    plot_with_bootstrap_ci(data, run_name, ax, stat_func=np.mean)

In [None]:
# Collect one metric curve per seed run for every experiment directory found under
# RESULT_PATHS. Result: qd_score_runs[str(expt_dir)] is a list with one entry per
# seed, each entry being a single-element list wrapping that seed's curve (the
# plotting cell strips that wrapper with `[:, 0]`).

# Evaluation attribute that holds the curve for each supported Y_LABEL.
METRIC_ATTRIBUTES = {
    "QD Score": "qd_scores",
    "Coverage": "coverage",
    "Best Solution Quality": "max_fitnesses",
}
# Fail fast: an unsupported label would otherwise leave the per-seed value unbound
# (NameError) or silently reuse the value computed for the previous seed.
if Y_LABEL not in METRIC_ATTRIBUTES:
    raise ValueError(
        f"Unsupported Y_LABEL {Y_LABEL!r}; expected one of {sorted(METRIC_ATTRIBUTES)}"
    )
metric_attribute = METRIC_ATTRIBUTES[Y_LABEL]

# cap iterations if loading from files with longer histories (e.g. 5000)
MAX_ITERATIONS = 2000

# Bin edges handed to the evaluators. Fresh array/list objects are still built per
# seed below, exactly as before, in case the evaluators keep or modify them.
QDEF_BIN_EDGES = (0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.50, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.60)
DEFAULT_BIN_EDGES = (0.005, 0.01, 0.015, 0.02, 0.03, 0.04, 0.05, 0.10, 0.20, 0.50, 0.80, 0.90, 0.95, 0.96, 0.97, 0.98, 0.985, 0.99, 0.995)
GRID_2D_BIN_EDGES = (0.005, 0.02, 0.05, 0.20, 0.50, 0.80, 0.95, 0.98, 0.995)

prev_expt_dir = None
qd_score_runs = {}
dirs = [Path(result_path) for result_path in RESULT_PATHS]

# contains every experiment of runs in base_dir
for base_dir in dirs:
    # sorted so that all history files of one experiment dir are visited consecutively
    for sub_dir in sorted(base_dir.rglob("history.jsonl")):
        expt_dir = sub_dir.parent.parent # this dir would contain multiple seed runs
        # every seed's history file maps to the same expt dir; handle each expt dir once
        if expt_dir == prev_expt_dir:
            continue

        # initialize to store stats across reruns in single experiment
        qd_score_seeds = []

        # process all random rng seed reruns in experiment (sorted for a stable seed order)
        for seed_dir in sorted(expt_dir.rglob("history.jsonl")):
            X = pd.read_json(seed_dir, lines=True)
            history_length = len(X) # set it to a shorter length of iterations if desired

            elites_path = None

            if "stories_genre_ending" in str(seed_dir): # 2D grid domain
                map_elites_evaluation = MAPElites2DEvaluation(
                    history_length=history_length,
                    x_bins=list(GRID_2D_BIN_EDGES),
                    y_bins=list(GRID_2D_BIN_EDGES),
                    start=(0,0),
                    stop=(1,1),
                    elites_intervals=ELITES_INTERVALS,
                )
            else:
                # embedding feedback experiment logs ("qdef/") use their own bin edges
                custom_bins = np.array(QDEF_BIN_EDGES if "qdef/" in str(seed_dir) else DEFAULT_BIN_EDGES)
                map_elites_evaluation = MAPElitesEvaluation(
                    history_length=history_length,
                    n_bins=N_BINS,
                    start=0,
                    stop=1,
                    custom_bins=custom_bins,
                    elites_intervals=ELITES_INTERVALS,
                )
            map_elites_evaluation.fit(phenotype_key="phenotype", data=X, elites_path=elites_path)

            qd_score_seed = getattr(map_elites_evaluation, metric_attribute)[:MAX_ITERATIONS]
            qd_score_seeds.append([qd_score_seed])

        qd_score_runs[str(expt_dir)] = qd_score_seeds
        prev_expt_dir = expt_dir

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams['figure.dpi'] = 300

# All labelled experiments share one axis; each contributes a curve and a CI band.
fig, ax = plt.subplots(figsize=(4, 3))

for expt_path, seed_curves in qd_score_runs.items():
    # Experiment dirs without a legend label in RESULT_PATHS are not drawn.
    if expt_path not in RESULT_PATHS:
        continue
    # Every seed entry is a single-element list; [:, 0] removes that wrapper axis.
    # NOTE(review): assumes all seeds of a run have equal-length curves — confirm.
    stacked_curves = np.array(seed_curves)[:, 0]
    plot_mean(stacked_curves, RESULT_PATHS[expt_path], ax)

plt.tight_layout()
plt.show()