In [1]:
# Experiment log directories to compare (first item of each pair) and the
# legend label used for each one (second item). Every (re-)run of the loading
# cell also saves an elites_last_iter.jsonl file next to each history it reads.
runs_whitelist = dict(
    (
        ("data/histories_poetry/gpt_4/qdaif_lmx_rewrite", "QDAIF (GPT-4)"),
        ("data/histories_poetry/gpt_3_5/qdaif_lmx_rewrite", "QDAIF (GPT 3.5)"),
    )
)

In [None]:
import json
from pathlib import Path
from typing import Any

def load_run_stats(
    base_dirs: list[str],
) -> dict[str, Any]:
    """
    Replays every history.jsonl found under each run directory and collects QD scores.

    For each history file an empty genre x tone elites archive is created, filled
    in by `load_iterations`, and then written to `elites_last_iter.jsonl` in the
    same directory as the history file (overwriting any previous file).

    Parameters:
    - base_dirs: list of run directories to scan recursively; both `str` and
      `pathlib.Path` entries are accepted

    Returns:
    - dict mapping `str(Path(base_dir))` to the list of per-history QD score
      series appended by `load_iterations` (an empty list when the directory
      contains no history.jsonl)
    """
    genres = ['haiku', 'sonnet', 'ballad', 'limerick', 'hymn']
    tones = ['happy', 'dark', 'mysterious', 'romantic', 'reflective']
    # archive cell names never change, so build them once instead of per history file
    all_genres_tones = [f'{g}_{t}' for g in genres for t in tones]

    qd_score_runs = {}
    for base_dir in base_dirs:  # contains every run in base_dir
        # the signature advertises strings, but rglob only exists on Path objects
        run_dir = Path(base_dir)
        qd_score_seeds = []
        # rglob already searches recursively; sorting makes the seed order
        # independent of the filesystem's directory listing order
        for history_path in sorted(run_dir.rglob("history.jsonl")):
            with open(history_path, "r", encoding="utf-8") as f:
                # tolerate blank lines (e.g. a trailing empty line) in the log
                data = [json.loads(line) for line in f if line.strip()]

            # empty elites archive
            elites = {key: ('', -1) for key in all_genres_tones}  # (genotype, fitness) tuple

            load_iterations(qd_score_seeds, data, elites, history_path)

            # log final elites
            output_path = history_path.parent / "elites_last_iter.jsonl"
            with open(output_path, "w", encoding="utf-8") as file:
                for key, (poem, quality) in elites.items():
                    genre, tone = key.split('_', 1)
                    elite_entry = {
                        'poem': poem,
                        'quality': quality,
                        'genre': genre,
                        'tone': tone
                    }
                    file.write(json.dumps(elite_entry) + "\n")

        qd_score_runs[str(run_dir)] = qd_score_seeds
    return qd_score_runs

def load_iterations(qd_score_seeds, data, elites, seed_dir):
    """
    Replays one history into the elites archive and records the QD score per iteration.

    Parameters:
    - qd_score_seeds: list, the QD score series of this history is appended to it
    - data: iterable of dicts with 'poem', 'quality', 'genre' and 'tone' keys
    - elites: dict mapping '<genre>_<tone>' to a (poem, quality) tuple, updated in place
    - seed_dir: path of the history being replayed, only used in the printed summary
    """
    valid_genres = ('haiku', 'sonnet', 'ballad', 'limerick', 'hymn')
    valid_tones = ('happy', 'dark', 'mysterious', 'romantic', 'reflective')
    qd_score_seed = []

    # fill in elites archive
    for datapoint in data:
        poem = datapoint['poem']
        quality = float(datapoint['quality'])
        genre = datapoint['genre']
        tone = datapoint['tone']

        if genre not in valid_genres:
            print("erroneous genre detected, skipping")
        elif tone not in valid_tones:
            print("erroneous tone detected, skipping")
        else:
            cell = f'{genre}_{tone}'
            _, prev_quality = elites[cell]
            if quality >= prev_quality:  # greater than or equal to get latest elites
                elites[cell] = (poem, quality)

        # qd score per iteration: a skipped datapoint still counts as an iteration
        # (the archive is simply unchanged), so every series has one entry per
        # datapoint and histories of equal length stay aligned when stacked
        qd_score_seed.append(sum(max(v[1], 0) for v in elites.values()))

    # an empty history has no iterations; report 0 instead of failing on [-1]
    final_qd = qd_score_seed[-1] if qd_score_seed else 0
    print(f"Path: {seed_dir} ========= QD: {final_qd}")
    qd_score_seeds.append(qd_score_seed)

# Load every whitelisted run; iterating a dict yields its keys in insertion
# order, so there is no need to rebuild the key list for each index.
qd_score_runs = load_run_stats([Path(run_dir) for run_dir in runs_whitelist])

In [9]:
import numpy as np
import matplotlib.pyplot as plt

def bootstrap_samples(data, n_bootstrap_samples=1000, rng=None):
    """
    Generates bootstrap samples by resampling the rows of `data` with replacement.

    Parameters:
    - data: 2D array, shape (num_seeds, timesteps), data to resample
    - n_bootstrap_samples: int, number of resampled datasets to draw
    - rng: optional numpy random Generator (or RandomState) used for the draws,
      pass a seeded one for reproducible results; when None the global
      `np.random` state is used

    Returns:
    - float array, shape (n_bootstrap_samples, num_seeds, timesteps)
    """
    data = np.asarray(data)
    n_seeds = data.shape[0]
    n_points = data.shape[1]
    sampler = np.random if rng is None else rng

    # draw the row indices of all resamples in one call: row i of the index
    # matrix selects the seeds that make up bootstrap sample i
    sample_indices = sampler.choice(
        n_seeds, size=(n_bootstrap_samples, n_seeds), replace=True
    )

    resampled = np.empty((n_bootstrap_samples, n_seeds, n_points))
    resampled[...] = data[sample_indices]
    return resampled

def plot_with_bootstrap_ci(data, run_name, ax, stat_func, n_bootstrap_samples=1000):
    """
    Plots the statistic along with its 95% CI using bootstrapping.
    
    Parameters:
    - data: 2D array, shape (num_seeds, timesteps), data to plot
    - run_name: str, name of the run to be used in the legend
    - ax: matplotlib axis, axis to plot on
    - stat_func: function, function to compute the statistic; it is called as
      stat_func(array, axis=0)
    - n_bootstrap_samples: int, number of bootstrap resamples used for the CI
    """
    bootstrap_s = bootstrap_samples(data, n_bootstrap_samples)
    
    # stats for each bootstrap sample and for each timestep
    bootstrap_stat = np.array([stat_func(sample, axis=0) for sample in bootstrap_s])
    
    # 95% CI: both percentiles in one pass over the bootstrap statistics
    lower_bound, upper_bound = np.percentile(bootstrap_stat, [2.5, 97.5], axis=0)
    stat_original = stat_func(data, axis=0)
    
    timesteps = np.arange(data.shape[1])
    ax.plot(timesteps, stat_original, label=run_name)
    ax.fill_between(timesteps, lower_bound, upper_bound, alpha=0.2)
    ax.set_xlabel("Iterations", fontsize=14)
    ax.set_ylabel("QD Score", fontsize=14)
    ax.set_title("Poetry", fontsize=16)
    ax.legend(loc="lower right", fontsize=10)

def plot_mean(data, run_name, ax):
    """Plots `data` on `ax` via plot_with_bootstrap_ci, using np.mean as the statistic."""
    plot_with_bootstrap_ci(data, run_name, ax, stat_func=np.mean)

In [None]:
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 300

# compute stats for each run/experiment, and add to same plot
fig, ax = plt.subplots(figsize=(4,3))

# qd_score_runs is keyed by str(Path(...)); normalise the whitelist keys the
# same way so the lookup also succeeds where the path separator is not "/"
run_labels = {str(Path(run_dir)): label for run_dir, label in runs_whitelist.items()}

for run, seed_scores in qd_score_runs.items():
    run_name = run_labels.get(run)
    if run_name is None:
        continue

    # shortest series across seeds (0 when there are no seeds at all)
    n_iters = min((len(scores) for scores in seed_scores), default=0)
    if n_iters == 0:
        # nothing to stack into a (num_seeds, timesteps) array
        print(f"No QD scores found for {run}, skipping")
        continue
    if any(len(scores) != n_iters for scores in seed_scores):
        # a ragged list cannot form the 2D array the plot helper indexes,
        # so compare seeds over the iterations they all have
        print(f"Seeds of {run} differ in length, truncating to {n_iters} iterations")

    run_data = np.array([scores[:n_iters] for scores in seed_scores])
    plot_mean(run_data, run_name, ax)

plt.tight_layout()
plt.show()