In [None]:
# Imports for the whole notebook.
import os

import numpy as np
import matplotlib.pyplot as plt

# Make the parent of the notebook's directory the working directory so that the
# relative paths used below (e.g. 'program_databases/...') resolve.
# The target is computed only once per kernel: a bare os.chdir("..") would climb
# one more level every time this cell is re-run.
try:
    PROJECT_ROOT
except NameError:
    PROJECT_ROOT = os.path.abspath("..")
os.chdir(PROJECT_ROOT)  # now CWD is ai_scientist_project/

In [None]:
# get the program data from the census

def _resolve_parent_indices(program_ids, parent_ids, n_seed=2):
    """Translate parent ids into row indices.

    program_ids: list of (iteration, island, batch) tuples, one per census row.
    parent_ids:  per-row parent id; rows >= n_seed must hold an id equal to one
                 of the entries of program_ids.
    Returns an int array with -1 for the first n_seed rows (seeds have no parents).
    Raises ValueError when a parent id does not match any program.
    """
    # First occurrence wins, matching what list.index() would return.
    first_row = {}
    for row, pid in enumerate(program_ids):
        first_row.setdefault(pid, row)

    resolved = [-1] * n_seed
    for row in range(n_seed, len(program_ids)):
        parent = parent_ids[row]
        try:
            resolved.append(first_row[tuple(parent)])
        except (KeyError, TypeError):
            raise ValueError(
                f"parent id {parent!r} of program {program_ids[row]!r} (row {row}) "
                "does not match any program in the census"
            ) from None
    return np.array(resolved, dtype=int)


def _trace_lineage(winner, parent1_id, parent2_id):
    """Collect the ancestors of `winner` generation by generation.

    Returns a list of lists: element k holds the (sorted, de-duplicated) row
    indices of the k-th order ancestors; element 0 is [winner].
    Raises ValueError if the parent links contain a cycle.
    """
    n_programs = len(parent1_id)
    generations = [[winner]]
    # In an acyclic ancestry no chain is longer than n_programs, so this bound is
    # only exhausted when the links loop back on themselves.
    for _ in range(n_programs):
        parents = set()
        for idx in generations[-1]:
            for parent in (parent1_id[idx], parent2_id[idx]):
                if parent >= 0:  # -1 marks "no parent" (seed programs)
                    parents.add(int(parent))
        if not parents:
            break
        generations.append(sorted(parents))
    else:
        raise ValueError("parent links contain a cycle; cannot build the family tree")
    return generations


def extract_program_data_new(census, reference_loss=22.02000046):
    """
    Extracts program data from the NEW census array format.
    Each program is a row in a long numpy array with the following columns:
    - 0: iteration number
    - 1: island number
    - 2: batch number
    - 3: LLM name
    - 4: loss
    - 5: time in seconds
    - 6: parent1 id, matched against the (iteration, island, batch) tuple of another row
    - 7: parent2 id, matched against the (iteration, island, batch) tuple of another row
    - 8: y_eval (float)
    - 9: n_free_params (int)

    The first two rows are treated as seed programs: their parent columns are
    ignored and their parent indices are reported as -1.

    Returns a dictionary with the following keys
    - 'scores': np.ndarray of scores (exp(-(loss - reference_loss)))
    - 'losses': np.ndarray of losses
    - 'running_max_scores': np.ndarray of running max scores
    - 'times': np.ndarray of times in seconds, shifted so the earliest is 0
    - 'parent1_id': np.ndarray of parent1 row indices (int, -1 for seeds)
    - 'parent2_id': np.ndarray of parent2 row indices (int, -1 for seeds)
    - 'program_id': list of tuples representing the program ids in format
                    (iteration, island, batch)
    - 'llm_name': np.ndarray of LLM names (str)
    - 'innovation_indices': list starting with 1 (the second seed), followed by the
                     row index of every later program that raised the running max
                     score by more than 0.01
    - 'generations': list of lists, where generations[k] is a list of the kth order ancestors of the winner.
                     So generations[0] = [winner_index], generations[1] = [parent1_index, parent2_index], etc.
    - 'y_eval': column 8 of the census, unchanged
    - 'n_free_params': column 9 of the census, unchanged

    Raises ValueError if the census has fewer than two rows, if a parent id cannot
    be matched to a program, or if the ancestry of the winner is cyclic.
    """
    n_seed = 2  # rows 0 and 1 are the seed programs
    n_programs = census.shape[0]
    if n_programs < n_seed:
        raise ValueError(
            f"census must contain at least the {n_seed} seed programs, got {n_programs} rows"
        )

    # pull the raw columns out of the census
    program_tuple_id = [tuple(census[i, :3]) for i in range(n_programs)]
    llm_name = np.array(census[:, 3], dtype=str)
    losses = np.array(census[:, 4], dtype=float)
    times = census[:, 5]
    y_eval = census[:, 8]
    n_free_params = census[:, 9]

    # resolve both parent columns to row indices
    parent1_id = _resolve_parent_indices(program_tuple_id, census[:, 6], n_seed)
    parent2_id = _resolve_parent_indices(program_tuple_id, census[:, 7], n_seed)

    # subtract the start time from all times
    times = times - np.min(times)

    # convert losses to scores relative to the reference loss
    scores = np.exp(-(losses - reference_loss))
    running_max_scores = np.maximum.accumulate(scores)

    # Innovations: rows at which the running max jumps by more than the threshold.
    # np.diff()[i] is the step from row i to row i + 1, so the improving program
    # sits at i + 1. Row 1 is always listed first and therefore not repeated.
    innovation_threshold = 0.01
    improved_rows = np.flatnonzero(np.diff(running_max_scores) > innovation_threshold) + 1
    innovation_indices = [1] + [int(row) for row in improved_rows if row > 1]

    # build family tree of the winner (the program with the highest score)
    index = np.argmax(scores)
    generations = _trace_lineage(index, parent1_id, parent2_id)

    results_dict = {
        'scores': scores,
        'losses': losses,
        'running_max_scores': running_max_scores,
        'times': times,
        'parent1_id': parent1_id,
        'parent2_id': parent2_id,
        'program_id': program_tuple_id,
        'llm_name': llm_name,
        'innovation_indices': innovation_indices,
        'generations': generations,
        'y_eval': y_eval,
        'n_free_params': n_free_params
    }
    return results_dict

In [None]:
# 'program_databases/07-15/16-22-07 (big_only)/combined/census.npy',
# 'program_databases/07-15/17-05-19 (big_only)/combined/census.npy',
# 'program_databases/07-15/17-48-16 (big_only)/combined/census.npy',
# 'program_databases/07-15/18-31-33 (big_only)/combined/census.npy',

# 'program_databases/07-16/00-59-22 (mix)/combined/census.npy',
# 'program_databases/07-16/01-46-55 (mix)/combined/census.npy',
# 'program_databases/07-16/02-29-34 (mix)/combined/census.npy',
# 'program_databases/07-16/03-15-03 (mix)/combined/census.npy',

# 'program_databases/07-17/07-14-37 (mix + image)/combined/census.npy',
# 'program_databases/07-17/08-10-17 (mix + image)/combined/census.npy',
# 'program_databases/07-17/09-07-26 (mix + image)/combined/census.npy',
# 'program_databases/07-17/10-05-55 (mix + image)/combined/census.npy',

# 'program_databases/07-20/01-37-40 (image feedback)/combined/census.npy',
# 'program_databases/07-20/02-43-00 (image feedback)/combined/census.npy',
# 'program_databases/07-20/03-45-10 (image feedback)/combined/census.npy',
# 'program_databases/07-20/04-47-23 (image feedback)/combined/census.npy',

# 'program_databases/07-16/12-46-54 (big + no_image)/combined/census.npy',
# 'program_databases/07-16/13-18-43 (big + no_image)/combined/census.npy',
# 'program_databases/07-16/13-51-51 (big + no_image)/combined/census.npy',
# 'program_databases/07-16/14-23-46 (big + no_image)/combined/census.npy',

# 'program_databases/07-16/10-46-45 (mix + no_image)/combined/census.npy',
# 'program_databases/07-16/11-18-03 (mix + no_image)/combined/census.npy',
# 'program_databases/07-16/11-48-51 (mix + no_image)/combined/census.npy',
# 'program_databases/07-16/12-18-10 (mix + no_image)/combined/census.npy',

# 'program_databases/07-17/11-02-49 (mix + no_image)/combined/census.npy',
# 'program_databases/07-17/11-42-11 (mix + no_image)/combined/census.npy',
# 'program_databases/07-17/12-22-36 (mix + no_image)/combined/census.npy',
# 'program_databases/07-17/14-55-12 (mix + no_image)/combined/census.npy',

# 'program_databases/07-17/04-58-28 (no_param_est)/combined/census.npy',
# 'program_databases/07-17/05-31-58 (no_param_est)/combined/census.npy',
# 'program_databases/07-17/06-05-20 (no_param_est)/combined/census.npy',
# 'program_databases/07-17/06-40-16 (no_param_est)/combined/census.npy',

# 'program_databases/07-17/19-02-25 (no_gradient)/combined/census.npy',
# 'program_databases/07-17/20-10-14 (no_gradient)/combined/census.npy',
# 'program_databases/07-17/21-11-44 (no_gradient)/combined/census.npy',
# 'program_databases/07-17/22-13-56 (no_gradient)/combined/census.npy',

# 'program_databases/07-20/05-52-26 (no image)/combined/census.npy',
# 'program_databases/07-20/06-26-59 (no image)/combined/census.npy',
# 'program_databases/07-20/07-01-03 (no image)/combined/census.npy',
# 'program_databases/07-20/07-35-59 (no image)/combined/census.npy',

# 'program_databases/07-21/15-31-23 (no_image)/combined/census.npy',
# 'program_databases/07-21/18-30-33 (no_image)/combined/census.npy',
# 'program_databases/07-21/19-33-36 (no_image)/combined/census.npy',
# 'program_databases/07-21/20-34-39 (no_image)/combined/census.npy',

# 'program_databases/07-22/02-35-30 (no image)/combined/census.npy',
# 'program_databases/07-22/03-16-56 (no image)/combined/census.npy',
# 'program_databases/07-22/04-01-16 (no image)/combined/census.npy',
# 'program_databases/07-22/04-47-09 (no image)/combined/census.npy',

# 'program_databases/07-22/05-31-26 (image)/combined/census.npy',
# 'program_databases/07-22/06-16-02 (image)/combined/census.npy',
# 'program_databases/07-22/07-00-37 (image)/combined/census.npy',
# 'program_databases/07-22/07-45-07 (image)/combined/census.npy',
# 'program_databases/07-23/01-00-00 (image)/combined/census.npy',

# 'program_databases/07-23/03-10-12 (text)/combined/census.npy',
# 'program_databases/07-23/04-09-16 (text)/combined/census.npy',
# 'program_databases/07-23/05-08-08 (text)/combined/census.npy',
# 'program_databases/07-23/06-07-12 (text)/combined/census.npy',

# 'program_databases/07-22/23-18-55 (image)/combined/census.npy',
# 'program_databases/07-23/00-21-26 (image)/combined/census.npy',
# 'program_databases/07-23/01-16-39 (image)/combined/census.npy',
# 'program_databases/07-23/02-14-04 (image)/combined/census.npy',

# 'program_databases/07-30/15-16-32/combined/census.npy',
# 'program_databases/07-30/16-11-02/combined/census.npy',
# 'program_databases/07-30/17-01-31/combined/census.npy',

# 'program_databases/07-30/17-52-59/combined/census.npy',
# 'program_databases/07-30/18-44-20/combined/census.npy',
# 'program_databases/07-30/19-31-37/combined/census.npy',

# 'program_databases/07-31/03-14-29/combined/census.npy',
# 'program_databases/07-31/03-52-27/combined/census.npy',
# 'program_databases/07-31/04-31-51/combined/census.npy',
# 'program_databases/07-31/05-08-24/combined/census.npy',

# 'program_databases/07-31/05-49-09/combined/census.npy',
# 'program_databases/07-31/06-20-58/combined/census.npy',
# 'program_databases/07-31/06-55-12/combined/census.npy',
# 'program_databases/07-31/07-37-03/combined/census.npy',

# Census file analysed by the rest of the notebook (pick one from the list above).
census_path = 'program_databases/07-22/23-18-55 (image)/combined/census.npy'
# NOTE(review): allow_pickle=True executes pickled objects -- only load trusted files.
census = np.load(census_path, allow_pickle=True)
print(f"Processing {census_path}...")
results = extract_program_data_new(census)

# Per-parameter penalty and the row used as the baseline program.
# Presumably the stored loss already includes this penalty -- TODO confirm.
penalty_per_param = 0.01
seed_row = 1

losses = results['losses'].copy()
n_free_params = np.array(results['n_free_params'], dtype=float)

# Loss with the per-parameter penalty taken back out.
raw_losses = losses - penalty_per_param * n_free_params
raw_seed_loss = raw_losses[seed_row]
# Improvement of each program over the baseline row, before any penalty.
gain_raw = raw_seed_loss - raw_losses
# Penalise parameters added relative to the baseline row; negative scores become 0.
score = np.clip(
    gain_raw - penalty_per_param * (n_free_params - n_free_params[seed_row]),
    a_min=0,
    a_max=None,
)


In [None]:
# Raw gain vs. number of free parameters, coloured by score, with iso-score lines.
fig, ax = plt.subplots(figsize=(12, 8))

# Scatter plot of every program
scatter = ax.scatter(
    n_free_params,
    gain_raw,
    alpha=1,
    s=100,
    c=score,
    cmap='viridis',
    edgecolor='black'
)

# Highlight the best program with a hollow red ring
best_program_idx = np.argmax(score)
best_n_params = n_free_params[best_program_idx]
best_gain = gain_raw[best_program_idx]
best_score = score[best_program_idx]
ax.scatter(best_n_params,
           best_gain,
           s=200,
           facecolor='none',
           edgecolor='red',
           linewidth=2,
           label='Best Program')

# Colorbar with ticks only at 0 and at the maximum score
cbar = fig.colorbar(scatter, ax=ax)
cbar.set_label(r'$\text{Score}[\phi] = \mathcal{L}[\phi] - \lambda \cdot \text{n\_free\_params}[\phi]$', fontsize=16)
cbar.set_ticks([0.0, np.max(score)])
cbar.set_ticklabels(['0.0', f'{np.max(score):.2f}'])

# Iso-score lines. Score falls by `penalty_slope` per extra parameter, so points
# of equal score lie on a line with that slope. Each line is placed relative to
# the best program: shifting the line through it down by (best_score - s) gives
# the line of score s. This keeps the guides correct for any census file instead
# of relying on offsets that only fit one particular run.
penalty_slope = 0.01
x_min, x_max = 4, 18
label_x = 7  # x position at which the line labels are anchored


def iso_score_gain(n_params, target_score):
    """Raw gain a program with `n_params` parameters needs to reach `target_score`."""
    return penalty_slope * (n_params - best_n_params) + best_gain - (best_score - target_score)


x = np.linspace(x_min, x_max, 100)
# line through the best program
y = iso_score_gain(x, best_score)
ax.plot(x, y, color='red', linestyle='--', linewidth=2, label=f'score={score[best_program_idx]:.2f}', zorder=0)
# line with score 0.1
y2 = iso_score_gain(x, 0.10)
ax.plot(x, y2, color='grey', linestyle='--', linewidth=2, label='0.10 Score Line', zorder=0)
# line with score 0.0
y3 = iso_score_gain(x, 0.0)
ax.plot(x, y3, color='grey', linestyle='--', linewidth=2, label='0.00 Score Line', zorder=0)

# Plot limits and labels
ax.set_ylim(-0.01, 0.28)
ax.set_xlim(x_min, x_max)
ax.set_xlabel(r'$\mathcal{C}[\phi] = \text{n\_free\_params}[\phi]$', fontsize=16)
ax.set_ylabel(r'$\text{Likelihood}[\phi]$', fontsize=16)

# Angled labels, anchored on their own line at x = label_x
ax.text(label_x, iso_score_gain(label_x, best_score),
        f'Score = {score[best_program_idx]:.2f}', fontsize=18, color='red', rotation=21, ha='right')
ax.text(label_x, iso_score_gain(label_x, 0.10),
        'Score = 0.10', fontsize=18, color='grey', rotation=21, ha='right')
ax.text(label_x, iso_score_gain(label_x, 0.0),
        'Score = 0.00', fontsize=18, color='grey', rotation=21, ha='right')

# Title
ax.set_title('Program Likelihood (y-axis) vs Program Complexity (x-axis)', fontsize=16)

# Only the x axis carries ticks; the y axis is qualitative
ax.set_yticks([])
ax.set_xticks(np.arange(x_min, x_max + 1))
ax.tick_params(axis='x', labelsize=14)
fig.tight_layout()
plt.show()