In [None]:
# Copyright 2019 Google LLC
# Copyright (c) Xuanyi Dong [GitHub D-X-Y], 2020.06 
# Copyright 2025 Anonymized Authors

# Licensed under the Apache License, Version 2.0 (the "License"); 
# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
# https://www.apache.org/licenses/LICENSE-2.0
"""
This notebook is designed to familiarize you with the code and the underlying 
benchmark dataset packages such as NAS-Bench-101 and NATS-Bench. We will introduce
the functions that are used in our nas_utils.py to reduce boilerplate code.

Requirements: 

- the install script should automatically install all libraries
- This notebook requires that tensorflow and numpy be installed within the 
Python environment you are running this notebook in. 
- NATS-Bench: 
git submodule update --init --recursive
cd ./thirdparty/autodl
pip install .
- NAS-Bench-101
pip install -i https://test.pypi.org/simple/ nasbench-TF2 (this is a TF2 version)
python ./experiments/utils/download_nasbench.py 
"""

In [None]:
# first, we check if our benchmarks are correctly installed and working

import os
import numpy as np
import tensorflow as tf # required for nasbench
import nasbench # checking if this causes any errors
from nasbench.api import ModelSpec, NASBench
import nats_bench # checking if this causes any errors 
from nats_bench import create
import torch # required for natsbench

# Print each installed library version next to the version this notebook was
# written against, so mismatches are visible at a glance.
print(f"Installed numpy version     : {np.__version__:11s} | expected: 1.26.4")
print(f"Installed TensorFlow version: {tf.__version__:11s} | expected: 2.15.0")
print(f"Installed torch version     : {torch.__version__:11s} | expected: 2.3.0")

# Check that the autodl submodule is checked out by probing for one of its
# scripts. The path is relative, so it is resolved against the current working
# directory of the kernel.
submodule_installed = (os.path.isfile(os.path.join("..", "thirdparty", "autodl", "exps", "NATS-algos", "regularized_ea.py")))
print(f"Submodule file found: {submodule_installed}")


# TORCH_HOME needs to be set.
# Only the presence of the environment variable is tested here; nothing is
# checked about the directory it points to.
if "TORCH_HOME" not in os.environ:
    print("TORCH_HOME was not found. Please refer to readme.")
else:
    print("TORCH installed correctly")



In [None]:
# lets load the nasbench 
def load_nasbench(path=None):
    """Open the NAS-Bench-101 record file and return the ``NASBench`` API object.

    Args:
        path: Location of the tfrecord file. When ``None``, the file
            ``../generated/nasbench_only108.tfrecord`` (relative to the
            current working directory) is used.

    Returns:
        The ``NASBench`` instance constructed from the record file.
    """
    record_file = path
    if record_file is None:
        record_file = os.path.join("..", "generated", "nasbench_only108.tfrecord")

    gpus = tf.config.experimental.list_physical_devices("GPU")
    if not gpus:
        print("No GPU.")
    else:
        print("Using GPU")
        # Memory growth is only switched on for the first GPU that is listed.
        tf.config.experimental.set_memory_growth(gpus[0], True)

    return NASBench(record_file)

# we expect the download script to download the dataset to ../generated/nasbench_only108.tfrecord
# `nasb` is the notebook-wide NASBench handle; the experiment cell further down
# resets its budget counters and passes it to the search.
nasb = load_nasbench() 

In [None]:
# these are helper functions for running our code 

# defines benchmark specific constants
class NASBenchConstants:
    """Constants describing the NAS-Bench-101 cell encoding used in this notebook."""

    # Labels of the two fixed vertices.
    INPUT = "input"
    OUTPUT = "output"
    # Labels of the operations an interior vertex may carry.
    CONV3X3 = "conv3x3-bn-relu"
    CONV1X1 = "conv1x1-bn-relu"
    MAXPOOL3X3 = "maxpool3x3"
    NUM_VERTICES = 7
    MAX_EDGES = 9
    # Number of entries strictly above the diagonal of the adjacency matrix.
    # Integer division keeps this an int (it is a count, not a ratio).
    EDGE_SPOTS = NUM_VERTICES * (NUM_VERTICES - 1) // 2  # Upper triangular matrix
    OP_SPOTS = NUM_VERTICES - 2  # Input/output vertices are fixed
    ALLOWED_OPS = [CONV3X3, CONV1X1, MAXPOOL3X3]
    ALLOWED_EDGES = [0, 1]  # Binary adjacency matrix
    # Integer index of every allowed operation: [0, 1, 2].
    OPS_IND = list(range(len(ALLOWED_OPS)))
    # (row_indices, col_indices) of the strictly upper triangle (k=1 skips the diagonal).
    TRIU = np.triu_indices(NUM_VERTICES, k=1)


# a wrapper function for the original NAS-Bench-101 encoding that allows to use 
# its original usecase but you can extend the support for a onehot flattened with
# flat = spec.to_flat() and spec = SpecOneHot.spec_from_flat()
class SpecOneHot(ModelSpec):
    """ModelSpec extension that converts cells to and from a flat one-hot vector.

    Flat layout (``input_size`` entries):

    * the first ``flat_matrix`` entries hold one 2-way one-hot pair for every
      adjacency entry strictly above the diagonal, in ``NASBenchConstants.TRIU``
      order;
    * the remaining entries hold one one-hot block over
      ``NASBenchConstants.ALLOWED_OPS`` for every interior vertex (the input
      and output vertices are not encoded).
    """

    # Length of the adjacency part: two one-hot slots per upper-triangular entry.
    flat_matrix = ((NASBenchConstants.NUM_VERTICES-1+1)*(NASBenchConstants.NUM_VERTICES-1)//2) * 2
    # Total length: adjacency part plus one op one-hot per interior vertex.
    input_size = flat_matrix + len(NASBenchConstants.ALLOWED_OPS)*(NASBenchConstants.NUM_VERTICES - 2)

    # Operation name -> one-hot row. The identity matrix is sized from
    # ALLOWED_OPS so the table stays correct if the list of operations changes.
    name_lu = dict(zip(NASBenchConstants.ALLOWED_OPS, np.eye(len(NASBenchConstants.ALLOWED_OPS))))
    # The same one-hot rows, addressable by operation index.
    onehot_lu = list(name_lu.values())

    def __init__(self, matrix, ops):
        """Build the spec from an adjacency matrix and a list of op labels."""
        super().__init__(matrix=matrix, ops=ops)
        # self.flat = self.to_flat() # only when using instead of when initializing

    def to_flat(self):
        """Return the flat integer one-hot encoding of this spec.

        Uses ``original_matrix`` and ``original_ops`` as stored on the
        instance; the first and last op (input/output) are left out.
        """
        ops = self.original_ops[1:-1] # omit first and last one 
        mat = self.original_matrix
        ops_onehot = np.concatenate([SpecOneHot.name_lu[s] for s in ops])
        flat = self.encode(mat, ops_onehot)
        return flat.astype(int)

    @staticmethod
    def encode(matrix,ops_onehot):
        """Concatenate the one-hot adjacency entries with ``ops_onehot``.

        ``matrix`` is used as an index into ``np.eye(2)``, so it has to be an
        integer array containing only 0 and 1.
        """
        # TRIU selects the entries strictly above the diagonal (k=1, so the
        # diagonal itself is NOT included); each 0/1 entry becomes a 2-way one-hot.
        onehot_matrix = np.eye(2)[matrix[NASBenchConstants.TRIU]].flatten()
        flattened =  np.concatenate((onehot_matrix,ops_onehot))
        return flattened
    
    @staticmethod
    def decode(flat):
        """Invert ``encode``.

        Returns:
            ``(matrix, indices)`` where ``matrix`` is the upper-triangular
            adjacency matrix (dtype of ``flat``) and ``indices`` holds, per
            interior vertex, the index of its operation in ``ALLOWED_OPS``.

        Raises:
            AssertionError: if the op part does not sum to exactly one hot
                entry per interior vertex.
        """
        matrix = np.zeros((NASBenchConstants.NUM_VERTICES, NASBenchConstants.NUM_VERTICES), dtype=flat.dtype)
        onehot_matrix = flat[:SpecOneHot.flat_matrix].reshape((-1,2))
        entries = np.argmax(onehot_matrix, 1)
        matrix[NASBenchConstants.TRIU] = entries
        assert np.all(np.triu(matrix) == matrix)

        ops_onehot = flat[SpecOneHot.flat_matrix:].reshape((NASBenchConstants.NUM_VERTICES - 2, len(NASBenchConstants.ALLOWED_OPS)))
        assert np.sum(ops_onehot) == NASBenchConstants.NUM_VERTICES - 2

        indices = np.argmax(ops_onehot, 1)
        return matrix, indices

    @staticmethod
    def test_decode_encode():
        """Round-trip a random matrix/op assignment through encode and decode."""
        rando = np.random.randint(2, size=NASBenchConstants.NUM_VERTICES*NASBenchConstants.NUM_VERTICES)
        matrix = rando.reshape((NASBenchConstants.NUM_VERTICES, NASBenchConstants.NUM_VERTICES))
        matrix = np.triu(matrix, 1)
        ops_ind = np.random.choice(NASBenchConstants.OPS_IND, size=NASBenchConstants.NUM_VERTICES-2)
        ops_onehot = np.concatenate([SpecOneHot.onehot_lu[i] for i in ops_ind])
        flat = SpecOneHot.encode(matrix,ops_onehot)
        m,o = SpecOneHot.decode(flat)
        assert np.array_equal(matrix,m) 
        assert np.array_equal(ops_ind,o) 

    @staticmethod
    def spec_from_flat(flat):
        """Build a ``SpecOneHot`` from a flat vector as produced by ``to_flat``."""
        matrix,indices = SpecOneHot.decode(flat)
        ops = np.asarray(NASBenchConstants.ALLOWED_OPS)[indices]
        # Re-attach the fixed input/output labels around the decoded interior ops.
        cops = np.concatenate(([NASBenchConstants.INPUT], ops, [NASBenchConstants.OUTPUT])).tolist()      
        spec = SpecOneHot(matrix=matrix, ops=cops)
        return spec
    
# this returns a random spec from the NAS-Bench-101    
def random_spec(nasbench: NASBench):
    """Draw random cells until one passes ``nasbench.is_valid`` and return it.

    Each candidate is a random 0/1 matrix reduced to its strict upper
    triangle, plus a random op label per vertex with the first and last
    label overwritten by the fixed input/output labels.
    """
    vertex_count = NASBenchConstants.NUM_VERTICES
    while True:
        adjacency = np.triu(
            np.random.choice(NASBenchConstants.ALLOWED_EDGES,
                             size=(vertex_count, vertex_count)),
            1,
        )
        labels = np.random.choice(NASBenchConstants.ALLOWED_OPS, size=vertex_count).tolist()
        labels[0] = NASBenchConstants.INPUT
        labels[-1] = NASBenchConstants.OUTPUT
        candidate = SpecOneHot(matrix=adjacency, ops=labels)
        if nasbench.is_valid(candidate):
            return candidate

In [None]:
# now, we define regularized evolution search as in https://github.com/google-research/nasbench/blob/b94247037ee470418a3e56dcb83814e9be83f3a8/NASBench.ipynb#L339
import random
import copy


def random_combination(iterable, sample_size):
    """Pick ``sample_size`` distinct items at random, keeping their input order.

    Equivalent to a random element of ``itertools.combinations(iterable, sample_size)``.
    Raises ``ValueError`` (from ``random.sample``) when ``sample_size`` exceeds
    the number of items.
    """
    items = tuple(iterable)
    positions = random.sample(range(len(items)), sample_size)
    positions.sort()
    return tuple(items[pos] for pos in positions)

def mutate_spec(old_spec, nasbench: NASBench, mutation_rate=1.0):
    """Return a mutated copy of ``old_spec`` that ``nasbench.is_valid`` accepts.

    Every attempt starts again from ``old_spec``'s original matrix and ops,
    flips each upper-triangular edge with probability
    ``mutation_rate / NUM_VERTICES`` and resamples each interior op with
    probability ``mutation_rate / OP_SPOTS``. Attempts repeat until a valid
    spec is produced.
    """
    vertex_count = NASBenchConstants.NUM_VERTICES
    # In expectation, V edges flipped (note that most end up being pruned).
    flip_prob = mutation_rate / vertex_count
    # In expectation, one op is resampled.
    resample_prob = mutation_rate / NASBenchConstants.OP_SPOTS

    while True:
        matrix = copy.deepcopy(old_spec.original_matrix)
        ops = copy.deepcopy(old_spec.original_ops)

        for row in range(vertex_count - 1):
            for col in range(row + 1, vertex_count):
                if random.random() < flip_prob:
                    matrix[row, col] = 1 - matrix[row, col]

        for position in range(1, vertex_count - 1):
            if random.random() < resample_prob:
                current = ops[position]
                # The replacement is drawn from every available op except the current one.
                alternatives = [
                    op for op in nasbench.config["available_ops"] if op != current
                ]
                ops[position] = random.choice(alternatives)

        candidate = SpecOneHot(matrix, ops)
        if nasbench.is_valid(candidate):
            return candidate

def run_revolution_search(
    nasbench: NASBench,
    max_time_budget=5e6,
    population_size=50,
    tournament_size=10,
    mutation_rate=0.5
):
    """Run a single roll-out of regularized evolution to a fixed time budget.

    Args:
        nasbench: Benchmark API; it is queried for every evaluated spec and
            its budget counters provide the elapsed time.
        max_time_budget: The search stops after the first query that pushes
            the reported time above this value.
        population_size: Number of random specs used to seed the population.
        tournament_size: Number of population members sampled per step; the
            best of them is mutated.
        mutation_rate: Forwarded to ``mutate_spec``.

    Returns:
        ``(times, best_valids, best_tests)``: three equally long lists that
        start with ``0.0`` and gain one entry per query (elapsed time, best
        validation accuracy so far, and the test accuracy belonging to it).

    Raises:
        ValueError: if ``tournament_size`` is not between 1 and
            ``population_size``. This is checked up front instead of failing
            inside the sampling step after the seeding queries were spent.
    """
    if not 1 <= tournament_size <= population_size:
        raise ValueError(
            "tournament_size must be between 1 and population_size "
            f"(got tournament_size={tournament_size}, population_size={population_size})"
        )

    times, best_valids, best_tests = [0.0], [0.0], [0.0]
    population = []  # (validation, spec) tuples

    def evaluate(spec):
        """Query ``spec``, extend the three histories, return (validation accuracy, time spent)."""
        data = nasbench.query(spec)
        time_spent, _ = nasbench.get_budget_counters()
        times.append(time_spent)

        valid = data["validation_accuracy"]
        if valid > best_valids[-1]:
            best_valids.append(valid)
            best_tests.append(data["test_accuracy"])
        else:
            best_valids.append(best_valids[-1])
            best_tests.append(best_tests[-1])
        return valid, time_spent

    # For the first population_size individuals, seed the population with
    # randomly generated cells.
    for _ in range(population_size):
        spec = random_spec(nasbench)
        valid, time_spent = evaluate(spec)
        population.append((valid, spec))

        if time_spent > max_time_budget:
            # The budget is already used up: stop here instead of entering the
            # evolution loop, which would spend one more query and could fail
            # on a population smaller than the tournament.
            return times, best_valids, best_tests

    # After the population is seeded, proceed with evolving the population.
    while True:
        sample = random_combination(population, tournament_size)
        # sorted(...)[-1] keeps the original tie-breaking (last of the best entries).
        best_spec = sorted(sample, key=lambda i: i[0])[-1][1]
        new_spec = mutate_spec(best_spec, nasbench, mutation_rate)

        valid, time_spent = evaluate(new_spec)

        # In regularized evolution, we kill the oldest individual.
        population.append((valid, new_spec))
        population.pop(0)

        if time_spent > max_time_budget:
            break

    return times, best_valids, best_tests

In [None]:
# now we define plotting functions so we can define experiments and plot the data

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from IPython.display import clear_output
from scipy.stats import t # for statistical testing

def plot_run(data, color, label, config, ax=None, gran=10000):
    """Plot the mean accuracy over time of several search runs.

    Every run is resampled on the time grid ``0, gran, 2*gran, ... <= budget``
    by linear interpolation between the two recorded points surrounding each
    grid time; mean and standard deviation are then taken across runs.

    Args:
        data: One ``(times, best_valids, best_tests)`` triple per run.
        color: Matplotlib colour for the curve and the confidence band.
        label: Legend label of the curve.
        config: Reads ``"dataset"`` (``"validation"`` selects the validation
            curve, anything else the test curve), ``"budget"`` (int),
            ``"confidence_intervall"`` and, when that is ``True``, ``"pvalue"``.
        ax: Axes to draw on. Defaults to the current pyplot axes.
        gran: Spacing of the time grid.

    Returns:
        ``(mean, std)``: lists with one entry per grid point; the first entry
        of both is the fixed starting value ``0.0``.
    """
    if ax is None:
        # The signature advertises ax as optional, so fall back to the current
        # axes instead of failing on None.
        ax = plt.gca()

    # Index into each run triple: 1 = validation history, 2 = test history.
    which = 1 if config["dataset"] == "validation" else 2
    use_ci = config["confidence_intervall"] is True
    budget = config["budget"]

    xs = range(0, budget + 1, gran)
    mean, std = [0.0], [0.0]
    ci_lower, ci_upper = [0.0], [0.0]
    repeats = len(data)
    # Per run: index of the first recorded point not yet known to lie before
    # the current grid time. Only ever moves forward.
    pointers = [1] * repeats

    if use_ci:
        # Two-sided critical value; it depends only on the number of runs.
        confidence = 1 - config["pvalue"]
        t_critical = t.ppf(confidence + (1 - confidence) / 2, repeats - 1)

    for cur in range(gran, budget + 1, gran):
        all_vals = []
        for repeat in range(repeats):
            run_times = data[repeat][0]
            run_vals = data[repeat][which]
            while (pointers[repeat] < len(run_times) and
                   run_times[pointers[repeat]] < cur):
                pointers[repeat] += 1
            prev_time = run_times[pointers[repeat] - 1]
            prev_val = run_vals[pointers[repeat] - 1]
            next_time = run_times[pointers[repeat]]
            next_val = run_vals[pointers[repeat]]
            assert prev_time < cur and next_time >= cur

            # Linearly interpolate between the two surrounding points
            cur_val = ((cur - prev_time) / (next_time - prev_time)) * (next_val - prev_val) + prev_val
            all_vals.append(cur_val)

        # Summed in sorted order, as before, so the floating point result does
        # not depend on the order of the runs.
        all_vals = sorted(all_vals)
        cur_mean = sum(all_vals) / float(len(all_vals))
        # np.std uses its default normalisation (ddof=0) here.
        # NOTE(review): the interval below uses a t-distribution with
        # repeats - 1 degrees of freedom; confirm whether ddof=1 was intended.
        cur_std = np.std(all_vals)
        mean.append(cur_mean)
        std.append(cur_std)

        if use_ci:
            sem = cur_std / np.sqrt(repeats)  # Standard Error
            margin_of_error = t_critical * sem
            ci_lower.append(cur_mean - margin_of_error)
            ci_upper.append(cur_mean + margin_of_error)

    if use_ci:
        ax.fill_between(xs, ci_lower, ci_upper, alpha=0.5, linewidth=0, facecolor=color)
    ax.plot(xs, mean, color=color, label=label, linewidth=2)

    return mean, std


def plot_significance(means, stds, n, red, green, ax, pvalue_threshold, gran=10000):
    """Mark the time steps at which the compared curves do not differ significantly.

    For every pair of curves and every time step a two-sided Welch t-test is
    computed from the means and standard deviations. A vertical line in colour
    ``red`` is drawn at each step where at least one pair has a p-value above
    ``pvalue_threshold``.

    Args:
        means, stds: One list per curve, as returned by ``plot_run``.
        n: Number of runs behind every mean/std.
        red: Colour of the marker lines.
        green: Unused; kept so existing callers keep working.
        ax: Axes to draw on.
        pvalue_threshold: p-values above this count as "not significant".
        gran: Distance on the x axis between two consecutive time steps.

    With fewer than two curves, or with empty curves, there is nothing to
    compare and nothing is drawn.
    """
    n_series = len(means)

    all_pvalues = []
    for a in range(1, n_series):
        for b in range(a):
            pvalues = []
            for mean1, std1, mean2, std2 in zip(means[a], stds[a], means[b], stds[b]):
                se1 = std1 / np.sqrt(n)
                se2 = std2 / np.sqrt(n)

                if se1 == 0 and se2 == 0:
                    # No spread in either sample: the statistic is undefined.
                    # Such steps are recorded as p = 0.0 and therefore never marked.
                    pvalues.append(0.0)
                    continue
                t_stat = (mean1 - mean2) / np.sqrt(se1**2 + se2**2)

                # Welch-Satterthwaite degrees of freedom
                df = ((se1**2 + se2**2)**2) / (((se1**2)**2 / (n-1)) + ((se2**2)**2 / (n-1)))

                p_value = 2 * t.sf(np.abs(t_stat), df)
                pvalues.append(p_value)
            all_pvalues.append(pvalues)

    if not all_pvalues or not all_pvalues[0]:
        return

    n_points = len(all_pvalues[0])
    for idx in range(n_points):
        if any(pvalues[idx] > pvalue_threshold for pvalues in all_pvalues):
            ax.vlines(x=idx*gran, ymin=0.9, ymax=0.943, alpha=0.1, color=red, linewidth=2)

    # this line is just for plotting the legend label once
    ax.vlines(x=(n_points - 1)*gran, ymin=1, ymax=1.1, color=red, linewidth=2, label=f"pvalue > {pvalue_threshold}")



def plot_all(exp, ax=None):
    """Plot every algorithm of an experiment onto one axes.

    Args:
        exp: Experiment dict. ``exp["data"]`` maps a label to a list whose
            first item is the run data handed to ``plot_run`` and whose
            second item is a colour name from the palette below.
            ``exp["config"]`` is forwarded to ``plot_run``; this function
            itself reads ``"significant_areas"``, ``"limits"`` and, when
            significance marking is on, ``"n"`` and ``"pvalue"``.
        ax: Axes to draw on. Defaults to the current pyplot axes.

    Returns:
        ``(means, stds)``: the per-algorithm lists returned by ``plot_run``.
    """
    # tableau_colorblind10
    tc = {
        'Dark Blue': '#006BA4',
        'Blue': '#5F9ED1',
        'Light Blue': '#A2C8EC',
        'Dark Orange': '#FF800E',
        'Red Orange': '#C85200',
        'Light Orange': '#FFBC79',
        'Very Dark Gray': '#595959',
        'Dark Gray': '#898989',
        'Light Gray': '#ABABAB',
        'Very Light Gray': '#CFCFCF',
        "Green" : '#228B22',
        "Red" : '#FF4500' 
    }

    if ax is None:
        # The signature advertises ax as optional, so fall back to the current
        # axes instead of failing on None further down.
        ax = plt.gca()

    config = exp["config"]

    means, stds = [], []
    for label, value in exp["data"].items():
        data = value[0]
        color = value[1]

        mean, std = plot_run(data, tc[color], label, config, ax)
        means.append(mean)
        stds.append(std)

    if config["significant_areas"]:
        plot_significance(means, stds, config["n"], tc['Light Gray'], tc['Green'], ax, config["pvalue"])

    ax.legend(loc='lower right')
    ax.set_ylim(config["limits"][0], config["limits"][1])
    ax.set_xlabel('total training time spent (seconds)')
    ax.set_ylabel('accuracy')
    return means, stds


In [None]:
# now we can go ahead with a run.

# we can initialize an experiment like this, feel free to modify configs
# Experiment description: "data" maps an algorithm label to
# [list of finished runs, colour name]; "config" holds the settings read by
# the plotting helpers and by the loop below.
exp1 = {
    "data": 
    {
        "regularized evolution" : [[],"Very Dark Gray"],

    },
    "config": 
    {
        "budget" : int(5e5), # maximum time budget for the experiment
        "limits" : (0.93, 0.9435), # y limits for the plot
        "n" : 10, # number of runs
        "print_every" : 2, # re-plot the intermediate results every n-th run
        "confidence_intervall" : True, # plotting the confidence interval
        "pvalue" : 0.05, # pvalue when comparing multiple algorithms
        "significant_areas": False, # plotting significant areas with multiple algorithms
        "dataset" : "test", # which dataset to use for evaluation
   }
}

budget = exp1["config"]["budget"] # get budget from config
for run in range(exp1["config"]["n"]):
    nasb.reset_budget_counters() # we need to reset this before every search run
    # positional arguments: budget, population size 50, tournament size 10, mutation rate 0.72
    times, best_valid, best_test = run_revolution_search(nasb, budget, 50,10,0.72)
    # every finished run is stored as one (times, best_valid, best_test) triple
    exp1["data"]["regularized evolution"][0].append((times, best_valid, best_test))

    if (run % exp1["config"]["print_every"] == 0): # plot intermediate results
        clear_output(wait=True)
        fig, ax = plt.subplots()
        plot_all(exp1, ax)
        plt.show()
        print('Running repeat %d' % (run + 1))

# plot final result
clear_output(wait=True)
fig, ax = plt.subplots()
plot_all(exp1, ax)
plt.show()

In [None]:
# next, we will look into the NATS-Bench
from collections import OrderedDict
from nats_bench import create
from scipy.stats import t

# as this library saves its data in its individual files, we will only use 
# scripts to access and read out the results and then visualize them. 


# we expect to fetch data from ../output/search-<search_space>/<dataset>/<algorithm>/results.pth
def fetch_data(root_dir=None, search_space="tss", dataset=None, algorithms=("REA", "REINFORCE", "RANDOM", "GM", "GE", "BOHB")):
    """Load the saved search results of several algorithms.

    The results of one algorithm are read from
    ``<root_dir>-<search_space>/<dataset>/<algorithm>/results.pth``.

    Args:
        root_dir: Path prefix of the results directory. Defaults to
            ``../output/search`` relative to the current working directory.
        search_space: Suffix appended to ``root_dir`` with a dash.
        dataset: Name of the dataset sub-directory. Required.
        algorithms: Names of the algorithm sub-directories to load.

    Returns:
        ``OrderedDict`` mapping each algorithm name to its loaded results.
        Every run record additionally gets a ``"time_w_arch"`` entry: the list
        of ``(total_time, arch)`` pairs built from ``"all_total_times"`` and
        ``"all_archs"``.

    Raises:
        ValueError: if ``dataset`` is not given.
        AssertionError: if a results file is missing or a run contains the
            architecture ``-1``.
    """
    if dataset is None:
        raise ValueError("dataset is required to locate the result files")
    if root_dir is None:
        root_dir = os.path.join("..", "output", "search")
    ss_dir = "{:}-{:}".format(root_dir, search_space)

    # Resolve and check every path first so nothing is loaded when a file is missing.
    alg2path = OrderedDict()
    for alg in algorithms:
        alg2path[alg] = os.path.join(ss_dir, dataset, alg, "results.pth")
        assert os.path.isfile(alg2path[alg]), "invalid path : {:}".format(alg2path[alg])

    alg2data = OrderedDict()
    for alg, path in alg2path.items():
        # NOTE: torch.load unpickles the file; only use it on result files you
        # produced yourself.
        data = torch.load(path)
        for index, info in data.items():
            info["time_w_arch"] = list(zip(info["all_total_times"], info["all_archs"]))
            for j, arch in enumerate(info["all_archs"]):
                assert arch != -1, "invalid arch from {:} {:} {:} ({:}, {:})".format(
                    alg, search_space, dataset, index, j
                )
        alg2data[alg] = data
    return alg2data


def query_performance(api, data, dataset, ticket):
    """Estimate the test accuracy reached at time ``ticket``, averaged over runs.

    For every run the two recorded ``(time, arch)`` points closest to
    ``ticket`` are looked up and their test accuracies are combined linearly
    according to their time distance to ``ticket``.

    Args:
        api: Benchmark API offering ``search_space_name`` and ``get_more_info``.
        data: Mapping of run index to a record with a ``"time_w_arch"`` list.
        dataset: Dataset name passed on to ``api.get_more_info``.
        ticket: Point in time to evaluate.

    Returns:
        ``(mean, std)`` of the per-run estimates.
    """
    # The "size" search space is queried with hp=90, every other one with hp=200.
    hp = 90 if api.search_space_name == "size" else 200

    def test_accuracy(arch):
        """Look up the test accuracy of one architecture."""
        info = api.get_more_info(arch, dataset=dataset, hp=hp, is_random=False)
        return info["test-accuracy"]

    results = []
    for info in data.values():
        nearest = sorted(info["time_w_arch"], key=lambda x: abs(x[0] - ticket))[:2]
        time_a, arch_a = nearest[0]
        accuracy_a = test_accuracy(arch_a)

        if len(nearest) < 2 or nearest[1][0] == time_a:
            # Only one usable point (single record, or two records with the
            # same timestamp): there is no slope, so use the closest accuracy.
            results.append(accuracy_a)
            continue

        time_b, arch_b = nearest[1]
        accuracy_b = test_accuracy(arch_b)
        # Line through both points evaluated at `ticket`; when both points lie
        # on the same side of `ticket` this extrapolates.
        interplate = (time_b - ticket) / (time_b - time_a) * accuracy_a + (
            ticket - time_a
        ) / (time_b - time_a) * accuracy_b
        results.append(interplate)
    return np.mean(results), np.std(results)


def show_valid_test(api, data, dataset):
    """Summarise validation and test accuracy of the last architecture of each run.

    Args:
        api: Benchmark API offering ``search_space_name`` and ``get_more_info``.
        data: Mapping of run index to a record with a ``"time_w_arch"`` list.
        dataset: Dataset name. For ``"cifar10"`` the validation accuracy is
            queried from ``"cifar10-valid"``; for every other dataset both
            accuracies come from the same query.

    Returns:
        ``(valid_str, test_str)``, each formatted as ``mean$\\pm$std`` with two
        decimals (LaTeX plus/minus sign).
    """
    # The "size" search space is queried with hp=90, every other one with hp=200.
    hp = 90 if api.search_space_name == "size" else 200

    valid_accs, test_accs = [], []
    for info in data.values():
        _, arch = info["time_w_arch"][-1]
        xinfo = api.get_more_info(arch, dataset=dataset, hp=hp, is_random=False)
        test_accs.append(xinfo["test-accuracy"])
        if dataset == "cifar10":
            xinfo = api.get_more_info(
                arch, dataset="cifar10-valid", hp=hp, is_random=False
            )
        valid_accs.append(xinfo["valid-accuracy"])

    # Raw string: "\p" is not a valid escape sequence in a normal string literal.
    template = r"{:.2f}$\pm${:.2f}"
    valid_str = template.format(np.mean(valid_accs), np.std(valid_accs))
    test_str = template.format(np.mean(test_accs), np.std(test_accs))
    return valid_str, test_str


# (lower, upper) value pairs per search space ("tss", "sss") and dataset name.
# NOTE(review): presumably y-axis limits in percent accuracy; no function in
# the visible cells reads this nested form, and the visualisation cell further
# down rebinds `ylims` to a flat per-dataset dict - confirm it is still needed.
ylims ={
    "tss" : {
        "cifar10": (91,94.3),
        "cifar100": (69,75.0),
        "ImageNet16-120": (42,52)
    },
    "sss" : {
        "cifar10": (92,93.3),
        "cifar100": (65,70.5),
        "ImageNet16-120": (40,46)
    }
}



def visualize_curve(api, exp, dataset, ax, search_space="tss"):
    """Plot accuracy over search time for every algorithm of a NATS-Bench experiment.

    Args:
        api: Benchmark API handed to ``query_performance``.
        exp: Experiment dict. ``exp["data"]`` maps a legend label to a list
            whose first item is the algorithm's results directory name and
            whose second item is a colour name from the palette below.
            ``exp["config"]`` provides ``"limits"`` (per-dataset y limits),
            ``"confidence_intervall"`` and, when that is ``True``, ``"pvalue"``.
        dataset: ``"<dataset name>-T<time budget>"``; the part after ``-T``
            sets the upper x limit and the evaluated time range.
        ax: Axes to draw on.
        search_space: Search space name forwarded to ``fetch_data``.
    """
    tc = {
        'Dark Blue': '#006BA4',
        'Blue': '#5F9ED1',
        'Light Blue': '#A2C8EC',
        'Dark Orange': '#FF800E',
        'Red Orange': '#C85200',
        'Light Orange': '#FFBC79',
        'Very Dark Gray': '#595959',
        'Dark Gray': '#898989',
        'Light Gray': '#ABABAB',
        'Very Light Gray': '#CFCFCF',
        "Green" : '#228B22',
        "Red" : '#FF4500' 
    }
    name2label = {
        "cifar10": "CIFAR-10",
        "cifar100": "CIFAR-100",
        "ImageNet16-120": "ImageNet-16-120",
    }
    total_tickets = 150

    # to make it easier for different experiments, the budget is encoded in the name
    xdataset, max_time = dataset.split("-T")
    max_time = int(max_time)
    config = exp["config"]
    use_ci = config["confidence_intervall"] is True

    labels = list(exp["data"].keys())
    entries = list(exp["data"].values())
    algorithms = [entry[0] for entry in entries]
    colors = [tc[entry[1]] for entry in entries]

    alg2data = fetch_data(search_space=search_space, dataset=dataset, algorithms=algorithms)
    # Evenly spaced evaluation times in [0, max_time).
    time_tickets = [float(i) / total_tickets * max_time for i in range(total_tickets)]

    ax.set_xlim(0, max_time)
    ax.set_ylim(config["limits"][xdataset][0], config["limits"][xdataset][1])

    for idx, data in enumerate(alg2data.values()):
        repeats = len(data)
        if use_ci:
            # Two-sided critical value; it depends only on the number of runs,
            # so it is computed once per algorithm.
            confidence = 1 - config["pvalue"]
            t_critical = t.ppf(confidence + (1 - confidence) / 2, repeats - 1)

        accuracies, ci_lower, ci_upper = [], [], []
        for ticket in time_tickets:
            cur_mean, cur_std = query_performance(api, data, xdataset, ticket)
            accuracies.append(cur_mean)
            if use_ci:
                sem = cur_std / np.sqrt(repeats)  # Standard Error
                margin_of_error = t_critical * sem
                ci_lower.append(cur_mean - margin_of_error)
                ci_upper.append(cur_mean + margin_of_error)

        ax.plot(
            time_tickets,
            accuracies,
            c=colors[idx],
            label="{:}".format(labels[idx]),
        )
        if use_ci:
            ax.fill_between(time_tickets, ci_lower, ci_upper, alpha=0.5, linewidth=0, facecolor=colors[idx])

    # Axis decoration does not depend on the algorithm, so it is applied once.
    ax.set_ylabel("accuracy")
    ax.set_xlabel('total training time spent (seconds)')
    # Unknown dataset names are shown as they are instead of raising KeyError.
    ax.set_title("NATS-Bench results on {:}".format(name2label.get(xdataset, xdataset)))
    formatter = ticker.ScalarFormatter(useMathText=True)
    formatter.set_powerlimits((1,4))
    ax.xaxis.set_major_formatter(formatter)

    ax.legend(loc=4)
    print("sub-plot {:} on {:} done.".format(dataset, search_space))
    




In [None]:
# we can start define experiments from the provided script in our autodl submodule
import subprocess


def run_algorithm(algname, dataset, budget, loops): 
    """Run one search script as a subprocess and return its result.

    The script is started with the interpreter that runs this notebook, and
    its results are written below ``../output/search`` (``--save_dir``). All
    script paths are relative, so they resolve against the current working
    directory.

    Args:
        algname: Key of the script table below.
        dataset: Value passed as ``--dataset``.
        budget: Value passed as ``--time_budget``.
        loops: Value passed as ``--loops_if_rand``.

    Returns:
        The ``subprocess.CompletedProcess`` of the finished script, with
        captured ``stdout`` and ``stderr``.

    Raises:
        KeyError: if ``algname`` is not in the script table.
    """
    import sys  # local import: only needed to find the running interpreter

    nats_algos = os.path.join("..", "thirdparty", "autodl", "exps", "NATS-algos")
    algorithms = {

        # custom scripts like this
        # "custom evolution": os.path.join("custom_evolution.py"),
        "GENE": os.path.join("4_GENE.py"),

        # baselines provided from autodl lib
        "regularized evolution": os.path.join(nats_algos, "regularized_ea.py"),
        "random": os.path.join(nats_algos, "random_wo_share.py"),
        "reinforce": os.path.join(nats_algos, "reinforce.py"),
        "bohb": os.path.join(nats_algos, "bohb.py"),

    }
    # Use os.path.join to construct the save_dir
    save_dir = os.path.join("..", "output", "search")

    print(f"Running algorithm {algname} on {dataset}...")
    command = [
        # sys.executable guarantees the script runs in the kernel's own
        # environment; a bare "python" may resolve to a different interpreter.
        sys.executable,
        algorithms[algname],
        "--save_dir", save_dir, 
        "--dataset", dataset,
        "--search_space", "tss",
        "--time_budget", str(budget),
        "--loops_if_rand", str(loops),
    ]

    # Algorithm-specific hyperparameters
    if algname in ("regularized evolution", "regularized evolution gm"):
        command += ["--ea_cycles", "200","--ea_population", "20","--ea_sample_size", "10"]

    if algname=="bohb":
        command += ["--num_samples", "4", "--random_fraction", "0.0", "--bandwidth_factor","3"]

    if algname=="reinforce":
        command += ["--learning_rate", "0.01"]

    # Run the command
    result = subprocess.run(command, capture_output=True, text=True)

    # A failing script would otherwise go unnoticed because its output is captured.
    if result.returncode != 0:
        print(f"Algorithm {algname} on {dataset} exited with code {result.returncode}:\n{result.stderr}")

    return result


In [None]:
# this is how we run the algorithms from nats-bench

# Names must be keys of the script table inside run_algorithm.
algorithms = [
    "regularized evolution",
    "random",
    # "reinforce",

]
# dataset name -> time budget handed to the search script as --time_budget
datasets = {
    "cifar10" : 2000, # time budget
    # "cifar100" : 400000,
    # "ImageNet16-120": 120000,
}

n = 5 # passed to run_algorithm as `loops` (forwarded as --loops_if_rand)

# One subprocess per (algorithm, dataset) pair, executed one after another.
for algorithm in algorithms:
    for dataset, budget in datasets.items():
        run_algorithm(algorithm, dataset, budget, n)

In [None]:
# now we can visualize our experiments

search_space = "tss"
api = create(None, search_space, fast_mode=True, verbose=False)

budget = int(2000) # this has to match with the experiment from above
# visualize_curve splits this name at "-T": the first part is the dataset, the
# second part is the time budget that sets the x range of the plot.
datasets = [f"cifar10-T{budget}"]
# (lower, upper) y-axis limits per dataset; handed to visualize_curve through
# exp1["config"]["limits"].
ylims = {
        "cifar10": (91,95.3),
        "cifar100": (69,75.0),
        "ImageNet16-120": (42,52)}


# "data" maps a legend label to [results directory name of the algorithm, colour name].
exp1 = {
    "data": 
    {
        "regularized evolution" : ["R-EA-SS10","Dark Blue"],
        # "reinforce" : ["REINFORCE-0.01","Light Gray"],
        "random" : ["RANDOM","Red Orange"],
        # "bohb" : ["BOHB","Dark Orange"],

    },
    # visualize_curve only reads "limits", "confidence_intervall" and "pvalue"
    # from this config; the remaining keys mirror the NAS-Bench-101 experiment.
    "config": 
    {
        "budget" : int(2e4),
        "limits" : ylims,
        "n" : 1000,
        "print_every" : 5,
        "confidence_intervall" : True,
        "pvalue" : 0.05,
        "significant_areas": False,
        "dataset" : "test",
   }
}

# one plot
fig, ax = plt.subplots()
visualize_curve(api, exp1, datasets[0], ax)
plt.show()

