In [1]:
import os
import random
import pickle

import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Input and output directories for every CSV handled in this notebook.
# An empty string makes os.path.join(...) return the bare file name, i.e. the
# files are resolved relative to the current working directory.
source_path, target_path = '', ''

Prepare einspace data
- split by seed

In [4]:
# Load the base einspace table and its augmentation table from `source_path`.
einspace_csv = os.path.join(source_path, 'einspace.csv')
einspace_aug_csv = os.path.join(source_path, 'einspace_augmentation.csv')

einspace = pd.read_csv(einspace_csv)
einspace_aug = pd.read_csv(einspace_aug_csv)

In [5]:
# Show the column names of the freshly loaded einspace table.
einspace.columns

Index(['onnx_encoding', 'accuracy', 'onnx_encoding_tokens', 'dataset', 'name'], dtype='object')

In [6]:
# The seed tag is whatever precedes the first underscore in 'name'
# (it is compared against values such as 'seed=4' when splitting).
einspace['seed'] = einspace['name'].map(lambda name: name.split('_', 1)[0])

In [7]:
# Rows tagged 'seed=4' form the validation set. Training takes every other
# seed except 'seed=0', so 'seed=0' rows end up in neither split.
einspace_val = einspace[einspace['seed'].eq('seed=4')]
einspace_train = einspace[~einspace['seed'].isin(['seed=4', 'seed=0'])]

In [8]:
# Append the augmentation rows to the training split, then keep only rows whose
# accuracy is above 2. The filter is applied to the training split only.
# NOTE: this cell reads and reassigns `einspace_train`, so running it a second
# time appends the augmentation rows again.
einspace_train = (
    pd.concat([einspace_train, einspace_aug], ignore_index=True)
    .loc[lambda frame: frame['accuracy'] > 2]
)

In [9]:
# Report (rows, columns) of both einspace splits.
for label, frame in (
    ("Training set shape:", einspace_train),
    ("Validation set shape:", einspace_val),
):
    print(label, frame.shape)

Training set shape: (37416, 6)
Validation set shape: (1582, 6)


In [10]:
# Write both einspace splits to `target_path`, without the DataFrame index.
einspace_train_csv = os.path.join(target_path, 'einspace_train.csv')
einspace_val_csv = os.path.join(target_path, 'einspace_val.csv')

einspace_train.to_csv(einspace_train_csv, index=False)
einspace_val.to_csv(einspace_val_csv, index=False)

Prepare hnasbench201 data
- split by seed

In [11]:
# Load the hnasbench201 table from `source_path`.
hnasbench201_csv = os.path.join(source_path, 'hnasbench201.csv')
hnasbench201 = pd.read_csv(hnasbench201_csv)

In [12]:
# 'name' is underscore-separated: the first field becomes 'seed', the second 'index'.
hnasbench201['seed'] = hnasbench201['name'].map(lambda name: name.split('_', 1)[0])
hnasbench201['index'] = hnasbench201['name'].map(lambda name: name.split('_', 2)[1])

In [13]:
# 'seed=4' rows are held out for validation; all other seeds go to training.
# Training additionally keeps only rows with accuracy above 2 (validation is
# left unfiltered).
hnasbench201_val = hnasbench201[hnasbench201['seed'].eq('seed=4')]
hnasbench201_train = hnasbench201[
    hnasbench201['seed'].ne('seed=4') & hnasbench201['accuracy'].gt(2)
]

In [14]:
# Report (rows, columns) of both hnasbench201 splits.
for label, frame in (
    ("Training set shape:", hnasbench201_train),
    ("Validation set shape:", hnasbench201_val),
):
    print(label, frame.shape)

Training set shape: (6403, 7)
Validation set shape: (1000, 7)


In [15]:
# Write both hnasbench201 splits to `target_path`, without the DataFrame index.
hnasbench201_train_csv = os.path.join(target_path, 'hnasbench201_train.csv')
hnasbench201_val_csv = os.path.join(target_path, 'hnasbench201_val.csv')

hnasbench201_train.to_csv(hnasbench201_train_csv, index=False)
hnasbench201_val.to_csv(hnasbench201_val_csv, index=False)

Prepare nasbench201 + natsbench data
- these two search spaces are considered together
- random split by 80%/20%

In [16]:
# Load the nasbench201 and natsbench tables from `source_path`.
nasbench201_csv = os.path.join(source_path, 'nasbench201.csv')
natsbench_csv = os.path.join(source_path, 'natsbench.csv')

nasbench201 = pd.read_csv(nasbench201_csv)
natsbench = pd.read_csv(natsbench_csv)

In [17]:
# Stack both tables into one frame with a fresh 0..n-1 index, then split it
# randomly: 20% of the rows go to validation. random_state=42 fixes the
# shuffle so the split is the same on every run.
nas201nats = pd.concat([nasbench201, natsbench], ignore_index=True)
nas201nats_splits = train_test_split(nas201nats, test_size=0.2, random_state=42)
nas201nats_train, nas201nats_val = nas201nats_splits

In [18]:
# Report (rows, columns) of both nas201nats splits.
for label, frame in (
    ("Training set shape:", nas201nats_train),
    ("Validation set shape:", nas201nats_val),
):
    print(label, frame.shape)

Training set shape: (38714, 5)
Validation set shape: (9679, 5)


In [19]:
# Write both nas201nats splits to `target_path`, without the DataFrame index.
nas201nats_train_csv = os.path.join(target_path, 'nas201nats_train.csv')
nas201nats_val_csv = os.path.join(target_path, 'nas201nats_val.csv')

nas201nats_train.to_csv(nas201nats_train_csv, index=False)
nas201nats_val.to_csv(nas201nats_val_csv, index=False)

Prepare nasbench101
- split based on hash file

In [20]:
# Load the pickled collection of hashes used to select validation rows.
# NOTE(review): pickle.load can execute arbitrary code while unpickling -- only
# use an 'nb101_hash.txt' that comes from a trusted source.
# The hash file is opened relative to the working directory, not `source_path`.
with open("nb101_hash.txt", "rb") as hash_file:
    nb101_hash = pickle.load(hash_file)

# Load the nasbench101 table from `source_path`.
nasbench101_csv = os.path.join(source_path, 'nasbench101.csv')
nasbench101 = pd.read_csv(nasbench101_csv)

In [21]:
# The hash is whatever follows the last underscore in 'name'
# (the whole name when it contains no underscore).
nasbench101['hash'] = nasbench101['name'].map(lambda name: name.rpartition('_')[2])

In [22]:
# Rows whose hash appears in `nb101_hash` form the validation set; every other
# row goes to training. The membership mask is computed once and reused.
nasbench101_is_val = nasbench101['hash'].isin(nb101_hash)
nasbench101_val = nasbench101[nasbench101_is_val]
nasbench101_train = nasbench101[~nasbench101_is_val]

In [23]:
# Report (rows, columns) of both nasbench101 splits.
for label, frame in (
    ("Training set shape:", nasbench101_train),
    ("Validation set shape:", nasbench101_val),
):
    print(label, frame.shape)

Training set shape: (416334, 6)
Validation set shape: (7290, 6)


In [24]:
# Write both nasbench101 splits to `target_path`, without the DataFrame index.
nasbench101_train_csv = os.path.join(target_path, 'nasbench101_train.csv')
nasbench101_val_csv = os.path.join(target_path, 'nasbench101_val.csv')

nasbench101_train.to_csv(nasbench101_train_csv, index=False)
nasbench101_val.to_csv(nasbench101_val_csv, index=False)

In [25]:
# Draw a fixed 50,000-row subset of the training split and save it as its own
# file. The subset gets its own name so `nasbench101_train` keeps referring to
# the full training split: reassigning it here made the notebook order-dependent
# (re-running the 'nasbench101_train.csv' export after this cell would have
# written the 50k subset instead of the full split).
# Sampling is without replacement, so this raises ValueError if the training
# split has fewer than 50,000 rows.
nasbench101_train_50k = (
    nasbench101_train
    .sample(n=50_000, random_state=42)  # fixed seed -> same subset on every run
    .reset_index(drop=True)
)
nasbench101_train_50k.to_csv(os.path.join(target_path, 'nasbench101_50k.csv'), index=False)

Prepare nasbench301
- split based on source

In [26]:
# Load the nasbench301 table from `source_path`.
nasbench301_csv = os.path.join(source_path, 'nasbench301.csv')
nasbench301 = pd.read_csv(nasbench301_csv)

In [None]:
def list_onnx_files(folder_path):
    """
    Return a sorted list of all .onnx file paths found within 'folder_path' (recursively).

    Args:
        folder_path: Directory to search; every sub-directory is visited.

    Returns:
        list[str]: One path per file whose name ends in '.onnx' (the extension
        check is case-insensitive), each built by joining the directory it was
        found in with the file name. The list is sorted so the result does not
        depend on the order in which the file system lists entries. A folder
        that does not exist yields an empty list, because os.walk ignores
        listing errors by default.
    """
    onnx_files = [
        os.path.join(root, file_name)
        for root, _dirs, files in os.walk(folder_path)
        for file_name in files
        if file_name.lower().endswith('.onnx')
    ]
    return sorted(onnx_files)

# Collect every .onnx file below the nasbench301 export folder
# (path is relative to the working directory).
nasbench301_onnx_dir = '../onnx/nasbench301'
onnx_files = list_onnx_files(nasbench301_onnx_dir)

In [28]:
# Map each file's index (its file name up to the first '.') to its source, i.e.
# the name of the folder that directly contains the file.
# If the same index occurs under several folders, the last path processed wins.
index_source_map = {}
for onnx_path in onnx_files:
    # Use os.path helpers rather than splitting on a literal '/': the paths are
    # built with os.path.join, which uses the platform separator, so a '/'
    # split returns the wrong components on Windows.
    index = os.path.basename(onnx_path).split('.')[0]
    source = os.path.basename(os.path.dirname(onnx_path))
    index_source_map[index] = source

In [29]:
# Look up each row's source folder via its 'name' (converted to str to match the
# map's string keys). A name with no entry in the map raises KeyError.
nasbench301['source'] = nasbench301['name'].map(lambda name: index_source_map[str(name)])

In [30]:
# Sources whose rows are held out as the nasbench301 validation set.
val_sources = [
    'only_avg_pool_3x3',
    'local_search',
    'bananas',
]

In [31]:
# Disabled alternative: pick three validation sources at random instead of using
# the fixed `val_sources` list. No random seed is set in the visible cells, so
# enabling this would presumably give a different split on each run.
# val_sources = random.sample(list(nasbench301['source'].unique()), 3)
# val_sources

In [32]:
# Rows from the held-out sources form the validation set; all remaining rows go
# to training. The membership mask is computed once and reused.
nasbench301_is_val = nasbench301['source'].isin(val_sources)
nasbench301_train = nasbench301[~nasbench301_is_val]
nasbench301_val = nasbench301[nasbench301_is_val]

In [33]:
# Report (rows, columns) of both nasbench301 splits.
for label, frame in (
    ("Training set shape:", nasbench301_train),
    ("Validation set shape:", nasbench301_val),
):
    print(label, frame.shape)

Training set shape: (51297, 6)
Validation set shape: (5892, 6)


In [34]:
# Write both nasbench301 splits to `target_path`, without the DataFrame index.
nasbench301_train_csv = os.path.join(target_path, 'nasbench301_train.csv')
nasbench301_val_csv = os.path.join(target_path, 'nasbench301_val.csv')

nasbench301_train.to_csv(nasbench301_train_csv, index=False)
nasbench301_val.to_csv(nasbench301_val_csv, index=False)