In [30]:
# Enumerate every binary string of length n and bucket the strings by
# composition, i.e. by the number of '1' characters they contain.
n = 20
# When False, a string and its reverse are treated as the same sequence and
# only the lexicographically smaller of the two is kept.
allow_symmetry = False
all_seq_by_frac = {k: set() for k in range(n+1)}
limit = 2**n
for i in range(limit):
    sequence = format(i, f'0{n}b')  # zero-padded binary representation of i
    mirror_sequence = sequence[::-1]
    if allow_symmetry or sequence <= mirror_sequence:
        all_seq_by_frac[sequence.count('1')].add(sequence)

# Create a master list of all possible sequences.
# Each bucket is a set, whose iteration order can differ between interpreter
# sessions (string hashing is randomized), so every bucket is sorted before it
# is appended. The result is ordered by composition, then lexicographically.
all_sequences = []
for k, v in all_seq_by_frac.items():
    all_sequences.extend(sorted(v))
print(f'generated {len(all_sequences)} sequences')

# Candidate pool: the sequences containing exactly 8 '1' characters.
possible_sequences = sorted(all_seq_by_frac[8])  # without sort the order is NOT guaranteed
print(f'choosing from {len(possible_sequences)} sequences')

generated 524800 sequences
choosing from 63090 sequences


In [31]:
import os
import torch

# Load the ten scripted fold models from disk onto the CPU and switch each one
# to evaluation mode; together they form the prediction ensemble.
model_path = os.path.join('models', 'gru-opt-cv10-sym')
model_ensemble = []
for i in range(10):
    checkpoint = os.path.join(model_path, f'fold-{i:02d}-scripted.pt')
    model = torch.jit.load(checkpoint, map_location='cpu')
    model.eval()
    model_ensemble.append(model)

In [33]:
import json
import model_utils, message_utils
import numpy as np
from target_defs import archetype_predictions, archetype_sequences
import time


# Random-sampling baseline: for every archetype morphology, run several
# independently seeded repetitions in which sequences are drawn at random from
# `possible_sequences`, evaluated against the morphology's target with the
# model ensemble, and logged to a JSON file in a chat-message layout.
n_batch = 5   # sequences evaluated per iteration
n_iter = 10   # iterations per repetition (the first one uses the initial batch)
batch_prompt = "Here\n<result>\nNote"  # an empty/fake prompt to facilitate the message_utils
use_seed = False  # if True, the first initial sequence is replaced by the morphology's archetype

arch_morphs = list(archetype_predictions.keys())
for morph in arch_morphs:

    print(f'computing results for {morph}...')
    target = archetype_predictions[morph]

    params = {'n_batch': n_batch,
              'target': target.tolist(),
              'morph': morph,
              'use_seed': use_seed,
              }

    # Taken once per morphology, so all repetitions of a morphology share the
    # same timestamp in their file names.
    start_time = time.time()
    for ridx in range(5):

        # Placeholder first message; one result message per iteration is appended below.
        fake_payload = [{"role": "user", "content": [{"type": "text", "text": "N/A"}]}]

        # The repetition index doubles as the RNG seed, making each repetition reproducible.
        rng = np.random.RandomState(ridx)
        init_idx = rng.choice(np.arange(len(possible_sequences)), n_batch, replace=False)
        init_bitstr = [possible_sequences[it] for it in init_idx]

        if use_seed:
            # NOTE(review): the archetype sequence is not removed from the
            # rollout pool below, so it could be drawn again later, while the
            # sequence it replaces stays excluded. Confirm this is intended.
            init_bitstr[0] = archetype_sequences[morph]
        init_sequences = [it.replace('0', 'A').replace('1', 'B') for it in init_bitstr]

        # Uniform probability over every sequence that is not in the initial
        # batch, so the rollout never repeats an initial draw.
        is_avail = np.ones(len(possible_sequences))
        is_avail[init_idx] = 0
        p_seq = is_avail / np.sum(is_avail)

        # Draw all remaining batches at once, without replacement.
        rollout_bitstr = rng.choice(possible_sequences, n_batch*(n_iter-1), p=p_seq, replace=False)
        rollout_sequences = [it.replace('0', 'A').replace('1', 'B') for it in rollout_bitstr]

        # n_batch * n_iter sequences split evenly into n_iter batches.
        seq_by_iter = np.array_split(init_sequences + rollout_sequences, n_iter)

        for i in range(n_iter):
            out = model_utils.evaluate_sequences(seq_by_iter[i], target, model_ensemble)
            fake_payload.append(message_utils.build_user_message(batch_prompt, out))

        param_hash = message_utils.hash_dict(params)
        buffer = {'params': params, 'messages': fake_payload}
        suffix = str(ridx)
        seed_hash = 'seeded' if use_seed else 'unseeded'
        logdir = f'data/corrected-batches/{seed_hash}/random/{morph}/'
        # The log directory is several levels deep; makedirs creates any missing
        # parent directories and does not fail if the directory already exists
        # (os.mkdir raised FileNotFoundError when a parent was missing).
        os.makedirs(logdir, exist_ok=True)
        # NOTE(review): the repetition index is appended directly after the
        # float timestamp with no separator; kept unchanged so that file names
        # stay consistent with previously written logs.
        logfile = os.path.join(logdir, f'random-{param_hash}-{start_time}{suffix}.json')
        with open(logfile, 'w') as fid:
            json.dump(buffer, fid)

computing results for liquid...
computing results for membrane...
computing results for spherical micelle...
computing results for string...
computing results for vesicle...
computing results for wormlike micelle...
