In [1]:
import inspect
import json
import os
from collections import namedtuple
from functools import partial
from pathlib import Path
from typing import Any, Dict, List
from uuid import uuid4

import esm

from pgen import utils
from pgen.sampler_1 import Sampler_1

## Experiment Driver Code

In [2]:
# Immutable record identifying one checkpoint: the model, the dataset, the
# training run, and the checkpoint file name within that run.
ModelMetadata = namedtuple(
    "ModelMetadata",
    ("model_id", "dataset_id", "model_run_id", "model_checkpoint_id"),
)

class SamplerWrapper:
    """Thin adapter around a sampler object.

    Keeps a reference to the wrapped sampler, exposes the sampler's class name
    as ``id``, and forwards generation requests to it.
    """

    def __init__(self, sampler):
        self.sampler = sampler
        # The wrapper is identified by the concrete class name of what it wraps.
        self.id = sampler.__class__.__name__

    @staticmethod
    def dynamic_config_update(sampler_config, seed):
        """Store ``seed`` under ``'seed_seq'`` in ``sampler_config``.

        The dict is modified in place and the same object is returned.
        """
        sampler_config['seed_seq'] = seed
        return sampler_config

    def generate(self, sample_config):
        """Call the wrapped sampler's ``generate`` with ``sample_config`` expanded as keyword arguments."""
        wrapped = self.sampler
        return wrapped.generate(**sample_config)
    
class Wrapped_Sampler1(SamplerWrapper):
    """``SamplerWrapper`` for ``Sampler_1`` that supports seed-dependent config entries.

    Construction is inherited unchanged from ``SamplerWrapper``; use
    ``construct`` to build the underlying ``Sampler_1`` and wrap it in one step.
    """

    @staticmethod
    def dynamic_config_update(sampler_config, seed, dynamics=None):
        """Fill in seed-dependent entries of ``sampler_config`` in place.

        Args:
            sampler_config: dict of sampler keyword arguments; mutated and returned.
            seed: seed value passed to every callable in ``dynamics`` and stored
                under ``'seed_seq'``.
            dynamics: optional mapping of config key -> callable taking ``seed``.
                Defaults to no dynamic entries, which keeps this override
                call-compatible with ``SamplerWrapper.dynamic_config_update``.

        Returns:
            The same ``sampler_config`` object.
        """
        for key, compute_value in (dynamics or {}).items():
            sampler_config[key] = compute_value(seed)

        # 'seed_seq' is written last, so it wins over a dynamics entry with the same key.
        return SamplerWrapper.dynamic_config_update(sampler_config, seed)

    @classmethod
    def construct(cls, wrapped_model, device="cpu"):
        """Build a ``Sampler_1`` from ``wrapped_model`` and ``device`` and return it wrapped in ``cls``."""
        return cls(Sampler_1(wrapped_model, device))

def assert_is_file(file: Path):
    """Raise ``AssertionError`` unless ``file`` is an existing regular file.

    The check is an explicit ``raise`` rather than an ``assert`` statement so it
    still runs when Python is started with ``-O`` (which removes ``assert``).

    Args:
        file: path to check.

    Raises:
        AssertionError: if ``os.path.isfile(file)`` is false.
    """
    if not os.path.isfile(file):
        raise AssertionError(f"File does not exist {file}")

def assert_is_directory(directory: Path):
    """Raise ``AssertionError`` unless ``directory`` is an existing directory.

    The check is an explicit ``raise`` rather than an ``assert`` statement so it
    still runs when Python is started with ``-O`` (which removes ``assert``).

    Args:
        directory: path to check.

    Raises:
        AssertionError: if ``os.path.isdir(directory)`` is false.
    """
    if not os.path.isdir(directory):
        raise AssertionError(f"Path does not exist {directory}")

class ModelArtifactsContainer:
    """Resolves and validates the on-disk location of one model checkpoint.

    Expected layout:
    ``<models_dir>/<model_id>/<dataset_id>/<model_run_id>/<model_checkpoint_id>``.
    Construction fails (through the ``assert_is_*`` helpers) when the models
    directory, the run directory, or the checkpoint file is missing.
    """

    def __init__(self, models_dir: Path, model_metadata: ModelMetadata):
        assert_is_directory(models_dir)
        self.models_dir = models_dir
        self.model_metadata = model_metadata

        # Run directory: one folder level per identifying field of the metadata.
        meta = model_metadata
        run_dir = Path(f'{models_dir}/{meta.model_id}/{meta.dataset_id}/{meta.model_run_id}')
        self.model_artifacts_dir = run_dir
        assert_is_directory(run_dir)

        # The checkpoint itself lives directly inside the run directory.
        checkpoint_path = run_dir/meta.model_checkpoint_id
        self.model_checkpoint_path = checkpoint_path
        assert_is_file(checkpoint_path)
        
class ESMLocal:
    """Loads a local ESM checkpoint and keeps the model, alphabet and batch converter together."""

    def __init__(self, model_artifacts: ModelArtifactsContainer):
        # The esm loader is given the checkpoint location as a plain string.
        checkpoint_location = str(model_artifacts.model_checkpoint_path)
        loaded = esm.pretrained.load_model_and_alphabet_local(checkpoint_location)
        self.model, self.alphabet = loaded
        self.model_artifacts = model_artifacts
        self.batch_converter = self.alphabet.get_batch_converter()

class ProTransLocal:
    """Holds a locally loaded model, its alphabet and batch converter.

    NOTE(review): this currently loads through the same ``esm.pretrained``
    loader as ``ESMLocal`` -- confirm whether a ProtTrans-specific loader was intended.
    """

    def __init__(self, model_artifacts: ModelArtifactsContainer):
        # The loader is given the checkpoint location as a plain string.
        checkpoint_location = str(model_artifacts.model_checkpoint_path)
        loaded = esm.pretrained.load_model_and_alphabet_local(checkpoint_location)
        self.model, self.alphabet = loaded
        self.model_artifacts = model_artifacts
        self.batch_converter = self.alphabet.get_batch_converter()
        
class SequenceGenerator:
    """Runs generation trials against one or more models and writes the results to disk.

    For every (model, trial, seed) combination ``generate_from_esm`` writes a
    ``.fasta`` file of sampled sequences and a ``*_config.txt`` file describing
    the model, trial and sampler configuration, both under
    ``<model_artifacts_dir>/generated_sequences``.
    """

    def __init__(self):
        pass

    @staticmethod
    def write_metadata(output_dir, generation_guid, model_artifacts: "ModelArtifactsContainer"):
        """Write ``<generation_guid>_experiment_metadata.txt`` into ``output_dir``.

        Args:
            output_dir: directory path supporting the ``/`` operator (e.g. ``pathlib.Path``).
            generation_guid: identifier used as the file-name prefix.
            model_artifacts: object whose ``model_metadata`` provides ``model_id``,
                ``dataset_id``, ``model_run_id`` and ``model_checkpoint_id``.
        """
        metadata_file = output_dir/f"{generation_guid}_experiment_metadata.txt"
        model_metadata = model_artifacts.model_metadata

        # Format first so a formatting error cannot leave an empty file behind.
        metadata = inspect.cleandoc(f"""
        Experiment Metadata:

        Model ID         = {model_metadata.model_id}
        Dataset ID       = {model_metadata.dataset_id}
        Model Run ID     = {model_metadata.model_run_id}
        Model Checkpoint = {model_metadata.model_checkpoint_id} 
        """)

        with open(metadata_file, "w") as fpmd:
            fpmd.write(metadata)

    @staticmethod
    def write_generation_metadata(generation_metadata_filename, gen_id, trial_id, sampler, seed_id, sample_config, model_artifacts, trial_description):
        """Write the per-(trial, seed) description file.

        Args:
            generation_metadata_filename: path of the file to (over)write.
            gen_id: generation identifier shared by all outputs of one model run.
            trial_id: label of the trial (e.g. ``"trial1"``).
            sampler: object exposing an ``id`` attribute.
            seed_id: label of the seed being used.
            sample_config: JSON-serialisable sampler configuration.
            model_artifacts: object whose ``model_metadata`` provides the model fields.
            trial_description: free-text description of the trial.

        Raises:
            TypeError: if ``sample_config`` cannot be serialised by ``json.dumps``.
        """
        model_metadata = model_artifacts.model_metadata

        # Format first so a serialisation error cannot leave an empty file behind.
        metadata = inspect.cleandoc(f"""
        Model Metadata:

        Model ID         = {model_metadata.model_id}
        Dataset ID       = {model_metadata.dataset_id}
        Model Run ID     = {model_metadata.model_run_id}
        Model Checkpoint = {model_metadata.model_checkpoint_id}

        Generation Metadata:

        Generation ID    = {gen_id}
        Trial Desc       = {trial_description}
        Trial ID         = {trial_id}
        Seed ID          = {seed_id}

        Sampler ID       = {sampler.id}
        Sampler Config:
        {json.dumps(sample_config)}
        """)

        with open(generation_metadata_filename, "w") as fpmd:
            fpmd.write(metadata)

    @staticmethod
    def _output_paths(output_dir, seed_id, generation_guid, trial_id, filename_suffix=None):
        """Return ``(fasta_path, config_path)`` for one (seed, generation, trial) combination."""
        unique_identifier = f"{seed_id}_{generation_guid}_{trial_id}"
        filename = f"{unique_identifier}_{filename_suffix}" if filename_suffix else unique_identifier
        return output_dir/f"{filename}.fasta", output_dir/f"{filename}_config.txt"

    @staticmethod
    def _fail_if_exists(paths):
        """Raise ``FileExistsError`` for the first path in ``paths`` that already is a file."""
        for path in paths:
            if os.path.isfile(path):
                raise FileExistsError(f'{path} already exists. Backup/rename before running')

    def generate_from_esm(self, trial_configs: List[Dict[str, Any]], models, filename_suffix=None):
        """Run every trial/seed combination against every model and write the outputs.

        Args:
            trial_configs: list of trial dicts with keys ``'seeds'`` (iterable of
                ``(seed_id, seed)`` pairs), ``'sampler'`` (``(factory, construct_kwargs)``),
                ``'sampler_config'`` (dict) and optionally ``'dynamics'`` and
                ``'trial_description'``. ``None`` or an empty list means there is
                nothing to do and the call returns without touching the filesystem.
            models: a single model or a list/tuple of models; each must expose
                ``model_artifacts`` with ``model_artifacts_dir`` and ``model_metadata``.
            filename_suffix: optional suffix appended to every output file name.

        Raises:
            FileExistsError: if an output file for any combination already exists.
        """
        if not trial_configs:
            print('No trial configs provided; nothing to generate')
            return

        if not isinstance(models, (list, tuple)):
            models = [models]

        for model in models:
            model_artifacts = model.model_artifacts
            # Setup filesystem; the short id keeps the outputs of separate runs apart.
            generation_guid = str(uuid4())[:6]
            generated_sequences_dir = model_artifacts.model_artifacts_dir/"generated_sequences"
            generated_sequences_dir.mkdir(parents=True, exist_ok=True)

            # Write metadata (currently disabled)
            # self.write_metadata(generated_sequences_dir, generation_guid, model_artifacts)

            # Initial check for output files -- fail early before the long running process.
            for idx, trial_config in enumerate(trial_configs):
                for seed_id, _seed in trial_config['seeds']:
                    self._fail_if_exists(self._output_paths(
                        generated_sequences_dir, seed_id, generation_guid, f"trial{idx+1}", filename_suffix))

            for idx, trial_config in enumerate(trial_configs):
                trial_id = f"trial{idx+1}"
                sampler_generator, sampler_gen_args = trial_config['sampler']
                raw_sampler_config = trial_config['sampler_config']
                dynamics = trial_config.get('dynamics', {})
                trial_description = trial_config.get('trial_description', 'no description provided')

                for seed_id, seed in trial_config['seeds']:
                    # A fresh sampler per seed, as before -- whether Sampler_1 carries
                    # state between runs is not visible from here.
                    sampler = sampler_generator.construct(model, **sampler_gen_args)

                    # Re-check right before writing in case a file appeared meanwhile.
                    generated_sequences_filename, generation_metadata_filename = self._output_paths(
                        generated_sequences_dir, seed_id, generation_guid, trial_id, filename_suffix)
                    self._fail_if_exists((generated_sequences_filename, generation_metadata_filename))

                    # Inject seed specific sampler config; copy so the trial's own dict stays untouched.
                    initial_sampler_config = raw_sampler_config.copy()
                    sampler_config = sampler.dynamic_config_update(initial_sampler_config, seed, dynamics)

                    print(inspect.cleandoc(f"""

                    Running Trial: {trial_id}
                    Trial Desc: {trial_description}

                    Generating Sequences for {seed_id} starting from seed: 
                    {seed}

                    Using sampler {sampler.id} with config:
                    {sampler_config}
                    """))

                    self.write_generation_metadata(generation_metadata_filename, generation_guid, trial_id, sampler, seed_id, sampler_config, model_artifacts, trial_description)

                    sampled_sequences = sampler.generate(sampler_config)
                    utils.write_sequential_fasta(generated_sequences_filename, sampled_sequences)

                    print(f"Finished writing to:\n{generated_sequences_filename}\n{generation_metadata_filename}")

## Setup Global Config

In [3]:
# Single driver instance shared by every generation cell below.
sequence_generator = SequenceGenerator()

# Full-length seed sequences.
E_coli_CM2 = "MTSENPLLALREKISALDEKLLALLAERRELAVEVGKAKLLSHRPVRDIDRERDLLERLITLGKAHHLDAHYITRLFQLIIEDSVLTQQALLQQH" #score 65.5
best_CM2 = "MDYQEKLKALRQEIDSIDNQILELINKRATLAKEVGEIKKANNLPIFVPSREKEIFDRLEKLNKGPLPTDIVKHIFREIISACRSIEENIKVVY" #score 100.7
# Truncated seeds: the leading 40 and 20 characters of E_coli_CM2.
E_coli_CM2_first_40 = E_coli_CM2[:40]
E_coli_CM2_first_20 = E_coli_CM2[:20]

# Every seed is a (seed_id, seed_sequence) pair; seed_id ends up in output file names.
SEED_E_coli_CM2 = ('E_coli_CM2', E_coli_CM2)
SEED_best_CM2 = ('best_CM2', best_CM2)
SEED_E_coli_CM2_first_40 = ('E_coli_CM2_first_40', E_coli_CM2_first_40)
SEED_E_coli_CM2_first_20 = ('E_coli_CM2_first_20', E_coli_CM2_first_20)
NO_SEED = ("NoSeed", "")
M_SEED = ("MSeed", "M")

# Seed groups used by the trial builders.
FULL_CM2_SEED_PAIRS = [SEED_E_coli_CM2, SEED_best_CM2]
TRUNCATED_CM2_SEED_PAIRS = [SEED_E_coli_CM2_first_40, SEED_E_coli_CM2_first_20]
NO_SEED_M_SEED = [NO_SEED, M_SEED]

# Everything, in the order: full seeds, truncated seeds, empty/"M" seeds.
ALL_SEED_PAIRS = [*FULL_CM2_SEED_PAIRS, *TRUNCATED_CM2_SEED_PAIRS, *NO_SEED_M_SEED]

# Alias for the wrapper class itself (not an instance); trials pass it as the
# sampler factory and the driver calls its `construct` classmethod.
sampler_1 = Wrapped_Sampler1

## Trial Creation Utilities

In [4]:
def seed_len(seed):
    """Return the length of ``seed``."""
    n_items = len(seed)
    return n_items

def seed_len_div_ten(seed):
    """Return one tenth of the length of ``seed``, rounded down to an int.

    Seeds shorter than 10 yield 0.
    """
    return len(seed) // 10

def target_less_seed_len(target, seed):
    """Return ``target`` minus the length of ``seed``.

    The result is negative when ``seed`` is longer than ``target``.
    """
    current_len = len(seed)
    return target - current_len

def pass_through(x, seed):
    """Return ``x`` unchanged; ``seed`` is accepted but ignored.

    Lets a constant be supplied where a callable taking the seed is expected,
    e.g. ``partial(pass_through, 95)``.
    """
    del seed  # intentionally unused
    return x

def trial(sampler_tuple, sampler_config, seeds, transforms=None, description=None):
    """Assemble one trial configuration dict.

    Args:
        sampler_tuple: stored under ``'sampler'``.
        sampler_config: stored under ``'sampler_config'``.
        seeds: stored under ``'seeds'``.
        transforms: optional mapping stored under ``'dynamics'``. When omitted, a
            fresh empty dict is created for this trial (previously one shared
            default dict object was reused by every trial).
        description: optional text stored under ``'trial_description'``; the key
            is left out when the description is falsy.

    Returns:
        dict with keys ``sampler``, ``seeds``, ``dynamics``, ``sampler_config``
        and, when given, ``trial_description``.

    Raises:
        ValueError: if ``sampler_tuple``, ``seeds`` or ``sampler_config`` is ``None``.
    """
    required = (
        ('sampler_tuple', sampler_tuple),
        ('seeds', seeds),
        ('sampler_config', sampler_config),
    )
    for arg_name, value in required:
        if value is None:
            raise ValueError(f'Must provide {arg_name}')

    trial_config = dict(
        sampler=sampler_tuple,
        seeds=seeds,
        dynamics=transforms if transforms is not None else {},
        sampler_config=sampler_config
    )

    if description:
        trial_config['trial_description'] = description

    return trial_config

def get_parameterized_standard_trials(sampler_generator,
                        batchsize,
                        run_generate=True,
                        run_seq_complete=True,
                        generation_seeds=FULL_CM2_SEED_PAIRS, 
                        seq_completion_seeds=TRUNCATED_CM2_SEED_PAIRS, 
                        num_samples=250,
                        device='gpu'):
    """Build the standard list of generation and sequence-completion trials.

    Args:
        sampler_generator: sampler factory used for every trial (previously only
            the first trial honoured it; the rest hard-coded the global ``sampler_1``).
        batchsize: batch size for the multi-sample trials.
        run_generate: include the generation trials.
        run_seq_complete: include the sequence-completion trials.
        generation_seeds: ``(seed_id, seed)`` pairs for the generation trials.
        seq_completion_seeds: ``(seed_id, seed)`` pairs for the completion trials.
        num_samples: ``n_samples`` for the first two generation trials; the other
            trials use the fixed sample counts stated in their descriptions.
        device: passed to the sampler factory as its ``device`` keyword.

    Returns:
        List of trial dicts (generation trials first), or ``None`` after printing
        ``'Nothing to do'`` when both ``run_generate`` and ``run_seq_complete`` are false.
    """
    if not (run_generate or run_seq_complete):
        print('Nothing to do')
        return

    def sampler_spec():
        # Fresh (factory, kwargs) pair per trial so no two trials share a kwargs dict.
        return (sampler_generator, {'device': device})

    standard_generation_trials = [
        trial(
            description="Sequences sequentially",
            sampler_tuple=sampler_spec(),
            seeds=generation_seeds,
            transforms=dict(
                num_iters=seed_len,
                max_len=seed_len
            ),
            sampler_config=dict(n_samples=num_samples, batch_size=batchsize, in_order=True, num_positions=1, mask=True)
        ),
        trial(
            description="Sequences random 10% at a time",
            sampler_tuple=sampler_spec(),
            seeds=generation_seeds,
            transforms=dict(
                num_positions=seed_len_div_ten,
                max_len=seed_len
            ),
            sampler_config=dict(n_samples=num_samples, batch_size=batchsize, in_order=False, num_iters=20, mask=True)
        ),
        trial(
            description=" 1 sequence with k=1",
            sampler_tuple=sampler_spec(),
            seeds=generation_seeds,
            transforms=dict(
                max_len=seed_len,
                num_iters=seed_len
            ),
            sampler_config=dict(n_samples=1, batch_size=1, in_order=True, num_positions=1, top_k=1, mask=True)
        ),
        trial(
            description="1 sequence with k=1 and no mask",
            sampler_tuple=sampler_spec(),
            seeds=generation_seeds,
            transforms=dict(
                max_len=seed_len,
                num_iters=seed_len
            ),
            sampler_config=dict(n_samples=1, batch_size=1, in_order=True, num_positions=1, top_k=1, mask=False)
        ),
        trial(
            description="100 sequences random 10% at a time, with 10 round burnin",
            sampler_tuple=sampler_spec(),
            seeds=generation_seeds,
            transforms=dict(
                max_len=seed_len,
                num_positions=seed_len_div_ten
            ),
            sampler_config=dict(n_samples=100, batch_size=batchsize, in_order=False, num_iters=20, burnin=10, mask=True)
        ),
        trial(
            description="100 sequences random 10% at a time, with top_k=1 for the whole thing",
            sampler_tuple=sampler_spec(),
            seeds=generation_seeds,
            transforms=dict(
                max_len=seed_len,
                num_positions=seed_len_div_ten
            ),
            sampler_config=dict(n_samples=100, batch_size=batchsize, in_order=False, num_iters=20, top_k=1, mask=True)
        )
    ]

    # Completion trials: the seed is kept as a leader and the sequence is extended
    # to a fixed target length of 95 (max_len), one position per iteration.
    standard_sequence_completion_trials = [
        trial(
            description="Seq Complete Target Len 95",
            sampler_tuple=sampler_spec(),
            seeds=seq_completion_seeds,
            transforms=dict(
                max_len=partial(pass_through, 95),
                leader_length=seed_len,
                num_iters=partial(target_less_seed_len, 95)
            ),
            sampler_config=dict(n_samples=100, batch_size=batchsize, in_order=True, num_positions=1, mask=True)
        ),
        trial(
            description="Seq Complete Target Len 95 top_k=1",
            sampler_tuple=sampler_spec(),
            seeds=seq_completion_seeds,
            transforms=dict(
                max_len=partial(pass_through, 95),
                leader_length=seed_len,
                num_iters=partial(target_less_seed_len, 95)
            ),
            sampler_config=dict(n_samples=1, batch_size=1, in_order=True, top_k=1, num_positions=1, mask=True)
        )
    ]

    all_trials_to_run = []

    if run_generate:
        all_trials_to_run.extend(standard_generation_trials)
    if run_seq_complete:
        all_trials_to_run.extend(standard_sequence_completion_trials)

    return all_trials_to_run

## User Specific Setup

In [18]:
# System-specific settings -- adjust these for the machine running the notebook.

# Root folder of the locally downloaded model files; must already exist.
models_dir = Path('../../../cache/project-models/')
assert models_dir.exists()
# One sub-folder per model family.
esm_models_dir = models_dir.joinpath("esm")
protrans_models_dir = models_dir.joinpath("protrans")

# Batch sizes for the t34 and t12 model variants, and the device name handed to the sampler.
t34_batch_size, t12_batch_size = 12, 32
device = "gpu"

## Setup Shared Model Objects

In [15]:
# Each model below follows the same three steps:
#   metadata (which checkpoint) -> artifacts container (validates paths on disk) -> loaded model.

# esm1_t12_85M_UR50S checkpoint for dataset "Russ_994_random".
esm1_t12_85M_UR50S_6fb55ea4_russ_model_metadata = ModelMetadata(
    model_id="esm1_t12_85M_UR50S",
    dataset_id="Russ_994_random",
    model_run_id="Dec04_13-47-23_6fb55ea4",
    model_checkpoint_id="best-model-checkpoint-Dec04_13-47-23_6fb55ea4.pt")
esm1_t12_85M_UR50S_6fb55ea4_russ_model_artifacts = ModelArtifactsContainer(esm_models_dir, esm1_t12_85M_UR50S_6fb55ea4_russ_model_metadata)
esm1_t12_85M_UR50S_6fb55ea4_russ_model = ESMLocal(esm1_t12_85M_UR50S_6fb55ea4_russ_model_artifacts)

# esm1_t12_85M_UR50S checkpoint for dataset "tautomerase_2953".
# NOTE(review): the variable names carry the run hash 6fb55ea4, but this run id ends in c8588459.
esm1_t12_85M_UR50S_6fb55ea4_tautomerase_model_metadata = ModelMetadata(
    model_id="esm1_t12_85M_UR50S",
    dataset_id="tautomerase_2953",
    model_run_id="Dec04_14-13-14_c8588459",
    model_checkpoint_id="best-model-checkpoint-Dec04_14-13-14_c8588459.pt")
esm1_t12_85M_UR50S_6fb55ea4_tautomerase_model_artifacts = ModelArtifactsContainer(esm_models_dir, esm1_t12_85M_UR50S_6fb55ea4_tautomerase_model_metadata)
esm1_t12_85M_UR50S_6fb55ea4_tautomerase_model = ESMLocal(esm1_t12_85M_UR50S_6fb55ea4_tautomerase_model_artifacts)

# xlnet checkpoint for dataset "tautomerase_2953", located under protrans_models_dir.
# NOTE(review): this is loaded through ESMLocal although a ProTransLocal class exists --
# confirm which loader is intended, and that "checkpoint-22648" is a file (the container asserts it is).
xlnet_1607241613_tautomerase_model_metadata = ModelMetadata(
    model_id="xlnet",
    dataset_id="tautomerase_2953",
    model_run_id="1607241613",
    model_checkpoint_id="checkpoint-22648")
xlnet_1607241613_tautomerase_model_artifacts = ModelArtifactsContainer(protrans_models_dir, xlnet_1607241613_tautomerase_model_metadata)
xlnet_1607241613_tautomerase_model = ESMLocal(xlnet_1607241613_tautomerase_model_artifacts)

## Data Generation

### EColi CM2

```
{"n_samples": 100, "batch_size": 34, "in_order": false, "top_k": 0, "num_positions": 9, "mask": true, "max_len": 95, "leader_length": 0, "num_iters": 120, "burnin": 100, "seed_seq": "MTSENPLLALREKISALDEKLLALLAERRELAVEVGKAKLLSHRPVRDIDRERDLLERLITLGKAHHLDAHYITRLFQLIIEDSVLTQQALLQQH"}
```

In [9]:
# One trial on the full E. coli CM2 seed: 100 samples, 9 positions per iteration in
# random order (in_order=False), 120 iterations with burnin=100.
# max_len is not fixed here; it is computed per seed as the seed length.
trail_configs_ecolicm2 = [
    trial(
        description=None,
        sampler_tuple=(sampler_1, {'device': device}),
        seeds=[SEED_E_coli_CM2],
        transforms=dict(
            max_len=seed_len
        ),
        sampler_config=dict(n_samples=100, batch_size=34, in_order=False, top_k=0, num_positions=9, mask=True, num_iters=120, leader_length=0, burnin=100)
    )
]

# Run the trial against both ESM models (Russ and tautomerase datasets).
sequence_generator.generate_from_esm(trail_configs_ecolicm2, [esm1_t12_85M_UR50S_6fb55ea4_russ_model, esm1_t12_85M_UR50S_6fb55ea4_tautomerase_model])

Running Trial: trial1
Trial Desc: no description provided

Generating Sequences for E_coli_CM2 starting from seed: 
MTSENPLLALREKISALDEKLLALLAERRELAVEVGKAKLLSHRPVRDIDRERDLLERLITLGKAHHLDAHYITRLFQLIIEDSVLTQQALLQQH

Using sampler Sampler_1 with config:
{'n_samples': 100, 'batch_size': 34, 'in_order': False, 'top_k': 0, 'num_positions': 9, 'mask': True, 'num_iters': 120, 'leader_length': 0, 'burnin': 100, 'max_len': 95, 'seed_seq': 'MTSENPLLALREKISALDEKLLALLAERRELAVEVGKAKLLSHRPVRDIDRERDLLERLITLGKAHHLDAHYITRLFQLIIEDSVLTQQALLQQH'}
0
iter 1
iter 11
iter 21
iter 31
iter 41
iter 51
iter 61
iter 71
iter 81
iter 91
iter 101
iter 111
Finished batch 1 in 10.095s
1
iter 1
iter 11
iter 21
iter 31
iter 41
iter 51
iter 61
iter 71
iter 81
iter 91
iter 101
iter 111
Finished batch 2 in 9.766s
2
iter 1
iter 11
iter 21
iter 31
iter 41
iter 51
iter 61
iter 71
iter 81
iter 91
iter 101
iter 111
Finished batch 3 in 9.810s
Finished writing to:
/workspace/code/project/tasks/finetuning/comparision_testbed/models/e

### ADHOC 2

```
{"n_samples": 100, "batch_size": 34, "in_order": true, "top_k": 0, "num_positions": 1, "mask": true, "max_len": 95, "leader_length": 40, "num_iters": 190, "seed_seq": "MTSENPLLALREKISALDEKLLALLAERRELAVEVGKAKL"}
{"n_samples": 100, "batch_size": 34, "in_order": true, "top_k": 0, "num_positions": 1, "mask": true, "max_len": 95, "leader_length": 20, "num_iters": 190, "seed_seq": "MTSENPLLALREKISALDEK"}
```

In [34]:
# Two in-order completion trials that differ only in the truncated seed and the
# matching leader_length (40 and 20); everything else in the config is shared.
trial_configs_adhoc_2 = [
    trial(
        description=None,
        sampler_tuple=(sampler_1, {'device': device}),
        seeds=[seed_pair],
        transforms=dict(),
        sampler_config=dict(
            n_samples=100,
            batch_size=34,
            in_order=True,
            top_k=0,
            num_positions=1,
            mask=True,
            num_iters=190,
            max_len=95,
            leader_length=leader_length,
        ),
    )
    for seed_pair, leader_length in (
        (SEED_E_coli_CM2_first_40, 40),
        (SEED_E_coli_CM2_first_20, 20),
    )
]

# Run both trials against both ESM models (Russ and tautomerase datasets).
sequence_generator.generate_from_esm(
    trial_configs_adhoc_2,
    [
        esm1_t12_85M_UR50S_6fb55ea4_russ_model,
        esm1_t12_85M_UR50S_6fb55ea4_tautomerase_model,
    ],
)

Running Trial: trial1
Trial Desc: no description provided

Generating Sequences for AdhocSeed2.1 starting from seed: 
MTSENPLLALREKISALDEKLLALLAERRELAVEVGKAKL

Using sampler Sampler_1 with config:
{'n_samples': 100, 'batch_size': 34, 'in_order': True, 'top_k': 0, 'num_positions': 1, 'mask': True, 'num_iters': 190, 'max_len': 95, 'leader_length': 40, 'seed_seq': 'MTSENPLLALREKISALDEKLLALLAERRELAVEVGKAKL'}
0
iter 1
iter 11
iter 21
iter 31
iter 41
iter 51
iter 61
iter 71
iter 81
iter 91
iter 101
iter 111
iter 121
iter 131
iter 141
iter 151
iter 161
iter 171
iter 181
Finished batch 1 in 14.764s
1
iter 1
iter 11
iter 21
iter 31
iter 41
iter 51
iter 61
iter 71
iter 81
iter 91
iter 101
iter 111
iter 121
iter 131
iter 141
iter 151
iter 161
iter 171
iter 181
Finished batch 2 in 14.787s
2
iter 1
iter 11
iter 21
iter 31
iter 41
iter 51
iter 61
iter 71
iter 81
iter 91
iter 101
iter 111
iter 121
iter 131
iter 141
iter 151
iter 161
iter 171
iter 181
Finished batch 3 in 14.769s
Finished writing to:
/

### ADHOC 1

Adhoc request
```
Model Metadata:
Model ID         = esm1_t12_85M_UR50S
Dataset ID       = Russ_994_random
Model Run ID     = Dec04_13-47-23_6fb55ea4
Model Checkpoint = best-model-checkpoint-Dec04_13-47-23_6fb55ea4.pt
Generation Metadata:
Generation ID    = 6b9c30
Trial Desc       = Seq Complete Target Len 95 top_k=1
Trial ID         = trial8
Seed ID          = E_coli_CM2_first_20
Sampler ID       = Sampler_1
Sampler Config:
{"n_samples": 1, "batch_size": 1, "in_order": true, "top_k": 1, "num_positions": 1, "mask": true, "max_len": 95, "leader_length": 1, "num_iters": 190, "seed_seq": ""}

Model Metadata:
Model ID         = esm1_t12_85M_UR50S
Dataset ID       = Russ_994_random
Model Run ID     = Dec04_13-47-23_6fb55ea4
Model Checkpoint = best-model-checkpoint-Dec04_13-47-23_6fb55ea4.pt
Generation Metadata:
Generation ID    = 6b9c30
Trial Desc       = Seq Complete Target Len 95
Trial ID         = trial8
Seed ID          = E_coli_CM2_first_20
Sampler ID       = Sampler_1
Sampler Config:
{"n_samples": 100, "batch_size": 34, "in_order": true, "top_k": 0, "num_positions": 1, "mask": true, "max_len": 95, "leader_length": 0, "num_iters": 190, "seed_seq": ""}
```

In [33]:
# Two in-order trials over the empty seed and the single-"M" seed (in that order).
# They differ only in n_samples, batch_size, top_k and leader_length.
trial_configs_adhoc_1 = [
    trial(
        description=None,
        sampler_tuple=(sampler_1, {'device': device}),
        seeds=list(NO_SEED_M_SEED),  # [NO_SEED, M_SEED]
        transforms=dict(),
        sampler_config=dict(
            n_samples=n_samples,
            batch_size=batch_size,
            in_order=True,
            top_k=top_k,
            num_positions=1,
            mask=True,
            num_iters=190,
            max_len=95,
            leader_length=leader_length,
        ),
    )
    for n_samples, batch_size, top_k, leader_length in (
        (1, 1, 1, 1),
        (100, 34, 0, 0),
    )
]

# Run both trials against both ESM models (Russ and tautomerase datasets).
sequence_generator.generate_from_esm(
    trial_configs_adhoc_1,
    [
        esm1_t12_85M_UR50S_6fb55ea4_russ_model,
        esm1_t12_85M_UR50S_6fb55ea4_tautomerase_model,
    ],
)

Running Trial: trial1
Trial Desc: no description provided

Generating Sequences for NoSeed starting from seed: 


Using sampler Sampler_1 with config:
{'n_samples': 1, 'batch_size': 1, 'in_order': True, 'top_k': 1, 'num_positions': 1, 'mask': True, 'num_iters': 190, 'max_len': 95, 'leader_length': 1, 'seed_seq': ''}
0
iter 1
iter 11
iter 21
iter 31
iter 41
iter 51
iter 61
iter 71
iter 81
iter 91
iter 101
iter 111
iter 121
iter 131
iter 141
iter 151
iter 161
iter 171
iter 181
Finished batch 1 in 3.099s
Finished writing to:
/workspace/code/project/tasks/finetuning/comparision_testbed/models/esm1_t12_85M_UR50S/Russ_994_random/Dec04_13-47-23_6fb55ea4/generated_sequences/NoSeed_ac6305_trial1.fasta
/workspace/code/project/tasks/finetuning/comparision_testbed/models/esm1_t12_85M_UR50S/Russ_994_random/Dec04_13-47-23_6fb55ea4/generated_sequences/NoSeed_ac6305_trial1_config.txt
Running Trial: trial1
Trial Desc: no description provided

Generating Sequences for MSeed starting from seed: 
M

Using

### All Standard Generation / Sequence Completion Configs

In [31]:
# Build the full standard suite (generation + sequence completion) using the
# t12 batch size, then run it against both ESM models.
all_standard_trial_configs = get_parameterized_standard_trials(
    sampler_1,
    t12_batch_size,
    run_generate=True,
    run_seq_complete=True,
    num_samples=250,
    device=device,
)

sequence_generator.generate_from_esm(
    all_standard_trial_configs,
    [
        esm1_t12_85M_UR50S_6fb55ea4_russ_model,
        esm1_t12_85M_UR50S_6fb55ea4_tautomerase_model,
    ],
)

Running Trial: trial1
Trial Desc: Sequences sequentially

Generating Sequences for E_coli_CM2 starting from seed: 
MTSENPLLALREKISALDEKLLALLAERRELAVEVGKAKLLSHRPVRDIDRERDLLERLITLGKAHHLDAHYITRLFQLIIEDSVLTQQALLQQH

Using sampler Sampler_1 with config:
{'n_samples': 250, 'batch_size': 32, 'in_order': True, 'num_positions': 1, 'mask': True, 'num_iters': 95, 'max_len': 95, 'seed_seq': 'MTSENPLLALREKISALDEKLLALLAERRELAVEVGKAKLLSHRPVRDIDRERDLLERLITLGKAHHLDAHYITRLFQLIIEDSVLTQQALLQQH'}
0
iter 1
iter 11
iter 21
iter 31
iter 41
iter 51
iter 61
iter 71
iter 81
iter 91
Finished batch 1 in 6.768s
1
iter 1
iter 11
iter 21
iter 31
iter 41
iter 51
iter 61
iter 71
iter 81
iter 91
Finished batch 2 in 6.718s
2
iter 1
iter 11
iter 21
iter 31
iter 41
iter 51
iter 61
iter 71
iter 81
iter 91
Finished batch 3 in 6.728s
3
iter 1
iter 11
iter 21
iter 31
iter 41
iter 51
iter 61
iter 71
iter 81
iter 91
Finished batch 4 in 6.740s
4
iter 1
iter 11
iter 21
iter 31
iter 41
iter 51
iter 61
iter 71
iter 81
iter 91
Finish

In [29]:
# # All standard generate / sequence completion

# model_metadata = ModelMetadata(
#     model_id="esm1_t12_85M_UR50S",
#     dataset_id="Russ_994_random",
#     model_run_id="Dec04_13-47-23_6fb55ea4",
#     model_checkpoint_id="best-model-checkpoint-Dec04_13-47-23_6fb55ea4.pt")
# model_artifacts = ModelArtifactsContainer(models_dir, model_metadata)
# model = ESMLocal(model_artifacts)

# trial_configs = get_parameterized_standard_trials(
#     sampler_generator=sampler_1, 
#     batchsize=t12_batch_size,
#     run_generate=True,
#     run_seq_complete=True,
#     num_samples=250,
#     device=device
# )

# sequence_generator.generate_from_esm(trial_configs, model)

#### Tautomerase

In [30]:
# model_metadata = ModelMetadata(
#     model_id="esm1_t12_85M_UR50S",
#     dataset_id="tautomerase_2953",
#     model_run_id="Dec04_14-13-14_c8588459",
#     model_checkpoint_id="best-model-checkpoint-Dec04_14-13-14_c8588459.pt")
# model_artifacts = ModelArtifactsContainer(models_dir, model_metadata)
# model = ESMLocal(model_artifacts)

# trial_configs = get_parameterized_standard_trials(
#     sampler_generator=sampler_1, 
#     batchsize=t12_batch_size,
#     run_generate=True,
#     run_seq_complete=True,
#     num_samples=250,
#     device=device
# )

# sequence_generator.generate_from_esm(trial_configs, model)