In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import torch
import esm

import math
import time

from pgen import sampler_1
from pgen import vanilla_esm34
from pgen import vanilla_esm12
from pgen import vanilla_esm6
from pgen import utils

from pathlib import Path
# Shared settings object from pgen.utils; its `project_dir` attribute is used
# by the cells below to build the output FASTA paths.
common_vars = utils.CommonVars()

In [5]:
# Sampler built around vanilla_esm34.ESM34(), with device="cpu".
sampler34 = sampler_1.Sampler_1(
    vanilla_esm34.ESM34(),
    device="cpu",
)

In [8]:
# Sampler built around vanilla_esm12.ESM12(), with device="cpu".
sampler12 = sampler_1.Sampler_1(
    vanilla_esm12.ESM12(),
    device="cpu",
)

In [None]:
# Sampler built around vanilla_esm6.ESM6(), with device="cpu".
sampler6 = sampler_1.Sampler_1(
    vanilla_esm6.ESM6(),
    device="cpu",
)

In [20]:
# Seed sequences used throughout the notebook (scores are the author's annotations).
E_coli_CM2 = "MTSENPLLALREKISALDEKLLALLAERRELAVEVGKAKLLSHRPVRDIDRERDLLERLITLGKAHHLDAHYITRLFQLIIEDSVLTQQALLQQH" #score 65.5
best_CM2 = "MDYQEKLKALRQEIDSIDNQILELINKRATLAKEVGEIKKANNLPIFVPSREKEIFDRLEKLNKGPLPTDIVKHIFREIISACRSIEENIKVVY" #score 100.7

# N-terminal prefixes of E_coli_CM2, derived by slicing so they cannot drift
# away from the full sequence.
E_coli_CM2_first_40 = E_coli_CM2[:40]
E_coli_CM2_first_20 = E_coli_CM2[:20]

In [None]:
# Settings for generating 500 E. coli sequences sequentially with sampler34.
# Author's note: takes more than 24 hours on the laptop of SRJ.
# NOTE(review): unlike the other generation cells, this result is not written
# to a FASTA file in this cell.
seed = E_coli_CM2
E_coli_esm34 = sampler34.generate(
    n_samples=500,
    seed_seq=seed,
    batch_size=4,
    max_len=len(seed),
    in_order=True,
    num_positions=1,
    num_iters=len(seed),
    mask=True,
)

In [None]:
# Settings for generating 100 E. coli sequences with sampler34, a random 10%
# of the positions at a time (in_order=False, num_positions = len(seed) // 10),
# for 20 iterations.
seed = E_coli_CM2
E_coli_esm34_parallel = sampler34.generate(
    n_samples=100,
    seed_seq=seed,
    batch_size=4,
    max_len=len(seed),
    in_order=False,
    num_positions=len(seed) // 10,
    num_iters=20,
    mask=True,
)
utils.write_sequential_fasta(
    Path(common_vars.project_dir, "data", "E_coli_parallel_esm34.fasta"),
    E_coli_esm34_parallel,
)

In [None]:
# Settings for generating 100 E. coli sequences sequentially, from ESM12
# (sampler12), in batches of 10.
seed = E_coli_CM2
E_coli_esm12 = sampler12.generate(
    n_samples=100,
    seed_seq=seed,
    batch_size=10,
    max_len=len(seed),
    in_order=True,
    num_positions=1,
    num_iters=len(seed),
    mask=True,
)
utils.write_sequential_fasta(
    Path(common_vars.project_dir, "data", "E_coli_sequential_esm12.fasta"),
    E_coli_esm12,
)

In [None]:
# 100 E. coli sequences from sampler12 with in_order=False,
# num_positions = len(seed) // 10 and 20 iterations.
seed = E_coli_CM2
E_coli_esm12_parallel = sampler12.generate(
    n_samples=100,
    seed_seq=seed,
    batch_size=4,
    max_len=len(seed),
    in_order=False,
    num_positions=len(seed) // 10,
    num_iters=20,
    mask=True,
)
utils.write_sequential_fasta(
    Path(common_vars.project_dir, "data", "E_coli_parallel_esm12.fasta"),
    E_coli_esm12_parallel,
)

In [None]:
# Generate 100 sequences sequentially (in_order=True) from the best_CM2 seed
# with sampler34 and write them to Best_CM2_esm34.fasta.
seed = best_CM2
Best_CM2_esm34_2 = sampler34.generate(
    n_samples=100,
    seed_seq=seed,
    batch_size=4,
    max_len=len(seed),
    in_order=True,
    num_positions=1,
    num_iters=len(seed),
    mask=True,
)
utils.write_sequential_fasta(
    Path(common_vars.project_dir, "data", "Best_CM2_esm34.fasta"),
    Best_CM2_esm34_2,
)

In [None]:
# Four sampler34 runs on the full E. coli CM2 seed; each result is written to
# its own FASTA file under <project_dir>/data.

# 1) One sequence, sequential, top_k=1.
seed = E_coli_CM2
E_coli_esm34_sequential_k1 = sampler34.generate(
    n_samples=1,
    seed_seq=seed,
    batch_size=1,
    max_len=len(seed),
    in_order=True,
    num_positions=1,
    num_iters=len(seed),
    top_k=1,
    mask=True,
)
utils.write_sequential_fasta(
    Path(common_vars.project_dir, "data", "E_coli_sequential_k1_esm34.fasta"),
    E_coli_esm34_sequential_k1,
)

# 2) One sequence, sequential, top_k=1 and no mask (mask=False).
seed = E_coli_CM2
E_coli_esm34_sequential_k1_no_mask = sampler34.generate(
    n_samples=1,
    seed_seq=seed,
    batch_size=1,
    max_len=len(seed),
    in_order=True,
    num_positions=1,
    num_iters=len(seed),
    top_k=1,
    mask=False,
)
utils.write_sequential_fasta(
    Path(common_vars.project_dir, "data", "E_coli_sequential_k1_no_mask_esm34.fasta"),
    E_coli_esm34_sequential_k1_no_mask,
)

# 3) 100 sequences, random 10% of positions at a time, with a 10 round burn-in.
seed = E_coli_CM2
E_coli_esm34_parallel_burnin = sampler34.generate(
    n_samples=100,
    seed_seq=seed,
    batch_size=4,
    max_len=len(seed),
    in_order=False,
    num_positions=len(seed) // 10,
    num_iters=20,
    burnin=10,
    mask=True,
)
utils.write_sequential_fasta(
    Path(common_vars.project_dir, "data", "E_coli_parallel_burnin_esm34.fasta"),
    E_coli_esm34_parallel_burnin,
)

# 4) 100 sequences, random 10% of positions at a time, with top_k=1 throughout.
seed = E_coli_CM2
E_coli_esm34_parallel_k1 = sampler34.generate(
    n_samples=100,
    seed_seq=seed,
    batch_size=4,
    max_len=len(seed),
    in_order=False,
    num_positions=len(seed) // 10,
    num_iters=20,
    top_k=1,
    mask=True,
)
utils.write_sequential_fasta(
    Path(common_vars.project_dir, "data", "E_coli_parallel_k1_esm34.fasta"),
    E_coli_esm34_parallel_k1,
)

In [None]:
# Settings for generating 100 E. coli sequences with sampler12, a random 10%
# of the positions at a time, with a 10 round burn-in (burnin=10).
seed = E_coli_CM2
E_coli_esm12_parallel_burnin = sampler12.generate(
    n_samples=100,
    seed_seq=seed,
    batch_size=4,
    max_len=len(seed),
    in_order=False,
    num_positions=len(seed) // 10,
    num_iters=20,
    burnin=10,
    mask=True,
)
utils.write_sequential_fasta(
    Path(common_vars.project_dir, "data", "E_coli_parallel_burnin_esm12.fasta"),
    E_coli_esm12_parallel_burnin,
)


In [None]:
# Sequence completion: start from a leading fragment of E. coli CM2 and extend
# it to target_length, with both sampler34 and sampler12, once with the default
# top_k (100 samples) and once with top_k=1 (a single sample).
target_length = 95

# --- 40-residue leader ---
seed = E_coli_CM2_first_40
# Keyword arguments shared by every completion call for the current seed.
completion_kwargs = dict(
    seed_seq=seed,
    max_len=target_length,
    in_order=True,
    leader_length=len(seed),
    num_positions=1,
    num_iters=target_length - len(seed),
    mask=True,
)

E_coli_esm34_40 = sampler34.generate(n_samples=100, batch_size=4, **completion_kwargs)
utils.write_sequential_fasta(
    Path(common_vars.project_dir, "data", "E_coli_sequential_40seed_esm34.fasta"),
    E_coli_esm34_40,
)

E_coli_esm12_40 = sampler12.generate(n_samples=100, batch_size=4, **completion_kwargs)
utils.write_sequential_fasta(
    Path(common_vars.project_dir, "data", "E_coli_sequential_40seed_esm12.fasta"),
    E_coli_esm12_40,
)

E_coli_esm34_k1_40 = sampler34.generate(n_samples=1, batch_size=1, top_k=1, **completion_kwargs)
utils.write_sequential_fasta(
    Path(common_vars.project_dir, "data", "E_coli_sequential_40seed_k1_esm34.fasta"),
    E_coli_esm34_k1_40,
)

E_coli_esm12_k1_40 = sampler12.generate(n_samples=1, batch_size=1, top_k=1, **completion_kwargs)
utils.write_sequential_fasta(
    Path(common_vars.project_dir, "data", "E_coli_sequential_40seed_k1_esm12.fasta"),
    E_coli_esm12_k1_40,
)

# --- 20-residue leader ---
seed = E_coli_CM2_first_20
# Rebuilt because leader_length and num_iters depend on the new seed length.
completion_kwargs = dict(
    seed_seq=seed,
    max_len=target_length,
    in_order=True,
    leader_length=len(seed),
    num_positions=1,
    num_iters=target_length - len(seed),
    mask=True,
)

E_coli_esm34_20 = sampler34.generate(n_samples=100, batch_size=4, **completion_kwargs)
utils.write_sequential_fasta(
    Path(common_vars.project_dir, "data", "E_coli_sequential_20seed_esm34.fasta"),
    E_coli_esm34_20,
)

E_coli_esm12_20 = sampler12.generate(n_samples=100, batch_size=4, **completion_kwargs)
utils.write_sequential_fasta(
    Path(common_vars.project_dir, "data", "E_coli_sequential_20seed_esm12.fasta"),
    E_coli_esm12_20,
)

E_coli_esm34_k1_20 = sampler34.generate(n_samples=1, batch_size=1, top_k=1, **completion_kwargs)
utils.write_sequential_fasta(
    Path(common_vars.project_dir, "data", "E_coli_sequential_20seed_k1_esm34.fasta"),
    E_coli_esm34_k1_20,
)

E_coli_esm12_k1_20 = sampler12.generate(n_samples=1, batch_size=1, top_k=1, **completion_kwargs)
utils.write_sequential_fasta(
    Path(common_vars.project_dir, "data", "E_coli_sequential_20seed_k1_esm12.fasta"),
    E_coli_esm12_k1_20,
)

In [None]:
# Complete the 20-residue seed to target_length with sampler12 and top_k=0:
# once for the remaining positions only, and once with rollover_from_start=True
# for target_length iterations. Neither result is written to disk here.
seed = E_coli_CM2_first_20
target_length = 95

E_coli_esm12_k0_20 = sampler12.generate(
    n_samples=1,
    seed_seq=seed,
    batch_size=1,
    max_len=target_length,
    in_order=True,
    leader_length=len(seed),  # 20
    top_k=0,
    num_positions=1,
    num_iters=target_length - len(seed),
    mask=True,
)
E_coli_esm12_k0_20_rollover = sampler12.generate(
    n_samples=1,
    seed_seq=seed,
    batch_size=1,
    max_len=target_length,
    in_order=True,
    leader_length=len(seed),  # 20
    top_k=0,
    num_positions=1,
    num_iters=target_length,  # 95
    mask=True,
    rollover_from_start=True,
)

#utils.write_sequential_fasta(Path(common_vars.project_dir) / "data" / "E_coli_sequential_20seed_k1_esm12.fasta", E_coli_esm12_k1_20)

In [None]:
# Completion of the 40-residue E. coli CM2 leader with sampler34 and top_k=1
# (100 samples, batch_size=1), written to E_coli_sequential_40seed_k1_esm34.fasta.
#
# `seed` and `target_length` are bound explicitly here. Previously the cell
# relied on whatever an earlier cell had left in them; when the notebook is run
# top to bottom that is the 20-residue seed, which contradicts the "_40" variable
# name and the "40seed" output file.
# NOTE(review): this overwrites the single-sample file of the same name written
# by the sequence-completion cell; confirm that is intended.
seed = E_coli_CM2_first_40
target_length = 95
E_coli_esm34_k1_40 = sampler34.generate(
    n_samples=100,
    seed_seq=seed,
    batch_size=1,
    max_len=target_length,
    in_order=True,
    leader_length=len(seed),
    top_k=1,
    num_positions=1,
    num_iters=target_length - len(seed),
    mask=True,
)
utils.write_sequential_fasta(
    Path(common_vars.project_dir) / "data" / "E_coli_sequential_40seed_k1_esm34.fasta",
    E_coli_esm34_k1_40,
)

In [17]:
# Print the 20-residue seed (wrapped in a list) followed by four generated
# results, one per line, for side-by-side comparison.
# NOTE(review): `E_coli_esm12_k1_20_rollover`, `E_coli_esm34_k0_20` and
# `E_coli_esm43_k0_20_rollover` are not assigned in any cell visible here (the
# visible cells assign `E_coli_esm12_k0_20` and `E_coli_esm12_k0_20_rollover`),
# and "esm43" looks like a typo for "esm34". On a fresh kernel this cell would
# presumably raise NameError — confirm the intended variable names.
print([E_coli_CM2_first_20])
print(E_coli_esm12_k1_20)
print(E_coli_esm12_k1_20_rollover)
print(E_coli_esm34_k0_20)
print(E_coli_esm43_k0_20_rollover)

['MTSENPLLALREKISALDEK']
['MTSENPLLALREKISALDEKISALEEKISALEEKISALEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA']
['MSEENELKELLERIDKLEEKLDKLEEKLDKLEEKLDKLEEKLDKLEEKLDKLEEKLDKLEEKLDKLEEKLDKLEEKLDKLEEKLDKLEEKLDKLE']
['MTSENPLLALREKISALDEKYQQMSDKLDQLVALIDTLKAAASATVNGQPLVLNQPPAGISLPTQAWAFGHNGHTNYHSAADYDAMWAAETMRFL']
['MPDTDALTKVEERLAGLDGGDTPELIRAFHLARTVAEDELRSIPVDVVHADDLDGDGDLGETGSGPRVVVRPANPKPKADKKSKKDKGRRKNTGN']


In [22]:
# 100 completions of the 20-residue seed to length 95 with sampler34, top_k=0
# and rollover_from_start=True, run for 190 iterations.
E_coli_esm34_seed_20_rollover = sampler34.generate(
    n_samples=100,
    seed_seq="MTSENPLLALREKISALDEK",
    batch_size=4,
    max_len=95,
    in_order=True,
    leader_length=20,
    top_k=0,
    num_positions=1,
    num_iters=190,
    mask=True,
    rollover_from_start=True,
)
utils.write_sequential_fasta(
    Path(common_vars.project_dir, "data", "E_coli_esm34_seed_20_rollover.fasta"),
    E_coli_esm34_seed_20_rollover,
)

0
iter 1
iter 11
iter 21
iter 31
iter 41
iter 51
iter 61
iter 71
iter 81
iter 91
iter 101
iter 111
iter 121
iter 131
iter 141
iter 151
iter 161
iter 171
iter 181


KeyboardInterrupt: 

In [10]:
# Syn_F4 seed sequence.
Syn_F4 = "MYGKLNQLFHNLNEIVEDLNKNWHRERRTLHDFADELHQLVKHVHHFMQGHKNEGKLQDIVNQLDKLFRDLDNHLQRKDDTVHHRHHQLNKLLAQLDNLVHR"

# 100 Syn_F4 sequences from sampler34 with in_order=False,
# num_positions = len(seed) // 10, 20 iterations and burnin=10.
seed = Syn_F4
Syn_F4_parallel_burnin = sampler34.generate(
    n_samples=100,
    seed_seq=seed,
    batch_size=4,
    max_len=len(seed),
    in_order=False,
    num_positions=len(seed) // 10,
    num_iters=20,
    burnin=10,
    mask=True,
)
utils.write_sequential_fasta(
    Path(common_vars.project_dir, "data", "Syn_F4_parallel_burnin_esm34_2.fasta"),
    Syn_F4_parallel_burnin,
)

In [11]:
# Settings for generating 100 Syn_F4 sequences with sampler34, a random 10% of
# the positions at a time, with top_k=1 for the whole run.
seed = Syn_F4
Syn_F4_esm34_parallel_k1 = sampler34.generate(
    n_samples=100,
    seed_seq=seed,
    batch_size=4,
    max_len=len(seed),
    in_order=False,
    num_positions=len(seed) // 10,
    num_iters=20,
    top_k=1,
    mask=True,
)
utils.write_sequential_fasta(
    Path(common_vars.project_dir, "data", "Syn_F4_parallel_k1_esm34.fasta"),
    Syn_F4_esm34_parallel_k1,
)

0
iter 1
iter 11
Finished batch 1 in 126.514s
1
iter 1
iter 11
Finished batch 2 in 111.611s
2
iter 1
iter 11
Finished batch 3 in 118.982s
3
iter 1
iter 11
Finished batch 4 in 131.272s
4
iter 1
iter 11
Finished batch 5 in 116.359s
5
iter 1
iter 11
Finished batch 6 in 119.369s
6
iter 1
iter 11
Finished batch 7 in 121.350s
7
iter 1
iter 11
Finished batch 8 in 116.110s
8
iter 1
iter 11
Finished batch 9 in 141.966s
9
iter 1
iter 11
Finished batch 10 in 123.962s
10
iter 1
iter 11
Finished batch 11 in 127.693s
11
iter 1
iter 11
Finished batch 12 in 120.710s
12
iter 1
iter 11
Finished batch 13 in 111.536s
13
iter 1
iter 11
Finished batch 14 in 111.792s
14
iter 1
iter 11
Finished batch 15 in 112.175s
15
iter 1
iter 11
Finished batch 16 in 113.840s
16
iter 1
iter 11
Finished batch 17 in 107.881s
17
iter 1
iter 11
Finished batch 18 in 109.909s
18
iter 1
iter 11
Finished batch 19 in 107.274s
19
iter 1
iter 11
Finished batch 20 in 106.175s
20
iter 1
iter 11
Finished batch 21 in 103.150s
21
iter 1
i