In [1]:
# Collect every looped pose into a list file that the MPNN design step reads.

import os
from glob import glob

# Anchor both the glob pattern and the list file to the same absolute directory,
# so the list is written to the location it is later read from (design_list_file)
# regardless of the kernel's current working directory.
looped_dir = "/home/broerman/crispy_shifty/projects/crispy_shifty_dimers/round_3/design/03_pair_looped"

with open(os.path.join(looped_dir, "03_paired_looped_poses.list"), "w") as f:
    # One path per line, sorted so the list is stable between runs.
    f.writelines(
        fname + "\n"
        for fname in sorted(glob(os.path.join(looped_dir, "decoys/*/*.pdb.bz2")))
    )

In [6]:
import sys

# Make the local crispy_shifty checkout importable ahead of any installed copy.
sys.path.insert(0, '/home/broerman/crispy_shifty/')

from crispy_shifty.utils.io import gen_array_tasks

# Name of this step; also used as the name of its output directory.
simulation_name = "04_two_state"
design_dir = "/home/broerman/crispy_shifty/projects/crispy_shifty_dimers/round_3/design"
design_list_file = f"{design_dir}/03_pair_looped/03_paired_looped_poses.list"
output_path = f"{design_dir}/{simulation_name}"

# Option strings, space-joined into the single string handed to gen_array_tasks.
option_flags = ["out:level 200"]
options = " ".join(option_flags)

# Keyword arguments passed through gen_array_tasks via `extra_kwargs`
# (presumably forwarded to the distributed function; verify in gen_array_tasks).
extra_kwargs = dict(
    num_sequences="10",
    batch_size="10",
    mpnn_temperature=0.2,
    mpnn_design_area="full",
    mpnn_betas="scan",
)

# Generate the array-task files for this step; the recorded cell output reports
# the resulting `sbatch` command to run.
gen_array_tasks(
    simulation_name=simulation_name,
    distribute_func="crispy_shifty.protocols.mpnn.mpnn_dimers",
    design_list_file=design_list_file,
    output_path=output_path,
    options=options,
    extra_kwargs=extra_kwargs,
    queue="short",
    memory="12G",
    nstruct=1,
    nstruct_per_task=5,
)

Run the following command with your desired environment active:
sbatch -a 1-1012 /home/broerman/crispy_shifty/projects/crispy_shifty_dimers/round_3/design/04_two_state/run.sh


#### Collect scorefiles of the MPNN designed paired states and concatenate ####

In [2]:
import os, sys

# Make the local crispy_shifty checkout importable ahead of any installed copy.
sys.path.insert(0, '/home/broerman/crispy_shifty/')
from crispy_shifty.utils.io import collect_score_file

simulation_name = "04_two_state"
output_path = f"/home/broerman/crispy_shifty/projects/crispy_shifty_dimers/round_3/design/{simulation_name}"

# Only collect when the concatenated scorefile is not already on disk, so
# re-running this cell is a no-op once scores.json exists.
scores_json = os.path.join(output_path, "scores.json")
if not os.path.exists(scores_json):
    collect_score_file(output_path, "scores")

  from distributed.utils import tmpfile


#### Load resulting concatenated scorefile ####

In [1]:
# `os` must be imported here: this is the first cell executed after a kernel
# restart, and it calls os.path.join below.
import os
import sys

# Make the local crispy_shifty checkout importable ahead of any installed copy.
sys.path.insert(0, '/home/broerman/crispy_shifty/')
from crispy_shifty.utils.io import parse_scorefile_linear

simulation_name = "04_two_state"
output_path = f"/home/broerman/crispy_shifty/projects/crispy_shifty_dimers/round_3/design/{simulation_name}"

# Parse the concatenated scorefile into a DataFrame.
scores_df = parse_scorefile_linear(os.path.join(output_path, "scores.json"))
# Let pandas infer the best nullable dtype for each column.
scores_df = scores_df.convert_dtypes()

  from distributed.utils import tmpfile
100%|██████████| 15345/15345 [01:29<00:00, 171.70it/s]


#### Save individual fastas ####

In [4]:
from crispy_shifty.utils.io import df_to_fastas

# Save fastas for the "mpnn_seq"-prefixed sequences of every design and rebind
# scores_df to the returned frame.
# NOTE(review): the warning in the recorded output suggests df_to_fastas adds a
# "fasta_path" column — confirm against its implementation.
scores_df = df_to_fastas(
    scores_df,
    prefix="mpnn_seq",
)

  0%|          | 0/15345 [00:00<?, ?it/s]

  df["fasta_path"] = df.progress_apply(mask, args=(out_path), axis=1)


#### Save a list of outputs ####

In [5]:
from tqdm.auto import tqdm

# Write each index entry of scores_df on its own line; tqdm shows progress.
with open(os.path.join(output_path, "mpnn_paired_states.list"), "w") as f:
    f.writelines(f"{path}\n" for path in tqdm(scores_df.index))

  0%|          | 0/15345 [00:00<?, ?it/s]

#### Concat the pdb.bz2 and fasta paths into a single list, for Superfold reasons ####

In [6]:
# Each output line is "<pdb path>____<fasta path>"; the fasta path is derived
# from the pdb path by swapping "decoys" -> "fastas" and "pdb.bz2" -> "fa".
with open(os.path.join(output_path, "mpnn_paired_states.pair"), "w") as f:
    for pdb_path in tqdm(scores_df.index):
        fasta_path = pdb_path.replace("decoys", "fastas").replace("pdb.bz2", "fa")
        f.write(pdb_path + "____" + fasta_path + "\n")

  0%|          | 0/15345 [00:00<?, ?it/s]

Go fold on perlmutter

After folding a sample on Perlmutter, we noticed that the 50/50 beta split very rarely resulted in folded dimers. So, for the full folding run, only fold sequences from the 40/60 or 30/70 beta distributions.

In [2]:
# Drop designs generated with the 50/50 beta split ('0.5,0.5'); keep the rest.
query = "mpnn_msd_betas != '0.5,0.5'"

# Take an explicit copy so that later column assignments on filtered_df
# (e.g. inside df_to_fastas) operate on an independent frame instead of a
# slice of scores_df, which raises pandas' SettingWithCopyWarning.
filtered_df = scores_df.query(query).copy()
print(len(filtered_df))

10230


#### Save individual fastas ####

In [3]:
from crispy_shifty.utils.io import df_to_fastas

# Save fastas for the filtered designs only. "mpnn_seq_0000" is excluded: these
# are the rosetta sequences, which do not need to be folded.
# NOTE(review): this rebinds scores_df to the result for the filtered frame, so
# the unfiltered scores are no longer available under that name afterwards.
scores_df = df_to_fastas(
    filtered_df,
    prefix="mpnn_seq",
    exclude="mpnn_seq_0000",
)

100%|██████████| 10230/10230 [00:26<00:00, 387.28it/s]
  df["fasta_path"] = df.progress_apply(mask, args=(out_path), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["fasta_path"] = df.progress_apply(mask, args=(out_path), axis=1)


#### Save a list of outputs ####

In [4]:
from tqdm.auto import tqdm

# Write each index entry of filtered_df on its own line; tqdm shows progress.
with open(os.path.join(output_path, "mpnn_paired_states_filtered.list"), "w") as f:
    f.writelines(f"{path}\n" for path in tqdm(filtered_df.index))

100%|██████████| 10230/10230 [00:00<00:00, 769024.64it/s]


#### Concat the pdb.bz2 and fasta paths into a single list, for Superfold reasons ####

In [5]:
# Each output line is "<pdb path>____<fasta path>"; the fasta path is derived
# from the pdb path by swapping "decoys" -> "fastas" and "pdb.bz2" -> "fa".
with open(os.path.join(output_path, "mpnn_paired_states_filtered.pair"), "w") as f:
    for pdb_path in tqdm(filtered_df.index):
        fasta_path = pdb_path.replace("decoys", "fastas").replace("pdb.bz2", "fa")
        f.write(pdb_path + "____" + fasta_path + "\n")

100%|██████████| 10230/10230 [00:00<00:00, 512949.71it/s]


Go fold on perlmutter