# Example 1: Naive MPNN redesign of some DHRs into AF2
Variables in ALL_CAPS are the ones you should change

### Imports

In [1]:
%load_ext lab_black
# Python standard library
from glob import glob
import os
import socket
import sys

# 3rd party library imports
import dask
import matplotlib.pyplot as plt
import pandas as pd
import pyrosetta
import numpy as np
import scipy
import seaborn as sns
from tqdm.auto import tqdm  # jupyter compatible progress bar

tqdm.pandas()  # register tqdm with pandas so pandas operations can show progress bars
# Notebook magics (IPython-only syntax, these lines are not plain Python)
# render matplotlib figures inline so plots are saved in the notebook
%matplotlib inline
# load the autoreload extension; mode 2 reloads modules automatically before
# executing cells, so edits to imported .py files are picked up without a restart
%load_ext autoreload
%autoreload 2
# record where this notebook was executed, useful when reading saved outputs later
print(f"running in directory: {os.getcwd()}")  # where are we?
print(f"running on node: {socket.gethostname()}")  # what node are we on?

running in directory: /mnt/projects/crispy_shifty/demos
running on node: dig36


### Set working directory to the root of the crispy_shifty repo
**Change `WORKING_DIRECTORY` below to wherever you cloned the repo**

In [2]:
# Root of the crispy_shifty clone. This is the only value to edit in this cell.
WORKING_DIRECTORY = "/projects/crispy_shifty"

# Later cells build their input/output paths from os.getcwd(), so move to the
# repo root first.
os.chdir(WORKING_DIRECTORY)

# Put the same clone first on the import path. Reusing WORKING_DIRECTORY (rather
# than repeating the literal path) keeps the import location in sync with the
# directory we just changed into when the constant above is edited.
sys.path.insert(0, WORKING_DIRECTORY)

### Prep a list of designs
To do this, you'll need a list with the full path of each of your designs, one per line.
Here's how to do it with Python; you could also run `cd <dir full of your designs> ; realpath *.pdb > designs.list` or something similar.

In [3]:
import shutil

# we'll use TJ's DHRs for this demo, change this to the location of your designs
INPUT_PATH = os.path.join(os.getcwd(), "demos/mpnn_to_af2_example_1/inputs")
os.makedirs(INPUT_PATH, exist_ok=True)
# we'll use a directory in demos for our output, you should change this too
OUTPUT_PATH = os.path.join(os.getcwd(), "demos/mpnn_to_af2_example_1/outputs")
os.makedirs(OUTPUT_PATH, exist_ok=True)

# copy over the example inputs if they aren't there already.
# Only the DHR pdbs are counted: inputs.list (written below) lives in the same
# directory, so counting every file would make INPUT_PATH look populated on any
# re-run of this cell even when no pdbs were ever copied.
input_pattern = os.path.join(INPUT_PATH, "DHR*.pdb")
pdbs = glob(input_pattern)
if len(pdbs) == 0:
    for file in glob(
        "/projects/crispy_shifty/demos/mpnn_to_af2_example_1/inputs/DHR*.pdb"
    ):
        shutil.copy(file, INPUT_PATH)

inputs_list = os.path.join(INPUT_PATH, "inputs.list")
# 'with open' statement is a context manager, when we exit the with statement,
# the file is closed, preventing filesystem errors
with open(inputs_list, "w") as f:
    # glob returns matches in arbitrary order; sort so inputs.list has the same
    # contents, in the same order, every time this cell is run
    for pdb in sorted(glob(input_pattern)):
        print(pdb, file=f)

### Run MPNN to naively redesign entire DHRs
We'll make array tasks since that is easiest. If you need to profile your code or want better analytics, you should use the `dask` stack; just be aware that it can only scale up to around 20k CPU hours or so on the digs.

In [4]:
# since we set the working dir to the root of the repo, python knows where to load this
# the working dir is the root of the repo, so this project import resolves from there
from crispy_shifty.utils.io import gen_array_tasks

# name for this batch of tasks, change this to whatever you want
SIMULATION_NAME = "mpnn_dhr"
# the list of input pdbs written in the previous step
design_list_file = inputs_list

# dotted path to the function to distribute: `crispy_shifty/protocols/demos.py`
# (relative to the root of the repo) is the file, `mpnn_dhr` is the function in it
DISTRIBUTE_FUNC = "crispy_shifty.protocols.demos.mpnn_dhr"

# pyrosetta init options, joined with spaces into a single string
pyrosetta_flags = [
    "out:level 200",  # warnings and errors only, normal output is 300
]
OPTIONS = " ".join(pyrosetta_flags)

# options for controlling function behavior
EXTRA_OPTIONS = dict(num_sequences="48")  # should be a multiple of 8

# cluster resources, change if needed
QUEUE = "medium"
MEMORY = "4G"

# change SHA1 to "" when you are ready to do a production run in your branch.
# if SHA1 is not None, save the notebook and `git commit -am "SOME MESSAGE"` ;
# then `git push; sleep 15` in your branch before running this cell
SHA1 = None

array_task_kwargs = dict(
    distribute_func=DISTRIBUTE_FUNC,
    design_list_file=design_list_file,
    output_path=OUTPUT_PATH,  # the output dir we just made
    queue=QUEUE,
    memory=MEMORY,
    nstruct=1,
    nstruct_per_task=1,
    options=OPTIONS,
    extra_kwargs=EXTRA_OPTIONS,
    sha1=SHA1,
    simulation_name=SIMULATION_NAME,
)
gen_array_tasks(**array_task_kwargs)

  from distributed.utils import tmpfile


Run the following command with your desired environment active:
sbatch -a 1-2 /mnt/projects/crispy_shifty/demos/mpnn_to_af2_example_1/outputs/run.sh


In [5]:
# submit the generated run.sh as a job array (`-a 1-2`); this is the command that
# the previous cell printed, so update it if that cell prints something different
!sbatch -a 1-2 /mnt/projects/crispy_shifty/demos/mpnn_to_af2_example_1/outputs/run.sh

Submitted batch job 24626055


### Collect scorefiles of designed dhrs and concatenate
**Change `CLONE_LOC` below to wherever you cloned the repo**

In [6]:
# location of the repo clone; it goes first on the import path so the project
# import below resolves from it
CLONE_LOC = "/projects/crispy_shifty"
sys.path.insert(0, CLONE_LOC)
from crispy_shifty.utils.io import collect_score_file

simulation_name, output_path = SIMULATION_NAME, OUTPUT_PATH

# skip the collection when a scores.json already exists in the output directory
scores_json_exists = os.path.exists(os.path.join(output_path, "scores.json"))
if not scores_json_exists:
    collect_score_file(output_path, "scores")

### Load resulting concatenated scorefile

In [7]:
from crispy_shifty.utils.io import parse_scorefile_linear

output_path = OUTPUT_PATH

# parse the concatenated scorefile, then convert the columns to the best
# available dtypes in one chained expression
scores_json = os.path.join(output_path, "scores.json")
scores_df = parse_scorefile_linear(scores_json).convert_dtypes()

  0%|          | 0/2 [00:00<?, ?it/s]

### Data exploration
As we can see, the sequences were added to the decoy scores

In [8]:
# preview the first rows of the parsed scores (bare last expression = rich display)
scores_df.head()

Unnamed: 0,mpnn_seq_0000,mpnn_seq_0001,mpnn_seq_0002,mpnn_seq_0003,mpnn_seq_0004,mpnn_seq_0005,mpnn_seq_0006,mpnn_seq_0007,mpnn_seq_0008,mpnn_seq_0009,...,mpnn_seq_0040,mpnn_seq_0041,mpnn_seq_0042,mpnn_seq_0043,mpnn_seq_0044,mpnn_seq_0045,mpnn_seq_0046,mpnn_seq_0047,mpnn_seq_0048,path_in
/mnt/projects/crispy_shifty/demos/mpnn_to_af2_example_1/outputs/decoys/0000/mpnn_dhr_9a37f03526824c1089959cd6a702314f.pdb.bz2,DEEVQEAVERAEELREEAEELIKKARKTGDPELLRKALEALEEAVR...,EEEARRRVEEAKERRRRAERLRREAKRTGDPRLLREAGRELREAIR...,SERVKRRVEEAKRLKEKAERLVKEAERTGDPRLLREARRELEKAVR...,SSEVRRRVEEAEELRRRAERLVREAERTGDPRLLREAREALREGIR...,SAAVTRAVTEARRLRARAEELIREARRTGSPELLREAQEALRVGIR...,REEVRRRVREAERLRERARRLVEEAERRGDVELLKEARRALRVAVE...,SEEVRRRVRKARELLREAERLVEEARKTGDPELLKKAREALNRGIE...,EEEVRRRVREARERRRRAERLIEEAKRTGDARLLREARKELERGIE...,SEAVRRAVREAKRLRERAERLVREARRTGDPRLLREAREALREAIE...,EEEVRKRVEEAEERKRRAERLIREARRTGDPELLREAKRELKRAIE...,...,SAEVDRRVREARRRRERAERLIERARETGDPELLREARRELDEGIR...,REEVRRRVETAKELRRRAEELIKRARKEGDPELLKTAREALRRGIR...,SEEVERAVEEAERLREEAERLVEEAERTGDVRLLERAEEALRRAIR...,SAEVDRRVTRAEELRARAERLVKEARRKGDPRLLKQAQEALDVGVR...,REEVRRRVREARERRRRAERLVEEAKRTGDPRLLKEAREELRVGVR...,SEEVRRRVEEARRLERRAEELIRRAKEEGDEELLKEARDALEKAIE...,SAEARRRVEEAKRLRREAERLIERGEERGDPDLLKEARDALDEAIR...,SAEVTRAVEEAERLRRRAERLVEEARRTGDPELLEEARRELDRGIR...,SEEVRRRVEEAKELYREAERLIEEARKTGDPELLKEAVEALKRGVE...,/mnt/projects/crispy_shifty/demos/mpnn_to_af2_...
/mnt/projects/crispy_shifty/demos/mpnn_to_af2_example_1/outputs/decoys/0000/mpnn_dhr_b63133c4900e4891b27a361d064ba115.pdb.bz2,NDEKRKRAEKALQRAQEAEKKGDVEEAVRAAQEAVRAAKESGDNDV...,SEELRREARRAMEEARRAMERGDVRRAVRALERAVRAAGKAGSKEL...,GERLREEARRAMDEADRALRRGDVGRAVRALERAVRAAGLAGSRDL...,SEELRREAERAMEEAERALREGDVRRAVEALRRAVRAAGEAGDRDL...,AERLEEEARRAMREAREALRRGDVRRAVRALRRAVRAAGLAGRRDL...,SEELRREARRAMEEARRALKRGDVRRAVEALRRAVEAAGLAGSRDL...,GEELRREAERAMREARRALEKGDVRRAVRALERAVRAAGLAGSRDL...,GAELRRRARRAMEEAREALREGDVGRAVRAMEEAVRAAGLAGSRDL...,SEELRREAREAMREAERALEEGDVRRAVEALRRAVRAAGEAGSRDL...,GERLLEEARRAMEEAERALRRGDVRRAVRALERAVRAAGLAGSREL...,...,GEELRREAREAMREAEEALREGDVRRAVEALERAVRAAGLAGDRAL...,GEELRREARRAMEEARRALKKGDVRRAVEALRRAVRAAGEAGDRDL...,GEELRREAREAMERARRALERGDVREAVRALERAVRAAGLAGSRDL...,GEELEREAREAMREAERALDRGDVREAVRALERAVEAAGRAGRRDL...,GRELREEARRAMERAEEALKEGDVKTAVEAAKRAVEAAGLAGDKEL...,SEELRREARRAMEEAERALERGDVRRAVRELERAVRAAGLAGSREL...,GEELRRRAREAMERARRALDRGDVREAVRALREAVRAAGLAGSRDL...,GERLREEARRAMERAREALKKGDVKKAVEALKRAVKAAGEAGDKEL...,SEELRREAREAMREARRALERGDVRRAVEALERAVDAAALAGDRAL...,/mnt/projects/crispy_shifty/demos/mpnn_to_af2_...


### Save individual fastas

In [9]:
from crispy_shifty.utils.io import df_to_fastas

output_path = OUTPUT_PATH  # not used again inside this cell

# pass the scores table to df_to_fastas with the "mpnn_seq" column prefix and keep
# whatever it returns as the new scores_df
# NOTE(review): presumably this writes one fasta per row under a `fastas` directory
# (the pair-file step maps "decoys" -> "fastas" and "pdb.bz2" -> "fa") — confirm
# against crispy_shifty.utils.io.df_to_fastas
scores_df = df_to_fastas(scores_df, prefix="mpnn_seq")

  0%|          | 0/2 [00:00<?, ?it/s]

### Save a list of outputs

In [10]:
output_path = OUTPUT_PATH
designed_list = os.path.join(output_path, "mpnn_designed.list")

# one scores_df index entry per line, with a progress bar over the entries
with open(designed_list, "w") as f:
    f.writelines(f"{decoy}\n" for decoy in tqdm(scores_df.index))

  0%|          | 0/2 [00:00<?, ?it/s]

### Concat the pdb.bz2 and fasta paths into a single list, for an AF2 run

In [11]:
output_path = OUTPUT_PATH
pair_file = os.path.join(output_path, "mpnn_designed.pair")

# each line is "<decoy path>____<fasta path>"; the fasta path is derived from the
# decoy path by swapping "decoys" -> "fastas" and "pdb.bz2" -> "fa"
with open(pair_file, "w") as f:
    for decoy in tqdm(scores_df.index):
        fasta = decoy.replace("decoys", "fastas").replace("pdb.bz2", "fa")
        f.write(f"{decoy}____{fasta}\n")

  0%|          | 0/2 [00:00<?, ?it/s]

### Run Superfold to fold the DHRs with redesigned sequences

In [12]:
# since we set the working dir to the root of the repo, python knows where to load this
from crispy_shifty.utils.io import gen_array_tasks

SIMULATION_NAME = "fold_dhr"  # change this to whatever you want
design_list_file = os.path.join(
    output_path, "mpnn_designed.pair"
)  # the file we just made

# this is the relative path from the root of the repo to .py file
# `crispy_shifty/protocols/demos.py` and `fold_dhr` is the function in the file
DISTRIBUTE_FUNC = "crispy_shifty.protocols.demos.fold_dhr"

# we'll use a directory in demos for our output, you should change this too
OUTPUT_PATH = os.path.join(os.getcwd(), "demos/mpnn_to_af2_example_1/folded")
os.makedirs(OUTPUT_PATH, exist_ok=True)

# pyrosetta init options
OPTIONS = " ".join(
    [
        "out:level 200",  # warnings and errors only, normal output is 300
    ]
)
# options for controlling function behavior
EXTRA_OPTIONS = {"models": "all"}  # could use just "3" for example

QUEUE = "gpu"  # change if needed, could use medium instead for example
MEMORY = "6G"  # change if needed
GRES = "--gres=gpu:rtx2080:1"
SHA1 = (
    None  # change this to "" when you are ready to do a production run in your branch
)
# if SHA1 is not None, save the notebook and `git commit -am "SOME MESSAGE"` ;
# then `git push; sleep 15` in your branch before running this cell

gen_array_tasks(
    distribute_func=DISTRIBUTE_FUNC,
    design_list_file=design_list_file,
    output_path=OUTPUT_PATH,  # the output dir we just made
    queue=QUEUE,
    memory=MEMORY,
    gres=GRES,
    nstruct=1,
    nstruct_per_task=1,
    options=OPTIONS,
    extra_kwargs=EXTRA_OPTIONS,
    sha1=SHA1,
    simulation_name=SIMULATION_NAME,
)

Run the following command with your desired environment active:
sbatch -a 1-2 /mnt/projects/crispy_shifty/demos/mpnn_to_af2_example_1/folded/run.sh


In [14]:
# submit the generated run.sh as a job array (`-a 1-2`); this is the command that
# the previous cell printed, so update it if that cell prints something different
!sbatch -a 1-2 /mnt/projects/crispy_shifty/demos/mpnn_to_af2_example_1/folded/run.sh

Submitted batch job 24596894


### Collect scorefiles of folded dhrs and concatenate
**Change `CLONE_LOC` below to wherever you cloned the repo**

In [6]:
# location of the repo clone; it goes first on the import path so the project
# import below resolves from it
CLONE_LOC = "/projects/crispy_shifty"
sys.path.insert(0, CLONE_LOC)
from crispy_shifty.utils.io import collect_score_file

simulation_name, output_path = SIMULATION_NAME, OUTPUT_PATH

# skip the collection when a scores.json already exists in the output directory
scores_json_exists = os.path.exists(os.path.join(output_path, "scores.json"))
if not scores_json_exists:
    collect_score_file(output_path, "scores")

### Load resulting concatenated scorefile

In [7]:
from crispy_shifty.utils.io import parse_scorefile_linear

output_path = OUTPUT_PATH

# parse the concatenated scorefile, then convert the columns to the best
# available dtypes in one chained expression
scores_json = os.path.join(output_path, "scores.json")
scores_df = parse_scorefile_linear(scores_json).convert_dtypes()

  0%|          | 0/2 [00:00<?, ?it/s]

### Data exploration
As we can see, the sequences were added to the decoy scores

In [8]:
# preview the first rows of the parsed scores (bare last expression = rich display)
scores_df.head()

Unnamed: 0,mpnn_seq_0000,mpnn_seq_0001,mpnn_seq_0002,mpnn_seq_0003,mpnn_seq_0004,mpnn_seq_0005,mpnn_seq_0006,mpnn_seq_0007,mpnn_seq_0008,mpnn_seq_0009,...,mpnn_seq_0040,mpnn_seq_0041,mpnn_seq_0042,mpnn_seq_0043,mpnn_seq_0044,mpnn_seq_0045,mpnn_seq_0046,mpnn_seq_0047,mpnn_seq_0048,path_in
/mnt/projects/crispy_shifty/demos/mpnn_to_af2_example_1/outputs/decoys/0000/mpnn_dhr_7c5290f17b604a00874b8e84ed32371b.pdb.bz2,DEEVQEAVERAEELREEAEELIKKARKTGDPELLRKALEALEEAVR...,SAEVRERVERARELLREAERLIEEAKKTGDPELLERARRALERAVR...,AEEVRRRVERARELRRRAEELVERARKTGDPRLLLRAVEALRVGVR...,SSAVRRAVEQAERLRREAEELVKRAKTTGDPELLKQAREALRVGIE...,SAEVRRRVRRAEELRREAEELVREARRTGSPELLRRARDALREGIR...,SEEVKRRVEQAKELRKRAEKLKKKAEKEGDPRLLEQAKEALRYGIE...,AEEVRRRVTEAEELRRRAEELRREAERTGDPRLLLEARRALEEGIR...,RKEVREAREKAEELREKAERLVEKAKEKGDPELLKQANEALKKAIR...,SAEVRRRVEEAERLRREAERLVKEGERTGDVRLLREARRALERAIE...,REEVERRVEEAERLREEAERLIEEARTTGDWELLERAREAARRGIE...,...,SAEVRRRVEEAEELRREAERLVREARRTGDPRLLREAREALRRAVE...,SEETRRRVREAEELRRRAERLIEEAKRTGDPELLKEAREALRYGIR...,REEVRRRVEEARRLRRRAERLVEEAERTGDPELLKEARRALEKGIE...,SEEVRRRVEEARRLRRRAEELVRRARERGDPRLLREARRALERGVR...,MEEVRRRVREAKRLKERAERLIKEAERTGSAELLQEAKEALEKGIE...,SEEVTRRVRRAEELLREAERLREEAERTGDPRLLRRAREALRRGIE...,SAEVRRRVEEAKERRRRAERLIERARREGDPELLREAKRELDRGIE...,REEVRRRVEEAERLRRRAERLIEEAKKKGDSELLKEAREALRRGIE...,SAEVRRRVEEAERLRREAERLVREAERTGDPRLLEEARRALERGVR...,/mnt/projects/crispy_shifty/demos/mpnn_to_af2_...
/mnt/projects/crispy_shifty/demos/mpnn_to_af2_example_1/outputs/decoys/0000/mpnn_dhr_39dad348b60b4b8d949301afba65049e.pdb.bz2,NDEKRKRAEKALQRAQEAEKKGDVEEAVRAAQEAVRAAKESGDNDV...,GEELEREARRAMERAREALRKGDVRRAVEELRRAVRAAGRAGSRDL...,SEELRREAERAMREAREALKKGDVGEAVRALRRAVRAAGLAGSRDL...,AEELRREAREAMERAREALKKGDVKEAVEYLRRAVRAAGEAGSRDL...,AEELRREARRAMEEARRALERGDVREAVRALERAVRAAGEAGDREL...,ASELERRARAAMEEADRALERGDVGSAVRALERAVRAAGEAGSRAL...,GSRLREEARRAMERAREALRRGDVGSAVRELERAVRAAGLAGDRRL...,GEELRREARRAMEEAERALERGDVRRAVEAMERAVRAAGLAGDREL...,GAELRREARRAMEEARRALERGDVKSAVEAMRRAVRAAGLAGSRDL...,GEELRRRARRAMEEAEEALKRGDVGKAVRALREAVRAAGLAGDKEL...,...,AEELRRRAREAMEEARRALERGDVRRAVRALEEAVRAAGLAGSRDL...,GEELRREAERAMERAKEALRKGDVKRAVEALKEAVRAAEEAGDKEL...,GEELRREARRAMERAEEALRRGDVRTAVEALRRAVEAAGEAGSKEL...,GERLREEARRAMEEAREALKKGDVKRAVESLRRAVRAAGLAGDRDL...,GEELRREAEEAMREAEEALRRGDVRRAVEALRRAVRAAGLAGSREL...,GEKLREKARKAMEKAREALKRGDVKRAVEALKEAVRAAGLAGDKDL...,SEELLRRARRALEEARRALEEGDVGRAVRYLERAVKAAGEAGSREL...,GEELKKEAKEAMKEAKKAMEKGDVGKAVEALKKAVKAAGKAGSREL...,SEELRRRARRALEEARRALEKGDVERAVEALREAVRAAGEAGSRDL...,/mnt/projects/crispy_shifty/demos/mpnn_to_af2_...
