# MPNN design using the crispy_shifty modules
Variables in ALL_CAPS are the ones you should change

## Example 1: Naive redesign of DHRs

### Imports

In [1]:
%load_ext lab_black
# Python standard library
from glob import glob
import os
import socket
import sys

# 3rd party library imports
import dask
import matplotlib.pyplot as plt
import pandas as pd
import pyrosetta
import numpy as np
import scipy
import seaborn as sns
from tqdm.auto import tqdm  # jupyter compatible progress bar

tqdm.pandas()  # link tqdm to pandas
# Notebook magic
# save plots in the notebook
%matplotlib inline
# reloads modules automatically before executing cells
%load_ext autoreload
%autoreload 2
print(f"running in directory: {os.getcwd()}")  # where are we?
print(f"running on node: {socket.gethostname()}")  # what node are we on?

running in directory: /mnt/home/pleung/projects/crispy_shifty/demos
running on node: dig201


### Set working directory to the root of the crispy_shifty repo
**Change this to wherever you cloned it**

In [2]:
WORKING_DIRECTORY = "/home/pleung/projects/crispy_shifty"

os.chdir(WORKING_DIRECTORY)

### Prep a list of designs
To do this, you'll need a list with the fullpath of your designs, one per line.
Here's how to do it with Python; you could also run `cd <dir full of your designs> ; realpath *.pdb > designs.list` or something similar.

In [3]:
# we'll use TJ's DHRs for this demo, change this to the location of your designs
INPUT_PATH = os.path.join(os.getcwd(), "demos/mpnn_pipeline_example/inputs")
# we'll use a directory in demos for our output, you should change this too
OUTPUT_PATH = os.path.join(os.getcwd(), "demos/mpnn_pipeline_example/outputs")
os.makedirs(OUTPUT_PATH, exist_ok=True)
inputs_list = os.path.join(INPUT_PATH, "inputs.list")
# glob() returns matches in a filesystem-dependent order; sort them so that the
# line number of each design in inputs.list is the same on every run
design_paths = sorted(glob(os.path.join(INPUT_PATH, "DHR*.pdb")))
# 'with open' statement is a context manager, when we exit the with statement,
# the file is closed, preventing filesystem errors
with open(inputs_list, "w") as f:
    # one full path per line
    for pdb in design_paths:
        print(pdb, file=f)

### Run MPNN to naively redesign entire DHRs
We'll make array tasks since that is easiest. If you need to profile your code or want better analytics, you should use the `dask` stack; just be aware that it can only scale up to around 20k CPU hours or so on the digs.

In [4]:
# Configure and generate the array tasks for the MPNN redesign run.
# Judging by this cell's output, gen_array_tasks prints the `sbatch` command to
# submit, pointing at a run.sh inside OUTPUT_PATH.
# since we set the working dir to the root of the repo, python knows where to load this
from crispy_shifty.utils.io import gen_array_tasks

SIMULATION_NAME = "mpnn_dhr"  # change this to whatever you want
design_list_file = inputs_list  # the file we just made


# presumably the dotted import path of the function each task runs
# NOTE(review): still marked TODO -- confirm this module/function exists before submitting
DISTRIBUTE_FUNC = "crispy_shifty.protocols.demos.mpnn_dhr"  # TODO

# pyrosetta init options
OPTIONS = " ".join(
    [
        "out:level 200",  # warnings and errors only, normal output is 300
    ]
)
# options for controlling function behavior
# NOTE(review): placeholder value; EXTRA_OPTIONS is never passed to
# gen_array_tasks below, so it currently has no effect
EXTRA_OPTIONS = "TODO"

QUEUE = "medium"  # change if needed
MEMORY = "4G"  # change if needed
SHA1 = (
    None  # change this to "" when you are ready to do a production run in your branch
)
# if SHA1 is not None, save the notebook and `git commit -am "SOME MESSAGE"; git push` in your branch before running this cell

gen_array_tasks(
    distribute_func=DISTRIBUTE_FUNC,
    design_list_file=design_list_file,
    output_path=OUTPUT_PATH,  # the output dir we just made
    queue=QUEUE,
    memory=MEMORY,
    nstruct=1,
    nstruct_per_task=1,
    options=OPTIONS,
    sha1=SHA1,
    simulation_name=SIMULATION_NAME,
)

  from distributed.utils import tmpfile


https://docs.anaconda.com/anaconda/install

Run the following command with your desired environment active:
sbatch -a 1-2 /mnt/home/pleung/projects/crispy_shifty/demos/mpnn_pipeline_example/outputs/run.sh


In [6]:
# submit the array job: this is the command printed by the gen_array_tasks cell,
# so re-copy it from that output if you changed OUTPUT_PATH or the input list
!sbatch -a 1-2 /mnt/home/pleung/projects/crispy_shifty/demos/mpnn_pipeline_example/outputs/run.sh

Submitted batch job 12863693


### Collect scorefiles of designed bound states and concatenate
**Change `CLONE_LOC` to wherever you cloned the crispy_shifty repo**

In [8]:
CLONE_LOC = "~/projects/crispy_shifty"
# sys.path entries are used verbatim by the import system: a leading "~" is NOT
# expanded, so expand it here or the inserted entry points at nothing
sys.path.insert(0, os.path.expanduser(CLONE_LOC))  # TODO
from crispy_shifty.utils.io import collect_score_file

simulation_name = SIMULATION_NAME
output_path = OUTPUT_PATH

# only collect once: skip when the concatenated scorefile is already present
# (presumably collect_score_file(output_path, "scores") is what writes scores.json)
if not os.path.exists(os.path.join(output_path, "scores.json")):
    collect_score_file(output_path, "scores")

### Load resulting concatenated scorefile

In [9]:
from crispy_shifty.utils.io import parse_scorefile_linear

output_path = OUTPUT_PATH

# parse the concatenated scorefile and convert the columns to the best available
# dtypes in a single chained expression
scores_df = parse_scorefile_linear(
    os.path.join(output_path, "scores.json")
).convert_dtypes()

  0%|          | 0/2 [00:00<?, ?it/s]

### Setup for plotting

In [10]:
# global seaborn/matplotlib styling for every plot made after this cell
sns.set(
    context="talk",  # seaborn's "talk" preset scales plot elements up for presentations
    font_scale=1,  # multiplier on the context's font sizes; 1 leaves them as-is, raise it for larger text
    style="ticks",  # make the background white with black lines
    palette="colorblind",  # a color palette that is colorblind friendly!
)

### Data exploration
As we can see, the sequences were added to the decoy scores

In [12]:
scores_df.head()

Unnamed: 0,mpnn_seq_0000,mpnn_seq_0001,mpnn_seq_0002,mpnn_seq_0003,mpnn_seq_0004,mpnn_seq_0005,mpnn_seq_0006,mpnn_seq_0007,mpnn_seq_0008,mpnn_seq_0009,...,mpnn_seq_0055,mpnn_seq_0056,mpnn_seq_0057,mpnn_seq_0058,mpnn_seq_0059,mpnn_seq_0060,mpnn_seq_0061,mpnn_seq_0062,mpnn_seq_0063,mpnn_seq_0064
/mnt/home/pleung/projects/crispy_shifty/demos/mpnn_pipeline_example/outputs/decoys/0000/mpnn_dhr_4060a530e985481fa006afaa12150c0b.pdb.bz2,DEEVQEAVERAEELREEAEELIKKARKTGDPELLRKALEALEEAVR...,REEVERRVEEAKERERRAERLIEEARETGDPELLRQAKEELREGIE...,SEEVRRRVEEAKRLREEAERLIKEAREKGDPELLERAQEALKEGIR...,SERVRRRVREAERLLEEAERLREEAEKTGDPELLLEAKEAAERGIE...,EEEVRRRVEEAKRLLEEAERLVEEAERRGDPELLKEAQRALDRGIE...,EEEVERRVEEAKRLLEEAERLVREAEETGDPELLKEAKEKLREGIE...,EEEVRRRVEEAKERLERARRLVEEAEKRGDPELLKEAQEELREGVR...,REEVERRVREARRLREEAERLIKEAKEKGDVELLKEAEEALRRGVE...,SAKVKERVEKAKELRKKAEELIEEAKKKGDPELLKKAKEELRKGVE...,RREVEERVERARELLRRARELVERARREGDPELLEEAREALRRAIE...,...,EEEVERRVEEAKRRKERAERLIERAEKEGDSELLKEAKEELDRGVR...,REEVERRVREAEERRRRAERLIEEAERTGDAELLREARRELEEGIR...,SEEVRRRVEEAKERRRRAEELKREAERTGDPELLKEAEEELREGVE...,SEEVKRRVEEAKRLLEEAERLIERARREGDPELLEKAREALRRGIE...,KEEVERRVEEAKELKEEARRLIEEAEETGDPELLKEAVEKLRRGIE...,REEVERRVREAEERRERAERLVEEAKRTGDPELLREAREELRRGVE...,REEVERRVREARRLREEAERLRREAEETGDPELLREARRALEEGIR...,SEEVRRRVEEARERRRRAERLVEEAERTGDPELLRQAREEVRVGIE...,EEEVKRRVEEAKELKEKAEELVKKAKEKGDPELLKEAKEALKRGVE...,REEVERRVREAEELKREAEELLREARERGDPELLLEAKRALDEGIE...
/mnt/home/pleung/projects/crispy_shifty/demos/mpnn_pipeline_example/outputs/decoys/0000/mpnn_dhr_306e80b8d1f149a88955579892084c50.pdb.bz2,NDEKRKRAEKALQRAQEAEKKGDVEEAVRAAQEAVRAAKESGDNDV...,AEELEREAREAMEEARRALEEGDVRRAVEYLRRAVRAAGLAGSREL...,GEELKREAIRAMEEAEEALKKGDVKRAVESLEKAVEAAGLAGSKEL...,GEELEREAKAAMEEAKKALEKGDVKRAVESLERAVEAAGEAGRKDL...,AEELRRRARRAMEEAREALRRGDVRRAVEALREAVRAAGLAGDKEL...,GEELLREARRAMEEAREALRKGDVRSAVEYLRRAVEAAGLAGSREE...,KEKLKEEAKKAMEKAEEALKEGDVEKAVEYLEKAVKAAGEAGSKEL...,AEELRREAIEAMEKAEEALRKGDVRRAVEYLERAVRAAGLAGSKEL...,GEELRREAREAMREAEEALRKGDVGRAVEALERAVRAAEEAGSREL...,GKELEKEAKEAMEKAEKALEKGDVKKAVEYLERAVKAAGEAGSKEL...,...,AEELRREAREAMEEAERALEEGDVRRAVEYLERAVRAAGEAGSKDL...,AEELREEARRAMEKAREALRKGDVRRAVEYLREAVEAAGKAGSKEL...,AEELLREARRAMEEAREALRRGDVERAVRALERAVRAAGEAGSKEL...,AEELLRRAREAMEEARRALREGDVRRAVEYLREAVEAAGLAGSREL...,AERLREEAREAMREAERALRKGDVRRAVEALRRAVEAAGLAGSKEL...,AEELKREAKRAMEEAREALRVGDVRRAVEYLEKAVEAAGLAGDRDL...,GEELRREAEEAMKKAEEALKKGDVKKAVEYLKEAVKAAEEAGDKDL...,AEELLREAEEAMEKAREALERGDVEEAVRYLERAVEAAGKAGSKEL...,AEELRRRARRAMEEARRALEVGDVERAVEALREAVRAAGEAGSKEL...,AEELRREARRAMEEAERALEKGDVEKAVEELRRAVEAAGLAGDREL...


### Save individual fastas

In [13]:
from crispy_shifty.utils.io import df_to_fastas

output_path = OUTPUT_PATH

# `prefix` matches the mpnn_seq_XXXX columns shown by scores_df.head() above;
# presumably those are the sequences that get written out as fastas
# NOTE(review): scores_df is overwritten with the return value -- confirm that
# df_to_fastas is safe to call on its own output before re-running this cell
scores_df = df_to_fastas(scores_df, prefix="mpnn_seq")

  0%|          | 0/2 [00:00<?, ?it/s]

### Save a list of outputs

In [14]:
output_path = OUTPUT_PATH

# the index of scores_df holds the decoy paths; write them out one per line
designed_list = os.path.join(output_path, "mpnn_designed.list")
with open(designed_list, "w") as handle:
    for decoy_path in tqdm(scores_df.index):
        handle.write(f"{decoy_path}\n")

  0%|          | 0/2 [00:00<?, ?it/s]

### Concat the pdb.bz2 and fasta paths into a single list (one `<pdb.bz2 path>____<fasta path>` pair per line)

In [15]:
output_path = OUTPUT_PATH

pair_list = os.path.join(output_path, "mpnn_designed.pair")
with open(pair_list, "w") as handle:
    for decoy_path in tqdm(scores_df.index):
        # derive the fasta path from the decoy path by swapping the directory
        # name and the extension; note that str.replace hits every occurrence
        fasta_path = decoy_path.replace("decoys", "fastas").replace("pdb.bz2", "fa")
        # each line is "<decoy path>____<fasta path>"
        handle.write(f"{decoy_path}____{fasta_path}\n")

  0%|          | 0/2 [00:00<?, ?it/s]

## Example 2: Junction design, generating all decoys

### Imports

In [16]:
%load_ext lab_black
# Python standard library
from glob import glob
import os
import socket
import sys

# 3rd party library imports
import dask
import matplotlib.pyplot as plt
import pandas as pd
import pyrosetta
import numpy as np
import scipy
import seaborn as sns
from tqdm.auto import tqdm  # jupyter compatible progress bar

tqdm.pandas()  # link tqdm to pandas
# Notebook magic
# save plots in the notebook
%matplotlib inline
# reloads modules automatically before executing cells
%load_ext autoreload
%autoreload 2
print(f"running in directory: {os.getcwd()}")  # where are we?
print(f"running on node: {socket.gethostname()}")  # what node are we on?

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
running in directory: /mnt/home/pleung/projects/crispy_shifty
running on node: dig201


### Set working directory to the root of the crispy_shifty repo
**Change this to wherever you cloned it**

In [17]:
WORKING_DIRECTORY = "/home/pleung/projects/crispy_shifty"

os.chdir(WORKING_DIRECTORY)

### Prep a list of designs
To do this, you'll need a list with the fullpath of your designs, one per line.
Here's how to do it with Python; you could also run `cd <dir full of your designs> ; realpath *.pdb > designs.list` or something similar.

In [3]:
# # we'll use TJ's DHRs for this demo, change this to the location of your designs
# INPUT_PATH = os.path.join(os.getcwd(), "demos/mpnn_pipeline_example/inputs")
# # we'll use a directory in demos for our output, you should change this too
# OUTPUT_PATH = os.path.join(os.getcwd(), "demos/mpnn_pipeline_example/outputs")
# os.makedirs(OUTPUT_PATH, exist_ok=True)
# inputs_list = os.path.join(INPUT_PATH, "inputs.list")
# # 'with open' statement is a context manager, when we exit the with statement,
# # the file is closed, preventing filesystem errors
# with open(inputs_list, "w") as f:
#     for pdb in glob(os.path.join(INPUT_PATH, "DHR*.pdb")):
#         print(pdb, file=f)

### Make a dask distributable / PyRosettaCluster compatible design function

In [None]:
# Python standard library
from typing import Dict, Iterator, List, Optional, Union

# 3rd party library imports
# Rosetta library imports
from pyrosetta.distributed.packed_pose.core import PackedPose
from pyrosetta.distributed import requires_init

# Custom library imports

@requires_init
def mpnn_dhr(
    packed_pose_in: Optional[PackedPose] = None, **kwargs
) -> Iterator[PackedPose]:
    """
    Design the selected residues of each input pose with MPNN and yield the
    resulting decoys.

    :param: packed_pose_in: a PackedPose object to be designed with MPNN. If None,
    the pose(s) are loaded from kwargs["pdb_path"] instead.
    :param: kwargs: keyword arguments to be passed to MPNNDesign, or this function.
    :return: an iterator of PackedPose objects, one for each pose produced by
    MPNNDesign.generate_all_poses (the native sequence is excluded).
    :raises: KeyError if packed_pose_in is None and "pdb_path" is not in kwargs.
    """

    from pathlib import Path
    import sys
    from time import time
    import pyrosetta
    import pyrosetta.distributed.io as io
    from pyrosetta.rosetta.core.select.residue_selector import ChainSelector

    CLONE_LOC = "~/projects/crispy_shifty"

    # insert the root of the repo into the sys.path
    # sys.path entries are used verbatim by the import system, so the "~" has to
    # be expanded explicitly or the entry would never match a real directory
    sys.path.insert(0, str(Path(CLONE_LOC).expanduser()))
    from crispy_shifty.protocols.cleaning import path_to_pose_or_ppose
    from crispy_shifty.protocols.design import interface_between_selectors
    from crispy_shifty.protocols.mpnn import MPNNDesign
    from crispy_shifty.utils.io import print_timestamp

    start_time = time()

    # generate poses or convert input packed pose into pose
    if packed_pose_in is not None:
        poses = [io.to_pose(packed_pose_in)]
        pdb_path = "none"
    else:
        pdb_path = kwargs["pdb_path"]
        poses = path_to_pose_or_ppose(
            path=pdb_path, cluster_scores=True, pack_result=False
        )

    for pose in poses:
        pose.update_residue_neighbors()
        # snapshot the incoming scores so they can be re-applied after design
        scores = dict(pose.scores)
        print_timestamp("Setting up design selector", start_time)
        # MAKE A SELECTOR THAT SELECTS THE RESIDUES TO BE DESIGNED
        # default for this demo: everything in chain 1
        DESIGN_SELECTOR = ChainSelector(1)
        print_timestamp("Designing interface with MPNN", start_time)
        # construct the MPNNDesign object
        mpnn_design = MPNNDesign(
            design_selector=DESIGN_SELECTOR,
            # MPNN understands layers, unsats, helix caps etc so just ban CYS
            omit_AAs="CX",
            **kwargs,
        )
        # design the pose
        mpnn_design.apply(pose)
        print_timestamp("MPNN design complete, updating pose datacache", start_time)
        # update the scores dict
        scores.update(pose.scores)
        # update the pose with the updated scores dict
        for key, value in scores.items():
            pyrosetta.rosetta.core.pose.setPoseExtraScore(pose, key, value)
        # generate decoys with the new sequences
        for decoy_pose in mpnn_design.generate_all_poses(pose, include_native=False):
            ppose = io.to_packed(decoy_pose)
            yield ppose

## Example 3: Sequence design with repeat symmetry

## Example 4: Conformational switch design