# MPNN design using the crispy_shifty modules
Variables in ALL_CAPS are the ones you should change

### Imports

In [None]:
%load_ext lab_black
# Python standard library
from glob import glob
import os
import socket
import sys

# 3rd party library imports
import dask
import matplotlib.pyplot as plt
import pandas as pd
import pyrosetta
import numpy as np
import scipy
import seaborn as sns
from tqdm.auto import tqdm  # jupyter compatible progress bar

tqdm.pandas()  # register tqdm with pandas (enables e.g. `progress_apply`)
# Notebook magics
# render matplotlib figures inline so plots are saved in the notebook
%matplotlib inline
# reload edited modules automatically before executing each cell
%load_ext autoreload
%autoreload 2
print(f"running in directory: {os.getcwd()}")  # current working directory
print(f"running on node: {socket.gethostname()}")  # hostname of this machine

### Set working directory to the root of the crispy_shifty repo
**Change this to wherever you cloned it**

In [None]:
# Root of your crispy_shifty clone; edit this for your own machine.
WORKING_DIRECTORY = "/home/pleung/projects/crispy_shifty"

# Later cells build their paths from os.getcwd() and import crispy_shifty
# relative to this directory, so it must be set before they run.
os.chdir(WORKING_DIRECTORY)

### Prep a list of designs
To do this, you'll need a list with the fullpath of your designs, one per line.
Here's how to do it with Python; you could also run `cd <dir full of your designs> ; realpath *.pdb > designs.list` or something similar.

In [None]:
# we'll use TJ's DHRs for this demo, change this to the location of your designs
INPUT_PATH = "TODO"
# we'll use a directory in demos for our output, you should change this too
OUTPUT_PATH = "TODO"
os.makedirs(OUTPUT_PATH, exist_ok=True)
inputs_list = os.path.join(OUTPUT_PATH, "inputs.list")
# Open in write mode: the default mode is read-only, which raises
# FileNotFoundError when the list does not exist yet and cannot be printed to.
# The `with` statement is a context manager, so the file is closed when the
# block exits, preventing filesystem errors.
with open(inputs_list, "w") as f:
    # sorted() keeps the list order reproducible between runs; abspath writes
    # the full path of each design, one per line, even if INPUT_PATH is relative
    for pdb in sorted(glob(os.path.join(INPUT_PATH, "*.pdb"))):
        print(os.path.abspath(pdb), file=f)

### Run MPNN on your designs
We'll make array tasks since that is easiest. If you need to profile your code or want better analytics, you should use the `dask` stack; just be aware that it can only scale up to around 20k CPU hours or so on the digs.

In [3]:
# crispy_shifty is importable here because the working directory was set to
# the root of the repo earlier in the notebook
from crispy_shifty.utils.io import gen_array_tasks

SIMULATION_NAME = "mpnn_demo"  # change this to whatever you want
design_list_file = inputs_list  # the file we just made

DISTRIBUTE_FUNC = "crispy_shifty.protocols.mpnn.mpnn_bound_state"  # TODO

# pyrosetta init options, joined into one space-separated string
pyrosetta_flags = [
    "out:level 200",  # warnings and errors only, normal output is 300
]
OPTIONS = " ".join(pyrosetta_flags)

# options for controlling function behavior
# NOTE(review): EXTRA_OPTIONS is defined but not passed to gen_array_tasks below
EXTRA_OPTIONS = "TODO"

QUEUE = "medium"  # change if needed
MEMORY = "4G"  # change if needed

# gather every keyword argument in one place, then make the array tasks
array_task_settings = {
    "distribute_func": DISTRIBUTE_FUNC,
    "design_list_file": design_list_file,
    "output_path": OUTPUT_PATH,  # the output dir we just made
    "queue": QUEUE,
    "memory": MEMORY,
    "nstruct": 1,
    "nstruct_per_task": 1,
    "options": OPTIONS,
    "simulation_name": SIMULATION_NAME,
}
gen_array_tasks(**array_task_settings)

  from distributed.utils import tmpfile


https://docs.anaconda.com/anaconda/install

Run the following command with your desired environment active:
sbatch -a 1-$(cat /mnt/home/pleung/projects/crispy_shifty/projects/crispy_shifties/02_mpnn_bound_states/tasks.cmds | wc -l) /mnt/home/pleung/projects/crispy_shifty/projects/crispy_shifties/02_mpnn_bound_states/run.sh


In [4]:
# Submit the array job using the sbatch command printed by the previous cell;
# `-a 1-N` requests one array task per line of tasks.cmds (counted with `wc -l`).
!sbatch -a 1-$(cat /mnt/home/pleung/projects/crispy_shifty/projects/crispy_shifties/02_mpnn_bound_states/tasks.cmds | wc -l) /mnt/home/pleung/projects/crispy_shifty/projects/crispy_shifties/02_mpnn_bound_states/run.sh

Submitted batch job 5826055


### Collect scorefiles of designed bound states and concatenate
TODO change to projects dir

In [3]:
# TODO change to projects dir
# "~" is never expanded in sys.path entries, so expand it explicitly; otherwise
# the inserted entry points at a directory literally named "~" and is useless.
repo_root = os.path.expanduser("~/projects/crispy_shifty")
# avoid stacking duplicate entries when this cell is re-run
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)
from crispy_shifty.utils.io import collect_score_file

simulation_name = "02_mpnn_bound_states"
output_path = os.path.join(os.getcwd(), f"projects/crispy_shifties/{simulation_name}")

# only collect when scores.json is not already present in the output directory
# (presumably scores.json is what collect_score_file writes — confirm)
if not os.path.exists(os.path.join(output_path, "scores.json")):
    collect_score_file(output_path, "scores")

https://docs.anaconda.com/anaconda/install



  from distributed.utils import tmpfile


### Load resulting concatenated scorefile
TODO change to projects dir

In [4]:
# TODO change to projects dir
# "~" is never expanded in sys.path entries, so expand it explicitly; otherwise
# the inserted entry points at a directory literally named "~" and is useless.
repo_root = os.path.expanduser("~/projects/crispy_shifty")
# avoid stacking duplicate entries when this cell is re-run
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)
from crispy_shifty.utils.io import parse_scorefile_linear

# `simulation_name` comes from the scorefile-collection cell above
output_path = os.path.join(os.getcwd(), f"projects/crispy_shifties/{simulation_name}")

# load the concatenated scorefile, then let pandas pick the best
# nullable dtype for each column
scores_df = parse_scorefile_linear(os.path.join(output_path, "scores.json"))
scores_df = scores_df.convert_dtypes()

  0%|          | 0/1305 [00:00<?, ?it/s]

### Setup for plotting

In [5]:
# Global seaborn appearance applied to every figure drawn after this cell
plot_theme = {
    "context": "talk",  # preset sizing of plot elements
    "font_scale": 1,  # multiplier for font sizes; raise it for larger text
    "style": "ticks",  # white background with tick marks on the axes
    "palette": "colorblind",  # a color palette that is colorblind friendly
}
sns.set(**plot_theme)

### Data exploration
Remove the Rosetta score function (sfxn) score terms for now.

In [6]:
from crispy_shifty.protocols.design import beta_nov16_terms

# keep only the columns that are not listed in beta_nov16_terms
non_sfxn_columns = [
    column for column in scores_df.columns if column not in beta_nov16_terms
]
scores_df = scores_df[non_sfxn_columns]

# report the number of rows and the remaining column names
n_rows = len(scores_df)
print(n_rows)
remaining_columns = list(scores_df.columns)
print(remaining_columns)

1305
['bb_clash', 'best_average_DAN_plddts', 'best_average_plddts', 'best_model', 'best_ptm', 'best_rmsd_to_input', 'buns_parent', 'cms_AcB', 'cms_AnAc', 'cms_AnAcB', 'cms_AnB', 'docked_helix', 'dssp', 'exposed_hydrophobics_parent', 'geometry_parent', 'holes_all_parent', 'holes_core_parent', 'loop_dist', 'loop_sc', 'looped_length', 'mismatch_probability_parent', 'mpnn_seq_0000', 'mpnn_seq_0001', 'mpnn_seq_0002', 'mpnn_seq_0003', 'mpnn_seq_0004', 'mpnn_seq_0005', 'mpnn_seq_0006', 'mpnn_seq_0007', 'mpnn_seq_0008', 'mpnn_seq_0009', 'mpnn_seq_0010', 'mpnn_seq_0011', 'mpnn_seq_0012', 'mpnn_seq_0013', 'mpnn_seq_0014', 'mpnn_seq_0015', 'mpnn_seq_0016', 'mpnn_seq_0017', 'mpnn_seq_0018', 'mpnn_seq_0019', 'mpnn_seq_0020', 'mpnn_seq_0021', 'mpnn_seq_0022', 'mpnn_seq_0023', 'mpnn_seq_0024', 'mpnn_seq_0025', 'mpnn_seq_0026', 'mpnn_seq_0027', 'mpnn_seq_0028', 'mpnn_seq_0029', 'mpnn_seq_0030', 'mpnn_seq_0031', 'mpnn_seq_0032', 'mpnn_seq_0033', 'mpnn_seq_0034', 'mpnn_seq_0035', 'mpnn_seq_0036', 'mpnn_

### Save individual fastas
TODO change to projects dir

In [7]:
# TODO change to projects dir
# "~" is never expanded in sys.path entries, so expand it explicitly; otherwise
# the inserted entry points at a directory literally named "~" and is useless.
repo_root = os.path.expanduser("~/projects/crispy_shifty")
# avoid stacking duplicate entries when this cell is re-run
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)
from crispy_shifty.utils.io import df_to_fastas

# NOTE(review): output_path is recomputed here but is not passed to
# df_to_fastas — confirm where the fasta files actually get written
output_path = os.path.join(os.getcwd(), f"projects/crispy_shifties/{simulation_name}")

# process the columns selected by the "mpnn_seq" prefix and keep the returned frame
scores_df = df_to_fastas(scores_df, prefix="mpnn_seq")

  0%|          | 0/1305 [00:00<?, ?it/s]

  df["fasta_path"] = df.progress_apply(mask, args=(out_path), axis=1)


### Save a list of outputs

In [8]:
simulation_name = "02_mpnn_bound_states"
output_path = os.path.join(os.getcwd(), f"projects/crispy_shifties/{simulation_name}")

# one index entry of scores_df per line
states_list_path = os.path.join(output_path, "mpnn_states.list")
with open(states_list_path, "w") as list_file:
    for decoy_path in tqdm(scores_df.index):
        list_file.write(f"{decoy_path}\n")

  0%|          | 0/1305 [00:00<?, ?it/s]

### Concat the pdb.bz2 and fasta paths into a single list, for reasons

In [9]:
simulation_name = "02_mpnn_bound_states"
output_path = os.path.join(os.getcwd(), f"projects/crispy_shifties/{simulation_name}")

# each line is "<index entry>____<fasta path>", where the fasta path is the
# index entry with "decoys" -> "fastas" and "pdb.bz2" -> "fa" substituted
pair_file_path = os.path.join(output_path, "mpnn_states.pair")
with open(pair_file_path, "w") as pair_file:
    for decoy_path in tqdm(scores_df.index):
        fasta_path = decoy_path.replace("decoys", "fastas").replace("pdb.bz2", "fa")
        print("____".join((decoy_path, fasta_path)), file=pair_file)

  0%|          | 0/1305 [00:00<?, ?it/s]