# Prep filtered scaffold sets for distributed design

### Imports

In [1]:
%load_ext lab_black
# Python standard library
from glob import glob
import os
import socket
import sys

# 3rd party library imports
import dask
import matplotlib.pyplot as plt
import pandas as pd
import pyrosetta
import numpy as np
import scipy
import seaborn as sns
from tqdm.auto import tqdm  # jupyter compatible progress bar

tqdm.pandas()  # link tqdm to pandas
# Notebook magic
# save plots in the notebook
%matplotlib inline
# reloads modules automatically before executing cells
%load_ext autoreload
%autoreload 2
print(f"running in directory: {os.getcwd()}")  # where are we?
print(f"running on node: {socket.gethostname()}")  # what node are we on?

running in directory: /mnt/projects/crispy_shifty/notebooks
running on node: dig136


### Set working directory to the root of the crispy_shifty repo

In [2]:
os.chdir("/projects/crispy_shifty")

### Load a dataframe of filtered scaffolds and associated metadata
These scaffolds had AF2 run on them; for their best quality prediction out of the 5 AF2 ptm models they have > 92 plddt and < 1.5 RMSD to design.  
We will also make the task generator here

In [3]:
def create_tasks(scaffolds, options):
    metadata_to_keep = [
        "pdb",
        "topo",
        "best_model",
        "best_average_plddts",
        "best_ptm",
        "best_rmsd_to_input",
        "best_average_DAN_plddts",
        "scaffold_type",
    ]
    for _, row in scaffolds.iterrows():
        metadata = dict(row)
        metadata = {k: v for k, v in metadata.items() if k in metadata_to_keep}
        pdb_path = metadata["pdb"]
        tasks = {}
        tasks["extra_options"] = options
        tasks["metadata"] = metadata
        tasks["pdb_path"] = pdb_path
        yield tasks


scaffolds = pd.read_csv(
    os.path.join(os.getcwd(), "scaffolds/00_filter_scaffold_sets/all_filtered.csv")
)

### Domesticate the scaffolds by trimming off leading and trailing loops, designing away disulfides and adding metadata to the output pdb.bz2s. 

In [4]:
# Python standard library
import os
import pwd
import socket
import sys

# 3rd party library imports
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

# Rosetta library imports
from pyrosetta.distributed.cluster.core import PyRosettaCluster

# Custom library imports
sys.path.insert(0, os.getcwd())
from crispy_shifty.protocols.cleaning import (
    remove_terminal_loops,
    redesign_disulfides,
)  # the functions we will distribute


print(
    "run the following from your local terminal to port forward the dashboard to localhost"
)
print(
    f"ssh -L 8000:localhost:8787 {pwd.getpwuid(os.getuid()).pw_name}@{socket.gethostname()}"
)
print("dashboard is now visible at localhost:8000")
print(f"can also view dashboard at {socket.gethostname()}:8787 without port forwarding")
options = {
    "-out:level": "200",  # warning outputs only
    "-corrections::beta_nov16": "true",
    "-detect_disulf": "false",
    "-holes:dalphaball": "/home/bcov/ppi/tutorial_build/main/source/external/DAlpahBall/DAlphaBall.gcc",
    "-indexed_structure_store:fragment_store": "/home/bcov/sc/scaffold_comparison/data/ss_grouped_vall_all.h5",
}
output_path = os.path.join(os.getcwd(), "scaffolds/01_prep_inputs")
os.makedirs(output_path, exist_ok=True)

if __name__ == "__main__":
    # configure SLURM cluster as a context manager
    with SLURMCluster(
        cores=1,
        processes=1,
        job_cpu=1,
        memory="12GB",
        queue="short",
        walltime="3:00:00",
        death_timeout=120,
        local_directory="$TMPDIR", # spill worker litter on local node temp storage
        log_directory="/mnt/home/pleung/logs/slurm_logs",
        extra=["--lifetime", "3h", "--lifetime-stagger", "5m"],
    ) as cluster:
        print(cluster.job_script())
        # scale between 1-1000 workers,
        cluster.adapt(
            minimum=1,
            maximum=1000,
            wait_count=999,  # Number of consecutive times that a worker should be suggested for removal it is removed
            interval="5s",  # Time between checks
        )
        # setup a client to interact with the cluster as a context manager
        with Client(cluster) as client:
            print(client)
            client.upload_file(
                os.path.join(os.getcwd(), "crispy_shifty/protocols/cleaning.py")
            )  # upload the script that contains the functions to distribute
            PyRosettaCluster(
                client=client,
                logging_level="WARNING",
                output_path=output_path,
                project_name="crispy_shifty",
                scratch_dir=output_path,
                simulation_name="notebooks_01_prep_inputs",
                tasks=create_tasks(scaffolds, options),
            ).distribute(protocols=[remove_terminal_loops, redesign_disulfides])
            client.close()
        cluster.scale(0)
        cluster.close()
    print("distributed run complete")

run the following from your local terminal to port forward the dashboard to localhost
ssh -L 8000:localhost:8787 pleung@dig136
dashboard is now visible at localhost:8000
can also view dashboard at dig136:8787 without port forwarding
#!/usr/bin/env bash

#SBATCH -J dask-worker
#SBATCH -e /mnt/home/pleung/logs/slurm_logs/dask-worker-%J.err
#SBATCH -o /mnt/home/pleung/logs/slurm_logs/dask-worker-%J.out
#SBATCH -p medium
#SBATCH -n 1
#SBATCH --cpus-per-task=1
#SBATCH --mem=12G
#SBATCH -t 23:30:00
export PYTHONPATH='$PATH:/software/conda'
/home/pleung/.conda/envs/crispy/bin/python -m distributed.cli.dask_worker tcp://172.16.131.240:45613 --nthreads 1 --memory-limit 11.18GiB --name dummy-name --nanny --death-timeout 120 --local-directory $TMPDIR/dask --lifetime 23h --lifetime-stagger 5m --protocol tcp://

<Client: 'tcp://172.16.131.240:45613' processes=0 threads=0, memory=0 B>
distributed run complete


### Unused blocks

In [4]:
import pyrosetta

pyrosetta.init(
    "-corrections::beta_nov16 true \
    -detect_disulf false \
    -holes:dalphaball /software/rosetta/DAlphaBall.gcc \
    -indexed_structure_store:fragment_store /home/bcov/sc/scaffold_comparison/data/ss_grouped_vall_all.h5"
)


sys.path.insert(0, "/projects/crispy_shifty/")
from crispy_shifty.protocols.cleaning import remove_terminal_loops, redesign_disulfides


t = next(
    remove_terminal_loops(
        None,
        pdb_path="/net/shared/scaffolds/pre_scaffold_DB/tj_junctions/DHR52_DHR80_l3_t0_t0_1_v4c.pdb",
        metadata={
            "pdb": "/net/shared/scaffolds/pre_scaffold_DB/tj_junctions/DHR52_DHR80_l3_t0_t0_1_v4c.pdb",
            "topo": "HHHHHHHH",
            "best_model": 2,
            "best_average_plddts": 96.0650150399,
            "best_ptm": 0.8458813818,
            "best_rmsd_to_input": 1.2088154485,
            "best_average_DAN_plddts": 0.947265625,
            "scaffold_type": "tj_junctions",
        },
    )
)

t2 = redesign_disulfides(t)

PyRosetta-4 2021 [Rosetta PyRosetta4.conda.linux.cxx11thread.serialization.CentOS.python38.Release 2021.31+release.c7009b3115c22daa9efe2805d9d1ebba08426a54 2021-08-07T10:04:12] retrieved from: http://www.pyrosetta.org
(C) Copyright Rosetta Commons Member Institutions. Created in JHU by Sergey Lyskov and PyRosetta Team.
core.init: {0} Checking for fconfig files in pwd and ./rosetta/flags
core.init: {0} Rosetta version: PyRosetta4.conda.linux.cxx11thread.serialization.CentOS.python38.Release r292 2021.31+release.c7009b3 c7009b3115c22daa9efe2805d9d1ebba08426a54 http://www.pyrosetta.org 2021-08-07T10:04:12
core.init: {0} command: PyRosetta -corrections::beta_nov16 true -detect_disulf false -holes:dalphaball /home/bcov/ppi/tutorial_build/main/source/external/DAlpahBall/DAlphaBall.gcc -indexed_structure_store:fragment_store /home/bcov/sc/scaffold_comparison/data/ss_grouped_vall_all.h5 -database /home/pleung/.conda/envs/crispy/lib/python3.8/site-packages/pyrosetta/database
basic.random.init_r

In [5]:
t2 = next(t2)

protocols.rosetta_scripts.RosettaScriptsParser: {0} Generating XML Schema for rosetta_scripts...
protocols.rosetta_scripts.RosettaScriptsParser: {0} ...done
protocols.rosetta_scripts.RosettaScriptsParser: {0} Initializing schema validator...
protocols.rosetta_scripts.RosettaScriptsParser: {0} ...done
protocols.rosetta_scripts.RosettaScriptsParser: {0} Validating input script...
protocols.rosetta_scripts.RosettaScriptsParser: {0} ...done
protocols.rosetta_scripts.RosettaScriptsParser: {0} Parsed script:
<ROSETTASCRIPTS>
	<SCOREFXNS>
		<ScoreFunction name="sfxn" symmetric="0" weights="beta_nov16"/>
	</SCOREFXNS>
	<RESIDUE_SELECTORS>
		<Index name="cys" resnums="1,43"/>
		<Not name="not_cys" selector="cys"/>
		<Neighborhood name="around_cys" selector="cys"/>
		<Or name="cys_or_around_cys" selectors="cys,around_cys"/>
		<Not name="not_cys_or_around_cys" selector="cys_or_around_cys"/>
		<Layer name="surface" select_boundary="false" select_core="false" select_surface="true" use_sidechain_nei

In [6]:
t2.sequence()

AttributeError: 'PackedPose' object has no attribute 'sequence'

In [7]:
t2.scores

{'best_average_DAN_plddts': '0.947265625',
 'best_average_plddts': '96.0650150399',
 'best_model': '2',
 'best_ptm': '0.8458813818',
 'best_rmsd_to_input': '1.2088154485',
 'dslf_fa13': 0.0,
 'fa_atr': -607.6482203100481,
 'fa_dun_dev': 9.232464616839039,
 'fa_dun_rot': 90.82582021504003,
 'fa_dun_semi': 177.14759405039172,
 'fa_elec': -229.15349882051757,
 'fa_intra_atr_xover4': -40.8593768827536,
 'fa_intra_elec': -31.136729079718407,
 'fa_intra_rep_xover4': 32.00859062092555,
 'fa_intra_sol_xover4': 32.17840356736296,
 'fa_rep': 98.7096720844281,
 'fa_sol': 536.9930580874362,
 'hbond_bb_sc': -7.657561777173297,
 'hbond_lr_bb': 0.0,
 'hbond_sc': -21.04185735505566,
 'hbond_sr_bb': -103.44243962608486,
 'hxl_tors': 7.44414321298515,
 'lk_ball': 303.1222445313518,
 'lk_ball_bridge': 2.950289574729179,
 'lk_ball_bridge_uncpl': 23.770311471080888,
 'lk_ball_iso': 737.7302354518708,
 'omega': 6.246759476485635,
 'p_aa_pp': -37.57802230767181,
 'pdb': '/net/shared/scaffolds/pre_scaffold_DB