# Prep filtered scaffold sets for distributed design

### Imports

In [1]:
%load_ext lab_black
# Python standard library
from glob import glob
import os
import socket
import sys

# 3rd party library imports
import dask
import matplotlib.pyplot as plt
import pandas as pd
import pyrosetta
import numpy as np
import scipy
import seaborn as sns
from tqdm.auto import tqdm  # jupyter compatible progress bar

tqdm.pandas()  # link tqdm to pandas
# Notebook magic
# save plots in the notebook
%matplotlib inline
# reloads modules automatically before executing cells
%load_ext autoreload
%autoreload 2
# Record where this notebook ran so the saved output can be traced to a directory and node
print("running in directory:", os.getcwd())  # where are we?
print("running on node:", socket.gethostname())  # what node are we on?

running in directory: /mnt/home/pleung/projects/crispy_shifty/notebooks
running on node: dig30


### Load a dataframe of filtered scaffolds and associated metadata
AF2 was run on these scaffolds; for each one, the best-quality prediction out of the 5 AF2 ptm models has > 92 plddt and < 1.5 RMSD to the design.  
The task generator is also defined here.

In [2]:
def create_tasks(scaffolds, options):
    """Lazily build one task dict per row of ``scaffolds``.

    Args:
        scaffolds: DataFrame with one scaffold per row; each row must have a
            "pdb" column.
        options: object attached, unchanged and uncopied, to every task under
            the "extra_options" key.

    Yields:
        dict with three keys, in this order:
            "extra_options": the ``options`` argument,
            "metadata": the row's values restricted to the whitelisted columns
                (in the row's own column order),
            "pdb_path": the row's "pdb" value.

    Raises:
        KeyError: when a row has no "pdb" column.
    """
    # Only these columns are carried along as per-task metadata
    wanted_columns = (
        "pdb",
        "topo",
        "best_model",
        "best_average_plddts",
        "best_ptm",
        "best_rmsd_to_input",
        "best_average_DAN_plddts",
        "scaffold_type",
    )
    for _, scaffold in scaffolds.iterrows():
        kept = {
            column: scaffold[column]
            for column in scaffold.keys()
            if column in wanted_columns
        }
        yield {
            "extra_options": options,
            "metadata": kept,
            "pdb_path": kept["pdb"],
        }


# Load the table of filtered scaffolds (one row per scaffold) consumed by create_tasks.
# The path has no placeholders, so a plain string literal is used instead of an f-string.
scaffolds = pd.read_csv(
    "/mnt/home/pleung/projects/crispy_shifty/scaffolds/all_filtered.csv"
)

### Domesticate the scaffolds by trimming off leading and trailing loops, designing away disulfides and adding metadata to the output pdb.bz2s. 
TODO: refold the resultant no_cys variants to check that they're ok?  
TODO: install conda into the crispy env?

In [None]:
# Python standard library                                                       
import os
import pwd
import socket
import sys
# 3rd party library imports                                                     
from dask.distributed import Client
from dask_jobqueue import SLURMCluster
# Rosetta library imports                                                       
from pyrosetta.distributed.cluster.core import PyRosettaCluster
# Custom library imports
sys.path.append("/home/pleung/projects/crispy_shifty") # TODO change to projects dir for production
from protocols.cleaning import remove_terminal_loops, redesign_disulfides 


# Extra options passed to create_tasks and attached to every task
options = {
    "-out:level": "200",  # warning outputs only
    "-corrections::beta_nov16": "true",
    "-detect_disulf": "false",
    # NOTE(review): "DAlpahBall" is spelled this way in the path itself; presumably it
    # matches the directory name on disk, so confirm before "correcting" the spelling
    "-holes:dalphaball": "/home/bcov/ppi/tutorial_build/main/source/external/DAlpahBall/DAlphaBall.gcc",
    "-indexed_structure_store:fragment_store": "/home/bcov/sc/scaffold_comparison/data/ss_grouped_vall_all.h5",
}
# TODO change to projects dir for production
output_path = "/home/pleung/projects/crispy_shifty/scaffolds/01_prep_inputs"

# Print instructions for reaching the dask dashboard from a local machine
print(
    "run the following from your local terminal to port forward the dashboard to localhost"
)
print(
    f"ssh -L 8000:localhost:8787 {pwd.getpwuid(os.getuid()).pw_name}@{socket.gethostname()}"
)
print("dashboard is now visible at localhost:8000")
print(
    f"can also view dashboard at {socket.gethostname()}:8787 without port forwarding"
)

# Run both cleaning protocols over every scaffold on a SLURM-backed dask cluster.
# NOTE(review): the output saved with this cell shows a job script using `-p short`,
# `-t 3:30:00` and `--lifetime 3h`, followed by an sbatch "Requested time limit is
# invalid" error. That does not match the queue/walltime/lifetime set below, so the
# saved output appears to come from an earlier version of this cell — re-run to confirm.
if __name__ == "__main__":
    # configure the SLURM cluster as a context manager; each job requests 1 core,
    # 1 process and 4GB of memory on the "medium" queue
    with SLURMCluster(
        cores=1,
        processes=1,
        job_cpu=1,
        memory="4GB",
        queue="medium",
        walltime="23:30:00",
        death_timeout=120,
        local_directory="$TMPDIR/dask",
        log_directory="/mnt/home/pleung/logs/slurm_logs",
        # worker lifetime (23h, staggered by 5m) is set shorter than the 23:30:00
        # walltime — presumably so workers retire before SLURM kills the job
        extra=["--lifetime", "23h", "--lifetime-stagger", "5m"],
    ) as cluster:
        # show the generated sbatch script so the submitted resources can be checked
        print(cluster.job_script())
        # adaptively scale between 1 and 100 workers
        cluster.adapt(
            minimum=1,
            maximum=100,
            wait_count=999,  # Number of consecutive times that a worker should be suggested for removal before it is removed
            interval="5s",  # Time between checks
        )
        # setup a client to interact with the cluster as a context manager
        with Client(cluster) as client:
            print(client)
            # one task per scaffold row comes from create_tasks; the two protocols are
            # passed in the order remove_terminal_loops, redesign_disulfides
            PyRosettaCluster(
                client=client,
                logging_level="WARNING",
                output_path=output_path,
                project_name="crispy_shifty",
                scratch_dir=output_path,
                simulation_name="notebooks_01_prep_inputs",
                tasks=create_tasks(scaffolds, options),
            ).distribute(protocols=[remove_terminal_loops, redesign_disulfides])
            # explicit close; the enclosing `with` block also exits the client
            client.close()
        # explicitly scale to zero workers and close before the `with` block exits
        cluster.scale(0)
        cluster.close()
    print("distributed run complete")

https://docs.anaconda.com/anaconda/install

run the following from your local terminal to port forward the dashboard to localhost
ssh -L 8000:localhost:8787 pleung@dig30
dashboard is now visible at localhost:8000
can also view dashboard at dig30:8787 without port forwarding
#!/usr/bin/env bash

#SBATCH -J dask-worker
#SBATCH -e /mnt/home/pleung/logs/slurm_logs/dask-worker-%J.err
#SBATCH -o /mnt/home/pleung/logs/slurm_logs/dask-worker-%J.out
#SBATCH -p short
#SBATCH -n 1
#SBATCH --cpus-per-task=1
#SBATCH --mem=4G
#SBATCH -t 3:30:00

/home/pleung/.conda/envs/crispy/bin/python -m distributed.cli.dask_worker tcp://172.16.131.60:44615 --nthreads 1 --memory-limit 3.73GiB --name dummy-name --nanny --death-timeout 120 --local-directory $TMPDIR/dask --lifetime 3h --lifetime-stagger 5m --protocol tcp://

<Client: 'tcp://172.16.131.60:44615' processes=0 threads=0, memory=0 B>


`conda env export --prefix /home/pleung/.conda/envs/crispy > environment.yml`
to reproduce this simulation later.
tornado.application - ERROR - Exception in callback functools.partial(<bound method IOLoop._discard_future_result of <zmq.eventloop.ioloop.ZMQIOLoop object at 0x7fcc9a1d8a30>>, <Task finished name='Task-45' coro=<SpecCluster._correct_state_internal() done, defined at /home/pleung/.conda/envs/crispy/lib/python3.8/site-packages/distributed/deploy/spec.py:327> exception=RuntimeError('Command exited with non-zero exit code.\nExit code: 1\nCommand:\nsbatch /scratch/pleung/48072701/tmp4525scis.sh\nstdout:\n\nstderr:\nsbatch: error: Batch job submission failed: Requested time limit is invalid (missing or exceeds some limit)\n\n')>)
Traceback (most recent call last):
  File "/home/pleung/.conda/envs/crispy/lib/python3.8/site-packages/tornado/ioloop.py", line 741, in _run_callback
    ret = callback()
  File "/home/pleung/.conda/envs/crispy/lib/python3.8/site-packages/tornado/ioloop

In [None]:
# NOTE(review): `tpposes` is not defined in any cell visible in this notebook, so this
# scratch cell presumably fails on Restart & Run All — define `tpposes` or remove the cell.
tpposes2 = redesign_disulfides(tpposes[0])

In [None]:
# NOTE(review): relies on `tpposes2` from the preceding scratch cell, which itself needs
# `tpposes`, a name not defined in any visible cell.
tpposes2[0].scores