# Make apo version for enzdes

### Boilerplate

In [1]:
%load_ext lab_black
# python internal
import collections
import copy
import gc
from glob import glob
import h5py
import itertools
import os
import random
import re
import socket
import shutil
import subprocess
import sys

# conda/pip
import dask
import graphviz
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy
import seaborn as sns
from tqdm import tqdm

# special packages on the DIGS
import py3Dmol
import pymol
import pyrosetta

# notebook magic
%matplotlib inline
%load_ext autoreload
%autoreload 2

# Record the working directory and the host this notebook was run on; the
# relative paths built with os.getcwd() in later cells depend on the former.
print(os.getcwd())
print(socket.gethostname())

/mnt/home/pleung/projects/bistable_bundle/r4/helix_binders
dig75


### Make a function to delete chain B (chB)

In [2]:
from pyrosetta.distributed.packed_pose.core import PackedPose
from pyrosetta.distributed import requires_init
from typing import *


@requires_init
def del_chB(packed_pose_in=None, **kwargs) -> PackedPose:
    """
    Reduce a pose to its first chain while carrying its scores over.

    The input pose comes from `packed_pose_in` when one is supplied. Otherwise
    it is read from the bz2-compressed PDB file named by `kwargs["-s"]`, and
    the scores are recovered from that same file with
    `pyrosetta.distributed.cluster.get_scores_dict`.

    Args:
        packed_pose_in: optional PackedPose to operate on.
        **kwargs: task keyword arguments; "-s" (path to a `.pdb.bz2` decoy) is
            required when `packed_pose_in` is None.

    Returns:
        A PackedPose holding only chain 1 of the input, with every input score
        re-attached as a pose extra score.

    Raises:
        RuntimeError: if neither `packed_pose_in` nor `kwargs["-s"]` is given.
    """
    import bz2
    import pyrosetta
    import pyrosetta.distributed.cluster  # load the submodule used below explicitly
    import pyrosetta.distributed.io as io
    from pyrosetta.distributed.tasks.rosetta_scripts import (
        SingleoutputRosettaScriptsTask,
    )

    if packed_pose_in is None:
        if "-s" not in kwargs:
            raise RuntimeError("Need to supply an input")
        file = kwargs["-s"]
        with open(file, "rb") as f:
            packed_pose_in = io.pose_from_pdbstring(bz2.decompress(f.read()).decode())
        scores = pyrosetta.distributed.cluster.get_scores_dict(file)["scores"]
    else:
        # A pose handed over directly (e.g. from an upstream protocol) carries
        # its own scores, so no file lookup is needed.
        scores = dict(packed_pose_in.scores)

    xml = """
    <ROSETTASCRIPTS>
        <SCOREFXNS>
        </SCOREFXNS>
        <RESIDUE_SELECTORS>
        </RESIDUE_SELECTORS>
        <TASKOPERATIONS>
        </TASKOPERATIONS>
        <MOVERS>
            <SwitchChainOrder name="delete" chain_order="1"/>
        </MOVERS>
        <PROTOCOLS>
            <Add mover="delete"/>
        </PROTOCOLS>
    </ROSETTASCRIPTS>
    """
    # SwitchChainOrder with chain_order="1" keeps chain 1 only.
    delete = SingleoutputRosettaScriptsTask(xml)
    chain1 = delete(packed_pose_in.pose.clone())
    pose = io.to_pose(chain1)
    # Re-attach the input scores to the single-chain pose.
    for key, value in scores.items():
        pyrosetta.rosetta.core.pose.setPoseExtraScore(pose, key, value)
    final_ppose = io.to_packed(pose)
    return final_ppose

### Setup dask, set command line options, make tasks and submit to client

In [3]:
from dask.distributed import Client
from dask_jobqueue import SLURMCluster
import logging
import pwd
from pyrosetta.distributed.cluster.core import PyRosettaCluster


# Print an ssh command that forwards local port 8000 to port 8787 on this host
# for the current user.
# NOTE(review): 8787 is presumably the dask dashboard port -- confirm.
print("run the following from your local terminal:")
print(
    f"ssh -L 8000:localhost:8787 {pwd.getpwuid(os.getuid()).pw_name}@{socket.gethostname()}"
)


def create_tasks(selected, options):
    """
    Yield one task dictionary per input path listed in a text file.

    Args:
        selected: path to a text file holding one input file path per line.
        options: value stored under "extra_options" in every task (the same
            object is shared by all tasks, it is not copied).

    Yields:
        dict with the keys "options" (always an empty string), "extra_options"
        and "-s" (the input path with surrounding whitespace removed).

    Blank or whitespace-only lines are skipped so that a stray empty line in
    the list does not produce a task with an empty "-s" path.
    """
    with open(selected, "r") as f:
        for line in f:
            path = line.strip()
            if not path:
                continue
            yield {"options": "", "extra_options": options, "-s": path}


logging.basicConfig(level=logging.INFO)
# List file consumed by create_tasks: one input path per line, each becoming the
# "-s" entry of a task.
selected = os.path.join(os.getcwd(), "03_enumerate_loops/closed.list")

# Flags handed to every task as its "extra_options" (see create_tasks).
options = {
    "-out:level": "300",
    "-holes:dalphaball": "/home/bcov/ppi/tutorial_build/main/source/external/DAlpahBall/DAlphaBall.gcc",
    "-indexed_structure_store:fragment_store": "/net/databases/VALL_clustered/connect_chains/ss_grouped_vall_helix_shortLoop.h5",
}

# Directory passed to PyRosettaCluster below as both scratch_dir and output_path.
output_path = os.path.join(os.getcwd(), "04_del_chB")

if __name__ == "__main__":
    # configure SLURM cluster as a context manager
    with SLURMCluster(
        cores=1,
        processes=1,
        job_cpu=1,
        memory="16GB",
        queue="long",
        walltime="23:30:00",
        death_timeout=120,
        local_directory="$TMPDIR/dask",
        log_directory="/mnt/home/pleung/logs/slurm_logs",
        extra=["--lifetime", "23h", "--lifetime-stagger", "4m"],
    ) as cluster:
        # show the generated SLURM submission script for the workers
        print(cluster.job_script())
        # adaptively scale between 1 and 50 workers
        cluster.adapt(
            minimum=1,
            maximum=50,
            wait_count=360,  # Number of consecutive times that a worker should be suggested for removal before it is removed
            interval="5s",  # Time between checks
        )
        # setup a client to interact with the cluster as a context manager
        with Client(cluster) as client:
            print(client)
            # one task per path listed in `selected`, each run through del_chB;
            # output_path serves as both the scratch and the output directory
            PyRosettaCluster(
                tasks=create_tasks(selected, options),
                client=client,
                scratch_dir=output_path,
                output_path=output_path,
            ).distribute(protocols=[del_chB])

run the following from your local terminal:
ssh -L 8000:localhost:8787 pleung@dig75
#!/usr/bin/env bash

#SBATCH -J dask-worker
#SBATCH -e /mnt/home/pleung/logs/slurm_logs/dask-worker-%J.err
#SBATCH -o /mnt/home/pleung/logs/slurm_logs/dask-worker-%J.out
#SBATCH -p long
#SBATCH -n 1
#SBATCH --cpus-per-task=1
#SBATCH --mem=15G
#SBATCH -t 23:30:00

JOB_ID=${SLURM_JOB_ID%;*}

/home/pleung/.conda/envs/phil/bin/python -m distributed.cli.dask_worker tcp://172.16.131.105:41377 --nthreads 1 --memory-limit 14.90GiB --name name --nanny --death-timeout 120 --local-directory $TMPDIR/dask --lifetime 23h --lifetime-stagger 4m

<Client: 'tcp://172.16.131.105:41377' processes=0 threads=0, memory=0 B>


INFO:pyrosetta.distributed:maybe_init performing pyrosetta initialization: {'options': '-run:constant_seed 1 -multithreading:total_threads 1', 'extra_options': '-mute all', 'set_logging_handler': 'interactive', 'silent': True}
INFO:pyrosetta.rosetta:Found rosetta database at: /home/pleung/.conda/envs/phil/lib/python3.8/site-packages/pyrosetta/database; using it....
INFO:pyrosetta.rosetta:PyRosetta-4 2021 [Rosetta PyRosetta4.conda.linux.cxx11thread.serialization.CentOS.python38.Release 2021.27+release.7ce64884a77d606b7b667c363527acc846541030 2021-07-09T18:10:05] retrieved from: http://www.pyrosetta.org
(C) Copyright Rosetta Commons Member Institutions. Created in JHU by Sergey Lyskov and PyRosetta Team.


### Look at scores
Helper function to load the line-delimited, JSON-like `scores.json` (one record per line) into a DataFrame

In [4]:
def read_scorefile(scores):
    """
    Load a line-delimited JSON scorefile into a DataFrame.

    Every non-empty line is parsed as a JSON object mapping a record name to a
    dictionary of values. Each (name, dictionary) pair becomes one row, so the
    result is indexed by record name with one column per dictionary key.

    The file is parsed line by line instead of building an n x n table and
    reading its diagonal, so memory grows linearly with the number of records,
    and repeated names or several names on one line no longer break loading.

    Args:
        scores: path to the scorefile, or an already opened text file object.

    Returns:
        pandas.DataFrame with one row per record, in file order.
    """
    import json
    import pandas as pd

    names = []
    rows = []

    def _collect(lines):
        # Gather (name, values) pairs in file order, ignoring blank lines.
        for line in lines:
            line = line.strip()
            if not line:
                continue
            for name, values in json.loads(line).items():
                names.append(name)
                rows.append(values)

    if hasattr(scores, "read"):
        _collect(scores)
    else:
        with open(scores, "r") as f:
            _collect(f)

    tabulated_scores = pd.DataFrame(rows, index=names)
    return tabulated_scores


# Read the scores.json found in the 04_del_chB output directory and preview the
# first rows of the resulting table.
output_path = os.path.join(os.getcwd(), "04_del_chB")
scores = os.path.join(output_path, "scores.json")
scores_df = read_scorefile(scores)
scores_df.head()

Unnamed: 0,abego_str,bb_clash,closure_type,cmsa_AB,cmsa_AC,cmsa_BC,docked_helix,dslf_fa13,dssp,fa_atr,...,sc_int_AC,sc_int_BC,score_AB,score_AC,score_BC,score_per_res,shift,total_length,total_score,yhh_planarity
/mnt/home/pleung/projects/bistable_bundle/r4/helix_binders/04_del_chB/decoys/0000/2021.07.19.18.09.35.674613_f822a0c7ad704f77aa7bd0c289009a87.pdb.bz2,XAAAAAAAAAAAAAAAABABAAAAAAAAAAAAAAAAABABAAAAAA...,43.63765335083008,loop_match,342.604248046875,285.3521728515625,204.82159423828125,3,0.0,LHHHHHHHHHHHHHHHHLLLHHHHHHHHHHHHHHHHLLLLHHHHHH...,-936.6592,...,0.8240406513214111,0.711965799331665,-283.4659423828125,-144.99749755859375,-127.64130401611328,-2.837507486343384,-3,173.0,2415.301751,0.03688
/mnt/home/pleung/projects/bistable_bundle/r4/helix_binders/04_del_chB/decoys/0000/2021.07.19.18.09.35.674613_bff5c3be584c47889c31944aec1744de.pdb.bz2,XBAAAAAAAAAAAAAAAAAAAAGBBAAAAAAAAAAAAAAAAAAAAG...,66.96741485595703,strict_remodel,441.9161071777344,225.524169921875,186.22886657714844,3,0.0,LLHHHHHHHHHHHHHHHHHHHLLLLHHHHHHHHHHHHHHHHHHHLL...,-1122.371624,...,0.7762454152107239,0.6835819482803345,-284.0643615722656,-128.49952697753906,-125.55496215820312,-2.6200191974639893,-1,200.0,2041.728893,0.0
/mnt/home/pleung/projects/bistable_bundle/r4/helix_binders/04_del_chB/decoys/0000/2021.07.19.18.09.35.674613_2b10de661a904070b60c9473ec8c19c7.pdb.bz2,XAAAAAAAAAAAAAAAAAAAAAGBBAAAAAAAAAAAAAAAAAAAAA...,53.40845489501953,loop_match,511.8804321289063,312.5047607421875,293.65386962890625,3,0.0,LHHHHHHHHHHHHHHHHHHHHHLLLHHHHHHHHHHHHHHHHHHHHH...,-1253.147342,...,0.7499569654464722,0.6986057758331299,-363.5997924804688,-169.5240478515625,-178.569091796875,-2.9849483966827397,-1,217.0,3130.174273,0.007313
/mnt/home/pleung/projects/bistable_bundle/r4/helix_binders/04_del_chB/decoys/0000/2021.07.19.18.09.35.674613_85ef30d123ce42b48d8c40088e457465.pdb.bz2,XAAAAAAAAAAAAAAAABABAAAAAAAAAAAAAAAAABABAAAAAA...,40.77840042114258,loop_match,326.8579406738281,196.2958221435547,171.2842559814453,6,0.0,LHHHHHHHHHHHHHHHHLLLHHHHHHHHHHHHHHHHLLLLHHHHHH...,-961.289629,...,0.8170599937438965,0.6784964799880981,-279.16326904296875,-126.04275512695312,-141.20352172851562,-2.747830390930176,5,172.0,5955.705562,0.057909
/mnt/home/pleung/projects/bistable_bundle/r4/helix_binders/04_del_chB/decoys/0000/2021.07.19.18.09.35.674613_dd5aa4fadee04097aa22f5ff0b155a17.pdb.bz2,XAAAAAAAAAAAAAAAAAAGBBAAAAAAAAAAAAAAAAAGBBAAAA...,120.49967193603516,strict_remodel,488.9514465332031,210.4895782470703,198.03411865234372,6,0.0,LHHHHHHHHHHHHHHHHHLLLLHHHHHHHHHHHHHHHHHLLLHHHH...,-1064.752222,...,0.7616071701049805,0.7869426012039185,-273.3611145019531,-107.79598236083984,-115.1090087890625,-2.58038592338562,-5,184.0,4233.830029,0.036149


In [5]:
# Save the tabulated scores as JSON.
# NOTE: this rebinds `output_path` from the 04_del_chB directory (previous
# cells) to the scores_pd.json file path.
output_path = os.path.join(os.getcwd(), "04_del_chB/scores_pd.json")
scores_df.to_json(output_path)

In [6]:
# Write every index entry of scores_df to chA_only.list, one entry per line.
with open(
    os.path.join(os.getcwd(), "04_del_chB", "chA_only.list"),
    "w",
) as f:
    f.writelines(f"{decoy}\n" for decoy in scores_df.index)