# Final filtering with AF2

### Boilerplate

In [1]:
%load_ext lab_black
# python internal
import collections
import copy
import gc
from glob import glob
import h5py
import itertools
import os
import random
import re
import socket
import shutil
import subprocess
import sys

# conda/pip
import dask
import graphviz
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy
import seaborn as sns
from tqdm import tqdm

# special packages on the DIGS
import py3Dmol
import pymol
import pyrosetta

# notebook magic
%matplotlib inline
%load_ext autoreload
%autoreload 2

# Record where this notebook was executed: current working directory and host name.
print(os.getcwd())
print(socket.gethostname())

/mnt/home/pleung/projects/bistable_bundle/r4/helix_binders
dig152


### Run AF2 on the sequences

In [2]:
from pyrosetta.distributed.packed_pose.core import PackedPose


def predict_chA(packed_pose_in=None, **kwargs) -> PackedPose:
    """
    Predict chain A of a design with AF2 and attach agreement metrics as scores.

    The input structure is read from the bz2-compressed PDB named by
    ``kwargs["-s"]``; its existing scores are loaded with
    ``pyrosetta.distributed.cluster.get_scores_dict``. Chain A is extracted, its
    sequence is written to a FASTA file with a random name, AF2 is run on that
    file, and the prediction at ``<fasta>_unrelaxed_model_4.pdb`` is compared
    with the design.

    Args:
        packed_pose_in: must be ``None``; this protocol only accepts its input
            through ``kwargs["-s"]``.
        **kwargs:
            ``"-s"`` (required): path to a ``.pdb.bz2`` decoy.
            ``"out_path"`` (optional): directory, relative to the current
            working directory, for the FASTA/PDB files. Missing or ``None``
            means the current working directory.

    Returns:
        A PackedPose of the full input pose carrying the original scores plus
        ``lddt_af2_X`` (value parsed from the AF2 output), ``lddt_des_pred_X``
        (value parsed from the lddt tool output), ``rmsd_pred_X`` (RMSD of the
        prediction to chain A after superposition) and ``pred_prefix`` (path
        prefix of the prediction files).

    Raises:
        RuntimeError: if ``packed_pose_in`` is not ``None``, or if an expected
            line is missing from the output of AF2 or the lddt tool.
        KeyError: if ``kwargs`` has no ``"-s"`` entry.
    """
    # Imports live inside the function, as in the rest of this notebook, so the
    # protocol is self-contained when it is shipped to remote workers.
    import binascii
    import bz2
    import os
    import subprocess

    import pyrosetta
    import pyrosetta.distributed.io as io
    from pyrosetta.distributed import cluster
    from pyrosetta.distributed.tasks.rosetta_scripts import (
        SingleoutputRosettaScriptsTask,
    )

    def run_tool(args):
        """
        Run an external program and return its stdout followed by its stderr.

        Adapted from the ``cmd`` helper by @nrbennet @bcov. The program is
        started from an argument list without a shell, so paths containing
        spaces or shell metacharacters are passed through unchanged.
        """
        finished = subprocess.run(
            args,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
        )
        return str(finished.stdout) + str(finished.stderr)

    def last_number_on_line(output, marker):
        """Return the last whitespace-separated token, as a float, of the first line containing `marker`."""
        for line in output.splitlines():
            if marker in line:
                return float(line.split()[-1])
        # Fail here, with the tool output attached, instead of hitting an
        # unbound variable further down.
        raise RuntimeError(f"No line containing '{marker}' in tool output:\n{output}")

    if packed_pose_in is not None:
        raise RuntimeError("Need to supply an input")
    file = kwargs["-s"]
    with open(file, "rb") as f:
        ppose = io.pose_from_pdbstring(bz2.decompress(f.read()).decode())
    scores = dict(cluster.get_scores_dict(file)["scores"])

    pose = io.to_pose(ppose)

    out_path = kwargs.get("out_path")
    if out_path is None:
        out_path = os.getcwd()
    else:
        os.makedirs(os.path.join(os.getcwd(), out_path), exist_ok=True)

    xml_string = """
    <ROSETTASCRIPTS>
        <SCOREFXNS>
        </SCOREFXNS>
        <RESIDUE_SELECTORS>
        </RESIDUE_SELECTORS>
        <TASKOPERATIONS>
        </TASKOPERATIONS>
        <SIMPLE_METRICS>
        </SIMPLE_METRICS>
        <MOVERS>
            <SwitchChainOrder name="A_only" chain_order="1"/>
        </MOVERS>
        <FILTERS>
        </FILTERS>
        <APPLY_TO_POSE>
        </APPLY_TO_POSE>
        <PROTOCOLS>
            <Add mover="A_only" />
        </PROTOCOLS>
    </ROSETTASCRIPTS>
    """
    chA_only = SingleoutputRosettaScriptsTask(xml_string)
    # Work on a clone so the full input pose is still available for the return value.
    chA = chA_only(pose.clone())
    sequence = chA.pose.sequence()

    # A random hex name keeps the files of concurrent tasks apart in a shared out_path.
    file_handle = binascii.b2a_hex(os.urandom(15)).decode("ascii")
    fasta = os.path.join(os.getcwd(), out_path, file_handle)
    with open(fasta, "w") as f:
        print(">" + file, file=f)
        print(sequence, file=f)

    # run AF2
    python = "/software/conda/envs/SE3/bin/python"
    af2 = "/projects/ml/alphafold/alphafold_git/predict_single_seq.py"
    # The FASTA path is passed twice; the second argument is presumably the
    # output prefix, since results are read from "<fasta>_unrelaxed_model_4.pdb"
    # below -- confirm against predict_single_seq.py.
    af2_args = [python, af2, fasta, fasta]
    print(" ".join(af2_args))
    lddt_af2 = last_number_on_line(run_tool(af2_args), "lDDT")

    # get lddt design, prediction
    prediction = f"{fasta}_unrelaxed_model_4.pdb"
    design = os.path.join(os.getcwd(), out_path, file_handle + ".pdb")
    chA.pose.dump_pdb(design)
    lddt_path = "/home/aivan/prog/lddt-linux/lddt"
    lddt_args = [lddt_path, design, prediction]
    print(" ".join(lddt_args))
    lddt_des_pred_X = last_number_on_line(run_tool(lddt_args), "Global")

    # get rmsd to design
    rmsd_calc = pyrosetta.rosetta.core.simple_metrics.metrics.RMSDMetric()
    # NOTE(review): rmsd_atoms(3) is presumably the CA-only backbone setting --
    # confirm against the rmsd_atoms enum of the installed PyRosetta.
    rmsd_calc.set_rmsd_type(pyrosetta.rosetta.core.scoring.rmsd_atoms(3))
    rmsd_calc.set_run_superimpose(True)
    chA = io.to_pose(chA)
    rmsd_calc.set_comparison_pose(chA)
    pred_pose = io.to_pose(io.pose_from_file(prediction))
    rmsd = rmsd_calc.calculate(pred_pose)

    scores["lddt_af2_X"] = lddt_af2
    scores["lddt_des_pred_X"] = lddt_des_pred_X
    scores["rmsd_pred_X"] = rmsd
    scores["pred_prefix"] = fasta

    # The scores go onto the full input pose, not the chain-A copy.
    for key, value in scores.items():
        pyrosetta.rosetta.core.pose.setPoseExtraScore(pose, key, value)
    ppose = io.to_packed(pose)
    return ppose

### Setup dask, set command line options, make tasks and submit to client for predictions

In [None]:
from dask.distributed import Client
from dask_jobqueue import SLURMCluster
import logging
import pwd
from pyrosetta.distributed.cluster.core import PyRosettaCluster


# Show INFO-level log records in the notebook output.
logging.basicConfig(level=logging.INFO)
# Text file listing the inputs, one per line; read by create_tasks below.
selected = os.path.join(os.getcwd(), "06_score_dslf/best_deltas.list")
# Attached to every task as "extra_options"; empty here.
options = {}


def create_tasks(selected, options):
    """
    Yield one task dictionary per input listed in `selected`.

    Args:
        selected: path to a text file with one input path per line.
        options: object stored unchanged under "extra_options" in every task
            (the same object is shared by all tasks, it is not copied).

    Yields:
        dict with the keys "options" (always ""), "extra_options", "-s" (the
        line with trailing whitespace removed) and "out_path" (always
        "07_predictions").
    """
    with open(selected, "r") as f:
        for line in f:
            path = line.rstrip()
            # A blank line would otherwise become a task with an empty "-s",
            # which only fails later, when that task tries to open "".
            if not path:
                continue
            yield {
                "options": "",
                "extra_options": options,
                "-s": path,
                "out_path": "07_predictions",
            }


# Print an SSH command that forwards local port 8000 to port 8787 on this host
# (8787 is presumably where the dask dashboard is served -- confirm).
print("run the following from your local terminal:")
print(
    f"ssh -L 8000:localhost:8787 {pwd.getpwuid(os.getuid()).pw_name}@{socket.gethostname()}"
)

# Used below as both the scratch directory and the output directory of PyRosettaCluster.
output_path = os.path.join(os.getcwd(), "07_predict_chA")

if __name__ == "__main__":
    # configure SLURM cluster as a context manager
    # The worker lifetime (10h, staggered by 4m) is shorter than the 11:30:00
    # walltime requested for each job.
    with SLURMCluster(
        cores=1,
        processes=1,
        job_cpu=1,
        memory="7GB",
        queue="backfill",
        walltime="11:30:00",
        death_timeout=120,
        local_directory="$TMPDIR/dask",
        log_directory="/mnt/home/pleung/logs/slurm_logs",
        extra=["--lifetime", "10h", "--lifetime-stagger", "4m"],
    ) as cluster:
        # Show the job script that will be submitted for each worker.
        print(cluster.job_script())
        # scale adaptively between 1 and 1020 workers
        cluster.adapt(
            minimum=1,
            maximum=1020,
            wait_count=400,  # Number of consecutive times that a worker should be suggested for removal before it is removed
            interval="5s",  # Time between checks
        )
        # setup a client to interact with the cluster as a context manager
        with Client(cluster) as client:
            print(client)
            # Build one task per line of `selected` and run predict_chA on each.
            PyRosettaCluster(
                tasks=create_tasks(selected, options),
                client=client,
                scratch_dir=output_path,
                output_path=output_path,
            ).distribute(protocols=[predict_chA])

run the following from your local terminal:
ssh -L 8000:localhost:8787 pleung@dig152
#!/usr/bin/env bash

#SBATCH -J dask-worker
#SBATCH -e /mnt/home/pleung/logs/slurm_logs/dask-worker-%J.err
#SBATCH -o /mnt/home/pleung/logs/slurm_logs/dask-worker-%J.out
#SBATCH -p long
#SBATCH -n 1
#SBATCH --cpus-per-task=1
#SBATCH --mem=6G
#SBATCH -t 11:30:00

JOB_ID=${SLURM_JOB_ID%;*}

/home/pleung/.conda/envs/phil/bin/python -m distributed.cli.dask_worker tcp://172.16.131.36:46059 --nthreads 1 --memory-limit 5.59GiB --name name --nanny --death-timeout 120 --local-directory $TMPDIR/dask --lifetime 10h --lifetime-stagger 4m

<Client: 'tcp://172.16.131.36:46059' processes=0 threads=0, memory=0 B>


INFO:pyrosetta.distributed:maybe_init performing pyrosetta initialization: {'options': '-run:constant_seed 1 -multithreading:total_threads 1', 'extra_options': '-mute all', 'set_logging_handler': 'interactive', 'silent': True}
INFO:pyrosetta.rosetta:Found rosetta database at: /home/pleung/.conda/envs/phil/lib/python3.8/site-packages/pyrosetta/database; using it....
INFO:pyrosetta.rosetta:PyRosetta-4 2021 [Rosetta PyRosetta4.conda.linux.cxx11thread.serialization.CentOS.python38.Release 2021.27+release.7ce64884a77d606b7b667c363527acc846541030 2021-07-09T18:10:05] retrieved from: http://www.pyrosetta.org
(C) Copyright Rosetta Commons Member Institutions. Created in JHU by Sergey Lyskov and PyRosetta Team.


### Look at scores
Hacky function to load JSON-like data

In [3]:
def read_scorefile(scores):
    """
    Load a line-delimited JSON scorefile into one DataFrame.

    Every non-blank line is parsed on its own with ``pandas.read_json`` and
    transposed; the per-line frames are concatenated in file order. Each line
    presumably maps one decoy path to its dictionary of scores, giving one row
    per decoy -- confirm against the scorefile writer.

    Args:
        scores: path to the scorefile.

    Returns:
        pandas.DataFrame with the concatenated, transposed per-line tables.

    Raises:
        ValueError: if the file has no non-blank lines (nothing to concatenate)
            or a line is not valid JSON.
    """
    from io import StringIO

    import pandas as pd

    try:
        from tqdm import tqdm
    except ImportError:
        # The progress bar is cosmetic; fall back to plain iteration without it.
        def tqdm(iterable):
            return iterable

    dfs = []
    with open(scores, "r") as f:
        for line in tqdm(f):
            # Skip blank lines, e.g. a trailing newline at the end of the file.
            if not line.strip():
                continue
            # Wrap the text in a file-like object: passing a literal JSON
            # string to read_json is deprecated in recent pandas releases.
            dfs.append(pd.read_json(StringIO(line)).T)
    tabulated_scores = pd.concat(dfs)
    return tabulated_scores


# Tabulate the scorefile found under 06_cleanup and preview the first rows.
# Note: this rebinds the global `output_path`.
output_path = os.path.join(os.getcwd(), "06_cleanup")
scores = os.path.join(output_path, "scores.json")
scores_df = read_scorefile(scores)
scores_df.head()

21773it [04:33, 79.71it/s]


Unnamed: 0,9mer_X,9mer_Y,X_nocys_path,X_path,X_seq,Y_nocys_path,Y_path,Y_seq,abego_str_Y,ala_penalty,...,total_score,total_score_X,total_score_Y,twosided_Y_resis,vbuns_X,vbuns_Y,wnm_all_X,wnm_all_Y,wnm_hlx_X,wnm_hlx_Y
/mnt/home/pleung/projects/bistable_bundle/r4/helix_binders/06_cleanup/decoys/0000/2021.07.19.14.50.31.324850_7e97d5d141b548c0b579349610570759.pdb.bz2,0.432344,2.472573,/mnt/home/pleung/projects/bistable_bundle/r4/h...,/mnt/home/pleung/projects/bistable_bundle/r4/h...,DEMKKVMEALKKAVELAKKNNDDEVAREIERVAKEIVEALREDRSS...,/mnt/home/pleung/projects/bistable_bundle/r4/h...,/mnt/home/pleung/projects/bistable_bundle/r4/h...,DEMKKVMEALKKAVELAKKNNDDEVAREIERVAKEIVEALREDRSS...,XAAAAAAAAAAAAAAAAAAAGBAAAAAAAAAAAAAAAAAAAABABA...,2.0,...,-527.188964,-652.62561,-677.171997,"False,False,False,False,False,False,False,Fals...",0.0,0.0,0.479511,0.524308,0.114906,0.126014
/mnt/home/pleung/projects/bistable_bundle/r4/helix_binders/06_cleanup/decoys/0000/2021.07.19.14.50.31.324850_f348d325150a4090933e8e65f88de1ac.pdb.bz2,0.432344,2.492159,/mnt/home/pleung/projects/bistable_bundle/r4/h...,/mnt/home/pleung/projects/bistable_bundle/r4/h...,DEMKKVMELLKKAVELAKKNNDDEVAREIERAAKEIVEALREDRSE...,/mnt/home/pleung/projects/bistable_bundle/r4/h...,/mnt/home/pleung/projects/bistable_bundle/r4/h...,DEMKKVMELLKKAVELAKKNNDDEVAREIERAAKEIVEALREDRSE...,XAAAAAAAAAAAAAAAAAAAGBAAAAAAAAAAAAAAAAAAAABABA...,1.0,...,-371.977964,-647.187927,-677.092529,"False,False,False,False,False,False,False,Fals...",0.0,0.0,0.462409,0.631355,0.103965,0.137593
/mnt/home/pleung/projects/bistable_bundle/r4/helix_binders/06_cleanup/decoys/0000/2021.07.19.14.50.31.324850_0ca226d9f1a34514b8d937613022304d.pdb.bz2,0.336074,3.13845,/mnt/home/pleung/projects/bistable_bundle/r4/h...,/mnt/home/pleung/projects/bistable_bundle/r4/h...,DERQKQREEVRKLAEELASKATDEELIKEIKKVAQKAEELVSRTTD...,/mnt/home/pleung/projects/bistable_bundle/r4/h...,/mnt/home/pleung/projects/bistable_bundle/r4/h...,DERQKQREEVRKLAEELASKATDEELIKEIKKVAQKAEELVSRTTD...,XAAAAAAAAAAAAAAAAAAABABAAAAAAAAAAAAAAAAAAAABAB...,1.0,...,-114.304931,-624.700745,-647.475159,"False,False,False,False,False,False,False,Fals...",0.0,0.0,0.406526,0.824593,0.104812,0.146689
/mnt/home/pleung/projects/bistable_bundle/r4/helix_binders/06_cleanup/decoys/0000/2021.07.19.14.50.31.324850_ddc6a10100e343879cdebc7b0dce3694.pdb.bz2,0.386789,2.755677,/mnt/home/pleung/projects/bistable_bundle/r4/h...,/mnt/home/pleung/projects/bistable_bundle/r4/h...,DEEERLRQEVEKAEKELEKLAKQSTDEKVRRIAREVAKLLRRLAEE...,/mnt/home/pleung/projects/bistable_bundle/r4/h...,/mnt/home/pleung/projects/bistable_bundle/r4/h...,DEEERLRQEVEKAEKELEKLAKQSTDEKVRRIAREVAKLLRRLAEE...,XAAAAAAAAAAAAAAAAAAAAAABABAAAAAAAAAAAAAAAAAAAA...,1.0,...,-283.280073,-801.145874,-848.991455,"False,False,False,False,False,False,False,Fals...",0.0,0.0,0.399673,0.488587,0.115578,0.105711
/mnt/home/pleung/projects/bistable_bundle/r4/helix_binders/06_cleanup/decoys/0000/2021.07.19.14.50.31.324850_48816afac9ad43c39d7ec435c3f65341.pdb.bz2,0.514498,2.789464,/mnt/home/pleung/projects/bistable_bundle/r4/h...,/mnt/home/pleung/projects/bistable_bundle/r4/h...,SEKEKVEELAQRIREQLPDTELAREAQELADEARKSDDPKQLEVVY...,/mnt/home/pleung/projects/bistable_bundle/r4/h...,/mnt/home/pleung/projects/bistable_bundle/r4/h...,SEKEKVEELAQRIREQLPDTELAREAQELADEARKSDDPKQLEVVY...,XAAAAAAAAAAAAAAABBGBAAAAAAAAAAAAAAABABAAAAAAAA...,2.0,...,4.513756,-572.15155,-580.824341,"False,False,False,False,False,False,False,Fals...",0.0,0.0,0.466182,0.903392,0.104979,0.093915


In [10]:
# Save the tabulated scores as a single JSON file next to the scorefile
# (presumably so later sessions can skip the slow line-by-line parse).
scores_df.to_json(os.path.join(output_path, "scores_pd.json"))

In [11]:
# Reload the saved table, asking pandas to read the data as float.
scores_df = pd.read_json(os.path.join(output_path, "scores_pd.json"), dtype=float)

### Reload updated scores

In [5]:
def read_scorefile(scores):
    """
    Load a line-delimited JSON scorefile into one DataFrame.

    Every non-blank line is parsed on its own with ``pandas.read_json`` and
    transposed; the per-line frames are concatenated in file order. Each line
    presumably maps one decoy path to its dictionary of scores, giving one row
    per decoy -- confirm against the scorefile writer.

    Args:
        scores: path to the scorefile.

    Returns:
        pandas.DataFrame with the concatenated, transposed per-line tables.

    Raises:
        ValueError: if the file has no non-blank lines (nothing to concatenate)
            or a line is not valid JSON.
    """
    from io import StringIO

    import pandas as pd

    try:
        from tqdm import tqdm
    except ImportError:
        # The progress bar is cosmetic; fall back to plain iteration without it.
        def tqdm(iterable):
            return iterable

    dfs = []
    with open(scores, "r") as f:
        for line in tqdm(f):
            # Skip blank lines, e.g. a trailing newline at the end of the file.
            if not line.strip():
                continue
            # Wrap the text in a file-like object: passing a literal JSON
            # string to read_json is deprecated in recent pandas releases.
            dfs.append(pd.read_json(StringIO(line)).T)
    tabulated_scores = pd.concat(dfs)
    return tabulated_scores


# Tabulate the scorefile found under 06_score_dslf, save it as one JSON file,
# reload it with the data read as float, and preview the first rows.
# Note: this rebinds the global `output_path`.
output_path = os.path.join(os.getcwd(), "06_score_dslf")
scores = os.path.join(output_path, "scores.json")
scores_df = read_scorefile(scores)
scores_df.to_json(os.path.join(output_path, "scores_pd.json"))
scores_df = pd.read_json(os.path.join(output_path, "scores_pd.json"), dtype=float)
scores_df.head()

5235it [01:07, 77.62it/s]


Unnamed: 0,9mer_X,9mer_Y,X_nocys_path,X_path,X_seq,Y_nocys_path,Y_path,Y_seq,abego_str_Y,ala_penalty,...,total_score_Y,twosided_Y_resis,vbuns_X,vbuns_Y,wnm_all_X,wnm_all_Y,wnm_hlx_X,wnm_hlx_Y,cart_bonded_Y,coordinate_constraint_Y
/mnt/home/pleung/projects/bistable_bundle/r4/helix_binders/06_score_dslf/decoys/0000/2021.07.19.22.26.08.731625_5dd736a512924af7b1702e210581ab94.pdb.bz2,0.514498,2.262725,/mnt/home/pleung/projects/bistable_bundle/r4/h...,/mnt/home/pleung/projects/bistable_bundle/r4/h...,SEKEKVEELAQRIREQLPDTELAREAQELADEARKSDDPEQLKVVY...,/mnt/home/pleung/projects/bistable_bundle/r4/h...,/mnt/home/pleung/projects/bistable_bundle/r4/h...,SEKEKVEELAQRIREQLPDTELAREAQELADEARKSDDPEQLKVVY...,XAAAAAAAAAAAAAAABBGBAAAAAAAAAAAAAAABABAAAAAAAA...,2.0,...,-585.981201,"False,False,False,False,False,False,False,Fals...",0.0,0.0,0.410465,0.777123,0.099616,0.108181,,
/mnt/home/pleung/projects/bistable_bundle/r4/helix_binders/06_score_dslf/decoys/0000/2021.07.19.22.26.08.731625_206a3aab17a44a2289b2d3e23a75ff86.pdb.bz2,0.514498,2.606664,/mnt/home/pleung/projects/bistable_bundle/r4/h...,/mnt/home/pleung/projects/bistable_bundle/r4/h...,SEKEKVEELAQRIREQLPDTELAREAQELADEARKSDNPKVLRVVL...,/mnt/home/pleung/projects/bistable_bundle/r4/h...,/mnt/home/pleung/projects/bistable_bundle/r4/h...,SEKEKVEELAQRIREQLPDTELAREAQELADEARKSDNPKVLRVVL...,XAAAAAAAAAAAAAAABBGBAAAAAAAAAAAAAAABABAAAAAAAA...,2.0,...,-566.267761,"False,False,False,False,False,False,False,Fals...",0.0,0.0,0.41673,0.747878,0.09974,0.125306,,
/mnt/home/pleung/projects/bistable_bundle/r4/helix_binders/06_score_dslf/decoys/0000/2021.07.19.22.26.08.731625_27cf97c933e74bbb99c7bccc70af1ddb.pdb.bz2,0.370115,3.092684,/mnt/home/pleung/projects/bistable_bundle/r4/h...,/mnt/home/pleung/projects/bistable_bundle/r4/h...,SEEVNERVKQLAEKAKEATDKEEVIEIVKELAELAKQSTDPRLVKE...,/mnt/home/pleung/projects/bistable_bundle/r4/h...,/mnt/home/pleung/projects/bistable_bundle/r4/h...,SEEVNERVKQLAEKAKEATDKEEVIEIVKELAELAKQSTDPRLVKE...,XAAAAAAAAAAAAAAAABABAAAAAAAAAAAAAAAAABABAAAAAA...,1.0,...,-582.222412,"False,False,False,False,False,False,False,Fals...",0.0,0.0,0.36835,0.382479,0.093177,0.101968,,
/mnt/home/pleung/projects/bistable_bundle/r4/helix_binders/06_score_dslf/decoys/0000/2021.07.19.22.26.08.731625_ec352774fbd94066ae2bb3af11370f7e.pdb.bz2,0.432344,2.472573,/mnt/home/pleung/projects/bistable_bundle/r4/h...,/mnt/home/pleung/projects/bistable_bundle/r4/h...,DEMKKVMELLKKAVELAKKNNDDEVAREIERAAKEIVEALRETDSE...,/mnt/home/pleung/projects/bistable_bundle/r4/h...,/mnt/home/pleung/projects/bistable_bundle/r4/h...,DEMKKVMELLKKAVELAKKNNDDEVAREIERAAKEIVEALRETDSE...,XAAAAAAAAAAAAAAAAAAAGBAAAAAAAAAAAAAAAAAAAABABA...,2.0,...,-648.485901,"False,False,False,False,False,False,False,Fals...",0.0,0.0,0.474192,0.539119,0.116614,0.128329,64.695625,84.301338
/mnt/home/pleung/projects/bistable_bundle/r4/helix_binders/06_score_dslf/decoys/0000/2021.07.19.22.26.08.731625_a5b3fe94195747f995e87cd6b8d5b00a.pdb.bz2,0.418133,2.618307,/mnt/home/pleung/projects/bistable_bundle/r4/h...,/mnt/home/pleung/projects/bistable_bundle/r4/h...,TKEDAKETARKAARKAAESNDEEVAKQAAKDVIEVAKQAGMPEQEA...,/mnt/home/pleung/projects/bistable_bundle/r4/h...,/mnt/home/pleung/projects/bistable_bundle/r4/h...,TKEDAKETARKAARKAAESNDEEVAKQAAKDVIEVAKQAGMPEQEA...,XAAAAAAAAAAAAAAAAAAGBAAAAAAAAAAAAAAAAAAGBBAAAA...,1.0,...,-604.035156,"False,False,False,False,False,False,False,Fals...",0.0,0.0,0.45331,0.546515,0.105866,0.11682,,


### Get LDDT arrays with `DeepAccNet`

In [53]:
# Write the index labels of `best`, one per line, to 06_score_dslf/best_deltas.list
# (the file the dask cell reads as `selected`).
# NOTE(review): `best` is not defined in any cell shown in this notebook --
# presumably a DataFrame of selected designs indexed by decoy path; confirm.
with open(os.path.join(os.getcwd(), "06_score_dslf", "best_deltas.list"), "w") as f:
    for i in best.index:
        print(i, file=f)

### Unused blocks

In [32]:
%%time
# Run predict_chA once in this process, on a single decoy, writing its files to "test".
import pyrosetta
from pyrosetta.distributed import cluster
import pyrosetta.distributed.io as io

# Optional init flags, currently disabled; PyRosetta is initialised with defaults instead.
# flags = """
# -holes:dalphaball /home/bcov/ppi/tutorial_build/main/source/external/DAlpahBall/DAlphaBall.gcc
# -indexed_structure_store:fragment_store /net/databases/VALL_clustered/connect_chains/ss_grouped_vall_helix_shortLoop.h5
# """
# pyrosetta.distributed.init(" ".join(flags.replace("\n\t", " ").split()))
pyrosetta.distributed.init()
# The first argument must be None: predict_chA takes its input from the "-s" entry.
t = predict_chA(
    None,
    **{
        "-s": "/mnt/home/pleung/projects/bistable_bundle/r4/helix_binders/06_score_dslf/decoys/0000/2021.07.19.22.26.08.731625_6e1e66d90c364051ae9e07bf5e28a9e1.pdb.bz2",
        "out_path": "test",
    }
)

CPU times: user 3.7 s, sys: 43 ms, total: 3.74 s
Wall time: 2min 39s
