# Prep filtered scaffold sets for distributed design

### Imports

In [1]:
# %load_ext lab_black
# Python standard library
from glob import glob
import os
import socket
import sys

# 3rd party library imports
import dask
import matplotlib.pyplot as plt
import pandas as pd
import pyrosetta
import numpy as np
import scipy
import seaborn as sns
from tqdm.auto import tqdm  # jupyter compatible progress bar

tqdm.pandas()  # hook tqdm into pandas
# Notebook magic
# save plots in the notebook
# %matplotlib inline
# # reloads modules automatically before executing cells
# %load_ext autoreload
# %autoreload 2
# Record the working directory and the host this notebook is running on.
print("running in directory: " + os.getcwd())
print("running on node: " + socket.gethostname())

running in directory: /home/broerman/crispy_shifty/projects/OPS
running on node: sofia


In [None]:
# Repair two problematic input PDBs: read each from input_switched/ and write
# the corrected copy into the inputs/ directory.
pyrosetta.init()
inputs_dir = "/home/broerman/crispy_shifty/projects/OPS/00_scaffold_sets/inputs"
switched_dir = os.path.join(inputs_dir, "input_switched")

# R6CP33_m1: apply a chain order of "21" before re-dumping
# (presumably this swaps the two chains -- confirm against the structure).
pose = pyrosetta.pose_from_pdb(os.path.join(switched_dir, "R6CP33_m1.pdb"))
chain_reorderer = pyrosetta.rosetta.protocols.simple_moves.SwitchChainOrderMover()
chain_reorderer.chain_order("21")
chain_reorderer.apply(pose)
pose.dump_pdb(os.path.join(inputs_dir, "R6CP33_m1.pdb"))

# DLPx6_PDL_0_4_5: replace the pose's PDBInfo with one constructed from the
# pose itself, then re-dump (presumably to reset numbering/chain labels --
# TODO confirm what was wrong with the original file).
pose = pyrosetta.pose_from_pdb(os.path.join(switched_dir, "DLPx6_PDL_0_4_5.pdb"))
rebuilt_pdb_info = pyrosetta.rosetta.core.pose.PDBInfo(pose)
pose.pdb_info(rebuilt_pdb_info)
pose.dump_pdb(os.path.join(inputs_dir, "DLPx6_PDL_0_4_5.pdb"))

### Domesticate the scaffolds by trimming off leading and trailing loops and adding metadata to the output `.pdb.bz2` files.

In [2]:
# Write the sorted paths of all input PDBs to a list file, one per line
# (no trailing newline after the last entry).
design_list_file = "/home/broerman/crispy_shifty/projects/OPS/00_scaffold_sets/inputs.list"
input_pdbs = sorted(glob("/home/broerman/crispy_shifty/projects/OPS/00_scaffold_sets/inputs/*.pdb"))

with open(design_list_file, "w") as f:
    f.write("\n".join(input_pdbs))

In [2]:
# Make the crispy_shifty checkout importable before pulling in its helpers.
sys.path.insert(0, "/home/broerman/crispy_shifty")
from crispy_shifty.utils.io import gen_array_tasks

simulation_name = "01_prep_inputs"
design_list_file = "/home/broerman/crispy_shifty/projects/OPS/00_scaffold_sets/inputs.list"
output_path = os.path.join("/home/broerman/crispy_shifty/projects/OPS/", simulation_name)

# Option strings, joined below into a single space-separated string.
option_flags = [
    "out:level 200",
    "corrections:beta_nov16 true",
    "holes:dalphaball /software/rosetta/DAlphaBall.gcc",
    "run:preserve_header true",  # for "Tomponents"
]
options = " ".join(option_flags)

# Extra keyword arguments forwarded alongside the distributed function;
# every value is passed as a string.
extra_kwargs = dict(
    metadata_csv="/home/broerman/crispy_shifty/projects/OPS/00_scaffold_sets/metadata.csv",
    chains_to_keep="12",
    num_ss_per_repeat="2",
    fixed_resis="distribute",
)

# Generate the array tasks for the input-prep step: one structure per input,
# one structure per task.
task_settings = dict(
    distribute_func="crispy_shifty.protocols.cleaning.add_metadata_to_input",
    design_list_file=design_list_file,
    output_path=output_path,
    queue="short",
    memory="2G",
    nstruct=1,
    nstruct_per_task=1,
    options=options,
    extra_kwargs=extra_kwargs,
    simulation_name=simulation_name,
)
gen_array_tasks(**task_settings)

  from distributed.utils import tmpfile


InvalidGitRepositoryError: The working directory is dirty! Commit local changes to ensure reproducibility.

### Collect scorefiles of the prepped input scaffolds

In [8]:
# Make the crispy_shifty checkout importable before pulling in its helpers.
sys.path.insert(0, "/home/broerman/crispy_shifty")
from crispy_shifty.utils.io import collect_score_file

simulation_name = "01_prep_inputs"
output_path = os.path.join("/home/broerman/crispy_shifty/projects/OPS/", simulation_name)

# Only collect when the concatenated scorefile has not been written yet.
collected_scores_json = os.path.join(output_path, "scores.json")
if not os.path.exists(collected_scores_json):
    collect_score_file(output_path, "scores")

### Load resulting concatenated scorefile

In [9]:
# Make the crispy_shifty checkout importable before pulling in its helpers.
sys.path.insert(0, "/home/broerman/crispy_shifty")
from crispy_shifty.utils.io import parse_scorefile_linear

# simulation_name comes from an earlier cell.
output_path = os.path.join("/home/broerman/crispy_shifty/projects/OPS/", simulation_name)

# Parse the JSON scorefile only when no CSV cache exists yet; otherwise
# scores_df is left for the CSV reload step to define.
cached_csv_exists = os.path.exists(os.path.join(output_path, "scores.csv"))
if not cached_csv_exists:
    scores_df = parse_scorefile_linear(os.path.join(output_path, "scores.json"))

  0%|          | 0/34 [00:00<?, ?it/s]

### Dump scores_df as a CSV and then reload, for performance reasons

In [10]:
# Write the CSV cache on first run, then always load scores_df from it.
scores_csv_path = os.path.join(output_path, "scores.csv")
if not os.path.exists(scores_csv_path):
    scores_df.to_csv(scores_csv_path)

# The unnamed index column written by to_csv is read back as "Unnamed: 0".
scores_df = pd.read_csv(scores_csv_path, index_col="Unnamed: 0")

### Save a list of outputs

In [11]:
# Save the prepped outputs (the index of scores_df) as a list file, one path
# per line. The file is placed under output_path (defined by the earlier
# cells) rather than a path relative to the current working directory, so the
# cell writes to the same location regardless of where the kernel was started.
prepped_list_file = os.path.join(output_path, "prepped_inputs.list")
with open(prepped_list_file, "w") as f:
    for path in scores_df.index:
        print(path, file=f)

In [15]:
# check fixed resis in pymol
def _file_stem(path):
    """Return the final path component up to (not including) its first '.'."""
    return path.split("/")[-1].split(".")[0]


# Emit one PyMOL `select` command per row: the selection is named after the
# source pdb, and picks the fixed residues of the prepped output object.
for output_file, source_pdb, fixed_resis in zip(
    scores_df.index, scores_df["pdb"], scores_df["fixed_resis"]
):
    selection_name = _file_stem(source_pdb)
    object_name = _file_stem(output_file)
    resid_expr = fixed_resis.replace(",", "+")
    print(f"select {selection_name}, {object_name} and resid {resid_expr};")

select sl_S2_06, 01_prep_inputs_6502abb5dc7a4e5fa37a4727a2f46254 and resid 6+14+15+18+49+57+58+61+92+100+101+104+135+143+144+147;
select KW_b1_07_repeat, 01_prep_inputs_d5050856607840828f7cf8499f8a7daa and resid 1+2+5+9+13+16+43+44+47+51+55+58+85+86+89+93+97+100+127+128+131+135+139+142;
select s8_LRT6, 01_prep_inputs_dfb593e567524924b446e02f4bf0718e and resid 10+14+15+18+19+53+57+58+61+62+96+100+101+104+105+139+143+144+147+148+182+186+187+190+191+225+229+230+233+234;
select KW_b1_11_DY, 01_prep_inputs_8467fc2f59f2466f84a944cc741f888c and resid 6+10+14+48+52+56+90+94+98+132+136+140;
select R6_04, 01_prep_inputs_6b538cd212394dd098a4847af6b4f864 and resid 7+11+14+18+52+56+59+63+97+101+104+108+142+146+149+153+187+191+194+198+232+236+239+243;
select R6ST3, 01_prep_inputs_8f4cffc51181488cba665c6b56b48dc1 and resid 10+14+18+19+53+57+61+62+96+100+104+105+139+143+147+148+182+186+190+191+225+229+233+234;
select R6M4, 01_prep_inputs_515773ce8d6143e5987629d0f3b29caf and resid 7+11+14+18+52+56+59+6