# Prep filtered scaffold sets for distributed design

### Imports

In [1]:
# %load_ext lab_black
# Python standard library
from glob import glob
import os
import socket
import sys

# 3rd party library imports
import dask
import matplotlib.pyplot as plt
import pandas as pd
import pyrosetta
import numpy as np
import scipy
import seaborn as sns
from tqdm.auto import tqdm  # jupyter compatible progress bar

tqdm.pandas()  # link tqdm to pandas
# Notebook magic
# save plots in the notebook
# %matplotlib inline
# # reloads modules automatically before executing cells
# %load_ext autoreload
# %autoreload 2
# Record where this notebook is executing: working directory, then host name.
working_directory = os.getcwd()
node_name = socket.gethostname()
print(f"running in directory: {working_directory}")
print(f"running on node: {node_name}")

running in directory: /home/broerman/crispy_shifty/projects/OPS
running on node: sofia


In [None]:
# Repair two problematic inputs from input_switched/ and write the corrected
# copies into inputs/.
pyrosetta.init()

# R6CP33_m1: run SwitchChainOrderMover with chain order "21" on the pose
# (presumably swapping its two chains -- confirm against the mover docs).
switched_r6cp33 = "/home/broerman/crispy_shifty/projects/OPS/00_scaffold_sets/inputs/input_switched/R6CP33_m1.pdb"
fixed_r6cp33 = "/home/broerman/crispy_shifty/projects/OPS/00_scaffold_sets/inputs/R6CP33_m1.pdb"
pose = pyrosetta.pose_from_pdb(switched_r6cp33)
sw = pyrosetta.rosetta.protocols.simple_moves.SwitchChainOrderMover()
sw.chain_order("21")
sw.apply(pose)
pose.dump_pdb(fixed_r6cp33)

# DLPx6_PDL_0_4_5: replace the pose's PDBInfo with a new one constructed from
# the pose itself, then dump the result.
switched_dlpx6 = "/home/broerman/crispy_shifty/projects/OPS/00_scaffold_sets/inputs/input_switched/DLPx6_PDL_0_4_5.pdb"
fixed_dlpx6 = "/home/broerman/crispy_shifty/projects/OPS/00_scaffold_sets/inputs/DLPx6_PDL_0_4_5.pdb"
pose = pyrosetta.pose_from_pdb(switched_dlpx6)
pdb_info = pyrosetta.rosetta.core.pose.PDBInfo(pose)
pose.pdb_info(pdb_info)
pose.dump_pdb(fixed_dlpx6)

### Domesticate the scaffolds by trimming off leading and trailing loops and adding metadata to the output pdb.bz2s. 

In [2]:
# Write every input PDB path, sorted, one per line (no trailing newline).
design_list_file = "/home/broerman/crispy_shifty/projects/OPS/00_scaffold_sets/inputs.list"

input_pdbs = glob("/home/broerman/crispy_shifty/projects/OPS/00_scaffold_sets/inputs/*.pdb")
input_pdbs.sort()

with open(design_list_file, "w") as f:
    f.write("\n".join(input_pdbs))

In [3]:
# Put the crispy_shifty checkout at the front of sys.path so the package imports.
sys.path.insert(0, "/home/broerman/crispy_shifty")
from crispy_shifty.utils.io import gen_array_tasks

# Names and locations for this step.
simulation_name = "01_prep_inputs"
design_list_file = "/home/broerman/crispy_shifty/projects/OPS/00_scaffold_sets/inputs.list"
output_path = os.path.join("/home/broerman/crispy_shifty/projects/OPS/", simulation_name)

# Individual option flags; they are passed on as one space-separated string.
option_flags = [
    "out:level 200",
    "corrections:beta_nov16 true",
    "holes:dalphaball /software/rosetta/DAlphaBall.gcc",
    "run:preserve_header true",  # for "Tomponents"
]
options = " ".join(option_flags)

# Extra keyword arguments forwarded to the distributed function (all strings).
extra_kwargs = dict(
    metadata_csv="/home/broerman/crispy_shifty/projects/OPS/00_scaffold_sets/metadata.csv",
    chains_to_keep="12",
    num_ss_per_repeat="2",
    fixed_resis="distribute",
)

# Generate the array tasks: one structure per input, one structure per task.
gen_array_tasks(
    distribute_func="crispy_shifty.protocols.cleaning.add_metadata_to_input",
    design_list_file=design_list_file,
    output_path=output_path,
    queue="short",
    memory="2G",
    nstruct=1,
    nstruct_per_task=1,
    options=options,
    extra_kwargs=extra_kwargs,
    simulation_name=simulation_name,
)

Run the following command with your desired environment active:
sbatch -a 1-26 /home/broerman/crispy_shifty/projects/OPS/01_prep_inputs/run.sh


### Collect scorefiles of the prepped input scaffolds

In [9]:
# Put the crispy_shifty checkout at the front of sys.path so the package imports.
sys.path.insert(0, "/home/broerman/crispy_shifty")
from crispy_shifty.utils.io import collect_score_file

simulation_name = "01_prep_inputs"
output_path = os.path.join("/home/broerman/crispy_shifty/projects/OPS/", simulation_name)

# Only collect when scores.json is not already present in the output directory.
scores_json_exists = os.path.exists(os.path.join(output_path, "scores.json"))
if not scores_json_exists:
    collect_score_file(output_path, "scores")

### Load resulting concatenated scorefile

In [10]:
# Put the crispy_shifty checkout at the front of sys.path so the package imports.
sys.path.insert(0, "/home/broerman/crispy_shifty")
from crispy_shifty.utils.io import parse_scorefile_linear

output_path = os.path.join("/home/broerman/crispy_shifty/projects/OPS/", simulation_name)

# Parse scores.json only when no scores.csv exists yet; otherwise scores_df is
# left untouched by this cell.
scores_csv_exists = os.path.exists(os.path.join(output_path, "scores.csv"))
if not scores_csv_exists:
    scores_json = os.path.join(output_path, "scores.json")
    scores_df = parse_scorefile_linear(scores_json)

  0%|          | 0/35 [00:00<?, ?it/s]

### Dump scores_df as a CSV and then reload, for performance reasons

In [11]:
# Cache scores_df as scores.csv (written only when the file is absent), then
# always reload from the CSV so scores_df comes from the cached file.
scores_csv = os.path.join(output_path, "scores.csv")

if not os.path.exists(scores_csv):
    scores_df.to_csv(scores_csv)

# Select the index column by position rather than by the placeholder header
# "Unnamed: 0": to_csv writes the index as the first column, and the placeholder
# name only exists when the index is unnamed, so a named index would make the
# by-name lookup fail.
scores_df = pd.read_csv(scores_csv, index_col=0)

### Save a list of outputs

In [12]:
# Save the index of scores_df (used as a path elsewhere in this notebook), one
# entry per line. The file path is built from output_path, like the other
# cells, so the list lands in the step's output directory regardless of the
# notebook's current working directory.
prepped_list_file = os.path.join(output_path, "prepped_inputs.list")

with open(prepped_list_file, "w") as f:
    for path in scores_df.index:
        print(path, file=f)

In [13]:
# Print one PyMOL "color green" command per row so the fixed_resis can be
# checked visually in PyMOL.
for index, row in scores_df.iterrows():
    # Object name: last path component of the index, up to its first ".".
    name = index.rsplit("/", 1)[-1].partition(".")[0]
    # fixed_resis is comma-separated; the printed resid selection joins with "+".
    resid_selection = row["fixed_resis"].replace(",", "+")
    print(f"color green, {name} and resid {resid_selection};")

color green, 01_prep_inputs_644d3fd8dc9b484c97d9e994a4698701 and resid 1+2+5+9+13+16+43+44+47+51+55+58+85+86+89+93+97+100+127+128+131+135+139+142;
color green, 01_prep_inputs_009cc77e2cce4ce282adc21ec14800c3 and resid 3+6+11+14+46+49+54+57+89+92+97+100+132+135+140+143+175+178+183+186+218+221+226+229;
color green, 01_prep_inputs_fb7b2906199e4b53999089730d48e4db and resid 10+11+14+15+17+18+19+53+54+57+58+60+61+62+96+97+100+101+103+104+105+139+140+143+144+146+147+148+182+183+186+187+189+190+191+225+226+229+230+232+233+234;
color green, 01_prep_inputs_49da073b32594a4d810fe6a81a5f495b and resid 6+7+10+11+13+14+51+52+55+56+58+59+96+97+100+101+103+104+141+142+145+146+148+149+186+187+190+191+193+194+231+232+235+236+238+239;
color green, 01_prep_inputs_2742c96d62e0494592b7d5b9435eed08 and resid 5+9+12+36+40+43+67+71+74+98+102+105+129+133+136+160+164+167;
color green, 01_prep_inputs_6036ba4febb047d4836e2a2c49766136 and resid 7+11+14+18+52+56+59+63+97+101+104+108+142+146+149+153+187+191+194+198+2