# Filter down the folded paired states to an orderable subset

### Imports

In [1]:
%load_ext lab_black
# Python standard library
from glob import glob
import os
import socket
import sys

# 3rd party library imports
import dask
import matplotlib.pyplot as plt
import pandas as pd
import pyrosetta
import numpy as np
import scipy
import seaborn as sns
from tqdm.auto import tqdm  # jupyter compatible progress bar

tqdm.pandas()  # link tqdm to pandas
# Notebook magic
# save plots in the notebook
%matplotlib inline
# reloads modules automatically before executing cells
%load_ext autoreload
%autoreload 2
# Log the current working directory and host name so the saved cell output
# records where this notebook was executed.
print(f"running in directory: {os.getcwd()}")  # where are we?
print(f"running on node: {socket.gethostname()}")  # what node are we on?

running in directory: /global/cfs/cdirs/m4129/projects/crispy_shifty_adam/projects/OPS/round_1/design
running on node: nid002121


### Fix the paths
Necessary because we copied these designs to Perlmutter, so the paths in the list still carry the old `/home/broerman/crispy_shifty` prefix

In [3]:
# Put the project checkout first on the import path so that `crispy_shifty`
# resolves to this copy.
sys.path.insert(0, "/global/cfs/cdirs/m4129/projects/crispy_shifty_adam")
from crispy_shifty.utils.io import fix_path_prefixes

# The `to_order.list` file written by the `13_filter_and_order` step.
folded = "/global/cfs/cdirs/m4129/projects/crispy_shifty_adam/projects/OPS/round_1/design/13_filter_and_order/to_order.list"

# Replace the old checkout prefix with the current one; `overwrite=True` is
# passed so the list file itself is updated.
# NOTE(review): the return value of fix_path_prefixes is not visible from here.
new_folded = fix_path_prefixes(
    file=folded,
    find="/home/broerman/crispy_shifty",
    replace="/global/cfs/cdirs/m4129/projects/crispy_shifty_adam",
    overwrite=True,
)

  from distributed.utils import tmpfile


### Resurface the peptides of the predicted states

In [None]:
from crispy_shifty.utils.io import gen_array_tasks

# The simulation name doubles as the name of the output directory.
simulation_name = "14_resurface"
output_path = f"/pscratch/sd/b/broerman/{simulation_name}"

# Input: the list of designs selected for ordering in step 13.
design_list_file = "/global/cfs/cdirs/m4129/projects/crispy_shifty_adam/projects/OPS/round_1/design/13_filter_and_order/to_order.list"

# Space-separated option string handed to the tasks (a single option here).
options = " ".join(["out:level 200"])

# Additional keyword arguments forwarded alongside the standard ones.
extra_kwargs = dict(models="1")

# Generate the array tasks that run `finalize_peptide` over the design list.
# NOTE(review): presumably this writes a submission script rather than running
# anything itself -- confirm against gen_array_tasks.
gen_array_tasks(
    simulation_name=simulation_name,
    distribute_func="crispy_shifty.protocols.cleaning.finalize_peptide",
    design_list_file=design_list_file,
    output_path=output_path,
    options=options,
    extra_kwargs=extra_kwargs,
    queue="gpu-remote",
    gres="--gres=gpu:a4000:1",
    cores=2,
    memory="10G",
    nstruct=1,
    nstruct_per_task=4,
)

### Remove AF2 decoy data from the score JSONs
This was not done in the script, so it is done here instead

In [None]:
# Make the project checkout importable before using its helpers.
sys.path.insert(0, "/home/broerman/crispy_shifty")
from crispy_shifty.utils.io import collect_and_clean_score_file

simulation_name = "14_resurface"
output_path = os.path.join(
    "/home/broerman/crispy_shifty/projects/OPS/round_1/design/",
    simulation_name,
)

# Skip the collection step when a combined scores.json is already present.
scores_json_present = os.path.exists(os.path.join(output_path, "scores.json"))
if not scores_json_present:
    collect_and_clean_score_file(output_path, "mpnn_seq", "scores")

### Load resulting concatenated scorefile

In [3]:
from crispy_shifty.utils.io import parse_scorefile_linear

simulation_name = "14_resurface"
output_path = os.path.join(
    "/home/broerman/crispy_shifty/projects/OPS/round_1/design/",
    simulation_name,
)

# Parse the JSON scorefile only when no CSV copy has been written yet;
# otherwise `scores_df` is left for the CSV reload that follows.
csv_already_written = os.path.exists(os.path.join(output_path, "scores.csv"))
if not csv_already_written:
    scores_df = parse_scorefile_linear(os.path.join(output_path, "scores.json"))

  from distributed.utils import tmpfile


### Dump scores_df as a CSV and then reload, for performance reasons

In [4]:
# Location of the CSV copy of the scores.
scores_csv = os.path.join(output_path, "scores.csv")

# Write the CSV once; on later runs the existing file is reused.
if not os.path.exists(scores_csv):
    scores_df.to_csv(scores_csv)

# Always reload from the CSV; the saved index comes back as the
# "Unnamed: 0" column, which is restored as the index here.
scores_df = pd.read_csv(scores_csv, index_col="Unnamed: 0")

### Save a list of outputs

In [5]:
# Write every index entry of scores_df to the list file, one per line,
# with a progress bar over the entries.
resurfaced_list_path = os.path.join(output_path, "resurfaced_states.list")
with open(resurfaced_list_path, "w") as f:
    for path in tqdm(scores_df.index):
        f.write(str(path) + "\n")

  0%|          | 0/48561 [00:00<?, ?it/s]

### Fix the paths
Necessary because we copied these designs from Perlmutter, so the listed paths still carry the `/pscratch/sd/b/broerman` prefix

In [6]:
from crispy_shifty.utils.io import fix_path_prefixes

# The list of resurfaced outputs written under the current output_path.
resurfaced = os.path.join(output_path, "resurfaced_states.list")

# Replace the scratch prefix with the local design directory;
# `overwrite=True` is passed so the list file itself is updated.
# NOTE(review): the return value of fix_path_prefixes is not visible from here.
new_resurfaced = fix_path_prefixes(
    file=resurfaced,
    find="/pscratch/sd/b/broerman",
    replace="/home/broerman/crispy_shifty/projects/OPS/round_1/design",
    overwrite=True,
)

### Filter the predicted states

In [7]:
from crispy_shifty.utils.io import gen_array_tasks

# The simulation name doubles as the name of the output directory.
simulation_name = "14_filter"
output_path = os.path.join(
    "/home/broerman/crispy_shifty/projects/OPS/round_1/design/",
    simulation_name,
)

# Input: the list of resurfaced states.
design_list_file = resurfaced

# Space-separated option string handed to the tasks.
options = " ".join(
    (
        "out:level 200",
        "corrections:beta_nov16 true",
        "indexed_structure_store:fragment_store /net/databases/VALL_clustered/connect_chains/ss_grouped_vall_helix_shortLoop.h5",
    )
)

# TODO add extra kwarg for not relaxing state X + peptide

# Generate the array tasks that run `filter_paired_state_OPS` over the list.
# NOTE(review): presumably this writes a submission script rather than running
# anything itself -- confirm against gen_array_tasks.
gen_array_tasks(
    simulation_name=simulation_name,
    distribute_func="crispy_shifty.protocols.msd.filter_paired_state_OPS",
    design_list_file=design_list_file,
    output_path=output_path,
    options=options,
    queue="short",
    memory="4G",
    nstruct=1,
    nstruct_per_task=4,
)

Run the following command with your desired environment active:
sbatch -a 1-12141 /mnt/projects/crispy_shifty/projects/crispy_shifties/09_filter/run.sh


### Remove AF2 decoy data from the score JSONs
This was again not done in the script, so it is done here instead

In [7]:
# Make the project checkout importable before using its helpers.
sys.path.insert(0, "/home/broerman/crispy_shifty")
from crispy_shifty.utils.io import collect_and_clean_score_file

simulation_name = "14_filter"
output_path = os.path.join(
    "/home/broerman/crispy_shifty/projects/OPS/round_1/design/",
    simulation_name,
)

# Skip the collection step when a combined scores.json is already present.
scores_json_present = os.path.exists(os.path.join(output_path, "scores.json"))
if not scores_json_present:
    collect_and_clean_score_file(output_path, "mpnn_seq", "scores")

### Load resulting concatenated scorefile

In [8]:
from crispy_shifty.utils.io import parse_scorefile_linear

# Parse the JSON scorefile only when no CSV copy has been written yet;
# otherwise `scores_df` is left for the CSV reload that follows.
csv_already_written = os.path.exists(os.path.join(output_path, "scores.csv"))
if not csv_already_written:
    scores_df = parse_scorefile_linear(os.path.join(output_path, "scores.json"))

### Dump scores_df as a CSV and then reload, for performance reasons

In [9]:
# Location of the CSV copy of the scores.
scores_csv = os.path.join(output_path, "scores.csv")

# Write the CSV once; on later runs the existing file is reused.
if not os.path.exists(scores_csv):
    scores_df.to_csv(scores_csv)

# Always reload from the CSV; the saved index comes back as the
# "Unnamed: 0" column, which is restored as the index here.
scores_df = pd.read_csv(scores_csv, index_col="Unnamed: 0")

### Save a list of outputs

In [10]:
# Write every index entry of scores_df to the list file, one per line,
# with a progress bar over the entries.
filtered_list_path = os.path.join(output_path, "filtered_states.list")
with open(filtered_list_path, "w") as f:
    for path in tqdm(scores_df.index):
        f.write(str(path) + "\n")

  0%|          | 0/48561 [00:00<?, ?it/s]