# Filter and order

### Imports

In [1]:
%load_ext lab_black
# Python standard library
from glob import glob
import os
import socket
import sys

# 3rd party library imports
import dask
import matplotlib.pyplot as plt
import pandas as pd
import pyrosetta
import numpy as np
import scipy
import seaborn as sns
from tqdm.auto import tqdm  # jupyter compatible progress bar

tqdm.pandas()  # register tqdm with pandas so progress-bar variants of apply are available
# Notebook magics
# render matplotlib figures inline so plots are stored in the notebook output
%matplotlib inline
# autoreload: re-import modified modules automatically before executing each cell
%load_ext autoreload
%autoreload 2
# Record where this notebook was executed (working directory and host name)
print(f"running in directory: {os.getcwd()}")  # where are we?
print(f"running on node: {socket.gethostname()}")  # what node are we on?

running in directory: /mnt/home/pleung/projects/grad_reqs
running on node: dig57


### Insert the root of the crispy_shifty repo

In [2]:
# Make the crispy_shifty checkout importable. The membership check keeps this
# cell idempotent: re-running it no longer stacks duplicate entries on sys.path.
crispy_shifty_root = "/projects/crispy_shifty"
if crispy_shifty_root not in sys.path:
    sys.path.insert(0, crispy_shifty_root)

### Filter docks

In [None]:
from crispy_shifty.utils.io import gen_array_tasks

simulation_name = "04_filter"
output_path = os.path.join(os.getcwd(), simulation_name)
# NOTE(review): this list file is written by the "Save a list of outputs" cell
# further down in this notebook, so that cell must have been run at least once
# before this one.
design_list_file = os.path.join(os.getcwd(), "03_redock_ligand/redocked_binders.list")

# Individual flags, joined below into one space-separated options string
option_flags = [
    "out:level 200",
    "keep_input_protonation_state true",
    "extra_res_fa /home/pleung/projects/grad_reqs/04_filter/AFO_0001_linker_tors.params",
]
options = " ".join(option_flags)

# Comma-separated atom names forwarded to the distributed function as `atom_ids`
extra_kwargs = dict(
    atom_ids="O6,C18,H14,H16,CX20,OX31,NX28,HX29,CX23,HX24,HX34,CX21,HX22,HX33,CX25,HX26,HX27,HX35",
)

# Generate the array tasks for the linker accessibility check
gen_array_tasks(
    simulation_name=simulation_name,
    distribute_func="deployables.check_linker_accessibility",
    func_root="/home/pleung/projects/grad_reqs",
    design_list_file=design_list_file,
    output_path=output_path,
    options=options,
    extra_kwargs=extra_kwargs,
    queue="short",
    memory="4G",
    nstruct=1,
    nstruct_per_task=10,
)

### Collect scorefiles of the redocked ligand

In [None]:
# Ensure crispy_shifty is importable even when this cell is run on its own
sys.path.insert(0, "/projects/crispy_shifty")
from crispy_shifty.utils.io import collect_score_file

# From here on, work in the output directory of the redocking step
simulation_name = "03_redock_ligand"
output_path = os.path.join(os.getcwd(), simulation_name)

# Skip collection when a scores.json is already present in the output directory
scores_json_exists = os.path.exists(os.path.join(output_path, "scores.json"))
if not scores_json_exists:
    collect_score_file(output_path, "scores")

### Load resulting concatenated scorefile

In [None]:
# Ensure crispy_shifty is importable even when this cell is run on its own
sys.path.insert(0, "/projects/crispy_shifty")
from crispy_shifty.utils.io import parse_scorefile_linear

# `simulation_name` is taken from the cell that collected the score files
output_path = os.path.join(os.getcwd(), simulation_name)

# Parse the JSON scorefile only when no CSV cache exists yet; otherwise
# `scores_df` is left to be loaded from the CSV in the following cell
if not os.path.exists(os.path.join(output_path, "scores.csv")):
    scores_json = os.path.join(output_path, "scores.json")
    scores_df = parse_scorefile_linear(scores_json)

### Dump scores_df as a CSV and then reload, for performance reasons

In [None]:
scores_csv = os.path.join(output_path, "scores.csv")

# First run: persist the freshly parsed frame as the CSV cache
if not os.path.exists(scores_csv):
    scores_df.to_csv(scores_csv)

# Always reload from the CSV so every run works from the same on-disk data.
# "Unnamed: 0" is presumably the header of the index column written by to_csv.
scores_df = pd.read_csv(scores_csv, index_col="Unnamed: 0")

### Setup for plotting

In [None]:
# Global seaborn styling applied to every plot below
plot_style = dict(
    context="talk",  # presentation-sized plot elements
    style="ticks",  # white background with black axis lines and tick marks
    palette="colorblind",  # a color palette that is colorblind friendly
    font_scale=1,  # no extra font scaling on top of the chosen context
)
sns.set(**plot_style)

### Data exploration

In [None]:
# Number of rows, followed by the available score columns
print(scores_df.shape[0])
print(scores_df.columns.tolist())

In [None]:
from crispy_shifty.utils.plotting import histplot_df, pairplot_df

### Plot GALigandDock metrics

In [None]:
# Score columns to plot; `cols` is reused by the pair plot cell below
cols = [
    "dH",
    "fa_rmsd",
    "lig_rms",
    "ligscore",
    "mean_plddt",
    "pTMscore",
    "ranking_prerelax",
    "recscore",
    "rmsd_to_reference",
    "total_score",
    "score",
    "tol",
]

# One histogram per column, colored by design type
the_fig = histplot_df(df=scores_df, cols=cols, bins=10, hue="design_type")

# Save the current matplotlib figure next to the score files
dock_scores_png = os.path.join(output_path, "dock_scores.png")
plt.savefig(dock_scores_png)

In [None]:
# Plot a fixed-seed random subsample of at most 1000 rows. DataFrame.sample
# raises ValueError when asked for more rows than the frame holds (sampling
# without replacement), so cap the sample size at the number of rows available.
n_pairplot_rows = min(1000, len(scores_df))
the_fig = pairplot_df(
    df=scores_df.sample(n_pairplot_rows, random_state=0),
    cols=cols,
    hue="design_type",
)
# Save the current matplotlib figure next to the score files
plt.savefig(os.path.join(output_path, "dock_scores_paired.png"))

In [None]:
# Number of distinct input paths represented in the scores
unique_inputs = set(scores_df["path_in"].values)
len(unique_inputs)

### Save a list of outputs
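Paths are written in the existing order of the `scores_df` index; note that this cell does not itself sort by length.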

In [None]:
simulation_name = "03_redock_ligand"
output_path = os.path.join(os.getcwd(), simulation_name)
list_path = os.path.join(output_path, "redocked_binders.list")

# Write one index entry per line, in the order they appear in scores_df
with open(list_path, "w") as list_file:
    for entry in tqdm(scores_df.index):
        list_file.write(f"{entry}\n")

### Prototyping blocks