In [None]:
from omegaconf import OmegaConf
from sklearn.model_selection import ParameterGrid

In [None]:
tasks = [
            "halogenase",
            "bkace",
            "gt",
            "esterase",
            "kinase",
            "phosphatase"
        ]
model_type = ['SimpleCoembedding']
drug_featurizers = ['MorganFeaturizer']
target_featurizers = ['ProtBertFeaturizer']
fold = list(range(10))

In [None]:
param_grid = ParameterGrid(
    {
        "task": tasks,
        "drug_featurizer": drug_featurizers,
        "target_featurizer": target_featurizers,
        "replicate": fold,
    }
)

defaults = {
    "contrastive_split": "within",
    "model_architecture": "SimpleCoembedding",
    "latent_dimension": 1024,
    "latent_distance": "Cosine",
    "batch_size": 32,
    "shuffle": True,
    "num_workers": 0,
    "epochs": 50,
    "every_n_val": 1,
    "lr": 1e-4,
    "clr": 1e-5,
    "verbosity": 3,
    "wandb_proj": "DTI_Benchmarking",
}

N_GPUS = 8
config_files = {}

param_sets = list(param_grid)
param_sets.sort(key=lambda x: x["task"], reverse=True)

for i, param in enumerate(param_sets):
    param_name = "_".join([f"{k}:{v}" for k, v in param.items()])
    oc = OmegaConf.structured(param)
    oc.device = 0
    oc.update(defaults)
    oc.model_save_dir = f"./best_models/enzpred_contrastive/{param_name}"
    oc.log_file = f"{oc.model_save_dir}/log.txt"

    filename = f"./configs/enzpred_contrastive/config_{param_name}.yaml"
    config_files[param_name] = filename
    OmegaConf.save(config=oc, f=f"../{filename}")

base_cmd = "python train_DTI.py --config {} --exp-id {} --wandb-proj EnzPred --contrastive"
list_file = "../configs/enzpred_contrastive/benchmark_sweep_list.txt"
with open(list_file, "w+") as f:
    for fi, key in config_files.items():
        cmd = base_cmd.format(key, fi)
        f.write(f"{cmd}\n")

bash_file = "../configs/enzpred_contrastive/benchmark_sweep_run.sh"
with open(bash_file, "w+") as f:
    f.write(
        f"simple_gpu_scheduler --gpus {' '.join([str(i) for i in range(N_GPUS)])} < {list_file}"
    )


# Mimic Splits and Eval

In [24]:
import os
os.chdir("/afs/csail.mit.edu/u/s/samsl/Work/Adapting_PLM_DTI")

In [25]:
from src.architectures import SimpleCoembedding
from src.featurizers import ProtBertFeaturizer, MorganFeaturizer
from src.utils import config_logger

In [26]:
logg = config_logger(
        None,
        "%(asctime)s [%(levelname)s] %(message)s",
        0,
        use_stdout=True,
    )

In [27]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.autograd import Variable

import numpy as np
import pandas as pd
import pickle as pk
from tqdm.notebook import tqdm

from sklearn.model_selection import KFold
from sklearn.metrics import average_precision_score

from dataclasses import dataclass
from src.data import BinaryDataset, drug_target_collate_fn

In [28]:
TASK_PATH = {        
    "halogenase": "./dataset/EnzPred/halogenase_NaCl_binary.csv",
    "gt": "./dataset/EnzPred/gt_acceptors_achiral_binary.csv",
    "bkace": "./dataset/EnzPred/duf_binary.csv",
    "esterase": "./dataset/EnzPred/esterase_binary.csv",
    "phosphatase": "./dataset/EnzPred/phosphatase_chiral_binary.csv",
    "kinase": "./dataset/EnzPred/davis_filtered.csv",
}

In [29]:
N_SPLITS = {        
    "halogenase": "N",
    "gt": "N",
    "bkace": 10,
    "esterase": 10,
    "phosphatase": 10,
    "kinase": 10,
}

In [30]:
@dataclass
class Config:
    drug_shape: int = 2048
    target_shape: int = 1024
    latent_shape: int = 1024
    lr: float = 1e-4
    epochs: int = 25
    batch_size: int = 32
    shuffle: bool = True
    num_workers: int = 0
    
conf = Config()

In [31]:
def create_model(conf):
    model = SimpleCoembedding(
        conf.drug_shape,
        conf.target_shape,
        latent_dimension=conf.latent_shape,
        latent_distance="Cosine",
        classify=True,
    ).to(conf.device)
    return model

In [32]:
def create_data(dataframe, conf, drug_feat, target_feat):

    bdataset = BinaryDataset(
        dataframe[conf.drug_col],
        dataframe[conf.target_col],
        dataframe[conf.label_col],
        drug_feat,
        target_feat,
    )
    bdataloader = DataLoader(
        bdataset,
        batch_size = conf.batch_size,
        shuffle = conf.shuffle,
        num_workers = conf.num_workers,
        collate_fn = drug_target_collate_fn,
    )
    return bdataloader

In [33]:
def step(model, batch, device=None):

    if device is None:
        device = torch.device("cpu")

    drug, target, label = batch

    pred = model(drug.to(device), target.to(device))
    label = Variable(torch.from_numpy(np.array(label)).float()).to(device)
    return pred, label

def train_model(model, dataloader, conf):
    
    opt = torch.optim.Adam(model.parameters(), lr=conf.lr)
    loss_fct = torch.nn.BCELoss()
    model.train()
    
    for epo in tqdm(range(conf.epochs), leave=False, desc="Train"):
        
        # for i, batch in tqdm(enumerate(dataloader), leave=False, desc="Epoch", total=len(dataloader)):
        for i, batch in enumerate(dataloader):

            pred, label = step(model, batch, conf.device)
            loss = loss_fct(pred, label)

            opt.zero_grad()
            loss.backward()
            opt.step()
            
    return model

In [34]:
def eval_model(model, dataloader, conf):
    model.eval()
    
    preds = []
    labels = []
    
    with torch.set_grad_enabled(False):
        for i, batch in enumerate(dataloader):
        
            pred, label = step(model, batch, conf.device)
            preds.append(pred)
            labels.append(label)
    
    preds = torch.cat(preds).detach().cpu().numpy()
    labels = torch.cat(labels).detach().cpu().numpy()
    aupr = average_precision_score(labels, preds)
    
    return aupr

In [64]:
from pprint import PrettyPrinter

In [65]:
pp = PrettyPrinter()

In [70]:
x = pp.pformat(vars(conf))

In [74]:
from pathlib import Path

In [35]:
conf.enzyme_type = "halogenase"
conf.device = torch.device("cuda:0")

In [48]:
for sub in full_df[conf.drug_col].unique():
    task_df = full_df[full_df[conf.drug_col] == sub]
    print(task_df[conf.label_col].sum(), len(task_df) - task_df[conf.label_col].sum(), len(task_df))

0 42 42
2 40 42
0 42 42
0 42 42
3 39 42
0 42 42
0 42 42
0 42 42
0 42 42
0 42 42
0 42 42
0 42 42
1 41 42
0 42 42
0 42 42
0 42 42
0 42 42
0 42 42
0 42 42
0 42 42
0 42 42
2 40 42
0 42 42
6 36 42
1 41 42
5 37 42
11 31 42
4 38 42
2 40 42
1 41 42
0 42 42
3 39 42
3 39 42
1 41 42
1 41 42
1 41 42
0 42 42
0 42 42
1 41 42
0 42 42
0 42 42
0 42 42
2 40 42
0 42 42
0 42 42
0 42 42
0 42 42
4 38 42
0 42 42
1 41 42
1 41 42
3 39 42
0 42 42
0 42 42
4 38 42
0 42 42
0 42 42
0 42 42
0 42 42
0 42 42
0 42 42
0 42 42


In [46]:
full_df[conf.label_col]

0       0
1       0
2       0
3       0
4       0
       ..
2599    0
2600    0
2601    0
2602    0
2603    0
Name: Conversion_NaCl, Length: 2604, dtype: int64

In [36]:
data_file = TASK_PATH[conf.enzyme_type]
full_df = pd.read_csv(data_file,index_col=0)
conf.target_col = full_df.columns[0]
conf.drug_col = full_df.columns[1]
conf.label_col = full_df.columns[2]

substrates = full_df[conf.drug_col].unique()
enzymes = full_df[conf.target_col].unique()

n_splits = len(enzymes) if N_SPLITS[conf.enzyme_type] == "N" else N_SPLITS[conf.enzyme_type]
kfsplitter = KFold(n_splits)

# drug_feat = MorganFeaturizer().to(conf.device)
# drug_feat.preload(substrates)

# target_feat = ProtBertFeaturizer().to(conf.device)
# target_feat.preload(enzymes)

In [51]:
task_aupr = {s: [] for s in substrates}
for curr_task in tqdm(substrates,leave=False,desc="Task"):
    
    aupr_list = []
    
    for i, (train_ind, test_ind) in tqdm(enumerate(kfsplitter.split(enzymes)), leave=False, desc="Split", total=n_splits):
            
        train_enzymes = [enzymes[i] for i in train_ind]
        train_df = full_df[full_df[conf.target_col].isin(train_enzymes)]
        train_dataloader = create_data(train_df, conf, drug_feat, target_feat)
        
        test_enzymes = [enzymes[i] for i in test_ind]
        test_df = full_df[(full_df[conf.target_col].isin(test_enzymes)) & (full_df[conf.drug_col] == curr_task)]
        test_dataloader = create_data(test_df, conf, drug_feat, target_feat)

        model = create_model(conf)
        model = train_model(model, train_dataloader, conf)

        aupr = eval_model(model, test_dataloader, conf)
        aupr_list.append(aupr)
        task_aupr[curr_task] = aupr_list

Task:   0%|          | 0/96 [00:00<?, ?it/s]

Split:   0%|          | 0/10 [00:00<?, ?it/s]

Train:   0%|          | 0/25 [00:00<?, ?it/s]

Train:   0%|          | 0/25 [00:00<?, ?it/s]

Train:   0%|          | 0/25 [00:00<?, ?it/s]

Train:   0%|          | 0/25 [00:00<?, ?it/s]

Train:   0%|          | 0/25 [00:00<?, ?it/s]

Train:   0%|          | 0/25 [00:00<?, ?it/s]

Train:   0%|          | 0/25 [00:00<?, ?it/s]

Train:   0%|          | 0/25 [00:00<?, ?it/s]

Train:   0%|          | 0/25 [00:00<?, ?it/s]

Train:   0%|          | 0/25 [00:00<?, ?it/s]

Split:   0%|          | 0/10 [00:00<?, ?it/s]

Train:   0%|          | 0/25 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [93]:
task_aupr = {k: np.array(v) for k,v in task_aupr.items()}

In [99]:
each_avg_aupr = []
for curr_task in substrates:
    avg_aupr = np.nanmean(task_aupr[curr_task])
    logg.info(f"Substrate {curr_task} AUPR: {avg_aupr}")
    each_avg_aupr.append(avg_aupr)

logg.info(f"AUPR: {np.nanmean(np.array(each_avg_aupr))}")
logg.info(task_aupr)

2022-07-22 13:44:05,808 [INFO] Substrate CC(=O)OC1=CC=CC2=CC=CC=C21 AUPR: 0.9629192797942798
2022-07-22 13:44:05,808 [INFO] Substrate CC(=O)OC1=CC=CC2=CC=CC=C21 AUPR: 0.9629192797942798
2022-07-22 13:44:05,808 [INFO] Substrate CC(=O)OC1=CC=CC2=CC=CC=C21 AUPR: 0.9629192797942798
2022-07-22 13:44:05,813 [INFO] Substrate CCCC(=O)OC1=CC=CC2=CC=CC=C21 AUPR: nan
2022-07-22 13:44:05,813 [INFO] Substrate CCCC(=O)OC1=CC=CC2=CC=CC=C21 AUPR: nan
2022-07-22 13:44:05,813 [INFO] Substrate CCCC(=O)OC1=CC=CC2=CC=CC=C21 AUPR: nan
2022-07-22 13:44:05,823 [INFO] Substrate CC(=O)OCC(COC(=O)C)OC(=O)C AUPR: nan
2022-07-22 13:44:05,823 [INFO] Substrate CC(=O)OCC(COC(=O)C)OC(=O)C AUPR: nan
2022-07-22 13:44:05,823 [INFO] Substrate CC(=O)OCC(COC(=O)C)OC(=O)C AUPR: nan
2022-07-22 13:44:05,825 [INFO] Substrate CCC(=O)OCC(COC(=O)CC)OC(=O)CC AUPR: nan
2022-07-22 13:44:05,825 [INFO] Substrate CCC(=O)OCC(COC(=O)CC)OC(=O)CC AUPR: nan
2022-07-22 13:44:05,825 [INFO] Substrate CCC(=O)OCC(COC(=O)CC)OC(=O)CC AUPR: nan
2022

In [86]:
with open(f"{conf.enzyme_type}_results.pk", "wb") as fi:
    pk.dump(tapr, fi)

In [97]:
labels = [0,0,0,0,0]
preds = [0.1,0.9,0.3,0.5,0.4]

average_precision_score(labels, preds)

nan

In [104]:
from pathlib import Path

In [105]:
p = Path("src/featurizers/base.py")

In [110]:
p.absolute().with_suffix(".log")

PosixPath('/data/cb/samsl/Adapting_PLM_DTI/src/featurizers/base.log')

In [1]:
import torch

In [3]:
labels = [1,1,1,1,1]
preds = [0.1,0.2,0.3,0.4,0.5]

In [12]:
t1 = torch.tensor(labels)
t2 = torch.tensor(preds)
t3 = torch.tensor(0.5)
torch.cat([t1,t2])

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 0.1000, 0.2000, 0.3000, 0.4000,
        0.5000])

In [13]:
torch.cat([t3])

RuntimeError: zero-dimensional tensor (at position 0) cannot be concatenated

In [16]:
t1

tensor([1, 1, 1, 1, 1])

In [19]:
t1.reshape(-1)

tensor([1, 1, 1, 1, 1])

In [17]:
t3

tensor(0.5000)

In [21]:
t3.view(-1)

tensor([0.5000])

In [22]:
TASK_PATH

NameError: name 'TASK_PATH' is not defined