# Examples for Various Use Cases in `pydebiaseddta`

This notebook examines the use of various hyperparameters pertaining to guides, predictors, or the debiased training process.

In [2]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from pydebiaseddta.guides import BoWDTA, IDDTA, RFDTA, OutDTA
from pydebiaseddta.debiasing import DebiasedDTA
from pydebiaseddta.predictors import DeepDTA, BPEDTA, LMDTA
from pydebiaseddta.utils import load_sample_dta_data
from pydebiaseddta.evaluation import evaluate_predictions

# Guide and predictor classes iterated over by the guide/predictor loop cell.
guides = [BoWDTA, IDDTA]
predictors = [DeepDTA, BPEDTA]

# Load the sample DTA data (mini=True) once and unpack both splits from the
# same result instead of invoking the loader a second time.
sample_data = load_sample_dta_data(mini=True)
train_ligands, train_proteins, train_labels = sample_data["train"]
test_ligands, test_proteins, test_labels = sample_data["test"]

  from .autonotebook import tqdm as notebook_tqdm


Training with various guides and predictors.

In [2]:
# Train each guide/predictor pairing (guides outer, predictors inner) for two
# epochs, print the tracked history, then score the predictor on the test data.
for guide, predictor in [(g, p) for g in guides for p in predictors]:
    print(guide.__name__, predictor.__name__)
    debiaseddta = DebiasedDTA(guide, predictor, predictor_params={"n_epochs": 2})

    # The test data is supplied as the validation split named "cold_both".
    cold_both_split = [test_ligands, test_proteins, test_labels]
    train_hist = debiaseddta.train(
        train_ligands,
        train_proteins,
        train_labels,
        val_splits={"cold_both": cold_both_split},
        metrics_tracked=["mae", "mse", "r2"],
    )
    print(train_hist)

    preds = debiaseddta.predictor_instance.predict(test_ligands, test_proteins)
    scores = evaluate_predictions(
        test_labels,
        preds,
        metrics=["ci", "mse", "r2", "mae", "rmse"],
    )

BoWDTA DeepDTA
{'train': {'mae': [5.376427, 4.969446], 'mse': [30.473136, 26.256895], 'r2': [-18.421735, -15.734558]}, 'val_splits': {'cold_both': {'mae': [5.148385, 4.741596], 'mse': [26.797692, 22.774888], 'r2': [-90.656661, -76.897386]}}}
BoWDTA BPEDTA
{'train': {'mae': [5.397292, 5.025335], 'mse': [30.696363, 26.802896], 'r2': [-18.564006, -16.082546]}, 'val_splits': {'cold_both': {'mae': [5.171109, 4.812014], 'mse': [27.032222, 23.450272], 'r2': [-91.458828, -79.207416]}}}
IDDTA DeepDTA
{'train': {'mae': [5.376427, 4.971248], 'mse': [30.473136, 26.274572], 'r2': [-18.421735, -15.745824]}, 'val_splits': {'cold_both': {'mae': [5.148385, 4.743513], 'mse': [26.797692, 22.792888], 'r2': [-90.656661, -76.958952]}}}
IDDTA BPEDTA
{'train': {'mae': [5.397292, 5.026908], 'mse': [30.696363, 26.818357], 'r2': [-18.564006, -16.0924]}, 'val_splits': {'cold_both': {'mae': [5.171109, 4.814146], 'mse': [27.032222, 23.470198], 'r2': [-91.458828, -79.275569]}}}


Training using various non-default predictor hyperparameters: early stopping based on validation overfitting.

In [8]:
# BoWDTA-guided DeepDTA whose early stopping watches the "mae" metric on the
# "val_set" split (early_stopping_num_epochs=3); models go to "./temp/".
deepdta_params = {
    "n_epochs": 100,
    "model_folder": "./temp/",
    "early_stopping_metric": "mae",
    "early_stopping_num_epochs": 3,
    "early_stopping_split": "val_set",
    "min_epochs": 15,
    "optimizer": "adam",
}
debiaseddta = DebiasedDTA(BoWDTA, DeepDTA, predictor_params=deepdta_params)

# The test data doubles as the "val_set" split referenced by early stopping.
val_set = [test_ligands, test_proteins, test_labels]
train_hist = debiaseddta.train(
    train_ligands,
    train_proteins,
    train_labels,
    val_splits={"val_set": val_set},
    metrics_tracked=["mae", "mse", "r2"],
)

# Score the final predictor on the same data that served as "val_set".
preds = debiaseddta.predictor_instance.predict(test_ligands, test_proteins)
scores = evaluate_predictions(
    test_labels,
    preds,
    metrics=["ci", "mse", "r2", "mae", "rmse"],
)

# Compare the tail of the tracked validation MAE with the final model's MAE.
val_mae_history = train_hist["val_splits"]["val_set"]["mae"]
print("MAE in val_set for last 5 epochs:", val_mae_history[-5:])
print("MAE in val_set in the final model:", scores["mae"])

Early stopping due to no increase to mae in val_set split for 3 epochs.
Retrieved the saved best model.
MAE in val_set for last 5 epochs: [0.630919, 0.417429, 0.494799, 0.47176, 0.453359]
MAE in val_set in the final model: 0.4174285904109863


Training using various non-default predictor hyperparameters: early stopping based on training convergence (based on predefined error levels).

In [9]:
# IDDTA-guided BPEDTA trained with SGD (learning rate 0.1); early stopping is
# configured on the "train" split with an "mse" threshold of 1.6.
bpedta_params = {
    "n_epochs": 100,
    "model_folder": "./temp/",
    "early_stopping_metric": "mse",
    "early_stopping_metric_threshold": 1.6,
    "early_stopping_split": "train",
    "optimizer": "sgd",
    "learning_rate": 0.1,
}
debiaseddta = DebiasedDTA(IDDTA, BPEDTA, predictor_params=bpedta_params)

val_set = [test_ligands, test_proteins, test_labels]
train_hist = debiaseddta.train(
    train_ligands,
    train_proteins,
    train_labels,
    val_splits={"val_set": val_set},
    metrics_tracked=["mae", "mse", "r2"],
)

# Evaluate on the training data, since that is the split early stopping used.
preds = debiaseddta.predictor_instance.predict(train_ligands, train_proteins)
scores = evaluate_predictions(
    train_labels,
    preds,
    metrics=["ci", "mse", "r2", "mae", "rmse"],
)

# Compare the tail of the tracked training MSE with the final model's MSE.
train_mse_history = train_hist["train"]["mse"]
print("MSE in train split for last 5 epochs:", train_mse_history[-5:])
print("MSE in train split in the final model:", scores["mse"])

Early stopping training due to convergence on the train split.
MSE in train split for last 5 epochs: [14.559056, 8.229033, 3.157013, 1.702, 1.57221]
MSE in train split in the final model: 1.572210015864044


Training using various non-default guide hyperparameters.

In [10]:
# Non-default hyperparameters handed to the BoWDTA guide via guide_params.
bow_guide_params = {
    "max_depth": 4,
    "min_samples_split": 5,
    "min_samples_leaf": 3,
    "ligand_vector_mode": "freq",
    "prot_vector_mode": "binary",
    "vocab_size": "low",
    "criterion": "poisson",
    "input_rank": 10,
}
debiaseddta = DebiasedDTA(
    BoWDTA,
    BPEDTA,
    predictor_params={"n_epochs": 10},
    guide_params=bow_guide_params,
)

# No validation splits are passed here; only training metrics are tracked.
train_hist = debiaseddta.train(
    train_ligands,
    train_proteins,
    train_labels,
    metrics_tracked=["mae", "mse", "r2"],
)
preds = debiaseddta.predictor_instance.predict(test_ligands, test_proteins)
scores = evaluate_predictions(
    test_labels,
    preds,
    metrics=["ci", "mse", "r2", "mae", "rmse"],
)



Training using various non-default guide hyperparameters, saving importance weights.

In [11]:
# Importance-weight options forwarded as keyword arguments to DebiasedDTA.
weighting_options = {
    "guide_error_exponent": 1,
    "weight_tempering_num_epochs": 5,
    "weight_temperature": 2,
    "weight_prior": 0.01,
    "weight_rank_based": True,
}
debiaseddta = DebiasedDTA(
    IDDTA,
    BPEDTA,
    predictor_params={"n_epochs": 4},
    **weighting_options,
)

# weights_save_path asks train() to write the weights to "./temp/exp.coef".
train_hist = debiaseddta.train(
    train_ligands,
    train_proteins,
    train_labels,
    metrics_tracked=["mae", "mse", "r2"],
    weights_save_path="./temp/exp.coef",
)
preds = debiaseddta.predictor_instance.predict(test_ligands, test_proteins)
scores = evaluate_predictions(
    test_labels,
    preds,
    metrics=["ci", "mse", "r2", "mae", "rmse"],
)



Training using pre-computed importance weights, ignoring importance weight hyperparameters.

In [3]:
# Same weighting keyword arguments as the weight-saving example; this cell
# passes weights_load_path so train() is pointed at "./temp/exp.coef".
weighting_options = {
    "guide_error_exponent": 1,
    "weight_tempering_num_epochs": 5,
    "weight_temperature": 2,
    "weight_prior": 0.01,
    "weight_rank_based": True,
}
debiaseddta = DebiasedDTA(
    BoWDTA,
    BPEDTA,
    predictor_params={"n_epochs": 4},
    **weighting_options,
)

train_hist = debiaseddta.train(
    train_ligands,
    train_proteins,
    train_labels,
    metrics_tracked=["mae", "mse", "r2"],
    weights_load_path="./temp/exp.coef",
)
preds = debiaseddta.predictor_instance.predict(test_ligands, test_proteins)
scores = evaluate_predictions(
    test_labels,
    preds,
    metrics=["ci", "mse", "r2", "mae", "rmse"],
)



Using `RFDTA` for training.

In [3]:
# RFDTA guide configured with max_depth=3 and num_trees=100; BPEDTA predictor
# trained for six epochs.
rf_guide_params = {"max_depth": 3, "num_trees": 100}
debiaseddta = DebiasedDTA(
    RFDTA,
    BPEDTA,
    predictor_params={"n_epochs": 6},
    guide_params=rf_guide_params,
)

train_hist = debiaseddta.train(
    train_ligands,
    train_proteins,
    train_labels,
    metrics_tracked=["mae", "mse", "r2"],
)
preds = debiaseddta.predictor_instance.predict(test_ligands, test_proteins)
scores = evaluate_predictions(
    test_labels,
    preds,
    metrics=["ci", "mse", "r2", "mae", "rmse"],
)



Using `OutDTA` for training with inverse frequency.

In [4]:
# OutDTA is given the training ligand/protein ids through guide_params.
train_ids = load_sample_dta_data(mini=True)["train_ids"]
train_ligand_ids, train_protein_ids = train_ids

outdta_params = {
    "ligand_id": train_ligand_ids,
    "protein_id": train_protein_ids,
    "rarity_indicator": "inv_frequency",
}
debiaseddta = DebiasedDTA(
    OutDTA,
    DeepDTA,
    predictor_params={"n_epochs": 6},
    guide_params=outdta_params,
)

train_hist = debiaseddta.train(
    train_ligands,
    train_proteins,
    train_labels,
    metrics_tracked=["mae", "mse", "r2"],
)
preds = debiaseddta.predictor_instance.predict(test_ligands, test_proteins)
scores = evaluate_predictions(
    test_labels,
    preds,
    metrics=["ci", "mse", "r2", "mae", "rmse"],
)



Using `OutDTA` for training with average distance.

In [5]:
# OutDTA with the "avg_distance" rarity indicator; a protein similarity matrix
# path is supplied alongside the training ids.
# NOTE(review): the CSV path is relative, so it presumably resolves against the
# working directory - confirm where the notebook is expected to be run from.
train_ids = load_sample_dta_data(mini=True)["train_ids"]
train_ligand_ids, train_protein_ids = train_ids

outdta_params = {
    "ligand_id": train_ligand_ids,
    "protein_id": train_protein_ids,
    "rarity_indicator": "avg_distance",
    "prot_sim_matrix_path": "pydebiaseddta/data/dta/dta_sample_sw_sim_matrix.csv",
}
debiaseddta = DebiasedDTA(
    OutDTA,
    DeepDTA,
    predictor_params={"n_epochs": 6},
    guide_params=outdta_params,
)

train_hist = debiaseddta.train(
    train_ligands,
    train_proteins,
    train_labels,
    metrics_tracked=["mae", "mse", "r2"],
)
preds = debiaseddta.predictor_instance.predict(test_ligands, test_proteins)
scores = evaluate_predictions(
    test_labels,
    preds,
    metrics=["ci", "mse", "r2", "mae", "rmse"],
)



Using an early-stopped `DeepDTA` as a guide.

In [2]:
# DeepDTA acts as both guide and predictor: the guide gets n_epochs=3 and the
# predictor n_epochs=6. The ligand/protein id load was dropped from this cell
# because nothing here reads those ids.
debiaseddta = DebiasedDTA(
    DeepDTA,
    DeepDTA,
    predictor_params={"n_epochs": 6},
    guide_params={"n_epochs": 3},
)

train_hist = debiaseddta.train(
    train_ligands,
    train_proteins,
    train_labels,
    metrics_tracked=["mae", "mse", "r2"],
)
preds = debiaseddta.predictor_instance.predict(test_ligands, test_proteins)
scores = evaluate_predictions(
    test_labels,
    preds,
    metrics=["ci", "mse", "r2", "mae", "rmse"],
)

Using predictor DeepDTA as guide.
Guide training completed.


Using `BoWDTA` as predictor with no guides.

In [3]:
# No guide is supplied (None); BoWDTA is used as the predictor with
# max_depth=3. The ligand/protein id load was dropped from this cell because
# nothing here reads those ids.
debiaseddta = DebiasedDTA(
    None,
    BoWDTA,
    predictor_params={"max_depth": 3},
)

train_hist = debiaseddta.train(
    train_ligands,
    train_proteins,
    train_labels,
    metrics_tracked=["mae", "mse", "r2"],
)
preds = debiaseddta.predictor_instance.predict(test_ligands, test_proteins)
scores = evaluate_predictions(
    test_labels,
    preds,
    metrics=["ci", "mse", "r2", "mae", "rmse"],
)
print(scores)

{'ci': 0.5, 'mse': 0.30609147224940664, 'r2': -0.04693053373586897, 'mae': 0.4422590312797102, 'rmse': 0.5532553409135845}
