# Processing Drug-Target Interaction Data

This notebook covers:
- Converting merged DTI data into an `h5torch` dataset
- Splitting the dataset (stratified) into train/val/test in two settings: random split, and cold-start split
- Computing embeddings from foundation models and storing them in the `h5torch` file
    - Drugs: `MMELON` (graph, image, text), and `RDKit` fingerprints
    - Targets: `NT`, `ESM`, and `ESPF` fingerprints
- Visualizing the foundation model embeddings

In [1]:
from resolve import *

Setting working directory to: /home/robsyc/Desktop/thesis/MB-VAE-DTI


In [2]:
import pandas as pd

# Load the merged drug-target interaction table from the processed-data folder.
dti_csv_path = "data/processed/dti.csv"
df = pd.read_csv(filepath_or_buffer=dti_csv_path)
# df = df.sample(100)  # optional: work on a random subsample instead
df

Unnamed: 0,Drug_ID,Drug_InChIKey,Drug_SMILES,Target_ID,Target_UniProt_ID,Target_Gene_name,Target_RefSeq_ID,Target_AA,Target_DNA,Y,Y_pKd,Y_pKi,Y_KIBA,in_DAVIS,in_BindingDB_Kd,in_BindingDB_Ki,in_Metz,in_KIBA
0,D000001,HYTVYLVVJDEURY-AUCFXJAVSA-N,C#CC(=O)C1(C)CCC2c3ccc(O)cc3CCC2C1,T000001,P14061,HSD17B1,NM_000413,MARTVVLITGCSSGIGLHLAVRLASDPSQSFKVYATLRDLKTQGRL...,ATGGCCCGCACCGTGGTGCTCATCACCGGCTGTTCCTCGGGCATCG...,False,,5.552826,,False,False,True,False,False
1,D000002,CFCGTXOJJUJIIE-UHFFFAOYSA-N,C#CC(C#C)=C1CCC(N(CCC)CCC)CC1,T000002,P35462,DRD3,NM_000796,MASLSQLSSHLNYTCGAENSTGASQARPHAYYALSYCALILAIVFG...,ATGGCATCTCTGAGCCAGCTGAGTGGCCACCTGAACTACACCTGTG...,False,,5.356537,,False,False,True,False,False
2,D000002,CFCGTXOJJUJIIE-UHFFFAOYSA-N,C#CC(C#C)=C1CCC(N(CCC)CCC)CC1,T000003,P14416,DRD2,NM_000795,MDPLNLSWYDDDLERQNWSRPFNGSDGKADRPHYNYYATLLTLLIA...,ATGGATCCACTGAATCTGTCCTGGTATGATGATGATCTGGAGAGGC...,False,,4.809891,,False,False,True,False,False
3,D000002,CFCGTXOJJUJIIE-UHFFFAOYSA-N,C#CC(C#C)=C1CCC(N(CCC)CCC)CC1,T000004,Q95136,DRD1,NM_174042,MRTLNTSTMEGTGLVAERDFSFRILTACFLSLLILSTLLGNTLVCA...,ATGAGGACTCTCAACACGTCTACCATGGAAGGCACCGGGCTGGTGG...,False,,4.795877,,False,False,True,False,False
4,D000003,PPWNCLVNXGCGAF-UHFFFAOYSA-N,C#CC(C)(C)C,T000005,P05182,Cyp2e1,NM_031543,MAVLGITIALLVWVATLLVISIWKQIYNSWNLPPGPFPLPILGNIF...,ATGGCGGTTCTTGGCATCACCATTGCCTTGCTGGTGTGGGTGGCCA...,False,,3.000000,,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339192,D136571,ZWXHJFIYTQUPOJ-UHFFFAOYSA-N,c1ncc(Cc2cc3c(s2)CCCC3)[nH]1,T000568,P25100,ADRA1D,NM_000678,MTFRDLLSVSFEGPRPDSSAGGSSAGGGGGSAGGAAPSEGPAVGGV...,ATGACTTTCCGCGATCTCCTGAGCGTCAGTTTCGAGGGACCCCGCC...,True,,8.420216,,False,False,True,False,False
339193,D136572,MPRYSKNMRJZZIQ-UHFFFAOYSA-N,c1ncc(Cc2ccsc2)[nH]1,T000568,P25100,ADRA1D,NM_000678,MTFRDLLSVSFEGPRPDSSAGGSSAGGGGGSAGGAAPSEGPAVGGV...,ATGACTTTCCGCGATCTCCTGAGCGTCAGTTTCGAGGGACCCCGCC...,True,,8.494850,,False,False,True,False,False
339194,D136573,XYXCOZZITXKLLF-BETUJISGSA-N,c1ncc(N2CC3CNCC3C2)cc1N1CCOCC1,T000235,P09483,Chrna4,NM_024354,MANSGTGAPPPLLLLPLLLLLGTGLLPASSHIETRAHAEERLLKRL...,GGCCCCGGGGCGCCGCCGCCGCTGCTGCTACTGCCGCTGCTGCTGC...,False,,7.326979,,False,False,True,False,False
339195,D136573,XYXCOZZITXKLLF-BETUJISGSA-N,c1ncc(N2CC3CNCC3C2)cc1N1CCOCC1,T000514,Q05941,Chrna7,NM_012832,MCGGRGGIWLALAAALLHVSLQGEFQRRLYKELVKNYNPLERPVAN...,ATGTGCGGCGGGCGGGGAGGCATCTGGCTGGCTCTGGCCGCGGCGC...,False,,4.999996,,False,False,True,False,False


In [3]:
from mb_vae_dti.processing.embedding import save_representations_to_h5

# One HDF5 file per raw representation column of `df`, written in this order:
# drug SMILES, target amino-acid sequence, target DNA sequence.
representation_files = (
    ("Drug_SMILES", "dti_smiles.hdf5"),
    ("Target_AA", "dti_aa.hdf5"),
    ("Target_DNA", "dti_dna.hdf5"),
)
written_h5_paths = [
    save_representations_to_h5(df, column, h5_name)
    for column, h5_name in representation_files
]
# do same for pretrain_... data

# The return value of the last call stays the cell's displayed output.
written_h5_paths[-1]

2025-04-27 01:31:18,392 - embedding - INFO - Created H5 file with 126811 unique Drug_IDs at /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/dti_smiles.hdf5
2025-04-27 01:31:18,726 - embedding - INFO - Created H5 file with 1976 unique Target_IDs at /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/dti_aa.hdf5
2025-04-27 01:31:19,347 - embedding - INFO - Created H5 file with 1976 unique Target_IDs at /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/dti_dna.hdf5


PosixPath('/home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/dti_dna.hdf5')

In [6]:
from mb_vae_dti.processing.embedding import run_embedding_script

# Each call runs one external embedding model over one of the HDF5 files written
# earlier. Calls that were already executed are kept commented out as a record;
# the trailing number is presumably the embedding width (the recorded log for
# the live call reports shape (1024,), matching its note).
# run_embedding_script(
#     "dti_smiles.hdf5",
#     "MorganFP"            # 2048 (done)
# )
# run_embedding_script(
#     "dti_aa.hdf5",
#     "ESPF"                  # 4170 (done)
# )
# run_embedding_script(
#     "dti_smiles.hdf5",
#     "biomed-multi-view"   # 512, 512, 768 (done)
# )
# run_embedding_script(
#     "dti_aa.hdf5",
#     "ESM"                   # 1152 (doesn't fit in mem)
# )
# Live call: embed the target DNA sequences with the Nucleotide Transformer.
run_embedding_script(
    "dti_dna.hdf5",
    "nucleotide-transformer" # 1024 (done, but no CUDA-enabled jaxlib)
)

2025-04-27 02:11:44,594 - embedding - INFO - Running embedding script with command: /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/run_embeddings.sh nucleotide-transformer /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/dti_dna.hdf5


Activating virtual environment at /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/nucleotide-transformer/venv
Checking for installation requirements...
Installing using setup.py
Running: python /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/nucleotide-transformer/script.py --input "/home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/dti_dna.hdf5"
Loading Nucleotide Transformer model (500M_multi_species_v2), padding to 650...
Downloading model's hyperparameters json file...
Downloaded model's hyperparameters.
Downloading model's weights...
Downloaded model's weights...
Model loaded successfully
Configured to extract embeddings from layer: 29
Processing DNA sequences with Nucleotide Transformer (500M_multi_species_v2)
--- Starting embedding generation for 'EMB-NT' ---
Processing HDF5 file: /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/dti_dna.hdf5
Using batch size: 16
Processing first batch to determine embedding dimensions...




Determined embedding shape: (1024,), dtype: float32
Created HDF5 dataset 'embeddings/EMB-NT' with shape (1976, 1024)
Processing remaining 1960 items in 123 batches...


Generating EMB-NT embeddings: 100%|██████████| 1960/1960 [1:25:06<00:00,  2.61s/it]


Successfully added 'EMB-NT' embeddings to /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/dti_dna.hdf5
--- Finished embedding generation for 'EMB-NT' ---
Embedding generation completed successfully


---
---

## Add split columns and create h5torch file

In [4]:
from mb_vae_dti.processing import add_split_cols, create_h5torch

# Add the split-assignment columns to the interaction table (a later preview of
# `df_split` shows `split_rand` and `split_cold`), then write it to an h5torch file.
df_split = add_split_cols(df)
# NOTE(review): the recorded output of this cell reports 1000 rows, so it was
# presumably last run on a subsample of `df` rather than the full table — confirm.
create_h5torch(df_split, output_filename="DTI_small.h5torch")

14:37:25 - INFO - Old pandas version detected. Patching DataFrame.map to DataFrame.applymap


Creating h5torch file from dataframe with 1000 rows...
Found 955 unique drugs and 418 unique targets
Creating central interaction matrix of shape (955, 418) with 1000 observed interactions
Processing drug features...
Processing target features...
Created h5torch file at data/processed/DTI_small.h5torch


In [14]:
from mb_vae_dti.processing import load_h5torch_DTI

# Test partition of the cold-start split, restricted to interactions flagged as
# coming from DAVIS, Metz, BindingDB-Kd or BindingDB-Ki.
cold_test_query = {
    "filename": "DTI_small.h5torch",
    "setting": "split_cold",
    "split": "test",
    "datasets": ["in_DAVIS", "in_Metz", "in_BindingDB_Kd", "in_BindingDB_Ki"],
}
test_davis_metz = load_h5torch_DTI(**cold_test_query)

# Show a single sample to inspect the keys the dataset yields.
test_davis_metz[20]

Using boolean mask for mapping (93 indices)
Verified alignment: all unstructured data has 1000 elements


{'central': False,
 '0/Drug_ID': 'D020044',
 '0/Drug_InChIKey': 'WUTRVCAUFFQVGX-HNNXBMFYSA-N',
 '0/Drug_SMILES': 'CC(C)CN(C(=O)c1ccccc1C(C)C)[C@H]1CCNC1',
 '1/Target_AA': 'MSKSKCSVGLMSSVVAPAKEPNAVGPKEVELILVKEQNGVQLTSSTLTNPRQSPVEAQDRETWGKKIDFLLSVIGFAVDLANVWRFPYLCYKNGGGAFLVPYLLFMVIAGMPLFYMELALGQFNREGAAGVWKICPILKGVGFTVILISLYVGFFYNVIIAWALHYLFSSFTTELPWIHCNNSWNSPNCSDAHPGDSSGDSSGLNDTFGTTPAAEYFERGVLHLHQSHGIDDLGPPRWQLTACLVLVIVLLYFSLWKGVKTSGKVVWITATMPYVVLTALLLRGVTLPGAIDGIRAYLSVDFYRLCEASVWIDAATQVCFSLGVGFGVLIAFSSYNKFTNNCYRDAIVTTSINSLTSFSSGFVVFSFLGYMAQKHSVPIGDVAKDGPGLIFIIYPEAIATLPLSSAWAVVFFIMLLTLGIDSAMGGMESVITGLIDEFQLLHRHRELFTLFIVLATFLLSLFCVTNGGIYVFTLLDHFAAGTSILFGVLIEAIGVAWFYGVGQFSDDIQQMTGQRPSLYWRLCWKLVSPCFLLFVVVVSIVTFRPPHYGAYIFPDWANALGWVIATSSMAMVPIYAAYKFCSLPGSFREKLAYAIAPEKDRELVDRGEVRQFTLRHWLKV',
 '1/Target_DNA': 'ATGAGTAAGAGCAAATGCTCCGTGGGACTCATGTCTTCCGTGGTGGCCCCGGCTAAGGAGCCCAATGCCGTGGGCCCGAAGGAGGTGGAGCTCATCCTTGTCAAGGAGCAGAACGGAGTGCAGCTCACCAGCTCCACCCTCACCAACCCGCGGCAGAGCCCCGTGGAGGCCCAGGATCGGGAGACCT

In [15]:
# Look up the interaction row(s) of `df_split` for one specific drug/target pair.
drug_id, target_id = 'D020044', 'T000469'

pair_mask = df_split["Drug_ID"].eq(drug_id) & df_split["Target_ID"].eq(target_id)
df_split.loc[pair_mask]

Unnamed: 0,Drug_ID,Drug_InChIKey,Drug_SMILES,Target_ID,Target_UniProt_ID,Target_Gene_name,Target_RefSeq_ID,Target_AA,Target_DNA,Y,Y_pKd,Y_pKi,Y_KIBA,in_DAVIS,in_BindingDB_Kd,in_BindingDB_Ki,in_Metz,in_KIBA,split_rand,split_cold
58456,D020044,WUTRVCAUFFQVGX-HNNXBMFYSA-N,CC(C)CN(C(=O)c1ccccc1C(C)C)[C@H]1CCNC1,T000469,Q01959,SLC6A3,NM_001044,MSKSKCSVGLMSSVVAPAKEPNAVGPKEVELILVKEQNGVQLTSST...,ATGAGTAAGAGCAAATGCTCCGTGGGACTCATGTCTTCCGTGGTGG...,False,,5.444893,,False,False,True,False,False,train,test


## Adding embeddings to the h5torch file

In [7]:
from mb_vae_dti.processing import load_h5torch_DTI

# Test partition of the cold-start split, restricted to interactions flagged as
# coming from DAVIS, Metz or BindingDB-Kd.
cold_test_query = {
    "filename": "DTI_small.h5torch",
    "setting": "split_cold",
    "split": "test",
    "datasets": ["in_DAVIS", "in_Metz", "in_BindingDB_Kd"],
}
test_davis_metz = load_h5torch_DTI(**cold_test_query)

# Show a single sample to inspect the keys the dataset yields.
test_davis_metz[20]

Using boolean mask for mapping (22 indices)
Verified alignment: all unstructured data has 1000 elements


{'central': True,
 '0/Drug_FP': array([0., 1., 0., ..., 0., 0., 0.], dtype=float32),
 '0/Drug_ID': 'D005704',
 '0/Drug_InChIKey': 'BPNUQXPIQBZCMR-IBGZPJMESA-N',
 '0/Drug_SMILES': 'Cc1n[nH]c2ccc(-c3cncc(OC[C@@H](N)Cc4ccccc4)c3)cc12',
 '0/Drug_emb_graph': array([ 1.90964546e-02,  3.65307368e-02, -7.67227039e-02, -2.17817098e-01,
         2.92698480e-02, -7.65806576e-03,  6.17058277e-02, -3.02248616e-02,
        -3.07081044e-01, -1.00169100e-01, -1.62992515e-02, -4.32518095e-01,
         7.75575042e-02,  2.66861264e-02, -2.08349586e-01,  6.72742501e-02,
         4.46739830e-02,  1.43983409e-01,  3.29942629e-02,  8.80348533e-02,
        -1.41768217e-01, -3.20785753e-02,  2.06791297e-01,  2.04960611e-02,
         1.23171246e+00, -1.98846664e-02,  2.10213590e+00, -6.31648824e-02,
         3.30971442e-02,  2.19252086e+00, -5.95130119e-03, -2.86935985e-01,
        -4.31635641e-02,  4.11687195e-02,  1.21839553e-01, -4.30982709e-01,
        -4.06590328e-02,  6.95760772e-02, -9.02318507e-02,  2.5

In [6]:
from mb_vae_dti.processing.embedding import add_all_embeddings_to_h5torch

# Compute embeddings and add them to the existing h5torch file. The recorded log
# shows the drug SMILES being read back and sent to the external rdMorganFP
# script first; which further models are run is not visible from this cell.
add_all_embeddings_to_h5torch(
    h5torch_file_name="DTI_small.h5torch"
)

2025-04-19 14:39:47,131 - embedding - INFO - Retrieved 955 sequences from Drug_SMILES
2025-04-19 14:39:47,131 - embedding - INFO - Generating embeddings using rdMorganFP...
2025-04-19 14:39:47,132 - embedding - INFO - Wrote 955 sequences to external/temp/rdmorganfp_input_1745066387.txt
2025-04-19 14:39:47,133 - embedding - INFO - Running embedding command: /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/run_embeddings.sh rdMorganFP /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/rdmorganfp_input_1745066387.txt /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/rdmorganfp_output_1745066387.npy
2025-04-19 14:39:47,980 - embedding - INFO - Activating virtual environment at /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/rdMorganFP/venv
Running: python /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/rdMorganFP/script.py --input "/home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/rdmorganfp_input_1745066387.txt" --output "/home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/rdmorga

In [5]:
from mb_vae_dti.processing import add_embeddings_to_h5torch

# Add a single embedding feature ("Drug_emb_graph") to the h5torch file, computed
# from the drug SMILES with the "biomed-multi-view" model.
# Presumably axis 0 is the drug axis (dataset samples expose keys such as
# "0/Drug_ID"); `overwrite=True` presumably replaces an existing feature of the
# same name — confirm against the helper's definition.
# NOTE(review): the output stored with this cell is from an ESPF run on Target_AA,
# not from these arguments, so the recorded output is stale.
add_embeddings_to_h5torch(
    h5torch_file_name="DTI_small.h5torch",
    repo_name="biomed-multi-view",
    entity_axis=0,
    entity_representation="Drug_SMILES",
    feature_name="Drug_emb_graph",
    use_batch=False,
    overwrite=True
)

2025-04-19 13:29:35,624 - embedding - INFO - Retrieved 433 sequences from Target_AA
2025-04-19 13:29:35,625 - embedding - INFO - Generating embeddings using ESPF...
2025-04-19 13:29:35,626 - embedding - INFO - Wrote 433 sequences to external/temp/espf_input_1745062175.txt
2025-04-19 13:29:35,626 - embedding - INFO - Running embedding command: /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/run_embeddings.sh ESPF /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/espf_input_1745062175.txt /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/espf_output_1745062175.npy
2025-04-19 13:29:39,705 - embedding - INFO - Activating virtual environment at /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/ESPF/venv
Running: python /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/ESPF/script.py --input "/home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/espf_input_1745062175.txt" --output "/home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/espf_output_1745062175.npy"
Processing 433 sequences...
S

---

## Drug Generation

In [7]:
import os

# Run from the repository root so the relative "data/processed/..." paths below
# resolve. The root can be overridden through the MB_VAE_DTI_ROOT environment
# variable; the default is the original hard-coded location, so behaviour is
# unchanged on the machine this notebook was written on.
os.chdir(os.environ.get("MB_VAE_DTI_ROOT", "/home/robsyc/Desktop/thesis/MB-VAE-DTI"))

from mb_vae_dti.processing import add_split_cols_drug_generation, create_h5torch_smiles, SMILESDataset
import pandas as pd

# Load the drug-generation table and add its split column(s).
df_drug_generation = pd.read_csv("data/processed/data_drug_generation.csv")
df_drug_generation_split = add_split_cols_drug_generation(df_drug_generation)

In [8]:
# Write the split-annotated SMILES table to an h5torch file; per the recorded
# output it ends up at data/processed/data_drug_generation.h5torch.
create_h5torch_smiles(df_drug_generation_split)

Creating h5torch file from dataframe with 4007090 SMILES strings...
Created h5torch file for SMILES data at data/processed/data_drug_generation.h5torch


In [9]:
import h5torch

# Peek at the layout of the drug-generation h5torch file: top-level keys, the
# central object, the "0" group and its members, the first ten SMILES entries,
# and a membership check for the "0/smiles" path.
with h5torch.File("data/processed/data_drug_generation.h5torch", "r") as h5_file:
    axis0_group = h5_file["0"]
    for item in (
        h5_file.keys(),
        h5_file["central"],
        axis0_group,
        axis0_group.keys(),
        h5_file["0/smiles"][:10],
        "0/smiles" in h5_file,
    ):
        print(item)


<KeysViewHDF5 ['0', 'central']>
<HDF5 dataset "central": shape (4007090,), type "<i8">
<HDF5 group "/0" (2 members)>
<KeysViewHDF5 ['smiles', 'split']>
[b'Cc1cccc(N2CC(c3nc4ccccc4n3C)CC2=O)c1C'
 b'CCn1c(C2CC(=O)N(c3cccc(C)c3C)C2)nc2ccccc21'
 b'Cc1cc(C)c(N2CC(c3nc4ccccc4[nH]3)CC2=O)c(C)c1'
 b'Cc1cc(C)c(N2CC(c3nc4ccccc4n3C)CC2=O)c(C)c1'
 b'Cn1c(C2CC(=O)N(Cc3ccccc3)C2)nc2ccccc21'
 b'Cc1ccc(N2CC(c3nc4ccccc4n3C)CC2=O)cc1Cl'
 b'COc1ccc(Cl)cc1N1CC(c2nc3ccccc3[nH]2)CC1=O'
 b'CC1CN(c2cc(=O)n(-c3ccccc3Cl)c(=O)[nH]2)CC(C)O1'
 b'O=C(c1ccccc1)N1CCN(S(=O)(=O)Cc2ccccc2)CC1'
 b'O=C(c1ccncc1)N1CCN(S(=O)(=O)Cc2ccccc2)CC1']
True


In [1]:
# Import here so the cell also runs on a fresh kernel: other cells in this
# notebook import the helpers they use, and this one previously relied on
# `SMILESDataset` having been imported by an earlier cell.
from mb_vae_dti.processing import SMILESDataset

# Create dataset for all data
dataset = SMILESDataset("data/processed/data_drug_generation.h5torch")

# Or for a specific split
train_dataset = SMILESDataset(
    filename="data/processed/data_drug_generation.h5torch", 
    split="train"
)

# Indexing returns one item; the recorded output shows a plain SMILES string.
dataset[0]

03:07:01 - INFO - Old pandas version detected. Patching DataFrame.map to DataFrame.applymap


'Cc1cccc(N2CC(c3nc4ccccc4n3C)CC2=O)c1C'