# Processing Drug-Target Interaction Data

This notebook covers:
- Converting merged DTI data into an `h5torch` dataset
- Splitting the dataset (stratified) into train/val/test in two settings: random split, and cold-start split
- Computing embeddings from foundation models and storing them in the `h5torch` file
    - Drugs: `MMELON` (graph, image, text), and `RDKit` fingerprints
    - Targets: `NT`, `ESM`, and `ESPF` fingerprints
- Visualizing the foundation model embeddings

In [1]:
from resolve import *

Setting working directory to: /home/robsyc/Desktop/thesis/MB-VAE-DTI


In [2]:
import pandas as pd

# Load the merged drug-target interaction table; the path is relative to the
# working directory (the project root, per the "Setting working directory" output).
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which suggests the
# CSV was written together with its index — consider `index_col=0` if nothing
# downstream relies on that column.
df = pd.read_csv(filepath_or_buffer="data/processed/dti.csv")
# df = df.sample(100)  # uncomment to work on a small random subset
df

Unnamed: 0,Drug_ID,Drug_InChIKey,Drug_SMILES,Target_ID,Target_UniProt_ID,Target_Gene_name,Target_RefSeq_ID,Target_AA,Target_DNA,Y,Y_pKd,Y_pKi,Y_KIBA,in_DAVIS,in_BindingDB_Kd,in_BindingDB_Ki,in_Metz,in_KIBA
0,D000001,HYTVYLVVJDEURY-AUCFXJAVSA-N,C#CC(=O)C1(C)CCC2c3ccc(O)cc3CCC2C1,T000001,P14061,HSD17B1,NM_000413,MARTVVLITGCSSGIGLHLAVRLASDPSQSFKVYATLRDLKTQGRL...,ATGGCCCGCACCGTGGTGCTCATCACCGGCTGTTCCTCGGGCATCG...,False,,5.552826,,False,False,True,False,False
1,D000002,CFCGTXOJJUJIIE-UHFFFAOYSA-N,C#CC(C#C)=C1CCC(N(CCC)CCC)CC1,T000002,P35462,DRD3,NM_000796,MASLSQLSSHLNYTCGAENSTGASQARPHAYYALSYCALILAIVFG...,ATGGCATCTCTGAGCCAGCTGAGTGGCCACCTGAACTACACCTGTG...,False,,5.356537,,False,False,True,False,False
2,D000002,CFCGTXOJJUJIIE-UHFFFAOYSA-N,C#CC(C#C)=C1CCC(N(CCC)CCC)CC1,T000003,P14416,DRD2,NM_000795,MDPLNLSWYDDDLERQNWSRPFNGSDGKADRPHYNYYATLLTLLIA...,ATGGATCCACTGAATCTGTCCTGGTATGATGATGATCTGGAGAGGC...,False,,4.809891,,False,False,True,False,False
3,D000002,CFCGTXOJJUJIIE-UHFFFAOYSA-N,C#CC(C#C)=C1CCC(N(CCC)CCC)CC1,T000004,Q95136,DRD1,NM_174042,MRTLNTSTMEGTGLVAERDFSFRILTACFLSLLILSTLLGNTLVCA...,ATGAGGACTCTCAACACGTCTACCATGGAAGGCACCGGGCTGGTGG...,False,,4.795877,,False,False,True,False,False
4,D000003,PPWNCLVNXGCGAF-UHFFFAOYSA-N,C#CC(C)(C)C,T000005,P05182,Cyp2e1,NM_031543,MAVLGITIALLVWVATLLVISIWKQIYNSWNLPPGPFPLPILGNIF...,ATGGCGGTTCTTGGCATCACCATTGCCTTGCTGGTGTGGGTGGCCA...,False,,3.000000,,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339192,D136571,ZWXHJFIYTQUPOJ-UHFFFAOYSA-N,c1ncc(Cc2cc3c(s2)CCCC3)[nH]1,T000568,P25100,ADRA1D,NM_000678,MTFRDLLSVSFEGPRPDSSAGGSSAGGGGGSAGGAAPSEGPAVGGV...,ATGACTTTCCGCGATCTCCTGAGCGTCAGTTTCGAGGGACCCCGCC...,True,,8.420216,,False,False,True,False,False
339193,D136572,MPRYSKNMRJZZIQ-UHFFFAOYSA-N,c1ncc(Cc2ccsc2)[nH]1,T000568,P25100,ADRA1D,NM_000678,MTFRDLLSVSFEGPRPDSSAGGSSAGGGGGSAGGAAPSEGPAVGGV...,ATGACTTTCCGCGATCTCCTGAGCGTCAGTTTCGAGGGACCCCGCC...,True,,8.494850,,False,False,True,False,False
339194,D136573,XYXCOZZITXKLLF-BETUJISGSA-N,c1ncc(N2CC3CNCC3C2)cc1N1CCOCC1,T000235,P09483,Chrna4,NM_024354,MANSGTGAPPPLLLLPLLLLLGTGLLPASSHIETRAHAEERLLKRL...,GGCCCCGGGGCGCCGCCGCCGCTGCTGCTACTGCCGCTGCTGCTGC...,False,,7.326979,,False,False,True,False,False
339195,D136573,XYXCOZZITXKLLF-BETUJISGSA-N,c1ncc(N2CC3CNCC3C2)cc1N1CCOCC1,T000514,Q05941,Chrna7,NM_012832,MCGGRGGIWLALAAALLHVSLQGEFQRRLYKELVKNYNPLERPVAN...,ATGTGCGGCGGGCGGGGAGGCATCTGGCTGGCTCTGGCCGCGGCGC...,False,,4.999996,,False,False,True,False,False


In [None]:
from mb_vae_dti.processing.embedding import save_dti_to_h5, save_pretrain_to_h5

# save_dti_to_h5(
#     df,
#     "Drug_SMILES",
#     "dti_smiles.hdf5"
# )
# save_dti_to_h5(
#     df,
#     "Target_AA",
#     "dti_aa.hdf5"
# )
# save_dti_to_h5(
#     df,
#     "Target_DNA",
#     "dti_dna.hdf5"
# )
# Do the same for the pretraining (pretrain_*) data

# df_drugs = pd.read_csv("data/processed/pretrain_drugs.csv")

# save_pretrain_to_h5(
#     df_drugs,
#     "smiles",
#     "pretrain_smiles.hdf5"
# )

# df_targets = pd.read_csv("data/processed/pretrain_targets.csv")

# save_pretrain_to_h5(
#     df_targets,
#     "aa",
#     "pretrain_aa.hdf5"
# )

# save_pretrain_to_h5(
#     df_targets,
#     "dna",
#     "pretrain_dna.hdf5"
# )

The drug and protein embedding generation was offloaded to an HPC environment. We used:
- A DigitalOcean droplet with a 48 GB NVIDIA L40S GPU
- Python 3.11 and basic virtual environments

Due to dependency conflicts between the foundation models, we had to create a separate environment for each model (basic `requirements.txt` files can be found in the corresponding folders in the `external` directory). See the `scripts/embedding.sh` file for more details.



---
---

In [1]:
from resolve import *

Setting working directory to: /home/robsyc/Desktop/thesis/MB-VAE-DTI


In [2]:
from mb_vae_dti.processing.h5factory import create_pretrain_h5torch

import logging
from pathlib import Path

# Request DEBUG-level logging.
# NOTE(review): `logging.basicConfig` is a no-op when the root logger already has
# handlers; the saved output only shows INFO records, so confirm whether logging
# was configured earlier (e.g. by an import).
logging.basicConfig(level=logging.DEBUG)

# Directory holding the intermediate per-representation HDF5 files, and the
# directory that receives the final h5torch files.
temp_dir = Path("/home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp")
output_dir = Path("/home/robsyc/Desktop/thesis/MB-VAE-DTI/data/input")

# drug_input_file = [temp_dir / "pretrain_smiles.hdf5"]
# drug_output_file = output_dir / "drugs_pretrain.h5torch"

# Target pretraining inputs (amino-acid and DNA files) and the merged output file.
target_input_files = [temp_dir / "pretrain_aa.hdf5", temp_dir / "pretrain_dna.hdf5"]
target_output_file = output_dir / "targets_pretrain.h5torch"

# Every input is handed to `create_pretrain_h5torch`, so every input must exist —
# not just the first one.
missing_target_inputs = [path for path in target_input_files if not path.exists()]

if not missing_target_inputs:
    print("\n--- Creating H5torch File ---")
    create_pretrain_h5torch(
        input_h5_paths=target_input_files,
        output_h5_path=target_output_file,
    )
else:
    # This cell builds the *target* file; name it correctly and list every missing input.
    missing_list = ", ".join(str(path) for path in missing_target_inputs)
    print(f"Skipping target file creation: {missing_list} not found.")


2025-05-29 17:48:06,859 - INFO - Creating pretrain h5torch file '/home/robsyc/Desktop/thesis/MB-VAE-DTI/data/input/targets_pretrain.h5torch' from 2 input file(s)...
2025-05-29 17:48:06,896 - INFO - Successfully loaded and merged metadata for 190851 entities ('target') from 2 file(s).
2025-05-29 17:48:06,897 - INFO - Found representations: ['aa', 'dna']
2025-05-29 17:48:06,897 - INFO - Found features: ['EMB-ESM', 'FP-ESP', 'EMB-NT']
2025-05-29 17:48:07,087 - INFO - Writing pretrain data for 190851 entities ('target')...



--- Creating H5torch File ---


2025-05-29 17:48:07,089 - INFO - Registered central 'index' (Shape: (190851,))
2025-05-29 17:48:07,089 - INFO - Registering aligned axis 0 data (Features)...
2025-05-29 17:48:07,090 - INFO - Registering external feature for axis 0: EMB-ESM (Shape: (190851, 1152), Dtype: float32)
2025-05-29 17:48:07,094 - INFO - Registered aligned[0] 'EMB-ESM' (Target Shape: (190851, 1152), Saved: float32, Loaded: float32)
2025-05-29 17:48:07,095 - INFO - Appending remaining 189827 items for axis 0 feature 'EMB-ESM'...
Appending EMB-ESM (Axis 0): 100%|██████████| 186/186 [00:01<00:00, 136.95batch/s]
2025-05-29 17:48:08,455 - INFO - Registering external feature for axis 0: FP-ESP (Shape: (190851, 4170), Dtype: float32)
2025-05-29 17:48:08,466 - INFO - Registered aligned[0] 'FP-ESP' (Target Shape: (190851, 4170), Saved: uint8, Loaded: float32)
2025-05-29 17:48:08,466 - INFO - Appending remaining 189827 items for axis 0 feature 'FP-ESP'...
Appending FP-ESP (Axis 0): 100%|██████████| 186/186 [00:01<00:00, 1

In [7]:
from mb_vae_dti.processing.h5factory import inspect_h5torch_file

# Summarise the structure of the targets h5torch file in the input directory.
# NOTE(review): this rebinds `target_output_file`, which the creation cell set to
# "targets_pretrain.h5torch" — confirm that "targets.h5torch" is the intended file.
target_output_file = output_dir.joinpath("targets.h5torch")
inspect_h5torch_file(target_output_file)

2025-05-30 13:57:57,031 - INFO - --- Inspecting H5torch File: targets.h5torch ---
2025-05-30 13:57:57,034 - INFO - --- Finished Inspecting: targets.h5torch ---



[Root Attributes]
  - entity_type: target
  - n_items: 190851

[Central Dataset]
  Mode: N/A (Implicitly N-D or similar)
    - Name: central
      - Path: /central
      - Shape/Length: (190851,)
      - Saved Dtype: uint32

[Aligned Axes]

  --- Axis 0 ---
    - Name: EMB-ESM
      - Path: /0/EMB-ESM
      - Shape/Length: (190851, 1152)
      - Saved Dtype: float32
    - Name: EMB-NT
      - Path: /0/EMB-NT
      - Shape/Length: (190851, 1024)
      - Saved Dtype: float32
    - Name: FP-ESP
      - Path: /0/FP-ESP
      - Shape/Length: (190851, 4170)
      - Saved Dtype: uint8
    - Name: aa
      - Path: /0/aa
      - Shape/Length: Length: 190851
      - Saved Dtype: |S1280
    - Name: dna
      - Path: /0/dna
      - Shape/Length: Length: 190851
      - Saved Dtype: |S3843

[Unstructured Datasets]
    - Name: is_train
      - Path: /unstructured/is_train
      - Shape/Length: (190851,)
      - Saved Dtype: bool


In [4]:
from mb_vae_dti.processing.h5factory import inspect_h5torch_file

# Summarise the structure of the drug pretraining h5torch file in the input directory.
drug_output_file = output_dir.joinpath("drugs_pretrain.h5torch")
inspect_h5torch_file(drug_output_file)

2025-05-30 13:57:16,447 - INFO - --- Inspecting H5torch File: drugs_pretrain.h5torch ---
2025-05-30 13:57:16,459 - INFO - --- Finished Inspecting: drugs_pretrain.h5torch ---



[Root Attributes]
  - entity_type: drug
  - n_items: 3460396

[Central Dataset]
  Mode: N/A (Implicitly N-D or similar)
    - Name: central
      - Path: /central
      - Shape/Length: (3460396,)
      - Saved Dtype: uint32

[Aligned Axes]

  --- Axis 0 ---
    - Name: EMB-BiomedGraph
      - Path: /0/EMB-BiomedGraph
      - Shape/Length: (3460396, 512)
      - Saved Dtype: float32
    - Name: EMB-BiomedImg
      - Path: /0/EMB-BiomedImg
      - Shape/Length: (3460396, 512)
      - Saved Dtype: float32
    - Name: EMB-BiomedText
      - Path: /0/EMB-BiomedText
      - Shape/Length: (3460396, 768)
      - Saved Dtype: float32
    - Name: FP-Morgan
      - Path: /0/FP-Morgan
      - Shape/Length: (3460396, 2048)
      - Saved Dtype: uint8

[Unstructured Datasets]
    - Name: smiles
      - Path: /unstructured/smiles
      - Shape/Length: Length: 3460396
      - Saved Dtype: object
    - Name: split
      - Path: /unstructured/split
      - Shape/Length: Length: 3460396
      - Saved Dty

In [4]:
from mb_vae_dti.processing.h5factory import create_dti_h5torch, inspect_h5torch_file
import pandas as pd

# Location of the processed (merged) DTI table.
data_dir = Path("/home/robsyc/Desktop/thesis/MB-VAE-DTI/data/processed")

# Intermediate HDF5 files for the drug side (SMILES) and target side (AA + DNA),
# and the destination of the combined DTI h5torch file.
# `temp_dir` and `output_dir` come from the pretrain-file creation cell above.
dti_drug_input_files = [temp_dir.joinpath("dti_smiles.hdf5")]
dti_target_input_files = [
    temp_dir.joinpath(file_name) for file_name in ("dti_aa.hdf5", "dti_dna.hdf5")
]
dti_output_file = output_dir.joinpath("dti.h5torch")

# Interaction table that drives the h5torch creation.
dti_df = pd.read_csv(data_dir.joinpath("dti.csv"))

# Build the DTI h5torch file from the interaction table and the per-entity inputs.
create_dti_h5torch(dti_df, dti_drug_input_files, dti_target_input_files, dti_output_file)

# Summarise the structure of the file that was just written.
inspect_h5torch_file(dti_output_file)

2025-05-02 10:27:56,704 - INFO - Starting creation of DTI h5torch file: /home/robsyc/Desktop/thesis/MB-VAE-DTI/data/input/dti.h5torch
2025-05-02 10:27:56,705 - INFO - Processing DTI DataFrame...
2025-05-02 10:27:56,705 - INFO - Processing DTI DataFrame with 339197 interactions...
2025-05-02 10:27:56,717 - INFO - Adding split columns (rand=True, cold=True) using fractions (0.8, 0.1, 0.1)...
2025-05-02 10:28:30,459 - INFO - Generated split columns: ['split_rand', 'split_cold']
2025-05-02 10:28:30,474 - INFO - Found 126811 unique drugs and 1976 unique targets in the DataFrame.
2025-05-02 10:28:30,487 - INFO - Extracting unique drug features from DataFrame...
2025-05-02 10:28:30,555 - INFO - Extracting unique target features from DataFrame...
2025-05-02 10:28:30,572 - INFO - Mapping interaction indices and sorting DataFrame...
2025-05-02 10:28:30,648 - INFO - DataFrame sorted, 339197 interactions remain.
2025-05-02 10:28:30,648 - INFO - Preparing COO interaction data...
2025-05-02 10:28:30


[Root Attributes]
  - created_at: 2025-05-02T10:28:30.834915
  - n_drugs: 126811
  - n_interactions: 339197
  - n_targets: 1976
  - sparsity: 0.0013536554463707139

[Central Dataset]
  Mode: coo
  Shape (Attr): [126811   1976]
    - Name: indices
      - Path: /central/indices
      - Shape/Length: (2, 339197)
      - Saved Dtype: int64
    - Dataset 'values' not found or not a dataset.

[Aligned Axes]

  --- Axis 0 ---
    - Name: Drug_ID
      - Path: /0/Drug_ID
      - Shape/Length: Length: 126811
      - Saved Dtype: |S7
    - Name: Drug_InChIKey
      - Path: /0/Drug_InChIKey
      - Shape/Length: Length: 126811
      - Saved Dtype: object
    - Name: EMB-BiomedGraph
      - Path: /0/EMB-BiomedGraph
      - Shape/Length: (126811, 512)
      - Saved Dtype: float32
    - Name: EMB-BiomedImg
      - Path: /0/EMB-BiomedImg
      - Shape/Length: (126811, 512)
      - Saved Dtype: float32
    - Name: EMB-BiomedText
      - Path: /0/EMB-BiomedText
      - Shape/Length: (126811, 768)
   

---

In [1]:
from resolve import *
from mb_vae_dti.loading import *
import pandas as pd

# Reload the merged drug-target interaction table (path relative to the working
# directory) and display it.
df = pd.read_csv(filepath_or_buffer="data/processed/dti.csv")
df

Setting working directory to: /home/robsyc/Desktop/thesis/MB-VAE-DTI


Unnamed: 0,Drug_ID,Drug_InChIKey,Drug_SMILES,Target_ID,Target_UniProt_ID,Target_Gene_name,Target_RefSeq_ID,Target_AA,Target_DNA,Y,Y_pKd,Y_pKi,Y_KIBA,in_DAVIS,in_BindingDB_Kd,in_BindingDB_Ki,in_Metz,in_KIBA
0,D000001,HYTVYLVVJDEURY-AUCFXJAVSA-N,C#CC(=O)C1(C)CCC2c3ccc(O)cc3CCC2C1,T000001,P14061,HSD17B1,NM_000413,MARTVVLITGCSSGIGLHLAVRLASDPSQSFKVYATLRDLKTQGRL...,ATGGCCCGCACCGTGGTGCTCATCACCGGCTGTTCCTCGGGCATCG...,False,,5.552826,,False,False,True,False,False
1,D000002,CFCGTXOJJUJIIE-UHFFFAOYSA-N,C#CC(C#C)=C1CCC(N(CCC)CCC)CC1,T000002,P35462,DRD3,NM_000796,MASLSQLSSHLNYTCGAENSTGASQARPHAYYALSYCALILAIVFG...,ATGGCATCTCTGAGCCAGCTGAGTGGCCACCTGAACTACACCTGTG...,False,,5.356537,,False,False,True,False,False
2,D000002,CFCGTXOJJUJIIE-UHFFFAOYSA-N,C#CC(C#C)=C1CCC(N(CCC)CCC)CC1,T000003,P14416,DRD2,NM_000795,MDPLNLSWYDDDLERQNWSRPFNGSDGKADRPHYNYYATLLTLLIA...,ATGGATCCACTGAATCTGTCCTGGTATGATGATGATCTGGAGAGGC...,False,,4.809891,,False,False,True,False,False
3,D000002,CFCGTXOJJUJIIE-UHFFFAOYSA-N,C#CC(C#C)=C1CCC(N(CCC)CCC)CC1,T000004,Q95136,DRD1,NM_174042,MRTLNTSTMEGTGLVAERDFSFRILTACFLSLLILSTLLGNTLVCA...,ATGAGGACTCTCAACACGTCTACCATGGAAGGCACCGGGCTGGTGG...,False,,4.795877,,False,False,True,False,False
4,D000003,PPWNCLVNXGCGAF-UHFFFAOYSA-N,C#CC(C)(C)C,T000005,P05182,Cyp2e1,NM_031543,MAVLGITIALLVWVATLLVISIWKQIYNSWNLPPGPFPLPILGNIF...,ATGGCGGTTCTTGGCATCACCATTGCCTTGCTGGTGTGGGTGGCCA...,False,,3.000000,,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339192,D136571,ZWXHJFIYTQUPOJ-UHFFFAOYSA-N,c1ncc(Cc2cc3c(s2)CCCC3)[nH]1,T000568,P25100,ADRA1D,NM_000678,MTFRDLLSVSFEGPRPDSSAGGSSAGGGGGSAGGAAPSEGPAVGGV...,ATGACTTTCCGCGATCTCCTGAGCGTCAGTTTCGAGGGACCCCGCC...,True,,8.420216,,False,False,True,False,False
339193,D136572,MPRYSKNMRJZZIQ-UHFFFAOYSA-N,c1ncc(Cc2ccsc2)[nH]1,T000568,P25100,ADRA1D,NM_000678,MTFRDLLSVSFEGPRPDSSAGGSSAGGGGGSAGGAAPSEGPAVGGV...,ATGACTTTCCGCGATCTCCTGAGCGTCAGTTTCGAGGGACCCCGCC...,True,,8.494850,,False,False,True,False,False
339194,D136573,XYXCOZZITXKLLF-BETUJISGSA-N,c1ncc(N2CC3CNCC3C2)cc1N1CCOCC1,T000235,P09483,Chrna4,NM_024354,MANSGTGAPPPLLLLPLLLLLGTGLLPASSHIETRAHAEERLLKRL...,GGCCCCGGGGCGCCGCCGCCGCTGCTGCTACTGCCGCTGCTGCTGC...,False,,7.326979,,False,False,True,False,False
339195,D136573,XYXCOZZITXKLLF-BETUJISGSA-N,c1ncc(N2CC3CNCC3C2)cc1N1CCOCC1,T000514,Q05941,Chrna7,NM_012832,MCGGRGGIWLALAAALLHVSLQGEFQRRLYKELVKNYNPLERPVAN...,ATGTGCGGCGGGCGGGGAGGCATCTGGCTGGCTCTGGCCGCGGCGC...,False,,4.999996,,False,False,True,False,False


In [5]:
from mb_vae_dti.processing.h5datasets import PretrainDataset

# Open the target pretraining file, restricted to the rows whose `split` entry is 'train'.
targets_pretrain_path = Path(
    "/home/robsyc/Desktop/thesis/MB-VAE-DTI/data/input/targets_pretrain.h5torch"
)
train_subset_filters = {'split_col': 'split', 'split_value': 'train'}
pretrain_targets_train = PretrainDataset(
    targets_pretrain_path,
    subset_filters=train_subset_filters,
)

# Fetch the first training item as a smoke test.
# NOTE(review): the saved output of this cell is
# "AttributeError: 'bytes' object has no attribute 'astype'"; the cause is in
# library code not visible here — possibly the string-valued 'aa'/'dna' entries
# that the log lists among the axis-0 feature paths. Confirm in PretrainDataset.
pretrain_targets_train[0]

2025-05-29 17:42:08,573 - INFO - Calculated subset mask for targets_pretrain.h5torch. Kept 171765 / 190851 items.
2025-05-29 17:42:08,575 - INFO - Initialized PretrainDataset from targets_pretrain.h5torch. Size: 171765 items.
2025-05-29 17:42:08,575 - INFO -   Feature paths (Axis 0): ['EMB-ESM', 'EMB-NT', 'FP-ESP', 'aa', 'dna']
2025-05-29 17:42:08,575 - INFO -   Representation paths (Unstructured): ['aa', 'dna']


AttributeError: 'bytes' object has no attribute 'astype'