# Processing Drug-Target Interaction Data

This notebook covers:
- Converting merged DTI data into an `h5torch` dataset
- Splitting the dataset (stratified) into `train`/`valid`/`test` under two settings: a random split and a cold-start split
- Computing drug and target representations (foundation-model embeddings and classical fingerprints)
    - Drugs: `MMELON` (graph, image, text), and RDKit fingerprints
    - Targets: `NT`, `ESM`, and ESPF fingerprints

In [2]:
import os

# Run every cell from the project root so the relative `data/...` paths resolve.
# The location can be overridden with the MB_VAE_DTI_ROOT environment variable;
# the fallback is the original author's checkout, so existing behaviour is kept.
PROJECT_ROOT = os.environ.get(
    "MB_VAE_DTI_ROOT", "/home/robsyc/Desktop/thesis/MB-VAE-DTI"
)
os.chdir(PROJECT_ROOT)

In [2]:
import pandas as pd
import numpy as np
from typing import Tuple, List, Dict, Any

# Merged DTI table (path is relative to the project root set above)
MERGED_CSV = "data/processed/data.csv"
df = pd.read_csv(MERGED_CSV)
df

Unnamed: 0,Drug_ID,Drug_InChIKey,Drug_SMILES,Target_ID,Target_UniProt_ID,Target_Gene_name,Target_RefSeq_ID,Target_AA,Target_DNA,Y,Y_pKd,Y_pKi,Y_KIBA,in_DAVIS,in_BindingDB_Kd,in_BindingDB_Ki,in_Metz,in_KIBA
0,D000001,KTUFNOKKBVMGRW-UHFFFAOYSA-N,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,T000001,Q2M2I8,AAK1,NM_014911,MKKFFDSRREQGGSGLGSGSSGGGGSTSGLGSGYIGRVFGIGRQQV...,ATGAAGAAGTTTTTCGACTCCCGGCGAGAGCAGGGCGGCTCTGGCC...,False,4.999996,,,True,True,False,False,False
1,D000001,KTUFNOKKBVMGRW-UHFFFAOYSA-N,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,T000002,P00519,ABL1,NM_005157,PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...,AACCTTTTCGTTGCACTGTATGATTTTGTGGCCAGTGGAGATAACA...,True,8.920819,,,True,False,False,False,False
2,D000001,KTUFNOKKBVMGRW-UHFFFAOYSA-N,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,T000003,Q04771,ACVR1,NM_001105,MVDGVMILPVLIMIALPSPSMEDEKPKVNPKLYMCVCEGLSCGNED...,ATGGTAGATGGAGTGATGATTCTTCCTGTGCTTATCATGATTGCTC...,False,4.999996,,,True,True,False,False,False
3,D000001,KTUFNOKKBVMGRW-UHFFFAOYSA-N,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,T000004,P36896,ACVR1B,NM_004302,MAESAGASSFFPLVVLLLAGSGGSGPRGVQALLCACTSCLQANYTC...,ATGGCGGAGTCGGCCGGAGCCTCCTCCTTCTTCCCCCTTGTTGTCC...,False,4.999996,,,True,True,False,False,False
4,D000001,KTUFNOKKBVMGRW-UHFFFAOYSA-N,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,T000005,P27037,ACVR2A,NM_001278579,MGAAAKLAFAVFLISCSSGAILGRSETQECLFFNANWEKDRTNQTG...,ATGGGAGCTGCTGCAAAGTTGGCGTTTGCCGTCTTTCTTATCTCCT...,False,4.999996,,,True,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
396464,D162026,NGCNZPVYWSEEJK-UHFFFAOYSA-N,CCCc1nc[nH]c1CNc1cc(Cl)c2ncc(C#N)c(Nc3ccc(F)c(...,T001711,P41279,MAP3K8,NM_001244134,MEYMSTGSDNKEEIDLLIKHLNVSDVIDIMENLYASEEPAVYEPSL...,ATGGAGTACATGAGCACTGGAAGTGACAATAAAGAAGAGATTGATT...,True,,,13.70206,False,False,False,False,True
396465,D162026,NGCNZPVYWSEEJK-UHFFFAOYSA-N,CCCc1nc[nH]c1CNc1cc(Cl)c2ncc(C#N)c(Nc3ccc(F)c(...,T000031,Q13554,CAMK2B,NM_001220,MATTVTCTRFTDEYQLYEDIGKGAFSVVRRCVKLCTGHEYAAKIIN...,ATGGCCACCACGGTGACCTGCACCCGCTTCACCGACGAGTACCAGC...,False,,,10.49794,False,False,False,False,True
396466,D162026,NGCNZPVYWSEEJK-UHFFFAOYSA-N,CCCc1nc[nH]c1CNc1cc(Cl)c2ncc(C#N)c(Nc3ccc(F)c(...,T000032,Q13557,CAMK2D,NM_001221,MASTTTCTRFTDEYQLFEELGKGAFSVVRRCMKIPTGQEYAAKIIN...,ATGGCTTCGACCACAACCTGCACCAGGTTCACGGACGAGTATCAGC...,False,,,10.49794,False,False,False,False,True
396467,D162026,NGCNZPVYWSEEJK-UHFFFAOYSA-N,CCCc1nc[nH]c1CNc1cc(Cl)c2ncc(C#N)c(Nc3ccc(F)c(...,T000280,Q16539,MAPK14,NM_001315,MSQERPTFYRQELNKTIWEVPERYQNLSPVGSGAYGSVCAAFDTKT...,ATGTCTCAGGAGAGGCCCACGTTCTACCGGCAGGAGCTGAACAAGA...,False,,,10.49794,False,False,False,False,True


In [3]:
from mb_vae_dti.processing import add_split_cols, create_h5torch

# Append the `split_rand` and `split_cold` columns (train/valid/test labels) to the table.
# NOTE(review): no seed is passed here — confirm add_split_cols is deterministic
# so the split proportions recorded below are reproducible.
df_split = add_split_cols(df)
df_split

09:37:19 - INFO - Old pandas version detected. Patching DataFrame.map to DataFrame.applymap


Unnamed: 0,Drug_ID,Drug_InChIKey,Drug_SMILES,Target_ID,Target_UniProt_ID,Target_Gene_name,Target_RefSeq_ID,Target_AA,Target_DNA,Y,Y_pKd,Y_pKi,Y_KIBA,in_DAVIS,in_BindingDB_Kd,in_BindingDB_Ki,in_Metz,in_KIBA,split_rand,split_cold
0,D000001,KTUFNOKKBVMGRW-UHFFFAOYSA-N,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,T000001,Q2M2I8,AAK1,NM_014911,MKKFFDSRREQGGSGLGSGSSGGGGSTSGLGSGYIGRVFGIGRQQV...,ATGAAGAAGTTTTTCGACTCCCGGCGAGAGCAGGGCGGCTCTGGCC...,False,4.999996,,,True,True,False,False,False,train,train
1,D000001,KTUFNOKKBVMGRW-UHFFFAOYSA-N,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,T000002,P00519,ABL1,NM_005157,PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...,AACCTTTTCGTTGCACTGTATGATTTTGTGGCCAGTGGAGATAACA...,True,8.920819,,,True,False,False,False,False,train,train
2,D000001,KTUFNOKKBVMGRW-UHFFFAOYSA-N,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,T000003,Q04771,ACVR1,NM_001105,MVDGVMILPVLIMIALPSPSMEDEKPKVNPKLYMCVCEGLSCGNED...,ATGGTAGATGGAGTGATGATTCTTCCTGTGCTTATCATGATTGCTC...,False,4.999996,,,True,True,False,False,False,train,train
3,D000001,KTUFNOKKBVMGRW-UHFFFAOYSA-N,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,T000004,P36896,ACVR1B,NM_004302,MAESAGASSFFPLVVLLLAGSGGSGPRGVQALLCACTSCLQANYTC...,ATGGCGGAGTCGGCCGGAGCCTCCTCCTTCTTCCCCCTTGTTGTCC...,False,4.999996,,,True,True,False,False,False,test,train
4,D000001,KTUFNOKKBVMGRW-UHFFFAOYSA-N,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,T000005,P27037,ACVR2A,NM_001278579,MGAAAKLAFAVFLISCSSGAILGRSETQECLFFNANWEKDRTNQTG...,ATGGGAGCTGCTGCAAAGTTGGCGTTTGCCGTCTTTCTTATCTCCT...,False,4.999996,,,True,True,False,False,False,train,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
396464,D162026,NGCNZPVYWSEEJK-UHFFFAOYSA-N,CCCc1nc[nH]c1CNc1cc(Cl)c2ncc(C#N)c(Nc3ccc(F)c(...,T001711,P41279,MAP3K8,NM_001244134,MEYMSTGSDNKEEIDLLIKHLNVSDVIDIMENLYASEEPAVYEPSL...,ATGGAGTACATGAGCACTGGAAGTGACAATAAAGAAGAGATTGATT...,True,,,13.70206,False,False,False,False,True,train,train
396465,D162026,NGCNZPVYWSEEJK-UHFFFAOYSA-N,CCCc1nc[nH]c1CNc1cc(Cl)c2ncc(C#N)c(Nc3ccc(F)c(...,T000031,Q13554,CAMK2B,NM_001220,MATTVTCTRFTDEYQLYEDIGKGAFSVVRRCVKLCTGHEYAAKIIN...,ATGGCCACCACGGTGACCTGCACCCGCTTCACCGACGAGTACCAGC...,False,,,10.49794,False,False,False,False,True,train,train
396466,D162026,NGCNZPVYWSEEJK-UHFFFAOYSA-N,CCCc1nc[nH]c1CNc1cc(Cl)c2ncc(C#N)c(Nc3ccc(F)c(...,T000032,Q13557,CAMK2D,NM_001221,MASTTTCTRFTDEYQLFEELGKGAFSVVRRCMKIPTGQEYAAKIIN...,ATGGCTTCGACCACAACCTGCACCAGGTTCACGGACGAGTATCAGC...,False,,,10.49794,False,False,False,False,True,train,train
396467,D162026,NGCNZPVYWSEEJK-UHFFFAOYSA-N,CCCc1nc[nH]c1CNc1cc(Cl)c2ncc(C#N)c(Nc3ccc(F)c(...,T000280,Q16539,MAPK14,NM_001315,MSQERPTFYRQELNKTIWEVPERYQNLSPVGSGAYGSVCAAFDTKT...,ATGTCTCAGGAGAGGCCCACGTTCTACCGGCAGGAGCTGAACAAGA...,False,,,10.49794,False,False,False,False,True,train,train


In [4]:
# Relative size of each random-split partition (fraction of all rows)
rand_split_counts = df_split["split_rand"].value_counts()
rand_split_counts / len(df_split)

split_rand
train    0.799977
test     0.100043
valid    0.099980
Name: count, dtype: float64

In [5]:
# Serialize the split-annotated dataframe to an h5torch file; the recorded log
# reports that it is written to data/processed/data.h5torch.
create_h5torch(df_split, output_filename="data.h5torch")

Creating h5torch file from dataframe with 396469 rows...
Found 149962 unique drugs and 2047 unique targets
Creating central interaction matrix of shape (149962, 2047) with 396469 observed interactions
Processing drug features...
Processing target features...
Registering drug features...
Registering target features...
Registering split information as unstructured data...
Registering provenance flags...
Registering additional Y values as unstructured data...
Created h5torch file at data/processed/data.h5torch


In [None]:
import h5torch

# Sanity-check the freshly written h5torch file. The context manager closes the
# HDF5 handle when the cell finishes, so later cells can reopen or rewrite the file.
with h5torch.File("data/processed/data.h5torch", "r") as file:
    # Load the interaction index array once instead of re-reading it per check.
    indices = file["central/indices"][:]
    print(indices.shape)      # (2, 396469)
    print(indices[0].max())   # 149961
    print(indices[1].max())   # 2046

    # Per the original notes these equal the number of unique values in
    # df_split["Drug_SMILES"] and df_split["Target_AA"] respectively.
    print(len(file["0"]["Drug_SMILES"][:]))      # 149962
    print(len(file["1"]["Target_AA"][:]))        # 2047

    # Top-level groups, then the datasets registered under each of them.
    print(file.keys())
    print(file["0"].keys())
    print(file["1"].keys())
    print(file["unstructured"].keys())
    print(file["central"].keys())

(2, 396469)
149961
2046
149962
2047
<KeysViewHDF5 ['0', '1', 'central', 'unstructured']>
<KeysViewHDF5 ['Drug_ID', 'Drug_InChIKey', 'Drug_SMILES']>
<KeysViewHDF5 ['Target_AA', 'Target_DNA', 'Target_Gene_name', 'Target_ID', 'Target_RefSeq_ID', 'Target_UniProt_ID']>
<KeysViewHDF5 ['Y_KIBA', 'Y_pKd', 'Y_pKi', 'in_BindingDB_Kd', 'in_BindingDB_Ki', 'in_DAVIS', 'in_KIBA', 'in_Metz', 'split_cold', 'split_rand']>
<KeysViewHDF5 ['data', 'indices']>


In [7]:
# Fraction of interactions assigned to the random-split test set, derived from
# the dataframe instead of hand-copied counts (previously 39664 / 396469).
n_rand_test = int((df_split["split_rand"] == "test").sum())
n_rand_test / len(df_split)

0.10004313073657714

In [8]:
# Load the cold-split *test* interactions with 'coo' sampling.
# The subset below selects split_cold == "test", so the variable is named
# accordingly (it was previously called `train_dataset`).
cold_test_dataset = h5torch.Dataset(
    "data/processed/data.h5torch",
    sampling="coo",
    subset=("unstructured/split_cold", "test"),
    in_memory=True,
)

# Number of samples in the subset and one example item
len(cold_test_dataset), cold_test_dataset[100]

(39747,
 {'central': False,
  '0/Drug_ID': 'D000028',
  '0/Drug_InChIKey': 'XZXHXSATPCNXJR-ZIADKAODSA-N',
  '0/Drug_SMILES': 'COC(=O)c1ccc2c(c1)NC(=O)C2=C(Nc1ccc(N(C)C(=O)CN2CCN(C)CC2)cc1)c1ccccc1',
  '1/Target_AA': 'MSNICQRLWEYLEPYLPCLSTEADKSTVIENPGALCSPQSQRHGHYFVALFDYQARTAEDLSFRAGDKLQVLDTLHEGWWFARHLEKRRDGSSQQLQGYIPSNYVAEDRSLQAEPWFFGAIGRSDAEKQLLYSENKTGSFLIRESESQKGEFSLSVLDGAVVKHYRIKRLDEGGFFLTRRRIFSTLNEFVSHYTKTSDGLCVKLGKPCLKIQVPAPFDLSYKTVDQWEIDRNSIQLLKRLGSGQFGEVWEGLWNNTTPVAVKTLKPGSMDPNDFLREAQIMKNLRHPKLIQLYAVCTLEDPIYIITELMRHGSLQEYLQNDTGSKIHLTQQVDMAAQVASGMAYLESRNYIHRDLAARNVLVGEHNIYKVADFGLARVFKVDNEDIYESRHEIKLPVKWTAPEAIRSNKFSIKSDVWSFGILLYEIITYGKMPYSGMTGAQVIQMLAQNYRLPQPSNCPQQFYNIMLECWNAEPKERPTFETLRWKLEDYFETDSSYSDANNFIR',
  '1/Target_DNA': 'ATGAGCAACATCTGTCAGAGGCTCTGGGAGTACCTAGAACCCTATCTCCCCTGTTTGTCCACGGAGGCAGACAAGTCAACCGTGATTGAAAATCCAGGGGCCCTTTGCTCTCCCCAGTCACAGAGGCATGGCCACTACTTTGTGGCTTTGTTTGATTACCAGGCTCGGACTGCTGAGGACTTGAGCTTCCGAGCAGGTGACAAACTTCAAGTTCTGGACACTTTGCATGAGGGCTGGTGGTTTGCCAGACACTTGG

In [10]:
# Look up the dataframe row for drug D000028 paired with target T000101
drug_mask = df_split["Drug_ID"] == "D000028"
target_mask = df_split["Target_ID"] == "T000101"
df_split[drug_mask & target_mask]

Unnamed: 0,Drug_ID,Drug_InChIKey,Drug_SMILES,Target_ID,Target_UniProt_ID,Target_Gene_name,Target_RefSeq_ID,Target_AA,Target_DNA,Y,Y_pKd,Y_pKi,Y_KIBA,in_DAVIS,in_BindingDB_Kd,in_BindingDB_Ki,in_Metz,in_KIBA,split_rand,split_cold
7741,D000028,XZXHXSATPCNXJR-ZIADKAODSA-N,COC(=O)c1ccc2c(c1)NC(=O)C2=C(Nc1ccc(N(C)C(=O)C...,T000101,P42685,FRK,NM_002031,MSNICQRLWEYLEPYLPCLSTEADKSTVIENPGALCSPQSQRHGHY...,ATGAGCAACATCTGTCAGAGGCTCTGGGAGTACCTAGAACCCTATC...,False,5.56862,,,True,False,False,False,False,train,test


In [1]:
import os

# Run from the project root so the relative `data/...` paths resolve.
# Override the location with the MB_VAE_DTI_ROOT environment variable;
# the fallback is the original author's checkout, so existing behaviour is kept.
PROJECT_ROOT = os.environ.get(
    "MB_VAE_DTI_ROOT", "/home/robsyc/Desktop/thesis/MB-VAE-DTI"
)
os.chdir(PROJECT_ROOT)

In [2]:
from mb_vae_dti.processing.factory import load_dti_dataset

# Request the cold-split test samples, filtered on in_DAVIS == True, with the
# Y_pKd and Y_KIBA label columns.
# NOTE(review): the recorded run raises ValueError ("No samples match the
# specified filter criteria!"). The split_cold filter matches 0/396469 rows while
# the stored split values are listed as bytes (b'test'); presumably the str
# "test" is compared against bytes inside load_dti_dataset — verify and fix
# there before relying on this cell.
test_dataset = load_dti_dataset(
    split_column="split_cold",
    split="test",
    dataset_filter={"in_DAVIS": True},
    y_columns=["Y_pKd", "Y_KIBA"],
    in_memory=True
)

# First item of the filtered dataset
sample = test_dataset[0]
sample

10:33:44 - INFO - Old pandas version detected. Patching DataFrame.map to DataFrame.applymap


Error loading dataset: No samples match the specified filter criteria!
Filter results (samples passing each filter):
  - unstructured/split_cold: 0/396469 (0.0%)
  - unstructured/in_DAVIS: 19244/396469 (4.9%)
Combined filter: 0/396469 (0.0%)

Getting filter statistics to help diagnose the issue:
Total samples in dataset: 396469

Split column distributions:
  split_cold:
    b'test': 39747 samples (10.0%)
    b'train': 318211 samples (80.3%)
    b'valid': 38511 samples (9.7%)
  split_rand:
    b'test': 39664 samples (10.0%)
    b'train': 317166 samples (80.0%)
    b'valid': 39639 samples (10.0%)

Dataset column distributions:
  in_BindingDB_Kd:
    False: 365742 samples (92.2%)
    True: 30727 samples (7.8%)
  in_BindingDB_Ki:
    False: 138348 samples (34.9%)
    True: 258121 samples (65.1%)
  in_DAVIS:
    False: 377225 samples (95.1%)
    True: 19244 samples (4.9%)
  in_KIBA:
    False: 306761 samples (77.4%)
    True: 89708 samples (22.6%)
  in_Metz:
    False: 370440 samples (93.4%

ValueError: No samples match the specified filter criteria!
Filter results (samples passing each filter):
  - unstructured/split_cold: 0/396469 (0.0%)
  - unstructured/in_DAVIS: 19244/396469 (4.9%)
Combined filter: 0/396469 (0.0%)