# Processing Drug-Target Interaction Data

This notebook covers:
- Converting merged DTI data into an `h5torch` dataset
- Splitting the dataset (stratified) into train/val/test in two settings: random split, and cold-start split
- Computing embeddings from foundation models and storing them in the `h5torch` file
    - Drugs: `MMELON` (graph, image, text), and RDKit fingerprints
    - Targets: `NT`, `ESM`, and ESPF fingerprints
- Visualizing the foundation model embeddings

In [3]:
import os

# Set the working directory to the root of the project so that the relative
# "data/processed/..." paths used by this notebook resolve.
# The location can be overridden with the MB_VAE_DTI_ROOT environment variable;
# when it is unset, the original development checkout path is used.
os.chdir(os.environ.get("MB_VAE_DTI_ROOT", "/home/robsyc/Desktop/thesis/MB-VAE-DTI"))

In [4]:
import pandas as pd
import numpy as np
from typing import Tuple, List, Dict, Any

# Read the merged drug-target interaction table (path is relative to the
# project root) and display it.
merged_csv_path = "data/processed/data.csv"
df = pd.read_csv(merged_csv_path)
df

Unnamed: 0,Drug_ID,Drug_InChIKey,Drug_SMILES,Target_ID,Target_UniProt_ID,Target_Gene_name,Target_RefSeq_ID,Target_AA,Target_DNA,Y,Y_pKd,Y_pKi,Y_KIBA,in_DAVIS,in_BindingDB_Kd,in_BindingDB_Ki,in_Metz,in_KIBA
0,D000001,KTUFNOKKBVMGRW-UHFFFAOYSA-N,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,T000001,Q2M2I8,AAK1,NM_014911,MKKFFDSRREQGGSGLGSGSSGGGGSTSGLGSGYIGRVFGIGRQQV...,ATGAAGAAGTTTTTCGACTCCCGGCGAGAGCAGGGCGGCTCTGGCC...,False,4.999996,,,True,True,False,False,False
1,D000001,KTUFNOKKBVMGRW-UHFFFAOYSA-N,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,T000002,P00519,ABL1,NM_005157,PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGK...,AACCTTTTCGTTGCACTGTATGATTTTGTGGCCAGTGGAGATAACA...,True,8.920819,,,True,False,False,False,False
2,D000001,KTUFNOKKBVMGRW-UHFFFAOYSA-N,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,T000003,Q04771,ACVR1,NM_001105,MVDGVMILPVLIMIALPSPSMEDEKPKVNPKLYMCVCEGLSCGNED...,ATGGTAGATGGAGTGATGATTCTTCCTGTGCTTATCATGATTGCTC...,False,4.999996,,,True,True,False,False,False
3,D000001,KTUFNOKKBVMGRW-UHFFFAOYSA-N,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,T000004,P36896,ACVR1B,NM_004302,MAESAGASSFFPLVVLLLAGSGGSGPRGVQALLCACTSCLQANYTC...,ATGGCGGAGTCGGCCGGAGCCTCCTCCTTCTTCCCCCTTGTTGTCC...,False,4.999996,,,True,True,False,False,False
4,D000001,KTUFNOKKBVMGRW-UHFFFAOYSA-N,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,T000005,P27037,ACVR2A,NM_001278579,MGAAAKLAFAVFLISCSSGAILGRSETQECLFFNANWEKDRTNQTG...,ATGGGAGCTGCTGCAAAGTTGGCGTTTGCCGTCTTTCTTATCTCCT...,False,4.999996,,,True,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
396464,D162026,NGCNZPVYWSEEJK-UHFFFAOYSA-N,CCCc1nc[nH]c1CNc1cc(Cl)c2ncc(C#N)c(Nc3ccc(F)c(...,T001711,P41279,MAP3K8,NM_001244134,MEYMSTGSDNKEEIDLLIKHLNVSDVIDIMENLYASEEPAVYEPSL...,ATGGAGTACATGAGCACTGGAAGTGACAATAAAGAAGAGATTGATT...,True,,,13.70206,False,False,False,False,True
396465,D162026,NGCNZPVYWSEEJK-UHFFFAOYSA-N,CCCc1nc[nH]c1CNc1cc(Cl)c2ncc(C#N)c(Nc3ccc(F)c(...,T000031,Q13554,CAMK2B,NM_001220,MATTVTCTRFTDEYQLYEDIGKGAFSVVRRCVKLCTGHEYAAKIIN...,ATGGCCACCACGGTGACCTGCACCCGCTTCACCGACGAGTACCAGC...,False,,,10.49794,False,False,False,False,True
396466,D162026,NGCNZPVYWSEEJK-UHFFFAOYSA-N,CCCc1nc[nH]c1CNc1cc(Cl)c2ncc(C#N)c(Nc3ccc(F)c(...,T000032,Q13557,CAMK2D,NM_001221,MASTTTCTRFTDEYQLFEELGKGAFSVVRRCMKIPTGQEYAAKIIN...,ATGGCTTCGACCACAACCTGCACCAGGTTCACGGACGAGTATCAGC...,False,,,10.49794,False,False,False,False,True
396467,D162026,NGCNZPVYWSEEJK-UHFFFAOYSA-N,CCCc1nc[nH]c1CNc1cc(Cl)c2ncc(C#N)c(Nc3ccc(F)c(...,T000280,Q16539,MAPK14,NM_001315,MSQERPTFYRQELNKTIWEVPERYQNLSPVGSGAYGSVCAAFDTKT...,ATGTCTCAGGAGAGGCCCACGTTCTACCGGCAGGAGCTGAACAAGA...,False,,,10.49794,False,False,False,False,True


## Add split columns and create h5torch file

In [3]:
from mb_vae_dti.processing import (
    add_split_cols,
    create_h5torch,
    load_h5torch_DTI,
)

# Derive the split-annotated table from the merged dataframe (`split_rand` and
# `split_cold` columns appear in `df_split`), then write it to an h5torch file.
# NOTE(review): whether `add_split_cols` is seeded/deterministic cannot be told
# from this notebook -- confirm before relying on reproducible splits.
df_split = add_split_cols(df)
create_h5torch(
    df_split,
    output_filename="data.h5torch",
)

03:02:15 - INFO - Old pandas version detected. Patching DataFrame.map to DataFrame.applymap


Creating h5torch file from dataframe with 396469 rows...
Found 149962 unique drugs and 2047 unique targets
Creating central interaction matrix of shape (149962, 2047) with 396469 observed interactions
Processing drug features...
Processing target features...
Created h5torch file at data/processed/data.h5torch


In [4]:
# Load the "test" partition of the cold-start split, passing three source
# flags as `datasets` (presumably selecting interactions present in any of
# them -- verify against `load_h5torch_DTI`), then inspect one sample.
# NOTE(review): the variable name mentions only DAVIS and Metz although
# "in_BindingDB_Kd" is requested as well.
source_flags = ["in_DAVIS", "in_Metz", "in_BindingDB_Kd"]
test_davis_metz = load_h5torch_DTI(setting="split_cold", split="test", datasets=source_flags)

sample_idx = 153
test_davis_metz[sample_idx]

Using boolean mask for mapping (7145 indices)
Verified alignment: all unstructured data has 396469 elements


{'central': True,
 '0/Drug_ID': 'D000028',
 '0/Drug_InChIKey': 'XZXHXSATPCNXJR-ZIADKAODSA-N',
 '0/Drug_SMILES': 'COC(=O)c1ccc2c(c1)NC(=O)C2=C(Nc1ccc(N(C)C(=O)CN2CCN(C)CC2)cc1)c1ccccc1',
 '1/Target_AA': 'MEQPPAPKSKLKKLSEDSLTKQPEEVFDVLEKLGEGSYGSVFKAIHKESGQVVAIKQVPVESDLQEIIKEISIMQQCDSPYVVKYYGSYFKNTDLWIVMEYCGAGSVSDIIRLRNKTLIEDEIATILKSTLKGLEYLHFMRKIHRDIKAGNILLNTEGHAKLADFGVAGQLTDTMAKRNTVIGTPFWMAPEVIQEIGYNCVADIWSLGITSIEMAEGKPPYADIHPMRAIFMIPTNPPPTFRKPELWSDDFTDFVKKCLVKNPEQRATATQLLQHPFIKNAKPVSILRDLITEAMEIKAKRHEEQQRELEEEEENSDEDELDSHTMVKTSVESVGTMRATSTMSEGAQTMIEHNSTMLESDLGTMVINSEDEEEEDGTMKRNATSPQVQRPSFMDYFDKQDFKNKSHENCNQNMHEPFPMSKNVFPDNWKVPQDGDFDFLKNLSLEELQMRLKALDPMMEREIEELRQRYTAKRQPILDAMDAKKRRQQNF',
 '1/Target_DNA': 'AGTAAACTAAAAAAGCTGAGTGAAGACAGTTTGACTAAGCAGCCTGAAGAAGTTTTTGATGTATTAGAGAAGCTTGGAGAAGGGTCTTATGGAAGTGTATTTAAAGCAATACACAAGGAATCCGGTCAAGTTGTCGCAATTAAACAAGTACCTGTTGAATCAGATCTTCAGGAAATAATCAAAGAAATTTCCATAATGCAGCAATGTGACAGCCCATATGTTGTAAAGTACTATGGCAGTTATTTTAAGAATACAGACCTCTGGATTGTTATGGAGTACTGTGGC

In [6]:
# Look up the row(s) of the split-annotated table for one drug/target pair.
drug_id = 'D000028'
target_id = 'T000154'

# Build one boolean mask per identifier and keep rows matching both.
is_drug = df_split["Drug_ID"] == drug_id
is_target = df_split["Target_ID"] == target_id
df_split.loc[is_drug & is_target]

Unnamed: 0,Drug_ID,Drug_InChIKey,Drug_SMILES,Target_ID,Target_UniProt_ID,Target_Gene_name,Target_RefSeq_ID,Target_AA,Target_DNA,Y,Y_pKd,Y_pKi,Y_KIBA,in_DAVIS,in_BindingDB_Kd,in_BindingDB_Ki,in_Metz,in_KIBA,split_rand,split_cold
7794,D000028,XZXHXSATPCNXJR-ZIADKAODSA-N,COC(=O)c1ccc2c(c1)NC(=O)C2=C(Nc1ccc(N(C)C(=O)C...,T000154,Q13188,STK3,NM_001256312,MEQPPAPKSKLKKLSEDSLTKQPEEVFDVLEKLGEGSYGSVFKAIH...,AGTAAACTAAAAAAGCTGAGTGAAGACAGTTTGACTAAGCAGCCTG...,True,7.419075,,,True,False,False,False,False,train,test


## Adding embeddings to the h5torch file

## Drug Generation

In [7]:
import os
# Switch to the project root so the relative data paths in this section
# resolve. Overridable through the MB_VAE_DTI_ROOT environment variable; the
# original development checkout path is the fallback when it is unset.
os.chdir(os.environ.get("MB_VAE_DTI_ROOT", "/home/robsyc/Desktop/thesis/MB-VAE-DTI"))

from mb_vae_dti.processing import add_split_cols_drug_generation, create_h5torch_smiles, SMILESDataset
import pandas as pd

# Read the drug-generation table and derive its split-annotated counterpart.
drug_generation_csv = "data/processed/data_drug_generation.csv"
df_drug_generation = pd.read_csv(drug_generation_csv)
df_drug_generation_split = add_split_cols_drug_generation(
    df_drug_generation
)

In [8]:
# Write the split-annotated SMILES table to an h5torch file (the recorded run
# reports data/processed/data_drug_generation.h5torch as the output location).
create_h5torch_smiles(df_drug_generation_split)

Creating h5torch file from dataframe with 4007090 SMILES strings...
Created h5torch file for SMILES data at data/processed/data_drug_generation.h5torch


In [9]:
import h5torch

# Open the drug-generation h5torch file read-only and print an overview of its
# layout: top-level keys, the "central" dataset, the "0" group and its keys,
# the first ten SMILES entries, and a membership check for "0/smiles".
with h5torch.File("data/processed/data_drug_generation.h5torch", "r") as h5_file:
    axis0_group = h5_file["0"]
    overview = (
        h5_file.keys(),
        h5_file["central"],
        axis0_group,
        axis0_group.keys(),
        h5_file["0/smiles"][:10],
        "0/smiles" in h5_file,
    )
    # Print while the file is still open so the HDF5 object reprs are valid.
    for entry in overview:
        print(entry)


<KeysViewHDF5 ['0', 'central']>
<HDF5 dataset "central": shape (4007090,), type "<i8">
<HDF5 group "/0" (2 members)>
<KeysViewHDF5 ['smiles', 'split']>
[b'Cc1cccc(N2CC(c3nc4ccccc4n3C)CC2=O)c1C'
 b'CCn1c(C2CC(=O)N(c3cccc(C)c3C)C2)nc2ccccc21'
 b'Cc1cc(C)c(N2CC(c3nc4ccccc4[nH]3)CC2=O)c(C)c1'
 b'Cc1cc(C)c(N2CC(c3nc4ccccc4n3C)CC2=O)c(C)c1'
 b'Cn1c(C2CC(=O)N(Cc3ccccc3)C2)nc2ccccc21'
 b'Cc1ccc(N2CC(c3nc4ccccc4n3C)CC2=O)cc1Cl'
 b'COc1ccc(Cl)cc1N1CC(c2nc3ccccc3[nH]2)CC1=O'
 b'CC1CN(c2cc(=O)n(-c3ccccc3Cl)c(=O)[nH]2)CC(C)O1'
 b'O=C(c1ccccc1)N1CCN(S(=O)(=O)Cc2ccccc2)CC1'
 b'O=C(c1ccncc1)N1CCN(S(=O)(=O)Cc2ccccc2)CC1']
True


In [1]:
# Location of the SMILES h5torch file used by both dataset views below.
smiles_h5_path = "data/processed/data_drug_generation.h5torch"

# View over all data
dataset = SMILESDataset(smiles_h5_path)

# View limited to a specific split
train_dataset = SMILESDataset(filename=smiles_h5_path, split="train")

# Show the first sample of the full dataset
dataset[0]

03:07:01 - INFO - Old pandas version detected. Patching DataFrame.map to DataFrame.applymap


'Cc1cccc(N2CC(c3nc4ccccc4n3C)CC2=O)c1C'