# Processing Drug-Target Interaction Data

This notebook covers:
- Converting merged DTI data into an `h5torch` dataset
- Splitting the dataset (stratified) into train/val/test in two settings: random split, and cold-start split
- Computing embeddings from foundation models and storing them in the `h5torch` file
    - Drugs: `MMELON` (graph, image, text), and `RDKit` fingerprints
    - Targets: `NT`, `ESM`, and `ESPF` fingerprints
- Visualizing the foundation model embeddings

In [1]:
from resolve import *

Setting working directory to: /home/robsyc/Desktop/thesis/MB-VAE-DTI


In [2]:
import pandas as pd

# Load the merged DTI table and draw a small subsample for quick
# experimentation with the rest of the pipeline.
df = pd.read_csv("data/processed/dti.csv")
# A fixed random_state returns the same 100 rows on every Restart & Run All;
# without it each run samples different rows and downstream outputs change.
df = df.sample(100, random_state=42)
df

Unnamed: 0,Drug_ID,Drug_InChIKey,Drug_SMILES,Target_ID,Target_UniProt_ID,Target_Gene_name,Target_RefSeq_ID,Target_AA,Target_DNA,Y,Y_pKd,Y_pKi,Y_KIBA,in_DAVIS,in_BindingDB_Kd,in_BindingDB_Ki,in_Metz,in_KIBA
231244,D096336,PMZOPKCHYGDQPO-UHFFFAOYSA-N,Cn1c(SCCCN2CCOC(c3ccc(C(F)(F)F)cc3)C2)nnc1-c1c...,T000017,Q12809,KCNH2,NM_000238,MPVRRGHVAPQNTFLDTIIRKFEGQSRKFIIANARVENCAVIYCND...,ATGCCGGTGCGGAGGGGCCACGTCGCGCCGCAGAACACCTTCCTGG...,False,,5.949948,,False,False,True,False,False
200815,D085750,WQOCUOYKDKZARU-UHFFFAOYSA-N,Cc1ccc(C2CN(CCCSc3nnc(-c4ccc(-c5noc(C)n5)cc4)n...,T000002,P35462,DRD3,NM_000796,MASLSQLSSHLNYTCGAENSTGASQARPHAYYALSYCALILAIVFG...,ATGGCATCTCTGAGCCAGCTGAGTGGCCACCTGAACTACACCTGTG...,False,,6.396585,,False,False,True,False,False
141304,D063565,XNUNIHJOMFCFRQ-LIRRHRJNSA-N,COc1cc(N)c(Cl)cc1C(=O)NC1CCN(Cc2ccccc2)CC1C,T000004,Q95136,DRD1,NM_174042,MRTLNTSTMEGTGLVAERDFSFRILTACFLSLLILSTLLGNTLVCA...,ATGAGGACTCTCAACACGTCTACCATGGAAGGCACCGGGCTGGTGG...,False,,4.934990,,False,False,True,False,False
231345,D096398,VMNCYKLGFBECRY-UHFFFAOYSA-N,Cn1c2c(c3cc(S(=O)(=O)c4cccc(OC(F)F)c4)ccc31)C1...,T000572,P50406,HTR6,NM_000871,MVPEPGPTANSTPAWGAGPPSAPGGSGWVAAALCVVIALTAAANSL...,ATGGTCCCAGAGCCGGGCCCAACCGCCAATAGCACCCCGGCCTGGG...,True,,8.229148,,False,False,True,False,False
42119,D019890,PKCDDUHJAFVJJB-UHFFFAOYSA-N,CC1(O)CC(c2nc(-c3ccc4ccc(-c5ccccc5)nc4c3)c3c(N...,T000227,P27448,MARK3,NM_001128918,MSTRTPLPTVNERDTENHTSHGDGRQEVTSRTSRSGARCRNSIASC...,ATGTCCACTAGGACCCCATTGCCAACGGTGAATGAACGAGACACTG...,False,,,11.6,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270461,D110386,MCKJOJPTYUZASK-UHFFFAOYSA-N,Nc1ccc(-c2ccc(-c3ccccc3S(=O)(=O)N3CCCC3)cc2F)cn1,T000851,P20292,ALOX5AP,NM_001204406,MDQETVGNVVLLAIVTLISVVQNGFFAHKVEHESRTQNGRSFQRTG...,ATGGATCAAGAAACTGTAGGCAATGTTGTCCTGTTGGCCATCGTCA...,False,,7.090979,,False,False,True,False,False
177312,D077391,WFYUDPRNHZAOBW-UHFFFAOYSA-N,CS(=O)(=O)c1ccccc1-c1ccc(N2CCCC(N(CC(=O)O)S(=O...,T000160,P00742,F10,NM_000504,MGRPLHLVLLSASLAGLLLLGESLFIRREQANNILARVTRANSFLE...,ATGGGGCGCCCACTGCACCTCGTCCTGCTCAGTGCCTCCCTGGCTG...,True,,8.958607,,False,False,True,False,False
72321,D035611,DKTWJFBNXZNLCC-UHFFFAOYSA-N,CCCc1c(OCCCOc2cc(O)c(C(C)=O)cc2CC)cccc1OC(CC)C...,T001117,Q15722,LTB4R,NM_001143919,MNTTSSAAPPSLGVEFISLLAIILLSVALAVGLPGNSFVVWSILKR...,ATGAACACTACATCTTCTGCAGCACCCCCCTCACTAGGTGTAGAGT...,True,,7.793174,,False,False,True,False,False
257793,D106169,JMJOEJHKSMMCAT-ZNLHFFCSSA-N,NC1(C(=O)O)C(OCc2ccc(Cl)cc2)CC2C1C2(F)C(=O)O,T001517,P31422,Grm3,NM_001105712,MKMLTRLQILMLALFSKGFLLSLGDHNFMRREIKIEGDLVLGGLFP...,ATGAAGATGTTGACAAGACTACAAATTCTTATGTTAGCTTTGTTTT...,True,,8.312471,,False,False,True,False,False


In [3]:
from mb_vae_dti.processing.embedding import save_representations_to_h5

# One HDF5 file is written per entity representation column of `df`:
# drug SMILES, target amino-acid sequence and target DNA sequence.
representation_files = [
    ("Drug_SMILES", "dti_smiles.hdf5"),
    ("Target_AA", "dti_aa.hdf5"),
    ("Target_DNA", "dti_dna.hdf5"),
]
h5_paths = [
    save_representations_to_h5(df, column, file_name)
    for column, file_name in representation_files
]
# do same for pretrain_... data

# Display the value returned by the last call (the cell's visible result).
h5_paths[-1]

2025-04-25 15:44:06,499 - embedding - INFO - Created H5 file with 100 unique Drug_IDs at /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/dti_smiles.hdf5
2025-04-25 15:44:06,503 - embedding - INFO - Created H5 file with 85 unique Target_IDs at /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/dti_aa.hdf5
2025-04-25 15:44:06,506 - embedding - INFO - Created H5 file with 85 unique Target_IDs at /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/dti_dna.hdf5


PosixPath('/home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/dti_dna.hdf5')

In [1]:
from mb_vae_dti.processing.embedding import run_embedding_script

# Embedding jobs as (HDF5 file, model name) pairs, executed in listed order.
# Trailing comments give the embedding dimension(s) noted for each model.
embedding_jobs = [
    ("dti_smiles.hdf5", "rdMorganFP"),              # 2048
    ("dti_aa.hdf5", "ESPF"),                        # 4170
    ("dti_smiles.hdf5", "biomed-multi-view"),       # 512, 512, 768
    # ESM (2560) is currently disabled. The previous, commented-out call was
    #     run_embedding_script("dti_aa.txt", "Target_emb_aa.npy", "ESM")
    # which does not match the two-argument form used by the other jobs.
    ("dti_dna.hdf5", "nucleotide-transformer"),     # 1024
]
for h5_file_name, model_name in embedding_jobs:
    run_embedding_script(h5_file_name, model_name)

2025-04-25 16:00:00,306 - embedding - INFO - Running embedding script with command: /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/run_embeddings.sh rdMorganFP /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/dti_smiles.hdf5


Activating virtual environment at /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/rdMorganFP/venv
Checking for installation requirements...
Installing from requirements.txt
Running: python /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/rdMorganFP/script.py --input "/home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/dti_smiles.hdf5"



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Generating FP embeddings: 100%|██████████| 100/100 [00:00<00:00, 4324024.74it/s]
2025-04-25 16:00:00,937 - embedding - INFO - Running embedding script with command: /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/run_embeddings.sh ESPF /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/dti_aa.hdf5


Processing 100 items in 1 batches...
Successfully added FP embeddings with shape (2048,) to /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/dti_smiles.hdf5
Embedding generation completed successfully
Activating virtual environment at /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/ESPF/venv
Checking for installation requirements...
Installing from requirements.txt
Running: python /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/ESPF/script.py --input "/home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/dti_aa.hdf5"



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Generating FP embeddings: 100%|██████████| 85/85 [00:00<00:00, 3917756.48it/s]
2025-04-25 16:00:02,482 - embedding - INFO - Running embedding script with command: /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/run_embeddings.sh biomed-multi-view /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/dti_smiles.hdf5


Processing 85 items in 1 batches...
Successfully added FP embeddings with shape (4170,) to /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/dti_aa.hdf5
Embedding generation completed successfully
Activating virtual environment at /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/biomed-multi-view/venv
Checking for installation requirements...
Installing from requirements.txt
Looking in links: https://data.pyg.org/whl/torch-2.1.0+cu121.html
Running: python /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/biomed-multi-view/script.py --input "/home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/dti_smiles.hdf5"


  return torch._C._cuda_getDeviceCount() > 0


Processing embeddings for /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/dti_smiles.hdf5
Using model at: /home/robsyc/Desktop/thesis/MB-VAE-DTI/data/checkpoints/Biomed-smmv/biomed-smmv-base.pth


2025-04-25 16:00:07,043 - root - INFO - pop-os:127262800213184:0:0 - BioMed Multi-view embedding model
2025-04-25 16:00:07,043 - root - INFO - pop-os:127262800213184:0:0 - dim_list [512, 512, 768] of Graph2dModel, ImageModel, TextModel
2025-04-25 16:00:07,054 - root - INFO - pop-os:127262800213184:0:0 - Loading checkpoint from provided path /home/robsyc/Desktop/thesis/MB-VAE-DTI/data/checkpoints/Biomed-smmv/biomed-smmv-base.pth
2025-04-25 16:00:07,194 - root - INFO - pop-os:127262800213184:0:0 - Loading pretrain checkpoint for SmallMoleculeMultiView Model - _IncompatibleKeys(missing_keys=['aggregator.projections.0.weight', 'aggregator.projections.0.bias', 'aggregator.projections.1.weight', 'aggregator.projections.1.bias', 'aggregator.projections.2.weight', 'aggregator.projections.2.bias', 'aggregator.w_before_mean.0.weight', 'aggregator.w_before_mean.0.bias', 'aggregator.w_before_mean.2.weight', 'aggregator.down_project.weight', 'aggregator.down_project.bias', 'aggregator.shared_task_h

Processing 100 items in 4 batches...


Generating multiview embeddings: 100%|██████████| 100/100 [00:04<00:00, 21.82it/s]


Successfully added multiview embeddings to /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/dti_smiles.hdf5
  - Graph embeddings shape: (100, 512)
  - Image embeddings shape: (100, 512)
  - Text embeddings shape: (100, 768)


2025-04-25 16:00:14,491 - embedding - INFO - Running embedding script with command: /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/run_embeddings.sh nucleotide-transformer /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/dti_dna.hdf5


Embedding generation completed successfully
Activating virtual environment at /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/nucleotide-transformer/venv
Checking for installation requirements...
Installing using setup.py
Obtaining file:///home/robsyc/Desktop/thesis/MB-VAE-DTI/external/nucleotide-transformer
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Checking if build backend supports build_editable: started
  Checking if build backend supports build_editable: finished with status 'done'
  Getting requirements to build editable: started
  Getting requirements to build editable: finished with status 'done'
  Preparing editable metadata (pyproject.toml): started
  Preparing editable metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: nucleotide_transformer
  Building editable for nucleotide_transformer (pyproject.toml): started
  Building editable for nucleotide_transformer (pyproject.


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


Loading Nucleotide Transformer model (500M_multi_species_v2)...
Downloading model's hyperparameters json file...
Downloaded model's hyperparameters.
Downloading model's weights...
Downloaded model's weights...
Model loaded successfully
Processing DNA sequences with Nucleotide Transformer
Using model: 500M_multi_species_v2, embedding layer: 29




Processing 85 DNA sequences in 6 batches...


Generating NT embeddings:  94%|█████████▍| 80/85 [00:04<00:00, 17.56it/s]

Successfully added NT embeddings with shape (1024,) to /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/dti_dna.hdf5
Embedding generation completed successfully


Generating NT embeddings: 100%|██████████| 85/85 [00:05<00:00, 14.77it/s]


---
---

## Add split columns and create h5torch file

In [4]:
from mb_vae_dti.processing import add_split_cols, create_h5torch

# Add the split columns to the sampled frame, then write the result out as an
# h5torch dataset.
df_split = df.pipe(add_split_cols)
create_h5torch(df_split, output_filename="DTI_small.h5torch")

14:37:25 - INFO - Old pandas version detected. Patching DataFrame.map to DataFrame.applymap


Creating h5torch file from dataframe with 1000 rows...
Found 955 unique drugs and 418 unique targets
Creating central interaction matrix of shape (955, 418) with 1000 observed interactions
Processing drug features...
Processing target features...
Created h5torch file at data/processed/DTI_small.h5torch


In [14]:
from mb_vae_dti.processing import load_h5torch_DTI

# Load the "test" portion of the cold-start split ("split_cold") from the
# h5torch file. Presumably only interactions flagged in the listed in_*
# columns are kept -- confirm the exact filter semantics in load_h5torch_DTI.
# NOTE(review): the variable name mentions only DAVIS and Metz, but both
# BindingDB subsets are requested here as well.
test_davis_metz = load_h5torch_DTI(
    filename="DTI_small.h5torch",
    setting="split_cold",
    split="test",
    datasets=["in_DAVIS", "in_Metz", "in_BindingDB_Kd", "in_BindingDB_Ki"]
)
# Inspect a single sample (index 20) to see which fields a dataset item carries.
test_davis_metz[20]

Using boolean mask for mapping (93 indices)
Verified alignment: all unstructured data has 1000 elements


{'central': False,
 '0/Drug_ID': 'D020044',
 '0/Drug_InChIKey': 'WUTRVCAUFFQVGX-HNNXBMFYSA-N',
 '0/Drug_SMILES': 'CC(C)CN(C(=O)c1ccccc1C(C)C)[C@H]1CCNC1',
 '1/Target_AA': 'MSKSKCSVGLMSSVVAPAKEPNAVGPKEVELILVKEQNGVQLTSSTLTNPRQSPVEAQDRETWGKKIDFLLSVIGFAVDLANVWRFPYLCYKNGGGAFLVPYLLFMVIAGMPLFYMELALGQFNREGAAGVWKICPILKGVGFTVILISLYVGFFYNVIIAWALHYLFSSFTTELPWIHCNNSWNSPNCSDAHPGDSSGDSSGLNDTFGTTPAAEYFERGVLHLHQSHGIDDLGPPRWQLTACLVLVIVLLYFSLWKGVKTSGKVVWITATMPYVVLTALLLRGVTLPGAIDGIRAYLSVDFYRLCEASVWIDAATQVCFSLGVGFGVLIAFSSYNKFTNNCYRDAIVTTSINSLTSFSSGFVVFSFLGYMAQKHSVPIGDVAKDGPGLIFIIYPEAIATLPLSSAWAVVFFIMLLTLGIDSAMGGMESVITGLIDEFQLLHRHRELFTLFIVLATFLLSLFCVTNGGIYVFTLLDHFAAGTSILFGVLIEAIGVAWFYGVGQFSDDIQQMTGQRPSLYWRLCWKLVSPCFLLFVVVVSIVTFRPPHYGAYIFPDWANALGWVIATSSMAMVPIYAAYKFCSLPGSFREKLAYAIAPEKDRELVDRGEVRQFTLRHWLKV',
 '1/Target_DNA': 'ATGAGTAAGAGCAAATGCTCCGTGGGACTCATGTCTTCCGTGGTGGCCCCGGCTAAGGAGCCCAATGCCGTGGGCCCGAAGGAGGTGGAGCTCATCCTTGTCAAGGAGCAGAACGGAGTGCAGCTCACCAGCTCCACCCTCACCAACCCGCGGCAGAGCCCCGTGGAGGCCCAGGATCGGGAGACCT

In [15]:
# Look up the dataframe row(s) for one specific (drug, target) pair.
drug_id = 'D020044'
target_id = 'T000469'

df_split.loc[
    df_split["Drug_ID"].eq(drug_id) & df_split["Target_ID"].eq(target_id)
]

Unnamed: 0,Drug_ID,Drug_InChIKey,Drug_SMILES,Target_ID,Target_UniProt_ID,Target_Gene_name,Target_RefSeq_ID,Target_AA,Target_DNA,Y,Y_pKd,Y_pKi,Y_KIBA,in_DAVIS,in_BindingDB_Kd,in_BindingDB_Ki,in_Metz,in_KIBA,split_rand,split_cold
58456,D020044,WUTRVCAUFFQVGX-HNNXBMFYSA-N,CC(C)CN(C(=O)c1ccccc1C(C)C)[C@H]1CCNC1,T000469,Q01959,SLC6A3,NM_001044,MSKSKCSVGLMSSVVAPAKEPNAVGPKEVELILVKEQNGVQLTSST...,ATGAGTAAGAGCAAATGCTCCGTGGGACTCATGTCTTCCGTGGTGG...,False,,5.444893,,False,False,True,False,False,train,test


## Adding embeddings to the h5torch file

In [7]:
from mb_vae_dti.processing import load_h5torch_DTI

# Reload the "test" portion of the cold-start split, this time without the
# BindingDB Ki subset, and inspect one sample.
# NOTE(review): the saved output of this cell already contains embedding
# features (e.g. 0/Drug_FP, 0/Drug_emb_graph), so it appears to have been run
# after the embedding cells that follow it -- consider moving it below them so
# the notebook reproduces top to bottom.
test_davis_metz = load_h5torch_DTI(
    filename="DTI_small.h5torch",
    setting="split_cold",
    split="test",
    datasets=["in_DAVIS", "in_Metz", "in_BindingDB_Kd"]
)
# Inspect a single sample (index 20).
test_davis_metz[20]

Using boolean mask for mapping (22 indices)
Verified alignment: all unstructured data has 1000 elements


{'central': True,
 '0/Drug_FP': array([0., 1., 0., ..., 0., 0., 0.], dtype=float32),
 '0/Drug_ID': 'D005704',
 '0/Drug_InChIKey': 'BPNUQXPIQBZCMR-IBGZPJMESA-N',
 '0/Drug_SMILES': 'Cc1n[nH]c2ccc(-c3cncc(OC[C@@H](N)Cc4ccccc4)c3)cc12',
 '0/Drug_emb_graph': array([ 1.90964546e-02,  3.65307368e-02, -7.67227039e-02, -2.17817098e-01,
         2.92698480e-02, -7.65806576e-03,  6.17058277e-02, -3.02248616e-02,
        -3.07081044e-01, -1.00169100e-01, -1.62992515e-02, -4.32518095e-01,
         7.75575042e-02,  2.66861264e-02, -2.08349586e-01,  6.72742501e-02,
         4.46739830e-02,  1.43983409e-01,  3.29942629e-02,  8.80348533e-02,
        -1.41768217e-01, -3.20785753e-02,  2.06791297e-01,  2.04960611e-02,
         1.23171246e+00, -1.98846664e-02,  2.10213590e+00, -6.31648824e-02,
         3.30971442e-02,  2.19252086e+00, -5.95130119e-03, -2.86935985e-01,
        -4.31635641e-02,  4.11687195e-02,  1.21839553e-01, -4.30982709e-01,
        -4.06590328e-02,  6.95760772e-02, -9.02318507e-02,  2.5

In [6]:
from mb_vae_dti.processing.embedding import add_all_embeddings_to_h5torch

# Generate embeddings and attach them to the h5torch file. The saved log shows
# rdMorganFP being run on the Drug_SMILES sequences; presumably the remaining
# models are run as well -- confirm in add_all_embeddings_to_h5torch.
add_all_embeddings_to_h5torch(
    h5torch_file_name="DTI_small.h5torch"
)

2025-04-19 14:39:47,131 - embedding - INFO - Retrieved 955 sequences from Drug_SMILES
2025-04-19 14:39:47,131 - embedding - INFO - Generating embeddings using rdMorganFP...
2025-04-19 14:39:47,132 - embedding - INFO - Wrote 955 sequences to external/temp/rdmorganfp_input_1745066387.txt
2025-04-19 14:39:47,133 - embedding - INFO - Running embedding command: /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/run_embeddings.sh rdMorganFP /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/rdmorganfp_input_1745066387.txt /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/rdmorganfp_output_1745066387.npy
2025-04-19 14:39:47,980 - embedding - INFO - Activating virtual environment at /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/rdMorganFP/venv
Running: python /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/rdMorganFP/script.py --input "/home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/rdmorganfp_input_1745066387.txt" --output "/home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/rdmorga

In [5]:
from mb_vae_dti.processing import add_embeddings_to_h5torch

# Add a single embedding feature to the h5torch file: the biomed-multi-view
# model applied to the Drug_SMILES representation on axis 0, stored under the
# feature name "Drug_emb_graph", with overwrite enabled and batching disabled.
# NOTE(review): the saved output below this cell logs an ESPF run on Target_AA,
# which does not match these arguments -- the output looks stale; re-run.
add_embeddings_to_h5torch(
    h5torch_file_name="DTI_small.h5torch",
    repo_name="biomed-multi-view",
    entity_axis=0,
    entity_representation="Drug_SMILES",
    feature_name="Drug_emb_graph",
    use_batch=False,
    overwrite=True
)

2025-04-19 13:29:35,624 - embedding - INFO - Retrieved 433 sequences from Target_AA
2025-04-19 13:29:35,625 - embedding - INFO - Generating embeddings using ESPF...
2025-04-19 13:29:35,626 - embedding - INFO - Wrote 433 sequences to external/temp/espf_input_1745062175.txt
2025-04-19 13:29:35,626 - embedding - INFO - Running embedding command: /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/run_embeddings.sh ESPF /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/espf_input_1745062175.txt /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/espf_output_1745062175.npy
2025-04-19 13:29:39,705 - embedding - INFO - Activating virtual environment at /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/ESPF/venv
Running: python /home/robsyc/Desktop/thesis/MB-VAE-DTI/external/ESPF/script.py --input "/home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/espf_input_1745062175.txt" --output "/home/robsyc/Desktop/thesis/MB-VAE-DTI/external/temp/espf_output_1745062175.npy"
Processing 433 sequences...
S

---

## Drug Generation

In [7]:
import os
# Switch to the project root so the relative "data/processed/..." paths used
# below resolve. The root can be overridden with the MB_VAE_DTI_ROOT
# environment variable, so the notebook also runs on other machines; the
# original hard-coded location remains the default when it is not set.
os.chdir(os.environ.get("MB_VAE_DTI_ROOT", "/home/robsyc/Desktop/thesis/MB-VAE-DTI"))

from mb_vae_dti.processing import add_split_cols_drug_generation, create_h5torch_smiles, SMILESDataset
import pandas as pd

# Read the drug-generation table, then derive the frame that carries the
# split assignment (presumably a train/val/test column -- confirm in
# add_split_cols_drug_generation).
df_drug_generation = pd.read_csv("data/processed/data_drug_generation.csv")
df_drug_generation_split = df_drug_generation.pipe(add_split_cols_drug_generation)

In [8]:
# Write the split-annotated SMILES table to an h5torch file; the saved output
# reports data/processed/data_drug_generation.h5torch as the destination.
create_h5torch_smiles(df_drug_generation_split)

Creating h5torch file from dataframe with 4007090 SMILES strings...
Created h5torch file for SMILES data at data/processed/data_drug_generation.h5torch


In [9]:
import h5torch

# Peek inside data_drug_generation.h5torch (read-only): top-level keys, the
# central dataset, the "0" group and its members, the first ten SMILES
# entries, and a membership check for the "0/smiles" path.
with h5torch.File("data/processed/data_drug_generation.h5torch", "r") as h5_file:
    axis0_group = h5_file["0"]
    summaries = (
        h5_file.keys(),
        h5_file["central"],
        axis0_group,
        axis0_group.keys(),
        h5_file["0/smiles"][:10],
        "0/smiles" in h5_file,
    )
    for summary in summaries:
        print(summary)


<KeysViewHDF5 ['0', 'central']>
<HDF5 dataset "central": shape (4007090,), type "<i8">
<HDF5 group "/0" (2 members)>
<KeysViewHDF5 ['smiles', 'split']>
[b'Cc1cccc(N2CC(c3nc4ccccc4n3C)CC2=O)c1C'
 b'CCn1c(C2CC(=O)N(c3cccc(C)c3C)C2)nc2ccccc21'
 b'Cc1cc(C)c(N2CC(c3nc4ccccc4[nH]3)CC2=O)c(C)c1'
 b'Cc1cc(C)c(N2CC(c3nc4ccccc4n3C)CC2=O)c(C)c1'
 b'Cn1c(C2CC(=O)N(Cc3ccccc3)C2)nc2ccccc21'
 b'Cc1ccc(N2CC(c3nc4ccccc4n3C)CC2=O)cc1Cl'
 b'COc1ccc(Cl)cc1N1CC(c2nc3ccccc3[nH]2)CC1=O'
 b'CC1CN(c2cc(=O)n(-c3ccccc3Cl)c(=O)[nH]2)CC(C)O1'
 b'O=C(c1ccccc1)N1CCN(S(=O)(=O)Cc2ccccc2)CC1'
 b'O=C(c1ccncc1)N1CCN(S(=O)(=O)Cc2ccccc2)CC1']
True


In [1]:
# NOTE(review): SMILESDataset is imported in an earlier cell, yet this cell's
# execution count is 1 -- on a fresh kernel that import cell must be run
# first.
# Create dataset for all data
dataset = SMILESDataset("data/processed/data_drug_generation.h5torch")

# Or for a specific split
train_dataset = SMILESDataset(
    filename="data/processed/data_drug_generation.h5torch", 
    split="train"
)

# Indexing the full dataset returns a single item; the saved output shows a
# plain SMILES string for index 0.
dataset[0]

03:07:01 - INFO - Old pandas version detected. Patching DataFrame.map to DataFrame.applymap


'Cc1cccc(N2CC(c3nc4ccccc4n3C)CC2=O)c1C'