In [None]:
from rdkit import Chem
from rdkit.Chem import Descriptors
from tdc.multi_pred import DTI



In [1]:
import os

# NOTE(review): hard-coded, machine-specific absolute path. The relative paths used
# in later cells ("./data/source/", Path("data")) are resolved against this
# directory, so the notebook only runs as-is on a machine where this path exists.
os.chdir('/home/robsyc/Desktop/thesis/MB-VAE-DTI')

In [8]:
# MolGen is otherwise only imported in a later cell; import it here so this
# cell does not raise NameError on a fresh kernel (Restart & Run All).
from tdc.generation import MolGen

# Load the MOSES molecule-generation dataset from the local source directory.
data = MolGen(name="MOSES", path="./data/source/")
df = data.get_data()

# Display the first record to inspect the frame's layout.
df.iloc[0]

Found local copy...
Loading...
Done!


smiles    CCCS(=O)c1ccc2[nH]c(=NC(=O)OC)[nH]c2c1
Name: 0, dtype: object

In [9]:
# Parse the SMILES of the first MOSES record (displayed above) into an RDKit molecule.
mol = Chem.MolFromSmiles("CCCS(=O)c1ccc2[nH]c(=NC(=O)OC)[nH]c2c1")

In [12]:
# NOTE(review): Descriptors is already imported in the first cell; this repeat is redundant.
from rdkit.Chem import Descriptors

# Heavy-atom count of the example molecule; the same quantity is compared against
# MAX_N_HEAVY_ATOMS in canonicalize_smiles further down.
Descriptors.HeavyAtomCount(mol)

19

In [13]:
from tdc.generation import MolGen
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from pathlib import Path
import concurrent.futures
import numpy as np

# Data directory layout. These are relative paths, so they resolve against the
# current working directory of the kernel.
DATA_DIR = Path("data")
SOURCE_DIR = DATA_DIR / "source"        # default download/cache location passed to MolGen
PROCESSED_DIR = DATA_DIR / "processed"  # destination of the merged CSV written below

# Upper bound on heavy atoms: canonicalize_smiles returns "" for any molecule
# above this count, and such rows are filtered out of the merged dataset.
MAX_N_HEAVY_ATOMS = 64

# Silence RDKit's rdApp.* log channels.
# NOTE(review): presumably to avoid a flood of parse warnings for invalid SMILES — confirm.
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

def canonicalize_smiles(smiles):
    """Return the canonical isomeric SMILES for ``smiles``, or "" if it is rejected.

    A SMILES string is rejected (empty string returned) when:
      * RDKit cannot parse it (``MolFromSmiles`` returns ``None``),
      * the molecule has more than ``MAX_N_HEAVY_ATOMS`` heavy atoms, or
      * RDKit raises an ordinary exception while processing it
        (e.g. a non-string input).

    Args:
        smiles: SMILES string to canonicalize.

    Returns:
        str: Canonical SMILES with stereo information kept, or "" on rejection.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        # Guard clauses: unparsable input or an oversized molecule is rejected.
        if mol is None:
            return ""
        if Descriptors.HeavyAtomCount(mol) > MAX_N_HEAVY_ATOMS:
            return ""
        return Chem.MolToSmiles(mol, isomericSmiles=True)
    except Exception:
        # Deliberately best-effort for bad inputs, but catch only Exception:
        # a bare ``except`` would also swallow KeyboardInterrupt/SystemExit and
        # make a long-running worker impossible to interrupt.
        return ""

def process_chunk(smiles_chunk):
    """Canonicalize every SMILES in one chunk, preserving order.

    This is the unit of work handed to a single worker process; the entries
    of the chunk itself are handled one after another.
    """
    canonical = []
    for entry in smiles_chunk:
        canonical.append(canonicalize_smiles(entry))
    return canonical

def _canonicalize_in_parallel(smiles_list, n_workers, chunk_size):
    """Canonicalize ``smiles_list`` in a process pool, returning results in input order."""
    smiles_chunks = [smiles_list[i:i + chunk_size] for i in range(0, len(smiles_list), chunk_size)]

    # One slot per chunk, filled by chunk position. ``as_completed`` yields futures
    # in completion order, so appending results as they arrive would misalign the
    # canonical SMILES with the rows they came from.
    chunk_results = [None] * len(smiles_chunks)
    n_done = 0
    n_failed = 0

    with concurrent.futures.ProcessPoolExecutor(max_workers=n_workers) as executor:
        future_to_chunk = {executor.submit(process_chunk, chunk): i for i, chunk in enumerate(smiles_chunks)}

        for future in concurrent.futures.as_completed(future_to_chunk):
            chunk_idx = future_to_chunk[future]
            try:
                chunk_results[chunk_idx] = future.result()
            except Exception as e:
                print(f"Error processing chunk {chunk_idx}: {e}")
                # Keep the output length equal to the input length: mark every
                # molecule of the failed chunk as invalid ("") so the caller
                # filters those rows instead of hitting a length mismatch.
                chunk_results[chunk_idx] = [""] * len(smiles_chunks[chunk_idx])
                n_failed += 1
            # Report by number of completed chunks, not by chunk index, since
            # chunks may finish out of order.
            n_done += 1
            if n_done % 10 == 0:
                print(f"  Processed {n_done}/{len(smiles_chunks)} chunks")

    if n_failed:
        print(f"  {n_failed} chunk(s) failed; their molecules are treated as invalid")

    return [smi for chunk in chunk_results for smi in chunk]


def fetch_and_merge_datasets(datasets=["MOSES", "ZINC", "ChEMBL_V29"], path=SOURCE_DIR, n_workers=8, chunk_size=10000):
    """
    Fetches and merges multiple molecular datasets, removing duplicate SMILES
    and ensuring all SMILES are in canonical form using RDKit with parallel processing.

    Pipeline: load each dataset with ``MolGen`` -> concatenate -> drop exact
    duplicate strings -> canonicalize in a process pool -> drop molecules that
    were rejected by ``canonicalize_smiles`` (returned "") -> drop duplicates
    on the canonical form.

    Args:
        datasets (list): List of dataset names to fetch (the default list is
            only iterated, never mutated)
        path (str or Path): Path to store/load the datasets
        n_workers (int): Number of parallel workers
        chunk_size (int): Size of chunks for parallel processing

    Returns:
        pandas.DataFrame: Merged dataframe with unique canonical SMILES in the
        'smiles' column. Row labels are those left over from the
        concatenated frame (the index is not reset).
    """
    all_data = []

    for dataset_name in datasets:
        print(f"Loading {dataset_name}...")
        data = MolGen(name=dataset_name, path=path)
        df = data.get_data()
        all_data.append(df)
        print(f"  {len(df)} molecules loaded")

    # Concatenate all dataframes
    merged_df = pd.concat(all_data, ignore_index=True)
    print(f"Total molecules before deduplication: {len(merged_df)}")

    # First deduplication on exact SMILES strings (cheap; reduces RDKit work)
    merged_df = merged_df.drop_duplicates(subset=['smiles'])
    print(f"Unique molecules before canonicalization: {len(merged_df)}")

    # Canonicalize in parallel; the result is aligned one-to-one with the rows
    print(f"Canonicalizing SMILES using {n_workers} workers...")
    canonical_smiles = _canonicalize_in_parallel(merged_df['smiles'].tolist(), n_workers, chunk_size)

    # ``assign`` returns a new frame, avoiding writes into a filtered slice
    # (the source of SettingWithCopyWarning with item assignment / inplace drop).
    merged_df = merged_df.assign(canonical_smiles=canonical_smiles)

    # Filter out invalid molecules, then remove duplicates based on canonical SMILES
    valid_df = merged_df[merged_df['canonical_smiles'] != ""]
    unique_df = valid_df.drop_duplicates(subset=['canonical_smiles'])

    # Keep the original column structure: canonical form replaces 'smiles'
    unique_df = unique_df.assign(smiles=unique_df['canonical_smiles']).drop(columns=['canonical_smiles'])

    print(f"Total valid molecules after processing: {len(unique_df)}")

    return unique_df

# Example usage
datasets = ["MOSES", "ZINC", "ChEMBL_V29"]
df = fetch_and_merge_datasets(datasets)
df.to_csv(PROCESSED_DIR / "data_drug_generation.csv", index=False)

Found local copy...
Loading...


Loading MOSES...


Done!
Found local copy...
Loading...
Done!
Found local copy...
Loading...


  1936962 molecules loaded
Loading ZINC...
  249455 molecules loaded
Loading ChEMBL_V29...


Done!


  2084723 molecules loaded
Total molecules before deduplication: 4271140
Unique molecules before canonicalization: 4181441
Canonicalizing SMILES using 8 workers...
  Processed 10/419 chunks
  Processed 20/419 chunks
  Processed 30/419 chunks
  Processed 40/419 chunks
  Processed 50/419 chunks
  Processed 60/419 chunks
  Processed 70/419 chunks
  Processed 80/419 chunks
  Processed 90/419 chunks
  Processed 100/419 chunks
  Processed 110/419 chunks
  Processed 120/419 chunks
  Processed 130/419 chunks
  Processed 140/419 chunks
  Processed 150/419 chunks
  Processed 160/419 chunks
  Processed 170/419 chunks
  Processed 180/419 chunks
  Processed 190/419 chunks
  Processed 200/419 chunks
  Processed 210/419 chunks
  Processed 220/419 chunks
  Processed 230/419 chunks
  Processed 240/419 chunks
  Processed 250/419 chunks
  Processed 260/419 chunks
  Processed 270/419 chunks
  Processed 280/419 chunks
  Processed 290/419 chunks
  Processed 300/419 chunks
  Processed 310/419 chunks
  Proces

In [14]:
df

Unnamed: 0,smiles
0,CCCS(=O)c1ccc2[nH]c(=NC(=O)OC)[nH]c2c1
1,CC(C)(C)C(=O)C(Oc1ccc(Cl)cc1)n1ccnc1
2,CC1C2CCC(C2)C1CN(CCO)C(=O)c1ccc(Cl)cc1
3,Cc1c(Cl)cccc1Nc1ncccc1C(=O)OCC(O)CO
4,Cn1cnc2c1c(=O)n(CC(O)CO)c(=O)n2C
...,...
4271135,CC(C)(C)c1cc(C(=O)NC2CCN(Cc3ccccc3)CC2)cc(C(C)...
4271136,CCCCCCCCCOC[C@H]1O[C@H](O[C@@H]2[C@@H](O)[C@H]...
4271137,COC(=O)[C@]12CC[C@@H](C(C)CO)[C@@H]1[C@H]1CC[C...
4271138,COc1ccc(S(=O)(=O)NCc2ccc(C(=O)O)cc2)c2ccccc12
