In [1]:
import os
import sys
import gzip
from multiprocessing import Pool

import pandas as pd
from pandarallel import pandarallel
from rdkit import Chem
from rdkit import RDLogger
from rdkit.Chem import rdMolDescriptors
from tqdm import tqdm

In [2]:
# Suppress RDKit warnings and errors
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

## Download PubChem

```bash
wget -r -np -nH --cut-dirs=3 -A "*.sdf.gz" ftp://ftp.ncbi.nlm.nih.gov/pubchem/Compound/CURRENT-Full/SDF/
```

In [4]:
# Directory containing the downloaded SDF files
sdf_directory = "./SDF"

# Create a directory to store the SMILES files
smiles_directory = "./SMILES"
os.makedirs(smiles_directory, exist_ok=True)

# Function to convert SDF to SMILES
def convert_sdf_to_smiles(args):
    sdf_file, output_file = args
    with gzip.open(sdf_file, 'rb') if sdf_file.endswith('.gz') else open(sdf_file, 'rb') as f_in:
        suppl = Chem.ForwardSDMolSupplier(f_in)
        with open(output_file, 'w') as smiles_file:
            for mol in suppl:
                if mol is not None:
                    smiles_file.write(Chem.MolToSmiles(mol) + '\n')
    return sdf_file, output_file

# Prepare the list of SDF files to process
sdf_files = [
    (os.path.join(sdf_directory, sdf_file),
     os.path.join(smiles_directory, sdf_file.replace('.sdf.gz', '.smiles').replace('.sdf', '.smiles')))
    for sdf_file in os.listdir(sdf_directory)
    if sdf_file.endswith('.sdf') or sdf_file.endswith('.sdf.gz')
]

# Use multiprocessing to convert SDF to SMILES in parallel
with Pool(processes=15) as pool:
    for sdf_file, output_smiles_file in tqdm(pool.imap_unordered(convert_sdf_to_smiles, sdf_files), total=len(sdf_files), desc='Converting SDF to SMILES'):
        print(f"Converted {sdf_file} to {output_smiles_file}", file=sys.stderr)

Converting SDF to SMILES:   0%|          | 0/343 [00:00<?, ?it/s]Converted ./SDF/Compound_039000001_039500000.sdf.gz to ./SMILES/Compound_039000001_039500000.smiles
Converting SDF to SMILES:   0%|          | 1/343 [00:40<3:50:35, 40.46s/it]Converted ./SDF/Compound_037000001_037500000.sdf.gz to ./SMILES/Compound_037000001_037500000.smiles
Converting SDF to SMILES:   1%|          | 2/343 [00:59<2:39:06, 27.99s/it]Converted ./SDF/Compound_086000001_086500000.sdf.gz to ./SMILES/Compound_086000001_086500000.smiles
Converting SDF to SMILES:   1%|          | 3/343 [01:13<2:03:06, 21.72s/it]Converted ./SDF/Compound_079000001_079500000.sdf.gz to ./SMILES/Compound_079000001_079500000.smiles
Converting SDF to SMILES:   1%|          | 4/343 [03:19<5:53:49, 62.62s/it]Converted ./SDF/Compound_022000001_022500000.sdf.gz to ./SMILES/Compound_022000001_022500000.smiles
Converting SDF to SMILES:   1%|▏         | 5/343 [03:35<4:18:01, 45.80s/it]Converted ./SDF/Compound_125000001_125500000.sdf.gz to ./SMI

In [5]:
# Merge all SMILES files into a single file
merged_smiles_file = "pubchem.smiles"
with open(merged_smiles_file, 'w') as outfile:
    for smiles_file in os.listdir(smiles_directory):
        if smiles_file.endswith('.smiles'):
            with open(os.path.join(smiles_directory, smiles_file), 'r') as infile:
                for line in infile:
                    outfile.write(line)

## Calculate masses and formulas

In [3]:
df = pd.read_csv('pubchem.smiles', header=None, names=['smiles'])
df

Unnamed: 0,smiles
0,[O-]/C(=N\c1ccc(-c2cc(C(F)(F)F)ccc2F)cc1)c1ccc...
1,O=C(Nc1ccc(-c2cc(C(F)(F)F)ccc2F)cc1)c1ccccc1F
2,Cc1cc(Cl)cc(-c2cnc3ccc(-c4cccc(OCc5ccccc5)c4)c...
3,Cc1cc(Cl)cc(-c2cnc3ccc(-c4cccc(OCc5ccccc5)c4)c...
4,CNC(=O)/C=C/c1ccc(-c2ccc(O[C@@H]3C[C@@H]4CC[C@...
...,...
118008568,COCCOc1ncccc1NC(=O)C1CCN(C(=O)COc2ccccc2)CC1
118008569,CCOC(=O)c1ccc(S(=O)(=O)N2CCC(C(=O)Nc3cccnc3OCC...
118008570,COCCOc1ncccc1NC(=O)C1CCN(S(=O)(=O)c2cccnc2)CC1
118008571,COCCOc1ncccc1NC(=O)c1sc2nc(C(F)(F)F)ccc2c1C


In [4]:
def compute_properties(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return pd.Series([None, None])
    formula = rdMolDescriptors.CalcMolFormula(mol)
    mass = rdMolDescriptors.CalcExactMolWt(mol)
    return pd.Series([formula, mass])

pandarallel.initialize(nb_workers=30, progress_bar=True)
df[['formula', 'mass']] = df['smiles'].parallel_apply(compute_properties)
df.to_csv('pubchem.tsv', index=False, sep='\t')
df

INFO: Pandarallel will run on 30 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3933620), Label(value='0 / 3933620…

Unnamed: 0,smiles,formula,mass
0,[O-]/C(=N\c1ccc(-c2cc(C(F)(F)F)ccc2F)cc1)c1ccc...,C20H11F5NO-,376.076629
1,O=C(Nc1ccc(-c2cc(C(F)(F)F)ccc2F)cc1)c1ccccc1F,C20H12F5NO,377.083905
2,Cc1cc(Cl)cc(-c2cnc3ccc(-c4cccc(OCc5ccccc5)c4)c...,C41H42ClN3O4,675.286385
3,Cc1cc(Cl)cc(-c2cnc3ccc(-c4cccc(OCc5ccccc5)c4)c...,C41H42ClN3O4,675.286385
4,CNC(=O)/C=C/c1ccc(-c2ccc(O[C@@H]3C[C@@H]4CC[C@...,C21H24N4O3,380.184841
...,...,...,...
118008568,COCCOc1ncccc1NC(=O)C1CCN(C(=O)COc2ccccc2)CC1,C22H27N3O5,413.195071
118008569,CCOC(=O)c1ccc(S(=O)(=O)N2CCC(C(=O)Nc3cccnc3OCC...,C21H27N3O8S,481.151886
118008570,COCCOc1ncccc1NC(=O)C1CCN(S(=O)(=O)c2cccnc2)CC1,C19H24N4O5S,420.146741
118008571,COCCOc1ncccc1NC(=O)c1sc2nc(C(F)(F)F)ccc2c1C,C18H16F3N3O3S,411.086447
