# Script to standardize compounds from DrugBank

### 0. Imports

In [1]:
from rdkit import Chem
import pandas as pd
import molvs
from molvs import Standardizer
from molvs.fragment import LargestFragmentChooser
from molvs.charge import Uncharger

### 1. Extract information from .SDF file

In [2]:
# Input: DrugBank structure dump in SDF format
sdf_file = 'DrugBank_All_14_03_2024.sdf'

# SDF property tags copied into the table (one column per tag)
fields = ['DATABASE_ID', 'SMILES', 'INCHI_KEY', 'MOLECULAR_WEIGHT', 'GENERIC_NAME']

# One dictionary per successfully parsed record
data = []

# Iterate over the records of the SDF file
supplier = Chem.SDMolSupplier(sdf_file)

for mol in supplier:
    # The supplier yields None for records RDKit could not parse; skip those
    if mol is None:
        continue

    # Missing properties are stored as None so that every row has the same keys
    data.append({
        field: (mol.GetProp(field) if mol.HasProp(field) else None)
        for field in fields
    })

# Build the table from the collected records
df = pd.DataFrame(data)

# Keep a CSV copy of the raw extraction
df.to_csv('DrugBank_All.csv', index=False)

[10:38:02] Explicit valence for atom # 13 Cl, 5, is greater than permitted
[10:38:02] ERROR: Could not sanitize molecule ending on line 298546
[10:38:02] ERROR: Explicit valence for atom # 13 Cl, 5, is greater than permitted
[10:38:03] Explicit valence for atom # 19 O, 3, is greater than permitted
[10:38:03] ERROR: Could not sanitize molecule ending on line 412783
[10:38:03] ERROR: Explicit valence for atom # 19 O, 3, is greater than permitted
[10:38:03] Explicit valence for atom # 1 N, 4, is greater than permitted
[10:38:03] ERROR: Could not sanitize molecule ending on line 540725
[10:38:03] ERROR: Explicit valence for atom # 1 N, 4, is greater than permitted
[10:38:03] Explicit valence for atom # 1 N, 4, is greater than permitted
[10:38:03] ERROR: Could not sanitize molecule ending on line 598016
[10:38:03] ERROR: Explicit valence for atom # 1 N, 4, is greater than permitted
[10:38:03] Explicit valence for atom # 12 N, 4, is greater than permitted
[10:38:03] ERROR: Could not sanitize

In [3]:
# Show the table built from the SDF records
df

Unnamed: 0,DATABASE_ID,SMILES,INCHI_KEY,MOLECULAR_WEIGHT,GENERIC_NAME
0,DB00006,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,OIRCOABEOLEUMC-GEJPAHFPSA-N,2180.2853,Bivalirudin
1,DB00007,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...,GFIJNRVAKGFPGQ-LIJARHBVSA-N,1209.3983,Leuprolide
2,DB00014,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,BLCLNMBMMGCOAS-URPVMXJPSA-N,1269.4105,Goserelin
3,DB00027,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...,NDAYQJDHGXTBJL-MWWSRJDJSA-N,1811.253,Gramicidin D
4,DB00035,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...,NFLWUMRGJYTJIN-PNIOQBSNSA-N,1069.22,Desmopressin
...,...,...,...,...,...
11912,DB18708,CC1=C(COC2=CC=C(N=N2)C(=O)NC2CCOCC2)C(=NO1)C1=...,ACZCJTHHWMBFKC-UHFFFAOYSA-N,409.446,Alogabat
11913,DB18709,CN1C=C(C=N1)C1=CN2N=CC=C2C(=N1)C1=CN(N=C1)[C@@...,XPLZTJWZDBFWDE-OYOVHJISSA-N,383.419,Ropsacitinib
11914,DB18711,OC(=O)CCCCC(O)=O.C[C@@H](N)COC1=CC=C(C=C1)C1=C...,DORJQZDOULKINH-QNBGGDODSA-N,551.619,taletrectinib
11915,DB18715,NC1=C2N(C(=O)N([C@@H]3CCCN(C3)C(=O)C=C)C2=CC=N...,KOEUOFPEZFUWRF-LJQANCHMSA-N,455.518,Tolebrutinib


In [4]:
# Keep only the rows whose SMILES string RDKit can parse.
#
# Records without a SMILES property were stored as None during extraction,
# and Chem.MolFromSmiles only accepts strings, so non-string and blank values
# are rejected up front instead of being handed to the parser.
valid_smiles_indices = [
    i
    for i, smiles in enumerate(df["SMILES"])
    if isinstance(smiles, str)
    and smiles.strip()
    and Chem.MolFromSmiles(smiles) is not None  # None means the parse failed
]

# Positional selection matches the enumerate() positions above; the index is
# rebuilt so later cells see a contiguous 0..n-1 range.
df = df.iloc[valid_smiles_indices].reset_index(drop=True)

df.shape

[10:38:09] Explicit valence for atom # 0 N, 4, is greater than permitted
[10:38:09] Explicit valence for atom # 0 N, 4, is greater than permitted
[10:38:09] Explicit valence for atom # 0 N, 4, is greater than permitted
[10:38:10] SMILES Parse Error: syntax error while parsing: OS(O)(O)C1=CC=C(C=C1)C-1=C2\C=CC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC=C(C=C1)S(O)(O)O)C1=CC=C(C=C1)S([O-])([O-])[O-])\C1=CC=C(C=C1)S(O)(O)[O-]
[10:38:10] SMILES Parse Error: Failed parsing SMILES 'OS(O)(O)C1=CC=C(C=C1)C-1=C2\C=CC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC=C(C=C1)S(O)(O)O)C1=CC=C(C=C1)S([O-])([O-])[O-])\C1=CC=C(C=C1)S(O)(O)[O-]' for input: 'OS(O)(O)C1=CC=C(C=C1)C-1=C2\C=CC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC=C(C=C1)S(O)(O)O)C1=CC=C(C=C1)S([O-])([O-])[O-])\C1=CC=C(C=C1)S(O)(O)[O-]'
[10:38:10] Unusual charge on atom 0 number of radical electrons set to zero
[10:38:10] SMILES Parse Error: syntax error while parsing: OC1=CC=CC(=C1)C-1=

(11911, 5)

### 2. Standardization of SMILES

In [5]:
# Standardization of SMILES

def standardize_molecules(smiles_list):
    """Standardize a collection of SMILES strings with MolVS.

    Each entry is parsed with RDKit, passed through the MolVS ``Standardizer``,
    reduced to its largest fragment (drops counter-ions / salt components) and
    finally neutralized with the MolVS ``Uncharger``.

    Parameters
    ----------
    smiles_list : iterable of str
        SMILES strings to process.

    Returns
    -------
    list
        One entry per input, in the same order: the standardized SMILES, or
        ``None`` when the input is not a string, cannot be parsed by RDKit,
        or contains no atoms. Keeping a placeholder keeps the result aligned
        with the input so it can be assigned directly as a DataFrame column.
    """
    # Keeps only the largest covalent fragment (salt / counter-ion stripping)
    unsalt = molvs.fragment.LargestFragmentChooser()
    # General MolVS standardization pipeline
    s = Standardizer()
    # Neutralizes charged groups where possible
    u = molvs.charge.Uncharger()

    std_smi = []

    for smi in smiles_list:
        # MolFromSmiles returns None (rather than raising) on a parse failure
        mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None

        # An unparsable or atom-less molecule cannot be standardized; record
        # a placeholder instead of crashing in the steps below.
        if mol is None or mol.GetNumAtoms() == 0:
            std_smi.append(None)
            continue

        # Standardize (sanitization, metal disconnection, normalization of
        # functional groups, reionization)
        mol = s.standardize(mol)
        # Remove salts by keeping the largest fragment
        mol = unsalt(mol)
        # Neutralize / remove charges
        mol = u.uncharge(mol)

        # Convert back to a SMILES string for the output list
        std_smi.append(Chem.MolToSmiles(mol))

    return std_smi

In [6]:
# Run the standardization over the SMILES column of the filtered table
std_mols = standardize_molecules(smiles_list=df["SMILES"])

# Attach the results as a new column alongside the original SMILES
df["Standardized SMILES"] = std_mols

[10:38:14] Unusual charge on atom 10 number of radical electrons set to zero
[10:38:14] Unusual charge on atom 0 number of radical electrons set to zero
[10:38:14] Unusual charge on atom 19 number of radical electrons set to zero
[10:38:14] Unusual charge on atom 19 number of radical electrons set to zero
[10:38:14] Unusual charge on atom 0 number of radical electrons set to zero
[10:38:17] Unusual charge on atom 20 number of radical electrons set to zero
[10:38:17] Unusual charge on atom 0 number of radical electrons set to zero
[10:38:17] Unusual charge on atom 25 number of radical electrons set to zero
[10:38:17] Unusual charge on atom 25 number of radical electrons set to zero
[10:38:17] Unusual charge on atom 0 number of radical electrons set to zero
[10:38:17] Unusual charge on atom 0 number of radical electrons set to zero
[10:38:17] Unusual charge on atom 0 number of radical electrons set to zero
[10:38:17] Unusual charge on atom 0 number of radical electrons set to zero
[10:38

In [7]:
# Show the table including the new "Standardized SMILES" column
df

Unnamed: 0,DATABASE_ID,SMILES,INCHI_KEY,MOLECULAR_WEIGHT,GENERIC_NAME,Standardized SMILES
0,DB00006,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,OIRCOABEOLEUMC-GEJPAHFPSA-N,2180.2853,Bivalirudin,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(=O)O)NC(=O)[C@...
1,DB00007,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...,GFIJNRVAKGFPGQ-LIJARHBVSA-N,1209.3983,Leuprolide,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(=N)N)NC(=...
2,DB00014,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,BLCLNMBMMGCOAS-URPVMXJPSA-N,1269.4105,Goserelin,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...
3,DB00027,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...,NDAYQJDHGXTBJL-MWWSRJDJSA-N,1811.253,Gramicidin D,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...
4,DB00035,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...,NFLWUMRGJYTJIN-PNIOQBSNSA-N,1069.22,Desmopressin,N=C(N)NCCC[C@@H](NC(=O)[C@@H]1CCCN1C(=O)[C@@H]...
...,...,...,...,...,...,...
11906,DB18708,CC1=C(COC2=CC=C(N=N2)C(=O)NC2CCOCC2)C(=NO1)C1=...,ACZCJTHHWMBFKC-UHFFFAOYSA-N,409.446,Alogabat,Cc1ccc(-c2noc(C)c2COc2ccc(C(=O)NC3CCOCC3)nn2)cn1
11907,DB18709,CN1C=C(C=N1)C1=CN2N=CC=C2C(=N1)C1=CN(N=C1)[C@@...,XPLZTJWZDBFWDE-OYOVHJISSA-N,383.419,Ropsacitinib,Cn1cc(-c2cn3nccc3c(-c3cnn([C@]4(CC#N)C[C@@H](C...
11908,DB18711,OC(=O)CCCCC(O)=O.C[C@@H](N)COC1=CC=C(C=C1)C1=C...,DORJQZDOULKINH-QNBGGDODSA-N,551.619,taletrectinib,C[C@@H](N)COc1ccc(-c2cnc3ccc(N[C@H](C)c4cccc(F...
11909,DB18715,NC1=C2N(C(=O)N([C@@H]3CCCN(C3)C(=O)C=C)C2=CC=N...,KOEUOFPEZFUWRF-LJQANCHMSA-N,455.518,Tolebrutinib,C=CC(=O)N1CCC[C@@H](n2c(=O)n(-c3ccc(Oc4ccccc4)...


In [8]:
# Write a tab-separated .smi file: standardized SMILES first, DrugBank ID second,
# with no header row and no index column.
df.to_csv(
    'DrugBank_All_Cleaned.smi',
    columns=['Standardized SMILES', 'DATABASE_ID'],
    sep='\t',
    index=False,
    header=False,
)