In [1]:
from pathlib import Path

import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem

In [2]:
# Read the pipe-delimited drug/protein interaction table, then show its column names.
df = pd.read_csv(
    '../Data/drug_protein_interactions.csv',
    delimiter="|",
)
df.columns

Index(['drug_id', 'name', 'cas', 'smiles', 'logP ALOGPS', 'logP ChemAxon',
       'solubility ALOGPS', 'pKa (strongest acidic)', 'pKa (strongest basic)',
       'drugname', 'drug_type', 'approved', 'experimental', 'illicit',
       'investigational', 'nutraceutical', 'withdrawn', 'collated_partner_id',
       'collated_gene_name', 'collated_target_type', 'collated_inhibitor',
       'collated_inducer'],
      dtype='object')

In [3]:
# Row indices of molecules whose processing fails; starts out empty.
indices_to_remove = list()

In [4]:
# Build one 3D conformer per row, relax it with UFF, and write it to
# '../Data/drug_sdfs/mol_<drug_id>.sdf'. Rows whose molecule cannot be parsed,
# embedded, or written are collected in `indices_to_remove`. A UFF failure is
# reported but does not flag the row: the embedded geometry is still written.

# Start from an empty list so that re-running this cell does not accumulate
# duplicate indices left over from a previous run.
indices_to_remove = []

# Create the output directory up front; without it every SDF write would fail
# and every row would end up flagged.
SDF_DIR = Path('../Data/drug_sdfs')
SDF_DIR.mkdir(parents=True, exist_ok=True)

# Fixed seed so that conformer embedding is reproducible between runs.
EMBED_SEED = 42

for idx, row in df.iterrows():
    # Bind these before the try block so the except handler below can never
    # report a stale value carried over from the previous row.
    drug_id = row['drug_id']  # Assuming 'drug_id' is the column name
    smiles = row['smiles']
    try:
        # An empty CSV field is read as NaN; report it explicitly instead of
        # letting RDKit raise an argument-type error.
        if pd.isna(smiles):
            print(f"Missing SMILES for molecule with DrugbankID {drug_id}")
            indices_to_remove.append(idx)
            continue

        # Create RDKit molecule object from SMILES
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            print(f"Failed to create molecule from SMILES: {smiles}")
            indices_to_remove.append(idx)
            continue

        # Add hydrogens so the embedded geometry includes them
        mol = Chem.AddHs(mol)

        # Generate 1 conformer. The call returns the IDs of the conformers it
        # managed to embed; an empty result means embedding failed and the
        # molecule has no 3D coordinates to write.
        conf_ids = list(AllChem.EmbedMultipleConfs(mol, numConfs=1, randomSeed=EMBED_SEED))
        if not conf_ids:
            print(f"Conformer embedding failed for molecule with DrugbankID {drug_id}: {smiles}")
            indices_to_remove.append(idx)
            continue

        # Perform UFF optimization on each conformer (best effort)
        for conf_id in conf_ids:
            try:
                status = AllChem.UFFOptimizeMolecule(mol, confId=conf_id, maxIters=1000)
                # A non-zero status means the minimisation did not converge
                # or the force field could not be set up.
                if status != 0:
                    print(f"UFF optimization did not converge (status {status}) for molecule with DrugbankID {drug_id}: {smiles}")
            except Exception as e:
                print(f"UFF optimization failed for molecule with DrugbankID {drug_id}: {smiles}")
                print(e)

        # Save the molecule to an SDF file named after its DrugbankID. The
        # context manager closes the writer even if write() raises.
        sdf_filename = SDF_DIR / f'mol_{drug_id}.sdf'
        with Chem.SDWriter(str(sdf_filename)) as writer:
            writer.write(mol)

    except Exception as e:
        print(f"Processing failed for molecule with DrugbankID {drug_id}: {smiles}")
        print(e)
        indices_to_remove.append(idx)

[10:03:09] UFFTYPER: Unrecognized atom type: Co5+3 (45)
[10:03:13] UFFTYPER: Unrecognized charge state for atom: 4
[10:03:13] UFFTYPER: Unrecognized charge state for atom: 4
[10:03:26] UFFTYPER: Unrecognized charge state for atom: 93
[10:05:39] UFFTYPER: Unrecognized charge state for atom: 0
[10:05:39] UFFTYPER: Unrecognized atom type: Gd2+3 (0)
[10:05:39] UFFTYPER: Unrecognized charge state for atom: 0
[10:05:39] UFFTYPER: Unrecognized atom type: Gd2+3 (0)
[10:19:43] UFFTYPER: Unrecognized charge state for atom: 0
[10:19:43] UFFTYPER: Unrecognized charge state for atom: 0
[10:22:21] UFFTYPER: Unrecognized atom type: Fe1+2 (0)
[10:22:21] UFFTYPER: Unrecognized atom type: Fe1+2 (0)
[10:22:43] Explicit valence for atom # 13 Cl, 5, is greater than permitted


Failed to create molecule from SMILES: NC1=C(C2=C(N)N=C(N)N=C2C=C1)[Cl](=O)=O


[10:22:55] UFFTYPER: Unrecognized atom type: W_3 (15)
[10:22:55] UFFTYPER: Unrecognized atom type: W_3 (19)
[10:22:55] UFFTYPER: Unrecognized atom type: W_3 (15)
[10:22:55] UFFTYPER: Unrecognized atom type: W_3 (19)
[10:23:04] UFFTYPER: Unrecognized atom type: W_3 (15)
[10:23:04] UFFTYPER: Unrecognized atom type: W_3 (15)
[10:23:10] UFFTYPER: Unrecognized atom type: S_5+4 (16)
[10:23:10] UFFTYPER: Unrecognized atom type: S_5+4 (16)
[10:23:34] UFFTYPER: Unrecognized charge state for atom: 7
[10:23:34] UFFTYPER: Unrecognized charge state for atom: 7
[10:23:37] UFFTYPER: Unrecognized atom type: Se2+2 (5)
[10:23:37] UFFTYPER: Unrecognized atom type: Se2+2 (5)
[10:24:01] UFFTYPER: Unrecognized atom type: Zn1+2 (16)
[10:24:01] UFFTYPER: Unrecognized atom type: Zn1+2 (16)
[10:24:04] UFFTYPER: Unrecognized atom type: V_5+5 (7)
[10:24:04] UFFTYPER: Unrecognized atom type: V_5+5 (7)
[10:24:04] 

****
Pre-condition Violation
bad params pointer
Violation occurred on line 75 in file /project/build/

UFF optimization failed for molecule with DrugbankID DB03512: OC[C@@H]1O[C@H]([C@H]2O[V](O)(O)(=O)O[C@H]12)N1C=CC(=O)NC1=O
Pre-condition Violation
	bad params pointer
	Violation occurred on line 75 in file Code/ForceField/UFF/AngleBend.cpp
	Failed Expression: at2Params
	RDKIT: 2023.03.2
	BOOST: 1_78



[10:24:26] UFFTYPER: Unrecognized atom type: Mo5+6 (20)
[10:24:26] UFFTYPER: Unrecognized atom type: Mo5+6 (20)
[10:24:26] 

****
Pre-condition Violation
bad params pointer
Violation occurred on line 75 in file /project/build/temp.linux-x86_64-cpython-311/rdkit/Code/ForceField/UFF/AngleBend.cpp
Failed Expression: at2Params
****



UFF optimization failed for molecule with DrugbankID DB03983: NC1=NC2=C(N[C@@H]3[C@H](N2)O[C@@H](COP(O)(O)=O)C2=C3S[Mo](S)(=O)(=O)S2)C(=O)N1
Pre-condition Violation
	bad params pointer
	Violation occurred on line 75 in file Code/ForceField/UFF/AngleBend.cpp
	Failed Expression: at2Params
	RDKIT: 2023.03.2
	BOOST: 1_78



[10:24:27] UFFTYPER: Unrecognized atom type: Zn1+2 (10)
[10:24:27] UFFTYPER: Unrecognized atom type: Zn1+2 (10)
[10:24:34] Explicit valence for atom # 6 Be, 4, is greater than permitted


Failed to create molecule from SMILES: N[C@@H](C[C@H](O)O[Be](F)(F)F)C(O)=O


[10:24:36] UFFTYPER: Unrecognized charge state for atom: 7
[10:24:36] UFFTYPER: Unrecognized charge state for atom: 7
[10:24:38] Unusual charge on atom 0 number of radical electrons set to zero
[10:24:38] Unusual charge on atom 0 number of radical electrons set to zero
[10:24:38] UFFTYPER: Unrecognized hybridization for atom: 0
[10:24:38] UFFTYPER: Unrecognized charge state for atom: 0
[10:24:38] UFFTYPER: Unrecognized atom type: Zn+2 (0)
[10:24:38] UFFTYPER: Unrecognized hybridization for atom: 0
[10:24:38] UFFTYPER: Unrecognized charge state for atom: 0
[10:24:38] UFFTYPER: Unrecognized atom type: Zn+2 (0)
[10:24:38] Unusual charge on atom 0 number of radical electrons set to zero
[10:29:59] UFFTYPER: Unrecognized charge state for atom: 0
[10:29:59] UFFTYPER: Unrecognized atom type: Ga+3 (0)
[10:29:59] UFFTYPER: Unrecognized charge state for atom: 0
[10:29:59] UFFTYPER: Unrecognized atom type: Ga+3 (0)
[10:29:59] UFFTYPER: Unrecognized charge state for atom: 0
[10:29:59] UFFTYPER: Un

Failed to create molecule from SMILES: CN(CCO[P@](O)(=O)O[P@@](O)(=O)O[Be-](F)(F)F)C1=CC=CC=C1[N+]([O-])=O


[10:44:53] UFFTYPER: Unrecognized charge state for atom: 7
[10:44:53] UFFTYPER: Unrecognized charge state for atom: 7
[10:44:58] UFFTYPER: Unrecognized atom type: Pt3+2 (0)
[10:44:58] UFFTYPER: Unrecognized atom type: Pt3+2 (3)
[10:48:19] UFFTYPER: Unrecognized charge state for atom: 0
[10:48:19] UFFTYPER: Unrecognized atom type: Gd2+3 (0)
[10:48:19] UFFTYPER: Unrecognized charge state for atom: 0
[10:48:19] UFFTYPER: Unrecognized atom type: Gd2+3 (0)
[10:48:42] UFFTYPER: Unrecognized atom type: Au6+3 (6)
[10:48:42] UFFTYPER: Unrecognized atom type: Au6+3 (6)
[10:48:42] UFFTYPER: Unrecognized atom type: Tc3+5 (0)
[10:48:42] UFFTYPER: Unrecognized atom type: Tc3+5 (0)
[10:48:44] UFFTYPER: Unrecognized atom type: Fe2+2 (0)
[10:48:44] UFFTYPER: Unrecognized atom type: Fe2+2 (0)
[10:48:44] UFFTYPER: Unrecognized atom type: Fe2+2 (0)
[10:48:44] UFFTYPER: Unrecognized atom type: Fe2+2 (0)
[10:48:44] UFFTYPER: Unrecognized atom type: Fe2+2 (0)
[10:48:44] UFFTYPER: Unrecognized atom type: Fe2+

In [5]:
# Report how many rows were flagged for removal.
flagged_count = len(indices_to_remove)
print(flagged_count)

3


In [None]:
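# Optional: uncomment the two lines below to save a copy of the DataFrame without the flagged rows.
# NOTE(review): the input file is read from '../Data/' (capital D) while this path uses '../data/';
# on a case-sensitive filesystem these are different directories - confirm the intended output location.
#df_cleaned = df.drop(indices_to_remove)
#df_cleaned.to_csv('../data/cleaned_drug_protein_interactions.csv', index=False, sep="|")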