In [14]:
import pandas as pd
import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, SaltRemover
from rdkit import RDLogger
from molvs import Standardizer
import matplotlib.pyplot as plt

In [15]:
# Index of the cross-validation fold to inspect and the directory holding all folds.
i = 0
data_path = "../../Data/result/nci60/CV10"

# Read the three splits of the selected fold (same order as before: train, test, val).
train, test, val = (
    pd.read_csv(f"{data_path}/fold{i}/{split}.csv")
    for split in ("train", "test", "val")
)

In [20]:
def preprocess_smiles(smiles):
    """Parse a SMILES string and return its canonical isomeric SMILES.

    The molecule is parsed, sanitized, kekulized and has its stereochemistry
    re-assigned before being written back out.  No salt stripping or charge
    neutralization happens in this function.

    Parameters
    ----------
    smiles : str
        SMILES string to clean up.

    Returns
    -------
    str or None
        The isomeric SMILES written by RDKit, or ``None`` when ``smiles`` is
        not a string or cannot be parsed.  In both failure cases an
        ``Invalid SMILES: ...`` message is printed.
    """
    # Non-string values (e.g. None/NaN coming from a missing table cell) make
    # RDKit raise rather than return None, so report them as invalid here.
    if not isinstance(smiles, str):
        print("Invalid SMILES:", smiles)
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        print("Invalid SMILES:", smiles)
        return None

    # Sanitize the molecule
    Chem.SanitizeMol(mol)

    # Only molecules with at least one atom go through the kekulization /
    # stereochemistry clean-up; an empty molecule is written out as is.
    if mol.GetNumAtoms():
        Chem.Kekulize(mol)
        Chem.AssignStereochemistry(mol, cleanIt=True, force=True)
        Chem.SanitizeMol(mol)
        Chem.DetectBondStereochemistry(mol)
        Chem.AssignStereochemistry(mol, cleanIt=True, force=True)

    return Chem.MolToSmiles(mol, isomericSmiles=True)

In [21]:
def smiles2morgan(smiles, radius=2, nBits=1024):
    """Check whether a Morgan fingerprint can be built for ``smiles``.

    The input is run through ``preprocess_smiles``, parsed, salt-stripped,
    standardized, fingerprinted and converted to a NumPy array.

    Parameters
    ----------
    smiles : str
        SMILES string to test.
    radius : int, optional
        Radius passed to the Morgan fingerprint (default 2).
    nBits : int, optional
        Length of the fingerprint bit vector (default 1024).

    Returns
    -------
    None or object
        ``None`` when every step succeeds; otherwise the ``smiles`` argument
        itself, so a caller can collect the entries that failed.

    Notes
    -----
    The fingerprint array is built only to exercise the pipeline and is NOT
    returned.  Callers filter on ``is not None`` to find failures, so the
    return contract is kept as is.
    """
    try:
        # Preprocessing runs inside the try so that an exception raised there
        # is reported as a failure like any other step.  A None result means
        # the SMILES could not be parsed (a message was already printed), so
        # there is no point in parsing it a second time.
        if preprocess_smiles(smiles) is None:
            return smiles

        mol = Chem.MolFromSmiles(smiles)
        remover = SaltRemover.SaltRemover()  # remove salt
        mol = remover.StripMol(mol)
        s = Standardizer()  # standardize molecule
        mol = s.standardize(mol)
        features_vec = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
        # Placeholder destination; presumably resized by ConvertToNumpyArray
        # to the fingerprint length -- TODO confirm against the RDKit docs.
        features = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(features_vec, features)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still
        # interrupt a long loop over many SMILES.
        return smiles
    return None

In [17]:
# Peek at the first rows of the training split to check which columns were loaded.
train.head()

Unnamed: 0,cell_line,panel,smiles,pIC50
0,MDA-MB-231,BRE,NC(=O)c1sc(cc1OCc2ccccc2Br)n3cnc4cc(ccc34)C(F)...,-1.3979
1,MDA-MB-231,BRE,C\C=C(\C)/C(=O)O[C@@H]1CCN2CC[C@H](COC(=O)\C(=...,-2.0
2,SNB75,CNS,C=CC(=O)NC(=N)NC#N,-2.0
3,EKVX,LNS,Clc1ccc(CNNC(=O)Nc2cccc3ccccc23)cc1,-2.0
4,NCI-H23,LNS,CC\C(=N/NC(=O)c1cccnc1)\c2ccc3OCCOc3c2,-2.0


In [18]:
# Distinct SMILES strings in the training split; the length shown below is
# the number of unique compounds.
smiles = train['smiles'].unique()
len(smiles)

56034

In [22]:
# Probe a small slice of the unique SMILES.  smiles2morgan hands back the
# input on failure and None on success, so only the non-None results are kept.
fails = [
    outcome
    for outcome in map(smiles2morgan, smiles[20:30])
    if outcome is not None
]
len(fails)

Invalid SMILES: Cn1cnnc1.Cn2c[n+](cn2)[Ru+3]([ClH-])([ClH-])([ClH-])([ClH-])[n+]3cnn(C)c3


[22:52:37] Explicit valence for atom # 13 Cl, 3, is greater than permitted
[22:52:37] Explicit valence for atom # 13 Cl, 3, is greater than permitted


1

In [23]:
# Show the SMILES strings collected as failures by the probe.
fails

['Cn1cnnc1.Cn2c[n+](cn2)[Ru+3]([ClH-])([ClH-])([ClH-])([ClH-])[n+]3cnn(C)c3']

In [24]:
# Parse the failing SMILES directly with RDKit to see the underlying error;
# the log line emitted for it reports an explicit-valence problem on a Cl atom.
mol = Chem.MolFromSmiles('Cn1cnnc1.Cn2c[n+](cn2)[Ru+3]([ClH-])([ClH-])([ClH-])([ClH-])[n+]3cnn(C)c3')

[22:53:47] Explicit valence for atom # 13 Cl, 3, is greater than permitted


In [25]:
# Display the parse result (presumably None, given the valence error logged
# while parsing -- TODO confirm).
mol