In [4]:
import numpy as np
import pandas as pd
from rdkit import Chem

In [2]:
# Load the tab-separated SMILES / CAS number / Ames label table.
# Column names are supplied here via `names`, so the first line of the file is read as data.
data = pd.read_csv(
    'https://raw.githubusercontent.com/cptlab/ASCEPT_2023_comptox_workshop/main/W109_Machine_learning_QSAR_for_toxicity_prediction/data/smiles_cas_N6512_corrected.smi',
    sep='\t',
    names=['SMILES', 'CASRN', 'Ames'],
)

In [3]:
# Preview the first two rows of the raw table.
data.head(n=2)

Unnamed: 0,SMILES,CASRN,Ames
0,O=C1c2ccccc2C(=O)c3c1ccc4c3[nH]c5c6C(=O)c7cccc...,2475-33-4,0
1,NNC(=O)CNC(=O)\C=N#N,820-75-7,1


In [5]:
# Here we use Pat Walters' canonical SMILES recipe.

def canonical_smiles(smiles):
    """Return the RDKit canonical SMILES for each input SMILES string.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings to canonicalise (e.g. a list or a pandas Series).

    Returns
    -------
    list of str
        Canonical SMILES, in the same order as the input.

    Raises
    ------
    ValueError
        If RDKit cannot parse one or more of the input strings. The message
        reports how many failed and shows the first few offenders.
    """
    # Materialise once so one-shot iterables can be walked twice below.
    inputs = list(smiles)
    mols = [Chem.MolFromSmiles(smi) for smi in inputs]

    # MolFromSmiles signals a parse failure by returning None; report the bad
    # inputs by name instead of letting MolToSmiles fail on None.
    failed = [smi for smi, mol in zip(inputs, mols) if mol is None]
    if failed:
        raise ValueError(
            f"RDKit could not parse {len(failed)} SMILES string(s), e.g. {failed[:5]!r}"
        )

    return [Chem.MolToSmiles(mol) for mol in mols]

In [6]:
# Canonicalise every entry of the SMILES column (returns a plain list).
Canon_SMILES = canonical_smiles(data['SMILES'])

In [7]:
# The frame still holds the original SMILES at this point.
data.head(n=2)

Unnamed: 0,SMILES,CASRN,Ames
0,O=C1c2ccccc2C(=O)c3c1ccc4c3[nH]c5c6C(=O)c7cccc...,2475-33-4,0
1,NNC(=O)CNC(=O)\C=N#N,820-75-7,1


In [8]:
# Put the canonical SMILES into the SMILES column, replacing the originals.
# Compare the SMILES in row 1 (the second row) with the earlier preview to
# see what canonicalisation has changed.
data = data.assign(SMILES=Canon_SMILES)
data.head(n=2)

Unnamed: 0,SMILES,CASRN,Ames
0,O=c1c2ccccc2c(=O)c2c1ccc1c2[nH]c2c3c(=O)c4cccc...,2475-33-4,0
1,[N-]=[N+]=CC(=O)NCC(=O)NN,820-75-7,1


In [8]:
# Add a molecule column and make sure RDKit can convert all SMILES.
from rdkit import Chem, DataStructs
from rdkit.Chem import PandasTools, AllChem
PandasTools.AddMoleculeColumnToFrame(data, 'SMILES', 'Molecule')

# A SMILES that RDKit cannot parse shows up as a missing entry in the new
# column. Stop here with a clear message rather than failing later during
# fingerprint generation.
n_unparsed = int(data['Molecule'].isna().sum())
assert n_unparsed == 0, f"RDKit could not convert {n_unparsed} SMILES to molecules"

data[["SMILES", "Molecule"]].head(1)

Unnamed: 0,SMILES,Molecule
0,O=c1c2ccccc2c(=O)c2c1ccc1c2[nH]c2c3c(=O)c4cccc...,"<img data-content=""rdkit/molecule"" src=""data:i..."


In [9]:
def mol2fp(mol, radius=2, n_bits=2048):
    """Convert an RDKit molecule into a hashed Morgan fingerprint array.

    Parameters
    ----------
    mol : rdkit.Chem.Mol
        Molecule to fingerprint.
    radius : int, default 2
        Morgan radius passed to RDKit.
    n_bits : int, default 2048
        Length of the hashed fingerprint (an earlier version of this
        notebook used 4096).

    Returns
    -------
    numpy.ndarray
        1-D ``int8`` array holding the fingerprint, one element per bit position.

    Raises
    ------
    ValueError
        If ``mol`` is None, i.e. the SMILES could not be parsed upstream.
    """
    if mol is None:
        raise ValueError("mol2fp received None; the SMILES could not be parsed into a molecule")

    fp = AllChem.GetHashedMorganFingerprint(mol, radius, nBits=n_bits)

    # Placeholder array; ConvertToNumpyArray resizes it to the fingerprint length.
    # NOTE(review): the hashed Morgan fingerprint appears to hold counts, and
    # int8 tops out at 127 -- confirm that is sufficient for this dataset.
    ar = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, ar)
    return ar

In [10]:
# Build one fingerprint array per row from the RDKit molecule objects.
data_fp = data['Molecule'].apply(mol2fp)
data_fp.head(n=2)

0    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: Molecule, dtype: object

In [11]:
# Expand the per-molecule fingerprint arrays into one column per bit position.
# Stacking the arrays in one go avoids constructing a pandas Series for every
# row, which is what `.apply(pd.Series)` does and is slow on thousands of rows.
# The original row index is kept so the result still aligns with `data`.
data_fp = pd.DataFrame(np.stack(data_fp.to_numpy()), index=data_fp.index)

In [12]:
# Place the fingerprint columns to the right of the original columns
# (column-wise concatenation, aligned on the row index).
data2 = pd.concat((data, data_fp), axis='columns')
data2.head(n=1)

Unnamed: 0,SMILES,CASRN,Ames,Molecule,0,1,2,3,4,5,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,O=c1c2ccccc2c(=O)c2c1ccc1c2[nH]c2c3c(=O)c4cccc...,2475-33-4,0,"<img data-content=""rdkit/molecule"" src=""data:i...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
