In [1]:
from rdkit import Chem
from rdkit.Chem import Descriptors
import pandas as pd
from rdkit.rdBase import BlockLogs
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.Chem import AllChem
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
test_run = True

### Helper Functions

In [3]:
def smiles_standardise(smiles, neutralize=True):
    """ input a smiles, return a standardised molecule
    """
    mol = Chem.MolFromSmiles(smiles)
    block = BlockLogs() # Block all RDkit logging
    
    # Normalizing functional groups, remove Hs, disconnect metal atoms
    # https://molvs.readthedocs.io/en/latest/guide/standardize.html
    clean_mol = rdMolStandardize.Cleanup(mol) 
    # Get parents fragments
    parent_clean_mol = rdMolStandardize.FragmentParent(clean_mol)
    # Neutralise
    if neutralize:
        uncharger = rdMolStandardize.Uncharger()
        uncharged_parent_clean_mol = uncharger.uncharge(parent_clean_mol)
    else:
        uncharged_parent_clean_mol = parent_clean_mol
    del block # Release logging block to previous state
    return(uncharged_parent_clean_mol)


def fp_generator(mol, radius=2, nBits=2048):
    """ Input an RDKit molecule, return an ECFP bit vector
    """
    return(AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits))


def retrieve_rdkit_desc(mol):
    """ Input an RDKit molecule, return RDKit descriptors
    """
    desc_list = []
    for i in range(len(Chem.Descriptors.descList)):
        desc_list.append(Chem.Descriptors.descList[i][1](mol))
    return desc_list


def retrieve_rdkit_desc_name():
    """ Return name of RDKit descriptors
    """
    desc_list = []
    for i in range(len(Chem.Descriptors.descList)):
        desc_list.append(Chem.Descriptors.descList[i][0])
    return desc_list

### 0. Reading files and inits

In [4]:
df_CP_features = pd.read_csv("output/CP_feature_df.csv")
if test_run:
    df_CP_feature = df_CP_features.head()
else:
    df_CP_feature = df_CP_features
df_compounds_smiles = df_CP_feature.iloc[:,0:6]
df_CPf = df_CP_feature.iloc[:,6:]

In [5]:
# Standardisation of molecules 
df_compounds_smiles['RDKit_mol'] = df_compounds_smiles['CPD_SMILES'].apply(smiles_standardise)

### 1. Extract Morgan Fingerprints

In [6]:
# Create a dataframe of Morgan Fingerprints
df_FPf = df_compounds_smiles.copy(deep=True)
df_FPf["ECFP"] = df_FPf["RDKit_mol"].apply(fp_generator)
df_FPf = pd.DataFrame(np.stack(df_FPf["ECFP"]))
# Rename
df_FPf.columns = [f"ecfp_{i}" for i in df_FPf.columns]
df_FPf

Unnamed: 0,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,ecfp_5,ecfp_6,ecfp_7,ecfp_8,ecfp_9,...,ecfp_2038,ecfp_2039,ecfp_2040,ecfp_2041,ecfp_2042,ecfp_2043,ecfp_2044,ecfp_2045,ecfp_2046,ecfp_2047
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


### 2. Extract RDKit descriptors

In [7]:
# Create a dataframe of RDKit Descriptors
df_RDKit_f = df_compounds_smiles.copy(deep=True)
df_RDKit_f["RDKit"] = df_RDKit_f["RDKit_mol"].apply(retrieve_rdkit_desc)
df_RDKit_f = pd.DataFrame(np.stack(df_RDKit_f["RDKit"]))
# Rename columns
list_rdkit_desc = retrieve_rdkit_desc_name()
df_RDKit_f.columns = [f"RDKit_{i}" for i in list_rdkit_desc]
df_RDKit_f

Unnamed: 0,RDKit_MaxEStateIndex,RDKit_MinEStateIndex,RDKit_MaxAbsEStateIndex,RDKit_MinAbsEStateIndex,RDKit_qed,RDKit_MolWt,RDKit_HeavyAtomMolWt,RDKit_ExactMolWt,RDKit_NumValenceElectrons,RDKit_NumRadicalElectrons,...,RDKit_fr_sulfide,RDKit_fr_sulfonamd,RDKit_fr_sulfone,RDKit_fr_term_acetylene,RDKit_fr_tetrazole,RDKit_fr_thiazole,RDKit_fr_thiocyan,RDKit_fr_thiophene,RDKit_fr_unbrch_alkane,RDKit_fr_urea
0,12.136547,-0.367353,12.136547,0.367353,0.559907,294.395,268.187,294.194343,118.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,9.508312,0.205613,9.508312,0.205613,0.810559,226.275,212.163,226.09938,86.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,12.126902,-0.906229,12.126902,0.001289,0.59924,390.52,356.248,390.240624,156.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,12.161683,-3.476898,12.161683,0.290281,0.781978,265.338,250.218,265.088498,96.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,10.514642,-0.720975,10.514642,0.243641,0.405857,350.499,316.227,350.24571,142.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0


### Feature Selection

In [8]:
# Remove columns which are all 0s
num_bits_before = df_FPf.shape[1]
df_FPf = df_FPf.loc[:,(df_FPf!=0).any(axis=0)]
num_bits_after = df_FPf.shape[1]
print(f"ECFP: {num_bits_before - num_bits_after} bits removed, from {num_bits_before} to {num_bits_after} bits")

num_rdkit_before = df_RDKit_f.shape[1]
df_RDKit_f = df_RDKit_f.loc[:,(df_RDKit_f!=0).any(axis=0)]
num_rdkit_after = df_RDKit_f.shape[1]
print(f"RDKit descriptors: {num_bits_before - num_bits_after} descriptors removed, from {num_bits_before} to {num_bits_after} descriptors")

ECFP: 1883 bits removed, from 2048 to 165 bits
RDKit descriptors: 1883 descriptors removed, from 2048 to 165 descriptors


In [None]:
# Remove columns whose correlations are above 0.95


### Standardising features

In [12]:
scaler = StandardScaler()
scaler.fit(df_RDKit_f)
pd.DataFrame(scaler.transform(df_RDKit_f), columns=df_RDKit_f.columns)

Unnamed: 0,RDKit_MaxEStateIndex,RDKit_MinEStateIndex,RDKit_MaxAbsEStateIndex,RDKit_MinAbsEStateIndex,RDKit_qed,RDKit_MolWt,RDKit_HeavyAtomMolWt,RDKit_ExactMolWt,RDKit_NumValenceElectrons,RDKit_MaxPartialCharge,...,RDKit_fr_bicyclic,RDKit_fr_ester,RDKit_fr_ether,RDKit_fr_lactone,RDKit_fr_methoxy,RDKit_fr_phenol,RDKit_fr_phenol_noOrthoHbond,RDKit_fr_pyridine,RDKit_fr_sulfonamd,RDKit_fr_unbrch_alkane
0,0.776218,0.54012,0.776218,1.187886,-0.478864,-0.187366,-0.245959,-0.186936,-0.060343,0.916607,...,-0.862662,1.224745,1.581139,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.75
1,-1.632581,0.991364,-1.632581,-0.130616,1.197473,-1.346574,-1.355283,-1.346329,-1.267212,-1.801318,...,-0.862662,-0.816497,0.0,-0.5,2.0,2.0,2.0,-0.5,-0.5,-0.75
2,0.767379,0.115723,0.767379,-1.796257,-0.215807,1.448408,1.497726,1.44836,1.372813,0.782395,...,1.833157,1.224745,0.0,2.0,-0.5,-0.5,-0.5,-0.5,-0.5,-0.75
3,0.799255,-1.908829,0.799255,0.559596,1.006329,-0.681833,-0.60176,-0.682496,-0.890066,-0.334659,...,-0.323498,-0.816497,-1.581139,-0.5,-0.5,-0.5,-0.5,2.0,2.0,0.5
4,-0.710271,0.261622,-0.710271,0.179391,-1.509131,0.767365,0.705275,0.767401,0.844808,0.436976,...,0.215666,-0.816497,0.0,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,1.75


In [13]:
df_RDKit_f

Unnamed: 0,RDKit_MaxEStateIndex,RDKit_MinEStateIndex,RDKit_MaxAbsEStateIndex,RDKit_MinAbsEStateIndex,RDKit_qed,RDKit_MolWt,RDKit_HeavyAtomMolWt,RDKit_ExactMolWt,RDKit_NumValenceElectrons,RDKit_MaxPartialCharge,...,RDKit_fr_bicyclic,RDKit_fr_ester,RDKit_fr_ether,RDKit_fr_lactone,RDKit_fr_methoxy,RDKit_fr_phenol,RDKit_fr_phenol_noOrthoHbond,RDKit_fr_pyridine,RDKit_fr_sulfonamd,RDKit_fr_unbrch_alkane
0,12.136547,-0.367353,12.136547,0.367353,0.559907,294.395,268.187,294.194343,118.0,0.341467,...,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,9.508312,0.205613,9.508312,0.205613,0.810559,226.275,212.163,226.09938,86.0,0.122645,...,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
2,12.126902,-0.906229,12.126902,0.001289,0.59924,390.52,356.248,390.240624,156.0,0.330661,...,5.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,12.161683,-3.476898,12.161683,0.290281,0.781978,265.338,250.218,265.088498,96.0,0.240726,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
4,10.514642,-0.720975,10.514642,0.243641,0.405857,350.499,316.227,350.24571,142.0,0.302851,...,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


In [9]:
# Sklearn standard scaler