In [1]:
from rdkit import Chem
from rdkit.Chem import Descriptors
import pandas as pd
from rdkit.rdBase import BlockLogs
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.Chem import AllChem
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
# Quick-test switch: when True, only the first rows (DataFrame.head()) of the
# Cell Painting feature table are processed instead of the full table.
test_run = False

### Helper Functions

In [3]:
def smiles_standardise(smiles, neutralize=True):
    """Parse a SMILES string and return a standardised RDKit molecule.

    The molecule goes through ``rdMolStandardize.Cleanup``, is reduced to its
    parent fragment and, optionally, neutralised with ``Uncharger``.

    Parameters
    ----------
    smiles : str
        SMILES string of the compound.
    neutralize : bool, default True
        When True the parent fragment is passed through
        ``rdMolStandardize.Uncharger().uncharge``; when False the parent
        fragment is returned as is.

    Returns
    -------
    rdkit.Chem.Mol
        The standardised molecule.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` (``Chem.MolFromSmiles`` returned None).
    """
    # Parsing happens before logging is blocked, so RDKit's own parse
    # diagnostics remain visible.
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Fail with the offending input instead of an opaque error raised
        # from inside the standardiser.
        raise ValueError(f"Could not parse SMILES: {smiles!r}")

    block = BlockLogs()  # Block all RDKit logging while this object is alive
    try:
        # Normalizing functional groups, remove Hs, disconnect metal atoms
        # https://molvs.readthedocs.io/en/latest/guide/standardize.html
        clean_mol = rdMolStandardize.Cleanup(mol)
        # Get parent fragment
        parent_clean_mol = rdMolStandardize.FragmentParent(clean_mol)
        if not neutralize:
            return parent_clean_mol
        # Neutralise
        return rdMolStandardize.Uncharger().uncharge(parent_clean_mol)
    finally:
        # Release the logging block even when standardisation raises;
        # otherwise the traceback would keep it alive and RDKit would stay silent.
        del block


def fp_generator(mol, radius=2, nBits=4096):
    """Compute the Morgan (ECFP) bit-vector fingerprint of an RDKit molecule.

    ``radius`` and ``nBits`` are forwarded unchanged to
    ``AllChem.GetMorganFingerprintAsBitVect``; its return value is returned.

    NOTE(review): newer RDKit releases appear to deprecate this call in favour
    of ``rdFingerprintGenerator`` - confirm before upgrading RDKit.
    """
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits)
    return fingerprint


def retrieve_rdkit_desc(mol):
    """Evaluate every descriptor in ``Chem.Descriptors.descList`` on ``mol``.

    Returns a list of descriptor values in the same order as ``descList``.
    """
    # Each descList entry is a (name, function) pair; only the function is needed here.
    return [compute(mol) for _name, compute in Chem.Descriptors.descList]


def retrieve_rdkit_desc_name():
    """Return the names of all descriptors in ``Chem.Descriptors.descList``.

    The names are listed in ``descList`` order, i.e. the same order as the
    values produced by ``retrieve_rdkit_desc``.
    """
    # Each descList entry is a (name, function) pair; only the name is needed here.
    return [name for name, _compute in Chem.Descriptors.descList]

### 0. Reading files and inits

In [4]:
# Load the Cell Painting feature table.
df_CP_features = pd.read_csv("output/CP_feature_df.csv")

# In test mode work on the first rows only (DataFrame.head()) for a quick run.
df_CP_feature = df_CP_features.head() if test_run else df_CP_features

# Split the table by position: the first six columns go to df_compounds_smiles
# (the columns INCHIKEY, CPD_SMILES and SAMPLE_KEY are read from it later),
# the remaining columns are the Cell Painting features.
# .copy() makes df_compounds_smiles an independent frame, so adding the
# 'RDKit_mol' column later does not write into a slice of df_CP_feature
# (avoids SettingWithCopyWarning and view/copy ambiguity).
df_compounds_smiles = df_CP_feature.iloc[:, 0:6].copy()
df_CPf = df_CP_feature.iloc[:, 6:]

In [5]:
# Parse and standardise every compound SMILES into an RDKit molecule
df_compounds_smiles['RDKit_mol'] = df_compounds_smiles['CPD_SMILES'].map(smiles_standardise)

### 1. Extract Morgan Fingerprints

In [6]:
# Build the Morgan fingerprint table: one row per compound, one column per bit
df_FPf = pd.DataFrame(np.stack(df_compounds_smiles["RDKit_mol"].apply(fp_generator)))
# Turn the positional column labels 0..n-1 into ecfp_0..ecfp_{n-1}
df_FPf = df_FPf.add_prefix("ecfp_")
df_FPf

Unnamed: 0,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,ecfp_5,ecfp_6,ecfp_7,ecfp_8,ecfp_9,...,ecfp_4086,ecfp_4087,ecfp_4088,ecfp_4089,ecfp_4090,ecfp_4091,ecfp_4092,ecfp_4093,ecfp_4094,ecfp_4095
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47160,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47161,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47162,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47163,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 2. Extract RDKit descriptors

In [7]:
# Build the RDKit descriptor table: one row per compound, one column per descriptor
list_rdkit_desc = retrieve_rdkit_desc_name()
df_RDKit_f = pd.DataFrame(
    np.stack(df_compounds_smiles["RDKit_mol"].apply(retrieve_rdkit_desc)),
    # Column labels are the descriptor names prefixed with "RDKit_"
    columns=[f"RDKit_{desc_name}" for desc_name in list_rdkit_desc],
)
df_RDKit_f

Unnamed: 0,RDKit_MaxEStateIndex,RDKit_MinEStateIndex,RDKit_MaxAbsEStateIndex,RDKit_MinAbsEStateIndex,RDKit_qed,RDKit_MolWt,RDKit_HeavyAtomMolWt,RDKit_ExactMolWt,RDKit_NumValenceElectrons,RDKit_NumRadicalElectrons,...,RDKit_fr_sulfide,RDKit_fr_sulfonamd,RDKit_fr_sulfone,RDKit_fr_term_acetylene,RDKit_fr_tetrazole,RDKit_fr_thiazole,RDKit_fr_thiocyan,RDKit_fr_thiophene,RDKit_fr_unbrch_alkane,RDKit_fr_urea
0,12.136547,-0.367353,12.136547,0.367353,0.559907,294.395,268.187,294.194343,118.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,9.508312,0.205613,9.508312,0.205613,0.810559,226.275,212.163,226.099380,86.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,12.126902,-0.906229,12.126902,0.001289,0.599240,390.520,356.248,390.240624,156.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,12.161683,-3.476898,12.161683,0.290281,0.781978,265.338,250.218,265.088498,96.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,10.514642,-0.720975,10.514642,0.243641,0.405857,350.499,316.227,350.245710,142.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47160,12.906797,-0.338188,12.906797,0.013328,0.730903,444.536,412.280,444.248504,174.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47161,12.906797,-0.338188,12.906797,0.013328,0.730903,444.536,412.280,444.248504,174.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47162,12.906797,-0.338188,12.906797,0.013328,0.730903,444.536,412.280,444.248504,174.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47163,12.905202,-0.349182,12.905202,0.015785,0.730903,444.536,412.280,444.248504,174.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 3. Feature Selection

In [8]:
def _drop_single_valued_columns(df, label, unit):
    """Drop columns of ``df`` that hold exactly one distinct value and report it.

    Distinct values are counted with ``DataFrame.nunique()``, which ignores
    NaN by default, so a column is removed when its non-missing entries all
    share one value.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table to filter.
    label : str
        Name of the feature set, used at the start of the printed summary.
    unit : str
        Plural noun for one column of this feature set (e.g. "bits").

    Returns
    -------
    pandas.DataFrame
        ``df`` restricted to the columns that were kept.
    """
    filtered = df.loc[:, df.nunique() != 1]
    before, after = df.shape[1], filtered.shape[1]
    print(f"{label}: {before - after} {unit} removed, from {before} to {after} {unit}")
    return filtered


# Removing columns which only have 1 value
print('Removing columns which only have 1 value...')
df_FPf = _drop_single_valued_columns(df_FPf, "ECFP", "bits")
df_RDKit_f = _drop_single_valued_columns(df_RDKit_f, "RDKit descriptors", "descriptors")
df_CPf = _drop_single_valued_columns(df_CPf, "Cell Painting features", "features")

Removing columns which only have 1 value...
ECFP: 1 bits removed, from 4096 to 4095 bits
RDKit descriptors: 6 descriptors removed, from 208 to 202 descriptors
Cell Painting features: 4 features removed, from 1783 to 1779 features


In [9]:
def _drop_highly_correlated(df, threshold=0.95):
    """Drop columns of ``df`` that are highly correlated with an earlier column.

    The absolute correlation matrix of the numeric columns is computed with
    ``DataFrame.corr`` and only its strict upper triangle is inspected, so each
    column is compared with the columns positioned before it. A column is
    dropped when any of those absolute correlations exceeds ``threshold``.
    NaN correlations never exceed the threshold, so they do not trigger a drop.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table to filter.
    threshold : float, default 0.95
        Absolute correlation above which the later column of a pair is dropped.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the dropped columns.
    """
    corr_matrix = df.corr(numeric_only=True).abs()
    # Strict upper triangle (k=1): skips the diagonal and the mirrored pairs.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)
    redundant = upper.columns[(upper > threshold).any(axis=0).to_numpy()]
    return df.drop(columns=redundant)


# Remove columns whose correlations are above 0.95

# For RDKit features
df_RDKit_f = _drop_highly_correlated(df_RDKit_f)

# For CP features
df_CPf = _drop_highly_correlated(df_CPf)

### 4. Standardising features

In [10]:
# Standardise each feature table with its own scikit-learn StandardScaler;
# the scaled values are wrapped back into a DataFrame with the original column names.

# RDKit descriptors
scaler = StandardScaler().fit(df_RDKit_f)
df_RDKit_f = pd.DataFrame(scaler.transform(df_RDKit_f), columns=df_RDKit_f.columns)

# Cell Painting features
scaler2 = StandardScaler().fit(df_CPf)
df_CPf = pd.DataFrame(scaler2.transform(df_CPf), columns=df_CPf.columns)

### 5. Save CSV files 

In [11]:
# Write one CSV per feature set: the three identifier columns followed by the
# feature columns cast to float32 (row index is not written).
id_columns = df_compounds_smiles[['INCHIKEY', 'CPD_SMILES', 'SAMPLE_KEY']]
for feature_table, csv_path in (
    (df_RDKit_f, 'output/norm_RDKit_feature_df.csv'),
    (df_CPf, 'output/norm_CP_feature_df.csv'),
    (df_FPf, 'output/norm_ECFP_feature_df.csv'),
):
    pd.concat([id_columns, feature_table.astype('float32')], axis=1).to_csv(csv_path, index=False)