# Feature engineering

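> This module provides tools to extract numerical features from compounds given as SMILES strings: RDKit molecular descriptors (`smi2prop`) and Morgan fingerprint bits (`smi2morgan`).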

In [1]:
#| default_exp feature

In [5]:
#| hide
from nbdev.showdoc import *

In [6]:
#| export
import seaborn as sns
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.ML.Descriptors import MoleculeDescriptors
import pandas as pd
from rdkit.Chem import Draw
from rdkit.Chem import Descriptors
from sklearn.preprocessing import StandardScaler

In [17]:
#| export
def smi2prop(df, # df needs to have SMILES and ID columns
             smi_colname = "SMILES", # column name of smiles
             id_colname = "ID", # column name of ID
             remove_duplicate=True, # remove features that are duplicated or constant across compounds
             normalize = False, # normalize features using StandardScaler()
            ):
    """Extract RDKit descriptor features (every entry of `Descriptors.descList`) from SMILES.

    Returns a DataFrame with one row per compound: the first column holds the values of
    `id_colname`, every other column holds one descriptor. With `remove_duplicate=True`,
    descriptors whose values duplicate an earlier descriptor, or that are constant across
    all compounds, are dropped, so the number of feature columns depends on the input.
    Raises `ValueError` if any SMILES is missing or cannot be parsed by RDKit.
    """
    ids = list(df[id_colname])
    smiles = list(df[smi_colname])

    # MolFromSmiles returns None for unparsable SMILES; non-string entries (e.g. NaN from
    # an empty CSV cell) are treated the same way so they are reported together.
    mols = [Chem.MolFromSmiles(smi) if isinstance(smi, str) else None for smi in smiles]

    # Fail early with the offending IDs instead of computing descriptors on None molecules.
    invalid = [(cid, smi) for cid, smi, mol in zip(ids, smiles, mols) if mol is None]
    if invalid:
        preview = ", ".join(f"{cid!r}: {smi!r}" for cid, smi in invalid[:5])
        raise ValueError(
            f"Could not parse SMILES for {len(invalid)} compound(s) "
            f"(showing up to 5 as ID: SMILES): {preview}"
        )

    desc_names = [name for name, _ in Descriptors.descList]
    desc_calc = MoleculeDescriptors.MolecularDescriptorCalculator(desc_names)
    desc_values = [desc_calc.CalcDescriptors(mol) for mol in mols]

    # Orient as features (rows) x compounds (columns) so the row-wise filters below
    # operate on features.
    compound = pd.DataFrame(np.stack(desc_values).T, index=desc_names, columns=df[id_colname])
    if remove_duplicate:
        compound = compound.loc[~compound.duplicated()] # drop features identical to an earlier feature
        compound = compound.loc[compound.std(axis=1) != 0] # drop features that are constant across compounds

    # Back to compounds (rows) x features (columns), with the ID as the first column.
    compound = compound.T.reset_index()
    if normalize:
        scaler = StandardScaler()
        # Column 0 is the ID; only the feature columns are scaled.
        compound.iloc[:, 1:] = scaler.fit_transform(compound.iloc[:, 1:])
    return compound

In [33]:
show_doc(smi2prop)

---

### smi2prop

>      smi2prop (df, smi_colname='SMILES', id_colname='ID',
>                remove_duplicate=True, normalize=False)

Convert a dataframe that contains [ID & smiles] to [ID & 138 features from rdkit.Chem.Descriptors]

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| df |  |  | df needs to have SMILES and ID columns |
| smi_colname | str | SMILES | column name of smiles |
| id_colname | str | ID | column name of ID |
| remove_duplicate | bool | True | remove features that are duplicated or have the same value across all compounds (std=0) |
| normalize | bool | False | normalize features using StandardScaler() |

In [21]:
# Load the example compound table; the calls below rely on its `ID` and `SMILES` columns
df = pd.read_csv('kras_smiles.csv')

In [22]:
df.shape

(62, 8)

In [23]:
# Descriptor features with every feature column standardized by StandardScaler
smi2prop(df,normalize=True)

Unnamed: 0,ID,MaxEStateIndex,MinEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,MaxPartialCharge,...,fr_halogen,fr_imidazole,fr_methoxy,fr_morpholine,fr_nitrile,fr_piperdine,fr_piperzine,fr_pyridine,fr_term_acetylene,fr_unbrch_alkane
0,G12D_1,0.975036,-0.441677,-1.213742,-1.281097,1.345582,1.510088,1.350555,1.230101,0.027035,...,2.286072,-0.128037,-0.262613,-0.182574,-0.570162,-0.212,0.182574,0.849837,3.376389,-0.128037
1,G12D_5A,0.808485,0.040612,-1.087714,-0.422660,-0.001068,0.127539,-0.010601,-0.512362,0.026650,...,1.124298,-0.128037,-0.262613,-0.182574,1.509252,-0.212,0.182574,0.849837,-0.296174,-0.128037
2,G12D_5B,0.727322,0.103262,2.806293,0.555812,-0.963962,-0.872422,-0.975432,-1.450612,0.026650,...,1.124298,-0.128037,-0.262613,-0.182574,-0.570162,-0.212,0.182574,0.849837,-0.296174,-0.128037
3,G12D_6,0.707055,0.034226,-0.223186,0.802310,-1.418955,-1.383959,-1.419776,-1.450612,0.026650,...,-0.037477,-0.128037,-0.262613,-0.182574,-0.570162,-0.212,0.182574,0.849837,-0.296174,-0.128037
4,G12D_7,0.749244,0.037652,-0.198666,0.514847,-1.072962,-1.068133,-1.073136,-1.048505,0.026649,...,-0.037477,-0.128037,-0.262613,-0.182574,-0.570162,-0.212,-5.477226,0.849837,-0.296174,-0.128037
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,G12C_20,-0.565705,-0.578406,0.298354,-0.612290,1.431544,1.495705,1.424910,1.096066,0.012762,...,1.124298,-0.128037,-0.262613,-0.182574,1.509252,-0.212,0.182574,-1.176697,-0.296174,-0.128037
58,G12C_21,-0.994819,0.330649,0.131999,-2.126292,1.728426,1.732673,1.722193,1.498173,0.012762,...,-0.037477,-0.128037,3.807887,-0.182574,1.509252,-0.212,0.182574,-1.176697,-0.296174,-0.128037
59,G12C_22,-1.100526,-0.168656,2.277858,-1.025113,1.777537,1.811530,1.771550,1.498173,0.012762,...,1.124298,-0.128037,-0.262613,-0.182574,1.509252,-0.212,0.182574,-1.176697,-0.296174,-0.128037
60,G12C_23,-0.981162,0.416288,0.862398,-1.298750,2.074419,2.048498,2.068833,1.900280,0.012762,...,-0.037477,-0.128037,3.807887,-0.182574,1.509252,-0.212,0.182574,-1.176697,-0.296174,-0.128037


In [24]:
# Same descriptor features left on their original scale (no standardization)
smi2prop(df,normalize=False)

Unnamed: 0,ID,MaxEStateIndex,MinEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,MaxPartialCharge,...,fr_halogen,fr_imidazole,fr_methoxy,fr_morpholine,fr_nitrile,fr_piperdine,fr_piperzine,fr_pyridine,fr_term_acetylene,fr_unbrch_alkane
0,G12D_1,16.725593,-0.912746,0.007777,0.319044,600.645,569.397,600.246059,226.0,0.318621,...,3.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
1,G12D_5A,16.408797,-0.546191,0.013725,0.369319,546.050,516.818,545.210614,200.0,0.318610,...,2.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
2,G12D_5B,16.254417,-0.498575,0.197532,0.426624,507.013,478.789,506.199715,186.0,0.318610,...,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,G12D_6,16.215868,-0.551045,0.054533,0.441060,488.567,459.335,488.233602,186.0,0.318610,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,G12D_7,16.296115,-0.548440,0.055691,0.424225,502.594,471.346,502.249252,192.0,0.318610,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,G12C_20,13.794964,-1.016664,0.079151,0.358213,604.130,568.850,603.252479,224.0,0.318223,...,2.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
58,G12C_21,12.978753,-0.325754,0.071299,0.269545,616.166,577.862,615.272466,230.0,0.318223,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
59,G12C_22,12.777688,-0.705241,0.172589,0.334036,618.157,580.861,617.268129,230.0,0.318223,...,2.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
60,G12C_23,13.004729,-0.260666,0.105776,0.318010,630.193,589.873,629.288116,236.0,0.318223,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [30]:
#| export
def smi2morgan(df, # a dataframe contains ID and SMILES columns
               smi_colname = "SMILES", # set smiles column name
               id_colname = "ID", # set ID column name
               radius = 2, # Morgan fingerprint radius
               n_bits = 2048, # number of fingerprint bits (one output column per bit)
              ):
    """Like `smi2prop`, get Morgan fingerprint features (0/1) given a dataframe that contains ID&smiles.

    Returns a DataFrame with one row per compound: the first column holds the values of
    `id_colname`, followed by `n_bits` columns named `morgan_0` ... `morgan_{n_bits-1}`
    (2048 columns with the defaults). Raises `ValueError` if any SMILES is missing or
    cannot be parsed by RDKit.
    """
    ids = list(df[id_colname])
    smiles = list(df[smi_colname])

    # MolFromSmiles returns None for unparsable SMILES; non-string entries (e.g. NaN from
    # an empty CSV cell) are treated the same way so they are reported together.
    mols = [Chem.MolFromSmiles(smi) if isinstance(smi, str) else None for smi in smiles]

    # Fail early with the offending IDs instead of passing None to the fingerprint call.
    invalid = [(cid, smi) for cid, smi, mol in zip(ids, smiles, mols) if mol is None]
    if invalid:
        preview = ", ".join(f"{cid!r}: {smi!r}" for cid, smi in invalid[:5])
        raise ValueError(
            f"Could not parse SMILES for {len(invalid)} compound(s) "
            f"(showing up to 5 as ID: SMILES): {preview}"
        )

    morgan_fps = [AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits) for mol in mols]

    # Each bit vector becomes one row of 0/1 values, indexed by compound ID.
    fp_df = pd.DataFrame(np.array(morgan_fps), index=df[id_colname])
    fp_df.columns = [f'morgan_{i}' for i in fp_df.columns]
    return fp_df.reset_index()

In [31]:
show_doc(smi2morgan)

---

### smi2morgan

>      smi2morgan (df, smi_colname='SMILES', id_colname='ID')

Like `smi2prop`, get 2048 morgan feature (0/1) given a dataframe that contains ID&smiles

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| df |  |  | a dataframe contains ID and SMILES columns |
| smi_colname | str | SMILES | set smiles column name |
| id_colname | str | ID | set ID column name |

In [27]:
# Morgan fingerprint bits (one 0/1 column per bit) for the same compounds
smi2morgan(df)

Unnamed: 0,ID,morgan_0,morgan_1,morgan_2,morgan_3,morgan_4,morgan_5,morgan_6,morgan_7,morgan_8,...,morgan_2038,morgan_2039,morgan_2040,morgan_2041,morgan_2042,morgan_2043,morgan_2044,morgan_2045,morgan_2046,morgan_2047
0,G12D_1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,G12D_5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,G12D_5B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,G12D_6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,G12D_7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,G12C_20,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0
58,G12C_21,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
59,G12C_22,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
60,G12C_23,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0


In [32]:
#| hide
import nbdev; nbdev.nbdev_export()