# Data preprocessing

Zuzanna Gorczyca, zgo@kth.se

Alga Nour Elimane, nealga@kth.se

Tse An Shih, tashih@kth.se

### Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
import os

from rdkit import Chem
from rdkit.Chem import Descriptors
import rdkit.Chem.rdMolDescriptors as d
import rdkit.Chem.Lipinski as l
import rdkit.Chem.Fragments as f

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

### Data loading

In [2]:
# Both CSV files are resolved against the kernel's current working directory.
dir_path = os.getcwd()
train_path = os.path.join(dir_path, "training_smiles.csv")
test_path = os.path.join(dir_path, "test_smiles.csv")

# The INDEX column of each file becomes the DataFrame index.
orig_train_data = pd.read_csv(train_path, index_col="INDEX")
orig_test_data = pd.read_csv(test_path, index_col="INDEX")

# Only the last expression of a cell is rendered, so a bare `.shape` in the
# middle of the cell was silently discarded; print it explicitly instead.
print(orig_train_data.shape)
orig_train_data.info()
orig_train_data.head()

<class 'pandas.core.frame.DataFrame'>
Index: 202895 entries, 1 to 202895
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   SMILES  202895 non-null  object 
 1   ACTIVE  202895 non-null  float64
dtypes: float64(1), object(1)
memory usage: 4.6+ MB


Unnamed: 0_level_0,SMILES,ACTIVE
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1
1,O=C(Nc1ccc2c(c1)OCCO2)C1CCN(c2ncccn2)CC1,0.0
2,COCCCN1C(=O)C2C(C(=O)Nc3cccc(Cl)c3)C3C=CC2(O3)...,0.0
3,CCSc1ncc(Cl)c(C(=O)Nc2ccccc2C)n1,0.0
4,COc1ccc2cc(/C=N/NC(=O)CN(c3ccccc3C)S(=O)(=O)c3...,0.0
5,CCCC(=O)Nc1nc2ccc(NC(=O)c3c(F)c(F)c(OC)c(F)c3F...,0.0


In [4]:
# keep=False marks every member of a duplicate group, not just the repeats,
# so identical rows end up next to each other once sorted by SMILES.
duplicate_mask = orig_train_data.duplicated(keep=False)
dups = orig_train_data.loc[duplicate_mask].sort_values("SMILES")
dups

Unnamed: 0_level_0,SMILES,ACTIVE
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1
53753,C#CCN(C)CCCOc1ccc(Cl)cc1Cl,0.0
76642,C#CCN(C)CCCOc1ccc(Cl)cc1Cl,0.0
62428,C#CCN(C)Cc1ccccc1,0.0
187254,C#CCN(C)Cc1ccccc1,0.0
57327,C=CCc1cc(OC)ccc1OCC(O)CN1CCN(c2ccc(OC)cc2)CC1,0.0
...,...,...
88625,c1cnc(N2CCN(Cc3ccc4c(c3)OCO4)CC2)nc1,0.0
2664,c1cnc(N2CCN(Cc3ccc4c(c3)OCO4)CC2)nc1,0.0
118198,c1cnc(N2CCN(Cc3ccc4c(c3)OCO4)CC2)nc1,0.0
92987,c1cncc(C2CCCCN2)c1,0.0


In [5]:
# Drop rows that are identical in every column, keeping the first occurrence.
# Rows that share a SMILES but differ in another column are NOT collapsed.
train_data = orig_train_data.drop_duplicates(keep='first')

# The duplicate count used to be computed and thrown away (it was not the
# cell's last expression); make it an explicit sanity check instead.
remaining_duplicates = train_data.duplicated().sum()
assert remaining_duplicates == 0, f"{remaining_duplicates} duplicate rows remain"
train_data.shape

(202672, 2)

In [6]:
# Per-column count of missing entries, shown as a one-column table.
train_data.isnull().sum().to_frame(name='Null Values')

Unnamed: 0,Null Values
SMILES,0
ACTIVE,0


## Feature selection with RDKit

In [7]:
# Quick overview of the de-duplicated training frame: size, columns, first rows.
(
    train_data.shape,
    train_data.columns,
    train_data.head(),
)

((202672, 2),
 Index(['SMILES', 'ACTIVE'], dtype='object'),
                                                   SMILES  ACTIVE
 INDEX                                                           
 1               O=C(Nc1ccc2c(c1)OCCO2)C1CCN(c2ncccn2)CC1     0.0
 2      COCCCN1C(=O)C2C(C(=O)Nc3cccc(Cl)c3)C3C=CC2(O3)...     0.0
 3                       CCSc1ncc(Cl)c(C(=O)Nc2ccccc2C)n1     0.0
 4      COc1ccc2cc(/C=N/NC(=O)CN(c3ccccc3C)S(=O)(=O)c3...     0.0
 5      CCCC(=O)Nc1nc2ccc(NC(=O)c3c(F)c(F)c(OC)c(F)c3F...     0.0)

In [18]:
def smile_to_descriptors(smiles):
    """Turn one SMILES string into a dict of RDKit descriptor values.

    Parameters
    ----------
    smiles : str
        SMILES representation of a molecule, passed to ``Chem.MolFromSmiles``.

    Returns
    -------
    dict or None
        Mapping with the keys ``MolWt``, ``HeavyAtomCount``, ``NumHDonors``,
        ``NumHAcceptors`` and ``fr_Al_COO`` (in that order), or ``None`` when
        ``Chem.MolFromSmiles`` returns ``None`` for the input.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return None

    # (output key, RDKit calculator) pairs, evaluated in this order.
    # NOTE(review): "MolWt" is filled by CalcExactMolWt, not Descriptors.MolWt
    # -- confirm that the exact-mass variant is the intended quantity.
    calculators = (
        ("MolWt", d.CalcExactMolWt),
        ("HeavyAtomCount", l.HeavyAtomCount),
        ("NumHDonors", l.NumHDonors),
        ("NumHAcceptors", l.NumHAcceptors),
        ("fr_Al_COO", f.fr_Al_COO),
    )
    return {key: calculate(molecule) for key, calculate in calculators}

# One descriptor record per training molecule. smile_to_descriptors returns
# None when the SMILES cannot be parsed; a None entry is not a valid record in
# a list of dicts for pd.DataFrame, so substitute an empty record, which
# becomes an all-NaN row that is easy to spot and filter later.
descriptors = [
    smile_to_descriptors(smiles) or {}
    for smiles in train_data["SMILES"]
]

# Build the frame once, directly on the training index (the earlier un-indexed
# construction was immediately overwritten and only duplicated the work), so
# the label column below aligns row-for-row.
df = pd.DataFrame(descriptors, index=train_data.index)
df["ACTIVE"] = train_data["ACTIVE"]



In [19]:
# Preview the first five descriptor rows together with the ACTIVE label.
df.head(n=5)

Unnamed: 0_level_0,MolWt,HeavyAtomCount,NumHDonors,NumHAcceptors,fr_Al_COO,ACTIVE
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,340.153541,25,1,6,0,0.0
2,501.203049,35,2,5,0,0.0
3,307.054611,20,1,4,0,0.0
4,522.112854,36,1,6,0,0.0
5,441.077025,30,2,5,0,0.0


In [22]:
# Number of distinct values per descriptor column (label column excluded).
df.drop(columns="ACTIVE").nunique()

MolWt             69856
HeavyAtomCount      173
NumHDonors           58
NumHAcceptors        58
fr_Al_COO             8
dtype: int64

In [24]:
# Correlation of every numeric column with ACTIVE (pandas' default method),
# listed from the most positive to the most negative value.
(
    df.corr(numeric_only=True)
    .loc[:, "ACTIVE"]
    .sort_values(ascending=False)
)

ACTIVE            1.000000
NumHDonors        0.049411
MolWt            -0.033146
fr_Al_COO        -0.033323
HeavyAtomCount   -0.044986
NumHAcceptors    -0.050029
Name: ACTIVE, dtype: float64