# Data preprocesing

Zuzanna Gorczyca, zgo@kth.se

Alga Nour Elimane, nealga@kth.se

Tse An Shih, tashih@kth.se

### Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
import os
from Functions import *

from rdkit import Chem
from rdkit.Chem import Descriptors
import rdkit.Chem.rdMolDescriptors as d
import rdkit.Chem.Lipinski as l
import rdkit.Chem.Fragments as f
from rdkit.Chem import AllChem



#ignore warnings
import warnings
warnings.filterwarnings('ignore')



### Data loading

In [2]:
dir_path = os.getcwd()
train_path = os.path.join(dir_path, "datasets", "training_smiles.csv")
test_path = os.path.join(dir_path, "datasets", "test_smiles.csv")

orig_train_data = pd.read_csv(train_path, index_col="INDEX")
orig_test_data = pd.read_csv(test_path, index_col="INDEX")

print("Shape: ", orig_train_data.shape, "\nInfo: ")
orig_train_data.info()
orig_train_data.head()

Shape:  (202895, 2) 
Info: 
<class 'pandas.core.frame.DataFrame'>
Index: 202895 entries, 1 to 202895
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   SMILES  202895 non-null  object 
 1   ACTIVE  202895 non-null  float64
dtypes: float64(1), object(1)
memory usage: 4.6+ MB


Unnamed: 0_level_0,SMILES,ACTIVE
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1
1,O=C(Nc1ccc2c(c1)OCCO2)C1CCN(c2ncccn2)CC1,0.0
2,COCCCN1C(=O)C2C(C(=O)Nc3cccc(Cl)c3)C3C=CC2(O3)...,0.0
3,CCSc1ncc(Cl)c(C(=O)Nc2ccccc2C)n1,0.0
4,COc1ccc2cc(/C=N/NC(=O)CN(c3ccccc3C)S(=O)(=O)c3...,0.0
5,CCCC(=O)Nc1nc2ccc(NC(=O)c3c(F)c(F)c(OC)c(F)c3F...,0.0


### Duplicates

In [3]:
dups = orig_train_data[orig_train_data.duplicated(keep=False)].sort_values("SMILES")
dups

Unnamed: 0_level_0,SMILES,ACTIVE
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1
53753,C#CCN(C)CCCOc1ccc(Cl)cc1Cl,0.0
76642,C#CCN(C)CCCOc1ccc(Cl)cc1Cl,0.0
62428,C#CCN(C)Cc1ccccc1,0.0
187254,C#CCN(C)Cc1ccccc1,0.0
57327,C=CCc1cc(OC)ccc1OCC(O)CN1CCN(c2ccc(OC)cc2)CC1,0.0
...,...,...
88625,c1cnc(N2CCN(Cc3ccc4c(c3)OCO4)CC2)nc1,0.0
2664,c1cnc(N2CCN(Cc3ccc4c(c3)OCO4)CC2)nc1,0.0
118198,c1cnc(N2CCN(Cc3ccc4c(c3)OCO4)CC2)nc1,0.0
92987,c1cncc(C2CCCCN2)c1,0.0


Deleting duplicates

In [4]:
duplicates = orig_train_data[orig_train_data.duplicated(subset="SMILES", keep=False)].sort_values(by="SMILES")

conflicting_smiles = duplicates.groupby('SMILES').filter(lambda x: x['ACTIVE'].nunique() > 1)
train_data = orig_train_data[~orig_train_data['SMILES'].isin(conflicting_smiles['SMILES'])]
train_data = train_data.drop_duplicates(subset='SMILES', keep='first')

print(f"\nRemoved {len(orig_train_data) - len(train_data)} rows total")
print(f"New training data shape: {train_data.shape}")


Removed 239 rows total
New training data shape: (202656, 2)


There is no missing values

In [5]:
pd.DataFrame({'Null Values':train_data.isnull().sum()})

Unnamed: 0,Null Values
SMILES,0
ACTIVE,0


## Feature selection RDKit

In [6]:
train_data.shape, train_data.columns, train_data.head()

((202656, 2),
 Index(['SMILES', 'ACTIVE'], dtype='object'),
                                                   SMILES  ACTIVE
 INDEX                                                           
 1               O=C(Nc1ccc2c(c1)OCCO2)C1CCN(c2ncccn2)CC1     0.0
 2      COCCCN1C(=O)C2C(C(=O)Nc3cccc(Cl)c3)C3C=CC2(O3)...     0.0
 3                       CCSc1ncc(Cl)c(C(=O)Nc2ccccc2C)n1     0.0
 4      COc1ccc2cc(/C=N/NC(=O)CN(c3ccccc3C)S(=O)(=O)c3...     0.0
 5      CCCC(=O)Nc1nc2ccc(NC(=O)c3c(F)c(F)c(OC)c(F)c3F...     0.0)

In [7]:
display(train_data)

Unnamed: 0_level_0,SMILES,ACTIVE
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1
1,O=C(Nc1ccc2c(c1)OCCO2)C1CCN(c2ncccn2)CC1,0.0
2,COCCCN1C(=O)C2C(C(=O)Nc3cccc(Cl)c3)C3C=CC2(O3)...,0.0
3,CCSc1ncc(Cl)c(C(=O)Nc2ccccc2C)n1,0.0
4,COc1ccc2cc(/C=N/NC(=O)CN(c3ccccc3C)S(=O)(=O)c3...,0.0
5,CCCC(=O)Nc1nc2ccc(NC(=O)c3c(F)c(F)c(OC)c(F)c3F...,0.0
...,...,...
202891,O=S1(=O)CC2(N3CCOCC3)CCCCCCC21,0.0
202892,Cc1ccc(C2c3c(oc4ccccc4c3=O)C(=O)N2c2nnc(C(C)C)...,0.0
202893,COc1ccc(CC(=O)OCC(=O)NCc2ccc(Cl)cc2)cc1,0.0
202894,CC1CN(CCOCCOc2ccccc2-c2ccccc2)CC(C)O1,0.0


In [8]:

FP_RADIUS = 2
FP_BITS = 124

columns_names = [
    "NoAtoms",
    "CalcExactMolWt",
    "fr_Al_COO",
    "HeavyAtomCount",
    "NumHDonors",
    "NumHAcceptors",
] + [f"MFp_{i}" for i in range(FP_BITS)]

fpgen = AllChem.GetMorganGenerator(radius=FP_RADIUS, fpSize=FP_BITS)

def smile_to_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    fp = fpgen.GetFingerprint(mol)
    features = [
        mol.GetNumAtoms(),
        d.CalcExactMolWt(mol),
        f.fr_Al_COO(mol),
        l.HeavyAtomCount(mol),
        l.NumHDonors(mol),
        l.NumHAcceptors(mol),
    ]
    return np.concatenate([features, fp])

rows = []
valid_idx = []

for i, smiles in enumerate(train_data["SMILES"]):
    if i % 200 == 0:
        print(f"Processed {i}")
    row = smile_to_descriptors(smiles)
    if row is not None:
        rows.append(row)
        valid_idx.append(train_data.index[i])

descriptors = pd.DataFrame(rows, columns=columns_names, index=valid_idx)
df = descriptors.copy()
df["ACTIVE"] = train_data.loc[valid_idx, "ACTIVE"]

display(df.head())
df.info()

Processed 0
Processed 200
Processed 400
Processed 600
Processed 800
Processed 1000
Processed 1200
Processed 1400
Processed 1600
Processed 1800
Processed 2000
Processed 2200
Processed 2400
Processed 2600
Processed 2800
Processed 3000
Processed 3200
Processed 3400
Processed 3600
Processed 3800
Processed 4000
Processed 4200
Processed 4400
Processed 4600
Processed 4800
Processed 5000
Processed 5200
Processed 5400
Processed 5600
Processed 5800
Processed 6000
Processed 6200
Processed 6400
Processed 6600
Processed 6800
Processed 7000
Processed 7200
Processed 7400
Processed 7600
Processed 7800
Processed 8000
Processed 8200
Processed 8400
Processed 8600
Processed 8800
Processed 9000
Processed 9200
Processed 9400
Processed 9600
Processed 9800
Processed 10000
Processed 10200
Processed 10400
Processed 10600
Processed 10800
Processed 11000
Processed 11200
Processed 11400
Processed 11600
Processed 11800
Processed 12000
Processed 12200
Processed 12400
Processed 12600
Processed 12800
Processed 13000
P



Processed 58400
Processed 58600
Processed 58800
Processed 59000
Processed 59200
Processed 59400
Processed 59600
Processed 59800
Processed 60000
Processed 60200
Processed 60400
Processed 60600
Processed 60800
Processed 61000
Processed 61200
Processed 61400
Processed 61600
Processed 61800
Processed 62000
Processed 62200
Processed 62400
Processed 62600
Processed 62800
Processed 63000
Processed 63200
Processed 63400
Processed 63600
Processed 63800
Processed 64000
Processed 64200
Processed 64400
Processed 64600
Processed 64800
Processed 65000
Processed 65200
Processed 65400
Processed 65600
Processed 65800
Processed 66000
Processed 66200
Processed 66400
Processed 66600
Processed 66800
Processed 67000
Processed 67200
Processed 67400
Processed 67600
Processed 67800
Processed 68000
Processed 68200
Processed 68400
Processed 68600
Processed 68800
Processed 69000
Processed 69200
Processed 69400
Processed 69600
Processed 69800
Processed 70000
Processed 70200
Processed 70400
Processed 70600
Processe

Unnamed: 0,NoAtoms,CalcExactMolWt,fr_Al_COO,HeavyAtomCount,NumHDonors,NumHAcceptors,MFp_0,MFp_1,MFp_2,MFp_3,...,MFp_115,MFp_116,MFp_117,MFp_118,MFp_119,MFp_120,MFp_121,MFp_122,MFp_123,ACTIVE
1,25.0,340.153541,0.0,25.0,1.0,6.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,35.0,501.203049,0.0,35.0,2.0,5.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
3,20.0,307.054611,0.0,20.0,1.0,4.0,1.0,1.0,0.0,1.0,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
4,36.0,522.112854,0.0,36.0,1.0,6.0,0.0,0.0,1.0,1.0,...,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
5,30.0,441.077025,0.0,30.0,2.0,5.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


<class 'pandas.core.frame.DataFrame'>
Index: 202656 entries, 1 to 202895
Columns: 131 entries, NoAtoms to ACTIVE
dtypes: float64(131)
memory usage: 204.1 MB


## Preprocesing of extracted fearures
Data Info

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 202656 entries, 1 to 202895
Columns: 131 entries, NoAtoms to ACTIVE
dtypes: float64(131)
memory usage: 204.1 MB


Missing values in rows = invalid SMILES

In [23]:
pd.DataFrame({'Null Values':df.isnull().sum()})

Unnamed: 0,Null Values
NoAtoms,0
CalcExactMolWt,0
fr_Al_COO,0
HeavyAtomCount,0
NumHDonors,0
...,...
MFp_120,0
MFp_121,0
MFp_122,0
MFp_123,0


In [24]:
df_new = df.dropna(axis=0, how='any', inplace=False, ignore_index=False)

In [25]:
display(df_new)

Unnamed: 0,NoAtoms,CalcExactMolWt,fr_Al_COO,HeavyAtomCount,NumHDonors,NumHAcceptors,MFp_0,MFp_1,MFp_2,MFp_3,...,MFp_115,MFp_116,MFp_117,MFp_118,MFp_119,MFp_120,MFp_121,MFp_122,MFp_123,ACTIVE
1,25.0,340.153541,0.0,25.0,1.0,6.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,35.0,501.203049,0.0,35.0,2.0,5.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
3,20.0,307.054611,0.0,20.0,1.0,4.0,1.0,1.0,0.0,1.0,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
4,36.0,522.112854,0.0,36.0,1.0,6.0,0.0,0.0,1.0,1.0,...,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
5,30.0,441.077025,0.0,30.0,2.0,5.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202891,18.0,273.139865,0.0,18.0,0.0,4.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
202892,29.0,407.093977,0.0,29.0,0.0,7.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
202893,24.0,347.092436,0.0,24.0,1.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
202894,26.0,355.214744,0.0,26.0,0.0,4.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [26]:
y = df_new["ACTIVE"]

desc_cols = [
    "NoAtoms",
    "CalcExactMolWt",
    "fr_Al_COO",
    "HeavyAtomCount",
    "NumHDonors",
    "NumHAcceptors"
]

fp_cols = [c for c in df_new.columns if c.startswith("MFp_")]

#Descriptors only
X_desc = df_new[desc_cols]

#Descriptors+Morgan fingerprints
X_desc_fp = df_new[desc_cols + fp_cols]

Normalization + saved mask for normalization for procesing test data befor testking prediction

In [27]:
norm_train_data, normalisation_mask = create_normalization(df_new, "minmax")
display(norm_train_data)

Unnamed: 0,NoAtoms,CalcExactMolWt,fr_Al_COO,HeavyAtomCount,NumHDonors,NumHAcceptors,MFp_0,MFp_1,MFp_2,MFp_3,...,MFp_115,MFp_116,MFp_117,MFp_118,MFp_119,MFp_120,MFp_121,MFp_122,MFp_123,ACTIVE
1,0.072508,0.064142,0.0,0.072508,0.013514,0.092308,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.102719,0.098677,0.0,0.102719,0.027027,0.076923,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
3,0.057402,0.057045,0.0,0.057402,0.013514,0.061538,1.0,1.0,0.0,1.0,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
4,0.105740,0.103160,0.0,0.105740,0.013514,0.092308,0.0,0.0,1.0,1.0,...,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
5,0.087613,0.085784,0.0,0.087613,0.027027,0.076923,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202891,0.051360,0.049773,0.0,0.051360,0.000000,0.061538,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
202892,0.084592,0.078497,0.0,0.084592,0.000000,0.107692,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
202893,0.069486,0.065630,0.0,0.069486,0.013514,0.061538,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
202894,0.075529,0.067372,0.0,0.075529,0.000000,0.061538,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


Discretization of columns "NoAtoms", "CalcExactMolWt", "HeavyAtomCount" + saved mask for normalization for procesing test data befor testking prediction

In [28]:
discret_df, discretization_mask = create_bins(norm_train_data[["NoAtoms", "CalcExactMolWt", "fr_Al_COO", "HeavyAtomCount", "NumHDonors", "NumHAcceptors"]], nobins=10)#, bintype="equal-size")
df_test = apply_bins(norm_train_data, discretization_mask)
display(df_test)

['NoAtoms', 'CalcExactMolWt', 'fr_Al_COO', 'HeavyAtomCount', 'NumHDonors', 'NumHAcceptors']


Unnamed: 0,NoAtoms,CalcExactMolWt,fr_Al_COO,HeavyAtomCount,NumHDonors,NumHAcceptors,MFp_0,MFp_1,MFp_2,MFp_3,...,MFp_115,MFp_116,MFp_117,MFp_118,MFp_119,MFp_120,MFp_121,MFp_122,MFp_123,ACTIVE
1,4,4,0,4,0,3,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,9,9,0,9,1,2,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
3,1,2,0,1,0,1,1.0,1.0,0.0,1.0,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
4,9,9,0,9,0,3,0.0,0.0,1.0,1.0,...,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
5,8,8,0,8,1,2,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202891,0,1,0,0,0,1,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
202892,7,7,0,7,0,4,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
202893,4,4,0,4,0,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
202894,5,5,0,5,0,1,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [29]:
df_test.drop(columns=["ACTIVE"], inplace=False).nunique()

NoAtoms           10
CalcExactMolWt    10
fr_Al_COO          1
HeavyAtomCount    10
NumHDonors         3
                  ..
MFp_119            2
MFp_120            2
MFp_121            2
MFp_122            2
MFp_123            2
Length: 130, dtype: int64

In [30]:
df_test.describe()

Unnamed: 0,MFp_0,MFp_1,MFp_2,MFp_3,MFp_4,MFp_5,MFp_6,MFp_7,MFp_8,MFp_9,...,MFp_115,MFp_116,MFp_117,MFp_118,MFp_119,MFp_120,MFp_121,MFp_122,MFp_123,ACTIVE
count,202656.0,202656.0,202656.0,202656.0,202656.0,202656.0,202656.0,202656.0,202656.0,202656.0,...,202656.0,202656.0,202656.0,202656.0,202656.0,202656.0,202656.0,202656.0,202656.0,202656.0
mean,0.114677,0.23573,0.300174,0.395883,0.16717,0.781151,0.248919,0.293384,0.21238,0.425001,...,0.154483,0.129421,0.114677,0.166454,0.248549,0.341505,0.41417,0.919992,0.116197,0.061691
std,0.318633,0.424455,0.458334,0.489041,0.373128,0.413467,0.432388,0.455314,0.408993,0.494344,...,0.361412,0.335666,0.318633,0.372489,0.432173,0.474216,0.492579,0.271305,0.320462,0.240594
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [31]:
df.corr(numeric_only=True)["ACTIVE"].sort_values(ascending=False)

ACTIVE    1.000000
MFp_25    0.165878
MFp_96    0.106884
MFp_65    0.087286
MFp_18    0.071368
            ...   
MFp_5    -0.059076
MFp_80   -0.060763
MFp_64   -0.062166
MFp_84   -0.076822
MFp_74   -0.138736
Name: ACTIVE, Length: 131, dtype: float64

Save new preprocesed data

In [32]:
df_test.to_csv("./datasets/training_smiles_processed5.csv")

In [33]:
norm_train_data.to_csv("./datasets/training_smiles_processed3.csv")

In [148]:
df_new.to_csv("./datasets/training_smiles_processed4.csv")