## Datasets using w-ECFPs

In [1]:

import numpy as np
import pandas as pd
from pathlib import Path

# Make the project's helper scripts importable from this notebook.
# `sys` must be imported before touching sys.path; without it a fresh kernel
# raises NameError on the append below.
import sys

path_to_scripts = '../../scripts'
# Guard against piling up duplicate entries when the cell is re-run.
if path_to_scripts not in sys.path:
    sys.path.append(path_to_scripts)
# NOTE(review): smiles_property_extractor is presumably located in ../../scripts — confirm.
from smiles_property_extractor import Fingerprint


In [2]:

# Input table: the cleaned dataset (v1).
db_name = '../../datasets/processed/cleaned_data_v1.csv'

# Alternative input: the dataset without outliers (v2).
#db_name = '../../datasets/processed/cleaned_data_v2.csv'

# Comma is pandas' default delimiter, so it does not need to be spelled out.
db = pd.read_csv(db_name)


In [3]:

print('checking duplications based on number of SMILES ...')
# Show every row whose SMILES string already occurred in an earlier row.
db.loc[db.duplicated(subset='SMILES')]


checking duplications based on number of SMILES ...


Unnamed: 0,molindx,Name,SMILES,MolarMass,CASNumber,ExperimentalSolubilityInWater,Temperature,ExperimentReference,Datagroup,SolvationEnergy,...,CalMR,TPSA,LabuteASA,BalabanJIndex,BertzCTIndex,Isomer,Aromatic,Cyclic,ChargeNeutral,MadeOfKeyElements


In [4]:

print('Generating a new database :')

# Build the working table in one chain so every step yields a new frame:
#   * logS        - natural logarithm (np.log) of the experimental solubility,
#                   rounded to 5 decimals;
#   * Temperature - divided by the column maximum.
# Series.max() is used instead of the builtin max(): the builtin compares
# element by element and gives an order-dependent result (possibly NaN) when
# the column contains missing values, whereas Series.max() skips NaN.
db_ = (
    db[['molindx', 'SMILES', 'Temperature', 'ExperimentalSolubilityInWater']]
    .assign(
        logS=lambda frame: np.log(frame['ExperimentalSolubilityInWater']).round(5),
        Temperature=lambda frame: frame['Temperature'] / frame['Temperature'].max(),
    )
)

# db_ keeps the raw solubility column; db_new is the same table without it.
db_new = db_.drop(columns='ExperimentalSolubilityInWater')
db_new


Generating a new database :


Unnamed: 0,molindx,SMILES,Temperature,logS
0,0,C1(=NC(=NC(=N1)Cl)Cl)Cl,0.635511,10.81559
1,1,C1=C(C=C(C=C1[N+](=O)[O-])[N+](=O)[O-])[N+](=O...,0.614196,5.62659
2,2,C1=C(C=C(C(=C1[N+](=O)[O-])C(=O)O)[N+](=O)[O-]...,0.633380,9.92904
3,3,C1=C(C=C(C=C1[N+](=O)[O-])[N+](=O)[O-])C(=O)O,0.635511,7.20786
4,4,[O-][N+](=O)C1=CC(=C(Cl)C(=C1)[N+]([O-])=O)[N+...,0.616327,6.27377
...,...,...,...,...
4630,4630,CC1=CC(=O)N=C(N1)S,0.635511,6.27897
4631,4631,COP(=S)(OC)SCC1=NC(=NC(=N1)N)N,0.624853,5.48050
4632,4632,CC(=O)N[C@@H](CC1=CC(=C(C(=C1)Br)O)Br)C(=O)O,0.635511,7.82406
4633,4633,CN1/C(=C(\NC2=CC=CC=N2)/O)/C(=O)C3=CC=CC=C3S1(...,0.629116,3.13549


In [5]:

# Settings handed to the Fingerprint helper and to generate_ecfp below.
DATA_PATH = "../../datasets/processed/"
diameter = 6
nbits = 1024
is_numbering_fragments = True

f = Fingerprint(db_, DATA_PATH, is_numbering_fragments)

# Load stored fingerprints when the CSV is present; otherwise compute them.
# NOTE(review): nothing in this cell writes the CSV, so the computation runs
# on every execution unless the file was produced elsewhere — confirm intent.
fingerprint_csv = '%s/wecfp-fingerprints.csv' % DATA_PATH
if not Path(fingerprint_csv).exists():
    print('Computing features from scratch... THIS FAILS!! ')
    f.generate_ecfp(nbits=nbits, diameter=diameter)      # 1024
    xbits = pd.DataFrame(f.fingerprints)
else:
    print('Reading features from file...')
    xbits = pd.read_csv(fingerprint_csv)
    

Computing features from scratch... THIS FAILS!! 


Processing fingerprints: 100%|████████████████████████████████████████████████████| 4635/4635 [02:12<00:00, 34.96it/s]


In [6]:

# Rows of the fingerprint table that exactly repeat an earlier row.
xbits.loc[xbits.duplicated()]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
3080,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### TODO: The duplicated rows in `xbits` should be removed, or the underlying problem that produces them should be fixed.

In [7]:

# Keep only the non-structural columns (molindx, Temperature, logS);
# `columns=` already names the axis, so no separate `axis` argument is needed.
db_org = db_.drop(columns=['SMILES', 'ExperimentalSolubilityInWater'])
db_org


Unnamed: 0,molindx,Temperature,logS
0,0,0.635511,10.81559
1,1,0.614196,5.62659
2,2,0.633380,9.92904
3,3,0.635511,7.20786
4,4,0.616327,6.27377
...,...,...,...
4630,4630,0.635511,6.27897
4631,4631,0.624853,5.48050
4632,4632,0.635511,7.82406
4633,4633,0.629116,3.13549


In [8]:

# Place the fingerprint bits and the molindx/Temperature/logS columns side by side;
# pandas aligns the two frames on their row index.
data = pd.concat((xbits, db_org), axis='columns')
data


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1017,1018,1019,1020,1021,1022,1023,molindx,Temperature,logS
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.635511,10.81559
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.614196,5.62659
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2,0.633380,9.92904
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,0.635511,7.20786
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4,0.616327,6.27377
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4630,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4630,0.635511,6.27897
4631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4631,0.624853,5.48050
4632,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4632,0.635511,7.82406
4633,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4633,0.629116,3.13549


In [9]:

# Write the combined table to disk without the row index.
data.to_csv(
    '../../datasets/processed/dataset-w-ECFPDescriptors.csv',
    index=False,
)

# Alternative target when working with the dataset without outliers:
#data.to_csv('../../datasets/processed/dataset-w-ECFPDescriptors_v2.csv',  index=False)
