## Dataset using RDKit 2D descriptors

In [1]:

import gc  # Garbage collector
import sys
import numpy as np
import pandas as pd

path_to_scripts = '../../scripts/'
sys.path.append(path_to_scripts)
from smiles_property_extractor import ChemicalInfoFromSmiles


In [2]:

# Input file: the dataset produced by the cleaning step (v1).
db_name = '../../datasets/processed/cleaned_data_v1.csv'

# Alternative input: the dataset without outliers.
# NOTE(review): this commented-out path does not use the
# '../../datasets/processed/' prefix of the active path above -- confirm it
# before enabling.
#db_name = '../datasets/cleaned_data_v2.csv'

# Load the selected CSV into the working DataFrame.
db = pd.read_csv(filepath_or_buffer=db_name)


In [3]:

print('Generating a new dataset of smiles, raw and transformend  solubilities ...')

# Keep the molecule index, SMILES, temperature and measured solubility, then:
#   1. add logS = natural log of the (still unrounded) solubility, rounded to
#      5 decimals;
#   2. round the raw solubility itself to 3 decimals.
# The order matters: logS must be derived before the raw column is rounded.
db_ = (
    db[['molindx', 'SMILES', 'Temperature', 'ExperimentalSolubilityInWater']]
    .assign(logS=lambda frame: np.log(frame['ExperimentalSolubilityInWater']).round(5))
    .assign(
        ExperimentalSolubilityInWater=lambda frame: frame['ExperimentalSolubilityInWater'].round(3)
    )
)


Generating a new dataset of smiles, raw and transformend  solubilities ...


### Generate the database from RDKit 2D descriptors

In [4]:

from joblib import Parallel, delayed

# Worker executed once per SMILES string by the parallel loop below.
def process_row(smiles):
    """Compute the descriptors for a single SMILES string.

    Thin wrapper around
    ``ChemicalInfoFromSmiles.get_rdkit_2dDescriptors_from_smiles``; the value
    returned by that helper is passed through unchanged.

    Parameters
    ----------
    smiles
        One entry of the ``SMILES`` column.

    Returns
    -------
    Whatever the helper returns for ``smiles`` (presumably a mapping of
    descriptor name to value -- TODO confirm against the helper module).
    """
    descriptors = ChemicalInfoFromSmiles.get_rdkit_2dDescriptors_from_smiles(smiles)
    return descriptors

# Fan the per-molecule descriptor computation out over all available cores
# (n_jobs=-1) with joblib.
print('This process might take a bit of time. Please be patient.')
descriptor_rows = Parallel(n_jobs=-1)(
    delayed(process_row)(smiles) for smiles in db_['SMILES']
)

# Assemble the per-molecule results into a single DataFrame, one row per SMILES.
df = pd.DataFrame(descriptor_rows)


This process might take a bit of time. Please be patient.


In [5]:

# df was computed from db_['SMILES'], one row per SMILES, so the two tables
# correspond row-by-row *by position*. pd.concat(axis=1) aligns on index
# labels instead, which would silently yield NaN-padded or misaligned rows if
# the indexes ever differed. Guard the row count and give df the index of db_
# so the join is explicitly positional.
assert len(df) == len(db_), (
    f'descriptor table has {len(df)} rows but solubility table has {len(db_)} rows'
)
db = pd.concat([db_, df.set_axis(db_.index, axis=0)], axis=1)

# SMILES and the raw solubility are not wanted as features/targets any more;
# `columns=` already implies the column axis, so no separate `axis` is needed.
db = db.drop(columns=['SMILES', 'ExperimentalSolubilityInWater'])
db


Unnamed: 0,molindx,Temperature,logS,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,0,298.15,10.81559,5.319136,4.440892e-16,5.319136,4.440892e-16,0.617367,184.413,184.413,...,0,0,0,0,0,0,0,0,0,0
1,1,288.15,5.62659,10.310926,-9.305556e-01,10.310926,6.597222e-01,0.550532,213.105,210.081,...,0,0,0,0,0,0,0,0,0,0
2,2,297.15,9.92904,10.753893,-1.913613e+00,10.753893,3.575926e-01,0.617257,257.114,254.090,...,0,0,0,0,0,0,0,0,0,0
3,3,298.15,7.20786,10.505370,-1.456667e+00,10.505370,4.899074e-01,0.592368,212.117,208.085,...,0,0,0,0,0,0,0,0,0,0
4,4,289.15,6.27377,10.447315,-1.025463e+00,10.447315,5.648148e-01,0.593359,247.550,245.534,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4630,4630,298.15,6.27897,10.531111,-2.534722e-01,10.531111,2.534722e-01,0.405757,142.183,136.135,...,0,0,0,0,0,0,0,0,0,0
4631,4631,293.15,5.48050,5.432816,-2.308866e+00,5.432816,8.910683e-02,0.756509,281.303,269.207,...,0,0,0,0,0,0,0,0,0,0
4632,4632,298.15,7.82406,10.988586,-1.106000e+00,10.988586,4.643713e-02,0.745556,381.020,369.932,...,0,0,0,0,0,0,0,0,0,0
4633,4633,295.15,3.13549,12.562207,-3.905929e+00,12.562207,2.312117e-02,0.642151,331.353,318.249,...,0,1,0,0,0,0,0,0,0,0


In [6]:

#filtration 3 (for attributes)
# Flag every column whose entries are all equal to 0, keep the remaining
# columns, and report how many were removed.
all_zero_mask = db.eq(0).all()
db_filtered = db.loc[:, ~all_zero_mask]
number_of_columns_with_zeros = len(db.columns) - len(db_filtered.columns)
print(f'{number_of_columns_with_zeros} columns are fully zero.')


4 columns are fully zero.


In [7]:

#filtration 4 (for attributes)
# dropna(axis=1) uses the default how='any': a column is removed as soon as it
# contains a single NaN, not only when it is entirely NaN. The report below
# is worded accordingly.
# NOTE: db_ is rebound here; from this point on it is the filtered descriptor
# table and no longer the SMILES/solubility table built earlier.
db_ = db_filtered.dropna(axis=1)
number_of_columns_with_Nans = db_filtered.shape[1] - db_.shape[1]
print(f'{number_of_columns_with_Nans} columns contain NaN values and were dropped.')

db_


0 columns are fully np.nan.


Unnamed: 0,molindx,Temperature,logS,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,0,298.15,10.81559,5.319136,4.440892e-16,5.319136,4.440892e-16,0.617367,184.413,184.413,...,0,0,0,0,0,0,0,0,0,0
1,1,288.15,5.62659,10.310926,-9.305556e-01,10.310926,6.597222e-01,0.550532,213.105,210.081,...,0,0,0,0,0,0,0,0,0,0
2,2,297.15,9.92904,10.753893,-1.913613e+00,10.753893,3.575926e-01,0.617257,257.114,254.090,...,0,0,0,0,0,0,0,0,0,0
3,3,298.15,7.20786,10.505370,-1.456667e+00,10.505370,4.899074e-01,0.592368,212.117,208.085,...,0,0,0,0,0,0,0,0,0,0
4,4,289.15,6.27377,10.447315,-1.025463e+00,10.447315,5.648148e-01,0.593359,247.550,245.534,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4630,4630,298.15,6.27897,10.531111,-2.534722e-01,10.531111,2.534722e-01,0.405757,142.183,136.135,...,0,0,0,0,0,0,0,0,0,0
4631,4631,293.15,5.48050,5.432816,-2.308866e+00,5.432816,8.910683e-02,0.756509,281.303,269.207,...,0,0,0,0,0,0,0,0,0,0
4632,4632,298.15,7.82406,10.988586,-1.106000e+00,10.988586,4.643713e-02,0.745556,381.020,369.932,...,0,0,0,0,0,0,0,0,0,0
4633,4633,295.15,3.13549,12.562207,-3.905929e+00,12.562207,2.312117e-02,0.642151,331.353,318.249,...,0,1,0,0,0,0,0,0,0,0


In [8]:

# Persist the filtered descriptor table; index=False keeps the row index out
# of the written file.
db_.to_csv(
    path_or_buf='../../datasets/processed/datasetRdkitDescriptors.csv',
    index=False,
)

# Alternative output file for the dataset without outliers:
#db_.to_csv('../../datasets/processed/datasetRdkitDescriptors_v2.csv',  index=False)
