## Dataset using Jazzy descriptors

In [1]:

import gc  # Garbage collector
import sys
import logging
import numpy as np
import pandas as pd
import concurrent.futures
from tqdm import tqdm  # For progress bar

sys.path.append('../../scripts/')
from smiles_property_extractor import ChemicalInfoFromSmiles


In [2]:

#dataset after cleaning
db_name = '../../datasets/processed/cleaned_data_v1.csv'

#dataset wo outliers
#db_name = '../datasets/cleaned_data_v2.csv'

db = pd.read_csv(db_name)


In [3]:

print('Generating a Jazzy dataset ...')
db_ = db[['molindx', 'SMILES', 'Temperature', 'ExperimentalSolubilityInWater']]

db_ = db_.assign(logS=lambda x: (np.log(x['ExperimentalSolubilityInWater'])))

db_['ExperimentalSolubilityInWater'] = db_['ExperimentalSolubilityInWater']
db_['logS'] = db_['logS'].round(5)


Generating a Jazzy dataset ...


### Generate the database from jazzy 2D descriptors "Set number of processors based on you system"

In [None]:

def compute_descriptors(smiles):
    """Compute descriptors for a given SMILES string."""
    jazzydescriptors_eeq = ChemicalInfoFromSmiles.get_yukawa_potential_from_jazzy(smiles, atomic_property='eeq')
    jazzydescriptors_alp = ChemicalInfoFromSmiles.get_yukawa_potential_from_jazzy(smiles, atomic_property='alp')
    jazzydescriptors = ChemicalInfoFromSmiles.get_molecular_vector_from_smiles_jazzy(smiles)
    return jazzydescriptors_eeq, jazzydescriptors_alp, jazzydescriptors

def process_data_chunk(smiles_list):
    """Process a chunk of data and return the results."""
    data_eeq = []
    data_alp = []
    data = []

    for smiles in smiles_list:
        jazzydescriptors_eeq, jazzydescriptors_alp, jazzydescriptors = compute_descriptors(smiles)
        data_eeq.append(jazzydescriptors_eeq)
        data_alp.append(jazzydescriptors_alp)
        data.append(jazzydescriptors)

    return data_eeq, data_alp, data

# Adjust the threshold to a higher number based on your dataset size
gc.set_threshold(1000)

print('This will take some time. Please be patient...')
print('Check "number_of_processors" for your system.')

smiles_list = db_['SMILES'].tolist()

# Use a ThreadPoolExecutor to parallelize the computations
number_of_processors = 8
# Adjust number of workers based on your system
num_workers = min(number_of_processors, len(smiles_list)) 

chunk_size = len(smiles_list) // num_workers


data_eeq = [None] * len(smiles_list)
data_alp = [None] * len(smiles_list)
data = [None] * len(smiles_list)

with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as executor:
    # Map futures to their original positions
    future_to_index = {
        executor.submit(process_data_chunk, smiles_list[i:i + chunk_size]): i
        for i in range(0, len(smiles_list), chunk_size)
    }

    # Collect results and place them in the correct positions
    for future in tqdm(concurrent.futures.as_completed(future_to_index), total=len(future_to_index)):
        chunk_start_index = future_to_index[future]  # Get the start index of the chunk
        try:
            results = future.result()
            # Determine the end index of the chunk
            chunk_end_index = min(chunk_start_index + chunk_size, len(smiles_list))
            # Extract results for this chunk
            result_eeq, result_alp, result_data = results
            # Place results in the correct positions
            data_eeq[chunk_start_index:chunk_end_index] = result_eeq
            data_alp[chunk_start_index:chunk_end_index] = result_alp
            data[chunk_start_index:chunk_end_index] = result_data
        except Exception as exc:
            print(f'Chunk generated an exception: {exc}')
           
gc.collect() # Optionally, you can run garbage collection at the end


This will take some time. Please be patient...
Check "number_of_processors" for your system.


 89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍             | 8/9 [29:19<01:43, 103.91s/it]

In [None]:

df = pd.DataFrame(data_eeq)
df.columns = ['rdf_eeq']
df1 = df.round(5)

df = pd.DataFrame(data_alp)
df.columns = ['rdf_alp']
df2 = df.round(5)

df3 = pd.DataFrame(data)
df3 = df3.round(5)


In [None]:

#This is the database for running Machine learning...
db = pd.concat([db_, df1, df2, df3], axis=1)
db


In [None]:

#filtration 3 (for attributes)
db_filtered = db.loc[:, ~(db == 0).all()]
number_of_columns_with_zeros = db.shape[1] - db_filtered.shape[1]
print(f'{number_of_columns_with_zeros} columns are fully zero.')


In [None]:

#filtration 4 (for attributes)
db_ = db_filtered.dropna(axis=1)
number_of_columns_with_Nans = db_filtered.shape[1] - db_.shape[1]
print(f'{number_of_columns_with_Nans} columns are fully np.nan.')


In [None]:

is_scaled = False
if is_scaled:
    db_['Temperature'] = db_['Temperature'] / max(db_['Temperature'])
    db_['sdc'] = db_['sdc'] / max(db_['sdc'])
    db_['sdx'] = db_['sdx'] / max(db_['sdx'])
    db_['sa'] = db_['sa'] / max(db_['sa'])
    db_['dga'] = - db_['dga'] / min(db_['dga'])
    db_['dgp'] = - db_['dgp'] / min(db_['dgp'])
    db_['dgtot'] = - db_['dgtot'] / min(db_['dgtot'])


In [None]:

db_new = db_.drop(['ExperimentalSolubilityInWater', 'SMILES'], axis='columns')
db_new


In [None]:

db_new.to_csv('../../datasets/processed/datasetJazzyDescriptors.csv', index=False)

#dataset wo outliers
#db_new.to_csv('../../datasets/processed/datasetJazzyDescriptors_v2.csv', index=False)
