In [None]:
pip install rdkit

In [3]:
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

In [4]:
import numpy as np
import pandas as pd

In [5]:
# Load the raw compound table; the preview below shows a 'Smiles' column and a
# 'Values' label column.
dataset = pd.read_csv('E.bugandensis_ISS.csv')

In [6]:
# Sanity check: (number of rows, number of columns) of the loaded table.
dataset.shape

(11, 2)

In [7]:
# Preview the first rows to confirm the SMILES strings and labels loaded as expected.
dataset.head()

Unnamed: 0,Smiles,Values
0,[H][C@]12SCC(CSC3=NN=C(C)S3)=C(N1C(=O)[C@H]2NC...,0
1,[H][C@]12SCC(COC(N)=O)=C(N1C(=O)[C@]2(NC(=O)CC...,0
2,OC(=O)C1=CN(C2CC2)C2=CC(N3CCNCC3)=C(F)C=C2C1=O,0
3,CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(...,0
4,[H][C@@]1(CN)CC[C@@H](N)[C@@H](O[C@]2([H])[C@@...,0


In [None]:
# Count how many compounds fall into each class of the 'Values' label column
binarization_counts = dataset['Values'].value_counts()

# Express each class count as a percentage of all rows in the dataset
binarization_percentages = binarization_counts / len(dataset) * 100

print("Binarization class distribution:")
print(binarization_counts)
print("\nBinarization class percentages:")
print(binarization_percentages)

# Balance check: the spread between the most and least frequent class must be
# below 5 percentage points. For two classes this equals the difference of the
# two percentages. A column with fewer than two classes cannot be balanced, so
# it is reported as unbalanced instead of failing on a missing second entry.
if len(binarization_percentages) < 2:
    is_balanced = False
else:
    is_balanced = bool(binarization_percentages.max() - binarization_percentages.min() < 5)

print(f"\nIs the 'Binarization' column balanced (within 5% difference)? {is_balanced}")

In [11]:
# General molecular descriptors: every descriptor registered in RDKit (about 200)
def RDKit_descriptors(smiles):
    """Compute all registered RDKit descriptors for a collection of SMILES.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings to convert into molecules.

    Returns
    -------
    mol_descriptors : list of tuple
        One tuple of descriptor values per successfully parsed molecule, in
        input order.
    desc_names : sequence of str
        Descriptor names, in the same order as the values inside each tuple.

    Notes
    -----
    SMILES for which ``Chem.MolFromSmiles`` returns ``None`` are skipped, so
    ``mol_descriptors`` can be shorter than ``smiles``. Callers that attach
    the result to the source table by position must filter out the same rows
    first.
    """
    # Parse each SMILES exactly once and keep only the molecules RDKit could build.
    parsed = (Chem.MolFromSmiles(smi) for smi in smiles)
    mols = [mol for mol in parsed if mol is not None]

    # Calculator covering every descriptor listed in Descriptors._descList.
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(
        [desc[0] for desc in Descriptors._descList]
    )
    desc_names = calc.GetDescriptorNames()

    # Descriptors are computed on the molecule returned by Chem.AddHs
    # (hydrogens added to the parsed molecule).
    mol_descriptors = [calc.CalcDescriptors(Chem.AddHs(mol)) for mol in mols]
    return mol_descriptors, desc_names

In [None]:
# Keep only the rows whose SMILES RDKit can parse. RDKit_descriptors skips
# unparsable SMILES, so without this filter the descriptor rows would no longer
# line up with the dataset rows in the positional concat below.
is_parsable = dataset['Smiles'].apply(lambda smi: Chem.MolFromSmiles(smi) is not None)
n_unparsable = int((~is_parsable).sum())
if n_unparsable:
    print(f"Skipping {n_unparsable} row(s) whose SMILES could not be parsed by RDKit")

# Reset the index so the kept rows are numbered 0..n-1, matching the descriptor frame.
unique_smiles = dataset[is_parsable].reset_index(drop=True)

# Calculate descriptors
mol_descriptors, desc_names = RDKit_descriptors(unique_smiles['Smiles'])

# Create DataFrame from descriptors
df_descriptors = pd.DataFrame(mol_descriptors, columns=desc_names)

# Guard the row-for-row pairing that the column-wise concat relies on.
assert len(df_descriptors) == len(unique_smiles), "descriptor rows do not match dataset rows"

# Merge the original columns with the descriptor columns, side by side
df_merged = pd.concat([unique_smiles, df_descriptors], axis=1)

# Display the last rows of the merged dataframe
print(df_merged.tail())

In [None]:
# Print the merged table (original dataset columns followed by the descriptor columns).
print(df_merged)


In [14]:
# Persist the unscaled descriptor table; index=False leaves the row index out of the file.
df_merged.to_csv('E.bugandensis_ISS_2DRDKit_descriptors.csv', index=False)

Preprocessing

In [None]:
import pandas as pd

# Columns to keep from df_merged (built in the descriptor cell above): the
# 'Values' target followed by a hand-picked subset of RDKit descriptor columns.
# Every name must exist in df_merged, otherwise the selection below raises KeyError.
selected_features = ['Values', 'MaxEStateIndex', 'MinEStateIndex', 'MinAbsEStateIndex', 'qed', 'MolWt', 'NumRadicalElectrons',
                     'MaxPartialCharge', 'MinPartialCharge', 'FpDensityMorgan1', 'BCUT2D_MWHI', 'BCUT2D_MWLOW',
                     'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_MRHI', 'BCUT2D_MRLOW', 'BalabanJ', 'HallKierAlpha',
                     'Kappa3', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14',
                     'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8',
                     'PEOE_VSA9', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8',
                     'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA12', 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA7',
                     'SlogP_VSA8', 'SlogP_VSA9', 'EState_VSA10', 'EState_VSA11', 'EState_VSA2', 'EState_VSA3',
                     'EState_VSA4', 'EState_VSA5', 'EState_VSA6', 'EState_VSA7', 'EState_VSA8', 'VSA_EState10',
                     'VSA_EState3', 'VSA_EState4', 'VSA_EState5', 'VSA_EState9', 'FractionCSP3',
                     'NumAliphaticCarbocycles', 'NumAliphaticHeterocycles', 'NumAliphaticRings',
                     'NumAromaticHeterocycles', 'MolLogP', 'fr_Al_COO', 'fr_Al_OH', 'fr_Al_OH_noTert', 'fr_ArN',
                     'fr_Ar_COO', 'fr_Ar_NH', 'fr_Ar_OH', 'fr_C_S', 'fr_HOCCN', 'fr_Imine', 'fr_NH0', 'fr_NH1',
                     'fr_NH2', 'fr_N_O', 'fr_Ndealkylation1', 'fr_Ndealkylation2', 'fr_SH', 'fr_aldehyde',
                     'fr_alkyl_carbamate', 'fr_alkyl_halide', 'fr_allylic_oxid', 'fr_amidine', 'fr_aniline',
                     'fr_aryl_methyl', 'fr_azide', 'fr_azo', 'fr_barbitur', 'fr_benzodiazepine', 'fr_bicyclic',
                     'fr_diazo', 'fr_dihydropyridine', 'fr_epoxide', 'fr_ester', 'fr_ether', 'fr_furan',
                     'fr_guanido', 'fr_halogen', 'fr_hdrzine', 'fr_hdrzone', 'fr_imidazole', 'fr_imide',
                     'fr_isocyan', 'fr_isothiocyan', 'fr_ketone', 'fr_lactam', 'fr_lactone', 'fr_methoxy',
                     'fr_morpholine', 'fr_nitrile', 'fr_nitro', 'fr_nitro_arom_nonortho', 'fr_nitroso', 'fr_oxazole',
                     'fr_oxime', 'fr_para_hydroxylation', 'fr_phos_acid', 'fr_piperdine', 'fr_piperzine',
                     'fr_priamide', 'fr_prisulfonamd', 'fr_pyridine', 'fr_quatN', 'fr_sulfide', 'fr_sulfonamd',
                     'fr_sulfone', 'fr_term_acetylene', 'fr_tetrazole', 'fr_thiazole', 'fr_thiocyan', 'fr_thiophene',
                     'fr_unbrch_alkane', 'fr_urea']


# Restrict the merged table to the target plus the chosen descriptors.
data_selected = df_merged[selected_features]

# Print the reduced table for inspection.
print(data_selected)


In [None]:
from sklearn.preprocessing import StandardScaler

# Split the selected columns into the feature matrix and the target.
X = data_selected.drop('Values', axis=1)  # Features (exclude target variable)
y = data_selected['Values']  # Target variable

# Standardize every descriptor column with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted before rows with missing descriptors are
# removed (that happens in a later cell), so those rows still contribute their
# available values to the fitted statistics -- confirm this is intended.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Rebuild a labelled frame from the scaled array. Reusing X's index (instead of
# a fresh 0..n-1 index) keeps the index-aligned assignment of `y` below correct
# even if data_selected does not carry a default index.
scaled_data = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

# Add the target variable back to the DataFrame
scaled_data['Values'] = y

scaled_data


Unnamed: 0,MaxEStateIndex,MinEStateIndex,MinAbsEStateIndex,qed,MolWt,NumRadicalElectrons,MaxPartialCharge,MinPartialCharge,FpDensityMorgan1,BCUT2D_MWHI,...,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,Values
0,-0.028467,0.922355,-1.006093,-0.247395,-0.27603,0.0,0.70075,-0.387699,0.744767,0.998798,...,0.0,0.0,0.0,3.162278,0.0,0.0,-0.316228,0.0,0.0,0
1,0.13194,0.648539,-0.841571,-0.23163,-0.361769,0.0,1.621289,-0.387666,0.972273,0.995069,...,0.0,0.0,0.0,-0.316228,0.0,0.0,3.162278,0.0,0.0,0
2,1.126089,0.618122,-0.847308,1.428522,-0.666312,0.0,0.498927,-0.412726,0.295588,-0.737911,...,0.0,0.0,0.0,-0.316228,0.0,0.0,-0.316228,0.0,0.0,0
3,1.417103,-2.22964,2.186181,-0.821962,0.609333,0.0,-0.022327,0.083356,-0.888611,-1.061043,...,0.0,0.0,0.0,-0.316228,0.0,0.0,-0.316228,0.0,0.0,0
4,-1.735625,-0.707868,0.995659,-1.442748,2.690439,0.0,-1.789735,1.991038,-2.097481,-1.057824,...,0.0,0.0,0.0,-0.316228,0.0,0.0,-0.316228,0.0,0.0,0
5,0.206418,0.604363,-0.453775,0.94509,-0.444203,0.0,0.263779,-0.47088,0.295588,0.995099,...,0.0,0.0,0.0,-0.316228,0.0,0.0,-0.316228,0.0,0.0,0
6,-0.351662,0.799078,-0.852499,0.605731,-0.945412,0.0,,,1.331762,,...,0.0,0.0,0.0,-0.316228,0.0,0.0,-0.316228,0.0,0.0,0
7,-0.066305,0.626338,-0.440189,1.100692,-0.656648,0.0,0.263779,-0.47088,0.553023,0.995099,...,0.0,0.0,0.0,-0.316228,0.0,0.0,-0.316228,0.0,0.0,0
8,-0.085133,0.624203,-0.427337,0.967817,-0.605954,0.0,0.263779,-0.580869,0.591638,0.995099,...,0.0,0.0,0.0,-0.316228,0.0,0.0,-0.316228,0.0,0.0,0
9,1.163619,-1.322242,0.671771,-1.262842,0.891389,0.0,-0.006077,-1.194213,-0.687899,-1.062041,...,0.0,0.0,0.0,-0.316228,0.0,0.0,-0.316228,0.0,0.0,0


In [None]:
# Assuming scaled_data is your DataFrame
scaled_data = scaled_data.dropna()
scaled_data


Unnamed: 0,MaxEStateIndex,MinEStateIndex,MinAbsEStateIndex,qed,MolWt,NumRadicalElectrons,MaxPartialCharge,MinPartialCharge,FpDensityMorgan1,BCUT2D_MWHI,...,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,Values
0,-0.028467,0.922355,-1.006093,-0.247395,-0.27603,0.0,0.70075,-0.387699,0.744767,0.998798,...,0.0,0.0,0.0,3.162278,0.0,0.0,-0.316228,0.0,0.0,0
1,0.13194,0.648539,-0.841571,-0.23163,-0.361769,0.0,1.621289,-0.387666,0.972273,0.995069,...,0.0,0.0,0.0,-0.316228,0.0,0.0,3.162278,0.0,0.0,0
2,1.126089,0.618122,-0.847308,1.428522,-0.666312,0.0,0.498927,-0.412726,0.295588,-0.737911,...,0.0,0.0,0.0,-0.316228,0.0,0.0,-0.316228,0.0,0.0,0
3,1.417103,-2.22964,2.186181,-0.821962,0.609333,0.0,-0.022327,0.083356,-0.888611,-1.061043,...,0.0,0.0,0.0,-0.316228,0.0,0.0,-0.316228,0.0,0.0,0
4,-1.735625,-0.707868,0.995659,-1.442748,2.690439,0.0,-1.789735,1.991038,-2.097481,-1.057824,...,0.0,0.0,0.0,-0.316228,0.0,0.0,-0.316228,0.0,0.0,0
5,0.206418,0.604363,-0.453775,0.94509,-0.444203,0.0,0.263779,-0.47088,0.295588,0.995099,...,0.0,0.0,0.0,-0.316228,0.0,0.0,-0.316228,0.0,0.0,0
7,-0.066305,0.626338,-0.440189,1.100692,-0.656648,0.0,0.263779,-0.47088,0.553023,0.995099,...,0.0,0.0,0.0,-0.316228,0.0,0.0,-0.316228,0.0,0.0,0
8,-0.085133,0.624203,-0.427337,0.967817,-0.605954,0.0,0.263779,-0.580869,0.591638,0.995099,...,0.0,0.0,0.0,-0.316228,0.0,0.0,-0.316228,0.0,0.0,0
9,1.163619,-1.322242,0.671771,-1.262842,0.891389,0.0,-0.006077,-1.194213,-0.687899,-1.062041,...,0.0,0.0,0.0,-0.316228,0.0,0.0,-0.316228,0.0,0.0,0
10,-1.777977,-0.583248,1.015162,-1.041276,-0.234835,0.0,-1.794163,1.830539,-1.110648,-1.060344,...,0.0,0.0,0.0,-0.316228,0.0,0.0,-0.316228,0.0,0.0,1


In [20]:
# Persist the scaled, NaN-free table; index=False leaves the (gapped) row index out of the file.
scaled_data.to_csv('E.bugandensis_ISS_2DRDKit_descriptors_scaled.csv', index=False)