## **Importing Libraries**

In [1]:
%%capture
!pip install mordred
!pip install rdkit


In [2]:
# Importing Libraries
import rdkit
from rdkit import Chem
from rdkit.Chem import Draw, PandasTools, AllChem

import mordred
from mordred import Calculator, descriptors


In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

# Allow up to 2000 columns to be rendered so the wide descriptor tables are not truncated.
pd.options.display.max_columns = 2000
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas copy warnings and library deprecation notices.
warnings.simplefilter("ignore")

In [4]:
# Apply seaborn's 'whitegrid' style to every figure drawn later in the notebook.
sns.set(style='whitegrid')

In [5]:
# Load the precomputed RDKit 2D descriptor table (path is relative to the notebook's working directory).
df = pd.read_csv('2-RDKIT 2D descriptors RDKIT.csv')

## **Data Preprocessing**



1.   Remove missing values and non-numerical columns
2.   Remove constant features
3.   Remove highly correlated features





In [None]:
# Flag every row whose SMILES string occurs more than once.
# keep=False marks all members of a duplicated group, not only the repeats.
dup_mask = df.duplicated(subset=['smiles'], keep=False)
duplicates = df[dup_mask]

# Number of rows that belong to a duplicated SMILES group.
len(duplicates)

In [7]:
# Keep only the first occurrence of each SMILES string; later repeats are discarded.
# The original row labels are preserved (the index is not reset).
first_seen = ~df.duplicated(subset=['smiles'], keep='first')
df = df[first_seen]

In [None]:
# (rows, columns) of df at this point in the notebook.
df.shape

In [9]:
# Preview the first five rows: identifier columns followed by the descriptor columns.
df.head()

Unnamed: 0,codes,smiles,values,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,MaxPartialCharge,MinPartialCharge,MaxAbsPartialCharge,MinAbsPartialCharge,FpDensityMorgan1,FpDensityMorgan2,FpDensityMorgan3,BCUT2D_MWHI,BCUT2D_MWLOW,BCUT2D_CHGHI,BCUT2D_CHGLO,BCUT2D_LOGPHI,BCUT2D_LOGPLOW,BCUT2D_MRHI,BCUT2D_MRLOW,BalabanJ,BertzCT,Chi0,Chi0n,Chi0v,Chi1,Chi1n,Chi1v,Chi2n,Chi2v,Chi3n,Chi3v,Chi4n,Chi4v,HallKierAlpha,Ipc,Kappa1,Kappa2,Kappa3,LabuteASA,PEOE_VSA1,PEOE_VSA10,PEOE_VSA11,PEOE_VSA12,PEOE_VSA13,PEOE_VSA14,PEOE_VSA2,PEOE_VSA3,PEOE_VSA4,PEOE_VSA5,PEOE_VSA6,PEOE_VSA7,PEOE_VSA8,PEOE_VSA9,SMR_VSA1,SMR_VSA10,SMR_VSA2,SMR_VSA3,SMR_VSA4,SMR_VSA5,SMR_VSA6,SMR_VSA7,SMR_VSA8,SMR_VSA9,SlogP_VSA1,SlogP_VSA10,SlogP_VSA11,SlogP_VSA12,SlogP_VSA2,SlogP_VSA3,SlogP_VSA4,SlogP_VSA5,SlogP_VSA6,SlogP_VSA7,SlogP_VSA8,SlogP_VSA9,TPSA,EState_VSA1,EState_VSA10,EState_VSA11,EState_VSA2,EState_VSA3,EState_VSA4,EState_VSA5,EState_VSA6,EState_VSA7,EState_VSA8,EState_VSA9,VSA_EState1,VSA_EState10,VSA_EState2,VSA_EState3,VSA_EState4,VSA_EState5,VSA_EState6,VSA_EState7,VSA_EState8,VSA_EState9,FractionCSP3,HeavyAtomCount,NHOHCount,NOCount,NumAliphaticCarbocycles,NumAliphaticHeterocycles,NumAliphaticRings,NumAromaticCarbocycles,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount,MolLogP,MolMR,fr_Al_COO,fr_Al_OH,fr_Al_OH_noTert,fr_ArN,fr_Ar_COO,fr_Ar_N,fr_Ar_NH,fr_Ar_OH,fr_COO,fr_COO2,fr_C_O,fr_C_O_noCOO,fr_C_S,fr_HOCCN,fr_Imine,fr_NH0,fr_NH1,fr_NH2,fr_N_O,fr_Ndealkylation1,fr_Ndealkylation2,fr_Nhpyrrole,fr_SH,fr_aldehyde,fr_alkyl_carbamate,fr_alkyl_halide,fr_allylic_oxid,fr_amide,fr_amidine,fr_aniline,fr_aryl_methyl,fr_azide,fr_azo,fr_barbitur,fr_benzene,fr_benzodiazepine,fr_bicyclic,fr_diazo,fr_dihydropyridine,fr_epoxide,fr_ester,fr_ether,fr_furan,fr_guanido,fr_halogen,fr_hdrzine,fr_hdr
zone,fr_imidazole,fr_imide,fr_isocyan,fr_isothiocyan,fr_ketone,fr_ketone_Topliss,fr_lactam,fr_lactone,fr_methoxy,fr_morpholine,fr_nitrile,fr_nitro,fr_nitro_arom,fr_nitro_arom_nonortho,fr_nitroso,fr_oxazole,fr_oxime,fr_para_hydroxylation,fr_phenol,fr_phenol_noOrthoHbond,fr_phos_acid,fr_phos_ester,fr_piperdine,fr_piperzine,fr_priamide,fr_prisulfonamd,fr_pyridine,fr_quatN,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,1,O=C(O)P(=O)(O)O,1,10.467731,-4.487461,10.467731,1.696759,0.424095,126.004,122.98,125.97181,42,0,0.432622,-0.472198,0.472198,0.432622,1.714286,2.285714,2.428571,16.01678,-0.062454,0.5405894,-2.096348,0.6756902,-2.083933,1.348098,-0.76834,4.337771,160.275998,8.198671,5.988455,3.882882,4.602701,2.404323,2.722236,0.763376,2.12346,0.223607,0.67082,0.0,0.0,-0.38,136.8106,2.813948,1.61438,2.331744,43.221337,14.900193,0.0,0.0,2.862399,1.4312,13.306156,0.0,9.359585,0.0,0.0,0.0,0.0,0.0,0.0,24.259778,13.306156,4.293599,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.794537,0.0,7.595762,23.472986,4.565048,0.0,0.0,0.0,0.0,1.4312,0.0,94.83,13.306156,9.359585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.900193,4.293599,28.38652,0.0,16.614846,3.155077,-1.696759,0.0,0.0,0.0,0.0,-4.487461,0.0,7,3,5,0,0,0,0,0,0,5,3,6,3,0,0,0,0,-0.1579,20.0689,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,NCc1cn(-c2ccccc2)nc1C12CC3CC(CC(C3)C1)C2,1,9.163315,-4.4338,9.163315,0.008726,0.937072,307.441,282.241,307.204848,120,0,0.118715,-0.326392,0.326392,0.118715,1.26087,1.913043,2.521739,5.936308e-16,-0.171716,-2.464451e-16,-2.995835,-8.586728000000001e-17,-2.912634,1.834876e-16,-1.197652,2.155439,1631.662609,37.55796,36.341641,11.341641,21.134657,18.988854,6.594427,4.957624,4.957624,3.507624,3.507624,2.619611,2.619611,-1.61,4754868000.0,3.502326,4.940841,2.09367,172.672867,5.719717,2.823684,0.0,0.0,0.0,0.0,0.0,4.681803,5.098682,0.0,18.127256,68.00426,46.951092,19.605866,34.351131,0.0,0.0,9.780485,23.40159,50.1494,0.0,47.642368,0.0,5.687386,5.719717,0.0,0.0,0.0,9.780485,11.91185,17.681873,81.022376,36.384989,2.823684,5.687386,0.0,43.84,126.880744,0.0,0.0,4.681803,0.0,0.0,0.0,0.0,0.0,5.098682,34.351131,216.274872,0.0,0.0,3.766323,-9.61618,-12.894043,-5.356498,-27.605224,-3.819249,0.0,0.55,23,2,3,4,0,4,1,1,2,3,1,3,4,4,0,4,6,3.7988,91.3054,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3,CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(...,1,16.630408,-8.17302,16.630408,2.011766,0.237873,733.937,666.401,733.461241,298,0,0.311188,-0.458937,0.458937,0.311188,1.0,1.901961,2.509804,-1.10414e-16,-0.249011,-2.821411e-16,-2.820224,-4.025176e-16,-2.92212,-8.678601e-17,-1.118721,3.941092,3723.848857,96.010225,91.254441,24.254441,50.487388,45.340421,12.299179,9.164438,9.164438,5.727323,5.727323,3.326461,3.326461,-1.3,3.904127e+38,8.365904,17.014479,9.506327,398.161366,58.865128,29.62435,12.532157,7.155998,0.0,5.969305,9.589074,0.0,0.0,0.0,27.407568,74.181781,104.277812,65.316011,148.541324,11.75255,7.155998,4.89991,23.575831,178.003967,20.989606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,154.335451,38.010252,23.575831,178.997651,0.0,0.0,0.0,0.0,193.91,239.221864,28.78593,9.589074,0.0,0.0,0.0,0.0,0.0,0.0,44.491493,72.830825,625.536327,0.0,31.124497,19.737283,-20.991014,-39.554826,-6.32817,-95.200189,-72.515747,-5.224828,0.945946,51,5,14,0,3,3,0,0,0,14,5,14,25,0,3,3,3,1.7856,186.258,0,5,0,0,0,0,0,0,0,0,2,2,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,6,0,0,0,0,0,0,0,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4,CO/N=C1\CN(c2nc3c(cc2F)c(=O)c(C(=O)O)cn3C2CC2)...,1,16.153679,-4.142295,16.153679,0.11758,0.730424,389.387,369.227,389.149932,148,0,0.340725,-0.477497,0.477497,0.340725,1.785714,2.642857,3.357143,11.50944,-0.243339,1.660313e-16,-2.548891,3.52863e-16,-2.755759,1.839745,-0.955868,2.242722,1814.808153,37.333981,33.247026,13.247026,21.383823,16.976796,7.174121,5.187403,5.187403,3.559196,3.559196,2.325602,2.325602,-3.01,4528452000.0,6.244897,6.860777,2.98863,187.047731,25.133123,21.072265,11.635084,5.42879,1.4312,5.969305,4.794537,14.168931,0.0,0.0,5.155713,18.788269,40.673775,31.302354,45.257929,28.532255,1.4312,9.551078,16.769387,18.763742,31.428441,33.819314,0.0,0.0,16.048417,10.208278,0.0,0.0,52.869408,4.837589,11.711179,52.424626,22.165564,2.823684,12.464601,0.0,123.04,119.362889,9.589074,4.390415,4.5671,0.0,0.0,0.0,0.0,0.0,20.086088,27.557779,176.198799,0.0,32.789328,5.187285,-7.510876,-10.024591,-5.124122,-8.18066,-12.115414,-3.553081,0.444444,28,3,9,1,1,2,0,2,2,9,2,10,7,1,1,2,4,0.966,100.3927,0,0,0,0,1,2,0,0,1,1,1,0,0,0,0,4,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0
4,5,CN[C@@H]1[C@@H](O)C(OC2[C@@H](N)C[C@@H](N)[C@H...,1,9.531285,-5.872222,9.531285,1.499618,0.164624,496.558,456.238,496.274443,200,0,0.211046,-0.390604,0.390604,0.211046,1.176471,1.970588,2.558824,-2.910004e-16,-0.246604,-2.518814e-16,-2.828425,-0.01655117,-3.036146,-5.596414000000001e-17,-1.098122,2.883945,2106.963439,59.380469,55.871337,15.871337,32.427216,27.305757,8.225772,6.030635,6.030635,3.824015,3.824015,2.294141,2.294141,-0.56,119972600000000.0,6.183361,10.819562,5.401402,254.570013,72.069265,58.044072,12.532157,8.587198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.052536,29.855646,43.865408,96.493677,0.0,8.587198,5.309813,17.159151,110.921631,13.534812,0.0,0.0,0.0,22.468964,0.0,0.0,0.0,143.619781,18.947452,0.0,57.08719,0.0,9.882895,0.0,0.0,248.39,146.925407,6.853793,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35.389712,62.83737,355.828789,0.0,0.0,22.319864,-9.815296,0.0,-21.409482,-59.981771,-18.942104,0.0,1.0,34,13,14,1,2,3,0,0,0,14,10,14,18,1,2,3,3,-5.613,115.8697,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
# Keep every column from position 3 onward as the descriptor matrix.
# NOTE(review): assumes the first three columns are codes, smiles and values
# (as shown by df.head()) — confirm if the CSV layout changes.
data = df.iloc[:,3:]

In [11]:
# Sanity check: show the Python type of the sliced object.
type(data)

In [12]:
# Preview the first five rows of the descriptor-only table.
data.head()

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,MaxPartialCharge,MinPartialCharge,MaxAbsPartialCharge,MinAbsPartialCharge,FpDensityMorgan1,FpDensityMorgan2,FpDensityMorgan3,BCUT2D_MWHI,BCUT2D_MWLOW,BCUT2D_CHGHI,BCUT2D_CHGLO,BCUT2D_LOGPHI,BCUT2D_LOGPLOW,BCUT2D_MRHI,BCUT2D_MRLOW,BalabanJ,BertzCT,Chi0,Chi0n,Chi0v,Chi1,Chi1n,Chi1v,Chi2n,Chi2v,Chi3n,Chi3v,Chi4n,Chi4v,HallKierAlpha,Ipc,Kappa1,Kappa2,Kappa3,LabuteASA,PEOE_VSA1,PEOE_VSA10,PEOE_VSA11,PEOE_VSA12,PEOE_VSA13,PEOE_VSA14,PEOE_VSA2,PEOE_VSA3,PEOE_VSA4,PEOE_VSA5,PEOE_VSA6,PEOE_VSA7,PEOE_VSA8,PEOE_VSA9,SMR_VSA1,SMR_VSA10,SMR_VSA2,SMR_VSA3,SMR_VSA4,SMR_VSA5,SMR_VSA6,SMR_VSA7,SMR_VSA8,SMR_VSA9,SlogP_VSA1,SlogP_VSA10,SlogP_VSA11,SlogP_VSA12,SlogP_VSA2,SlogP_VSA3,SlogP_VSA4,SlogP_VSA5,SlogP_VSA6,SlogP_VSA7,SlogP_VSA8,SlogP_VSA9,TPSA,EState_VSA1,EState_VSA10,EState_VSA11,EState_VSA2,EState_VSA3,EState_VSA4,EState_VSA5,EState_VSA6,EState_VSA7,EState_VSA8,EState_VSA9,VSA_EState1,VSA_EState10,VSA_EState2,VSA_EState3,VSA_EState4,VSA_EState5,VSA_EState6,VSA_EState7,VSA_EState8,VSA_EState9,FractionCSP3,HeavyAtomCount,NHOHCount,NOCount,NumAliphaticCarbocycles,NumAliphaticHeterocycles,NumAliphaticRings,NumAromaticCarbocycles,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount,MolLogP,MolMR,fr_Al_COO,fr_Al_OH,fr_Al_OH_noTert,fr_ArN,fr_Ar_COO,fr_Ar_N,fr_Ar_NH,fr_Ar_OH,fr_COO,fr_COO2,fr_C_O,fr_C_O_noCOO,fr_C_S,fr_HOCCN,fr_Imine,fr_NH0,fr_NH1,fr_NH2,fr_N_O,fr_Ndealkylation1,fr_Ndealkylation2,fr_Nhpyrrole,fr_SH,fr_aldehyde,fr_alkyl_carbamate,fr_alkyl_halide,fr_allylic_oxid,fr_amide,fr_amidine,fr_aniline,fr_aryl_methyl,fr_azide,fr_azo,fr_barbitur,fr_benzene,fr_benzodiazepine,fr_bicyclic,fr_diazo,fr_dihydropyridine,fr_epoxide,fr_ester,fr_ether,fr_furan,fr_guanido,fr_halogen,fr_hdrzine,fr_hdrzone,fr_imidazole,fr
_imide,fr_isocyan,fr_isothiocyan,fr_ketone,fr_ketone_Topliss,fr_lactam,fr_lactone,fr_methoxy,fr_morpholine,fr_nitrile,fr_nitro,fr_nitro_arom,fr_nitro_arom_nonortho,fr_nitroso,fr_oxazole,fr_oxime,fr_para_hydroxylation,fr_phenol,fr_phenol_noOrthoHbond,fr_phos_acid,fr_phos_ester,fr_piperdine,fr_piperzine,fr_priamide,fr_prisulfonamd,fr_pyridine,fr_quatN,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,10.467731,-4.487461,10.467731,1.696759,0.424095,126.004,122.98,125.97181,42,0,0.432622,-0.472198,0.472198,0.432622,1.714286,2.285714,2.428571,16.01678,-0.062454,0.5405894,-2.096348,0.6756902,-2.083933,1.348098,-0.76834,4.337771,160.275998,8.198671,5.988455,3.882882,4.602701,2.404323,2.722236,0.763376,2.12346,0.223607,0.67082,0.0,0.0,-0.38,136.8106,2.813948,1.61438,2.331744,43.221337,14.900193,0.0,0.0,2.862399,1.4312,13.306156,0.0,9.359585,0.0,0.0,0.0,0.0,0.0,0.0,24.259778,13.306156,4.293599,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.794537,0.0,7.595762,23.472986,4.565048,0.0,0.0,0.0,0.0,1.4312,0.0,94.83,13.306156,9.359585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.900193,4.293599,28.38652,0.0,16.614846,3.155077,-1.696759,0.0,0.0,0.0,0.0,-4.487461,0.0,7,3,5,0,0,0,0,0,0,5,3,6,3,0,0,0,0,-0.1579,20.0689,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,9.163315,-4.4338,9.163315,0.008726,0.937072,307.441,282.241,307.204848,120,0,0.118715,-0.326392,0.326392,0.118715,1.26087,1.913043,2.521739,5.936308e-16,-0.171716,-2.464451e-16,-2.995835,-8.586728000000001e-17,-2.912634,1.834876e-16,-1.197652,2.155439,1631.662609,37.55796,36.341641,11.341641,21.134657,18.988854,6.594427,4.957624,4.957624,3.507624,3.507624,2.619611,2.619611,-1.61,4754868000.0,3.502326,4.940841,2.09367,172.672867,5.719717,2.823684,0.0,0.0,0.0,0.0,0.0,4.681803,5.098682,0.0,18.127256,68.00426,46.951092,19.605866,34.351131,0.0,0.0,9.780485,23.40159,50.1494,0.0,47.642368,0.0,5.687386,5.719717,0.0,0.0,0.0,9.780485,11.91185,17.681873,81.022376,36.384989,2.823684,5.687386,0.0,43.84,126.880744,0.0,0.0,4.681803,0.0,0.0,0.0,0.0,0.0,5.098682,34.351131,216.274872,0.0,0.0,3.766323,-9.61618,-12.894043,-5.356498,-27.605224,-3.819249,0.0,0.55,23,2,3,4,0,4,1,1,2,3,1,3,4,4,0,4,6,3.7988,91.3054,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,16.630408,-8.17302,16.630408,2.011766,0.237873,733.937,666.401,733.461241,298,0,0.311188,-0.458937,0.458937,0.311188,1.0,1.901961,2.509804,-1.10414e-16,-0.249011,-2.821411e-16,-2.820224,-4.025176e-16,-2.92212,-8.678601e-17,-1.118721,3.941092,3723.848857,96.010225,91.254441,24.254441,50.487388,45.340421,12.299179,9.164438,9.164438,5.727323,5.727323,3.326461,3.326461,-1.3,3.904127e+38,8.365904,17.014479,9.506327,398.161366,58.865128,29.62435,12.532157,7.155998,0.0,5.969305,9.589074,0.0,0.0,0.0,27.407568,74.181781,104.277812,65.316011,148.541324,11.75255,7.155998,4.89991,23.575831,178.003967,20.989606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,154.335451,38.010252,23.575831,178.997651,0.0,0.0,0.0,0.0,193.91,239.221864,28.78593,9.589074,0.0,0.0,0.0,0.0,0.0,0.0,44.491493,72.830825,625.536327,0.0,31.124497,19.737283,-20.991014,-39.554826,-6.32817,-95.200189,-72.515747,-5.224828,0.945946,51,5,14,0,3,3,0,0,0,14,5,14,25,0,3,3,3,1.7856,186.258,0,5,0,0,0,0,0,0,0,0,2,2,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,6,0,0,0,0,0,0,0,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,16.153679,-4.142295,16.153679,0.11758,0.730424,389.387,369.227,389.149932,148,0,0.340725,-0.477497,0.477497,0.340725,1.785714,2.642857,3.357143,11.50944,-0.243339,1.660313e-16,-2.548891,3.52863e-16,-2.755759,1.839745,-0.955868,2.242722,1814.808153,37.333981,33.247026,13.247026,21.383823,16.976796,7.174121,5.187403,5.187403,3.559196,3.559196,2.325602,2.325602,-3.01,4528452000.0,6.244897,6.860777,2.98863,187.047731,25.133123,21.072265,11.635084,5.42879,1.4312,5.969305,4.794537,14.168931,0.0,0.0,5.155713,18.788269,40.673775,31.302354,45.257929,28.532255,1.4312,9.551078,16.769387,18.763742,31.428441,33.819314,0.0,0.0,16.048417,10.208278,0.0,0.0,52.869408,4.837589,11.711179,52.424626,22.165564,2.823684,12.464601,0.0,123.04,119.362889,9.589074,4.390415,4.5671,0.0,0.0,0.0,0.0,0.0,20.086088,27.557779,176.198799,0.0,32.789328,5.187285,-7.510876,-10.024591,-5.124122,-8.18066,-12.115414,-3.553081,0.444444,28,3,9,1,1,2,0,2,2,9,2,10,7,1,1,2,4,0.966,100.3927,0,0,0,0,1,2,0,0,1,1,1,0,0,0,0,4,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0
4,9.531285,-5.872222,9.531285,1.499618,0.164624,496.558,456.238,496.274443,200,0,0.211046,-0.390604,0.390604,0.211046,1.176471,1.970588,2.558824,-2.910004e-16,-0.246604,-2.518814e-16,-2.828425,-0.01655117,-3.036146,-5.596414000000001e-17,-1.098122,2.883945,2106.963439,59.380469,55.871337,15.871337,32.427216,27.305757,8.225772,6.030635,6.030635,3.824015,3.824015,2.294141,2.294141,-0.56,119972600000000.0,6.183361,10.819562,5.401402,254.570013,72.069265,58.044072,12.532157,8.587198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.052536,29.855646,43.865408,96.493677,0.0,8.587198,5.309813,17.159151,110.921631,13.534812,0.0,0.0,0.0,22.468964,0.0,0.0,0.0,143.619781,18.947452,0.0,57.08719,0.0,9.882895,0.0,0.0,248.39,146.925407,6.853793,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35.389712,62.83737,355.828789,0.0,0.0,22.319864,-9.815296,0.0,-21.409482,-59.981771,-18.942104,0.0,1.0,34,13,14,1,2,3,0,0,0,14,10,14,18,1,2,3,3,-5.613,115.8697,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# (rows, columns) of the descriptor table before any cleaning.
data.shape

In [None]:
# Total number of missing cells across all descriptor columns.
data.isnull().sum().sum()

In [15]:
# Sort the descriptor columns into boolean and numeric buckets by dtype.
# Columns with dtype 'object' are left out of both lists.
column_num, column_bool = [], []
for column in data.columns:
    column_type = data[column].dtype
    if column_type == 'bool':
        column_bool.append(column)
    elif column_type != 'object':
        # Anything that is neither text-like nor boolean is treated as numeric.
        column_num.append(column)

In [None]:
# Number of columns placed in the numeric bucket.
len(column_num)

In [None]:
# Names of the columns whose dtype is bool (an empty list means there are none).
column_bool

In [None]:
# Discard every row that has at least one missing descriptor value.
data = data.dropna(axis=0, how='any')

# Report how many missing cells remain (0 is expected after the drop).
remaining_nulls = data.isna().sum().sum()
print(remaining_nulls)


In [None]:
# (rows, columns) of the descriptor table after rows with missing values were removed.
data.shape

In [None]:
# Re-check the total number of missing cells in the descriptor table.
data.isnull().sum().sum()

In [21]:
# Find highly correlated columns so that they can be removed.
def correlation(dataset, threshold):
    """Return the names of columns that are highly correlated with an earlier column.

    The pairwise correlation matrix of the numeric and boolean columns of
    ``dataset`` is computed with ``DataFrame.corr`` (Pearson by default).
    A column is flagged when the absolute correlation with at least one
    column positioned *before* it is strictly greater than ``threshold``.
    The first column of a correlated group is therefore never flagged.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature table. Columns that are neither numeric nor boolean are ignored.
    threshold : float
        Cut-off applied to the absolute correlation coefficient.

    Returns
    -------
    set
        Names of the columns to drop.
    """
    # Text columns would make corr() raise on recent pandas versions, so only
    # numeric/boolean columns take part in the correlation matrix.
    numeric = dataset.select_dtypes(include=["number", "bool"])
    corr_matrix = numeric.corr()

    # Compare absolute coefficients once for the whole matrix; NaN entries
    # (e.g. from constant columns) compare as False and are never flagged.
    exceeds = corr_matrix.abs().to_numpy() > threshold

    # Keep only the strict lower triangle: row i versus earlier columns j < i.
    flagged = np.tril(exceeds, k=-1).any(axis=1)
    return set(corr_matrix.columns[flagged])

In [None]:
# Identify descriptors whose absolute correlation with an earlier column exceeds 0.80.
corr_features = correlation(data, 0.80)
print("No. of features to drop : ", len(corr_features))

# Rebind `data` to a new frame instead of dropping in place: in-place mutation of a
# frame derived from another DataFrame is what triggers pandas' copy warnings.
data = data.drop(columns=list(corr_features))

In [None]:
# (rows, columns) of the descriptor table after the correlated columns were dropped.
data.shape

In [None]:
# Preview the reduced descriptor table.
data.head()

In [None]:
# Shape of the reduced descriptor table (same check as two cells above).
data.shape

In [None]:
# Restrict df to the rows that are still present in the cleaned descriptor table.
df_cleaned = df.loc[data.index]

# Put the first three columns of df back in front of the retained descriptors.
# Both pieces share the same row labels, so the column-wise concat lines them up.
id_columns = df_cleaned.iloc[:, :3]
merged_df = pd.concat([id_columns, data], axis=1)

# The row count should equal that of the cleaned descriptor table.
print(merged_df.shape)


In [None]:
# (rows, columns) of the merged table.
merged_df.shape

In [42]:
# Save the cleaned, de-correlated table without the row index
# (index=False is the documented boolean form of the option).
merged_df.to_csv('RDKit_2D_truncated.csv', index=False)

In [None]:
# Summary (dtype, non-null count) of the 'values' column.
merged_df['values'].info()

In [None]:
# Bar chart of how many rows fall into each class of the 'values' column.
# NOTE(review): this reads from df, not merged_df, so rows later removed for
# missing descriptors are still counted here — confirm that is intended.
fig, ax = plt.subplots(figsize=(5, 4))
sns.countplot(x=df["values"], palette="coolwarm", ax=ax)
ax.set_title("Class Distribution")
ax.set_xlabel("Class (0 or 1)")
ax.set_ylabel("Count")
plt.show()

# **SMOTE**

In [43]:
# Class labels taken from the merged table; the Series keeps merged_df's row labels.
y = merged_df['values']


In [None]:
# Number of rows per class label in y.
print(y.value_counts())


In [None]:
# Total number of missing cells in the merged table.
merged_df.isnull().sum().sum()

In [51]:
from sklearn.preprocessing import StandardScaler

# Standardise every descriptor column (position 3 onward) to zero mean and unit variance.
descriptor_block = merged_df.iloc[:, 3:]
scaled_values = StandardScaler().fit_transform(descriptor_block)

# fit_transform returns a plain array, so the new frame gets a fresh 0..n-1 index
# rather than merged_df's row labels; pair it with other frames by position.
# NOTE(review): the preview below shows Ipc as all-NaN after scaling, presumably
# because of its extreme magnitudes — verify with scaled_DF.isnull().
scaled_DF = pd.DataFrame(scaled_values, columns=descriptor_block.columns)

In [52]:
# Preview the standardised descriptors.
scaled_DF.head()

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MinAbsEStateIndex,qed,MolWt,NumRadicalElectrons,MaxPartialCharge,MinPartialCharge,FpDensityMorgan1,BCUT2D_MWHI,BCUT2D_MWLOW,BCUT2D_CHGHI,BCUT2D_CHGLO,BCUT2D_MRHI,BCUT2D_MRLOW,BalabanJ,HallKierAlpha,Ipc,Kappa3,PEOE_VSA1,PEOE_VSA10,PEOE_VSA11,PEOE_VSA12,PEOE_VSA13,PEOE_VSA14,PEOE_VSA2,PEOE_VSA3,PEOE_VSA4,PEOE_VSA5,PEOE_VSA6,PEOE_VSA7,PEOE_VSA8,PEOE_VSA9,SMR_VSA10,SMR_VSA2,SMR_VSA3,SMR_VSA4,SMR_VSA6,SMR_VSA7,SMR_VSA8,SMR_VSA9,SlogP_VSA1,SlogP_VSA10,SlogP_VSA12,SlogP_VSA3,SlogP_VSA4,SlogP_VSA7,SlogP_VSA8,SlogP_VSA9,EState_VSA10,EState_VSA11,EState_VSA2,EState_VSA3,EState_VSA4,EState_VSA5,EState_VSA6,EState_VSA7,EState_VSA8,VSA_EState10,VSA_EState3,VSA_EState4,VSA_EState5,VSA_EState9,FractionCSP3,NumAliphaticCarbocycles,NumAliphaticHeterocycles,NumAliphaticRings,NumAromaticHeterocycles,MolLogP,fr_Al_COO,fr_Al_OH,fr_Al_OH_noTert,fr_ArN,fr_Ar_COO,fr_Ar_NH,fr_Ar_OH,fr_C_S,fr_HOCCN,fr_Imine,fr_NH0,fr_NH1,fr_NH2,fr_N_O,fr_Ndealkylation1,fr_Ndealkylation2,fr_SH,fr_aldehyde,fr_alkyl_carbamate,fr_alkyl_halide,fr_allylic_oxid,fr_amidine,fr_aniline,fr_aryl_methyl,fr_azide,fr_azo,fr_barbitur,fr_benzodiazepine,fr_bicyclic,fr_diazo,fr_dihydropyridine,fr_epoxide,fr_ester,fr_ether,fr_furan,fr_guanido,fr_halogen,fr_hdrzine,fr_hdrzone,fr_imidazole,fr_imide,fr_isocyan,fr_isothiocyan,fr_ketone,fr_lactam,fr_lactone,fr_methoxy,fr_morpholine,fr_nitrile,fr_nitro,fr_nitro_arom_nonortho,fr_nitroso,fr_oxazole,fr_oxime,fr_para_hydroxylation,fr_phos_acid,fr_piperdine,fr_piperzine,fr_priamide,fr_prisulfonamd,fr_pyridine,fr_quatN,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,-1.080521,0.181166,1.37085,-0.472071,-1.383086,-0.014845,1.946663,-0.429034,0.930258,1.488318,0.71569,2.02359,2.794739,0.190559,1.873744,1.386061,1.388339,,-0.060403,-0.255046,-0.894733,-0.751846,-0.398369,-0.33311,1.672281,-0.927736,0.524562,-0.427157,-0.41156,-0.971481,-1.466445,-1.507449,-1.436096,-0.506354,0.832173,-0.988924,-0.62194,-1.133044,-1.463461,0.0,-0.62583,-0.901319,0.003991,0.222444,-0.689138,-0.741414,-0.658641,-0.482273,0.0,-0.236785,-0.339997,-0.55392,-0.218683,-0.110634,-0.161563,-0.099679,-0.074351,0.221412,-0.366625,-0.107123,0.668339,0.718696,-0.556871,-1.797929,-0.430846,-0.904357,-0.984755,-0.792435,-0.957792,-0.352228,-0.482986,-0.094914,-0.283664,-0.29354,-0.284559,-0.228807,-0.096464,-0.190209,-0.160769,-1.063099,-0.566042,-0.415239,-0.110548,-0.381766,-0.27279,-0.059566,-0.066982,-0.077493,-0.208518,-0.221912,-0.130562,-0.605416,-0.332775,-0.043912,-0.03681,-0.028619,-0.022943,-0.688612,-0.010813,-0.050199,-0.084315,-0.324904,-0.698269,-0.148374,-0.10627,-0.498183,-0.109172,-0.127787,-0.251261,-0.13373,0.0,-0.022808,-0.275626,-0.22283,-0.178294,-0.38366,-0.229282,-0.148959,-0.131424,0.0,-0.025366,-0.062732,-0.182621,-0.345572,-0.06128,-0.244227,-0.268787,-0.183367,0.0,-0.455245,-0.108709,-0.27039,-0.349524,-0.134427,-0.071514,-0.098413,-0.197648,-0.007646,-0.141092,0.0,-0.234686
1,-1.605205,0.218248,-1.082445,1.843591,-0.484254,-0.014845,-2.237741,1.424978,-0.419463,-1.429717,0.214076,-0.433776,-1.898361,-0.997425,-1.191247,-0.405478,0.554391,,-0.064783,-0.795991,-0.701295,-0.751846,-0.707134,-0.656924,-0.720184,-0.927736,-0.205823,0.613424,-0.41156,-0.228025,1.190669,0.330216,-0.503027,-1.271656,-0.655212,-0.030551,1.820102,-1.133044,0.101398,0.0,0.000608,-0.366244,-0.678793,-0.518161,-0.122239,1.057881,-0.018941,0.045849,0.0,-1.053853,-0.339997,0.085399,-0.218683,-0.110634,-0.161563,-0.099679,-0.074351,-0.68159,-0.366625,-0.026046,-0.455801,-0.389333,0.558959,0.470038,3.708714,-0.904357,1.825874,0.420646,0.617362,-0.352228,-0.482986,-0.094914,-0.283664,-0.29354,-0.284559,-0.228807,-0.096464,-0.190209,-0.160769,0.213635,-0.566042,0.961675,-0.110548,-0.381766,-0.27279,-0.059566,-0.066982,-0.077493,-0.208518,-0.221912,-0.130562,-0.605416,-0.332775,-0.043912,-0.03681,-0.028619,-0.022943,-0.688612,-0.010813,-0.050199,-0.084315,-0.324904,-0.698269,-0.148374,-0.10627,-0.498183,-0.109172,-0.127787,-0.251261,-0.13373,0.0,-0.022808,-0.275626,-0.22283,-0.178294,-0.38366,-0.229282,-0.148959,-0.131424,0.0,-0.025366,-0.062732,-0.182621,1.927285,-0.06128,-0.244227,-0.268787,-0.183367,0.0,-0.455245,-0.108709,-0.27039,-0.349524,-0.134427,-0.071514,-0.098413,-0.197648,-0.007646,-0.141092,0.0,-0.234686
2,1.398331,-2.365687,1.828663,-1.312711,1.62859,-0.014845,0.327945,-0.260409,-1.196015,-1.429717,-0.140776,-0.433776,-0.982105,-0.997425,-0.627734,1.060417,0.764573,,0.071574,2.335516,1.134695,0.7555,0.064778,-0.656924,0.353105,0.147671,-0.936835,-0.427157,-0.41156,0.15259,1.432042,2.573983,1.672381,-0.595709,1.823763,-0.50879,1.838284,0.132082,-1.463461,0.0,-0.62583,-0.901319,-0.678793,-0.518161,1.891583,1.657646,-0.658641,-0.659861,0.0,1.459087,1.121175,-0.55392,-0.218683,-0.110634,-0.161563,-0.099679,-0.074351,2.947626,-0.366625,2.092383,-2.070429,-2.680384,-0.740221,2.102751,-0.430846,1.849604,1.123217,-0.792435,-0.184089,-0.352228,3.330211,-0.094914,-0.283664,-0.29354,-0.284559,-0.228807,-0.096464,-0.190209,-0.160769,-0.424732,-0.566042,-0.415239,-0.110548,1.92108,-0.27279,-0.059566,-0.066982,-0.077493,-0.208518,-0.221912,-0.130562,-0.605416,-0.332775,-0.043912,-0.03681,-0.028619,-0.022943,-0.688612,-0.010813,-0.050199,-0.084315,1.522263,3.34504,-0.148374,-0.10627,-0.498183,-0.109172,-0.127787,-0.251261,-0.13373,0.0,-0.022808,1.963279,-0.22283,4.821821,1.175684,-0.229282,-0.148959,-0.131424,0.0,-0.025366,-0.062732,-0.182621,-0.345572,-0.06128,-0.244227,-0.268787,-0.183367,0.0,-0.455245,-0.108709,-0.27039,-0.349524,-0.134427,-0.071514,-0.098413,-0.197648,-0.007646,-0.141092,0.0,-0.234686
3,1.206574,0.419689,-0.924243,0.910747,-0.078297,-0.014845,0.721672,-0.496416,1.142885,0.667143,-0.114735,-0.433776,0.433585,0.623813,0.534924,-0.333824,-0.394817,,-0.04832,0.347912,0.548831,0.647602,-0.121534,-0.33311,0.353105,-0.390033,1.275488,-0.427157,-0.41156,-0.760029,-0.732335,0.084522,0.053625,0.369374,-0.159417,-0.05303,1.128007,0.761272,-0.352634,0.0,-0.62583,0.6,0.774954,-0.518161,-0.668108,0.450307,-0.018941,0.886788,0.0,-0.216751,0.329009,0.069736,-0.218683,-0.110634,-0.161563,-0.099679,-0.074351,0.699183,-0.366625,0.162434,-0.156959,-0.142751,-0.324533,0.034772,0.604044,0.01363,0.420559,1.633728,-0.51037,-0.352228,-0.482986,-0.094914,-0.283664,3.237819,-0.284559,-0.228807,-0.096464,-0.190209,-0.160769,1.490369,-0.566042,0.961675,-0.110548,-0.381766,-0.27279,-0.059566,-0.066982,-0.077493,-0.208518,-0.221912,-0.130562,0.709239,-0.332775,-0.043912,-0.03681,-0.028619,-0.022943,-0.042916,-0.010813,-0.050199,-0.084315,-0.324904,-0.698269,-0.148374,-0.10627,0.376539,-0.109172,-0.127787,-0.251261,-0.13373,0.0,-0.022808,-0.275626,-0.22283,-0.178294,-0.38366,-0.229282,-0.148959,-0.131424,0.0,-0.025366,-0.062732,5.396125,-0.345572,-0.06128,-0.244227,-0.268787,-0.183367,0.0,3.697183,-0.108709,-0.27039,-0.349524,-0.134427,-0.071514,-0.098413,-0.197648,-0.007646,-0.141092,0.0,-0.234686
4,-1.457194,-0.775753,1.084335,-1.643368,0.452624,-0.014845,-1.006963,0.608476,-0.670701,-1.429717,-0.129727,-0.433776,-1.024895,-0.997425,-0.480676,0.192574,1.266298,,-0.003937,3.113549,3.0816,0.7555,0.219161,-0.656924,-0.720184,-0.927736,-0.936835,-0.427157,-0.41156,-0.971481,-0.409428,-0.338899,0.651518,-1.271656,2.319558,-0.468625,1.16868,-0.317248,-1.463461,0.0,-0.62583,1.200638,-0.678793,-0.518161,0.420647,-0.741414,1.580308,-0.659861,0.0,-0.455534,-0.339997,-0.55392,-0.218683,-0.110634,-0.161563,-0.099679,-0.074351,2.109089,-0.366625,2.434943,-0.484065,0.718696,0.558959,2.325647,0.604044,0.931617,1.123217,-0.792435,-3.129457,-0.352228,4.09285,-0.094914,-0.283664,-0.29354,-0.284559,-0.228807,-0.096464,-0.190209,-0.160769,-1.063099,0.059736,3.715504,-0.110548,1.92108,-0.27279,-0.059566,-0.066982,-0.077493,-0.208518,-0.221912,-0.130562,-0.605416,-0.332775,-0.043912,-0.03681,-0.028619,-0.022943,-0.688612,-0.010813,-0.050199,-0.084315,-0.324904,1.99727,-0.148374,-0.10627,-0.498183,-0.109172,-0.127787,-0.251261,-0.13373,0.0,-0.022808,-0.275626,-0.22283,-0.178294,-0.38366,-0.229282,-0.148959,-0.131424,0.0,-0.025366,-0.062732,-0.182621,-0.345572,-0.06128,-0.244227,-0.268787,-0.183367,0.0,-0.455245,-0.108709,-0.27039,-0.349524,-0.134427,-0.071514,-0.098413,-0.197648,-0.007646,-0.141092,0.0,-0.234686


In [None]:
# Total number of missing cells produced by scaling (non-zero means some column could not be standardised).
scaled_DF.isnull().sum().sum()

In [None]:
# Attach the class labels to the scaled descriptors column-wise.
# pd.concat(axis=1) aligns rows on index labels. scaled_DF is built from a plain
# array and therefore has a 0..n-1 index, whereas y (taken from merged_df) still
# carries the gapped row labels left by de-duplication and NaN removal. Both
# indexes are reset so rows are paired by position instead of being misaligned
# and padded with NaN.
if len(scaled_DF) != len(y):
    raise ValueError("scaled_DF and y must contain the same number of rows")
final_df = pd.concat(
    [scaled_DF.reset_index(drop=True), y.reset_index(drop=True)],
    axis=1,
)

In [None]:
# (rows, columns) of the combined table; the row count should match merged_df.
final_df.shape

In [61]:
# Save the scaled descriptors plus labels without the row index.
final_df.to_csv('rdkit_scaled_data.csv', index=False)

In [62]:
# Reload the file that was just written, so the modelling steps start from the saved CSV.
RD = pd.read_csv('rdkit_scaled_data.csv')

In [63]:
# Split the reloaded table into the class label and the feature matrix.
# Note that this rebinds y, which earlier referred to merged_df['values'].
y = RD['values']
x = RD.drop(columns=['values'])


In [64]:
# Preview the feature matrix (every column except 'values').
x.head()

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MinAbsEStateIndex,qed,MolWt,NumRadicalElectrons,MaxPartialCharge,MinPartialCharge,FpDensityMorgan1,BCUT2D_MWHI,BCUT2D_MWLOW,BCUT2D_CHGHI,BCUT2D_CHGLO,BCUT2D_MRHI,BCUT2D_MRLOW,BalabanJ,HallKierAlpha,Kappa3,PEOE_VSA1,PEOE_VSA10,PEOE_VSA11,PEOE_VSA12,PEOE_VSA13,PEOE_VSA14,PEOE_VSA2,PEOE_VSA3,PEOE_VSA4,PEOE_VSA5,PEOE_VSA6,PEOE_VSA7,PEOE_VSA8,PEOE_VSA9,SMR_VSA10,SMR_VSA2,SMR_VSA3,SMR_VSA4,SMR_VSA6,SMR_VSA7,SMR_VSA8,SMR_VSA9,SlogP_VSA1,SlogP_VSA10,SlogP_VSA12,SlogP_VSA3,SlogP_VSA4,SlogP_VSA7,SlogP_VSA8,SlogP_VSA9,EState_VSA10,EState_VSA11,EState_VSA2,EState_VSA3,EState_VSA4,EState_VSA5,EState_VSA6,EState_VSA7,EState_VSA8,VSA_EState10,VSA_EState3,VSA_EState4,VSA_EState5,VSA_EState9,FractionCSP3,NumAliphaticCarbocycles,NumAliphaticHeterocycles,NumAliphaticRings,NumAromaticHeterocycles,MolLogP,fr_Al_COO,fr_Al_OH,fr_Al_OH_noTert,fr_ArN,fr_Ar_COO,fr_Ar_NH,fr_Ar_OH,fr_C_S,fr_HOCCN,fr_Imine,fr_NH0,fr_NH1,fr_NH2,fr_N_O,fr_Ndealkylation1,fr_Ndealkylation2,fr_SH,fr_aldehyde,fr_alkyl_carbamate,fr_alkyl_halide,fr_allylic_oxid,fr_amidine,fr_aniline,fr_aryl_methyl,fr_azide,fr_azo,fr_barbitur,fr_benzodiazepine,fr_bicyclic,fr_diazo,fr_dihydropyridine,fr_epoxide,fr_ester,fr_ether,fr_furan,fr_guanido,fr_halogen,fr_hdrzine,fr_hdrzone,fr_imidazole,fr_imide,fr_isocyan,fr_isothiocyan,fr_ketone,fr_lactam,fr_lactone,fr_methoxy,fr_morpholine,fr_nitrile,fr_nitro,fr_nitro_arom_nonortho,fr_nitroso,fr_oxazole,fr_oxime,fr_para_hydroxylation,fr_phos_acid,fr_piperdine,fr_piperzine,fr_priamide,fr_prisulfonamd,fr_pyridine,fr_quatN,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,-1.080521,0.181166,1.37085,-0.472071,-1.383086,-0.014845,1.946663,-0.429034,0.930258,1.488318,0.71569,2.02359,2.794739,0.190559,1.873744,1.386061,1.388339,-0.060403,-0.255046,-0.894733,-0.751846,-0.398369,-0.33311,1.672281,-0.927736,0.524562,-0.427157,-0.41156,-0.971481,-1.466445,-1.507449,-1.436096,-0.506354,0.832173,-0.988924,-0.62194,-1.133044,-1.463461,0,-0.62583,-0.901319,0.003991,0.222444,-0.689138,-0.741414,-0.658641,-0.482273,0,-0.236785,-0.339997,-0.55392,-0.218683,-0.110634,-0.161563,-0.099679,-0.074351,0.221412,-0.366625,-0.107123,0.668339,0.718696,-0.556871,-1.797929,-0.430846,-0.904357,-0.984755,-0.792435,-0.957792,-0.352228,-0.482986,-0.094914,-0.283664,-0.29354,-0.284559,-0.228807,-0.096464,-0.190209,-0.160769,-1.063099,-0.566042,-0.415239,-0.110548,-0.381766,-0.27279,-0.059566,-0.066982,-0.077493,-0.208518,-0.221912,-0.130562,-0.605416,-0.332775,-0.043912,-0.03681,-0.028619,-0.022943,-0.688612,-0.010813,-0.050199,-0.084315,-0.324904,-0.698269,-0.148374,-0.10627,-0.498183,-0.109172,-0.127787,-0.251261,-0.13373,0,-0.022808,-0.275626,-0.22283,-0.178294,-0.38366,-0.229282,-0.148959,-0.131424,0,-0.025366,-0.062732,-0.182621,-0.345572,-0.06128,-0.244227,-0.268787,-0.183367,0,-0.455245,-0.108709,-0.27039,-0.349524,-0.134427,-0.071514,-0.098413,-0.197648,-0.007646,-0.141092,0,-0.234686
1,-1.605205,0.218248,-1.082445,1.843591,-0.484254,-0.014845,-2.237741,1.424978,-0.419463,-1.429717,0.214076,-0.433776,-1.898361,-0.997425,-1.191247,-0.405478,0.554391,-0.064783,-0.795991,-0.701295,-0.751846,-0.707134,-0.656924,-0.720184,-0.927736,-0.205823,0.613424,-0.41156,-0.228025,1.190669,0.330216,-0.503027,-1.271656,-0.655212,-0.030551,1.820102,-1.133044,0.101398,0,0.000608,-0.366244,-0.678793,-0.518161,-0.122239,1.057881,-0.018941,0.045849,0,-1.053853,-0.339997,0.085399,-0.218683,-0.110634,-0.161563,-0.099679,-0.074351,-0.68159,-0.366625,-0.026046,-0.455801,-0.389333,0.558959,0.470038,3.708714,-0.904357,1.825874,0.420646,0.617362,-0.352228,-0.482986,-0.094914,-0.283664,-0.29354,-0.284559,-0.228807,-0.096464,-0.190209,-0.160769,0.213635,-0.566042,0.961675,-0.110548,-0.381766,-0.27279,-0.059566,-0.066982,-0.077493,-0.208518,-0.221912,-0.130562,-0.605416,-0.332775,-0.043912,-0.03681,-0.028619,-0.022943,-0.688612,-0.010813,-0.050199,-0.084315,-0.324904,-0.698269,-0.148374,-0.10627,-0.498183,-0.109172,-0.127787,-0.251261,-0.13373,0,-0.022808,-0.275626,-0.22283,-0.178294,-0.38366,-0.229282,-0.148959,-0.131424,0,-0.025366,-0.062732,-0.182621,1.927285,-0.06128,-0.244227,-0.268787,-0.183367,0,-0.455245,-0.108709,-0.27039,-0.349524,-0.134427,-0.071514,-0.098413,-0.197648,-0.007646,-0.141092,0,-0.234686
2,1.398331,-2.365687,1.828663,-1.312711,1.62859,-0.014845,0.327945,-0.260409,-1.196015,-1.429717,-0.140776,-0.433776,-0.982105,-0.997425,-0.627734,1.060417,0.764573,0.071574,2.335516,1.134695,0.7555,0.064778,-0.656924,0.353105,0.147671,-0.936835,-0.427157,-0.41156,0.15259,1.432042,2.573983,1.672381,-0.595709,1.823763,-0.50879,1.838284,0.132082,-1.463461,0,-0.62583,-0.901319,-0.678793,-0.518161,1.891583,1.657646,-0.658641,-0.659861,0,1.459087,1.121175,-0.55392,-0.218683,-0.110634,-0.161563,-0.099679,-0.074351,2.947626,-0.366625,2.092383,-2.070429,-2.680384,-0.740221,2.102751,-0.430846,1.849604,1.123217,-0.792435,-0.184089,-0.352228,3.330211,-0.094914,-0.283664,-0.29354,-0.284559,-0.228807,-0.096464,-0.190209,-0.160769,-0.424732,-0.566042,-0.415239,-0.110548,1.92108,-0.27279,-0.059566,-0.066982,-0.077493,-0.208518,-0.221912,-0.130562,-0.605416,-0.332775,-0.043912,-0.03681,-0.028619,-0.022943,-0.688612,-0.010813,-0.050199,-0.084315,1.522263,3.34504,-0.148374,-0.10627,-0.498183,-0.109172,-0.127787,-0.251261,-0.13373,0,-0.022808,1.963279,-0.22283,4.821821,1.175684,-0.229282,-0.148959,-0.131424,0,-0.025366,-0.062732,-0.182621,-0.345572,-0.06128,-0.244227,-0.268787,-0.183367,0,-0.455245,-0.108709,-0.27039,-0.349524,-0.134427,-0.071514,-0.098413,-0.197648,-0.007646,-0.141092,0,-0.234686
3,1.206574,0.419689,-0.924243,0.910747,-0.078297,-0.014845,0.721672,-0.496416,1.142885,0.667143,-0.114735,-0.433776,0.433585,0.623813,0.534924,-0.333824,-0.394817,-0.04832,0.347912,0.548831,0.647602,-0.121534,-0.33311,0.353105,-0.390033,1.275488,-0.427157,-0.41156,-0.760029,-0.732335,0.084522,0.053625,0.369374,-0.159417,-0.05303,1.128007,0.761272,-0.352634,0,-0.62583,0.6,0.774954,-0.518161,-0.668108,0.450307,-0.018941,0.886788,0,-0.216751,0.329009,0.069736,-0.218683,-0.110634,-0.161563,-0.099679,-0.074351,0.699183,-0.366625,0.162434,-0.156959,-0.142751,-0.324533,0.034772,0.604044,0.01363,0.420559,1.633728,-0.51037,-0.352228,-0.482986,-0.094914,-0.283664,3.237819,-0.284559,-0.228807,-0.096464,-0.190209,-0.160769,1.490369,-0.566042,0.961675,-0.110548,-0.381766,-0.27279,-0.059566,-0.066982,-0.077493,-0.208518,-0.221912,-0.130562,0.709239,-0.332775,-0.043912,-0.03681,-0.028619,-0.022943,-0.042916,-0.010813,-0.050199,-0.084315,-0.324904,-0.698269,-0.148374,-0.10627,0.376539,-0.109172,-0.127787,-0.251261,-0.13373,0,-0.022808,-0.275626,-0.22283,-0.178294,-0.38366,-0.229282,-0.148959,-0.131424,0,-0.025366,-0.062732,5.396125,-0.345572,-0.06128,-0.244227,-0.268787,-0.183367,0,3.697183,-0.108709,-0.27039,-0.349524,-0.134427,-0.071514,-0.098413,-0.197648,-0.007646,-0.141092,0,-0.234686
4,-1.457194,-0.775753,1.084335,-1.643368,0.452624,-0.014845,-1.006963,0.608476,-0.670701,-1.429717,-0.129727,-0.433776,-1.024895,-0.997425,-0.480676,0.192574,1.266298,-0.003937,3.113549,3.0816,0.7555,0.219161,-0.656924,-0.720184,-0.927736,-0.936835,-0.427157,-0.41156,-0.971481,-0.409428,-0.338899,0.651518,-1.271656,2.319558,-0.468625,1.16868,-0.317248,-1.463461,0,-0.62583,1.200638,-0.678793,-0.518161,0.420647,-0.741414,1.580308,-0.659861,0,-0.455534,-0.339997,-0.55392,-0.218683,-0.110634,-0.161563,-0.099679,-0.074351,2.109089,-0.366625,2.434943,-0.484065,0.718696,0.558959,2.325647,0.604044,0.931617,1.123217,-0.792435,-3.129457,-0.352228,4.09285,-0.094914,-0.283664,-0.29354,-0.284559,-0.228807,-0.096464,-0.190209,-0.160769,-1.063099,0.059735,3.715504,-0.110548,1.92108,-0.27279,-0.059566,-0.066982,-0.077493,-0.208518,-0.221912,-0.130562,-0.605416,-0.332775,-0.043912,-0.03681,-0.028619,-0.022943,-0.688612,-0.010813,-0.050199,-0.084315,-0.324904,1.99727,-0.148374,-0.10627,-0.498183,-0.109172,-0.127787,-0.251261,-0.13373,0,-0.022808,-0.275626,-0.22283,-0.178294,-0.38366,-0.229282,-0.148959,-0.131424,0,-0.025366,-0.062732,-0.182621,-0.345572,-0.06128,-0.244227,-0.268787,-0.183367,0,-0.455245,-0.108709,-0.27039,-0.349524,-0.134427,-0.071514,-0.098413,-0.197648,-0.007646,-0.141092,0,-0.234686


In [65]:
# Preview the first rows of the target `y` (column "values") before resampling.
y.head()

Unnamed: 0,values
0,1
1,1
2,1
3,1
4,1


In [None]:
# Total number of missing entries in `x`: per-column null counts, summed over all columns.
x.isnull().sum().sum()

In [67]:
from imblearn.over_sampling import SMOTE

# Oversample with SMOTE so that classes 0 and 1 each end up with the same
# requested number of rows.
# NOTE(review): resampling is applied to the whole of `x`/`y` in this cell; if a
# train/test split is later taken from the result, synthetic rows could end up
# on both sides of the split — confirm this is intended.
SAMPLES_PER_CLASS = 11000
class_targets = {0: SAMPLES_PER_CLASS, 1: SAMPLES_PER_CLASS}
smote = SMOTE(sampling_strategy=class_targets, random_state=42)

# Fit on the feature table `x` and labels `y`, returning the augmented pair.
X_resampled, y_resampled = smote.fit_resample(x, y)

In [68]:
# Standardize the resampled features and rebuild a DataFrame that keeps the
# original column labels.
# NOTE(review): the feature preview printed earlier in the notebook already
# looks standardized; confirm that scaling again after SMOTE is intended.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X_resampled)
X_resampled_scaled = pd.DataFrame(scaled_values, columns=X_resampled.columns)

In [71]:
# Place the resampled labels next to the scaled features (labels become the
# right-most column); rows are matched on the index of the two objects.
final_df = pd.concat([X_resampled_scaled, y_resampled], axis="columns")

# Write the combined table to disk, omitting the row index.
output_csv = 'RDkit-2D_scal_aug_data.csv'
final_df.to_csv(output_csv, index=False)

In [69]:
# Preview the first rows of the scaled, resampled feature table.
X_resampled_scaled.head()

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MinAbsEStateIndex,qed,MolWt,NumRadicalElectrons,MaxPartialCharge,MinPartialCharge,FpDensityMorgan1,BCUT2D_MWHI,BCUT2D_MWLOW,BCUT2D_CHGHI,BCUT2D_CHGLO,BCUT2D_MRHI,BCUT2D_MRLOW,BalabanJ,HallKierAlpha,Kappa3,PEOE_VSA1,PEOE_VSA10,PEOE_VSA11,PEOE_VSA12,PEOE_VSA13,PEOE_VSA14,PEOE_VSA2,PEOE_VSA3,PEOE_VSA4,PEOE_VSA5,PEOE_VSA6,PEOE_VSA7,PEOE_VSA8,PEOE_VSA9,SMR_VSA10,SMR_VSA2,SMR_VSA3,SMR_VSA4,SMR_VSA6,SMR_VSA7,SMR_VSA8,SMR_VSA9,SlogP_VSA1,SlogP_VSA10,SlogP_VSA12,SlogP_VSA3,SlogP_VSA4,SlogP_VSA7,SlogP_VSA8,SlogP_VSA9,EState_VSA10,EState_VSA11,EState_VSA2,EState_VSA3,EState_VSA4,EState_VSA5,EState_VSA6,EState_VSA7,EState_VSA8,VSA_EState10,VSA_EState3,VSA_EState4,VSA_EState5,VSA_EState9,FractionCSP3,NumAliphaticCarbocycles,NumAliphaticHeterocycles,NumAliphaticRings,NumAromaticHeterocycles,MolLogP,fr_Al_COO,fr_Al_OH,fr_Al_OH_noTert,fr_ArN,fr_Ar_COO,fr_Ar_NH,fr_Ar_OH,fr_C_S,fr_HOCCN,fr_Imine,fr_NH0,fr_NH1,fr_NH2,fr_N_O,fr_Ndealkylation1,fr_Ndealkylation2,fr_SH,fr_aldehyde,fr_alkyl_carbamate,fr_alkyl_halide,fr_allylic_oxid,fr_amidine,fr_aniline,fr_aryl_methyl,fr_azide,fr_azo,fr_barbitur,fr_benzodiazepine,fr_bicyclic,fr_diazo,fr_dihydropyridine,fr_epoxide,fr_ester,fr_ether,fr_furan,fr_guanido,fr_halogen,fr_hdrzine,fr_hdrzone,fr_imidazole,fr_imide,fr_isocyan,fr_isothiocyan,fr_ketone,fr_lactam,fr_lactone,fr_methoxy,fr_morpholine,fr_nitrile,fr_nitro,fr_nitro_arom_nonortho,fr_nitroso,fr_oxazole,fr_oxime,fr_para_hydroxylation,fr_phos_acid,fr_piperdine,fr_piperzine,fr_priamide,fr_prisulfonamd,fr_pyridine,fr_quatN,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,-1.087494,0.184938,1.35586,-0.492646,-1.397166,-0.01309,1.990064,-0.450123,0.934959,1.514625,0.793456,2.057611,2.807854,0.204433,1.902755,1.387607,1.402601,-0.063435,-0.245925,-0.901399,-0.759426,-0.399358,-0.335033,1.733959,-0.947518,0.549209,-0.434487,-0.40319,-0.979709,-1.491326,-1.525483,-1.442416,-0.494666,0.871015,-1.002864,-0.621,-1.125127,-1.46423,0.0,-0.640377,-0.914335,0.016257,0.241992,-0.70069,-0.741349,-0.666714,-0.47581,0.0,-0.232331,-0.336309,-0.552035,-0.219531,-0.110027,-0.162311,-0.102046,-0.074939,0.24347,-0.369777,-0.103562,0.659458,0.726244,-0.564955,-1.808499,-0.430983,-0.903394,-0.983262,-0.789543,-0.974885,-0.356889,-0.492406,-0.089717,-0.278579,-0.277826,-0.290204,-0.23341,-0.09708,-0.190522,-0.161202,-1.059784,-0.584379,-0.412612,-0.108315,-0.381783,-0.273154,-0.060203,-0.067344,-0.073133,-0.212754,-0.218858,-0.130515,-0.601011,-0.331572,-0.045481,-0.035118,-0.028616,-0.021325,-0.689301,-0.009535,-0.049142,-0.087195,-0.333152,-0.71363,-0.149814,-0.107656,-0.498325,-0.107523,-0.124451,-0.253989,-0.136756,0.0,-0.02149,-0.278361,-0.20891,-0.18201,-0.386595,-0.234379,-0.152121,-0.130848,0.0,-0.028616,-0.062684,-0.171476,-0.349649,-0.064823,-0.241483,-0.265537,-0.178146,0.0,-0.450301,-0.109776,-0.262007,-0.353118,-0.132878,-0.071864,-0.09503,-0.190349,-0.006742,-0.139942,0.0,-0.237666
1,-1.620405,0.22241,-1.098739,1.870766,-0.478515,-0.01309,-2.271368,1.445307,-0.431758,-1.425608,0.239492,-0.437626,-1.910615,-0.99109,-1.201756,-0.431694,0.552297,-0.068181,-0.805372,-0.702489,-0.759426,-0.714161,-0.671606,-0.731941,-0.947518,-0.192465,0.634161,-0.40319,-0.23088,1.206538,0.349922,-0.507008,-1.279521,-0.66508,-0.020221,1.833909,-1.125127,0.103963,0.0,-0.000943,-0.359178,-0.675827,-0.511374,-0.119503,1.059159,-0.004652,0.058322,0.0,-1.07081,-0.336309,0.107462,-0.219531,-0.110027,-0.162311,-0.102046,-0.074939,-0.688391,-0.369777,-0.020325,-0.472635,-0.401438,0.563001,0.466715,3.682938,-0.903394,1.850784,0.437342,0.616218,-0.356889,-0.492406,-0.089717,-0.278579,-0.277826,-0.290204,-0.23341,-0.09708,-0.190522,-0.161202,0.233283,-0.584379,1.037406,-0.108315,-0.381783,-0.273154,-0.060203,-0.067344,-0.073133,-0.212754,-0.218858,-0.130515,-0.601011,-0.331572,-0.045481,-0.035118,-0.028616,-0.021325,-0.689301,-0.009535,-0.049142,-0.087195,-0.333152,-0.71363,-0.149814,-0.107656,-0.498325,-0.107523,-0.124451,-0.253989,-0.136756,0.0,-0.02149,-0.278361,-0.20891,-0.18201,-0.386595,-0.234379,-0.152121,-0.130848,0.0,-0.028616,-0.062684,-0.171476,1.988342,-0.064823,-0.241483,-0.265537,-0.178146,0.0,-0.450301,-0.109776,-0.262007,-0.353118,-0.132878,-0.071864,-0.09503,-0.190349,-0.006742,-0.139942,0.0,-0.237666
2,1.430229,-2.388664,1.813916,-1.350621,1.680917,-0.01309,0.341549,-0.277731,-1.218088,-1.425608,-0.152393,-0.437626,-0.989406,-0.99109,-0.630977,1.056917,0.766601,0.079585,2.433247,1.185443,0.810166,0.072847,-0.671606,0.374292,0.157743,-0.934776,-0.434487,-0.40319,0.152485,1.451613,2.639769,1.673852,-0.586304,1.895078,-0.510572,1.852187,0.14149,-1.46423,0.0,-0.640377,-0.914335,-0.675827,-0.511374,1.945077,1.659328,-0.666714,-0.655419,0.0,1.507979,1.161327,-0.552035,-0.219531,-0.110027,-0.162311,-0.102046,-0.074939,3.056808,-0.369777,2.154536,-2.098684,-2.733128,-0.750297,2.104645,-0.430983,1.894249,1.142272,-0.789543,-0.193348,-0.356889,3.395675,-0.089717,-0.278579,-0.277826,-0.290204,-0.23341,-0.09708,-0.190522,-0.161202,-0.41325,-0.584379,-0.412612,-0.108315,1.944165,-0.273154,-0.060203,-0.067344,-0.073133,-0.212754,-0.218858,-0.130515,-0.601011,-0.331572,-0.045481,-0.035118,-0.028616,-0.021325,-0.689301,-0.009535,-0.049142,-0.087195,1.552941,3.40735,-0.149814,-0.107656,-0.498325,-0.107523,-0.124451,-0.253989,-0.136756,0.0,-0.02149,1.996413,-0.20891,4.803252,1.178345,-0.234379,-0.152121,-0.130848,0.0,-0.028616,-0.062684,-0.171476,-0.349649,-0.064823,-0.241483,-0.265537,-0.178146,0.0,-0.450301,-0.109776,-0.262007,-0.353118,-0.132878,-0.071864,-0.09503,-0.190349,-0.006742,-0.139942,0.0,-0.237666
3,1.235464,0.425966,-0.940453,0.918686,-0.063607,-0.01309,0.742523,-0.51901,1.150263,0.687203,-0.123635,-0.437626,0.433937,0.640437,0.546672,-0.35893,-0.415529,-0.050341,0.377658,0.583005,0.697812,-0.117109,-0.335033,0.374292,-0.394888,1.311743,-0.434487,-0.40319,-0.766729,-0.745958,0.099182,0.051039,0.403436,-0.153048,-0.04327,1.138168,0.771421,-0.351036,0.0,-0.640377,0.643328,0.797722,-0.511374,-0.67913,0.451175,-0.004652,0.908831,0.0,-0.211772,0.349392,0.091304,-0.219531,-0.110027,-0.162311,-0.102046,-0.074939,0.736509,-0.369777,0.173176,-0.171679,-0.150483,-0.330092,0.030058,0.597497,0.029154,0.433761,1.664227,-0.522932,-0.356889,-0.492406,-0.089717,-0.278579,3.443027,-0.290204,-0.23341,-0.09708,-0.190522,-0.161202,1.52635,-0.584379,1.037406,-0.108315,-0.381783,-0.273154,-0.060203,-0.067344,-0.073133,-0.212754,-0.218858,-0.130515,0.720907,-0.331572,-0.045481,-0.035118,-0.028616,-0.021325,-0.042219,-0.009535,-0.049142,-0.087195,-0.333152,-0.71363,-0.149814,-0.107656,0.398957,-0.107523,-0.124451,-0.253989,-0.136756,0.0,-0.02149,-0.278361,-0.20891,-0.18201,-0.386595,-0.234379,-0.152121,-0.130848,0.0,-0.028616,-0.062684,5.790646,-0.349649,-0.064823,-0.241483,-0.265537,-0.178146,0.0,3.819079,-0.109776,-0.262007,-0.353118,-0.132878,-0.071864,-0.09503,-0.190349,-0.006742,-0.139942,0.0,-0.237666
4,-1.470073,-0.782032,1.069193,-1.688096,0.479021,-0.01309,-1.017933,0.610565,-0.686159,-1.425608,-0.140191,-0.437626,-1.032427,-0.99109,-0.482023,0.175625,1.278166,-0.002244,3.237891,3.187427,0.810166,0.230248,-0.671606,-0.731941,-0.947518,-0.934776,-0.434487,-0.40319,-0.979709,-0.418098,-0.332935,0.65043,-1.279521,2.407109,-0.469389,1.179055,-0.30837,-1.46423,0.0,-0.640377,1.266507,-0.675827,-0.511374,0.437066,-0.741349,1.650506,-0.655419,0.0,-0.456812,-0.336309,-0.552035,-0.219531,-0.110027,-0.162311,-0.102046,-0.074939,2.191473,-0.369777,2.506222,-0.501099,0.726244,0.563001,2.328253,0.597497,0.961701,1.142272,-0.789543,-3.168537,-0.356889,4.173291,-0.089717,-0.278579,-0.277826,-0.290204,-0.23341,-0.09708,-0.190522,-0.161202,-1.059784,0.066341,3.937441,-0.108315,1.944165,-0.273154,-0.060203,-0.067344,-0.073133,-0.212754,-0.218858,-0.130515,-0.601011,-0.331572,-0.045481,-0.035118,-0.028616,-0.021325,-0.689301,-0.009535,-0.049142,-0.087195,-0.333152,2.03369,-0.149814,-0.107656,-0.498325,-0.107523,-0.124451,-0.253989,-0.136756,0.0,-0.02149,-0.278361,-0.20891,-0.18201,-0.386595,-0.234379,-0.152121,-0.130848,0.0,-0.028616,-0.062684,-0.171476,-0.349649,-0.064823,-0.241483,-0.265537,-0.178146,0.0,-0.450301,-0.109776,-0.262007,-0.353118,-0.132878,-0.071864,-0.09503,-0.190349,-0.006742,-0.139942,0.0,-0.237666


In [70]:
# Total number of missing entries in the scaled feature table (per-column null counts, summed).
X_resampled_scaled.isnull().sum().sum()

0

In [75]:
# Preview the first rows of the combined table; the label column "values" is last.
final_df.head()

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MinAbsEStateIndex,qed,MolWt,NumRadicalElectrons,MaxPartialCharge,MinPartialCharge,FpDensityMorgan1,BCUT2D_MWHI,BCUT2D_MWLOW,BCUT2D_CHGHI,BCUT2D_CHGLO,BCUT2D_MRHI,BCUT2D_MRLOW,BalabanJ,HallKierAlpha,Kappa3,PEOE_VSA1,PEOE_VSA10,PEOE_VSA11,PEOE_VSA12,PEOE_VSA13,PEOE_VSA14,PEOE_VSA2,PEOE_VSA3,PEOE_VSA4,PEOE_VSA5,PEOE_VSA6,PEOE_VSA7,PEOE_VSA8,PEOE_VSA9,SMR_VSA10,SMR_VSA2,SMR_VSA3,SMR_VSA4,SMR_VSA6,SMR_VSA7,SMR_VSA8,SMR_VSA9,SlogP_VSA1,SlogP_VSA10,SlogP_VSA12,SlogP_VSA3,SlogP_VSA4,SlogP_VSA7,SlogP_VSA8,SlogP_VSA9,EState_VSA10,EState_VSA11,EState_VSA2,EState_VSA3,EState_VSA4,EState_VSA5,EState_VSA6,EState_VSA7,EState_VSA8,VSA_EState10,VSA_EState3,VSA_EState4,VSA_EState5,VSA_EState9,FractionCSP3,NumAliphaticCarbocycles,NumAliphaticHeterocycles,NumAliphaticRings,NumAromaticHeterocycles,MolLogP,fr_Al_COO,fr_Al_OH,fr_Al_OH_noTert,fr_ArN,fr_Ar_COO,fr_Ar_NH,fr_Ar_OH,fr_C_S,fr_HOCCN,fr_Imine,fr_NH0,fr_NH1,fr_NH2,fr_N_O,fr_Ndealkylation1,fr_Ndealkylation2,fr_SH,fr_aldehyde,fr_alkyl_carbamate,fr_alkyl_halide,fr_allylic_oxid,fr_amidine,fr_aniline,fr_aryl_methyl,fr_azide,fr_azo,fr_barbitur,fr_benzodiazepine,fr_bicyclic,fr_diazo,fr_dihydropyridine,fr_epoxide,fr_ester,fr_ether,fr_furan,fr_guanido,fr_halogen,fr_hdrzine,fr_hdrzone,fr_imidazole,fr_imide,fr_isocyan,fr_isothiocyan,fr_ketone,fr_lactam,fr_lactone,fr_methoxy,fr_morpholine,fr_nitrile,fr_nitro,fr_nitro_arom_nonortho,fr_nitroso,fr_oxazole,fr_oxime,fr_para_hydroxylation,fr_phos_acid,fr_piperdine,fr_piperzine,fr_priamide,fr_prisulfonamd,fr_pyridine,fr_quatN,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,values
0,-1.087494,0.184938,1.35586,-0.492646,-1.397166,-0.01309,1.990064,-0.450123,0.934959,1.514625,0.793456,2.057611,2.807854,0.204433,1.902755,1.387607,1.402601,-0.063435,-0.245925,-0.901399,-0.759426,-0.399358,-0.335033,1.733959,-0.947518,0.549209,-0.434487,-0.40319,-0.979709,-1.491326,-1.525483,-1.442416,-0.494666,0.871015,-1.002864,-0.621,-1.125127,-1.46423,0.0,-0.640377,-0.914335,0.016257,0.241992,-0.70069,-0.741349,-0.666714,-0.47581,0.0,-0.232331,-0.336309,-0.552035,-0.219531,-0.110027,-0.162311,-0.102046,-0.074939,0.24347,-0.369777,-0.103562,0.659458,0.726244,-0.564955,-1.808499,-0.430983,-0.903394,-0.983262,-0.789543,-0.974885,-0.356889,-0.492406,-0.089717,-0.278579,-0.277826,-0.290204,-0.23341,-0.09708,-0.190522,-0.161202,-1.059784,-0.584379,-0.412612,-0.108315,-0.381783,-0.273154,-0.060203,-0.067344,-0.073133,-0.212754,-0.218858,-0.130515,-0.601011,-0.331572,-0.045481,-0.035118,-0.028616,-0.021325,-0.689301,-0.009535,-0.049142,-0.087195,-0.333152,-0.71363,-0.149814,-0.107656,-0.498325,-0.107523,-0.124451,-0.253989,-0.136756,0.0,-0.02149,-0.278361,-0.20891,-0.18201,-0.386595,-0.234379,-0.152121,-0.130848,0.0,-0.028616,-0.062684,-0.171476,-0.349649,-0.064823,-0.241483,-0.265537,-0.178146,0.0,-0.450301,-0.109776,-0.262007,-0.353118,-0.132878,-0.071864,-0.09503,-0.190349,-0.006742,-0.139942,0.0,-0.237666,1
1,-1.620405,0.22241,-1.098739,1.870766,-0.478515,-0.01309,-2.271368,1.445307,-0.431758,-1.425608,0.239492,-0.437626,-1.910615,-0.99109,-1.201756,-0.431694,0.552297,-0.068181,-0.805372,-0.702489,-0.759426,-0.714161,-0.671606,-0.731941,-0.947518,-0.192465,0.634161,-0.40319,-0.23088,1.206538,0.349922,-0.507008,-1.279521,-0.66508,-0.020221,1.833909,-1.125127,0.103963,0.0,-0.000943,-0.359178,-0.675827,-0.511374,-0.119503,1.059159,-0.004652,0.058322,0.0,-1.07081,-0.336309,0.107462,-0.219531,-0.110027,-0.162311,-0.102046,-0.074939,-0.688391,-0.369777,-0.020325,-0.472635,-0.401438,0.563001,0.466715,3.682938,-0.903394,1.850784,0.437342,0.616218,-0.356889,-0.492406,-0.089717,-0.278579,-0.277826,-0.290204,-0.23341,-0.09708,-0.190522,-0.161202,0.233283,-0.584379,1.037406,-0.108315,-0.381783,-0.273154,-0.060203,-0.067344,-0.073133,-0.212754,-0.218858,-0.130515,-0.601011,-0.331572,-0.045481,-0.035118,-0.028616,-0.021325,-0.689301,-0.009535,-0.049142,-0.087195,-0.333152,-0.71363,-0.149814,-0.107656,-0.498325,-0.107523,-0.124451,-0.253989,-0.136756,0.0,-0.02149,-0.278361,-0.20891,-0.18201,-0.386595,-0.234379,-0.152121,-0.130848,0.0,-0.028616,-0.062684,-0.171476,1.988342,-0.064823,-0.241483,-0.265537,-0.178146,0.0,-0.450301,-0.109776,-0.262007,-0.353118,-0.132878,-0.071864,-0.09503,-0.190349,-0.006742,-0.139942,0.0,-0.237666,1
2,1.430229,-2.388664,1.813916,-1.350621,1.680917,-0.01309,0.341549,-0.277731,-1.218088,-1.425608,-0.152393,-0.437626,-0.989406,-0.99109,-0.630977,1.056917,0.766601,0.079585,2.433247,1.185443,0.810166,0.072847,-0.671606,0.374292,0.157743,-0.934776,-0.434487,-0.40319,0.152485,1.451613,2.639769,1.673852,-0.586304,1.895078,-0.510572,1.852187,0.14149,-1.46423,0.0,-0.640377,-0.914335,-0.675827,-0.511374,1.945077,1.659328,-0.666714,-0.655419,0.0,1.507979,1.161327,-0.552035,-0.219531,-0.110027,-0.162311,-0.102046,-0.074939,3.056808,-0.369777,2.154536,-2.098684,-2.733128,-0.750297,2.104645,-0.430983,1.894249,1.142272,-0.789543,-0.193348,-0.356889,3.395675,-0.089717,-0.278579,-0.277826,-0.290204,-0.23341,-0.09708,-0.190522,-0.161202,-0.41325,-0.584379,-0.412612,-0.108315,1.944165,-0.273154,-0.060203,-0.067344,-0.073133,-0.212754,-0.218858,-0.130515,-0.601011,-0.331572,-0.045481,-0.035118,-0.028616,-0.021325,-0.689301,-0.009535,-0.049142,-0.087195,1.552941,3.40735,-0.149814,-0.107656,-0.498325,-0.107523,-0.124451,-0.253989,-0.136756,0.0,-0.02149,1.996413,-0.20891,4.803252,1.178345,-0.234379,-0.152121,-0.130848,0.0,-0.028616,-0.062684,-0.171476,-0.349649,-0.064823,-0.241483,-0.265537,-0.178146,0.0,-0.450301,-0.109776,-0.262007,-0.353118,-0.132878,-0.071864,-0.09503,-0.190349,-0.006742,-0.139942,0.0,-0.237666,1
3,1.235464,0.425966,-0.940453,0.918686,-0.063607,-0.01309,0.742523,-0.51901,1.150263,0.687203,-0.123635,-0.437626,0.433937,0.640437,0.546672,-0.35893,-0.415529,-0.050341,0.377658,0.583005,0.697812,-0.117109,-0.335033,0.374292,-0.394888,1.311743,-0.434487,-0.40319,-0.766729,-0.745958,0.099182,0.051039,0.403436,-0.153048,-0.04327,1.138168,0.771421,-0.351036,0.0,-0.640377,0.643328,0.797722,-0.511374,-0.67913,0.451175,-0.004652,0.908831,0.0,-0.211772,0.349392,0.091304,-0.219531,-0.110027,-0.162311,-0.102046,-0.074939,0.736509,-0.369777,0.173176,-0.171679,-0.150483,-0.330092,0.030058,0.597497,0.029154,0.433761,1.664227,-0.522932,-0.356889,-0.492406,-0.089717,-0.278579,3.443027,-0.290204,-0.23341,-0.09708,-0.190522,-0.161202,1.52635,-0.584379,1.037406,-0.108315,-0.381783,-0.273154,-0.060203,-0.067344,-0.073133,-0.212754,-0.218858,-0.130515,0.720907,-0.331572,-0.045481,-0.035118,-0.028616,-0.021325,-0.042219,-0.009535,-0.049142,-0.087195,-0.333152,-0.71363,-0.149814,-0.107656,0.398957,-0.107523,-0.124451,-0.253989,-0.136756,0.0,-0.02149,-0.278361,-0.20891,-0.18201,-0.386595,-0.234379,-0.152121,-0.130848,0.0,-0.028616,-0.062684,5.790646,-0.349649,-0.064823,-0.241483,-0.265537,-0.178146,0.0,3.819079,-0.109776,-0.262007,-0.353118,-0.132878,-0.071864,-0.09503,-0.190349,-0.006742,-0.139942,0.0,-0.237666,1
4,-1.470073,-0.782032,1.069193,-1.688096,0.479021,-0.01309,-1.017933,0.610565,-0.686159,-1.425608,-0.140191,-0.437626,-1.032427,-0.99109,-0.482023,0.175625,1.278166,-0.002244,3.237891,3.187427,0.810166,0.230248,-0.671606,-0.731941,-0.947518,-0.934776,-0.434487,-0.40319,-0.979709,-0.418098,-0.332935,0.65043,-1.279521,2.407109,-0.469389,1.179055,-0.30837,-1.46423,0.0,-0.640377,1.266507,-0.675827,-0.511374,0.437066,-0.741349,1.650506,-0.655419,0.0,-0.456812,-0.336309,-0.552035,-0.219531,-0.110027,-0.162311,-0.102046,-0.074939,2.191473,-0.369777,2.506222,-0.501099,0.726244,0.563001,2.328253,0.597497,0.961701,1.142272,-0.789543,-3.168537,-0.356889,4.173291,-0.089717,-0.278579,-0.277826,-0.290204,-0.23341,-0.09708,-0.190522,-0.161202,-1.059784,0.066341,3.937441,-0.108315,1.944165,-0.273154,-0.060203,-0.067344,-0.073133,-0.212754,-0.218858,-0.130515,-0.601011,-0.331572,-0.045481,-0.035118,-0.028616,-0.021325,-0.689301,-0.009535,-0.049142,-0.087195,-0.333152,2.03369,-0.149814,-0.107656,-0.498325,-0.107523,-0.124451,-0.253989,-0.136756,0.0,-0.02149,-0.278361,-0.20891,-0.18201,-0.386595,-0.234379,-0.152121,-0.130848,0.0,-0.028616,-0.062684,-0.171476,-0.349649,-0.064823,-0.241483,-0.265537,-0.178146,0.0,-0.450301,-0.109776,-0.262007,-0.353118,-0.132878,-0.071864,-0.09503,-0.190349,-0.006742,-0.139942,0.0,-0.237666,1


In [74]:
# Preview the last rows of the combined table (the end of the resampled data).
final_df.tail()

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MinAbsEStateIndex,qed,MolWt,NumRadicalElectrons,MaxPartialCharge,MinPartialCharge,FpDensityMorgan1,BCUT2D_MWHI,BCUT2D_MWLOW,BCUT2D_CHGHI,BCUT2D_CHGLO,BCUT2D_MRHI,BCUT2D_MRLOW,BalabanJ,HallKierAlpha,Kappa3,PEOE_VSA1,PEOE_VSA10,PEOE_VSA11,PEOE_VSA12,PEOE_VSA13,PEOE_VSA14,PEOE_VSA2,PEOE_VSA3,PEOE_VSA4,PEOE_VSA5,PEOE_VSA6,PEOE_VSA7,PEOE_VSA8,PEOE_VSA9,SMR_VSA10,SMR_VSA2,SMR_VSA3,SMR_VSA4,SMR_VSA6,SMR_VSA7,SMR_VSA8,SMR_VSA9,SlogP_VSA1,SlogP_VSA10,SlogP_VSA12,SlogP_VSA3,SlogP_VSA4,SlogP_VSA7,SlogP_VSA8,SlogP_VSA9,EState_VSA10,EState_VSA11,EState_VSA2,EState_VSA3,EState_VSA4,EState_VSA5,EState_VSA6,EState_VSA7,EState_VSA8,VSA_EState10,VSA_EState3,VSA_EState4,VSA_EState5,VSA_EState9,FractionCSP3,NumAliphaticCarbocycles,NumAliphaticHeterocycles,NumAliphaticRings,NumAromaticHeterocycles,MolLogP,fr_Al_COO,fr_Al_OH,fr_Al_OH_noTert,fr_ArN,fr_Ar_COO,fr_Ar_NH,fr_Ar_OH,fr_C_S,fr_HOCCN,fr_Imine,fr_NH0,fr_NH1,fr_NH2,fr_N_O,fr_Ndealkylation1,fr_Ndealkylation2,fr_SH,fr_aldehyde,fr_alkyl_carbamate,fr_alkyl_halide,fr_allylic_oxid,fr_amidine,fr_aniline,fr_aryl_methyl,fr_azide,fr_azo,fr_barbitur,fr_benzodiazepine,fr_bicyclic,fr_diazo,fr_dihydropyridine,fr_epoxide,fr_ester,fr_ether,fr_furan,fr_guanido,fr_halogen,fr_hdrzine,fr_hdrzone,fr_imidazole,fr_imide,fr_isocyan,fr_isothiocyan,fr_ketone,fr_lactam,fr_lactone,fr_methoxy,fr_morpholine,fr_nitrile,fr_nitro,fr_nitro_arom_nonortho,fr_nitroso,fr_oxazole,fr_oxime,fr_para_hydroxylation,fr_phos_acid,fr_piperdine,fr_piperzine,fr_priamide,fr_prisulfonamd,fr_pyridine,fr_quatN,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,values
21995,0.763016,-1.758244,3.705534,-1.57821,2.045318,-0.01309,0.278603,-0.564698,-0.990037,-1.425608,-0.144904,-0.437626,-1.002007,-0.99109,-0.835676,-0.347409,0.73135,0.064785,2.22055,-0.901399,2.227239,-0.241956,-0.335033,0.374292,-0.014651,-0.934776,-0.434487,-0.40319,1.280102,1.041256,3.815691,2.652266,-0.692718,1.383046,-1.002864,3.939617,-0.304621,-1.200946,0.0,-0.640377,-0.914335,-0.675827,-0.511374,2.342411,3.685556,-0.666714,-0.47581,0.0,3.189122,0.178913,-0.552035,-0.219531,-0.110027,-0.162311,-0.102046,-0.074939,1.220052,-0.369777,1.53846,-0.775338,-5.417335,-0.659869,2.05017,-0.430983,4.400982,3.046785,-0.789543,1.326119,1.727614,1.840442,-0.089717,-0.278579,-0.277826,-0.290204,-0.23341,-0.09708,-0.190522,-0.161202,-1.059784,-0.584379,-0.412612,-0.108315,-0.381783,-0.273154,-0.060203,-0.067344,-0.073133,-0.212754,0.347016,-0.130515,-0.601011,-0.331572,-0.045481,-0.035118,-0.028616,-0.021325,-0.689301,-0.009535,-0.049142,-0.087195,-0.333152,4.352494,-0.149814,-0.107656,-0.498325,-0.107523,-0.124451,-0.253989,-0.136756,0.0,-0.02149,1.286792,-0.20891,-0.18201,1.178345,-0.234379,-0.152121,-0.130848,0.0,-0.028616,-0.062684,-0.171476,-0.349649,-0.064823,-0.241483,-0.265537,-0.178146,0.0,-0.450301,-0.109776,-0.262007,-0.353118,-0.132878,-0.071864,-0.09503,-0.190349,-0.006742,-0.139942,0.0,-0.237666,1
21996,-0.251118,0.330677,-0.92645,-0.103778,0.057515,-0.01309,-0.421182,0.912161,-0.43349,0.489955,-0.150869,-0.437626,-0.397834,0.045384,-0.393658,-0.463695,-0.529917,-0.039864,-0.666326,0.03859,-0.24188,-0.506366,0.216432,-0.731941,0.162614,0.341745,1.27276,-0.40319,-0.378726,1.277984,0.181539,0.074763,-0.615329,-0.66508,1.765429,-0.621,-0.067587,0.509806,0.0,0.837884,-0.235052,-0.406467,-0.511374,-0.719375,1.101651,-0.560539,1.942159,0.0,-0.695643,-0.336309,0.505092,-0.219531,-0.110027,-0.162311,-0.102046,-0.074939,0.560988,-0.369777,0.098256,-0.632843,0.313279,0.28739,-0.155215,-0.430983,-0.269954,-0.502001,3.724484,0.606651,-0.356889,-0.492406,-0.089717,-0.278579,-0.277826,-0.290204,-0.23341,-0.09708,-0.190522,-0.161202,2.612046,-0.375666,-0.412612,-0.108315,-0.381783,2.078968,-0.060203,-0.067344,-0.073133,-0.212754,-0.218858,-0.130515,-0.177016,7.272051,-0.045481,-0.035118,-0.028616,-0.021325,0.397316,-0.009535,-0.049142,-0.087195,-0.333152,-0.493335,-0.149814,-0.107656,-0.498325,-0.107523,-0.124451,-0.253989,-0.136756,0.0,-0.02149,-0.278361,-0.20891,-0.18201,0.115348,-0.234379,-0.152121,-0.130848,0.0,-0.028616,-0.062684,-0.171476,-0.349649,-0.064823,1.762355,-0.265537,-0.178146,0.0,0.999703,-0.109776,-0.262007,-0.353118,-0.132878,-0.071864,-0.09503,-0.190349,-0.006742,-0.139942,0.0,-0.237666,1
21997,0.308656,-0.152637,0.098623,0.483339,0.025123,-0.01309,0.449143,-0.768483,0.444955,-0.902711,-0.124174,-0.437626,-0.641013,-0.923798,-0.270949,-0.248074,-0.015713,0.010106,0.969119,-0.068086,-0.405773,0.092905,-0.671606,0.385746,-0.394888,-0.17459,-0.434487,-0.40319,-0.592306,-0.64559,-0.269767,2.057464,-0.239881,-0.153048,0.022904,-0.621,1.591508,-0.558867,0.0,0.006041,0.576169,0.837222,-0.511374,0.066907,-0.741349,-0.004652,-0.655419,0.0,-0.211772,-0.336309,-0.552035,-0.219531,-0.110027,-0.162311,-0.102046,-0.074939,0.213269,-0.369777,-0.311726,0.680576,0.347303,-0.310906,0.559553,-0.430983,0.961701,0.433761,-0.789543,-0.544498,-0.356889,0.28521,-0.089717,-0.278579,-0.277826,-0.290204,-0.23341,-0.09708,-0.190522,-0.161202,-0.41325,0.71706,-0.412612,-0.108315,-0.381783,-0.273154,-0.060203,-0.067344,-0.073133,-0.212754,-0.218858,-0.130515,0.720907,-0.331572,-0.045481,-0.035118,-0.028616,-0.021325,-0.689301,-0.009535,-0.049142,-0.087195,-0.333152,1.34686,-0.149814,-0.107656,-0.498325,-0.107523,-0.124451,-0.253989,-0.136756,0.0,-0.02149,-0.278361,-0.20891,-0.18201,1.178345,4.12646,-0.152121,-0.130848,0.0,-0.028616,-0.062684,-0.171476,-0.349649,-0.064823,-0.241483,-0.265537,-0.178146,0.0,-0.450301,-0.109776,-0.262007,-0.353118,-0.132878,-0.071864,-0.09503,-0.190349,-0.006742,-0.139942,0.0,3.955957,1
21998,0.317687,-0.563765,2.24248,0.737198,-0.460982,-0.01309,-1.766131,1.794376,-0.59679,-1.425608,-0.137079,-0.437626,-1.647061,-0.99109,-0.806122,0.010699,1.029297,-0.070864,-1.153925,-0.494007,-0.035103,-0.714161,-0.671606,-0.731941,0.157743,-0.934776,-0.434487,-0.40319,-0.183389,1.797819,0.598269,-1.393638,-0.597279,-0.66508,-1.002864,2.831142,-1.125127,-1.085583,0.0,-0.640377,-0.914335,-0.675827,-0.511374,-0.303252,2.609581,-0.666714,-0.655419,0.0,0.339803,-0.336309,-0.552035,-0.219531,-0.110027,-0.162311,-0.102046,-0.074939,-1.173139,-0.369777,-0.533208,-1.045566,-1.54388,0.563001,1.530287,3.682938,-0.903394,1.850784,-0.789543,0.962923,-0.356889,-0.492406,-0.089717,-0.278579,-0.277826,-0.290204,-0.23341,-0.09708,-0.190522,-0.161202,-1.059784,-0.584379,-0.412612,-0.108315,-0.381783,-0.273154,-0.060203,-0.067344,-0.073133,-0.212754,0.603576,-0.130515,-0.601011,-0.331572,-0.045481,-0.035118,-0.028616,-0.021325,2.546108,-0.009535,-0.049142,-0.087195,-0.333152,-0.71363,-0.149814,-0.107656,-0.498325,-0.107523,-0.124451,-0.253989,-0.136756,0.0,-0.02149,4.271187,-0.20891,-0.18201,-0.386595,-0.234379,-0.152121,-0.130848,0.0,-0.028616,-0.062684,-0.171476,-0.349649,-0.064823,-0.241483,-0.265537,-0.178146,0.0,-0.450301,-0.109776,-0.262007,-0.353118,-0.132878,-0.071864,-0.09503,-0.190349,-0.006742,-0.139942,0.0,-0.237666,1
21999,0.68871,-0.252862,-0.123495,-0.400827,1.091422,-0.01309,0.397447,0.806871,-0.637971,0.556432,0.178911,-0.437626,-0.793027,0.160967,-0.434368,-0.554457,0.438757,0.003424,-0.145342,-0.495509,-0.334916,-0.065096,-0.39327,0.385746,0.187103,0.104354,-0.250825,-0.40319,0.061561,2.250805,1.697593,-0.910403,1.858195,-0.66508,0.478423,0.967849,0.507283,0.391508,0.0,-0.507922,-0.060542,0.414851,3.481254,0.151702,0.455858,1.069692,-0.655419,0.0,0.535009,-0.336309,-0.226549,-0.219531,-0.110027,-0.162311,-0.102046,-0.074939,2.01465,2.854892,-0.733424,-0.055754,-0.637463,0.563001,0.279693,0.391432,0.774857,0.858359,0.437342,1.299926,-0.356889,-0.492406,-0.089717,-0.278579,-0.277826,-0.290204,-0.23341,-0.09708,-0.190522,-0.161202,0.750278,-0.323624,1.037406,-0.108315,-0.381783,5.958615,-0.060203,-0.067344,-0.073133,-0.212754,-0.218858,-0.130515,-0.336153,-0.331572,-0.045481,-0.035118,-0.028616,-0.021325,0.345565,-0.009535,-0.049142,-0.087195,-0.333152,-0.576018,-0.149814,-0.107656,2.193521,-0.107523,-0.124451,-0.253989,-0.136756,0.0,-0.02149,-0.278361,-0.20891,-0.18201,-0.386595,-0.234379,-0.152121,-0.130848,0.0,-0.028616,-0.062684,-0.171476,-0.349649,-0.064823,4.476468,-0.265537,4.442088,0.0,1.256685,-0.109776,-0.262007,-0.353118,-0.132878,-0.071864,-0.09503,-0.190349,-0.006742,-0.139942,0.0,3.955957,1


In [None]:
# (rows, columns) of the combined feature + label table.
final_df.shape

In [None]:
# Both modules are already imported in the notebook's import cell; they are
# repeated here only so this cell can be run on its own.
import matplotlib.pyplot as plt
import seaborn as sns

# Tally the rows per class in the label column and print the tally.
class_counts = final_df['values'].value_counts()
print(class_counts)

# Bar chart of the class balance, labelled through the Axes that seaborn returns.
# NOTE(review): recent seaborn releases deprecate `palette` without `hue`;
# confirm against the installed version before relying on this call long-term.
plt.figure(figsize=(5, 4))
ax = sns.countplot(x=final_df["values"], palette="coolwarm")
ax.set_title("Class Distribution")
ax.set_xlabel("Class")
ax.set_ylabel("Count")
plt.show()