**Import Libraries**

In [1]:
!pip install rdkit-pypi 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!pip install molvs

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import os
import pandas as pd
import numpy as np

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import rdBase
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit import DataStructs
from molvs import standardize_smiles

from sklearn.model_selection import train_test_split

**Get Data**

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
os.chdir('/content/drive/My Drive/')

In [6]:
x=pd.read_csv('erk2.csv')
x.head(3)

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Type,Standard Relation,...,Target Name,Target Organism,Target Type,Document ChEMBL ID,Source ID,Source Description,Document Journal,Document Year,Cell ChEMBL ID,Properties
0,CHEMBL440356,,0.0,243.06,0,1.09,2,O=C1CCNC(=O)c2[nH]c(Br)cc21,IC50,'=',...,MAP kinase ERK2,Homo sapiens,SINGLE PROTEIN,CHEMBL1135814,1.0,Scientific Literature,J. Med. Chem.,2002.0,,
1,CHEMBL260417,,0.0,373.21,0,3.44,SB-725317,O=C(Nc1n[nH]c2nc(-c3ccc(O)cc3)c(Br)cc12)C1CC1,Inhibition,'=',...,MAP kinase ERK2,Homo sapiens,SINGLE PROTEIN,CHEMBL1961873,16.0,GSK Published Kinase Inhibitor Set,,,,
2,CHEMBL213451,,0.0,323.33,0,3.62,43,CCNc1nnc2ccc(-c3ocnc3-c3ccc(F)cc3)cn12,IC50,'>',...,MAP kinase ERK2,Homo sapiens,SINGLE PROTEIN,CHEMBL1145312,1.0,Scientific Literature,Bioorg. Med. Chem. Lett.,2006.0,,


In [7]:
print(x.shape)
x.columns

(23306, 45)


Index(['Molecule ChEMBL ID', 'Molecule Name', 'Molecule Max Phase',
       'Molecular Weight', '#RO5 Violations', 'AlogP', 'Compound Key',
       'Smiles', 'Standard Type', 'Standard Relation', 'Standard Value',
       'Standard Units', 'pChEMBL Value', 'Data Validity Comment', 'Comment',
       'Uo Units', 'Ligand Efficiency BEI', 'Ligand Efficiency LE',
       'Ligand Efficiency LLE', 'Ligand Efficiency SEI', 'Potential Duplicate',
       'Assay ChEMBL ID', 'Assay Description', 'Assay Type', 'BAO Format ID',
       'BAO Label', 'Assay Organism', 'Assay Tissue ChEMBL ID',
       'Assay Tissue Name', 'Assay Cell Type', 'Assay Subcellular Fraction',
       'Assay Parameters', 'Assay Variant Accession', 'Assay Variant Mutation',
       'Target ChEMBL ID', 'Target Name', 'Target Organism', 'Target Type',
       'Document ChEMBL ID', 'Source ID', 'Source Description',
       'Document Journal', 'Document Year', 'Cell ChEMBL ID', 'Properties'],
      dtype='object')

**Create Data subset**

In [8]:
x1=x[['Molecule ChEMBL ID', 'Smiles', 'Standard Type', 'Standard Value', 'Standard Units']]
x1=x1[x1['Standard Units'].str.contains('nM', na=False)]
x1.drop_duplicates(keep='first', inplace=True)
print(x1.shape)
x1.head(3)

(18810, 5)


Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Type,Standard Value,Standard Units
0,CHEMBL440356,O=C1CCNC(=O)c2[nH]c(Br)cc21,IC50,539.0,nM
2,CHEMBL213451,CCNc1nnc2ccc(-c3ocnc3-c3ccc(F)cc3)cn12,IC50,10000.0,nM
4,CHEMBL214198,CC(C)c1nnc2ccc(-c3c[nH]nc3-c3cc(F)ccc3F)cn12,IC50,10000.0,nM


**Check for missing values**

In [9]:
x1.isnull().apply(pd.value_counts)

Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Type,Standard Value,Standard Units
False,18810.0,18762,18810.0,18806,18810.0
True,,48,,4,


In [10]:
x1.dropna(inplace=True)
print(x1.shape)

(18758, 5)


**Groupby ChEMBL ID and store the mean of Standard Value in a new column**

In [11]:
x1['New_value']=x1.groupby('Molecule ChEMBL ID')['Standard Value'].transform('mean')
x1.drop_duplicates('Molecule ChEMBL ID', keep='first', inplace=True)
x1.drop(['Standard Value'], axis=1, inplace=True)
x1=x1.sort_values('New_value', ascending=True).reset_index(drop=True)
print(x1.shape)
x1.head(3)

(17739, 5)


Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Type,Standard Units,New_value
0,CHEMBL4868141,Nc1ncc(-c2ccc(NS(=O)(=O)C3CC3)cc2OC2CCCCC2)cc1...,IC50,nM,-29600.0
1,CHEMBL4115001,Nc1ncc([C@@H]2CC[C@@H](O)[C@H](O)C2)nc1-c1ccc(...,IC50,nM,0.00431
2,CHEMBL4111166,NC[C@@H](NC(=O)c1ccc(-c2nc([C@@H]3CC[C@@H](O)[...,IC50,nM,0.005


 **Create a new Label class.
If New_value <= 10000, Label=1 If New_value > 10000, Label=0**

In [12]:
x1['Label']=x1['New_value'].apply(lambda x: 1 if x<=10000 else 0)
x1.head(3)

Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Type,Standard Units,New_value,Label
0,CHEMBL4868141,Nc1ncc(-c2ccc(NS(=O)(=O)C3CC3)cc2OC2CCCCC2)cc1...,IC50,nM,-29600.0,1
1,CHEMBL4115001,Nc1ncc([C@@H]2CC[C@@H](O)[C@H](O)C2)nc1-c1ccc(...,IC50,nM,0.00431,1
2,CHEMBL4111166,NC[C@@H](NC(=O)c1ccc(-c2nc([C@@H]3CC[C@@H](O)[...,IC50,nM,0.005,1


In [13]:
x1['Label'].value_counts()

0    10534
1     7205
Name: Label, dtype: int64

**AVE (Assymetric Validation Embedding) - Bias Measurement in training/validation dataset**

**Method 1: Train/Test Random Split**

In [14]:
X_train, X_test, y_train, y_test=train_test_split(x1['Smiles'], x1['Label'], random_state=42, test_size=0.2, shuffle=True)

In [15]:
combined_train=pd.concat([X_train, y_train], axis=1)
combined_test=pd.concat([X_test, y_test], axis=1)

In [16]:
print(combined_train.shape)
print(combined_test.shape)

(14191, 2)
(3548, 2)


In [17]:
Ta=combined_train[combined_train['Label']==1]
Ti=combined_train[combined_train['Label']==0]
Va=combined_test[combined_test['Label']==1]
Vi=combined_test[combined_test['Label']==0]

In [18]:
Ta.to_csv('Ta_rand.smi', index=None, header=None, sep='\t')
Ti.to_csv('Ti_rand.smi', index=None, header=None, sep='\t')
Va.to_csv('Va_rand.smi', index=None, header=None, sep='\t')
Vi.to_csv('Vi_rand.smi', index=None, header=None, sep='\t')

In [19]:
print(Ta.shape)
print(Ti.shape)
print(Va.shape)
print(Vi.shape)

(5779, 2)
(8412, 2)
(1426, 2)
(2122, 2)


**Estimate Data Bias using AVE**

In [20]:
!python analyze_AVE_bias.py

usage: analyze_AVE_bias.py
       [-h]
       [-fpType {DayLight,ECFP4,ECFP6,ECFP12,AP,MACCS}]
       -activeMolsTraining
       ACTIVEMOLSTRAINING
       -inactiveMolsTraining
       INACTIVEMOLSTRAINING
       -activeMolsTesting
       ACTIVEMOLSTESTING
       -inactiveMolsTesting
       INACTIVEMOLSTESTING
       -outFile
       OUTFILE
       [-metric {jaccard,dice,euclidean}]
       [-numWorkers NUMWORKERS]
analyze_AVE_bias.py: error: the following arguments are required: -activeMolsTraining, -inactiveMolsTraining, -activeMolsTesting, -inactiveMolsTesting, -outFile


In [21]:
!python analyze_AVE_bias.py -activeMolsTraining Ta_rand.smi -inactiveMolsTraining Ti_rand.smi -activeMolsTesting Va_rand.smi -inactiveMolsTesting Vi_rand.smi -outFile result_rand.txt -numWorkers 4   

init 4 workers
#ActTrain= 5779 #InactTrain= 8412 #ActTest= 1426 #InactTest= 2122 knn1= 0.7190281337699093 lr= 0.7864045007686786 rf= 0.8030596449669727 svm= 0.8096631429504304 AA-AI= 0.2083870967741936 II-IA= 0.10628652214891621 (AA-AI)+(II-IA)= 0.31467361892310985


**Method 2: Molecular Clustering using MayaChem Tools**

In [22]:
# Standardize smiles
stdsmi=[standardize_smiles(smi) for smi in x1['Smiles'] if smi is not None]

In [23]:
stdsmi=pd.DataFrame(stdsmi)
stdsmi.columns=['Std_Smiles']

In [24]:
x2=pd.concat([x1,stdsmi], axis=1)
x2.head(3)

Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Type,Standard Units,New_value,Label,Std_Smiles
0,CHEMBL4868141,Nc1ncc(-c2ccc(NS(=O)(=O)C3CC3)cc2OC2CCCCC2)cc1...,IC50,nM,-29600.0,1,Nc1ncc(-c2ccc(NS(=O)(=O)C3CC3)cc2OC2CCCCC2)cc1...
1,CHEMBL4115001,Nc1ncc([C@@H]2CC[C@@H](O)[C@H](O)C2)nc1-c1ccc(...,IC50,nM,0.00431,1,Nc1ncc([C@@H]2CC[C@@H](O)[C@H](O)C2)nc1-c1ccc(...
2,CHEMBL4111166,NC[C@@H](NC(=O)c1ccc(-c2nc([C@@H]3CC[C@@H](O)[...,IC50,nM,0.005,1,NC[C@@H](NC(=O)c1ccc(-c2nc([C@@H]3CC[C@@H](O)[...


In [25]:
x2.to_csv('x2.csv', index=None)

In [26]:
print(x2.shape)
print(x1.shape)
print(stdsmi.shape)

(17739, 7)
(17739, 6)
(17739, 1)


In [27]:
x2[['Std_Smiles', 'Molecule ChEMBL ID']].to_csv('erk2.smi', sep='\t', index=None, header=None)

In [28]:
!python mayachemtools/bin/RDKitClusterMolecules.py --infileParams "smilesColumn,1,smilesNameColumn,2,smilesDelimiter,tab,smilesTitleLine,auto,sanitize,yes" -i erk2.smi -o erk2_cluster.smi --overwrite

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
3933,1
3934,1
3935,1
3936,1
3937,1
3938,1
3939,1
3940,1
3941,1
3942,1
3943,1
3944,1
3945,1
3946,1
3947,1
3948,1
3949,1
3950,1
3951,1
3952,1
3953,1
3954,1
3955,1
3956,1
3957,1
3958,1
3959,1
3960,1
3961,1
3962,1
3963,1
3964,1
3965,1
3966,1
3967,1
3968,1
3969,1
3970,1
3971,1
3972,1
3973,1
3974,1
3975,1
3976,1
3977,1
3978,1
3979,1
3980,1
3981,1
3982,1
3983,1
3984,1
3985,1
3986,1
3987,1
3988,1
3989,1
3990,1
3991,1
3992,1
3993,1
3994,1
3995,1
3996,1
3997,1
3998,1
3999,1
4000,1
4001,1
4002,1
4003,1
4004,1
4005,1
4006,1
4007,1
4008,1
4009,1
4010,1
4011,1
4012,1
4013,1
4014,1
4015,1
4016,1
4017,1
4018,1
4019,1
4020,1
4021,1
4022,1
4023,1
4024,1
4025,1
4026,1
4027,1
4028,1
4029,1
4030,1
4031,1
4032,1
4033,1
4034,1
4035,1
4036,1
4037,1
4038,1
4039,1
4040,1
4041,1
4042,1
4043,1
4044,1
4045,1
4046,1
4047,1
4048,1
4049,1
4050,1
4051,1
4052,1
4053,1
4054,1
4055,1
4056,1
4057,1
4058,1
4059,1
4060,1
4061,1
4062,1
4063,1
4064,1
4065,1
4066

In [29]:
clus=pd.read_csv('erk2_cluster.smi', sep=' ')
clus.head()

Unnamed: 0,SMILES,Name,ClusterNumber
0,COc1n[nH]c2cc(NC(=O)N[C@H](C)c3ccccc3)ncc12,CHEMBL4111718,1
1,Cc1cc(-c2n[nH]c3cc(NC(=O)N[C@H](C)c4ccccc4)ncc...,CHEMBL2408789,1
2,C[C@@H](NC(=O)Nc1cc2[nH]nc(-c3ccc4nn(C)cc4c3)c...,CHEMBL3663093,1
3,COC[C@@H](NC(=O)Nc1cc2[nH]nc(-c3ccncc3)c2cn1)c...,CHEMBL3663000,1
4,C[C@@H](NC(=O)Nc1cc2[nH]nc(-c3ccnc(F)c3)c2cn1)...,CHEMBL3663084,1


In [30]:
clus_distribution=clus['ClusterNumber'].value_counts().reset_index()
clus_distribution.columns=['ClusterNumber','Count']
clus_distribution.head(10)

Unnamed: 0,ClusterNumber,Count
0,1,576
1,2,396
2,4,229
3,18,109
4,10,108
5,19,105
6,27,82
7,8,80
8,6,66
9,41,48


In [31]:
print(clus.shape)
print(x2.shape)

(17739, 3)
(17739, 7)


In [32]:
x2=x2.rename(columns={'Molecule ChEMBL ID':'Name'})
combined=pd.merge(x2, clus, on='Name')
combined.drop(['Smiles','Std_Smiles'], axis=1, inplace=True)

In [33]:
train=combined.groupby('ClusterNumber').filter(lambda x:len(x)>=2)
test=combined.groupby('ClusterNumber').filter(lambda x:len(x)<2)

In [34]:
print(train.shape)
print(test.shape)

(11320, 7)
(6419, 7)


In [35]:
X_train_clus=train['SMILES']
y_train_clus=train['Label']

X_test_clus=test['SMILES']
y_test_clus=test['Label']

In [36]:
combined_train_clus=pd.concat([X_train_clus, y_train_clus], axis=1)
combined_test_clus=pd.concat([X_test_clus, y_test_clus], axis=1)

In [37]:
Ta=combined_train_clus[combined_train_clus['Label']==1]
Ti=combined_train_clus[combined_train_clus['Label']==0]
Va=combined_test_clus[combined_test_clus['Label']==1]
Vi=combined_test_clus[combined_test_clus['Label']==0]

In [38]:
Ta.to_csv('Ta_clus.smi', index=None, header=None, sep='\t')
Ti.to_csv('Ti_clus.smi', index=None, header=None, sep='\t')
Va.to_csv('Va_clus.smi', index=None, header=None, sep='\t')
Vi.to_csv('Vi_clus.smi', index=None, header=None, sep='\t')

In [39]:
print(Ta.shape)
print(Ti.shape)
print(Va.shape)
print(Vi.shape)

(5212, 2)
(6108, 2)
(1993, 2)
(4426, 2)


In [40]:
!python analyze_AVE_bias.py -activeMolsTraining Ta_clus.smi -inactiveMolsTraining Ti_clus.smi -activeMolsTesting Va_clus.smi -inactiveMolsTesting Vi_clus.smi -outFile result_clus.txt -numWorkers 4

init 4 workers
#ActTrain= 5212 #InactTrain= 6108 #ActTest= 1993 #InactTest= 4426 knn1= 0.557314246496266 lr= 0.5792209583973187 rf= 0.5901784238508525 svm= 0.5970656107945818 AA-AI= -0.011600602107375846 II-IA= 0.04804338002711256 (AA-AI)+(II-IA)= 0.03644277791973671


**Method 3: Murcko Sacffold-based clustering**

In [41]:
mols=[Chem.MolFromSmiles(smi) for smi in x2['Std_Smiles'] if smi is not None]

In [42]:
scaffolds=[MurckoScaffold.GetScaffoldForMol(mol) for mol in mols]

In [43]:
murcko_smi=[Chem.MolToSmiles(mol) for mol in scaffolds]

In [44]:
murcko_smi=pd.DataFrame(murcko_smi)
murcko_smi.columns=['Murcko_Smiles']

In [45]:
combined_murcko=pd.concat([murcko_smi,x2[['Std_Smiles', 'Name', 'Label']]], axis=1)
combined_murcko.head(3)

Unnamed: 0,Murcko_Smiles,Std_Smiles,Name,Label
0,O=C1NCCc2cc(-c3cncc(-c4ccc(NS(=O)(=O)C5CC5)cc4...,Nc1ncc(-c2ccc(NS(=O)(=O)C3CC3)cc2OC2CCCCC2)cc1...,CHEMBL4868141,1
1,O=C(NCc1ccccc1)c1ccc(-c2cncc(C3CCCCC3)n2)cc1,Nc1ncc([C@@H]2CC[C@@H](O)[C@H](O)C2)nc1-c1ccc(...,CHEMBL4115001,1
2,O=C(NCc1ccccc1)c1ccc(-c2cncc(C3CCCCC3)n2)cc1,NC[C@@H](NC(=O)c1ccc(-c2nc([C@@H]3CC[C@@H](O)[...,CHEMBL4111166,1


In [46]:
combined_murcko[['Murcko_Smiles','Name']].to_csv('murcko.smi', sep='\t', index=None, header=None)

In [47]:
!python mayachemtools/bin/RDKitClusterMolecules.py --infileParams "smilesColumn,1,smilesNameColumn,2,smilesDelimiter,tab,smilesTitleLine,auto,sanitize,yes" -i murcko.smi -o murcko_cluster.smi --overwrite


RDKitClusterMolecules.py (RDKit v2022.09.3; MayaChemTools v2022.24.10; Thu Dec 29 16:27:56 2022): Starting...

Processing options...

Reading file murcko.smi...
Total number of molecules: 17739
Number of valid molecules: 17689
Number of ignored molecules: 50

Generating Morgan IntVect fingerprints...

Clustering molecules using Butina methodology and Tanimoto similarity metric...
tcmalloc: large alloc 1233903616 bytes == 0x5ba78000 @  0x7ffa394e2615 0x5c8240 0x4f750a 0x4997a2 0x5d8868 0x4990ca 0x55d078 0x5d8941 0x4990ca 0x5d8868 0x4990ca 0x5d8868 0x4990ca 0x5d8868 0x4990ca 0x55cd91 0x55d743 0x642630 0x6426ae 0x644b78 0x64511c 0x677e5e 0x678029 0x7ffa390dec87 0x5e1baa
tcmalloc: large alloc 1542381568 bytes == 0xa54b6000 @  0x7ffa394e2615 0x5c8240 0x4f750a 0x4997a2 0x5d8868 0x4990ca 0x55d078 0x5d8941 0x4990ca 0x5d8868 0x4990ca 0x5d8868 0x4990ca 0x5d8868 0x4990ca 0x55cd91 0x55d743 0x642630 0x6426ae 0x644b78 0x64511c 0x677e5e 0x678029 0x7ffa390dec87 0x5e1baa
Generating file murcko_cluster

In [48]:
murcko_clus=pd.read_csv('murcko_cluster.smi', sep=' ')
print(murcko_clus.shape)
murcko_clus.head(3)

(17689, 3)


Unnamed: 0,SMILES,Name,ClusterNumber
0,O=C(NCc1ccccc1)Nc1cc2[nH]nc(-c3ccncc3)c2cn1,CHEMBL3658706,1
1,O=C(NCc1ccccc1)Nc1cc2[nH]nc(-c3ccncc3)c2cn1,CHEMBL3658845,1
2,O=C(NCc1ccccc1)Nc1cc2[nH]nc(-c3ccc(=O)[nH]c3)c...,CHEMBL3654722,1


In [49]:
murcko_distribution=murcko_clus['ClusterNumber'].value_counts().reset_index()
murcko_distribution.columns=['ClusterNumber','Count']
murcko_distribution.head(10)

Unnamed: 0,ClusterNumber,Count
0,1,1177
1,6,539
2,11,434
3,13,254
4,14,234
5,15,175
6,19,148
7,4,139
8,26,137
9,31,117


In [50]:
combined_murcko_clus=pd.merge(murcko_clus, combined_murcko, on='Name')
combined_murcko_clus.drop(['Murcko_Smiles', 'Std_Smiles'], axis=1, inplace=True)
combined_murcko_clus.head(3)

Unnamed: 0,SMILES,Name,ClusterNumber,Label
0,O=C(NCc1ccccc1)Nc1cc2[nH]nc(-c3ccncc3)c2cn1,CHEMBL3658706,1,1
1,O=C(NCc1ccccc1)Nc1cc2[nH]nc(-c3ccncc3)c2cn1,CHEMBL3658845,1,1
2,O=C(NCc1ccccc1)Nc1cc2[nH]nc(-c3ccc(=O)[nH]c3)c...,CHEMBL3654722,1,1


In [51]:
train_murcko=combined_murcko_clus.groupby('ClusterNumber').filter(lambda x:len(x)>=4)
test_murcko=combined_murcko_clus.groupby('ClusterNumber').filter(lambda x:len(x)<4)

In [52]:
print(train_murcko.shape)
print(test_murcko.shape)

(12626, 4)
(5063, 4)


In [54]:
X_train_murcko=train_murcko['SMILES']
y_train_murcko=train_murcko['Label']

X_test_murcko=test_murcko['SMILES']
y_test_murcko=test_murcko['Label']

In [55]:
combined_train_murcko=pd.concat([X_train_murcko, y_train_murcko], axis=1)
combined_test_murcko=pd.concat([X_test_murcko, y_test_murcko], axis=1)

In [56]:
Ta=combined_train_murcko[combined_train_murcko['Label']==1]
Ti=combined_train_murcko[combined_train_murcko['Label']==0]
Va=combined_test_murcko[combined_test_murcko['Label']==1]
Vi=combined_test_murcko[combined_test_murcko['Label']==0]

In [57]:
Ta.to_csv('Ta_murcko.smi', index=None, header=None, sep='\t')
Ti.to_csv('Ti_murcko.smi', index=None, header=None, sep='\t')
Va.to_csv('Va_murcko.smi', index=None, header=None, sep='\t')
Vi.to_csv('Vi_murcko.smi', index=None, header=None, sep='\t')

In [58]:
print(Ta.shape)
print(Ti.shape)
print(Va.shape)
print(Vi.shape)

(5536, 2)
(7090, 2)
(1645, 2)
(3418, 2)


In [59]:
!python analyze_AVE_bias.py -activeMolsTraining Ta_murcko.smi -inactiveMolsTraining Ti_murcko.smi -activeMolsTesting Va_murcko.smi -inactiveMolsTesting Vi_murcko.smi -outFile result_murcko.txt -numWorkers 4

init 4 workers
#ActTrain= 5536 #InactTrain= 7090 #ActTest= 1645 #InactTest= 3418 knn1= 0.5518375807676505 lr= 0.5830133158799917 rf= 0.6087495842678045 svm= 0.6088731923430578 AA-AI= -0.010480243161094194 II-IA= 0.04410766530134591 (AA-AI)+(II-IA)= 0.03362742214025172


**Bias Comparison**

In [63]:
Bias = pd.DataFrame({'Data':['Random Split','Chemical Clustering','Scaffold-based Clustering'], 'Bias(AVE)':[0.315, 0.036, 0.034]})
Bias

Unnamed: 0,Data,Bias(AVE)
0,Random Split,0.315
1,Chemical Clustering,0.036
2,Scaffold-based Clustering,0.034
