



## **Predicting Drug Induced Liver Injury: Analysing Data Bias**





**Dataset** : Toxicological Sciences, Volume 136, Issue 1, November 2013, Pages 242–249, https://doi.org/10.1093/toxsci/kft189

### **Import Libraries**

In [1]:
!pip install rdkit-pypi 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!pip install molvs

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import os

# Base Libraries
import numpy as np
import pandas as pd

# RDKit
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit import rdBase
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit.Chem import DataStructs  

# molvs
from molvs import standardize_smiles

# SKlearn
from sklearn.model_selection import train_test_split

###  **Get Data**

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
os.chdir('/content/drive/My Drive/')

In [6]:
s1=pd.read_excel('toxsci_13_0303_File013.xls', sheet_name=0)
print(s1.shape)
s1.head()

(197, 3)


Unnamed: 0,PubChem,CompoundName,Drug label-based annotation
0,3474,glafenine,Most DILI-concern
1,2478,busulfan,Most DILI-concern
2,2520,verapamil,Most DILI-concern
3,2662,celecoxib,Most DILI-concern
4,2898,cyclofenil,Most DILI-concern


In [7]:
s2=pd.read_excel('toxsci_13_0303_File013.xls', sheet_name=1)
print(s2.shape)
s2.head()

(190, 4)


Unnamed: 0,PubChem,CompoundName,Drug label-based annotation,QSAR Prediction
0,28417,danazol,Most DILI-concern,0
1,65027,tipranavir,Most DILI-concern,1
2,50599,didanosine,Most DILI-concern,1
3,18283,stavudine,Most DILI-concern,1
4,60825,lamivudine,Most DILI-concern,1


### **Get corresponding Smiles data for molecules in s1 and s2 from PubChem database**

In [8]:
x1=pd.read_csv('Supp1.csv')
print(x1.shape)
x1.head(2)

(193, 24)


Unnamed: 0,cid,cmpdname,cmpdsynonym,mw,mf,polararea,complexity,xlogp,heavycnt,hbonddonor,...,inchikey,iupacname,meshheadings,annothits,annothitcnt,aids,cidcdate,sidsrcname,depcatg,annotation
0,298,"2,2-dichloro-N-[1,3-dihydroxy-1-(4-nitrophenyl...","579-51-1|2,2-dichloro-N-[1,3-dihydroxy-1-(4-ni...",323.13,C11H12Cl2N2O5,115.0,342.0,1.1,20,3,...,WIIZWVCIJKGZOK-UHFFFAOYSA-N,"2,2-dichloro-N-[1,3-dihydroxy-1-(4-nitrophenyl...",,Biological Test Results|Chemical and Physical ...,9,155|157|175|248|256|328|485|631|731|757|758|75...,20050325,001Chemical|AAA Chemistry|abcr GmbH|ABI Chem|A...,Chemical Vendors|Curation Efforts|Governmental...,
1,338,Salicylic Acid,salicylic acid|2-Hydroxybenzoic acid|69-72-7|o...,138.12,C7H6O3,57.5,133.0,2.3,10,2,...,YGSDEFSMJLZEOE-UHFFFAOYSA-N,2-hydroxybenzoic acid,Salicylic Acid,Biological Test Results|Chemical and Physical ...,13,155|157|161|165|167|175|179|248|485|568|583|59...,20040916,001Chemical|3B Scientific (Wuhan) Corp|3WAY PH...,Chemical Vendors|Curation Efforts|Governmental...,


In [9]:
x2=pd.read_csv('Supp2.csv')
print(x2.shape)
x2.head(2)

(184, 24)


Unnamed: 0,cid,cmpdname,cmpdsynonym,mw,mf,polararea,complexity,xlogp,heavycnt,hbonddonor,...,inchikey,iupacname,meshheadings,annothits,annothitcnt,aids,cidcdate,sidsrcname,depcatg,annotation
0,247,Betaine,betaine|107-43-7|glycine betaine|oxyneurine|Tr...,117.15,C5H11NO2,40.1,87.6,0.5,8,0,...,KWIUHFFTVRNATP-UHFFFAOYSA-N,2-(trimethylazaniumyl)acetate,Betaine,Biological Test Results|Chemical and Physical ...,13,192|248|608|1033|1195|1549|1552|1637|1648|7104...,20040916,001Chemical|3B Scientific (Wuhan) Corp|3WAY PH...,Chemical Vendors|Curation Efforts|Governmental...,
1,453,Hexitol,"Hexitol|hexane-1,2,3,4,5,6-hexol|hexane-1,2,3,...",182.17,C6H14O6,121.0,105.0,-3.1,12,6,...,FBPFZTCFMRRESA-UHFFFAOYSA-N,"hexane-1,2,3,4,5,6-hexol",,Biological Test Results|Classification|Literat...,6,155|157|161|165|167|175|188|200|208|214|220|24...,20050325,001Chemical|3B Scientific (Wuhan) Corp|3WAY PH...,Chemical Vendors|Curation Efforts|Governmental...,


In [10]:
# Take only columns 'cid' and 'isosmiles'
x1=x1[['cid', 'isosmiles']]
x2=x2[['cid', 'isosmiles']]

### **Merge (s1,x1) and (s2,x2) on PubChem id and combine**

In [11]:
s1=s1.rename(columns={'PubChem':'cid'})
s2=s2.rename(columns={'PubChem':'cid'})

In [12]:
merged_1=pd.merge(s1, x1, on='cid')
merged_2=pd.merge(s2, x2, on='cid')

In [13]:
print(merged_1.shape)
print(merged_2.shape)

(193, 4)
(184, 5)


In [14]:
combined=pd.concat([merged_1, merged_2], axis=0)
combined.drop(['QSAR Prediction'], axis=1, inplace=True)
combined=combined.reset_index(drop=True)
print(combined.shape)
combined.head()

(377, 4)


Unnamed: 0,cid,CompoundName,Drug label-based annotation,isosmiles
0,3474,glafenine,Most DILI-concern,C1=CC=C(C(=C1)C(=O)OCC(CO)O)NC2=C3C=CC(=CC3=NC...
1,2478,busulfan,Most DILI-concern,CS(=O)(=O)OCCCCOS(=O)(=O)C
2,2520,verapamil,Most DILI-concern,CC(C)C(CCCN(C)CCC1=CC(=C(C=C1)OC)OC)(C#N)C2=CC...
3,2662,celecoxib,Most DILI-concern,CC1=CC=C(C=C1)C2=CC(=NN2C3=CC=C(C=C3)S(=O)(=O)...
4,2898,cyclofenil,Most DILI-concern,CC(=O)OC1=CC=C(C=C1)C(=C2CCCCC2)C3=CC=C(C=C3)O...


In [15]:
combined.to_csv('DILI_dataset.csv', index=None)

### **Create DILI_positive and DILI_negative datasets**

In [16]:
combined['Drug label-based annotation'].value_counts()

no DILI-concern      209
Most DILI-concern    168
Name: Drug label-based annotation, dtype: int64

In [17]:
DILI_positive=combined[combined['Drug label-based annotation']=='Most DILI-concern']
DILI_positive['label']=1
DILI_positive.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DILI_positive['label']=1


Unnamed: 0,cid,CompoundName,Drug label-based annotation,isosmiles,label
0,3474,glafenine,Most DILI-concern,C1=CC=C(C(=C1)C(=O)OCC(CO)O)NC2=C3C=CC(=CC3=NC...,1
1,2478,busulfan,Most DILI-concern,CS(=O)(=O)OCCCCOS(=O)(=O)C,1
2,2520,verapamil,Most DILI-concern,CC(C)C(CCCN(C)CCC1=CC(=C(C=C1)OC)OC)(C#N)C2=CC...,1


In [18]:
DILI_negative=combined[combined['Drug label-based annotation']=='no DILI-concern']
DILI_negative['label']=0
DILI_negative.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DILI_negative['label']=0


Unnamed: 0,cid,CompoundName,Drug label-based annotation,isosmiles,label
77,3100,diphenhydramine,no DILI-concern,CN(C)CCOC(C1=CC=CC=C1)C2=CC=CC=C2,0
78,3354,flavoxate,no DILI-concern,CC1=C(OC2=C(C1=O)C=CC=C2C(=O)OCCN3CCCCC3)C4=CC...,0
79,2370,bethanechol,no DILI-concern,CC(C[N+](C)(C)C)OC(=O)N,0


In [19]:
DILI=pd.concat([DILI_positive, DILI_negative], axis=0)
DILI=DILI.reset_index(drop=True)
DILI

Unnamed: 0,cid,CompoundName,Drug label-based annotation,isosmiles,label
0,3474,glafenine,Most DILI-concern,C1=CC=C(C(=C1)C(=O)OCC(CO)O)NC2=C3C=CC(=CC3=NC...,1
1,2478,busulfan,Most DILI-concern,CS(=O)(=O)OCCCCOS(=O)(=O)C,1
2,2520,verapamil,Most DILI-concern,CC(C)C(CCCN(C)CCC1=CC(=C(C=C1)OC)OC)(C#N)C2=CC...,1
3,2662,celecoxib,Most DILI-concern,CC1=CC=C(C=C1)C2=CC(=NN2C3=CC=C(C=C3)S(=O)(=O)...,1
4,2898,cyclofenil,Most DILI-concern,CC(=O)OC1=CC=C(C=C1)C(=C2CCCCC2)C3=CC=C(C=C3)O...,1
...,...,...,...,...,...
372,54678501,vitamin c,no DILI-concern,C([C@@H](C1C(=C(C(=O)O1)O)O)O)O,0
373,2707,chloral hydrate,no DILI-concern,C(C(Cl)(Cl)Cl)(O)O,0
374,24705,dimethicone,no DILI-concern,C[Si](C)(C)O[Si](C)(C)O[Si](C)(C)C,0
375,2341,benzphetamine,no DILI-concern,CC(CC1=CC=CC=C1)N(C)CC2=CC=CC=C2,0


In [20]:
DILI = DILI.drop(labels=[186,187,193,376], axis=0)
DILI=DILI.reset_index(drop=True)
DILI.shape

(373, 5)

In [21]:
DILI.to_csv('DILI-labelled.csv')

In [22]:
std_smi=[standardize_smiles(smi) for smi in DILI['isosmiles'] if smi is not None]
stdsmi=pd.DataFrame(std_smi, columns=['STD_SMILES'])
print(stdsmi.shape)
stdsmi.head(3)

(373, 1)


Unnamed: 0,STD_SMILES
0,O=C(OCC(O)CO)c1ccccc1Nc1ccnc2cc(Cl)ccc12
1,CS(=O)(=O)OCCCCOS(C)(=O)=O
2,COc1ccc(CCN(C)CCCC(C#N)(c2ccc(OC)c(OC)c2)C(C)C...


In [23]:
DILI=pd.concat([DILI, stdsmi], axis=1)
print(DILI.shape)
DILI.head(3)

(373, 6)


Unnamed: 0,cid,CompoundName,Drug label-based annotation,isosmiles,label,STD_SMILES
0,3474,glafenine,Most DILI-concern,C1=CC=C(C(=C1)C(=O)OCC(CO)O)NC2=C3C=CC(=CC3=NC...,1,O=C(OCC(O)CO)c1ccccc1Nc1ccnc2cc(Cl)ccc12
1,2478,busulfan,Most DILI-concern,CS(=O)(=O)OCCCCOS(=O)(=O)C,1,CS(=O)(=O)OCCCCOS(C)(=O)=O
2,2520,verapamil,Most DILI-concern,CC(C)C(CCCN(C)CCC1=CC(=C(C=C1)OC)OC)(C#N)C2=CC...,1,COc1ccc(CCN(C)CCCC(C#N)(c2ccc(OC)c(OC)c2)C(C)C...


In [24]:
DILI['New_ID']=DILI['cid'].astype(str)+'_'+DILI['label'].astype(str)
DILI.head(3)

Unnamed: 0,cid,CompoundName,Drug label-based annotation,isosmiles,label,STD_SMILES,New_ID
0,3474,glafenine,Most DILI-concern,C1=CC=C(C(=C1)C(=O)OCC(CO)O)NC2=C3C=CC(=CC3=NC...,1,O=C(OCC(O)CO)c1ccccc1Nc1ccnc2cc(Cl)ccc12,3474_1
1,2478,busulfan,Most DILI-concern,CS(=O)(=O)OCCCCOS(=O)(=O)C,1,CS(=O)(=O)OCCCCOS(C)(=O)=O,2478_1
2,2520,verapamil,Most DILI-concern,CC(C)C(CCCN(C)CCC1=CC(=C(C=C1)OC)OC)(C#N)C2=CC...,1,COc1ccc(CCN(C)CCCC(C#N)(c2ccc(OC)c(OC)c2)C(C)C...,2520_1


## **Analyze Chemical Bias via Asymmetric Validation Embedding (AVE) 3 ways:**

1) Random Split 

2) Molecular Clustering 

3) Scaffold-based clustering

### **Model 1: Train/Test Random Split**

In [25]:
X_train, X_test, y_train, y_test = train_test_split(DILI['STD_SMILES'], DILI['New_ID'], test_size=0.2, random_state=42, shuffle=True)

In [26]:
train1=pd.concat([X_train, y_train], axis=1)
test1=pd.concat([X_test, y_test], axis=1)
print(train1.shape)
print(test1.shape)

(298, 2)
(75, 2)


In [27]:
train1.head(3)

Unnamed: 0,STD_SMILES,New_ID
192,CC(=CCC1=C(C)C(=O)c2ccccc2C1=O)CCCC(C)CCCC(C)C...,4812_0
75,CN1CCC(Nc2ncc3ncnc(Nc4ccc(F)c(Cl)c4)c3n2)CC1,6918508_1
84,Cc1nnc(C(C)C)n1C1C[C@H]2CC[C@@H](C1)N2CC[C@H](...,3002977_1


In [28]:
train_rand=train1['New_ID'].str.split('_', n=1, expand=True)
train_rand.columns=['cid', 'label']
train_rand=train_rand[['cid', 'label']].astype(np.int64)
train_rand=pd.concat([train1['STD_SMILES'], train_rand], axis=1)

test_rand=test1['New_ID'].str.split('_', n=1, expand=True)
test_rand.columns=['cid', 'label']
test_rand=test_rand[['cid', 'label']].astype(np.int64)
test_rand=pd.concat([test1['STD_SMILES'], test_rand], axis=1)

In [29]:
train_rand.head()

Unnamed: 0,STD_SMILES,cid,label
192,CC(=CCC1=C(C)C(=O)c2ccccc2C1=O)CCCC(C)CCCC(C)C...,4812,0
75,CN1CCC(Nc2ncc3ncnc(Nc4ccc(F)c(Cl)c4)c3n2)CC1,6918508,1
84,Cc1nnc(C(C)C)n1C1C[C@H]2CC[C@@H](C1)N2CC[C@H](...,3002977,1
361,C#CCN(C)[C@H](C)Cc1ccccc1,26757,0
16,C[C@]12CC(=CO)C(=O)C[C@@H]1CC[C@@H]1[C@@H]2CC[...,9902,1


**For AVE bias analysis seggreate train and test into:**

- Tp (train DILI positive); Tn (train DILI negative)
 
- Vp (test/validation DILI positive); Vn (test/validation DILI negative) 

In [30]:
Tp_rand=train_rand[train_rand['label']==1]
Tn_rand=train_rand[train_rand['label']==0]
Vp_rand=test_rand[test_rand['label']==1]
Vn_rand=test_rand[test_rand['label']==0]

In [31]:
print(Tp_rand.shape)
print(Tn_rand.shape)
print(Vp_rand.shape)
print(Vn_rand.shape)

(129, 3)
(169, 3)
(39, 3)
(36, 3)


In [32]:
Tp_rand[['STD_SMILES', 'label']].to_csv('Tp_rand.smi', sep='\t', header=None, index=None)
Tn_rand[['STD_SMILES', 'label']].to_csv('Tn_rand.smi', sep='\t', header=None, index=None)
Vp_rand[['STD_SMILES', 'label']].to_csv('Vp_rand.smi', sep='\t', header=None, index=None)
Vn_rand[['STD_SMILES', 'label']].to_csv('Vn_rand.smi', sep='\t', header=None, index=None)

**Estimate Bias**

In [33]:
!python analyze_AVE_bias.py

usage: analyze_AVE_bias.py
       [-h]
       [-fpType {DayLight,ECFP4,ECFP6,ECFP12,AP,MACCS}]
       -activeMolsTraining
       ACTIVEMOLSTRAINING
       -inactiveMolsTraining
       INACTIVEMOLSTRAINING
       -activeMolsTesting
       ACTIVEMOLSTESTING
       -inactiveMolsTesting
       INACTIVEMOLSTESTING
       -outFile
       OUTFILE
       [-metric {jaccard,dice,euclidean}]
       [-numWorkers NUMWORKERS]
analyze_AVE_bias.py: error: the following arguments are required: -activeMolsTraining, -inactiveMolsTraining, -activeMolsTesting, -inactiveMolsTesting, -outFile


In [34]:
!python analyze_AVE_bias.py -activeMolsTraining Tp_rand.smi -inactiveMolsTraining Tn_rand.smi -activeMolsTesting Vp_rand.smi -inactiveMolsTesting Vn_rand.smi -outFile result_rand.txt -numWorkers 4 

init 4 workers
#ActTrain= 129 #InactTrain= 169 #ActTest= 39 #InactTest= 36 knn1= 0.6955128205128206 lr= 0.8561253561253562 rf= 0.868945868945869 svm= 0.8824786324786325 AA-AI= 0.10153846153846152 II-IA= 0.13055555555555554 (AA-AI)+(II-IA)= 0.23209401709401706


### **Method 2: Molecular Clustering**

In [35]:
DILI[['STD_SMILES', 'New_ID']].to_csv('DILI_labelled.smi', sep='\t', index=None, header=None)

In [None]:
!python mayachemtools/bin/RDKitClusterMolecules.py --infileParams "smilesColumn,1,smilesNameColumn,2,smilesDelimiter,tab,smilesTitleLine,auto,sanitize,yes" -i DILI_labelled.smi -o DILI_cluster.smi --overwrite

In [37]:
clus=pd.read_csv('DILI_cluster.smi', sep=' ')
clus.rename(columns={'Name':'New_ID'}, inplace=True)
clus

Unnamed: 0,SMILES,New_ID,ClusterNumber
0,CN(C)CCC(c1ccc(Cl)cc1)c1ccccn1,2725_0,1
1,CN(C)CCOC(c1ccc(Cl)cc1)c1ccccn1,2564_0,1
2,CN(C)CCC(c1ccc(Br)cc1)c1ccccn1,6834_0,1
3,CN(C)CC[C@@H](c1ccc(Br)cc1)c1ccccn1,16960_0,1
4,CN(C)CC[C@@H](c1ccc(Cl)cc1)c1ccccn1,33036_0,1
...,...,...,...
368,CC(=O)Oc1ccc(C(=C2CCCCC2)c2ccc(OC(C)=O)cc2)cc1,2898_1,315
369,Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2...,2662_1,316
370,COc1ccc(CCN(C)CCCC(C#N)(c2ccc(OC)c(OC)c2)C(C)C...,2520_1,317
371,CS(=O)(=O)OCCCCOS(C)(=O)=O,2478_1,318


In [38]:
cluster_dist=clus['ClusterNumber'].value_counts().reset_index()
cluster_dist.columns=['ClusterNumber', 'Count']
cluster_dist

Unnamed: 0,ClusterNumber,Count
0,1,5
1,5,4
2,6,4
3,2,4
4,4,4
...,...,...
314,135,1
315,136,1
316,137,1
317,138,1


In [39]:
train2=clus.groupby('ClusterNumber').filter(lambda x: len(x)<2)
test2=clus.groupby('ClusterNumber').filter(lambda x: len(x)>=2)

In [40]:
print(train2.shape)
print(test2.shape)

(283, 3)
(90, 3)


In [41]:
train2.head()

Unnamed: 0,SMILES,New_ID,ClusterNumber
66,CN1CC[C@]23c4c5ccc(O)c4O[C@H]2[C@@H](O)C=C[C@H...,5288826_0,25
91,CC(Cc1ccccc1)N(C)Cc1ccccc1,2341_0,38
92,C[Si](C)(C)O[Si](C)(C)O[Si](C)(C)C,24705_0,39
93,OC(O)C(Cl)(Cl)Cl,2707_0,40
94,O=C1OC([C@@H](O)CO)C(O)=C1O,54678501_0,41


In [42]:
train_clus=train2['New_ID'].str.split('_', n=1, expand=True)
train_clus.columns=['cid', 'label']
train_clus=train_clus[['cid', 'label']].astype(np.int64)
train_clus=pd.concat([train2['SMILES'], train_clus], axis=1)

test_clus=test2['New_ID'].str.split('_', n=1, expand=True)
test_clus.columns=['cid', 'label']
test_clus=test_clus[['cid', 'label']].astype(np.int64)
test_clus=pd.concat([test2['SMILES'], test_clus], axis=1)

#####   **Seggregate:**
- combined_train into Tp (DILI train positive) and Tn (DILI train negative)

- combined_test into Vp (DILI test positive) and Vn (DILI test negative)

In [43]:
Tp_clus=train_clus[train_clus['label']==1]
Tn_clus=train_clus[train_clus['label']==0]
Vp_clus=test_clus[test_clus['label']==1]
Vn_clus=test_clus[test_clus['label']==0]

In [44]:
print(Tp_clus.shape)
print(Tn_clus.shape)
print(Vp_clus.shape)
print(Vn_clus.shape)

(139, 3)
(144, 3)
(29, 3)
(61, 3)


In [45]:
Tp_clus[['SMILES', 'label']].to_csv('Tp_clus.smi', sep='\t', header=None, index=None)
Tn_clus[['SMILES', 'label']].to_csv('Tn_clus.smi', sep='\t', header=None, index=None)
Vp_clus[['SMILES', 'label']].to_csv('Vp_clus.smi', sep='\t', header=None, index=None)
Vn_clus[['SMILES', 'label']].to_csv('Vn_clus.smi', sep='\t', header=None, index=None)

#### **Estimate Bias**

In [46]:
!python analyze_AVE_bias.py -activeMolsTraining Tp_clus.smi -inactiveMolsTraining Tn_clus.smi -activeMolsTesting Vp_clus.smi -inactiveMolsTesting Vn_clus.smi -outFile result_clus.txt -numWorkers 4

init 4 workers
#ActTrain= 139 #InactTrain= 144 #ActTest= 29 #InactTest= 61 knn1= 0.6873940079140757 lr= 0.8066704352741663 rf= 0.8247597512719049 svm= 0.8094968908988129 AA-AI= 0.07586206896551723 II-IA= 0.06852459016393442 (AA-AI)+(II-IA)= 0.14438665912945164


###  **Method 3: Bemis-Murcko Scaffold-based Clustering**

In [47]:
mols = [Chem.MolFromSmiles(smi) for smi in DILI['STD_SMILES'] if smi is not None ]

In [48]:
scaffolds=[MurckoScaffold.GetScaffoldForMol(mol) for mol in mols if mol is not None]

In [49]:
murcko_smi=[Chem.MolToSmiles(mol) for mol in scaffolds if mol is not None]

In [50]:
murcko=pd.DataFrame(murcko_smi, columns=['Murcko_smi'])
murcko

Unnamed: 0,Murcko_smi
0,c1ccc(Nc2ccnc3ccccc23)cc1
1,
2,c1ccc(CCCCNCCc2ccccc2)cc1
3,c1ccc(-c2ccnn2-c2ccccc2)cc1
4,c1ccc(C(=C2CCCCC2)c2ccccc2)cc1
...,...
368,
369,O=C1C=CCO1
370,
371,


In [51]:
combined_murcko=pd.concat([DILI,murcko], axis=1)
combined_murcko.head()

Unnamed: 0,cid,CompoundName,Drug label-based annotation,isosmiles,label,STD_SMILES,New_ID,Murcko_smi
0,3474,glafenine,Most DILI-concern,C1=CC=C(C(=C1)C(=O)OCC(CO)O)NC2=C3C=CC(=CC3=NC...,1,O=C(OCC(O)CO)c1ccccc1Nc1ccnc2cc(Cl)ccc12,3474_1,c1ccc(Nc2ccnc3ccccc23)cc1
1,2478,busulfan,Most DILI-concern,CS(=O)(=O)OCCCCOS(=O)(=O)C,1,CS(=O)(=O)OCCCCOS(C)(=O)=O,2478_1,
2,2520,verapamil,Most DILI-concern,CC(C)C(CCCN(C)CCC1=CC(=C(C=C1)OC)OC)(C#N)C2=CC...,1,COc1ccc(CCN(C)CCCC(C#N)(c2ccc(OC)c(OC)c2)C(C)C...,2520_1,c1ccc(CCCCNCCc2ccccc2)cc1
3,2662,celecoxib,Most DILI-concern,CC1=CC=C(C=C1)C2=CC(=NN2C3=CC=C(C=C3)S(=O)(=O)...,1,Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2...,2662_1,c1ccc(-c2ccnn2-c2ccccc2)cc1
4,2898,cyclofenil,Most DILI-concern,CC(=O)OC1=CC=C(C=C1)C(=C2CCCCC2)C3=CC=C(C=C3)O...,1,CC(=O)Oc1ccc(C(=C2CCCCC2)c2ccc(OC(C)=O)cc2)cc1,2898_1,c1ccc(C(=C2CCCCC2)c2ccccc2)cc1


In [52]:
combined_murcko.to_csv('murcko.csv')

In [53]:
combined_murcko[['Murcko_smi','New_ID']].to_csv('murcko.smi', sep='\t', header=None, index=None)

**Clustering using MayaChem Tool**

In [None]:
!python mayachemtools/bin/RDKitClusterMolecules.py --infileParams "smilesColumn,1,smilesNameColumn,2,smilesDelimiter,tab,smilesTitleLine,auto,sanitize,yes" -i murcko.smi -o murcko_cluster.smi --overwrite

In [55]:
murcko_clus=pd.read_csv('murcko_cluster.smi', sep=' ')
murcko_clus.rename(columns={'Name':'New_ID', 'SMILES':'Murcko_smi'}, inplace=True)
murcko_clus

Unnamed: 0,Murcko_smi,New_ID,ClusterNumber
0,c1ccccc1,5281081_0,1
1,c1ccccc1,4260_1,1
2,c1ccccc1,5354_1,1
3,c1ccccc1,15250_1,1
4,c1ccccc1,3397_1,1
...,...,...,...
334,c1ccc(CN2CCc3sccc3C2)cc1,5472_1,184
335,C1=Cc2ccccc2C1=Cc1ccccc1,5352_1,185
336,O=C1C[C@H](c2ccccc2)Sc2ccccc2N1,39186_1,186
337,C1CCC(C(CC2CCCCN2)C2CCCCC2)CC1,4746_1,187


In [56]:
murcko_cluster_dist=murcko_clus['ClusterNumber'].value_counts().reset_index()
murcko_cluster_dist.columns=['ClusterNumber', 'Count']
murcko_cluster_dist

Unnamed: 0,ClusterNumber,Count
0,1,46
1,2,24
2,3,11
3,5,7
4,6,7
...,...,...
183,92,1
184,93,1
185,94,1
186,96,1


In [57]:
combined_murcko_clus=pd.merge(murcko_clus,combined_murcko[['New_ID','CompoundName', 'Drug label-based annotation',	'STD_SMILES','isosmiles', 'label']], on='New_ID')
print(combined_murcko_clus.shape)
combined_murcko_clus.head()

(339, 8)


Unnamed: 0,Murcko_smi,New_ID,ClusterNumber,CompoundName,Drug label-based annotation,STD_SMILES,isosmiles,label
0,c1ccccc1,5281081_0,1,entacapone,no DILI-concern,CCN(CC)C(=O)/C(C#N)=C/c1cc(O)c(O)c([N+](=O)[O-...,CCN(CC)C(=O)/C(=C/C1=CC(=C(C(=C1)O)O)[N+](=O)[...,0
1,c1ccccc1,4260_1,1,moxisylyte,Most DILI-concern,CC(=O)Oc1cc(C(C)C)c(OCCN(C)C)cc1C,CC1=CC(=C(C=C1OC(=O)C)C(C)C)OCCN(C)C,1
2,c1ccccc1,5354_1,1,suloctidil,Most DILI-concern,CCCCCCCCNC(C)C(O)c1ccc(SC(C)C)cc1,CCCCCCCCNC(C)C(C1=CC=C(C=C1)SC(C)C)O,1
3,c1ccccc1,15250_1,1,ibufenac,Most DILI-concern,CC(C)Cc1ccc(CC(=O)O)cc1,CC(C)CC1=CC=C(C=C1)CC(=O)O,1
4,c1ccccc1,3397_1,1,flutamide,Most DILI-concern,CC(C)C(=O)Nc1ccc([N+](=O)[O-])c(C(F)(F)F)c1,CC(C)C(=O)NC1=CC(=C(C=C1)[N+](=O)[O-])C(F)(F)F,1


In [58]:
combined_murcko_clus.to_csv('murcko_clus.csv', sep='\t', index=None)

In [59]:
train3_murcko=combined_murcko_clus.groupby('ClusterNumber').filter(lambda x: len(x)>=2)
test3_murcko=combined_murcko_clus.groupby('ClusterNumber').filter(lambda x: len(x)<2)

In [60]:
print(train3_murcko.shape)
print(test3_murcko.shape)

(195, 8)
(144, 8)


In [61]:
Tp_murcko=train3_murcko[train3_murcko['label']==1]
Tn_murcko=train3_murcko[train3_murcko['label']==0]
Vp_murcko=test3_murcko[test3_murcko['label']==1]
Vn_murcko=test3_murcko[test3_murcko['label']==0]

In [62]:
print(Tp_murcko.shape)
print(Tn_murcko.shape)
print(Vp_murcko.shape)
print(Vn_murcko.shape)

(77, 8)
(118, 8)
(85, 8)
(59, 8)


In [63]:
Tp_murcko[['STD_SMILES', 'label']].to_csv('Tp_murcko.smi', sep='\t', header=None, index=None)
Tn_murcko[['STD_SMILES', 'label']].to_csv('Tn_murcko.smi', sep='\t', header=None, index=None)
Vp_murcko[['STD_SMILES', 'label']].to_csv('Vp_murcko.smi', sep='\t', header=None, index=None)
Vn_murcko[['STD_SMILES', 'label']].to_csv('Vn_murcko.smi', sep='\t', header=None, index=None)

In [64]:
!python analyze_AVE_bias.py -activeMolsTraining Tp_murcko.smi -inactiveMolsTraining Tn_murcko.smi -activeMolsTesting Vp_murcko.smi -inactiveMolsTesting Vn_murcko.smi -outFile result_murcko.txt -numWorkers 4

init 4 workers
#ActTrain= 77 #InactTrain= 118 #ActTest= 85 #InactTest= 59 knn1= 0.6010967098703888 lr= 0.6739780658025922 rf= 0.6795613160518444 svm= 0.6532402791625125 AA-AI= 0.021176470588235297 II-IA= 0.03423728813559318 (AA-AI)+(II-IA)= 0.05541375872382848


### **Bias Comaprison**

In [65]:
Bias = pd.DataFrame({'Data':['Random Split','Molecular Clustering','Scaffold-based Clustering'], 'Bias(AVE)':[0.2321, 0.1444, 0.0554]})
Bias

Unnamed: 0,Data,Bias(AVE)
0,Random Split,0.2321
1,Molecular Clustering,0.1444
2,Scaffold-based Clustering,0.0554


## **Remove Bias**

### **Method 1: Train/Test Random Split**

In [66]:
!python remove_AVE_bias.py

usage: remove_AVE_bias.py
       [-h]
       [-fpType {DayLight,ECFP4,ECFP6,ECFP12,AP,MACCS}]
       -activeMols
       ACTIVEMOLS
       -inactiveMols
       INACTIVEMOLS
       [-trainingToValidationRatio TRAININGTOVALIDATIONRATIO]
       [-outDir OUTDIR]
       [-numWorkers NUMWORKERS]
       [-statePickleFile STATEPICKLEFILE]
       [-maxIter MAXITER]
       [-maxNumMols MAXNUMMOLS]
remove_AVE_bias.py: error: the following arguments are required: -activeMols, -inactiveMols


In [67]:
active_rand=pd.concat([Tp_rand,Vp_rand], axis=0)
inactive_rand=pd.concat([Tn_rand,Vn_rand], axis=0)

In [68]:
active_rand[['STD_SMILES', 'label']].to_csv('active_rand.smi', sep='\t', header=None, index=None)
inactive_rand[['STD_SMILES', 'label']].to_csv('inactive_rand.smi', sep='\t', header=None, index=None)

In [69]:
!python remove_AVE_bias.py -activeMols active_rand.smi -inactiveMols inactive_rand.smi -numWorkers 4 -maxIter 1000

('read', 168, 'actives and', 205, 'inactives')
calc aa_D_ref
calc ii_D_ref
calc ai_D_ref
done
calculate objectives for the population
remove similar sets
('removing', 20, 'similar sets')
('population size after similarity filter: ', 20)
select the next generation
('iter=', 1, 'fullPopObj=', 0.179, 'topPopObj=', 0.179, 'finalPopObj=', 0.179, 'minObj=', 99999)
breed
calculate objectives for the population
remove similar sets
('removing', 20, 'similar sets')
('population size after similarity filter: ', 20)
select the next generation
('iter=', 2, 'fullPopObj=', 0.176, 'topPopObj=', 0.176, 'finalPopObj=', 0.157, 'minObj=', 0.179)
breed
calculate objectives for the population
remove similar sets
('removing', 20, 'similar sets')
('population size after similarity filter: ', 20)
select the next generation
('iter=', 3, 'fullPopObj=', 0.172, 'topPopObj=', 0.172, 'finalPopObj=', 0.163, 'minObj=', 0.157)
breed
calculate objectives for the population
remove similar sets
('removing', 20, 'similar s

#### ***Re-estimate Bias***

In [70]:
!python analyze_AVE_bias.py -activeMolsTraining actives.T.smi -inactiveMolsTraining inactives.T.smi -activeMolsTesting actives.V.smi -inactiveMolsTesting inactives.V.smi -outFile result_rand.txt -numWorkers 4

init 4 workers
#ActTrain= 122 #InactTrain= 149 #ActTest= 18 #InactTest= 26 knn1= 0.423076923076923 lr= 0.6025641025641025 rf= 0.6228632478632479 svm= 0.6324786324786325 AA-AI= 0.0044444444444444176 II-IA= -0.0007692307692307443 (AA-AI)+(II-IA)= 0.0036752136752136733


#### ***New Train and Test data from AVE output***


*  train = actives.T.smi + inactives.T.smi
*  test = actives.V.smi + inactives.V.smi




In [71]:
# Train data
activesT_rand=pd.read_csv('actives.T.smi', sep=' ', header=None)
activesT_rand.columns=['Smiles', 'label']

inactivesT_rand=pd.read_csv('inactives.T.smi', sep=' ', header=None)
inactivesT_rand.columns=['Smiles', 'label']

train_data_rand=pd.concat([activesT_rand, inactivesT_rand], axis=0)
train_data_rand=train_data_rand.reset_index(drop=True)
print('Train_rand shape:', train_data_rand.shape)
train_data_rand.to_csv('train_data_rand.csv', sep='\t', index=None)

# Test data
activesV_rand=pd.read_csv('actives.V.smi', sep=' ', header=None)
activesV_rand.columns=['Smiles', 'label']

inactivesV_rand=pd.read_csv('inactives.V.smi', sep=' ', header=None)
inactivesV_rand.columns=['Smiles', 'label']

test_data_rand=pd.concat([activesV_rand, inactivesV_rand], axis=0)
test_data_rand=test_data_rand.reset_index(drop=True)
print('Test_rand shape:', test_data_rand.shape)
test_data_rand.to_csv('test_data_rand.csv', sep='\t', index=None)

Train_rand shape: (271, 2)
Test_rand shape: (44, 2)


### **Method 2: Molecular Clustering**

In [72]:
active_clus=pd.concat([Tp_clus,Vp_clus], axis=0)
inactive_clus=pd.concat([Tn_clus,Vn_clus], axis=0)

In [73]:
active_clus[['SMILES', 'label']].to_csv('active_clus.smi', sep='\t', header=None, index=None)
inactive_clus[['SMILES', 'label']].to_csv('inactive_clus.smi', sep='\t', header=None, index=None)

In [74]:
!python remove_AVE_bias.py -activeMols active_clus.smi -inactiveMols inactive_clus.smi -numWorkers 4 -maxIter 1000

('read', 168, 'actives and', 205, 'inactives')
calc aa_D_ref
calc ii_D_ref
calc ai_D_ref
done
calculate objectives for the population
remove similar sets
('removing', 20, 'similar sets')
('population size after similarity filter: ', 20)
select the next generation
('iter=', 1, 'fullPopObj=', 0.159, 'topPopObj=', 0.159, 'finalPopObj=', 0.159, 'minObj=', 99999)
breed
calculate objectives for the population
remove similar sets
('removing', 20, 'similar sets')
('population size after similarity filter: ', 20)
select the next generation
('iter=', 2, 'fullPopObj=', 0.157, 'topPopObj=', 0.157, 'finalPopObj=', 0.15, 'minObj=', 0.159)
breed
calculate objectives for the population
remove similar sets
('removing', 20, 'similar sets')
('population size after similarity filter: ', 20)
select the next generation
('iter=', 3, 'fullPopObj=', 0.151, 'topPopObj=', 0.151, 'finalPopObj=', 0.137, 'minObj=', 0.15)
breed
calculate objectives for the population
remove similar sets
('removing', 20, 'similar set

#### ***Re-estimate Bias***

In [75]:
!python analyze_AVE_bias.py -activeMolsTraining actives.T.smi -inactiveMolsTraining inactives.T.smi -activeMolsTesting actives.V.smi -inactiveMolsTesting inactives.V.smi -outFile result_clus.txt -numWorkers 4

init 4 workers
#ActTrain= 125 #InactTrain= 152 #ActTest= 21 #InactTest= 25 knn1= 0.680952380952381 lr= 0.6628571428571428 rf= 0.5752380952380952 svm= 0.6247619047619047 AA-AI= 0.006666666666666654 II-IA= -0.0007999999999999674 (AA-AI)+(II-IA)= 0.005866666666666687


#### ***New Train and Test data from AVE output***







In [76]:
# Train data
activesT_clus=pd.read_csv('actives.T.smi', sep=' ', header=None)
activesT_clus.columns=['Smiles', 'label']

inactivesT_clus=pd.read_csv('inactives.T.smi', sep=' ', header=None)
inactivesT_clus.columns=['Smiles', 'label']

train_data_clus=pd.concat([activesT_clus, inactivesT_clus], axis=0)
train_data_clus=train_data_clus.reset_index(drop=True)
print('Train_clus shape:', train_data_clus.shape)
train_data_rand.to_csv('train_data_rand.csv', sep='\t', index=None)

# Test data
activesV_clus=pd.read_csv('actives.V.smi', sep=' ', header=None)
activesV_clus.columns=['Smiles', 'label']

inactivesV_clus=pd.read_csv('inactives.V.smi', sep=' ', header=None)
inactivesV_clus.columns=['Smiles', 'label']

test_data_clus=pd.concat([activesV_clus, inactivesV_clus], axis=0)
test_data_clus=test_data_clus.reset_index(drop=True)
print('Test_clus shape:', test_data_clus.shape)
test_data_clus.to_csv('test_data_clus.csv', sep='\t', index=None)

Train_clus shape: (277, 2)
Test_clus shape: (46, 2)


### **Method 3: Scaffold-based Clustering**

In [77]:
active_murcko=pd.concat([Tp_murcko,Vp_murcko], axis=0)
inactive_murcko=pd.concat([Tn_murcko,Vn_murcko], axis=0)

In [78]:
active_murcko[['STD_SMILES', 'label']].to_csv('active_murcko.smi', sep='\t', header=None, index=None)
inactive_murcko[['STD_SMILES', 'label']].to_csv('inactive_murcko.smi', sep='\t', header=None, index=None)

In [79]:
!python remove_AVE_bias.py -activeMols active_murcko.smi -inactiveMols inactive_murcko.smi -numWorkers 4 -maxIter 1000

('read', 162, 'actives and', 177, 'inactives')
calc aa_D_ref
calc ii_D_ref
calc ai_D_ref
done
calculate objectives for the population
remove similar sets
('removing', 20, 'similar sets')
('population size after similarity filter: ', 20)
select the next generation
('iter=', 1, 'fullPopObj=', 0.249, 'topPopObj=', 0.249, 'finalPopObj=', 0.249, 'minObj=', 99999)
breed
calculate objectives for the population
remove similar sets
('removing', 20, 'similar sets')
('population size after similarity filter: ', 20)
select the next generation
('iter=', 2, 'fullPopObj=', 0.246, 'topPopObj=', 0.246, 'finalPopObj=', 0.236, 'minObj=', 0.249)
breed
calculate objectives for the population
remove similar sets
('removing', 20, 'similar sets')
('population size after similarity filter: ', 20)
select the next generation
('iter=', 3, 'fullPopObj=', 0.238, 'topPopObj=', 0.238, 'finalPopObj=', 0.227, 'minObj=', 0.236)
breed
calculate objectives for the population
remove similar sets
('removing', 20, 'similar s

#### ***Re-estimate Bias***

In [80]:
!python analyze_AVE_bias.py -activeMolsTraining actives.T.smi -inactiveMolsTraining inactives.T.smi -activeMolsTesting actives.V.smi -inactiveMolsTesting inactives.V.smi -outFile result_murcko.txt -numWorkers 4

init 4 workers
#ActTrain= 115 #InactTrain= 129 #ActTest= 11 #InactTest= 16 knn1= 0.4630681818181818 lr= 0.5738636363636364 rf= 0.6534090909090908 svm= 0.6420454545454545 AA-AI= -0.02181818181818182 II-IA= 0.023749999999999993 (AA-AI)+(II-IA)= 0.0019318181818182012


#### ***New Train and Test data from AVE output***

In [81]:
# Train data
activesT_murk=pd.read_csv('actives.T.smi', sep=' ', header=None)
activesT_murk.columns=['Smiles', 'label']

inactivesT_murk=pd.read_csv('inactives.T.smi', sep=' ', header=None)
inactivesT_murk.columns=['Smiles', 'label']

train_data_murk=pd.concat([activesT_murk, inactivesT_murk], axis=0)
train_data_murk=train_data_murk.reset_index(drop=True)
print('Train_murk shape:', train_data_murk.shape)
train_data_rand.to_csv('train_data_rand.csv', sep='\t', index=None)

# Test data
activesV_murk=pd.read_csv('actives.V.smi', sep=' ', header=None)
activesV_murk.columns=['Smiles', 'label']

inactivesV_murk=pd.read_csv('inactives.V.smi', sep=' ', header=None)
inactivesV_murk.columns=['Smiles', 'label']

test_data_murk=pd.concat([activesV_murk, inactivesV_murk], axis=0)
test_data_murk=test_data_murk.reset_index(drop=True)
print('Test_murk shape:', test_data_murk.shape)
test_data_murk.to_csv('test_data_murk.csv', sep='\t', index=None)

Train_murk shape: (244, 2)
Test_murk shape: (27, 2)


## **Revised Bias**

In [91]:
Revised_Bias = pd.DataFrame({'Data':['Random Split','Molecular Clustering','Scaffold-based Clustering'], 'Bias(AVE)':[0.2321, 0.1444, 0.0185], 'Revised_Bias(AVE)':[0.0037, 0.0059, 0.0019]})
Revised_Bias

Unnamed: 0,Data,Bias(AVE),Revised_Bias(AVE)
0,Random Split,0.2321,0.0037
1,Molecular Clustering,0.1444,0.0059
2,Scaffold-based Clustering,0.0185,0.0019


### **Scaffold-based Clustering yields minimum bias. Save corresponding train/test dataset for model building.**

### **Train Data**

In [94]:
train_data_murk.head(3)

Unnamed: 0,Smiles,label
0,CCCCCOC(=O)Nc1nc(=O)n([C@@H]2O[C@H](C)[C@@H](O...,1
1,NC(=O)C1CCN(CCCN2c3ccccc3Sc3ccc(Cl)cc32)CC1,1
2,Cc1ccc(Nc2c(F)cccc2Cl)c(CC(=O)O)c1,1


In [95]:
train_data_murk.rename(columns={'Smiles':'STD_SMILES'}, inplace=True)

In [96]:
train_final=pd.merge(train_data_murk, combined_murcko_clus[['STD_SMILES','Murcko_smi', 'isosmiles','New_ID','ClusterNumber', 'CompoundName', 'Drug label-based annotation']], on='STD_SMILES')
train_final=train_final[['isosmiles','STD_SMILES', 'Murcko_smi', 'New_ID', 'label', 'ClusterNumber', 'CompoundName', 'Drug label-based annotation']]
train_final.head(3)

Unnamed: 0,isosmiles,STD_SMILES,Murcko_smi,New_ID,label,ClusterNumber,CompoundName,Drug label-based annotation
0,CCCCCOC(=O)NC1=NC(=O)N(C=C1F)[C@H]2[C@@H]([C@@...,CCCCCOC(=O)Nc1nc(=O)n([C@@H]2O[C@H](C)[C@@H](O...,O=c1ncccn1[C@H]1CCCO1,60953_1,1,11,capecitabine,Most DILI-concern
1,C1CN(CCC1C(=O)N)CCCN2C3=CC=CC=C3SC4=C2C=C(C=C4)Cl,NC(=O)C1CCN(CCCN2c3ccccc3Sc3ccc(Cl)cc32)CC1,c1ccc2c(c1)Sc1ccccc1N2CCCN1CCCCC1,6761_1,1,42,pipamazine,Most DILI-concern
2,CC1=CC(=C(C=C1)NC2=C(C=CC=C2Cl)F)CC(=O)O,Cc1ccc(Nc2c(F)cccc2Cl)c(CC(=O)O)c1,c1ccc(Nc2ccccc2)cc1,151166_1,1,2,lumiracoxib,Most DILI-concern


In [97]:
train_final.to_csv('train.csv', sep='\t', index=None)

### **Test Data**

In [103]:
test_data_murk.head(3)

Unnamed: 0,STD_SMILES,label
0,COc1ccc(Cc2nccc3cc(OC)c(OC)cc23)cc1OC,1
1,CO/N=C(\N)c1ccc(-c2ccc(-c3ccc(/C(N)=N/OC)cc3)o...,1
2,C=CCOc1ccc(CC(=O)O)cc1Cl,1


In [100]:
test_data_murk.rename(columns={'Smiles':'STD_SMILES'}, inplace=True)

In [101]:
test_final=pd.merge(test_data_murk, combined_murcko_clus[['STD_SMILES','Murcko_smi', 'isosmiles','New_ID','ClusterNumber', 'CompoundName', 'Drug label-based annotation']], on='STD_SMILES')
test_final=test_final[['isosmiles','STD_SMILES', 'Murcko_smi', 'New_ID', 'label', 'ClusterNumber', 'CompoundName', 'Drug label-based annotation']]
test_final.head()

Unnamed: 0,isosmiles,STD_SMILES,Murcko_smi,New_ID,label,ClusterNumber,CompoundName,Drug label-based annotation
0,COC1=C(C=C(C=C1)CC2=NC=CC3=CC(=C(C=C32)OC)OC)OC,COc1ccc(Cc2nccc3cc(OC)c(OC)cc23)cc1OC,c1ccc(Cc2nccc3ccccc23)cc1,4680_1,1,2,papaverine,Most DILI-concern
1,CO/N=C(\N)/C1=CC=C(C=C1)C2=CC=C(O2)C3=CC=C(C=C...,CO/N=C(\N)c1ccc(-c2ccc(-c3ccc(/C(N)=N/OC)cc3)o...,c1ccc(-c2ccc(-c3ccccc3)o2)cc1,5480200_1,1,49,pafuramidine,Most DILI-concern
2,C=CCOC1=C(C=C(C=C1)CC(=O)O)Cl,C=CCOc1ccc(CC(=O)O)cc1Cl,c1ccccc1,30951_1,1,1,alclofenac,Most DILI-concern
3,CC1(C(=O)N(C(=O)N1)C2=CC(=C(C=C2)[N+](=O)[O-])...,CC1(C)NC(=O)N(c2ccc([N+](=O)[O-])c(C(F)(F)F)c2...,O=C1CNC(=O)N1c1ccccc1,4493_1,1,129,nilutamide,Most DILI-concern
4,CN1C(S(=O)(=O)CCC1=O)C2=CC=C(C=C2)Cl,CN1C(=O)CCS(=O)(=O)C1c1ccc(Cl)cc1,O=C1CCS(=O)(=O)C(c2ccccc2)N1,2717_1,1,114,chlormezanone,Most DILI-concern


In [104]:
test_final.to_csv('test.csv', sep='\t', index=None)