### curating DTC data using CYP2D6 (FRCE)

Written by Titli Sarkar (titli.sarkar@nih.gov) on March 3, 2023

Simplified data curation steps for raw data pulled from DTC.

Data location (on FRCE): Initial dataset downloaded from Drug Target Commons (DTC) database and stored below:

1> cd /mnt/projects/ATOM/sarkart4/CYP_Data_Curation/data/

   mkdir dtc
   
   cd dtc

2> wget https://drugtargetcommons.fimm.fi/static/Excell_files/DTC_data.csv
   
   ### **Be careful to not overwrite previous versions. Default behavior should be to create a new version.
   
   ### **Note the file seems to be last updated March  2023.
   
     The initial downloaded file name will be DTC_data.csv. Check it with $ls 
   
     $ wc -l DTC_data.csv  -> 5981099  (size of the full DTC database)
   
     $ grep CYP2D6 DTC_data.csv | wc -l -> 30057  (size of the CYP3A4 in DTC database)
      
     You can check other CYP's also:
   
       $ grep CYP3A4 DTC_data.csv | wc -l ->  47075

       $ grep CYP2C9 DTC_data.csv | wc -l -> 28268

3> grep CYP2D6 DTC_data.csv > raw_data.txt

4> head -1 DTC_data.csv > header

5> cat header raw_data.txt > cyp2d6.csv

### **Data File: cyp2d6.csv

Note the "ACTION" in comments 

Using data_curation_functions.py


In [23]:
#import sys
import atomsci.ddm.utils.data_curation_functions as dcf
import atomsci.ddm.utils.curate_data as curate_data

In [2]:
import importlib as imp
import pandas as pd
import numpy as np

In [3]:
target = 'CYP2D6'

In [4]:
data_dir = "/mnt/projects/ATOM/sarkart4/CYP_Data_Curation/data/dtc/"
file= data_dir + target.lower()+'.csv'
!ls $data_dir

orig_df=pd.read_csv(file,sep=",",engine="python",error_bad_lines=False)
print(orig_df.shape)
orig_df.head(2)

cyp2c9.csv    header	    save_smiles_cyp2c9_nm_raw.csv
cyp2d6.csv    ml_ready	    save_smiles_cyp2d6_nm_raw.csv
cyp3a4.csv    raw	    save_smiles_cyp3a4_nm_raw.csv
DTC_data.csv  raw_data.txt




  exec(code_obj, self.user_global_ns, self.user_ns)


(30057, 33)


Unnamed: 0,compound_id,standard_inchi_key,compound_name,synonym,target_id,target_pref_name,gene_names,wildtype_or_mutant,mutation_info,pubmed_id,...,compound_concentration_value_unit,substrate_type,substrate_relation,substrate_value,substrate_units,assay_description,title,journal,doc_type,annotation_comments
0,CHEMBL1232461,AAAQFGUYHFJNHI-SFHVURJKSA-N,GSK525762A,,P10635,CYTOCHROME P450 2D6,CYP2D6,,,24015967.0,...,,,,,,Inhibition of human CYP2D6,Discovery of epigenetic regulator I-BET762: le...,J. Med. Chem.,PUBLICATION,
1,CHEMBL1232461,AAAQFGUYHFJNHI-SFHVURJKSA-N,GSK525762A,,P10635,CYTOCHROME P450 2D6,CYP2D6,,,24015967.0,...,,,,,,Time-dependent inhibition of human CYP2D6,Discovery of epigenetic regulator I-BET762: le...,J. Med. Chem.,PUBLICATION,


In [5]:
orig_df.columns

Index(['compound_id', 'standard_inchi_key', 'compound_name', 'synonym',
       'target_id', 'target_pref_name', 'gene_names', 'wildtype_or_mutant',
       'mutation_info', 'pubmed_id', 'standard_type', 'standard_relation',
       'standard_value', 'standard_units', 'activity_comment',
       'ep_action_mode', 'assay_format', 'assaytype', 'assay_subtype',
       'inhibitor_type', 'detection_tech', 'assay_cell_line',
       'compound_concentration_value', 'compound_concentration_value_unit',
       'substrate_type', 'substrate_relation', 'substrate_value',
       'substrate_units', 'assay_description', 'title', 'journal', 'doc_type',
       'annotation_comments'],
      dtype='object')

In [6]:
# Save raw file in FRCE from pandas format
#!mkdir $data_dir'raw/'
orig_df.to_csv(data_dir+ 'raw/'+target.lower()+'.csv',index=False)

In [7]:
for v in orig_df['standard_units'].unique() :  
    t=orig_df[orig_df['standard_units']==v]
    print(v,t.shape)

NM (14614, 33)
nan (0, 33)
% (1488, 33)
UM (7, 33)
MIN-1 (9, 33)
MM (2, 33)
/MIN (1, 33)
UL/MIN (11, 33)
HR (9, 33)
PMOL/MIN (8, 33)
UG.ML-1 (2, 33)
MIN (4, 33)
NMOL (2, 33)
10'-4/MIN (3, 33)
1/MIN (7, 33)
ML.MIN-1.G-1 (2, 33)
NMOL/MIN (10, 33)
10'3ML/MIN (3, 33)
UL.MIN.NMOL-1 (1, 33)
PMOL.MIN.MG-1 (1, 33)


### Obtain unique standard_inchi_key

In [8]:
imp.reload(dcf)
geneNames = [target.upper()]

nm_df = dcf.filter_dtc_data(orig_df,geneNames) ## --> DTC specific function
#Note: AMPL supports extraction of JAK1, 2, and 3 datasets from DTC

print(nm_df.shape)
myList=nm_df['standard_inchi_key'].unique().tolist()

print(len(myList))

(3961, 33)
3720


In [9]:
myList[:5]

['AAAQFGUYHFJNHI-SFHVURJKSA-N',
 'AAKBFDPOXZBIAO-UHFFFAOYSA-N',
 'AATCBLYHOUOCTO-UHFFFAOYSA-N',
 'ABACVOXFUHDKNZ-UHFFFAOYSA-N',
 'ABAOWFCFAADMKA-UHFFFAOYSA-N']

## Retrieve SMILES strings for compounds through PUBCHEM web interface.

## TURN NEXT CELL TO TEXT TO AVOID RE-RUNNING (unintentionally)

In [12]:
import imp
import atomsci.ddm.utils.pubchem_utils as pu
imp.reload(pu)
ofile = data_dir + 'save_smiles_'+target.lower()+'_nm_raw.csv'

## this is slow, so don't re-do if the SMILES are already downloaded
#if not -e ofile :
save_smiles_df,fail_lst,discard_lst = pu.download_smiles(myList) # Note: I have used not used pu.download_smiles(), used the above
save_smiles_df.to_csv(ofile)

In [13]:
print(len(fail_lst))
print(save_smiles_df.shape)
# 484
#(16670, 3)

294
(3426, 3)


In [14]:
ifile = data_dir + 'save_smiles_'+target.lower()+'_nm_raw.csv'
save_smiles_df=pd.read_csv(ifile)
save_smiles_df.head()

Unnamed: 0.1,Unnamed: 0,CID,standard_inchi_key,smiles
0,0,46943432,AAAQFGUYHFJNHI-SFHVURJKSA-N,"""CCNC(=O)CC1C2=NN=C(N2C3=C(C=C(C=C3)OC)C(=N1)C..."
1,1,44408118,AAKBFDPOXZBIAO-UHFFFAOYSA-N,"""C1CCN(C(=O)C1)CC2=CC=C(C=C2)OCCCN3CCC(CC3)CC4..."
2,2,11964036,AATCBLYHOUOCTO-UHFFFAOYSA-N,"""CCN1CCN(CC1)CC(=O)NC2=C3C4=CC=CC=C4SC3=C(C=C2..."
3,3,23642319,ABACVOXFUHDKNZ-UHFFFAOYSA-N,"""CCC(=O)C1=C(N(C(=C1)C2=CC=C(C=C2)Cl)C3=CC=C(C..."
4,4,44230873,ABAOWFCFAADMKA-UHFFFAOYSA-N,"""C1CNCC1(CC2=CC=CC=N2)C3=CC4=C(C=C3)NC=C4"""


## Retrieve specific CYP data
## Will include censored data in smiles 
## Combine gene data with SMILES strings and call this our starting "raw" dataset.

### ** Imp! AMPL pu.download_smiles() was giving 

In [15]:
imp.reload(dcf)
targ_lst=[target.upper()]
smiles_lst, shared_inchi_keys = dcf.get_smiles_dtc_data(nm_df,targ_lst,save_smiles_df) # This function calculates pIC50 and adds in data
# Combine gene data with SMILES strings and call this our starting "raw" dataset.
# This is designed for combining JAK1, JAK2 and JAK3.
smiles_df=pd.concat(smiles_lst) # smiles_lst -> returns full dataframe in list form. pd.concat() transforms it to a datframe format
print(smiles_df.shape)

CYP2D6 distinct compounds = only 1864
CYP2D6 distinct compounds <,>,= 3720
num shared compounds 3719
Add pIC50 values.
0         33000.00
23         5000.00
36        10000.00
45         2000.00
46         3500.00
           ...    
29609     50000.00
29619     31622.78
29622      2300.00
29636     13000.00
29669    100000.00
Name: standard_value, Length: 3960, dtype: float64
33000.0
5000.0
10000.0
2000.0
3500.0
6400.0
6400.0
300.0
1100.0
50000.0
1800.0
400.0
2900.0
50000.0
100.0
320.0
100000.0
50000.0
50000.0
10000.0
100000.0
8845.0
30000.0
200.0
40000.0
30000.0
8200.0
10000.0
10000.0
30000.0
50000.0
26.0
10000.0
30000.0
50000.0
800.0
10000.0
30000.0
40000.0
10000.0
19500.0
10200.0
100000.0
50000.0
1900.0
1000.0
30000.0
10000.0
40000.0
3400.0
3400.0
1900.0
15000.0
19000.0
300.0
50000.0
4000.0
1000.0
50.0
10000.0
40000.0
26400.0
50000.0
2774.3
10000.0
43000.0
2000.0
16000.0
30000.0
2100.0
310.0
6500.0
10000.0
86000.0
100.0
10000.0
27000.0
5000.0
5000.0
2000.0
5000.0
9800.0
30000.0
800.

50000.0
40000.0
3162.28
398.0
9500.0
6700.0
400.0
400.0
100000.0
1700.0
5702.0
4500.0
9000.0
10000.0
50000.0
10000.0
25000.0
10000.0
6000.0
50.0
5100.0
3800.0
10000.0
10400.0
8000.0
10000.0
10000.0
1000.0
17800.0
100000.0
310.0
100000.0
12900.0
230.0
20000.0
20000.0
29000.0
10000.0
20000.0
17700.0
10000.0
1300.0
30000.0
48000.0
500.0
5000.0
50000.0
6600.0
5800.0
6309.57
20000.0
50000.0
30000.0
490.0
100000.0
20000.0
50000.0
30000.0
10000.0
30000.0
10000.0
50000.0
610.0
10000.0
10000.0
2200.0
4100.0
10000.0
30000.0
10000.0
30000.0
10000.0
30000.0
3300.0
5000.0
32000.0
10000.0
1700.0
100000.0
10500.0
500.0
10000.0
16800.0
20000.0
30000.0
30000.0
6300.0
20000.0
30000.0
30000.0
50000.0
12000.0
11017.0
11000.0
5000.0
9640.0
8700.0
6590.0
30000.0
10000.0
36300.0
3000.0
2300.0
10000.0
50000.0
3600.0
10000.0
10000.0
40000.0
9000.0
100000.0
15000.0
23000.0
300.0
8500.0
1600.0
10000.0
940.8
50000.0
10000.0
20000.0
9000.0
1500.0
10000.0
4930.0
100.0
30000.0
6600.0
50000.0
10000.0
5400.0
30000.0
5

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['PIC50']=df['standard_value'].apply(ic50topic50)


(3660, 38)
3425
(3660, 38)


In [16]:
# Save file to FRCE
smiles_df.to_csv(data_dir + 'raw/' + target.lower()+ '_dtc_smiles.csv',index=False)

In [18]:
print(smiles_df.shape)
smiles_df.head()

(3660, 38)


Unnamed: 0.1,compound_id,standard_inchi_key,compound_name,synonym,target_id,target_pref_name,gene_names,wildtype_or_mutant,mutation_info,pubmed_id,...,assay_description,title,journal,doc_type,annotation_comments,PIC50,Unnamed: 0,CID,smiles,rdkit_smiles
0,CHEMBL1232461,AAAQFGUYHFJNHI-SFHVURJKSA-N,GSK525762A,,P10635,CYTOCHROME P450 2D6,CYP2D6,,,24015967.0,...,Inhibition of human CYP2D6,Discovery of epigenetic regulator I-BET762: le...,J. Med. Chem.,PUBLICATION,,4.481486,0,46943432,CCNC(=O)CC1C2=NN=C(N2C3=C(C=C(C=C3)OC)C(=N1)C4...,CCNC(=O)CC1N=C(c2ccc(Cl)cc2)c2cc(OC)ccc2-n2c(C...
1,CHEMBL204705,AAKBFDPOXZBIAO-UHFFFAOYSA-N,,,P10635,CYTOCHROME P450 2D6,CYP2D6,,,16297617.0,...,Inhibition of CYP2D6 by supersome assay,Reduction of CYP450 inhibition in the 4-[(1H-i...,Bioorg. Med. Chem. Lett.,PUBLICATION,,5.30103,1,44408118,C1CCN(C(=O)C1)CC2=CC=C(C=C2)OCCCN3CCC(CC3)CC4=...,O=C1CCCCN1Cc1ccc(OCCCN2CCC(Cc3cnc[nH]3)CC2)cc1
2,CHEMBL1086377,AATCBLYHOUOCTO-UHFFFAOYSA-N,KU-0060648,,P10635,CYTOCHROME P450 2D6,CYP2D6,,,23855836.0,...,Inhibition of CYP2D6 (unknown origin),"1-substituted (Dibenzo[b,d]thiophen-4-yl)-2-mo...",J. Med. Chem.,PUBLICATION,,5.0,2,11964036,CCN1CCN(CC1)CC(=O)NC2=C3C4=CC=CC=C4SC3=C(C=C2)...,CCN1CCN(CC(=O)Nc2ccc(-c3cccc4c(=O)cc(N5CCOCC5)...
3,CHEMBL490153,ABACVOXFUHDKNZ-UHFFFAOYSA-N,A-867744,,P10635,CYTOCHROME P450 2D6,CYP2D6,,,19419141.0,...,Inhibition of CYP2D6,Discovery of 4-(5-(4-chlorophenyl)-2-methyl-3-...,J. Med. Chem.,PUBLICATION,,5.69897,3,23642319,CCC(=O)C1=C(N(C(=C1)C2=CC=C(C=C2)Cl)C3=CC=C(C=...,CCC(=O)c1cc(-c2ccc(Cl)cc2)n(-c2ccc(S(N)(=O)=O)...
4,CHEMBL456654,ABAOWFCFAADMKA-UHFFFAOYSA-N,,,P10635,CYTOCHROME P450 2D6,CYP2D6,,,18954985.0,...,Inhibition of CYP2D6 by fluorescence based assay,"Novel 3,3-disubstituted pyrrolidines as select...",Bioorg. Med. Chem. Lett.,PUBLICATION,,5.455932,4,44230873,C1CNCC1(CC2=CC=CC=N2)C3=CC4=C(C=C3)NC=C4,c1ccc(CC2(c3ccc4[nH]ccc4c3)CCNC2)nc1


### Apply ATOM standard 'curation' step: Average replicate assays, remove duplicates and drop cases with large variance between replicates.


### Use atom_curation()

In [19]:
imp.reload(dcf)
curated_lst = dcf.atom_curation(targ_lst, smiles_lst, shared_inchi_keys)

gene_names ['CYP2D6']
standard_type ['IC50']
standard_relation ['=']
before (1881, 38)
Bad duplicates removed from dataset
Dataframe size (1803, 42)

Dataset de-duplicated
Dataframe size (1611, 42)
New column created with averaged values:  VALUE_NUM_mean
After (1611, 42) # of dropped compounds 2108


### Use average_and_remove_duplicates() 

In [24]:
data=smiles_df
column = 'PIC50'
tolerance = 10
list_bad_duplicates = 'Yes'
max_std = 1
compound_id = 'compound_id'
smiles_col = 'rdkit_smiles'
check_df = curate_data.average_and_remove_duplicates(column, tolerance, 
                                                       list_bad_duplicates, 
                                                       data, max_std, 
                                                       compound_id=compound_id, 
                                                       smiles_col=smiles_col)
# note: checked that this is already curated
[check_df.shape]
check_df.head()

Bad duplicates removed from dataset
Dataframe size (3568, 42)
List of 'bad' duplicates removed
        compound_id     PIC50  VALUE_NUM_mean   Perc_Var  VALUE_NUM_std
3209  CHEMBL1079256  4.000000        4.500000  11.111111       0.707107
3208  CHEMBL1079256  5.000000        4.500000  11.111111       0.707107
2989  CHEMBL1083708  4.522879        5.118520  11.636990       0.526728
1875  CHEMBL1090433  5.481486        4.859614  12.796729       0.436429
1398  CHEMBL1091776  5.136677        6.373215  19.402100       2.015510
...             ...       ...             ...        ...            ...
458    CHEMBL522295  6.397940        6.555182   2.398745       1.073836
151    CHEMBL556506  5.000000        4.472542  11.793246       0.351656
2136      CHEMBL682  6.193820        5.541053  11.780550       0.565312
2461      CHEMBL900  6.397940        5.460409  17.169603       1.325869
2460      CHEMBL900  4.522879        5.460409  17.169603       1.325869

[92 rows x 5 columns]

Dataset de-duplic

Unnamed: 0.1,compound_id,standard_inchi_key,compound_name,synonym,target_id,target_pref_name,gene_names,wildtype_or_mutant,mutation_info,pubmed_id,...,annotation_comments,PIC50,Unnamed: 0,CID,smiles,rdkit_smiles,VALUE_NUM_mean,VALUE_NUM_std,Perc_Var,Remove_BadDuplicate
0,CHEMBL1232461,AAAQFGUYHFJNHI-SFHVURJKSA-N,GSK525762A,,P10635,CYTOCHROME P450 2D6,CYP2D6,,,24015967.0,...,,4.481486,0,46943432,CCNC(=O)CC1C2=NN=C(N2C3=C(C=C(C=C3)OC)C(=N1)C4...,CCNC(=O)CC1N=C(c2ccc(Cl)cc2)c2cc(OC)ccc2-n2c(C...,4.481486,,0.0,0
1,CHEMBL204705,AAKBFDPOXZBIAO-UHFFFAOYSA-N,,,P10635,CYTOCHROME P450 2D6,CYP2D6,,,16297617.0,...,,5.30103,1,44408118,C1CCN(C(=O)C1)CC2=CC=C(C=C2)OCCCN3CCC(CC3)CC4=...,O=C1CCCCN1Cc1ccc(OCCCN2CCC(Cc3cnc[nH]3)CC2)cc1,5.30103,,0.0,0
2,CHEMBL1086377,AATCBLYHOUOCTO-UHFFFAOYSA-N,KU-0060648,,P10635,CYTOCHROME P450 2D6,CYP2D6,,,23855836.0,...,,5.0,2,11964036,CCN1CCN(CC1)CC(=O)NC2=C3C4=CC=CC=C4SC3=C(C=C2)...,CCN1CCN(CC(=O)Nc2ccc(-c3cccc4c(=O)cc(N5CCOCC5)...,5.0,,0.0,0
3,CHEMBL490153,ABACVOXFUHDKNZ-UHFFFAOYSA-N,A-867744,,P10635,CYTOCHROME P450 2D6,CYP2D6,,,19419141.0,...,,5.69897,3,23642319,CCC(=O)C1=C(N(C(=C1)C2=CC=C(C=C2)Cl)C3=CC=C(C=...,CCC(=O)c1cc(-c2ccc(Cl)cc2)n(-c2ccc(S(N)(=O)=O)...,5.69897,,0.0,0
4,CHEMBL456654,ABAOWFCFAADMKA-UHFFFAOYSA-N,,,P10635,CYTOCHROME P450 2D6,CYP2D6,,,18954985.0,...,,5.455932,4,44230873,C1CNCC1(CC2=CC=CC=N2)C3=CC4=C(C=C3)NC=C4,c1ccc(CC2(c3ccc4[nH]ccc4c3)CCNC2)nc1,5.455932,,0.0,0


### Use aggregate_assay_data()

In [26]:
import atomsci.ddm.utils.curate_data as curate_data, imp
imp.reload(curate_data)

tolerance=10
column='PIC50'; #'standard_value'
list_bad_duplicates='No'
max_std=1
list_bad_duplicates='Yes'

data=check_df
print("before",check_df.shape)

temp_df=curate_data.aggregate_assay_data(data, value_col=column, output_value_col=None,
                         label_actives=True,
                         active_thresh=6, # None | This creates 'active' column based on pIC50 value. Rule used here: pIC50 >=6 'active', labelled as 0, else 1
                         id_col='standard_inchi_key', smiles_col='rdkit_smiles', relation_col='standard_relation')

# Remove inf in curated_df
temp_df = temp_df[~temp_df.isin([np.inf]).any(1)]
#censored_curated_df = censored_curated_df[~censored_curated_df.isin([np.inf]).any(1)]
print("after",temp_df.shape)
temp_df.head()

before (3263, 42)
after (3263, 5)


Unnamed: 0,compound_id,base_rdkit_smiles,relation,PIC50,active
0,ASJOLIPQCOYRPT-UHFFFAOYSA-N,N#Cc1cc(OC(F)(F)F)cc(-c2nc(-c3ccc4c(c3)CCN4C(=...,>,4.30103,0
1,NCPZEGFWCSOTGE-UHFFFAOYSA-N,CCOc1ccc2nc(SCc3ccccn3)[nH]c2c1,,4.30103,0
2,UZVPIJMLVISXHS-UHFFFAOYSA-N,Cc1cc2c(-c3ccc[nH]c3=O)c(C(=O)O)n(Cc3cc4ccccc4...,>,4.30103,0
3,IJLVCMAAMCBQNU-UHFFFAOYSA-N,CN(C)Cc1ccc2c(c1)CCN(C(=O)c1cc3cc(Cl)ccc3n1C)C2,,5.537602,0
4,LFSBNTMFCXGIEC-UHFFFAOYSA-N,CN(CCO)C1CCN(c2nc3ccccc3n2Cc2ccc(F)cc2)CC1,,4.454075,0


In [None]:
# Save to FRCE
#!mkdir $data_dir'ml_ready/'
temp_df.to_csv(data_dir + 'ml_ready/'+target.lower()+'_dtc_base_smiles_all.csv')

### Visualization

In [None]:
import atomsci.ddm.utils.curate_data as curate_data
imp.reload(curate_data)

data = temp_df
column = 'PIC50' #'VALUE_NUM_mean'
num_bins = 20
units = 'NM' #'log nm'
filepath = ""
title = target.upper()
data=temp_df

curate_data.summarize_data(column, num_bins, title, units, filepath, data)

In [None]:
old_compound_id='rdkit_smiles'
new_compound_id='base_rdkit_smiles'

# Takes all the compounds that aren't part of the curated data frame and prints them
reject=smiles_df[~smiles_df[old_compound_id].isin(temp_df[new_compound_id])]
[reject.shape]
reject.head(2)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

fig, ax = plt.subplots()
before=smiles_df["PIC50"]
after=temp_df["PIC50"] 
sns.distplot(before, bins=range(0, 12, 1), ax=ax, kde=False, label='before')
sns.distplot(after, bins=range(0, 12, 1), ax=ax, kde=False, label='after')
plt.legend()
ax.set_xlim([0, 12])
plt.title(target.upper() +' - Data Curation - DTC raw \n#discarded compounds='+str(reject[compound_id].nunique()));