# Here, we will add active column to Union train and test data from MoDac. 

### Please note that we have already done similar curation on individual DTC, Excape and ChEMBL 'raw' data as earlier stages. Now, we will use active column of those individual dataset to find active value of each compound on union train and testset by matching the rdkit_smiles and compound_id.

Titli Sarkar
March 15, 2023

In [3]:
import importlib as imp
import sys
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_context('paper')
sns.set_style("whitegrid")
sns.set_palette("Set2")
plt.rcParams['figure.figsize'] = [10,5]
#pal = sns.color_palette("Set2")
plt.rcParams['font.size'] = 5

In [4]:
import atomsci.ddm.utils.data_curation_functions as dcf
import atomsci.ddm.utils.curate_data as curate_data
from atomsci.ddm.utils.struct_utils import base_smiles_from_smiles

In [6]:
# Just run all cells with changing target you want
target = 'CYP2D6' # 'CYP2D6' or 'CYP2C9'

In [7]:
data_dir = "/mnt/projects/ATOM/sarkart4/MoDaC/NCI_DOE_Archive/ATOM/Safety_Screen_Targets/"+target.upper()+'-ampl-1.1.0/'
!ls $data_dir

B9FDFB00
CHEMBL25-CYP2D6_human_IC50_26Nov2019.txt
cyp2d6_chembl_smiles_active.csv
cyp2d6_chembl_smiles_active_th5.csv
cyp2d6_chembl_testset_base_smiles_union_active.csv
cyp2d6_chembl_testset_base_smiles_union.csv
CYP2D6_curated.csv
CYP2D6_curated_train_valid_test_scaffold_0f84c002-8d4f-408b-a50f-e3fcba80ca4c.csv
cyp2d6_dtc_smiles_active.csv
cyp2d6_dtc_smiles_active_th5.csv
cyp2d6_dtc_smiles.csv
cyp2d6_dtc_testset_base_smiles_union_active.csv
cyp2d6_dtc_testset_base_smiles_union.csv
cyp2d6_excape_smiles_active.csv
cyp2d6_excape_smiles_active_th5.csv
cyp2d6_excape_smiles.csv
cyp2d6_excape_testset_base_smiles_union_active.csv
cyp2d6_excape_testset_base_smiles_union.csv
CYP2D6_merged
CYP2D6_merged.csv
CYP2D6_reject_IC50.csv
cyp2d6_union_testset_base_smiles.csv
cyp2d6_union_trainset_base_smiles.csv
cyp2d6_union_trainset_base_smiles_train_valid_test_scaffold_4cb49d5e-1882-474e-ae0b-781d77a4738a.csv
cyp2d6_union_trainset_base_smiles_train_valid_test_scaffold_c8c36365-b

In [8]:
file = data_dir + target.lower()+'_union_testset_base_smiles.csv' # exisitng MoDaC raw
union_test_df = pd.read_csv(file,sep=",",engine="python",error_bad_lines=False) # Note that my orig_df is MoDaC files, you should use the raw file downloaded form ExcapeDB
print(union_test_df.shape)
union_test_df.head()

(1843, 5)




  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,base_rdkit_smiles,compound_id,pIC50,relation,active
0,S=C(Nc1ccc2cn[nH]c2c1)Nc1ccc2cn[nH]c2c1,OTJWJLLGPAFTIP-KNLWZTMUNA-N,5.7,,1
1,Cc1[nH]c2ccccc2c1C(c1ccccn1)N1CCC(C)CC1,NBKDMBGWUXYCJB-UHFFFAOYNA-N,4.6,<,0
2,CC(C)(Cc1c[nH]c2ccc(Cl)cc12)NCCOc1ccccc1OCC1CC1,XLZHXAXXJVKTFM-UHFFFAOYNA-N,7.0,,1
3,CCN(CC)C(=O)CN1CCCC(c2nc3ccccc3n2Cc2ccc(F)cc2)C1,CHEMBL1935432,5.720831,,1
4,NC(=NCc1ccc(Cl)cc1)SCCCc1c[nH]cn1,UCAIEVHKDLMIFL-QDYITYEQNA-N,4.9,<,0


In [9]:
union_test_df.columns

Index(['base_rdkit_smiles', 'compound_id', 'pIC50', 'relation', 'active'], dtype='object')

In [10]:
# first, delete old 'active' column
union_test_df = union_test_df.drop(columns = ['active'])
print(union_test_df.shape) 
union_test_df.head(2)

(1843, 4)


Unnamed: 0,base_rdkit_smiles,compound_id,pIC50,relation
0,S=C(Nc1ccc2cn[nH]c2c1)Nc1ccc2cn[nH]c2c1,OTJWJLLGPAFTIP-KNLWZTMUNA-N,5.7,
1,Cc1[nH]c2ccccc2c1C(c1ccccn1)N1CCC(C)CC1,NBKDMBGWUXYCJB-UHFFFAOYNA-N,4.6,<


In [None]:
#next, standardize smiles (for extra precaution)
#union_test_df['base_rdkit_smiles'] = base_smiles_from_smiles(union_test_df['base_rdkit_smiles'].to_list())

In [11]:
# Read three previously correctly 'active' marked raw dataframes
dtc_df = pd.read_csv(data_dir +target.lower()+'_dtc_smiles_active.csv')[['rdkit_smiles', 'active']]
excape_df = pd.read_csv(data_dir +target.lower()+'_excape_smiles_active.csv')[['rdkit_smiles', 'active']]
chembl_df = pd.read_csv(data_dir +target.lower()+'_chembl_smiles_active.csv')[['rdkit_smiles', 'active']]
[dtc_df.shape, excape_df.shape, chembl_df.shape]

[(3834, 2), (7961, 2), (2297, 2)]

In [12]:
union_test_df['base_rdkit_smiles'].isin(dtc_df['rdkit_smiles']).value_counts()

False    1439
True      404
Name: base_rdkit_smiles, dtype: int64

In [13]:
union_test_df['base_rdkit_smiles'].isin(excape_df['rdkit_smiles']).value_counts()

True     1166
False     677
Name: base_rdkit_smiles, dtype: int64

In [14]:
union_test_df['base_rdkit_smiles'].isin(chembl_df['rdkit_smiles']).value_counts()

False    1513
True      330
Name: base_rdkit_smiles, dtype: int64

In [None]:
# #dtc_df[['rdkit_smiles', 'active']]
# dtc_df['base_rdkit_smiles'] = base_smiles_from_smiles(dtc_df['rdkit_smiles'].to_list())
# dtc_df.head(5)

In [None]:
# dtc_df = dtc_df.drop(columns=['rdkit_smiles'])

In [None]:
# dtc_df

In [27]:
# Add 'active' column to original data and save file to FRCE
df1 = pd.merge(left=dtc_df, right=excape_df, how='outer', on='rdkit_smiles')
df1 = df1.drop(columns=['active_y']).rename(columns={'active_x':'active'})

df2 = pd.merge(left=df1, right=chembl_df, how='outer', on='rdkit_smiles')
df2 = df2.drop(columns=['active_y']).rename(columns={'active_x':'active'})
df2

merged = pd.merge(left=union_test_df, right=df2, how='left', left_on='base_rdkit_smiles', right_on='rdkit_smiles')
#merged = merged.drop(columns=['rdkit_smiles'])

# merged = pd.merge(left=merged, right=dtc_df, how='left', left_on='base_rdkit_smiles', right_on='rdkit_smiles')
# merged = merged.drop(columns=['rdkit_smiles'])
# merged = pd.merge(pd.merge(union_test_df,dtc_df,left_on='base_rdkit_smiles', right_on='rdkit_smiles'),
#                   df3,on='Courses')

#merged = pd.merge(left=union_test_df, right=dtc_df, how='left', left_on='base_rdkit_smiles', right_on='rdkit_smiles')
#merged = merged.drop(columns=['compound_id_y', 'pIC50_y', 'relation_y'])
#merged = merged.rename(columns={'compound_id_x':'compound_id', 'pIC50_x':'pIC50', 'relation_':'relation'})
print(merged.shape)
print(merged.columns)
merged.head()

(1892, 6)
Index(['base_rdkit_smiles', 'compound_id', 'pIC50', 'relation', 'rdkit_smiles',
       'active'],
      dtype='object')


Unnamed: 0,base_rdkit_smiles,compound_id,pIC50,relation,rdkit_smiles,active
0,S=C(Nc1ccc2cn[nH]c2c1)Nc1ccc2cn[nH]c2c1,OTJWJLLGPAFTIP-KNLWZTMUNA-N,5.7,,S=C(Nc1ccc2cn[nH]c2c1)Nc1ccc2cn[nH]c2c1,
1,Cc1[nH]c2ccccc2c1C(c1ccccn1)N1CCC(C)CC1,NBKDMBGWUXYCJB-UHFFFAOYNA-N,4.6,<,Cc1[nH]c2ccccc2c1C(c1ccccn1)N1CCC(C)CC1,
2,CC(C)(Cc1c[nH]c2ccc(Cl)cc12)NCCOc1ccccc1OCC1CC1,XLZHXAXXJVKTFM-UHFFFAOYNA-N,7.0,,CC(C)(Cc1c[nH]c2ccc(Cl)cc12)NCCOc1ccccc1OCC1CC1,
3,CCN(CC)C(=O)CN1CCCC(c2nc3ccccc3n2Cc2ccc(F)cc2)C1,CHEMBL1935432,5.720831,,CCN(CC)C(=O)CN1CCCC(c2nc3ccccc3n2Cc2ccc(F)cc2)C1,0.0
4,NC(=NCc1ccc(Cl)cc1)SCCCc1c[nH]cn1,UCAIEVHKDLMIFL-QDYITYEQNA-N,4.9,<,NC(=NCc1ccc(Cl)cc1)SCCCc1c[nH]cn1,


In [28]:
merged.active.value_counts()

0.0    389
1.0     51
Name: active, dtype: int64

In [None]:
merged.to_csv(data_dir +target.lower()+'_excape_smiles_active.csv')

# Add 'active' column to testset

In [None]:
excape_raw_active = pd.read_csv(data_dir + target.lower()+'_excape_smiles_active.csv' ,sep=",")
print(excape_raw_active.shape)
excape_raw_active.head(2)

In [None]:
excape_test_df = pd.read_csv(data_dir + target.lower()+'_excape_testset_base_smiles_union.csv' ,sep=",")
excape_test_df.active.value_counts()

In [None]:
excape_test_df.shape

In [None]:
excape_test_df = excape_test_df.drop(columns = ['active'])
print(excape_test_df.shape) # Note: wrong 'active' column, will be marked correctly in next step
excape_test_df.head(2)

In [None]:
# Add 'active' column and save file to FRCE
#merged = pd.merge(left=dtc_test_df, right=dtc_raw_active, how='left', left_on='base_rdkit_smiles', right_on='rdkit_smiles')
merged = excape_test_df.merge(excape_raw_active[['rdkit_smiles', 'active']], how = 'left', 
                           left_on = 'base_rdkit_smiles', right_on = 'rdkit_smiles').drop(columns = ['rdkit_smiles'])
print(merged.shape)
merged.head(2)

In [None]:
merged.active.value_counts()

In [None]:
merged.to_csv(data_dir +target.lower()+'_excape_testset_base_smiles_union_active.csv')

# Plot pIC50 for A:N from Excape paper labelling (column='Activity Flag')

In [None]:
#plot pIC50 distribution based on ExcapeDB A:N labels (ModDaC raw)
raw = pd.read_csv(data_dir +target.lower()+'_excape_smiles_active.csv')
fig, ax = plt.subplots(figsize = (6,4))
sns.kdeplot(data=raw, x="pIC50", hue="Activity_Flag", palette="Set2")
plt.title(target.upper()+' - Distribution of pIC50 from ExcapeDB (MoDaC raw)')
plt.xlabel('pIC50')
plt.show()

# Plot pIC50 for 'Active' column from our labelling (column='active')

In [None]:
#plot pIC50 distribution based on our 'active' labelling (ModDaC raw)
#raw = pd.read_csv(data_dir +target.lower()+'_excape_smiles_active.csv')
fig, ax = plt.subplots(figsize = (6,4))
sns.kdeplot(data=raw, x="pIC50", hue="active", palette="Set1")
plt.title(target.upper()+' - Distribution of pIC50 (MoDaC raw)')
plt.xlabel('pIC50')
plt.show()

# Use active_threashold=5 in aggregate_assay_data()

In [None]:
imp.reload(curate_data)
tolerance=10
column='pIC50'   #'pXC50'; #'standard_value'
list_bad_duplicates='Yes'
max_std=1
data=agg_data
print("before",data.shape)
temp_df1=curate_data.aggregate_assay_data(data, value_col=column, output_value_col=None,
                             label_actives=True,
                             active_thresh=5, # val > 5 -> 'active' (drug-like compounds) # can be - None(default)| This creates 'active' column based on pIC50 value. Rule used here: pIC50 >=6 'active', labelled as 0, else 1
                             id_col='Ambit_InchiKey', smiles_col='rdkit_smiles', relation_col='standard_relation')
                             #id_col='standard_inchi_key', smiles_col='rdkit_smiles', relation_col='standard_relation')
# Remove inf in curated_df
temp_df1 = temp_df1[~temp_df1.isin([np.inf]).any(1)]
print("after",temp_df1.shape)
temp_df1.head()

In [None]:
# Add 'active' column to original data and save file to FRCE
merged = pd.merge(left=excape_raw_df, right=temp_df1, how='left', left_on='rdkit_smiles', right_on='base_rdkit_smiles')
merged = merged.drop(columns=['pIC50_y', 'base_rdkit_smiles'])
merged = merged.rename(columns={'pIC50_x':'pIC50'})
print(merged.shape)
print(merged.active.value_counts())
merged.to_csv(data_dir +target.lower()+'_excape_smiles_active_th5.csv')
merged.head(5)

# plot pIC50 distribution  with active_threashold=5


In [None]:
raw = pd.read_csv(data_dir +target.lower()+'_excape_smiles_active_th5.csv')
fig, ax = plt.subplots(figsize = (6,4))
sns.kdeplot(data=raw, x="pIC50", hue="active", palette="Set2")
plt.title(target.upper()+' - Distribution of pIC50 with active_threashold=5 (MoDaC raw)')
plt.xlabel('pIC50')
plt.show()