In [3]:
import os
import pandas as pd
import numpy as np
import rdkit
from tqdm import tqdm
from toolbox import smiles2inchi, smiles_unique,shuffle_train_test
# from pandas_profiling import ProfileReport
# from biopython import SeqIO
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Raise the RDKit logger threshold to CRITICAL to keep cell output readable.
lg = rdkit.RDLogger.logger()
lg.setLevel(rdkit.RDLogger.CRITICAL)

In [4]:
# Location of the raw ChEMBL 30 export.
RAW_CHEMBL_CSV = '/mnt/nas/ai-project-data/syl/chembl30/chembl30_all_raw_20220317.csv'
# error_bad_lines=False: rows with an unexpected number of fields are skipped
# (and reported on stderr) instead of aborting the load.
# NOTE(review): pandas 1.3 deprecated error_bad_lines in favour of
# on_bad_lines='skip' — switch when the environment's pandas is upgraded.
df = pd.read_csv(RAW_CHEMBL_CSV, error_bad_lines=False)
df.shape

b'Skipping line 13069947: expected 28 fields, saw 37\nSkipping line 13069948: expected 28 fields, saw 37\nSkipping line 13069949: expected 28 fields, saw 37\nSkipping line 13069950: expected 28 fields, saw 37\nSkipping line 13069951: expected 28 fields, saw 37\nSkipping line 13069952: expected 28 fields, saw 37\nSkipping line 13069953: expected 28 fields, saw 37\nSkipping line 13069954: expected 28 fields, saw 37\nSkipping line 13069955: expected 28 fields, saw 37\nSkipping line 13069956: expected 28 fields, saw 37\nSkipping line 13069957: expected 28 fields, saw 37\nSkipping line 13069958: expected 28 fields, saw 37\nSkipping line 13069959: expected 28 fields, saw 37\nSkipping line 13069960: expected 28 fields, saw 37\nSkipping line 13069961: expected 28 fields, saw 37\nSkipping line 13069962: expected 28 fields, saw 37\nSkipping line 13069963: expected 28 fields, saw 37\nSkipping line 13069964: expected 28 fields, saw 37\nSkipping line 13069965: expected 28 fields, saw 37\nSkipping l

(20345140, 28)

In [5]:
# Number of distinct target_chembl_id values (a missing value counts as one).
df['target_chembl_id'].nunique(dropna=False)

14408

In [6]:
# Number of rows that carry a uniprot_id.
df['uniprot_id'].notna().sum()

8931184

In [7]:
# Drop rows whose uniprot_id is missing.
df = df.dropna(subset=['uniprot_id'])
df.shape

(8931184, 28)

In [8]:
# Drop rows missing any of: SMILES, measured value, protein sequence.
required_columns = ['canonical_smiles', 'standard_value', 'sequence']
df = df.dropna(subset=required_columns)
df.shape

(8242817, 28)

In [9]:
# Keep only measurements reported with the '=' relation.
df = df[df['standard_relation'].eq('=')]
df.shape

(5133720, 28)

In [10]:
# Restrict to IC50 measurements.
df_IC50 = df[df['standard_type'].eq('IC50')]
df_IC50.shape

(1226898, 28)

In [11]:
# Inspect which units occur among the IC50 rows, and how often.
unit_counts = df_IC50['standard_units'].value_counts()
unit_counts.to_dict()

{'nM': 1218682,
 'ug.mL-1': 7589,
 '%': 88,
 '10^-9mol/L': 46,
 '10^-8mol/L': 33,
 '10^-7mol/L': 32,
 '/uM': 21,
 'milliequivalent': 20,
 'ppm': 18,
 '10^-10mol/L': 18,
 'uM tube-1': 9,
 'min': 9,
 'molar ratio': 8,
 'mg kg-1': 6,
 'nmol/mg': 5,
 'p.p.m.': 5,
 '10^-5 mol/L': 5,
 'mg.min/m3': 4,
 '10^-4microM': 4,
 '10^-6 mol/L': 4,
 'umol/dm3': 4,
 "10'5pM": 3,
 '10^-5 uM': 2,
 "10'6pM": 2,
 "10'3pM": 2,
 'ucm/s': 2,
 'ucm': 2,
 'ug': 2,
 "10'-4umol/L": 1,
 '/uM/s': 1,
 "10'13nM": 1,
 "10'8nM": 1,
 "10'7nM": 1,
 "10'16 uM": 1,
 "10'20 uM": 1}

In [12]:
# Keep only rows whose unit is nM; rows in any other unit are discarded, not converted.
df_IC50 = df_IC50[df_IC50['standard_units'].eq('nM')]
df_IC50.shape

(1218682, 28)

In [13]:
# Drop rows whose SMILES is rejected by the toolbox helpers.
def _smiles_flag(smi):
    """Return "1" if smiles2inchi and smiles_unique both accept ``smi`` without raising, else "0"."""
    try:
        smiles2inchi(smi)
        smiles_unique([smi])
    except Exception:
        return "0"
    return "1"

label_list = [
    _smiles_flag(smi)
    for smi in tqdm(df_IC50["canonical_smiles"], total=df_IC50.shape[0])
]
# The "label" column is kept on the frame; a later cell drops it explicitly.
df_IC50.loc[:, "label"] = label_list
df_IC50 = df_IC50[df_IC50["label"] == "1"]
df_IC50.shape

  Error in cis/trans stereochemistry specified for the double bond

100%|██████████| 1218682/1218682 [21:48<00:00, 931.61it/s]


(1218682, 29)

In [14]:
# Add COMPOUND_SMILES (smiles_unique output) and INCHI_KEY (smiles2inchi of COMPOUND_SMILES).
def _unique_smiles(smi):
    """Run smiles_unique on a single SMILES and return its first result."""
    return smiles_unique([smi])[0]

df_IC50.loc[:, "COMPOUND_SMILES"] = df_IC50['canonical_smiles'].apply(_unique_smiles)
df_IC50.loc[:, "INCHI_KEY"] = df_IC50['COMPOUND_SMILES'].apply(smiles2inchi)
df_IC50.shape

  Error in cis/trans stereochemistry specified for the double bond



(1218682, 31)

In [15]:
# Inspect the INCHI_KEY column.
df_IC50['INCHI_KEY']

78          XMAYWYJOQHXEEK-JYFHCDHNSA-N
79          XMAYWYJOQHXEEK-BVAGGSTKSA-N
80          XMAYWYJOQHXEEK-UHFFFAOYSA-N
81          XMAYWYJOQHXEEK-UHFFFAOYSA-N
83          FXNFHKRTJBSTCS-UHFFFAOYSA-N
                       ...             
20344230    ZQTDPABHHRSFDF-FRJWGUMJSA-N
20344231    IVHVIBKVJIZKOC-RTWAVKEYSA-N
20344232    IVHVIBKVJIZKOC-RTWAVKEYSA-N
20344233    IVHVIBKVJIZKOC-RTWAVKEYSA-N
20344271    VKJGBAJNNALVAV-UHFFFAOYSA-M
Name: INCHI_KEY, Length: 1218682, dtype: object

In [16]:
# Drop rows where COMPOUND_SMILES or INCHI_KEY is missing.
df_IC50 = df_IC50.dropna(subset=['COMPOUND_SMILES', 'INCHI_KEY'])
df_IC50.shape

(1218682, 31)

In [17]:
# Handle SMILES that contain '.' (several disconnected fragments).
def remove_solution(smi):
    """Return the longest '.'-separated fragment of a SMILES string.

    Every fragment is considered, not only the first two, so an input such as
    '[Na+].[Na+].CCCCCC(=O)[O-]' yields 'CCCCCC(=O)[O-]' rather than '[Na+]'.

    Args:
        smi: SMILES string, possibly containing '.' separators.

    Returns:
        The longest fragment by character count. On a length tie the earliest
        fragment wins. A string without '.' is returned unchanged.
    """
    # max() returns the first maximal element, which keeps the
    # "earlier fragment wins ties" rule of the two-fragment case.
    return max(smi.split('.'), key=len)
# Reduce every multi-fragment SMILES to a single fragment.
# NOTE(review): INCHI_KEY was computed from COMPOUND_SMILES before this step, so
# for rows changed here it still describes the multi-fragment form — confirm
# whether it should be recomputed.
df_IC50['COMPOUND_SMILES'] = df_IC50['COMPOUND_SMILES'].apply(remove_solution)
# Sanity check: expect zero rows that still contain a '.'.
# regex=False matches a literal dot and avoids the invalid '\.' escape sequence.
print(df_IC50[df_IC50['COMPOUND_SMILES'].str.contains('.', regex=False)].shape)
print(df_IC50.shape)

(0, 31)
(1218682, 31)


In [18]:
# Handle SMILES that contain a tab character.
def remove_solution1(smi):
    """Return the longest tab-separated piece of a SMILES string.

    Every piece is considered, not only the first two.

    Args:
        smi: SMILES string, possibly containing tab characters.

    Returns:
        The longest piece by character count. On a length tie the earliest
        piece wins. A string without a tab is returned unchanged.
    """
    # max() returns the first maximal element, which keeps the
    # "earlier piece wins ties" rule of the two-piece case.
    return max(smi.split('\t'), key=len)
# Apply the tab-handling cleaner. This cell previously re-applied remove_solution,
# so SMILES containing tabs were never actually processed.
df_IC50['COMPOUND_SMILES'] = df_IC50['COMPOUND_SMILES'].apply(remove_solution1)
# Sanity check: expect zero rows that still contain a tab.
print(df_IC50[df_IC50['COMPOUND_SMILES'].str.contains('\t')].shape)
print(df_IC50.shape)

(0, 31)
(1218682, 31)


In [19]:
# Add the UID column: COMPOUND_SMILES + '_' + uniprot_id.
smiles_part = df_IC50['COMPOUND_SMILES']
target_part = df_IC50['uniprot_id']
df_IC50['UID'] = smiles_part + '_' + target_part
df_IC50.shape

(1218682, 32)

In [20]:
def _nm_to_pic50(nm_value):
    """Convert a concentration in nM to -log10(molar concentration), rounded to 4 decimals."""
    molar = nm_value * 10**(-9)
    return round(-np.log10(molar), 4)

# standard_value is in nM here (only nM rows were kept earlier).
# A value of 0 gives inf and a negative value gives NaN.
df_IC50['IC50pki'] = df_IC50['standard_value'].apply(_nm_to_pic50)
df_IC50.head(2)

Unnamed: 0,molregno,canonical_smiles,activity_id,assay_id,standard_relation,standard_value,standard_units,standard_type,molecule_chembl_id,molecule_type,description,assay_strain,assay_tissue,assay_cell_type,confidence_score,assay_chembl_id,tid,target_type,target_pref_name,target_chembl_id,organism,targcomp_id,component_id,homologue,component_type,uniprot_id,sequence,sequence_md5sum,label,COMPOUND_SMILES,INCHI_KEY,UID,IC50pki
78,156572,CC(=O)N1CCN(c2ccc(OC[C@H]3CO[C@@](Cn4ccnc4)(c4...,1256007,17,=,1080.0,nM,IC50,CHEMBL319160,Small molecule,Inhibition of cytochrome P450 progesterone 15-...,,,,9,CHEMBL884521,100122,SINGLE PROTEIN,Cytochrome P450 2A2,CHEMBL3705,Rattus norvegicus,1808.0,2020.0,0.0,PROTEIN,P15149,MLDTGLLLVVILASLSVMFLVSLWQQKIRERLPPGPTPLPFIGNYL...,2954bba4ecdf5fe262f6e90d5d47a524,1,Clc1ccc(c(c1)Cl)[C@@]1(OC[C@@H](O1)COc1ccc(cc1...,XMAYWYJOQHXEEK-JYFHCDHNSA-N,Clc1ccc(c(c1)Cl)[C@@]1(OC[C@@H](O1)COc1ccc(cc1...,5.9666
79,156650,CC(=O)N1CCN(c2ccc(OC[C@@H]3CO[C@](Cn4ccnc4)(c4...,1239273,17,=,728.0,nM,IC50,CHEMBL328863,Small molecule,Inhibition of cytochrome P450 progesterone 15-...,,,,9,CHEMBL884521,100122,SINGLE PROTEIN,Cytochrome P450 2A2,CHEMBL3705,Rattus norvegicus,1808.0,2020.0,0.0,PROTEIN,P15149,MLDTGLLLVVILASLSVMFLVSLWQQKIRERLPPGPTPLPFIGNYL...,2954bba4ecdf5fe262f6e90d5d47a524,1,Clc1ccc(c(c1)Cl)[C@]1(OC[C@H](O1)COc1ccc(cc1)N...,XMAYWYJOQHXEEK-BVAGGSTKSA-N,Clc1ccc(c(c1)Cl)[C@]1(OC[C@H](O1)COc1ccc(cc1)N...,6.1379


In [21]:
# df_IC50.to_csv('Chembl30_IC50_raw_new.csv', index=False)

In [22]:
# De-duplicate by UID: gather every IC50pki value recorded for each UID.
g = df_IC50.groupby(['UID'], as_index=True)
# One array of IC50pki values per UID group.
value_list = g.apply(lambda x: x['IC50pki'].values)
# One row per UID; after reset_index the 'IC50pki' column holds the number of
# measurements in the group, not a pIC50 value.
out_df = g['IC50pki'].count().reset_index()
# Positional assignment: relies on value_list and out_df listing the UID groups
# in the same order.
out_df.loc[:, 'Duplicate_Values'] = list(value_list)
# 'check' is the peak-to-peak spread (max - min) of the values within each UID.
out_df.loc[:, 'check'] = out_df['Duplicate_Values'].apply(lambda x: np.ptp(x))
out_df

Unnamed: 0,UID,IC50pki,Duplicate_Values,check
0,Br/C(=C/c1ccccc1)/C=N/NC(=O)c1cccs1_P42574,1,[4.5003],0.0000
1,Br/C(=C/c1ccccc1)/C=N/NC(=O)c1cccs1_Q9BQF6,1,[5.2262],0.0000
2,Br/C(=C/c1ccccc1)/C=N/NC(=O)c1ccoc1C_P32897,1,[4.8633],0.0000
3,Br/C(=C\c1ccc(cc1)F)/[N+](=O)[O-]_P09467,1,[6.1938],0.0000
4,Br/C(=C\c1ccccc1)/C=N/NC(=O)c1[nH]nc(c1)c1ccc(...,2,"[5.2311, 5.0324]",0.1987
...,...,...,...,...
970854,o1nc2c(n1)cc(cc2)CCN1CCN(CC1)CCc1ccc2c(c1)non2...,1,[7.1739],0.0000
970855,s1ccc(c1)c1nc(NC2Cc3c(C2)cccc3)c2c(n1)cccc2_P0...,3,"[8.284, 8.0506, 8.1805]",0.2334
970856,s1cnc(c1)c1nc2-c(n1)ccc[nH]2_P0AE18,1,[7.1079],0.0000
970857,s1nc2c(n1)c1nc([nH]c1cc2)C1CC1_P06492,1,[5.4556],0.0000


In [23]:
# Keep only UIDs whose replicate IC50pki values span less than 1.3 log units.
spread_ok = out_df['check'] < 1.3
out_df = out_df[spread_ok]
out_df.shape

(937272, 4)

In [24]:
# REG_LABEL is the median of the replicate IC50pki values of each UID.
# assign() returns a new frame, so the column is not written into a filtered
# slice of the earlier out_df (which triggered SettingWithCopyWarning).
out_df = out_df.assign(
    REG_LABEL=out_df["Duplicate_Values"].apply(lambda vals: np.median(vals))
)
out_df.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


Unnamed: 0,UID,IC50pki,Duplicate_Values,check,REG_LABEL
0,Br/C(=C/c1ccccc1)/C=N/NC(=O)c1cccs1_P42574,1,[4.5003],0.0,4.5003
1,Br/C(=C/c1ccccc1)/C=N/NC(=O)c1cccs1_Q9BQF6,1,[5.2262],0.0,5.2262


In [25]:
# Keep rows with 0 <= REG_LABEL <= 13 (both bounds inclusive).
out_df = out_df[out_df['REG_LABEL'].between(0, 13)]
out_df.shape

(936573, 5)

In [26]:
# Keep only the key, the raw replicate values and the final label.
label_columns = ['UID', 'Duplicate_Values', 'REG_LABEL']
out_df = out_df.loc[:, label_columns]
out_df.head(2)

Unnamed: 0,UID,Duplicate_Values,REG_LABEL
0,Br/C(=C/c1ccccc1)/C=N/NC(=O)c1cccs1_P42574,[4.5003],4.5003
1,Br/C(=C/c1ccccc1)/C=N/NC(=O)c1cccs1_Q9BQF6,[5.2262],5.2262


In [27]:
# Keep the first row seen for each UID; it supplies the metadata columns
# for that UID in the merge below.
is_repeat = df_IC50.duplicated(subset='UID', keep='first')
df_IC50_ddp = df_IC50[~is_repeat]
df_IC50_ddp.shape

(970859, 33)

In [28]:
# Attach the per-UID metadata row to every retained UID label.
df_IC50_cleaned = out_df.merge(df_IC50_ddp, how='left', on='UID')
df_IC50_cleaned.head(2)

Unnamed: 0,UID,Duplicate_Values,REG_LABEL,molregno,canonical_smiles,activity_id,assay_id,standard_relation,standard_value,standard_units,standard_type,molecule_chembl_id,molecule_type,description,assay_strain,assay_tissue,assay_cell_type,confidence_score,assay_chembl_id,tid,target_type,target_pref_name,target_chembl_id,organism,targcomp_id,component_id,homologue,component_type,uniprot_id,sequence,sequence_md5sum,label,COMPOUND_SMILES,INCHI_KEY,IC50pki
0,Br/C(=C/c1ccccc1)/C=N/NC(=O)c1cccs1_P42574,[4.5003],4.5003,1682192,O=C(N/N=C/C(Br)=C\c1ccccc1)c1cccs1,7348218,736971,=,31600.0,nM,IC50,CHEMBL3192901,Small molecule,PUBCHEM_BIOASSAY: Dose Response confirmation o...,,,,8,CHEMBL1738414,10131,SINGLE PROTEIN,Caspase-3,CHEMBL2334,Homo sapiens,1352.0,675.0,0.0,PROTEIN,P42574,MENTENSVDSKSIKNLEPKIIHGSESMDSGISLDNSYKMDYPEMGL...,cd9050995bb3aa9f42ca5d1eedd10b2d,1,Br/C(=C/c1ccccc1)/C=N/NC(=O)c1cccs1,XZRHAUPXEHCGFW-JXTNESPGSA-N,4.5003
1,Br/C(=C/c1ccccc1)/C=N/NC(=O)c1cccs1_Q9BQF6,[5.2262],5.2262,1682192,O=C(N/N=C/C(Br)=C\c1ccccc1)c1cccs1,5774539,737351,=,5940.0,nM,IC50,CHEMBL3192901,Small molecule,PUBCHEM_BIOASSAY: Dose Response confirmation o...,,,,9,CHEMBL1738495,104009,SINGLE PROTEIN,Sentrin-specific protease 7,CHEMBL1741213,Homo sapiens,2069.0,5436.0,0.0,PROTEIN,Q9BQF6,MDKRKLGRRPSSSEIITEGKRKKSSSDLSEIRKMLNAKPEDVHVQS...,5fc5c39e13d1ce7d15f8fe9a374bcf57,1,Br/C(=C/c1ccccc1)/C=N/NC(=O)c1cccs1,XZRHAUPXEHCGFW-JXTNESPGSA-N,5.2262


In [29]:
# Remove columns that are superseded by REG_LABEL or were only used for filtering.
obsolete_columns = ['standard_relation', 'standard_value', 'standard_units', 'label', 'IC50pki']
df_IC50_cleaned = df_IC50_cleaned.drop(columns=obsolete_columns)
df_IC50_cleaned.head(2)

Unnamed: 0,UID,Duplicate_Values,REG_LABEL,molregno,canonical_smiles,activity_id,assay_id,standard_type,molecule_chembl_id,molecule_type,description,assay_strain,assay_tissue,assay_cell_type,confidence_score,assay_chembl_id,tid,target_type,target_pref_name,target_chembl_id,organism,targcomp_id,component_id,homologue,component_type,uniprot_id,sequence,sequence_md5sum,COMPOUND_SMILES,INCHI_KEY
0,Br/C(=C/c1ccccc1)/C=N/NC(=O)c1cccs1_P42574,[4.5003],4.5003,1682192,O=C(N/N=C/C(Br)=C\c1ccccc1)c1cccs1,7348218,736971,IC50,CHEMBL3192901,Small molecule,PUBCHEM_BIOASSAY: Dose Response confirmation o...,,,,8,CHEMBL1738414,10131,SINGLE PROTEIN,Caspase-3,CHEMBL2334,Homo sapiens,1352.0,675.0,0.0,PROTEIN,P42574,MENTENSVDSKSIKNLEPKIIHGSESMDSGISLDNSYKMDYPEMGL...,cd9050995bb3aa9f42ca5d1eedd10b2d,Br/C(=C/c1ccccc1)/C=N/NC(=O)c1cccs1,XZRHAUPXEHCGFW-JXTNESPGSA-N
1,Br/C(=C/c1ccccc1)/C=N/NC(=O)c1cccs1_Q9BQF6,[5.2262],5.2262,1682192,O=C(N/N=C/C(Br)=C\c1ccccc1)c1cccs1,5774539,737351,IC50,CHEMBL3192901,Small molecule,PUBCHEM_BIOASSAY: Dose Response confirmation o...,,,,9,CHEMBL1738495,104009,SINGLE PROTEIN,Sentrin-specific protease 7,CHEMBL1741213,Homo sapiens,2069.0,5436.0,0.0,PROTEIN,Q9BQF6,MDKRKLGRRPSSSEIITEGKRKKSSSDLSEIRKMLNAKPEDVHVQS...,5fc5c39e13d1ce7d15f8fe9a374bcf57,Br/C(=C/c1ccccc1)/C=N/NC(=O)c1cccs1,XZRHAUPXEHCGFW-JXTNESPGSA-N


In [30]:
# Dimensions of the cleaned frame.
df_IC50_cleaned.shape

(936573, 30)

In [31]:
# Rename the protein columns to their upper-case output names.
output_names = {"uniprot_id": "UNIPROT_ID", "sequence": "PROTEIN_SEQUENCE"}
df_IC50_cleaned = df_IC50_cleaned.rename(columns=output_names)

In [32]:
# df_IC50_cleaned.to_csv('Chembl30_IC50_cleaned_new.csv', index=False)