# ChEMBL Compound - Prepare
**[Work in progress]**

https://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/README

Chemical representations (SMILES, InChI, InChIKey):
https://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/chembl_30_chemreps.txt.gz


Data source: [ChEMBL](https://www.ebi.ac.uk/chembl/)

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import pandas as pd
from utils import join_string_columns, backfill

In [2]:
# Load IPython's autoreload extension and switch it to mode 2 so that modules
# (e.g. the local `utils` helpers imported above) are reloaded before user code
# is executed.
%load_ext autoreload
%autoreload 2

In [3]:
# Show DataFrames in full: no row limit, no column limit, no cell truncation.
pd.set_option('display.max_rows', None)      # display all rows
pd.set_option('display.max_columns', None)   # display all columns
pd.set_option('display.max_colwidth', None)  # don't truncate cells

In [4]:
# Data locations. Each one can be overridden through an environment variable;
# when the variable is not set, the relative default path is used.
NODE_DATA = os.environ.get('NODE_DATA', '../data/nodes/')
RELATIONSHIP_DATA = os.environ.get('RELATIONSHIP_DATA', '../data/relationships/')
CACHE = os.environ.get('KGBIO_CACHE', '../cache')

In [5]:
# Create the cache directory if it does not exist yet.
os.makedirs(CACHE, exist_ok=True)

In [6]:
# Release metadata used to build the dated node file name.
version = 30
today = '2022-03-17'

# Node files read by this notebook.
chembl_node_filename = 'Compound_chembl_30_2022-03-17.csv'
chebi_node_filename = 'Compound_chebi_2022-03-17.parquet'

# Node file written by the last cell of this notebook.
# NOTE(review): with the current version/date this evaluates to the same name
# as chembl_node_filename, so the final cell overwrites the file that is read
# as ChEMBL input -- confirm that this is intended.
node_filename = f'Compound_chembl_{version}_{today}.csv'
print(node_filename)

Compound_chembl_30_2022-03-17.csv


### Import ChEMBL and ChEBI compound nodes

In [7]:
# Load the ChEMBL compound nodes. The file is read as Parquet even though
# chembl_node_filename carries a ".csv" extension.
chembl_path = os.path.join(NODE_DATA, chembl_node_filename)
chembl = pd.read_parquet(chembl_path)
print('Number of ChEMBL nodes:', len(chembl))
chembl.head(2)

Number of ChEMBL nodes: 2282427


Unnamed: 0,id,name,ids,inchiKey,inchi,smiles,formula,mass
0,inchikey:PFTAWBLQPZVEMU-UKRRQHHQSA-N,(-)-epicatechin,chebi:90|chebi:18484|cas:490-46-0|inchikey:PFTAWBLQPZVEMU-UKRRQHHQSA-N|chebi:90|chebi:18484|cas:490-46-0|inchikey:PFTAWBLQPZVEMU-UKRRQHHQSA-N|chebi:90|chebi:18484|cas:490-46-0|inchikey:PFTAWBLQPZVEMU-UKRRQHHQSA-N|chebi:90|chebi:18484|cas:490-46-0|inchikey:PFTAWBLQPZVEMU-UKRRQHHQSA-N|chebi:90|chebi:18484|cas:490-46-0|inchikey:PFTAWBLQPZVEMU-UKRRQHHQSA-N|chembl:CHEMBL583912,PFTAWBLQPZVEMU-UKRRQHHQSA-N,"InChI=1S/C15H14O6/c16-8-4-11(18)9-6-13(20)15(21-14(9)5-8)7-1-2-10(17)12(19)3-7/h1-5,13,15-20H,6H2/t13-,15-/m1/s1",[H][C@@]1(Oc2cc(O)cc(O)c2C[C@H]1O)c1ccc(O)c(O)c1,C15H14O6,290.2681
1,inchikey:LHXDLQBQYFFVNW-XCBNKYQSSA-N,"(1S,4R)-fenchone",chebi:165|chebi:63901|cas:4695-62-9|inchikey:LHXDLQBQYFFVNW-XCBNKYQSSA-N|chebi:165|chebi:63901|cas:4695-62-9|inchikey:LHXDLQBQYFFVNW-XCBNKYQSSA-N|chebi:165|chebi:63901|cas:4695-62-9|inchikey:LHXDLQBQYFFVNW-XCBNKYQSSA-N|chebi:165|chebi:63901|cas:4695-62-9|inchikey:LHXDLQBQYFFVNW-XCBNKYQSSA-N|chebi:165|chebi:63901|cas:4695-62-9|inchikey:LHXDLQBQYFFVNW-XCBNKYQSSA-N,LHXDLQBQYFFVNW-XCBNKYQSSA-N,"InChI=1S/C10H16O/c1-9(2)7-4-5-10(3,6-7)8(9)11/h7H,4-6H2,1-3H3/t7-,10+/m1/s1",CC1(C)[C@@H]2CC[C@@](C)(C2)C1=O,C10H16O,152.2334


In [8]:
# Load the ChEBI compound nodes from the Parquet node file.
chebi_path = os.path.join(NODE_DATA, chebi_node_filename)
chebi = pd.read_parquet(chebi_path)
print('Number of ChEBI nodes:', len(chebi))
chebi.head(2)

Number of ChEBI nodes: 136652


Unnamed: 0,id,name,ids,inchiKey,inchi,smiles,formula,mass
0,inchikey:PFTAWBLQPZVEMU-UKRRQHHQSA-N,(-)-epicatechin,chebi:90|chebi:18484|cas:490-46-0|inchikey:PFTAWBLQPZVEMU-UKRRQHHQSA-N,PFTAWBLQPZVEMU-UKRRQHHQSA-N,"InChI=1S/C15H14O6/c16-8-4-11(18)9-6-13(20)15(21-14(9)5-8)7-1-2-10(17)12(19)3-7/h1-5,13,15-20H,6H2/t13-,15-/m1/s1",[H][C@@]1(Oc2cc(O)cc(O)c2C[C@H]1O)c1ccc(O)c(O)c1,C15H14O6,290.2681
1,inchikey:LHXDLQBQYFFVNW-XCBNKYQSSA-N,"(1S,4R)-fenchone",chebi:165|chebi:63901|cas:4695-62-9|inchikey:LHXDLQBQYFFVNW-XCBNKYQSSA-N,LHXDLQBQYFFVNW-XCBNKYQSSA-N,"InChI=1S/C10H16O/c1-9(2)7-4-5-10(3,6-7)8(9)11/h7H,4-6H2,1-3H3/t7-,10+/m1/s1",CC1(C)[C@@H]2CC[C@@](C)(C2)C1=O,C10H16O,152.2334


In [9]:
# Give both frames the same set of columns (outer join along the column axis);
# a column that exists in only one frame is added to the other, filled with ''.
left, right = chebi.align(chembl, axis=1, join='outer', fill_value='')

In [10]:
# Split the ChEBI nodes by whether the inchiKey column holds an empty string
# and report the size of each group.
chebi_with_key = chebi.query('inchiKey != ""')
chebi_without_key = chebi.query('inchiKey == ""')
print('ChEBI nodes with InChIKey:', len(chebi_with_key))
print('ChEBI nodes without InChIKey   :', len(chebi_without_key))

ChEBI nodes with InChIKey: 136652
ChEBI nodes without InChIKey   : 0


In [11]:
# Full outer join on the node id, so nodes found in only one source are kept.
# Columns shared by both frames get the suffix _x (left, ChEBI) or _y (right, ChEMBL).
nodes = pd.merge(left, right, how='outer', on='id')

In [12]:
# Number of rows after the outer merge.
print('Number of nodes:', len(nodes))

Number of nodes: 2380115


In [13]:
# Preview the first five merged rows, including the suffixed _x/_y columns.
nodes.head(5)

Unnamed: 0,id,name_x,ids_x,inchiKey_x,inchi_x,smiles_x,formula_x,mass_x,name_y,ids_y,inchiKey_y,inchi_y,smiles_y,formula_y,mass_y
0,inchikey:PFTAWBLQPZVEMU-UKRRQHHQSA-N,(-)-epicatechin,chebi:90|chebi:18484|cas:490-46-0|inchikey:PFTAWBLQPZVEMU-UKRRQHHQSA-N,PFTAWBLQPZVEMU-UKRRQHHQSA-N,"InChI=1S/C15H14O6/c16-8-4-11(18)9-6-13(20)15(21-14(9)5-8)7-1-2-10(17)12(19)3-7/h1-5,13,15-20H,6H2/t13-,15-/m1/s1",[H][C@@]1(Oc2cc(O)cc(O)c2C[C@H]1O)c1ccc(O)c(O)c1,C15H14O6,290.2681,(-)-epicatechin,chebi:90|chebi:18484|cas:490-46-0|inchikey:PFTAWBLQPZVEMU-UKRRQHHQSA-N|chebi:90|chebi:18484|cas:490-46-0|inchikey:PFTAWBLQPZVEMU-UKRRQHHQSA-N|chebi:90|chebi:18484|cas:490-46-0|inchikey:PFTAWBLQPZVEMU-UKRRQHHQSA-N|chebi:90|chebi:18484|cas:490-46-0|inchikey:PFTAWBLQPZVEMU-UKRRQHHQSA-N|chebi:90|chebi:18484|cas:490-46-0|inchikey:PFTAWBLQPZVEMU-UKRRQHHQSA-N|chembl:CHEMBL583912,PFTAWBLQPZVEMU-UKRRQHHQSA-N,"InChI=1S/C15H14O6/c16-8-4-11(18)9-6-13(20)15(21-14(9)5-8)7-1-2-10(17)12(19)3-7/h1-5,13,15-20H,6H2/t13-,15-/m1/s1",[H][C@@]1(Oc2cc(O)cc(O)c2C[C@H]1O)c1ccc(O)c(O)c1,C15H14O6,290.2681
1,inchikey:LHXDLQBQYFFVNW-XCBNKYQSSA-N,"(1S,4R)-fenchone",chebi:165|chebi:63901|cas:4695-62-9|inchikey:LHXDLQBQYFFVNW-XCBNKYQSSA-N,LHXDLQBQYFFVNW-XCBNKYQSSA-N,"InChI=1S/C10H16O/c1-9(2)7-4-5-10(3,6-7)8(9)11/h7H,4-6H2,1-3H3/t7-,10+/m1/s1",CC1(C)[C@@H]2CC[C@@](C)(C2)C1=O,C10H16O,152.2334,"(1S,4R)-fenchone",chebi:165|chebi:63901|cas:4695-62-9|inchikey:LHXDLQBQYFFVNW-XCBNKYQSSA-N|chebi:165|chebi:63901|cas:4695-62-9|inchikey:LHXDLQBQYFFVNW-XCBNKYQSSA-N|chebi:165|chebi:63901|cas:4695-62-9|inchikey:LHXDLQBQYFFVNW-XCBNKYQSSA-N|chebi:165|chebi:63901|cas:4695-62-9|inchikey:LHXDLQBQYFFVNW-XCBNKYQSSA-N|chebi:165|chebi:63901|cas:4695-62-9|inchikey:LHXDLQBQYFFVNW-XCBNKYQSSA-N,LHXDLQBQYFFVNW-XCBNKYQSSA-N,"InChI=1S/C10H16O/c1-9(2)7-4-5-10(3,6-7)8(9)11/h7H,4-6H2,1-3H3/t7-,10+/m1/s1",CC1(C)[C@@H]2CC[C@@](C)(C2)C1=O,C10H16O,152.2334
2,inchikey:WPOCIZJTELRQMF-QFXBJFAPSA-N,16alpha-hydroxyestrone,chebi:776|chebi:60497|cas:566-76-7|inchikey:WPOCIZJTELRQMF-QFXBJFAPSA-N,WPOCIZJTELRQMF-QFXBJFAPSA-N,"InChI=1S/C18H22O3/c1-18-7-6-13-12-5-3-11(19)8-10(12)2-4-14(13)15(18)9-16(20)17(18)21/h3,5,8,13-16,19-20H,2,4,6-7,9H2,1H3/t13-,14-,15+,16-,18+/m1/s1",[H][C@@]12C[C@@H](O)C(=O)[C@@]1(C)CC[C@]1([H])C3=C(CC[C@@]21[H])C=C(O)C=C3,C18H22O3,286.371,16alpha-hydroxyestrone,chebi:776|chebi:60497|cas:566-76-7|inchikey:WPOCIZJTELRQMF-QFXBJFAPSA-N|chebi:776|chebi:60497|cas:566-76-7|inchikey:WPOCIZJTELRQMF-QFXBJFAPSA-N|chebi:776|chebi:60497|cas:566-76-7|inchikey:WPOCIZJTELRQMF-QFXBJFAPSA-N|chebi:776|chebi:60497|cas:566-76-7|inchikey:WPOCIZJTELRQMF-QFXBJFAPSA-N|chebi:776|chebi:60497|cas:566-76-7|inchikey:WPOCIZJTELRQMF-QFXBJFAPSA-N|chembl:CHEMBL1233720,WPOCIZJTELRQMF-QFXBJFAPSA-N,"InChI=1S/C18H22O3/c1-18-7-6-13-12-5-3-11(19)8-10(12)2-4-14(13)15(18)9-16(20)17(18)21/h3,5,8,13-16,19-20H,2,4,6-7,9H2,1H3/t13-,14-,15+,16-,18+/m1/s1",[H][C@@]12C[C@@H](O)C(=O)[C@@]1(C)CC[C@]1([H])C3=C(CC[C@@]21[H])C=C(O)C=C3,C18H22O3,286.371
3,inchikey:YOYAIZYFCNQIRF-UHFFFAOYSA-N,"2,6-dichlorobenzonitrile",chebi:943|chebi:73174|cas:1194-65-6|inchikey:YOYAIZYFCNQIRF-UHFFFAOYSA-N,YOYAIZYFCNQIRF-UHFFFAOYSA-N,InChI=1S/C7H3Cl2N/c8-6-2-1-3-7(9)5(6)4-10/h1-3H,Clc1cccc(Cl)c1C#N,C7H3Cl2N,172.01086,"2,6-dichlorobenzonitrile",chebi:943|chebi:73174|cas:1194-65-6|inchikey:YOYAIZYFCNQIRF-UHFFFAOYSA-N|chebi:943|chebi:73174|cas:1194-65-6|inchikey:YOYAIZYFCNQIRF-UHFFFAOYSA-N|chebi:943|chebi:73174|cas:1194-65-6|inchikey:YOYAIZYFCNQIRF-UHFFFAOYSA-N|chebi:943|chebi:73174|cas:1194-65-6|inchikey:YOYAIZYFCNQIRF-UHFFFAOYSA-N|chebi:943|chebi:73174|cas:1194-65-6|inchikey:YOYAIZYFCNQIRF-UHFFFAOYSA-N|chembl:CHEMBL3039332,YOYAIZYFCNQIRF-UHFFFAOYSA-N,InChI=1S/C7H3Cl2N/c8-6-2-1-3-7(9)5(6)4-10/h1-3H,Clc1cccc(Cl)c1C#N,C7H3Cl2N,172.01086
4,inchikey:AFENDNXGAFYKQO-UHFFFAOYSA-N,2-hydroxybutyric acid,chebi:1148|chebi:675746|cas:600-15-7|inchikey:AFENDNXGAFYKQO-UHFFFAOYSA-N,AFENDNXGAFYKQO-UHFFFAOYSA-N,"InChI=1S/C4H8O3/c1-2-3(5)4(6)7/h3,5H,2H2,1H3,(H,6,7)",CCC(O)C(O)=O,C4H8O3,104.1045,2-hydroxybutyric acid,chebi:1148|chebi:675746|cas:600-15-7|inchikey:AFENDNXGAFYKQO-UHFFFAOYSA-N|chebi:1148|chebi:675746|cas:600-15-7|inchikey:AFENDNXGAFYKQO-UHFFFAOYSA-N|chebi:1148|chebi:675746|cas:600-15-7|inchikey:AFENDNXGAFYKQO-UHFFFAOYSA-N|chebi:1148|chebi:675746|cas:600-15-7|inchikey:AFENDNXGAFYKQO-UHFFFAOYSA-N|chebi:1148|chebi:675746|cas:600-15-7|inchikey:AFENDNXGAFYKQO-UHFFFAOYSA-N|chembl:CHEMBL567588,AFENDNXGAFYKQO-UHFFFAOYSA-N,"InChI=1S/C4H8O3/c1-2-3(5)4(6)7/h3,5H,2H2,1H3,(H,6,7)",CCC(O)C(O)=O,C4H8O3,104.1045


For each attribute that is present in both sources, create a single column that contains the first non-null value of the two suffixed columns (`_x` from ChEBI, `_y` from ChEMBL) produced by the merge.

In [14]:
# Collapse each pair of suffixed columns into one column with `backfill`,
# listing the _x column before the _y column. With delete=True the suffixed
# source columns are removed afterwards.
for column in ['formula', 'name', 'smiles', 'inchi', 'inchiKey', 'mass']:
    nodes = backfill(nodes, column, [f'{column}_x', f'{column}_y'], delete=True)

In [15]:
# concatenate two columns if they contain values
nodes.fillna('', inplace=True)
nodes = join_string_columns(nodes, 'ids', ['ids_x', 'ids_y'], delete=True)

In [16]:
# Row count and first five rows after the suffixed columns have been consolidated.
print('Number of nodes', len(nodes))
nodes.head(5)

Number of nodes 2380115


Unnamed: 0,id,formula,name,smiles,inchi,inchiKey,mass,ids
0,inchikey:PFTAWBLQPZVEMU-UKRRQHHQSA-N,C15H14O6,(-)-epicatechin,[H][C@@]1(Oc2cc(O)cc(O)c2C[C@H]1O)c1ccc(O)c(O)c1,"InChI=1S/C15H14O6/c16-8-4-11(18)9-6-13(20)15(21-14(9)5-8)7-1-2-10(17)12(19)3-7/h1-5,13,15-20H,6H2/t13-,15-/m1/s1",PFTAWBLQPZVEMU-UKRRQHHQSA-N,290.2681,chebi:90|chebi:18484|cas:490-46-0|inchikey:PFTAWBLQPZVEMU-UKRRQHHQSA-N|chebi:90|chebi:18484|cas:490-46-0|inchikey:PFTAWBLQPZVEMU-UKRRQHHQSA-N|chebi:90|chebi:18484|cas:490-46-0|inchikey:PFTAWBLQPZVEMU-UKRRQHHQSA-N|chebi:90|chebi:18484|cas:490-46-0|inchikey:PFTAWBLQPZVEMU-UKRRQHHQSA-N|chebi:90|chebi:18484|cas:490-46-0|inchikey:PFTAWBLQPZVEMU-UKRRQHHQSA-N|chebi:90|chebi:18484|cas:490-46-0|inchikey:PFTAWBLQPZVEMU-UKRRQHHQSA-N|chembl:CHEMBL583912
1,inchikey:LHXDLQBQYFFVNW-XCBNKYQSSA-N,C10H16O,"(1S,4R)-fenchone",CC1(C)[C@@H]2CC[C@@](C)(C2)C1=O,"InChI=1S/C10H16O/c1-9(2)7-4-5-10(3,6-7)8(9)11/h7H,4-6H2,1-3H3/t7-,10+/m1/s1",LHXDLQBQYFFVNW-XCBNKYQSSA-N,152.2334,chebi:165|chebi:63901|cas:4695-62-9|inchikey:LHXDLQBQYFFVNW-XCBNKYQSSA-N|chebi:165|chebi:63901|cas:4695-62-9|inchikey:LHXDLQBQYFFVNW-XCBNKYQSSA-N|chebi:165|chebi:63901|cas:4695-62-9|inchikey:LHXDLQBQYFFVNW-XCBNKYQSSA-N|chebi:165|chebi:63901|cas:4695-62-9|inchikey:LHXDLQBQYFFVNW-XCBNKYQSSA-N|chebi:165|chebi:63901|cas:4695-62-9|inchikey:LHXDLQBQYFFVNW-XCBNKYQSSA-N|chebi:165|chebi:63901|cas:4695-62-9|inchikey:LHXDLQBQYFFVNW-XCBNKYQSSA-N
2,inchikey:WPOCIZJTELRQMF-QFXBJFAPSA-N,C18H22O3,16alpha-hydroxyestrone,[H][C@@]12C[C@@H](O)C(=O)[C@@]1(C)CC[C@]1([H])C3=C(CC[C@@]21[H])C=C(O)C=C3,"InChI=1S/C18H22O3/c1-18-7-6-13-12-5-3-11(19)8-10(12)2-4-14(13)15(18)9-16(20)17(18)21/h3,5,8,13-16,19-20H,2,4,6-7,9H2,1H3/t13-,14-,15+,16-,18+/m1/s1",WPOCIZJTELRQMF-QFXBJFAPSA-N,286.371,chebi:776|chebi:60497|cas:566-76-7|inchikey:WPOCIZJTELRQMF-QFXBJFAPSA-N|chebi:776|chebi:60497|cas:566-76-7|inchikey:WPOCIZJTELRQMF-QFXBJFAPSA-N|chebi:776|chebi:60497|cas:566-76-7|inchikey:WPOCIZJTELRQMF-QFXBJFAPSA-N|chebi:776|chebi:60497|cas:566-76-7|inchikey:WPOCIZJTELRQMF-QFXBJFAPSA-N|chebi:776|chebi:60497|cas:566-76-7|inchikey:WPOCIZJTELRQMF-QFXBJFAPSA-N|chebi:776|chebi:60497|cas:566-76-7|inchikey:WPOCIZJTELRQMF-QFXBJFAPSA-N|chembl:CHEMBL1233720
3,inchikey:YOYAIZYFCNQIRF-UHFFFAOYSA-N,C7H3Cl2N,"2,6-dichlorobenzonitrile",Clc1cccc(Cl)c1C#N,InChI=1S/C7H3Cl2N/c8-6-2-1-3-7(9)5(6)4-10/h1-3H,YOYAIZYFCNQIRF-UHFFFAOYSA-N,172.01086,chebi:943|chebi:73174|cas:1194-65-6|inchikey:YOYAIZYFCNQIRF-UHFFFAOYSA-N|chebi:943|chebi:73174|cas:1194-65-6|inchikey:YOYAIZYFCNQIRF-UHFFFAOYSA-N|chebi:943|chebi:73174|cas:1194-65-6|inchikey:YOYAIZYFCNQIRF-UHFFFAOYSA-N|chebi:943|chebi:73174|cas:1194-65-6|inchikey:YOYAIZYFCNQIRF-UHFFFAOYSA-N|chebi:943|chebi:73174|cas:1194-65-6|inchikey:YOYAIZYFCNQIRF-UHFFFAOYSA-N|chebi:943|chebi:73174|cas:1194-65-6|inchikey:YOYAIZYFCNQIRF-UHFFFAOYSA-N|chembl:CHEMBL3039332
4,inchikey:AFENDNXGAFYKQO-UHFFFAOYSA-N,C4H8O3,2-hydroxybutyric acid,CCC(O)C(O)=O,"InChI=1S/C4H8O3/c1-2-3(5)4(6)7/h3,5H,2H2,1H3,(H,6,7)",AFENDNXGAFYKQO-UHFFFAOYSA-N,104.1045,chebi:1148|chebi:675746|cas:600-15-7|inchikey:AFENDNXGAFYKQO-UHFFFAOYSA-N|chebi:1148|chebi:675746|cas:600-15-7|inchikey:AFENDNXGAFYKQO-UHFFFAOYSA-N|chebi:1148|chebi:675746|cas:600-15-7|inchikey:AFENDNXGAFYKQO-UHFFFAOYSA-N|chebi:1148|chebi:675746|cas:600-15-7|inchikey:AFENDNXGAFYKQO-UHFFFAOYSA-N|chebi:1148|chebi:675746|cas:600-15-7|inchikey:AFENDNXGAFYKQO-UHFFFAOYSA-N|chebi:1148|chebi:675746|cas:600-15-7|inchikey:AFENDNXGAFYKQO-UHFFFAOYSA-N|chembl:CHEMBL567588


In [17]:
# Output locations (same environment variables and defaults as the
# configuration cell near the top of the notebook).
NODE_DATA = os.environ.get('NODE_DATA', '../data/nodes/')
RELATIONSHIP_DATA = os.environ.get('RELATIONSHIP_DATA', '../data/relationships/')

In [18]:
# Make sure both output directories exist.
for directory in (NODE_DATA, RELATIONSHIP_DATA):
    os.makedirs(directory, exist_ok=True)

In [19]:
# Write the merged nodes with a fixed column order and without the index.
# The data is written in Parquet format even though node_filename ends in ".csv".
output_columns = ['id', 'name', 'ids', 'inchiKey', 'inchi', 'smiles', 'formula', 'mass']
output_path = os.path.join(NODE_DATA, node_filename)
nodes[output_columns].to_parquet(output_path, index=False)