# ChEMBL Compound - Prepare
**[Work in progress]**

https://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/README

Chemical representations (SMILES, InChI, InChIKey):
https://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/chembl_30_chemreps.txt.gz


Data source: [ChEMBL](https://www.ebi.ac.uk/chembl/)

Author: Peter Rose (pwrose@ucsd.edu)

In [15]:
import os
import pandas as pd
from pathlib import Path

In [16]:
# Disable pandas' display truncation: show every row and every column.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [17]:
# Data locations. Each one can be overridden through the named environment
# variable; otherwise the relative default shown here is used.
NODE_DATA = os.environ.get('NODE_DATA', '../data/nodes/')
RELATIONSHIP_DATA = os.environ.get('RELATIONSHIP_DATA', '../data/relationships/')
CACHE = os.environ.get('KGBIO_CACHE', '../cache')

In [18]:
# Create the cache directory (and any missing parents); no error if it already exists.
os.makedirs(CACHE, exist_ok=True)

### Import ChEMBL and ChEBI Compounds

In [19]:
# Read the ChEMBL compound nodes from the node-data directory and report the row count.
chembl_path = os.path.join(NODE_DATA, 'Compound_chembl_70.parquet')
chembl = pd.read_parquet(chembl_path)
print('Number of ChEMBL compounds:', len(chembl))
# Preview the first two records.
chembl.head(2)

Number of ChEMBL compounds: 2136187


Unnamed: 0,id,smiles,inchi,inchiKey,name
0,chembl:CHEMBL153534,Cc1cc(-c2csc(N=C(N)N)n2)cn1C,InChI=1S/C10H13N5S/c1-6-3-7(4-15(6)2)8-5-16-10...,MFRNFCWYPYSFQQ-UHFFFAOYSA-N,CHEMBL153534
1,chembl:CHEMBL440060,CC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@@H...,InChI=1S/C123H212N44O34S/c1-19-63(12)96(164-11...,RSEQNZQKBMRQNM-VRGFNVLHSA-N,CHEMBL440060


In [20]:
# Read the ChEBI compound nodes from the configured node-data directory.
# The file name must be relative: os.path.join discards NODE_DATA when the
# second argument is an absolute path, which would tie this cell to one machine.
# NOTE(review): an earlier, commented-out version of this cell read a Parquet
# export ('Compound_chebi_2022-03-10.parquet') — confirm which file is canonical.
chebi = pd.read_csv(os.path.join(NODE_DATA, 'Compound_chebi_209-2022-03-10.csv'))
# Replace missing values with empty strings (assignment instead of inplace=True,
# so the cell stays a simple load -> clean pipeline).
chebi = chebi.fillna('')
print('Number of ChEBI compounds:', chebi.shape[0])
chebi.head(2)

Number of ChEBI compounds: 178080


Unnamed: 0,id,name,synonyms,definition,url,inchi,inchiKey,smiles,formula,mass,xref
0,chebi:CHEBI_101465,"(2S,3S,4R)-4-(hydroxymethyl)-1-(2-methoxy-1-ox...",,,http://purl.obolibrary.org/obo/CHEBI_101465,InChI=1S/C19H19N3O3/c1-25-12-18(24)22-16(9-20)...,NONDGOMIDWLUNU-AOIWGVFYSA-N,COCC(=O)N1[C@H]([C@H]([C@H]1C#N)C2=CC=C(C=C2)C...,C19H19N3O3,337.373,LINCS:LSM-12828
1,chebi:CHEBI_159237,Leu-His-Glu,(2S)-2-[[(2S)-2-[[(2S)-2-amino-4-methylpentano...,,http://purl.obolibrary.org/obo/CHEBI_159237,InChI=1S/C17H27N5O6/c1-9(2)5-11(18)15(25)22-13...,KXODZBLFVFSLAI-AVGNSLFASA-N,O=C(N[C@@H](CC=1NC=NC1)C(=O)N[C@@H](CCC(O)=O)C...,C17H27N5O6,397.432,Chemspider:58808728


In [21]:
# Count ChEBI records with and without an InChIKey; an empty string marks a
# missing key.
has_inchikey = chebi['inchiKey'] != ''
print('ChEBI compounds with InChIKey:', int(has_inchikey.sum()))
print('ChEBI compounds without InChIKey:', int((~has_inchikey).sum()))

ChEBI compounds with InChIKey: 136648
ChEBI compounds without InChIKey: 41432


In [28]:
# Outer-merge the ChEMBL and ChEBI tables. No `on=` is given, so pandas joins on
# every column name the two frames share, and unmatched rows from both sides are kept.
# NOTE(review): the recorded row count (2314267) equals the sum of the two inputs
# (2136187 + 178080), which suggests no ChEMBL row matched a ChEBI row and this
# behaves like a concatenation. A join on 'inchiKey' alone was tried earlier
# (commented out) — confirm the intended join key.
compounds = pd.merge(chembl, chebi, how='outer')

In [29]:
# Report the number of rows in the merged compound table.
print('Number of compounds:', len(compounds))

Number of compounds: 2314267


In [30]:
# Preview the first five rows of the merged table.
compounds.head(n=5)

Unnamed: 0,id,smiles,inchi,inchiKey,name,synonyms,definition,url,formula,mass,xref
0,chembl:CHEMBL153534,Cc1cc(-c2csc(N=C(N)N)n2)cn1C,InChI=1S/C10H13N5S/c1-6-3-7(4-15(6)2)8-5-16-10...,MFRNFCWYPYSFQQ-UHFFFAOYSA-N,CHEMBL153534,,,,,,
1,chembl:CHEMBL440060,CC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@@H...,InChI=1S/C123H212N44O34S/c1-19-63(12)96(164-11...,RSEQNZQKBMRQNM-VRGFNVLHSA-N,CHEMBL440060,,,,,,
2,chembl:CHEMBL440245,CCCC[C@@H]1NC(=O)[C@@H](NC(=O)[C@H](CC(C)C)NC(...,InChI=1S/C160H268N50O41/c1-23-27-41-95-134(228...,FTKBTEIKPOYCEX-OZSLQWTKSA-N,CHEMBL440245,,,,,,
3,chembl:CHEMBL440249,CC(C)C[C@@H]1NC(=O)CNC(=O)[C@H](c2ccc(O)cc2)NC...,InChI=1S/C124H154ClN21O39/c1-57(2)48-81-112(17...,UYSXXKGACMHPIM-KFGDMSGDSA-N,CHEMBL440249,,,,,,
4,chembl:CHEMBL405398,Brc1cccc(Nc2ncnc3ccncc23)c1NCCN1CCOCC1,InChI=1S/C19H21BrN6O/c20-15-2-1-3-17(18(15)22-...,VDSXZXJEWIWBCG-UHFFFAOYSA-N,CHEMBL405398,,,,,,


In [25]:
# Output locations for node and relationship files; each can be overridden
# through the named environment variable.
NODE_DATA = os.environ.get('NODE_DATA', '../data/nodes/')
RELATIONSHIP_DATA = os.environ.get('RELATIONSHIP_DATA', '../data/relationships/')

In [26]:
# Ensure the node and relationship data directories exist (no error if already present).
for out_dir in (NODE_DATA, RELATIONSHIP_DATA):
    os.makedirs(out_dir, exist_ok=True)

In [27]:
# TODO(review): disabled — `nodes` is not defined anywhere in the visible notebook;
# presumably this should report on the merged `compounds` frame. Confirm before enabling.
#print('Number of nodes', nodes.shape[0])

In [13]:
# NOTE(review): disabled — `nodes` is not defined in the visible notebook, and the
# target file name is the same 'Compound_chembl_70.parquet' that is read as input
# when loading ChEMBL, so enabling this line as-is would overwrite that input.
# Confirm the intended frame and output file name.
#nodes.to_parquet(os.path.join(NODE_DATA, 'Compound_chembl_70.parquet'), compression='brotli', index=False)