# ChEBI Compound - Prepare
Prepares a data frame of ChEBI compounds with InChiKeys.

**[Work in progress]**

http://ftp.ebi.ac.uk/pub/databases/chebi/README.txt

https://ftp.ebi.ac.uk/pub/databases/chebi/SDF/ChEBI_complete.sdf.gz

Data source: [ChEBI](https://www.ebi.ac.uk/chebi/)


Data source: [ChEMBL](https://www.ebi.ac.uk/chembl/)

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import pandas as pd
import gzip
from datetime import date
from utils import assign_curie, join_columns

In [2]:
# reload modules before executing user code
%load_ext autoreload
%autoreload 2

In [3]:
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columsns
pd.set_option('display.max_colwidth', None)

In [4]:
# Path will take care of handling operating system differences.
NODE_DATA = os.getenv('NODE_DATA', default='../data/nodes/')
RELATIONSHIP_DATA = os.getenv('RELATIONSHIP_DATA', default='../data/relationships/')
CACHE = os.getenv('KGBIO_CACHE', default='../cache')

In [5]:
os.makedirs(os.path.join(CACHE), exist_ok=True)

In [6]:
filename = os.path.join(CACHE, 'ChEBI_complete.sdf.gz')
today = date.today().strftime("%Y-%m-%d")
node_filename = f'Compound_chebi_{today}.parquet'
print(node_filename)

Compound_chebi_2022-03-19.parquet


### Parse data fields from ChEBI sdf file

In [7]:
def parse_chebi_sdf(filename):
    """Parses compound data from ChEBI"""

    field_names = {'ChEBI ID': 'chebiId', 'Secondary ChEBI ID': 'chebiIds',
                   'ChEBI Name': 'name', 'synonyms': 'Synonyms', 
                   'SMILES': 'smiles', 'InChI': 'inchi', 'InChIKey': 'inchiKey',
                   'Formulae': 'formula', 'Mass': 'mass',
                   'CAS Registry Numbers': 'casNumber', 
                   'ChEMBL Database Links': 'chemblId'}
    start_data = False
    row = dict()
    rows = list()

    # parse requested fields from the sdf file
    with gzip.open(filename, 'rt') as f:
        for line in f:
            line = line.strip()
            if line == '> <ChEBI ID>':
                # begin of a new compound record, save data from previous compound
                if len(row) > 0:
                    rows.append(row)
                    row = dict()
            if line.startswith('> <'):
                # begin of a new data field, get the field name
                start_data = True
                field_name = line[3:len(line) - 1]
            elif len(line) == 0:
                # an empty line indicates the end of a data section
                start_data = False
            elif start_data:
                # copy data into a dictionary, if there are multiple data items, create a pipe-separated string
                if field_name in field_names:
                    std_name = field_names.get(field_name)
                    if field_name in row:
                        row[std_name] = row.get(std_name) + '|' + line
                    else:
                        row[std_name] = line

    # special case for end of file: append data for last compound
    if len(row) > 0:
        rows.append(row)
        
    return rows

In [8]:
def rows_to_dataframe(rows):
    """Converts list of dictionaries to a dataframe an cleans and standardizes the data"""

    compounds = pd.DataFrame(rows)
    compounds.fillna('', inplace=True)

    # remove compounds without an InChIKey
    compounds.query('inchiKey != ""', inplace=True)
    
    # Convert CHEBI:identifier to chebi:identifier to match CURIE (see identifiers.org)
    compounds['chebiId'] = compounds['chebiId'].str.lower()
    compounds['chebiIds'] = compounds['chebiIds'].str.lower()
    
    # use "|" instead of "," as the delimiter for multiple ids
    compounds['chebiIds'] = compounds['chebiIds'].str.replace(',', '|')
    
    # assign CURIE (see identifiers.org)
    compounds['id'] =  compounds['inchiKey'].apply(lambda row: assign_curie(row, 'inchikey'))
    compounds['chemblId'] = compounds['chemblId'].apply(lambda row: assign_curie(row, 'chembl'))
    compounds['casNumber'] = compounds['casNumber'].apply(lambda row: assign_curie(row, 'cas'))
                                                   
    # concatenate all ids into a "|"-separated string
    #compounds['ids'] = compounds.apply(lambda row: join_columns(row, ['chebiId', 'chebiIds', 'casNumber', 'id']), axis=1)
    compounds['ids'] = compounds[['chebiId', 'chebiIds', 'casNumber', 'id']].apply(lambda ids: '|'.join(filter(None, ids)), axis=1)
                                                                  
    compounds.drop(columns=['chebiId', 'chebiIds', 'chemblId','casNumber'], inplace=True)

    return compounds

In [9]:
rows = parse_chebi_sdf(filename)
nodes = rows_to_dataframe(rows)

In [10]:
nodes.head()

Unnamed: 0,name,inchi,inchiKey,smiles,formula,mass,id,ids
0,(-)-epicatechin,"InChI=1S/C15H14O6/c16-8-4-11(18)9-6-13(20)15(21-14(9)5-8)7-1-2-10(17)12(19)3-7/h1-5,13,15-20H,6H2/t13-,15-/m1/s1",PFTAWBLQPZVEMU-UKRRQHHQSA-N,[H][C@@]1(Oc2cc(O)cc(O)c2C[C@H]1O)c1ccc(O)c(O)c1,C15H14O6,290.2681,inchikey:PFTAWBLQPZVEMU-UKRRQHHQSA-N,chebi:90|chebi:18484|cas:490-46-0|inchikey:PFTAWBLQPZVEMU-UKRRQHHQSA-N
1,"(1S,4R)-fenchone","InChI=1S/C10H16O/c1-9(2)7-4-5-10(3,6-7)8(9)11/h7H,4-6H2,1-3H3/t7-,10+/m1/s1",LHXDLQBQYFFVNW-XCBNKYQSSA-N,CC1(C)[C@@H]2CC[C@@](C)(C2)C1=O,C10H16O,152.2334,inchikey:LHXDLQBQYFFVNW-XCBNKYQSSA-N,chebi:165|chebi:63901|cas:4695-62-9|inchikey:LHXDLQBQYFFVNW-XCBNKYQSSA-N
3,16alpha-hydroxyestrone,"InChI=1S/C18H22O3/c1-18-7-6-13-12-5-3-11(19)8-10(12)2-4-14(13)15(18)9-16(20)17(18)21/h3,5,8,13-16,19-20H,2,4,6-7,9H2,1H3/t13-,14-,15+,16-,18+/m1/s1",WPOCIZJTELRQMF-QFXBJFAPSA-N,[H][C@@]12C[C@@H](O)C(=O)[C@@]1(C)CC[C@]1([H])C3=C(CC[C@@]21[H])C=C(O)C=C3,C18H22O3,286.371,inchikey:WPOCIZJTELRQMF-QFXBJFAPSA-N,chebi:776|chebi:60497|cas:566-76-7|inchikey:WPOCIZJTELRQMF-QFXBJFAPSA-N
4,"2,6-dichlorobenzonitrile",InChI=1S/C7H3Cl2N/c8-6-2-1-3-7(9)5(6)4-10/h1-3H,YOYAIZYFCNQIRF-UHFFFAOYSA-N,Clc1cccc(Cl)c1C#N,C7H3Cl2N,172.01086,inchikey:YOYAIZYFCNQIRF-UHFFFAOYSA-N,chebi:943|chebi:73174|cas:1194-65-6|inchikey:YOYAIZYFCNQIRF-UHFFFAOYSA-N
5,2-hydroxybutyric acid,"InChI=1S/C4H8O3/c1-2-3(5)4(6)7/h3,5H,2H2,1H3,(H,6,7)",AFENDNXGAFYKQO-UHFFFAOYSA-N,CCC(O)C(O)=O,C4H8O3,104.1045,inchikey:AFENDNXGAFYKQO-UHFFFAOYSA-N,chebi:1148|chebi:675746|cas:600-15-7|inchikey:AFENDNXGAFYKQO-UHFFFAOYSA-N


## Setup Output Directories

In [11]:
NODE_DATA = os.getenv('NODE_DATA', default='../data/nodes/')
RELATIONSHIP_DATA = os.getenv('RELATIONSHIP_DATA', default='../data/relationships/')

In [12]:
os.makedirs(os.path.join(NODE_DATA), exist_ok=True)
os.makedirs(os.path.join(RELATIONSHIP_DATA), exist_ok=True)

In [13]:
print('Number of nodes', nodes.shape[0])

Number of nodes 136652


In [14]:
nodes[['id', 'name', 'ids', 'inchiKey', 'inchi', 'smiles', 'formula', 'mass']].to_parquet(os.path.join(NODE_DATA, node_filename), index=False)