# MIBiG JSON -> SMILES

Load MIBiG database in JSON format and extract the predicted SMILES (either already stored in the json file or retrieved from PubChem).

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline


In [2]:
import os
from glob import glob
import json

from pubchempy import *

In [3]:
input_dir = 'mibig_json_1.4/'

In [4]:
def get_isomeric_smiles(prop):
    for prop in pubchem_compound.record['props']:
        urn = prop['urn']
        if 'name' in urn and urn['name'] == 'Isomeric' and urn['label'] == 'SMILES':
            smiles = prop['value']['sval']
            return smiles

Sometimes the MIBiG json doesn't contain the chemical structure (SMILES), but it has the pubchem id. In this case, we can fetch this information from pubchem.

In [5]:
data = []
json_files = glob(os.path.join(input_dir, '*.json'))
for fn in sorted(json_files):
    with open(fn, 'r') as f:
        
        d = json.load(f)    
        
        bgc_id = os.path.basename(fn).split('.')[0]
        compounds = d['general_params']['compounds']
        for c in compounds:
            compound_name = bgc_id + '_' + c['compound'].lower().replace(' ', '_')
            if 'chem_struct' in c:
                smiles = c['chem_struct']
            elif 'pubchem_id' in c:
                try:
                    pubchem_id = c['pubchem_id']
                    pubchem_compound = Compound.from_cid(pubchem_id)
                    props = pubchem_compound.record['props']
                    smiles = get_isomeric_smiles(props)
                except NotFoundError:
                    smiles = None
                    
            if smiles is not None:
                row = (compound_name, smiles,)
#                 print row
                data.append(row)
                if len(data) % 100 == 0:
                    print(len(data), '/', len(json_files))

['mibig_json_1.4\\BGC0000001.json', 'mibig_json_1.4\\BGC0000002.json', 'mibig_json_1.4\\BGC0000003.json', 'mibig_json_1.4\\BGC0000004.json', 'mibig_json_1.4\\BGC0000005.json', 'mibig_json_1.4\\BGC0000006.json', 'mibig_json_1.4\\BGC0000007.json', 'mibig_json_1.4\\BGC0000008.json', 'mibig_json_1.4\\BGC0000009.json', 'mibig_json_1.4\\BGC0000010.json', 'mibig_json_1.4\\BGC0000011.json', 'mibig_json_1.4\\BGC0000012.json', 'mibig_json_1.4\\BGC0000013.json', 'mibig_json_1.4\\BGC0000014.json', 'mibig_json_1.4\\BGC0000015.json', 'mibig_json_1.4\\BGC0000016.json', 'mibig_json_1.4\\BGC0000017.json', 'mibig_json_1.4\\BGC0000018.json', 'mibig_json_1.4\\BGC0000019.json', 'mibig_json_1.4\\BGC0000020.json', 'mibig_json_1.4\\BGC0000021.json', 'mibig_json_1.4\\BGC0000022.json', 'mibig_json_1.4\\BGC0000023.json', 'mibig_json_1.4\\BGC0000024.json', 'mibig_json_1.4\\BGC0000025.json', 'mibig_json_1.4\\BGC0000026.json', 'mibig_json_1.4\\BGC0000027.json', 'mibig_json_1.4\\BGC0000028.json', 'mibig_json_1.4\\BG

100 / 1816
200 / 1816
300 / 1816
400 / 1816
500 / 1816
600 / 1816
700 / 1816
800 / 1816
900 / 1816
1000 / 1816
1100 / 1816
1200 / 1816
1300 / 1816
1400 / 1816
1500 / 1816
1600 / 1816
1700 / 1816
1800 / 1816
1900 / 1816
2000 / 1816
2100 / 1816
2200 / 1816
2300 / 1816
2400 / 1816


Export the results to csv

In [6]:
import pandas as pd
df = pd.DataFrame(data, columns=['mol', 'smiles'])
df.to_csv('smiles.tsv', sep='\t', header=None, index=False)