In [61]:
'''
This notebook will download the COMP_DB from LipidMaps and convert it into a format that can be used for annotations with JMS.

The output file will be in JSON format, located at "/tmp/list_compounds_COMP_DB.json"
'''


import csv
import json
import urllib.request
import zipfile
from mass2chem.formula import dict_to_hill_formula, parse_chemformula_dict, calculate_formula_mass

# URL of the zipped COMP_DB data hosted on lipidmaps.org; fetched by the download cell.
COMP_DB_url = "https://www.lipidmaps.org/files/store/COMP_DB_DATA.zip"


In [62]:
# Fetch the COMP_DB archive to a local file. Left as a bare expression so the
# cell displays the (local_path, headers) tuple returned by urlretrieve.
urllib.request.urlretrieve(
    url=COMP_DB_url,
    filename="/tmp/COMP_DB_DATA.zip",
)

('/tmp/COMP_DB_DATA.zip', <http.client.HTTPMessage at 0x11311d710>)

In [63]:
# Template for one compound record: every field starts as None and is filled in
# (where the source table provides a value) on a per-entry copy of this dict.
prototype_empCpd = dict.fromkeys(
    (
        "primary_id",
        "primary_db",
        "name",
        "neutral_formula",
        "neutral_formula_mass",
        "SMILES",
        "inchikey",
        "other_ids",
    )
)

In [65]:
# Parse COMP_DB_DATA.tsv from the downloaded archive into `entries`, a list of
# dicts shaped like `prototype_empCpd`.
#
# Each data row is expected to hold five tab-separated columns:
# headgroup, abbreviation, mass, formula, chain_length. Only `abbreviation`
# (stored as "name") and `formula` are used; the mass column is ignored and the
# mass is recomputed from the normalized formula instead. The remaining
# prototype fields (primary_id, SMILES, inchikey, other_ids) stay None.
entries = []
comp_DB_path = "/tmp/COMP_DB_DATA.zip"

# Context managers guarantee both the archive and the member stream are closed,
# even if a malformed row raises part-way through.
with zipfile.ZipFile(comp_DB_path, 'r') as comp_DB_archive:
    with comp_DB_archive.open('COMP_DB_DATA.tsv') as comp_DB_filehandle:
        for line_no, line in enumerate(comp_DB_filehandle):
            if line_no == 0:
                # First row is the column header.
                continue
            # Strip only the line terminator: a bare rstrip() would also eat
            # trailing tabs and drop an empty last column, breaking the unpack.
            line = line.decode().rstrip("\r\n")
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            headgroup, abbreviation, mass, formula, chain_length = line.split("\t")
            empCpd_for_entry = dict(prototype_empCpd)
            empCpd_for_entry["primary_db"] = "LipidMaps_COMP_DB"
            empCpd_for_entry["name"] = abbreviation
            # Round-trip through the parser so the stored formula is in Hill notation.
            empCpd_for_entry["neutral_formula"] = dict_to_hill_formula(parse_chemformula_dict(formula))
            empCpd_for_entry["neutral_formula_mass"] = calculate_formula_mass(empCpd_for_entry["neutral_formula"])
            entries.append(empCpd_for_entry)

In [66]:
# Serialize all parsed entries as pretty-printed JSON and write them in one go.
serialized_entries = json.dumps(entries, indent=4)
with open("/tmp/list_compounds_COMP_DB.json", 'w+') as json_out:
    json_out.write(serialized_entries)