In [1]:
# phase 0: compare old schema vs old data, output the summary in an excel file* to help decision making
# phase 1: transform old schema to match (correct version) of old data
# phase 2: compare phase 1 schema vs old data, output the summary in an excel file* to help decision making
# phase 3: transform old data to match schema from phase 1 (doesn't count dependencies/required keywords)
# phase 4: transform schema from phase 1 to match JSON Schema draft v7 (we will call it 'new schema')
# phase 5: transform data from phase 3 to match the new schema (include all dependencies/required keywords)

In [2]:
## common imports ##
from os import path, makedirs
import glob
import json

In [3]:
## common functions ##
def fetch_mibig_json_filepaths(dir_path):
    """Return the paths of the files directly inside dir_path that match ``BGC*.json``.

    The list is whatever ``glob.glob`` returns, so its order is not defined;
    callers that need a stable order sort it themselves.
    """
    pattern = path.join(dir_path, "BGC*.json")
    matches = glob.glob(pattern)
    return matches

def count_props(input_dict, cur_path, result):
    """Tally the property paths found in a parsed JSON value.

    Dict keys extend the path with ``/key`` and list levels with ``[]``.
    Every non-dict value adds one to the counter of its path; a list is
    counted once itself (under ``path[]``) in addition to its elements.
    Dicts are only traversed, never counted.

    ``result`` is updated in place and also returned.
    """
    if isinstance(input_dict, dict):
        for name, value in input_dict.items():
            count_props(value, "{}/{}".format(cur_path, name), result)
        return result

    leaf_path = cur_path
    if isinstance(input_dict, list):
        leaf_path = "{}[]".format(cur_path)
        for element in input_dict:
            count_props(element, leaf_path, result)

    # Scalars and the list itself both land here.
    result[leaf_path] = result.get(leaf_path, 0) + 1
    return result


def fetch_props_old_schema(input_dict, cur_path, result):
    """Collect the property paths declared by an old-style (boolean ``required``) schema.

    Paths use the same notation as ``count_props``: ``/key`` for object
    properties and ``[]`` for array levels. Each recorded path maps to True
    when the schema node carries ``"required": True`` and to False otherwise.
    Nodes that have a ``properties`` member are traversed but not recorded,
    and a path that is already present in ``result`` is never overwritten.

    ``result`` is updated in place and also returned.
    """
    node_type = input_dict["type"] if "type" in input_dict else None
    node_path = cur_path

    if node_type == "object":
        for name, sub_schema in input_dict["properties"].items():
            fetch_props_old_schema(sub_schema, "{}/{}".format(cur_path, name), result)
    elif node_type == "array":
        node_path = "{}[]".format(cur_path)
        fetch_props_old_schema(input_dict["items"], node_path, result)

    if "properties" in input_dict or node_path in result:
        return result

    result[node_path] = "required" in input_dict and input_dict["required"] == True
    return result


def fetch_props_new_schema(input_dict, cur_path, result):
    """Collect the property paths declared by a JSON Schema draft-07 document.

    Paths use ``/key`` for object properties and ``[]`` for array levels.
    Every recorded path maps to False: in draft-07 ``required`` is a list on
    the parent object, so the per-node flag of the old schema format has no
    equivalent here and the value is only a placeholder.

    Nodes that have a ``properties`` member are traversed but not recorded.
    ``result`` is updated in place and also returned.
    """
    key_path = cur_path
    if ("type" not in input_dict) or (input_dict["type"] not in ["object", "array"]):
        key_path = "{}".format(cur_path) # string / etc.
    elif input_dict["type"] == "object":
        for key in input_dict["properties"]:
            # Recurse with this function (the previous version called the
            # old-schema variant, which interprets "required" differently).
            result = fetch_props_new_schema(input_dict["properties"][key], "{}/{}".format(key_path, key), result)
    elif input_dict["type"] == "array":
        key_path = "{}[]".format(cur_path)
        result = fetch_props_new_schema(input_dict["items"], "{}".format(key_path), result)

    if key_path not in result and "properties" not in input_dict:
        result[key_path] = False # placeholder only, see docstring
    return result


def search_and_delete(key, input_dict):
    """Remove every entry named ``key`` from a nested dict/list structure, in place.

    Lists are walked element by element, both when passed directly and when
    they appear as values inside a dict. Values that are neither dict nor
    list are left untouched. An entry whose key matches is dropped together
    with everything below it. Returns None.
    """
    if isinstance(input_dict, list):
        for element in input_dict:
            search_and_delete(key, element)
        # A list has no keys of its own; without this return the dict logic
        # below would index the list with its own elements and raise.
        return
    if not isinstance(input_dict, dict):
        return

    input_dict.pop(key, None)
    for value in input_dict.values():
        search_and_delete(key, value)
        
        
def rename_key(from_key, to_key, parent_dict):
    """Move the value stored under ``from_key`` to ``to_key`` in ``parent_dict``, in place.

    Does nothing when ``from_key`` is absent. An existing ``to_key`` entry is
    overwritten. Renaming a key to itself leaves the dict unchanged.
    """
    if from_key in parent_dict:
        # pop-then-assign keeps the entry when both names are equal; assigning
        # first and deleting afterwards would drop it.
        parent_dict[to_key] = parent_dict.pop(from_key)

def del_key(key, parent_dict):
    """Drop ``key`` from ``parent_dict`` in place; a missing key is silently ignored."""
    parent_dict.pop(key, None)

In [4]:
###### phase 0: compare old schema vs old data, output the summary in an excel file* to help decision making ########

In [5]:
# Index of the 1.4 data: property path -> names of the files in which it occurs.
all_props = {}
for json_path in sorted(fetch_mibig_json_filepaths("../../inputs/json_1.4/")):
    with open(json_path, "r") as json_file:
        json_obj = json.load(json_file)
    this_file_props = count_props(json_obj, "", {})
    file_name = path.basename(json_path)
    for prop in this_file_props:
        all_props.setdefault(prop, []).append(file_name)

In [6]:
with open("../../inputs/mibig_schema.json") as json_file:
    json_obj = json.load(json_file)
schema_props = fetch_props_old_schema(json_obj, "", {})

# One line per schema path: "<path>,<required flag>".
with open("../../preprocessed/old_schema_properties.csv", "w") as o:
    for key in sorted(schema_props):
        o.write("{},{}\n".format(key, schema_props[key]))
    print("File written: {}".format(o.name))

# Paths that occur in the data but are not declared by the schema,
# most frequent first: "<path>,<number of files>".
not_in_schema = [(key, all_props[key]) for key in sorted(all_props) if key not in schema_props]
with open("../../preprocessed/old_data_vs_old_schema.csv", "w") as o:
    for prop_path, file_names in sorted(not_in_schema, key=lambda rep: len(rep[1]), reverse=True):
        o.write("{},{}\n".format(prop_path, len(file_names)))
    print("File written: {}".format(o.name))

File written: ../../preprocessed/old_schema_properties.csv
File written: ../../preprocessed/old_data_vs_old_schema.csv


In [7]:
###### phase 1: transform old schema to match (correct version) of old data ######

In [8]:
# Phase 1 is carried out by hand; this cell does not write anything itself.
# TODO: script the edits below so that the phase becomes reproducible.
# Manual edits applied to the old schema:
#   - update all comma-separated based properties into arrays
#   - gene_pubs: integer --> gene_pubs: array
phase_1_schema_path = "../../preprocessed/mibig_schema_phase_1.json"
# Only report the file once it really exists, so a forgotten manual step is
# caught here instead of in a later phase.
if not path.exists(phase_1_schema_path):
    raise FileNotFoundError(
        "{} not found: create it manually before running the next phases".format(phase_1_schema_path)
    )
print("File written: {}".format(phase_1_schema_path))

File written: ../../preprocessed/mibig_schema_phase_1.json


In [9]:
##### phase 2: compare phase 1 schema vs old data, output the summary in an excel file* to help decision making ####

In [10]:
# use all_props from phase 0
with open("../../preprocessed/mibig_schema_phase_1.json") as json_file:
    json_obj = json.load(json_file)
schema_props = fetch_props_old_schema(json_obj, "", {})

# One line per schema path: "<path>,<required flag>".
with open("../../preprocessed/schema_phase_1_properties.csv", "w") as o:
    for key in sorted(schema_props):
        o.write("{},{}\n".format(key, schema_props[key]))
    print("File written: {}".format(o.name))

# Data paths that the phase 1 schema does not declare.
not_in_schema = [(key, all_props[key]) for key in sorted(all_props) if key not in schema_props]

# Most frequent offenders first: "<path>,<number of files>".
with open("../../preprocessed/old_data_vs_schema_phase_1.csv", "w") as o:
    for prop_path, file_names in sorted(not_in_schema, key=lambda rep: len(rep[1]), reverse=True):
        o.write("{},{}\n".format(prop_path, len(file_names)))
    print("File written: {}".format(o.name))

# Same information grouped per file: "<file>,<path>;<path>;...".
bgc_to_fix = {}
for prop_path, file_names in not_in_schema:
    for bgc in file_names:
        bgc_to_fix.setdefault(bgc, []).append(prop_path)
with open("../../preprocessed/bgc_to_fix_phase_2.csv", "w") as o:
    for bgc, prop_paths in bgc_to_fix.items():
        o.write("{},{}\n".format(bgc, ";".join(prop_paths)))
    print("File written: {}".format(o.name))

File written: ../../preprocessed/schema_phase_1_properties.csv
File written: ../../preprocessed/old_data_vs_schema_phase_1.csv
File written: ../../preprocessed/bgc_to_fix_phase_2.csv


In [11]:
#### phase 3: transform old data to match schema from phase 1 (doesn't count dependencies/required keywords) ####
def match_attributes_to_schema_phase_1(data):
    """Rewrite one parsed MIBiG entry in place so its attributes fit the phase 1 schema.

    Comma-separated strings become lists, a few misplaced or misnamed
    attributes are moved or renamed, and nested evidence lists are flattened.
    ``data`` must contain ``general_params`` with ``loci.nucl_acc`` and
    ``compounds``; every other section is optional. Returns None.
    """

    def split_csv(container, key):
        # "a, b,c" -> ["a", "b", "c"]; values that are not strings are left alone.
        if key in container and isinstance(container[key], str):
            container[key] = container[key].replace(" ", "").split(",")

    def first_scalar(value):
        # Follow the first element of nested lists until a non-list is reached.
        while isinstance(value, list):
            value = value[0]
        return value

    # fix /Comments
    rename_key("Comments", "comments", data)
    params = data["general_params"]

    # fix /general_params/loci/nucl_acc[]/conn_comp_cluster
    for locus in params["loci"]["nucl_acc"]:
        split_csv(locus, "conn_comp_cluster")

    # fix /general_params/Polyketide/Saccharide (belongs directly under general_params)
    if "Polyketide" in params and "Saccharide" in params["Polyketide"]:
        params["Saccharide"] = params["Polyketide"].pop("Saccharide")

    if "Saccharide" in params:
        saccharide = params["Saccharide"]
        # fix /general_params/Saccharide/Sugar_subclass
        rename_key("Sugar_subclass", "saccharide_subclass", saccharide)
        # fix /general_params/Saccharide/gt_genes[]/sugar_subcluster
        if "gt_genes" in saccharide:
            for gt_gene in saccharide["gt_genes"]:
                split_csv(gt_gene, "sugar_subcluster")

    for compound in params["compounds"]:
        # fix /general_params/compounds[]/chem_target
        split_csv(compound, "chem_target")
        # fix /general_params/compounds[]/chem_moieties[]/subcluster
        if "chem_moieties" not in compound:
            continue
        for moiety in compound["chem_moieties"]:
            if "subcluster" in moiety and moiety["subcluster"] == "unknown":
                del moiety["subcluster"]

    # fix /general_params/genes/gene[]/evidence_genefunction[][]**
    if "genes" in params and "gene" in params["genes"]:
        for gene in params["genes"]["gene"]:
            if "evidence_genefunction" not in gene:
                continue
            for index, evidence in enumerate(gene["evidence_genefunction"]):
                gene["evidence_genefunction"][index] = first_scalar(evidence)

    # fix /general_params/Other/biosyn_class[] (only its first item is kept, as other_subclass)
    if "Other" in params and "biosyn_class" in params["Other"]:
        other = params["Other"]
        first_class = other["biosyn_class"][0]
        del other["biosyn_class"]
        other["other_subclass"] = first_class

    # fix /general_params/publications
    split_csv(params, "publications")

    # Polyketide
    if "Polyketide" in params:
        polyketide = params["Polyketide"]
        # fix /general_params/Polyketide/{cyclases,pks_genes,pufa_mod_doms}
        for csv_key in ("cyclases", "pks_genes", "pufa_mod_doms"):
            split_csv(polyketide, csv_key)
        # fix /general_params/Polyketide/mod_pks_genes[]/pks_module[]/pks_mod_doms
        if "mod_pks_genes" in polyketide:
            for pks_gene in polyketide["mod_pks_genes"]:
                if "pks_module" in pks_gene:
                    for pks_module in pks_gene["pks_module"]:
                        split_csv(pks_module, "pks_mod_doms")

    # RiPP
    if "RiPP" in params:
        ripp = params["RiPP"]
        if "precursor_loci" in ripp:
            for precursor in ripp["precursor_loci"]:
                # fix /general_params/RiPP/precursor_loci[]/cleavage_recogn_site
                split_csv(precursor, "cleavage_recogn_site")
                # fix /general_params/RiPP/precursor_loci[]/core_pept_aa
                split_csv(precursor, "core_pept_aa")

    return

# Rewrite every 1.4 entry to fit the phase 1 schema and store the result
# under the same file name in the phase 3 folder.
phase_3_dir = "../../preprocessed/json_1.4_phase_3/"
if not path.exists(phase_3_dir):
    makedirs(phase_3_dir)

for json_path in sorted(fetch_mibig_json_filepaths("../../inputs/json_1.4/")):
    json_obj = None
    with open(json_path, "r") as json_file:
        json_obj = json.load(json_file)
    match_attributes_to_schema_phase_1(json_obj)
    out_path = path.join(phase_3_dir, path.basename(json_path))
    with open(out_path, "w") as json_file:
        json.dump(json_obj, json_file)

In [12]:
# verify that all data matched schema
# Index of the phase 3 data: property path -> names of the files in which it occurs.
all_props_phase_3 = {}
for json_path in sorted(fetch_mibig_json_filepaths("../../preprocessed/json_1.4_phase_3/")):
    with open(json_path, "r") as json_file:
        json_obj = json.load(json_file)
    this_file_props = count_props(json_obj, "", {})
    file_name = path.basename(json_path)
    for prop in this_file_props:
        all_props_phase_3.setdefault(prop, []).append(file_name)

In [13]:
with open("../../preprocessed/mibig_schema_phase_1.json") as json_file:
    json_obj = json.load(json_file)
schema_props = fetch_props_old_schema(json_obj, "", {})

# Print every data path the phase 1 schema still does not declare, then the total.
not_in_schema = []
for key in sorted(all_props_phase_3):
    if key in schema_props:
        continue
    not_in_schema.append((key, all_props_phase_3[key]))
    print(key)
print("Number of conflicts: {}".format(len(not_in_schema)))

Number of conflicts: 0


In [14]:
###### phase 4: transform schema from phase 1 to match JSON Schema draft v7 (we will call it 'new schema') ######

In [15]:
# The draft-07 schema is built by editing a copy of the phase 1 schema in place.
new_schema = None
with open("../../preprocessed/mibig_schema_phase_1.json", "r") as phase_1_schema_file:
    new_schema = json.loads(phase_1_schema_file.read())

In [16]:
# 1: convert 'required' to Json Schema draft 7 style
def fix_required(input_dict):
    """Convert per-property ``"required": True`` flags into draft-07 ``required`` lists, in place.

    For every node of type ``object`` the names of the properties flagged
    with ``"required": True`` are gathered into the node's own ``required``
    list (the key is removed when the list stays empty). On every other node
    a ``required`` entry is simply removed; arrays are descended through
    ``items``.
    """
    node_type = input_dict["type"] if "type" in input_dict else None

    if node_type != "object":
        if node_type == "array":
            fix_required(input_dict["items"])
        input_dict.pop("required", None)
        return

    required_names = input_dict["required"] = []
    for name, child in input_dict["properties"].items():
        # Read the flag before recursing: the recursion removes or replaces it.
        if "required" in child and child["required"] == True:
            required_names.append(name)
        fix_required(child)
    if not required_names:
        del input_dict["required"]
# Apply the conversion in place to the schema being built.
fix_required(new_schema)

In [17]:
# 2: convert 'dependencies' to Json Schema draft 7 style
def fix_dependencies(input_dict):
    """Convert per-property string ``dependencies`` into an object-level mapping, in place.

    On a node of type ``object``, a property ``P`` that carries
    ``"dependencies": "Q"`` is listed under ``dependencies["Q"]`` of the
    enclosing object, provided ``Q`` is one of its sibling properties;
    otherwise an error line is printed. The mapping is removed again when it
    stays empty. On every other node a ``dependencies`` entry is removed;
    arrays are descended through ``items``.
    """
    node_type = input_dict["type"] if "type" in input_dict else None

    if node_type != "object":
        if node_type == "array":
            fix_dependencies(input_dict["items"])
        input_dict.pop("dependencies", None)
        return

    grouped = input_dict["dependencies"] = {}
    siblings = input_dict["properties"]
    for name, child in siblings.items():
        # Read the old-style value before recursing: the recursion removes or replaces it.
        target = child["dependencies"] if "dependencies" in child else None
        if isinstance(target, str):
            if target in siblings:
                grouped.setdefault(target, []).append(name)
            else:
                print("Error: {} not found".format(target))
        fix_dependencies(child)
    if not grouped:
        del input_dict["dependencies"]
# Apply the conversion in place to the schema being built.
fix_dependencies(new_schema)

In [18]:
# 3: make sure 'enum' contain unique items
def fix_enum(input_dict):
    """Remove duplicate values from every ``enum`` list in the schema, in place.

    Object nodes are descended through ``properties`` and array nodes through
    ``items``. The first occurrence of each value is kept and the original
    order is preserved, so the generated schema is identical from run to run
    (going through a plain ``set`` would reorder the values arbitrarily).
    Enum values must be hashable.
    """
    if "type" in input_dict and input_dict["type"] == "object":
        for prop in input_dict["properties"]:
            fix_enum(input_dict["properties"][prop])
    elif "type" in input_dict and input_dict["type"] == "array":
        fix_enum(input_dict["items"])

    if "enum" in input_dict:
        seen = set()
        unique_values = []
        for value in input_dict["enum"]:
            if value not in seen:
                seen.add(value)
                unique_values.append(value)
        input_dict["enum"] = unique_values
# Apply the clean-up in place to the schema being built.
fix_enum(new_schema)

In [19]:
# 4: manual (but reproducible) curations

In [20]:
import datetime

# Stamp the schema with its dialect, our own version number and the build time.
now = datetime.datetime.now()
new_schema["$schema"] = "http://json-schema.org/draft-07/schema#"
new_schema["$schema_version"] = "2.0"
new_schema["$schema_created"] = now.strftime("%Y-%m-%d %H:%M")

# Short handles on the parts of the schema edited below (same objects, not copies).
root_props = new_schema["properties"]
general_params = root_props["general_params"]
class_props = general_params["properties"]

# remove version, replace with created and modified (datetime)
del root_props["version"]
for stamp_name in ("created", "modified"):
    root_props[stamp_name] = { "type": "string", "format": "date-time" }

# top-level and general_params required fields
new_schema["required"] = ["created", "modified", "general_params", "personal"]
general_params["required"] = ["mibig_accession", "biosyn_class", "compounds", "publications"]

# delete properties we don't need anymore (i.e. ones meant for AlpacaJS forms)
del root_props["personal"]["properties"]["submitter_institution"]["format"]
search_and_delete("default", new_schema)

# rename the class-specific sections to their lowercase names
for class_section in ("Polyketide", "NRP", "RiPP", "Terpene", "Saccharide", "Alkaloid", "Other"):
    rename_key(class_section, class_section.lower(), class_props)

# if biosyn_class contains e.g. "Polyketide", require the "polyketide" attribute,
# except if minimal = true
del general_params["dependencies"]
prop_attr_pairs = [
    ("NRP", "nrp"),
    ("Polyketide", "polyketide"),
    ("RiPP", "ripp"),
    ("Terpene", "terpene"),
    ("Saccharide", "saccharide"),
    ("Alkaloid", "alkaloid"),
    ("Other", "other")
]
general_params["allOf"] = [
    {
        "if": {
            "not": {"properties": {"minimal": {"const": True}}, "required": ["minimal"]},
            "properties": {"biosyn_class": {"contains":{"enum": [class_name]}}}
        },
        "then": {
            "required": [attr_name]
        }
    }
    for class_name, attr_name in prop_attr_pairs
]

# fix publications.pattern (it doesn't cover doi entries)
# for now, just delete the pattern matching constraint
del class_props["publications"]["items"]["pattern"]

In [21]:
### general_params.compounds ###

compound_schema = new_schema["properties"]["general_params"]["properties"]["compounds"]["items"]

# remove database_deposited and databases_deposited. we can always infer it from their respective accession ids
for obsolete_prop in ("database_deposited", "databases_deposited"):
    del compound_schema["properties"][obsolete_prop]

# delete old dependencies format
del compound_schema["dependencies"]

compound_schema["required"] = ["compound", "evidence_struct", "chem_act"]

# if chem_act contains "Other", require other_chem_act
compound_schema["allOf"] = [
    {
        "if": {
            "properties": {"chem_act": {"contains":{"enum": ["Other"]}}}
        },
        "then": {
            "required": ["other_chem_act"]
        }
    }
]

In [22]:
### general_params.loci ###

loci_schema = new_schema["properties"]["general_params"]["properties"]["loci"]
nucl_acc_item = loci_schema["properties"]["nucl_acc"]["items"]

# move conn_comp_cluster from nucl_acc to loci (a cluster should only have one conn_comp_cluster)
loci_schema["properties"]["conn_comp_cluster"] = nucl_acc_item["properties"].pop("conn_comp_cluster")

loci_schema["required"] = ["complete", "nucl_acc", "conn_comp_cluster"]

# change 'Accession' to 'accession', and make it mandatory for every nucl_acc item
rename_key("Accession", "accession", nucl_acc_item["properties"])
nucl_acc_item["required"] = ["accession"]

In [23]:
### general_params.genes ###

genes_props = new_schema["properties"]["general_params"]["properties"]["genes"]["properties"]

genes_props["operon"]["items"]["required"] = ["operon_genes", "evidence_operon"]

gene_schema = genes_props["gene"]["items"]
gene_schema["required"] = ["gene_function", "evidence_genefunction"]

# add nucl_acc, to specify which of the supplied accessions does this gene belongs to
gene_schema["properties"]["nucl_acc"] = { "type": "string" }

# delete gene.not_in_gbk, we can infer it from gene_id
del gene_schema["properties"]["not_in_gbk"]

# delete old dependencies format, replace with allOf
del gene_schema["dependencies"]
gene_schema["allOf"] = [
    # if gene_id is not supplied, require gene_startpos, gene_endpos, gene_name, nucl_acc
    {
        "if": {
            "not": {"required": ["gene_id"]}
        },
        "then": {
            "required": ["gene_startpos", "gene_endpos", "gene_name", "nucl_acc"]
        },
    },
    # if gene_function == Tailoring, require tailoring
    {
        "if": {
            "properties": {"gene_function": {"enum": ["Tailoring"]}}
        },
        "then": {
            "required": ["tailoring"]
        }
    },
]

In [24]:
### general_params.polyketide ###

polyketide_schema = new_schema["properties"]["general_params"]["properties"]["polyketide"]

polyketide_schema["required"] = ["pk_subclass", "pks_subclass", "pks_te_type", "lin_cycl_pk", "starter_unit", "ketide_length"]

# delete old dependencies format, replace with allOf
del polyketide_schema["dependencies"]
polyketide_schema["allOf"] = []

# What each pks_subclass value additionally demands, in the order the rules are emitted.
pks_subclass_rules = [
    ("Modular type I", {
        "required": ["mod_pks_genes"],
        "properties": {"mod_pks_genes": {"minItems": 1}}
    }),
    ("Trans-AT type I", {
        "required": ["trans_at"],
        "properties": {"trans_at": {"minItems": 1}}
    }),
    ("Iterative type I", {
        "required": ["iterative_subtype", "nr_iterations", "iter_cycl_type"]
    }),
    ("PUFA synthase or related", {
        "required": ["pufa_mod_doms"],
        "properties": {"pufa_mod_doms": {"minItems": 1}}
    }),
]
for pks_subclass, consequence in pks_subclass_rules:
    polyketide_schema["allOf"].append({
        "if": {
            "properties": {"pks_subclass": {"enum": [pks_subclass]}}
        },
        "then": consequence
    })

# unless pks_te_type is "None", require a non-empty pks_thioesterase
polyketide_schema["allOf"].append({
    "if": {
        "not": { "properties": {"pks_te_type": {"enum": ["None"]}} }
    },
    "then": {
        "required": ["pks_thioesterase"],
        "properties": {"pks_thioesterase": {"minItems": 1}}
    }
})

# --- todo: assert dependencies of pks_subtype --> requirements
# --- todo: apply dependencies for mod_pks_genes

In [25]:
### general_params.nrp ###

nrp_schema = new_schema["properties"]["general_params"]["properties"]["nrp"]

# delete old dependencies format, replace with allOf
del nrp_schema["dependencies"]
nrp_schema["allOf"] = [
    # unless nrps_te_type is "None", require a non-empty nrps_thioesterase
    {
        "if": {
            "not": { "properties": {"nrps_te_type": {"enum": ["None"]}} }
        },
        "then": {
            "required": ["nrps_thioesterase"],
            "properties": {"nrps_thioesterase": {"minItems": 1}}
        }
    },
    # if subclass == "Other lipopeptide", require lipid_moiety
    {
        "if": {
            "properties": {"subclass": {"enum": ["Other lipopeptide"]}}
        },
        "then": {
            "required": ["lipid_moiety"]
        }
    },
]

# nrps_module -- required fields
nrps_module_schema = nrp_schema["properties"]["nrps_genes"]["items"]["properties"]["nrps_module"]["items"]
nrps_module_schema["required"] = ["module_nr", "cdom_subtype", "nrps_mod_doms"]

# nrps_module -- delete old dependencies, replace with allOf
del nrps_module_schema["dependencies"]
nrps_module_schema["allOf"] = [
    # if nrps_mod_doms is "Other", require nrps_other_mod_dom
    {
        "if": {
            "properties": {"nrps_mod_doms": {"enum": ["Other"]}}
        },
        "then": {
            "required": ["nrps_other_mod_dom"]
        }
    },
    # unless nrps_mod_skip_iter is "Neither", require nrps_evidence_skip_iter
    {
        "if": {
            "not": {"properties": {"nrps_mod_skip_iter": {"enum": ["Neither"]}}}
        },
        "then": {
            "required": ["nrps_evidence_skip_iter"]
        }
    },
]

## todo -- assert dependencies

In [26]:
### general_params.saccharide ###

saccharide_props = new_schema["properties"]["general_params"]["properties"]["saccharide"]["properties"]
gt_gene_schema = saccharide_props["gt_genes"]["items"]

# delete old dependencies format, replace with allOf
del gt_gene_schema["dependencies"]
gt_gene_schema["allOf"] = [
    # if gt_specificity is "Other", require other_gt_spec
    {
        "if": {
            "properties": {"gt_specificity": {"enum": ["Other"]}}
        },
        "then": {
            "required": ["other_gt_spec"]
        }
    },
    # unless gt_specificity is "Unknown", require evidence_gt_spec
    {
        "if": {
            "not": { "properties": {"gt_specificity": {"enum": ["Unknown"]}} }
        },
        "then": {
            "required": ["evidence_gt_spec"]
        }
    },
]

# move 'gt_genes.sugar_subcluster' to sugar_subclusters, we can infer the specificities from the list of gene ids
# (the former per-gene schema becomes the item schema of the new array)
saccharide_props["sugar_subclusters"] = {
    "title": "Sub-clusters for sugar biosynthesis",
    "type": "array",
    "items": gt_gene_schema["properties"].pop("sugar_subcluster")
}



In [27]:
### for all 'required' fields, if it is an array, specify 'minItems = 1' so that it can't be empty
def apply_array_required(input_dict):
    """Give every required array property ``minItems = 1`` so that it cannot be empty, in place.

    For each node with ``properties``, every name in its ``required`` list
    whose property schema has ``"type": "array"`` receives ``minItems: 1``
    unless ``minItems`` is already set. The walk continues through
    ``properties`` and, for nodes without ``properties``, through ``items``.

    Names in ``required`` that have no entry under ``properties`` are
    skipped (JSON Schema allows them), and sub-schemas that are not dicts
    (e.g. boolean schemas or an ``items`` list) are ignored.
    """
    if not isinstance(input_dict, dict):
        return
    if "properties" in input_dict:
        properties = input_dict["properties"]
        if "required" in input_dict:
            for req in input_dict["required"]:
                prop_schema = properties.get(req)
                if not isinstance(prop_schema, dict):
                    # Required but not declared here: nothing to constrain.
                    continue
                if "type" in prop_schema and prop_schema["type"] == "array":
                    if "minItems" not in prop_schema:
                        prop_schema["minItems"] = 1
        for key in properties:
            apply_array_required(properties[key])
    elif "items" in input_dict:
        apply_array_required(input_dict["items"])
                                
# Apply in place to the schema being built.
apply_array_required(new_schema)

In [28]:
# 5: save new schema (pretty-printed with a 4-space indent)
with open("../../outputs/mibig_schema_draft7.json", "w") as o:
    json.dump(new_schema, o, indent=4, separators=(',', ': '))

In [29]:
#### phase 5: transform data from phase 3 to match the new schema (include all dependencies/required keywords) ######

In [30]:
# 1: check data vs new schema to get quick overview of changed structures
# use all_props from phase 3
with open("../../outputs/mibig_schema_draft7.json") as json_file:
    json_obj = json.load(json_file)
schema_props = fetch_props_new_schema(json_obj, "", {})

# One schema path per line (the value stored per path is only a placeholder, so it is not written).
with open("../../preprocessed/schema_draft7_properties.csv", "w") as o:
    for key in sorted(schema_props):
        o.write("{}\n".format(key))
    print("File written: {}".format(o.name))

# Phase 3 data paths that the draft-07 schema does not declare.
not_in_schema = [
    (key, all_props_phase_3[key]) for key in sorted(all_props_phase_3) if key not in schema_props
]

# Most frequent offenders first: "<path>,<number of files>".
with open("../../preprocessed/data_phase_3_vs_schema_draft7.csv", "w") as o:
    for prop_path, file_names in sorted(not_in_schema, key=lambda rep: len(rep[1]), reverse=True):
        o.write("{},{}\n".format(prop_path, len(file_names)))
    print("File written: {}".format(o.name))

# Same information grouped per file: "<file>,<path>;<path>;...".
bgc_to_fix = {}
for prop_path, file_names in not_in_schema:
    for bgc in file_names:
        bgc_to_fix.setdefault(bgc, []).append(prop_path)
with open("../../preprocessed/bgc_to_fix_phase_5.csv", "w") as o:
    for bgc, prop_paths in bgc_to_fix.items():
        o.write("{},{}\n".format(bgc, ";".join(prop_paths)))
    print("File written: {}".format(o.name))

File written: ../../preprocessed/schema_draft7_properties.csv
File written: ../../preprocessed/data_phase_3_vs_schema_draft7.csv
File written: ../../preprocessed/bgc_to_fix_phase_5.csv


In [31]:
# 2: fix data and assert that there is no more unrecognized attributes present
def match_attributes_to_schema_7(data):
    """Edit one MIBiG entry in place so that its attribute names fit the draft 7 schema.

    Steps, in order:
      * drop the top-level 'version' key (through del_key)
      * per nucl_acc entry: 'Accession' -> 'accession' (through rename_key), and
        gather its 'conn_comp_cluster' values, without repeats, into one list
        stored at general_params/loci/conn_comp_cluster
      * lowercase the compound class keys (Polyketide, NRP, ...) of general_params
      * remove 'database_deposited' and 'databases_deposited' from every compound
      * drop 'not_in_gbk' from every general_params/genes/gene entry
      * move each gt_genes entry's 'sugar_subcluster' into the list
        general_params/saccharide/sugar_subclusters

    del_key and rename_key are helpers defined elsewhere in this notebook;
    presumably they do nothing when the key is absent -- verify there.

    Returns None. 'general_params', its 'loci'/'nucl_acc' and its 'compounds'
    are accessed unconditionally, so a KeyError is raised if one is missing.
    """
    del_key("version", data)
    general = data["general_params"]
    loci = general["loci"]

    # /general_params/loci/nucl_acc[]/conn_comp_cluster[] is lifted one level up
    merged_clusters = []
    for accession_entry in loci["nucl_acc"]:
        rename_key("Accession", "accession", accession_entry)
        if "conn_comp_cluster" not in accession_entry:
            continue
        for cluster in accession_entry["conn_comp_cluster"]:
            if cluster not in merged_clusters:
                merged_clusters.append(cluster)
        del accession_entry["conn_comp_cluster"]
    if merged_clusters:
        loci["conn_comp_cluster"] = merged_clusters

    # compound class keys become their lowercase version
    for class_name in ("Polyketide", "NRP", "RiPP", "Terpene",
                       "Saccharide", "Alkaloid", "Other"):
        rename_key(class_name, class_name.lower(), general)

    # /general_params/compounds[]/database_deposited and .../databases_deposited[]
    for compound in general["compounds"]:
        for obsolete in ("database_deposited", "databases_deposited"):
            if obsolete in compound:
                del compound[obsolete]

    # /general_params/genes/gene[]/not_in_gbk
    if "genes" in general and "gene" in general["genes"]:
        for gene_entry in general["genes"]["gene"]:
            del_key("not_in_gbk", gene_entry)

    # /general_params/saccharide/gt_genes[]/sugar_subcluster[]
    if "saccharide" not in general:
        return
    saccharide = general["saccharide"]
    subclusters = []
    if "gt_genes" in saccharide:
        for gt_gene in saccharide["gt_genes"]:
            if "sugar_subcluster" in gt_gene:
                subclusters.append(gt_gene.pop("sugar_subcluster"))
    if subclusters:
        saccharide["sugar_subclusters"] = subclusters
    return

# make sure the folder that receives the phase 5 files is there
if not path.exists("../../preprocessed/json_1.4_phase_5/"):
    makedirs("../../preprocessed/json_1.4_phase_5/")

# every phase 3 file is read, adjusted, and stored under the same file name in the phase 5 folder
for json_path in sorted(fetch_mibig_json_filepaths("../../preprocessed/json_1.4_phase_3/")):
    # read first and let the input handle close before anything is written
    with open(json_path, "r") as json_file:
        json_obj = json.load(json_file)
    # changes json_obj in place, nothing is returned
    match_attributes_to_schema_7(json_obj)
    # separate name for the output handle so it cannot be mistaken for the input one
    with open(path.join("../../preprocessed/json_1.4_phase_5/", path.basename(json_path)), "w") as out_file:
        json.dump(json_obj, out_file)

# property path -> names of the phase 5 files in which that path shows up
all_props_phase_5 = {}
for json_path in sorted(fetch_mibig_json_filepaths("../../preprocessed/json_1.4_phase_5/")):
    file_name = path.basename(json_path)
    with open(json_path, "r") as json_file:
        json_obj = json.load(json_file)
        # count_props yields one key per property path found in this file
        for prop in count_props(json_obj, "", {}):
            all_props_phase_5.setdefault(prop, []).append(file_name)

# compare the property paths of the phase 5 data with the ones the schema knows
with open("../../outputs/mibig_schema_draft7.json") as json_file:
    json_obj = json.load(json_file)
schema_props = fetch_props_new_schema(json_obj, "", {})
# paths present in the data but unknown to the schema, in sorted order
not_in_schema = [key for key in sorted(all_props_phase_5) if key not in schema_props]
for key in not_in_schema:
    # path, number of files that contain it
    print("{},{}".format(key, len(all_props_phase_5[key])))
print("Number of conflicts: {}".format(len(not_in_schema)))

Number of conflicts: 0


In [32]:
# 3: validate data using JSON Schema V7 validator, fix conflicts, then save to final output folder
if not path.exists("../../outputs/json_2.0/"):
    makedirs("../../outputs/json_2.0/")
