In [1]:
# phase 0: compare old schema vs old data, output the summary in an excel file* to help decision making
# phase 1: transform old schema to match (correct version) of old data
# phase 2: compare phase 1 schema vs old data, output the summary in an excel file* to help decision making
# phase 3: transform old data to match schema from phase 1 (doesn't count dependencies/required keywords)
# phase 4: transform schema from phase 1 to match JSON Schema draft v7 (we will call it 'new schema')
# phase 5: transform data from phase 3 to match the new schema (include all dependencies/required keywords)

In [2]:
## common imports ##
from os import path, makedirs
import glob
import json
from jsonschema import validate, Draft7Validator
from sys import exit
import datetime
import time

In [3]:
## common functions ##
def fetch_mibig_json_filepaths(dir_path):
    """Return the paths of the MIBiG entry files found in ``dir_path``.

    Only files directly inside ``dir_path`` whose name matches ``BGC*.json``
    are listed. The order is whatever ``glob.glob`` yields, so callers that
    need a stable order have to sort the result themselves.
    """
    entry_pattern = path.join(dir_path, "BGC*.json")
    return glob.glob(entry_pattern)

def count_props(input_dict, cur_path, result):
    """Tally the property paths that occur in a decoded JSON value.

    Paths are built as ``/key/subkey``; a list adds ``[]`` to the path of the
    property that holds it. Every value that is not a dict adds one to the
    counter of its path, which means a list is counted once for itself and
    once more for each non-dict member.

    ``result`` (path -> count) is updated in place and also returned.
    """
    if isinstance(input_dict, dict):
        # Dicts are never counted themselves, only walked.
        for key, value in input_dict.items():
            count_props(value, "{}/{}".format(cur_path, key), result)
        return result

    key_path = cur_path
    if isinstance(input_dict, list):
        key_path = "{}[]".format(cur_path)
        for node in input_dict:
            count_props(node, key_path, result)

    result[key_path] = result.get(key_path, 0) + 1
    return result


def fetch_props_old_schema(input_dict, cur_path, result):
    """Flatten an old-style schema into a ``path -> required`` mapping.

    Paths use the same notation as ``count_props``: ``/key`` for an object
    property and a trailing ``[]`` for an array. A path is recorded only for
    schema nodes without a ``properties`` entry, and the stored flag is True
    when the node carries ``"required": true``. A path that is already in
    ``result`` is left untouched.

    ``result`` is updated in place and also returned.
    """
    key_path = cur_path
    node_type = input_dict.get("type")

    if node_type == "object":
        for key, sub_schema in input_dict["properties"].items():
            fetch_props_old_schema(sub_schema, "{}/{}".format(cur_path, key), result)
    elif node_type == "array":
        key_path = "{}[]".format(cur_path)
        fetch_props_old_schema(input_dict["items"], key_path, result)
    else:
        key_path = "{}".format(cur_path) # string / etc.

    if "properties" in input_dict or key_path in result:
        return result
    result[key_path] = input_dict.get("required") == True
    return result


def fetch_props_new_schema(input_dict, cur_path, result):
    """Flatten a draft 7 schema into a mapping of property paths.

    Paths use ``/key`` for an object property and a trailing ``[]`` for an
    array. A path is recorded for every schema node without a ``properties``
    entry. The stored value is always False: in draft 7 ``required`` is a
    list on the parent object, so there is no per-property flag to read.

    ``result`` is updated in place and also returned.
    """
    key_path = cur_path
    if ("type" not in input_dict) or (input_dict["type"] not in ["object", "array"]):
        key_path = "{}".format(cur_path) # string / etc.
    elif input_dict["type"] == "object":
        for key in input_dict["properties"]:
            # Recurse into this function (not the old-schema variant) so that
            # nested paths are flagged the same way as top-level ones.
            result = fetch_props_new_schema(input_dict["properties"][key], "{}/{}".format(key_path, key), result)
    elif input_dict["type"] == "array":
        key_path = "{}[]".format(cur_path)
        result = fetch_props_new_schema(input_dict["items"], "{}".format(key_path), result)

    if key_path not in result and "properties" not in input_dict:
        result[key_path] = False # can't really use this
    return result


def search_and_delete(key, input_dict):
    """Remove every entry named ``key`` from a nested dict, in place.

    ``input_dict`` may be a dict, a list, or anything else:
    - dict: ``key`` is removed and every remaining dict value is searched;
    - list: each member is searched;
    - other values are ignored.

    Lists stored as dict values are not descended into; only a list passed
    in directly is walked. Returns None.
    """
    if isinstance(input_dict, list):
        for member in input_dict:
            search_and_delete(key, member)
        # A list has no keys of its own. Continuing into the dict logic below
        # would index the list with its own members and raise TypeError.
        return
    if not isinstance(input_dict, dict):
        return

    if key in input_dict:
        del input_dict[key]
    for value in input_dict.values():
        if isinstance(value, dict):
            search_and_delete(key, value)

        
def rename_key(from_key, to_key, parent_dict):
    """Move the value stored under ``from_key`` to ``to_key``, in place.

    Nothing happens when ``from_key`` is absent. An existing ``to_key`` is
    overwritten. Renaming a key to itself leaves the entry untouched.
    """
    if from_key in parent_dict and from_key != to_key:
        parent_dict[to_key] = parent_dict.pop(from_key)

def del_key(key, parent_dict):
    """Remove ``key`` from ``parent_dict`` in place; do nothing if it is absent."""
    if key not in parent_dict:
        return
    del parent_dict[key]
        
import time
def date2iso(thedate):
    """Format a date/time as ``YYYY-MM-DDTHH:MM:SS+HH:MM``.

    The offset suffix is taken from ``thedate`` itself when it is
    timezone-aware. A naive value is stamped with the machine's current
    local UTC offset (``time.localtime().tm_gmtoff``).

    A string is always returned, whatever the sign of the offset.
    """
    strdate = thedate.strftime("%Y-%m-%dT%H:%M:%S")

    # Plain date objects have no utcoffset(); treat them like naive datetimes.
    utcoffset_of = getattr(thedate, "utcoffset", None)
    own_offset = utcoffset_of() if callable(utcoffset_of) else None
    if own_offset is not None:
        offset_seconds = int(own_offset.total_seconds())
    else:
        offset_seconds = time.localtime().tm_gmtoff

    # Split the magnitude, then add the sign: doing modulo arithmetic on a
    # negative offset gives the wrong hour for zones such as -03:30.
    sign = "-" if offset_seconds < 0 else "+"
    hours, minutes = divmod(abs(offset_seconds) // 60, 60)
    return "{}{}{:02d}:{:02d}".format(strdate, sign, hours, minutes)
    
class ToDelete():
    """Marker type: put an instance where a list member or dict value should
    be removed, then let ``lazily_deletes`` drop it."""
    pass

def lazily_deletes(input_dict):
    """Strip every ``ToDelete`` marker from a nested list/dict structure.

    - list: a new list is returned without the markers, each remaining
      member being cleaned recursively;
    - dict: entries whose value is a marker are removed in place, the other
      values are cleaned recursively, and the same dict is returned;
    - any other value is returned unchanged.
    """
    if isinstance(input_dict, list):
        new_list = []
        for i, node in enumerate(input_dict):
            if isinstance(node, ToDelete):
                continue
            cleaned = lazily_deletes(node)
            # The cleaned member is written back into the input list too,
            # which keeps the side effect callers may already rely on.
            input_dict[i] = cleaned
            # Keep the cleaned value: a nested list is rebuilt rather than
            # edited in place, so the old member would still hold its markers.
            new_list.append(cleaned)
        return new_list
    if isinstance(input_dict, dict):
        marked_keys = [key for key, value in input_dict.items() if isinstance(value, ToDelete)]
        for key in marked_keys:
            del input_dict[key]
        for key in input_dict:
            input_dict[key] = lazily_deletes(input_dict[key])
    return input_dict


In [4]:
###### phase 0: compare old schema vs old data, output the summary in an excel file* to help decision making ########

In [5]:
# Map every property path seen in the 1.4 data to the files that contain it.
all_props = {}
for json_path in sorted(fetch_mibig_json_filepaths("../../inputs/json_1.4/")):
    with open(json_path, "r") as json_file:
        json_obj = json.load(json_file)
        this_file_props = count_props(json_obj, "", {})
        for prop in this_file_props:
            all_props.setdefault(prop, []).append(path.basename(json_path))

In [6]:
# Flatten the original schema, then report (1) its property paths and
# (2) the data paths it does not know, most frequent first.
with open("../../inputs/mibig_schema.json") as json_file:
    json_obj = json.load(json_file)
    schema_props = fetch_props_old_schema(json_obj, "", {})

    with open("../../preprocessed/old_schema_properties.csv", "w") as o:
        for key in sorted(schema_props):
            o.write("{},{}\n".format(key, schema_props[key]))
        print("File written: {}".format(o.name))

    with open("../../preprocessed/old_data_vs_old_schema.csv", "w") as o:
        not_in_schema = [
            (key, all_props[key]) for key in sorted(all_props) if key not in schema_props
        ]
        by_file_count = sorted(not_in_schema, key=lambda entry: len(entry[1]), reverse=True)
        for prop_path, bgc_files in by_file_count:
            o.write("{},{}\n".format(prop_path, len(bgc_files)))
        print("File written: {}".format(o.name))

File written: ../../preprocessed/old_schema_properties.csv
File written: ../../preprocessed/old_data_vs_old_schema.csv


In [7]:
###### phase 1: transform old schema to match (correct version) of old data ######

In [8]:
# (everything is manually done) -- TODO: should write hardcoded scripts to make it reproducible
# update all comma-separated based properties into arrays
# gene_pubs: integer --> gene_pubs: array
# NOTE: nothing is written by this cell. The print below only names the
# hand-edited schema file that the following phases open.
print("File written: ../../preprocessed/mibig_schema_phase_1.json")

File written: ../../preprocessed/mibig_schema_phase_1.json


In [9]:
##### phase 2: compare phase 1 schema vs old data, output the summary in an excel file* to help decision making ####

In [10]:
# use all_props from phase 0
with open("../../preprocessed/mibig_schema_phase_1.json") as json_file:
    json_obj = json.load(json_file)
    schema_props = fetch_props_old_schema(json_obj, "", {})

    # 1) every path of the phase 1 schema with its "required" flag
    with open("../../preprocessed/schema_phase_1_properties.csv", "w") as o:
        for key in sorted(schema_props):
            o.write("{},{}\n".format(key, schema_props[key]))
        print("File written: {}".format(o.name))

    # data paths the phase 1 schema still does not cover
    not_in_schema = [
        (key, all_props[key]) for key in sorted(all_props) if key not in schema_props
    ]

    # 2) those paths with the number of files using them, most frequent first
    with open("../../preprocessed/old_data_vs_schema_phase_1.csv", "w") as o:
        by_file_count = sorted(not_in_schema, key=lambda entry: len(entry[1]), reverse=True)
        for prop_path, bgc_files in by_file_count:
            o.write("{},{}\n".format(prop_path, len(bgc_files)))
        print("File written: {}".format(o.name))

    # 3) the same information per file: which paths each BGC has to fix
    with open("../../preprocessed/bgc_to_fix_phase_2.csv", "w") as o:
        bgc_to_fix = {}
        for prop_path, bgc_files in not_in_schema:
            for bgc in bgc_files:
                bgc_to_fix.setdefault(bgc, []).append(prop_path)
        for bgc, prop_paths in bgc_to_fix.items():
            o.write("{},{}\n".format(bgc, ";".join(prop_paths)))
        print("File written: {}".format(o.name))

File written: ../../preprocessed/schema_phase_1_properties.csv
File written: ../../preprocessed/old_data_vs_schema_phase_1.csv
File written: ../../preprocessed/bgc_to_fix_phase_2.csv


In [11]:
#### phase 3: transform old data to match schema from phase 1 (doesn't count dependencies/required keywords) ####
def match_attributes_to_schema_phase_1(data):
    """Patch one decoded MIBiG 1.4 entry in place to fit the phase 1 schema.

    The fixes are: renaming ``Comments``; turning comma-separated strings
    into lists; moving ``Polyketide/Saccharide`` up to ``general_params``;
    renaming ``Sugar_subclass``; dropping ``subcluster == "unknown"``;
    un-nesting ``evidence_genefunction`` members; and replacing
    ``Other/biosyn_class`` by ``other_subclass`` (its first member).

    ``data`` must contain ``general_params`` with ``loci/nucl_acc`` and
    ``compounds``; everything else is optional. Returns None.
    """
    def split_commas(holder, field):
        """Replace a comma-separated string at holder[field] by a list (spaces removed)."""
        if field in holder and isinstance(holder[field], str):
            holder[field] = holder[field].replace(" ", "").split(",")

    def innermost(value):
        """Follow first members until something that is not a list is reached."""
        while isinstance(value, list):
            value = value[0]
        return value

    # fix /Comments
    rename_key("Comments", "comments", data)
    general = data["general_params"]

    # fix /general_params/loci/nucl_acc[]/conn_comp_cluster
    for nuac in general["loci"]["nucl_acc"]:
        split_commas(nuac, "conn_comp_cluster")

    # fix /general_params/Polyketide/Saccharide
    if "Polyketide" in general and "Saccharide" in general["Polyketide"]:
        general["Saccharide"] = general["Polyketide"].pop("Saccharide")

    if "Saccharide" in general:
        sacc = general["Saccharide"]
        # fix /general_params/Saccharide/Sugar_subclass
        if "Sugar_subclass" in sacc:
            rename_key("Sugar_subclass", "saccharide_subclass", sacc)
        # fix /general_params/Saccharide/gt_genes[]/sugar_subcluster
        if "gt_genes" in sacc:
            for gtg in sacc["gt_genes"]:
                split_commas(gtg, "sugar_subcluster")

    for comp in general["compounds"]:
        # fix /general_params/compounds[]/chem_target
        split_commas(comp, "chem_target")
        # fix /general_params/compounds[]/chem_moieties[]/subcluster
        if "chem_moieties" in comp:
            for moi in comp["chem_moieties"]:
                if "subcluster" in moi and moi["subcluster"] == "unknown":
                    del moi["subcluster"]

    # fix /general_params/genes/gene[]/evidence_genefunction[][]**
    if "genes" in general and "gene" in general["genes"]:
        for gen in general["genes"]["gene"]:
            if "evidence_genefunction" in gen:
                evidences = gen["evidence_genefunction"]
                for i, evgen in enumerate(evidences):
                    evidences[i] = innermost(evgen)

    # fix /general_params/Other/biosyn_class[]
    if "Other" in general and "biosyn_class" in general["Other"]:
        other = general["Other"]
        clas = other["biosyn_class"][0]
        del other["biosyn_class"]
        other["other_subclass"] = clas

    # fix /general_params/publications
    split_commas(general, "publications")

    # Polyketide
    if "Polyketide" in general:
        pol = general["Polyketide"]
        # fix /general_params/Polyketide/{cyclases,pks_genes,pufa_mod_doms}
        for field in ("cyclases", "pks_genes", "pufa_mod_doms"):
            split_commas(pol, field)
        # fix /general_params/Polyketide/mod_pks_genes[]/pks_module[]/pks_mod_doms
        if "mod_pks_genes" in pol:
            for modpk in pol["mod_pks_genes"]:
                if "pks_module" in modpk:
                    for pkmod in modpk["pks_module"]:
                        split_commas(pkmod, "pks_mod_doms")

    # RiPP
    if "RiPP" in general and "precursor_loci" in general["RiPP"]:
        for ploc in general["RiPP"]["precursor_loci"]:
            # fix /general_params/RiPP/precursor_loci[]/cleavage_recogn_site
            split_commas(ploc, "cleavage_recogn_site")
            # fix /general_params/RiPP/precursor_loci[]/core_pept_aa
            split_commas(ploc, "core_pept_aa")

    return

# Create the phase 3 output folder on the first run.
if not path.exists("../../preprocessed/json_1.4_phase_3/"):
    makedirs("../../preprocessed/json_1.4_phase_3/")

# Patch every 1.4 entry and store the result under the same file name.
for json_path in sorted(fetch_mibig_json_filepaths("../../inputs/json_1.4/")):
    json_obj = None
    with open(json_path, "r") as json_file:
        json_obj = json.load(json_file)
    match_attributes_to_schema_phase_1(json_obj)
    target_path = path.join("../../preprocessed/json_1.4_phase_3/", path.basename(json_path))
    with open(target_path, "w") as json_file:
        json.dump(json_obj, json_file, indent=4, separators=(',', ': '))

In [12]:
# verify that all data matched schema
# Same inventory as in phase 0, this time over the patched phase 3 files.
all_props_phase_3 = {}
for json_path in sorted(fetch_mibig_json_filepaths("../../preprocessed/json_1.4_phase_3/")):
    with open(json_path, "r") as json_file:
        json_obj = json.load(json_file)
        this_file_props = count_props(json_obj, "", {})
        for prop in this_file_props:
            all_props_phase_3.setdefault(prop, []).append(path.basename(json_path))

In [13]:
# Every path of the patched data must now exist in the phase 1 schema;
# anything printed here is a remaining conflict.
with open("../../preprocessed/mibig_schema_phase_1.json") as json_file:
    json_obj = json.load(json_file)
    schema_props = fetch_props_old_schema(json_obj, "", {})
    not_in_schema = []
    for key in sorted(all_props_phase_3):
        if key in schema_props:
            continue
        not_in_schema.append((key, all_props_phase_3[key]))
        print(key)
    print("Number of conflicts: {}".format(len(not_in_schema)))

Number of conflicts: 0


In [14]:
###### phase 4: transform schema from phase 1 to match JSON Schema draft v7 (we will call it 'new schema') ######

In [15]:
# Start from the phase 1 schema; the following cells rewrite it in place.
new_schema = None
with open("../../preprocessed/mibig_schema_phase_1.json") as json_file:
    new_schema = json.loads(json_file.read()) # pre-load with old schema

In [16]:
# 1: convert 'required' to Json Schema draft 7 style
def fix_required(input_dict):
    """Convert per-property ``"required": true`` flags to draft 7 lists, in place.

    For an object schema the names of all properties flagged
    ``"required": true`` are collected into the object's own ``required``
    list (the key is removed when the list is empty) and every property is
    processed recursively. For any other schema the ``required`` key is
    removed; array schemas additionally have their ``items`` processed.
    """
    node_type = input_dict.get("type")

    if node_type == "object":
        required_names = []
        input_dict["required"] = required_names
        for prop, child in input_dict["properties"].items():
            if "required" in child and child["required"] == True:
                required_names.append(prop)
            fix_required(child)
        if not required_names:
            del input_dict["required"]
        return

    if node_type == "array":
        fix_required(input_dict["items"])
    input_dict.pop("required", None)
# Convert the whole schema; fix_required edits new_schema in place.
fix_required(new_schema)

In [17]:
# 2: convert 'dependencies' to Json Schema draft 7 style
def fix_dependencies(input_dict):
    """Convert per-property ``dependencies`` strings to an object-level map, in place.

    For an object schema, each property whose ``dependencies`` value is a
    string naming a sibling property is appended to the list stored under
    that sibling's name in the object's own ``dependencies`` dict. A name
    that is not a sibling is reported with a printed message. The dict is
    removed again when it ends up empty, and every property is processed
    recursively. For any other schema the ``dependencies`` key is removed;
    array schemas additionally have their ``items`` processed.
    """
    node_type = input_dict.get("type")

    if node_type == "object":
        collected = {}
        input_dict["dependencies"] = collected
        siblings = input_dict["properties"]
        for prop, child in siblings.items():
            target = child.get("dependencies")
            if isinstance(target, str):
                if target in siblings:
                    collected.setdefault(target, []).append(prop)
                else:
                    print("Error: {} not found".format(target))
            fix_dependencies(child)
        if not collected:
            del input_dict["dependencies"]
        return

    if node_type == "array":
        fix_dependencies(input_dict["items"])
    input_dict.pop("dependencies", None)
# Convert the whole schema; fix_dependencies edits new_schema in place.
fix_dependencies(new_schema)

In [18]:
# 3: make sure 'enum' contain unique items, and remove all trailing white spaces
def fix_enum(input_dict):
    """Clean every ``enum`` list of a schema, in place.

    String members lose their leading and trailing whitespace, and
    duplicates are dropped. The first occurrence of each member keeps its
    position, so the saved schema is identical from one run to the next.
    Members that are not strings are kept as they are.

    Object schemas are walked through ``properties`` and array schemas
    through ``items``.
    """
    node_type = input_dict.get("type")
    if node_type == "object":
        for child in input_dict["properties"].values():
            fix_enum(child)
    elif node_type == "array":
        fix_enum(input_dict["items"])

    if "enum" in input_dict:
        seen = set()
        unique_items = []
        for item in input_dict["enum"]:
            if isinstance(item, str):
                item = item.strip()
            if item not in seen:
                seen.add(item)
                unique_items.append(item)
        input_dict["enum"] = unique_items
# Clean the enums of the whole schema; fix_enum edits new_schema in place.
fix_enum(new_schema)

In [19]:
# 4: manual (but reproducible) curations

In [20]:
# stamp the schema with its dialect, version and creation time
now = datetime.datetime.now()
new_schema["$schema"] = "http://json-schema.org/draft-07/schema#"
new_schema["$schema_version"] = "2.0"
new_schema["$schema_created"] = date2iso(now)

root_props = new_schema["properties"]
general_params_schema = root_props["general_params"]

# remove version, replace with created and modified (datetime)
del root_props["version"]
for stamp_name in ("created", "modified"):
    root_props[stamp_name] = { "type": "string", "format": "date-time" }

# top-level and general_params requirements
new_schema["required"] = ["created", "modified", "general_params", "personal"]
general_params_schema["required"] = ["mibig_accession", "biosyn_class", "compounds", "publications"]

# delete properties we don't need anymore (i.e. ones meant for AlpacaJS forms)
del root_props["personal"]["properties"]["submitter_institution"]["format"]
search_and_delete("default", new_schema)

# the per-class sections get lowercase names
for class_key in ("Polyketide", "NRP", "RiPP", "Terpene", "Saccharide", "Alkaloid", "Other"):
    rename_key(class_key, class_key.lower(), general_params_schema["properties"])

# publications.pattern doesn't cover doi entries;
# for now, just delete the pattern matching constraint
del general_params_schema["properties"]["publications"]["items"]["pattern"]

In [21]:
### general_params.compounds ###

compound_schema = new_schema["properties"]["general_params"]["properties"]["compounds"]["items"]

# remove database_deposited and databases_deposited. we can always infer it from their respective accession ids
del compound_schema["properties"]["database_deposited"]
del compound_schema["properties"]["databases_deposited"]
# drop the old-format dependencies of a compound
del compound_schema["dependencies"]
compound_schema["required"] = ["compound", "evidence_struct", "chem_act"]
# when chem_act contains "Other", other_chem_act becomes mandatory
compound_schema["allOf"] = [{
    "if": {
        "properties": {"chem_act": {"contains":{"enum": ["Other"]}}},
        "required": ["chem_act"]
    },
    "then": {
        "required": ["other_chem_act"]
    }
}]

# same treatment for chem_moieties: drop the old dependencies,
# and require other_chem_moiety when chem_moiety is "Other"
moiety_schema = compound_schema["properties"]["chem_moieties"]["items"]
del moiety_schema["dependencies"]
moiety_schema["allOf"] = [{
    "if": {
        "properties": {"chem_moiety": {"enum": ["Other"]}},
        "required": ["chem_moiety"]
    },
    "then": {
        "required": ["other_chem_moiety"]
    }
}]

In [22]:
### general_params.loci ###

loci_schema = new_schema["properties"]["general_params"]["properties"]["loci"]
nucl_acc_item = loci_schema["properties"]["nucl_acc"]["items"]

# move conn_comp_cluster from nucl_acc to loci (a cluster should only have one conn_comp_cluster)
loci_schema["properties"]["conn_comp_cluster"] = nucl_acc_item["properties"].pop("conn_comp_cluster")
loci_schema["required"] = ["complete", "nucl_acc", "conn_comp_cluster"]

# 'Accession' becomes 'accession' and is the only mandatory field of a nucl_acc item
rename_key("Accession", "accession", nucl_acc_item["properties"])
nucl_acc_item["required"] = ["accession"]

In [23]:
### general_params.genes ###

genes_props = new_schema["properties"]["general_params"]["properties"]["genes"]["properties"]
gene_schema = genes_props["gene"]["items"]

# mandatory fields of an operon and of a gene
genes_props["operon"]["items"]["required"] = ["operon_genes", "evidence_operon"]
gene_schema["required"] = ["gene_function"]
# add nucl_acc, to specify which of the supplied accessions does this gene belongs to
gene_schema["properties"]["nucl_acc"] = { "type": "string" }
# delete gene.not_in_gbk, we can infer it from gene_id
del gene_schema["properties"]["not_in_gbk"]
# delete old dependencies format, replace with allOf
del gene_schema["dependencies"]
gene_schema["allOf"] = []
""" -- disable this? then gene can be exist without positional information
# if gene_id is not supplied, require gene_startpos,gene_endpos,gene_name,accession
new_schema["properties"]["general_params"]["properties"]["genes"]["properties"]["gene"]["items"]["allOf"].append({
    "if": {
            "not": {"required": ["gene_id"]}
          },
          "then": {
            "required": ["gene_startpos", "gene_endpos", "gene_name", "nucl_acc"]
          },
})
"""
gene_schema["allOf"].extend([
    # if gene_function != Unknown, require evidence_genefunction
    {
        "if": {
            "not": {"properties": {"gene_function": {"enum": ["Unknown"]}}},
            "required": ["gene_function"]
        },
        "then": {
            "required": ["evidence_genefunction"]
        }
    },
    # if gene_function == Tailoring, require tailoring
    {
        "if": {
            "properties": {"gene_function": {"enum": ["Tailoring"]}},
            "required": ["gene_function"]
        },
        "then": {
            "required": ["tailoring"]
        }
    },
])

In [24]:
### general_params.polyketide ###
# for now, just remove all dependencies and requirements
polyketide_schema = new_schema["properties"]["general_params"]["properties"]["polyketide"]
search_and_delete("required", polyketide_schema)
search_and_delete("dependencies", polyketide_schema)

# add evidence_at_spec = "None"
# Only append when it is missing: enum members are supposed to be unique, and
# an unconditional append would duplicate the value if this cell runs twice
# or the schema already lists it.
evidence_at_spec_enum = polyketide_schema["properties"]["mod_pks_genes"]["items"]["properties"]["pks_module"]["items"]["properties"]["evidence_at_spec"]["enum"]
if "None" not in evidence_at_spec_enum:
    evidence_at_spec_enum.append("None")
# NOTE: the block below is disabled code kept as a bare string literal, so
# nothing in it is executed. Being the last expression of the cell, the
# string is echoed as the cell's output.
# NOTE(review): inside the draft, the "# required = [...]" remark lists other
# fields than the assignment right below it -- reconcile before enabling.
""" TODO: fix Polyketide schema (requirements, dependencies)
# required = ["pk_subclass", "pks_subclass", "lin_cycl_pk", "starter_unit", "pks_genes", "cyclases"]
new_schema["properties"]["general_params"]["properties"]["polyketide"]["required"] = ["pk_subclass", "pks_subclass", "pks_te_type", "lin_cycl_pk", "starter_unit", "ketide_length"]

# delete old dependencies format, replace with allOf
del new_schema["properties"]["general_params"]["properties"]["polyketide"]["dependencies"]
new_schema["properties"]["general_params"]["properties"]["polyketide"]["allOf"] = []
new_schema["properties"]["general_params"]["properties"]["polyketide"]["allOf"].append({
    "if": {
            "properties": {"pks_subclass": {"enum": ["Modular type I"]}},
            "required": ["pks_subclass"]
          },
          "then": {
            "required": ["mod_pks_genes"],
            "properties": {"mod_pks_genes": {"minItems": 1}}
          }
})
new_schema["properties"]["general_params"]["properties"]["polyketide"]["allOf"].append({
    "if": {
            "properties": {"pks_subclass": {"enum": ["Trans-AT type I"]}},
            "required": ["pks_subclass"]
          },
          "then": {
            "required": ["trans_at"],
            "properties": {"trans_at": {"minItems": 1}}
          }
})
new_schema["properties"]["general_params"]["properties"]["polyketide"]["allOf"].append({
    "if": {
            "properties": {"pks_subclass": {"enum": ["Iterative type I"]}},
            "required": ["pks_subclass"]
          },
          "then": {
            "required": ["iterative_subtype", "nr_iterations", "iter_cycl_type"]
          }
})
new_schema["properties"]["general_params"]["properties"]["polyketide"]["allOf"].append({
    "if": {
            "properties": {"pks_subclass": {"enum": ["PUFA synthase or related"]}},
            "required": ["pks_subclass"]
          },
          "then": {
            "required": ["pufa_mod_doms"],
            "properties": {"pufa_mod_doms": {"minItems": 1}}
          }
})
new_schema["properties"]["general_params"]["properties"]["polyketide"]["allOf"].append({
    "if": {
            "not": { "properties": {"pks_te_type": {"enum": ["None"]}} },
            "required": ["pks_te_type"]
          },
          "then": {
            "required": ["pks_thioesterase"],
            "properties": {"pks_thioesterase": {"minItems": 1}}
          }
})
"""
# --- todo: assert dependencies of pks_subtype --> requirements
# --- todo: apply dependencies for mod_pks_genes

' TODO: fix Polyketide schema (requirements, dependencies)\n# required = ["pk_subclass", "pks_subclass", "lin_cycl_pk", "starter_unit", "pks_genes", "cyclases"]\nnew_schema["properties"]["general_params"]["properties"]["polyketide"]["required"] = ["pk_subclass", "pks_subclass", "pks_te_type", "lin_cycl_pk", "starter_unit", "ketide_length"]\n\n# delete old dependencies format, replace with allOf\ndel new_schema["properties"]["general_params"]["properties"]["polyketide"]["dependencies"]\nnew_schema["properties"]["general_params"]["properties"]["polyketide"]["allOf"] = []\nnew_schema["properties"]["general_params"]["properties"]["polyketide"]["allOf"].append({\n    "if": {\n            "properties": {"pks_subclass": {"enum": ["Modular type I"]}},\n            "required": ["pks_subclass"]\n          },\n          "then": {\n            "required": ["mod_pks_genes"],\n            "properties": {"mod_pks_genes": {"minItems": 1}}\n          }\n})\nnew_schema["properties"]["general_params"]["p

In [25]:
### general_params.nrp ###
# for now, just remove all dependencies and requirements
for obsolete_keyword in ("required", "dependencies"):
    search_and_delete(obsolete_keyword, new_schema["properties"]["general_params"]["properties"]["nrp"])
# NOTE: the block below is disabled code kept as a bare string literal, so
# nothing in it is executed. Being the last expression of the cell, the
# string is echoed as the cell's output.
""" TODO: fix NRP schema (requirements, dependencies)
# delete old dependencies format, replace with allOf
del new_schema["properties"]["general_params"]["properties"]["nrp"]["dependencies"]
new_schema["properties"]["general_params"]["properties"]["nrp"]["allOf"] = []
new_schema["properties"]["general_params"]["properties"]["nrp"]["allOf"].append({
    "if": {
            "not": { "properties": {"nrps_te_type": {"enum": ["None"]}} },
            "required": ["nrps_te_type"]
          },
          "then": {
            "required": ["nrps_thioesterase"],
            "properties": {"nrps_thioesterase": {"minItems": 1}}
          }
})
new_schema["properties"]["general_params"]["properties"]["nrp"]["allOf"].append({
    "if": {
            "properties": {"subclass": {"enum": ["Other lipopeptide"]}},
            "required": ["subclass"]
          },
          "then": {
            "required": ["lipid_moiety"]
          }
})

# nrps_module -- required = ["module_nr", "cdom_subtype", "nrps_mod_doms"]
new_schema["properties"]["general_params"]["properties"]["nrp"]["properties"]["nrps_genes"]["items"]["properties"]["nrps_module"]["items"]["required"] = ["module_nr", "cdom_subtype", "nrps_mod_doms"]
# nrps_module -- delete old dependencies, replace with allOf
del new_schema["properties"]["general_params"]["properties"]["nrp"]["properties"]["nrps_genes"]["items"]["properties"]["nrps_module"]["items"]["dependencies"]
new_schema["properties"]["general_params"]["properties"]["nrp"]["properties"]["nrps_genes"]["items"]["properties"]["nrps_module"]["items"]["allOf"] = []
new_schema["properties"]["general_params"]["properties"]["nrp"]["properties"]["nrps_genes"]["items"]["properties"]["nrps_module"]["items"]["allOf"].append({
    "if": {
            "properties": {"nrps_mod_doms": {"enum": ["Other"]}},
            "required": ["nrps_mod_doms"]
          },
          "then": {
            "required": ["nrps_other_mod_dom"]
          }    
})
new_schema["properties"]["general_params"]["properties"]["nrp"]["properties"]["nrps_genes"]["items"]["properties"]["nrps_module"]["items"]["allOf"].append({
    "if": {
            "not": {"properties": {"nrps_mod_skip_iter": {"enum": ["Neither"]}}},
            "required": ["nrps_mod_skip_iter"]
          },
          "then": {
            "required": ["nrps_evidence_skip_iter"]
          }    
})
"""
## todo -- assert dependencies

' TODO: fix NRP schema (requirements, dependencies)\n# delete old dependencies format, replace with allOf\ndel new_schema["properties"]["general_params"]["properties"]["nrp"]["dependencies"]\nnew_schema["properties"]["general_params"]["properties"]["nrp"]["allOf"] = []\nnew_schema["properties"]["general_params"]["properties"]["nrp"]["allOf"].append({\n    "if": {\n            "not": { "properties": {"nrps_te_type": {"enum": ["None"]}} },\n            "required": ["nrps_te_type"]\n          },\n          "then": {\n            "required": ["nrps_thioesterase"],\n            "properties": {"nrps_thioesterase": {"minItems": 1}}\n          }\n})\nnew_schema["properties"]["general_params"]["properties"]["nrp"]["allOf"].append({\n    "if": {\n            "properties": {"subclass": {"enum": ["Other lipopeptide"]}},\n            "required": ["subclass"]\n          },\n          "then": {\n            "required": ["lipid_moiety"]\n          }\n})\n\n# nrps_module -- required = ["module_nr", "cd

In [26]:
### general_params.saccharide ###

saccharide_props = new_schema["properties"]["general_params"]["properties"]["saccharide"]["properties"]
gt_gene_schema = saccharide_props["gt_genes"]["items"]

# delete old dependencies format, replace with allOf
del gt_gene_schema["dependencies"]
gt_gene_schema["allOf"] = [
    # gt_specificity == Other -> other_gt_spec is mandatory
    {
        "if": {
            "properties": {"gt_specificity": {"enum": ["Other"]}},
            "required": ["gt_specificity"]
        },
        "then": {
            "required": ["other_gt_spec"]
        }
    },
    # gt_specificity != Unknown -> evidence_gt_spec is mandatory
    {
        "if": {
            "not": { "properties": {"gt_specificity": {"enum": ["Unknown"]}} },
            "required": ["gt_specificity"]
        },
        "then": {
            "required": ["evidence_gt_spec"]
        }
    },
]

# move 'gt_genes.sugar_subcluster' to sugar_subclusters, we can infer the specificities from the list of gene ids
saccharide_props["sugar_subclusters"] = {
    "title": "Sub-clusters for sugar biosynthesis",
    "type": "array",
    "items": gt_gene_schema["properties"].pop("sugar_subcluster")
}


In [27]:
### for all 'required' fields, if it is an array, specify 'minItems = 1' so that it can't be empty
def apply_array_required(input_dict):
    """Add ``minItems: 1`` to every required array property of a schema, in place.

    Walks the schema through ``properties`` (and through ``items`` for nodes
    that have no ``properties``). For each name listed in a node's ``required``
    whose property schema has ``type == "array"``, ``minItems`` is set to 1
    unless the property already defines its own ``minItems``.

    Args:
        input_dict: a (sub)schema. Anything that is not a dict (for example a
            boolean schema such as ``"items": True``) is ignored.

    Returns:
        None. The schema is modified in place.
    """
    # Boolean schemas and other non-dict nodes carry nothing to tighten.
    if not isinstance(input_dict, dict):
        return

    if "properties" in input_dict:
        properties = input_dict["properties"]
        for req in input_dict.get("required", []):
            # 'required' may name a key that is not declared under
            # 'properties'; there is no sub-schema to update in that case.
            prop_schema = properties.get(req)
            if not isinstance(prop_schema, dict):
                continue
            if prop_schema.get("type") == "array":
                # keep an explicit minItems if the schema already has one
                prop_schema.setdefault("minItems", 1)
        for key in properties:
            apply_array_required(properties[key])
    elif "items" in input_dict:
        apply_array_required(input_dict["items"])
                                
# Run over the whole schema; new_schema is modified in place (the call returns nothing).
apply_array_required(new_schema)

In [28]:
# if class = "Polyketide", requires "polyketide" attribute, etc. except if minimal = true
_general_params_schema = new_schema["properties"]["general_params"]
del _general_params_schema["dependencies"]
_general_params_schema["allOf"] = []

# (value found in biosyn_class, name of the class-specific attribute it calls for)
prop_attr_pairs = [
    ("NRP", "nrp"),
    ("Polyketide", "polyketide"),
    ("RiPP", "ripp"),
    ("Terpene", "terpene"),
    ("Saccharide", "saccharide"),
    ("Alkaloid", "alkaloid"),
    ("Other", "other")
]
for class_name, attr_name in prop_attr_pairs:
    consequence = {"required": [attr_name]}
    class_schema = _general_params_schema["properties"][attr_name]
    if "required" in class_schema:
        # Move the class block's own 'required' list into the conditional rule,
        # so it is only enforced when the rule's "if" part matches.
        consequence["properties"] = {attr_name: {"required": class_schema.pop("required")}}
    _general_params_schema["allOf"].append({
        "if": {
            # rule applies unless 'minimal' is present and true ...
            "not": {"properties": {"minimal": {"const": True}}, "required": ["minimal"]},
            # ... and biosyn_class lists this class
            "properties": {"biosyn_class": {"contains": {"enum": [class_name]}}}
        },
        "then": consequence
    })

In [29]:
# 5: save new schema
# Serialise straight into the file handle, same 4-space indented layout as before.
with open("../../outputs/mibig_schema_draft7.json", "w") as schema_out:
    json.dump(new_schema, schema_out, indent=4, separators=(',', ': '))

In [30]:
#### phase 5: transform data from phase 3 to match the new schema (include all dependencies/required keywords) ######

In [31]:
# 1: check data vs new schema to get quick overview of changed structures
# use all_props from phase 3
with open("../../outputs/mibig_schema_draft7.json") as json_file:
    json_obj = json.load(json_file)
schema_props = fetch_props_new_schema(json_obj, "", {})

# Report 1: every property path the new schema knows about, one per line.
# NOTE(review): the original format call also passed schema_props[key] but the
# template has a single placeholder, so only the path was ever written. Confirm
# whether a second column was intended.
with open("../../preprocessed/schema_draft7_properties.csv", "w") as o:
    o.writelines("{}\n".format(prop_path) for prop_path in sorted(schema_props.keys()))
    print("File written: {}".format(o.name))

# Property paths seen in the phase 3 data that the schema does not define,
# paired with the list of files in which each one occurs.
not_in_schema = [
    (prop_path, all_props_phase_3[prop_path])
    for prop_path in sorted(all_props_phase_3.keys())
    if prop_path not in schema_props
]

# Report 2: unknown path and number of affected files, most frequent first.
with open("../../preprocessed/data_phase_3_vs_schema_draft7.csv", "w") as o:
    for prop_path, files in sorted(not_in_schema, key=lambda pair: len(pair[1]), reverse=True):
        o.write("{},{}\n".format(prop_path, len(files)))
    print("File written: {}".format(o.name))

# Report 3: the same information pivoted per file (file -> unknown paths).
bgc_to_fix = {}
for prop_path, files in not_in_schema:
    for bgc in files:
        bgc_to_fix.setdefault(bgc, []).append(prop_path)
with open("../../preprocessed/bgc_to_fix_phase_5.csv", "w") as o:
    for bgc, prop_paths in bgc_to_fix.items():
        o.write("{},{}\n".format(bgc, ";".join(prop_paths)))
    print("File written: {}".format(o.name))

File written: ../../preprocessed/schema_draft7_properties.csv
File written: ../../preprocessed/data_phase_3_vs_schema_draft7.csv
File written: ../../preprocessed/bgc_to_fix_phase_5.csv


In [32]:
# 2: fix data and assert that there is no more unrecognized attributes present
def match_attributes_to_schema_7(data):
    """Rename, move and drop attributes of one entry so its keys match the new schema.

    Works in place on `data` and returns None. Steps, in order:
      * drop the top-level 'version' key (via del_key);
      * per nucl_acc item: rename 'Accession' to 'accession' and pull its
        'conn_comp_cluster' values up into a single de-duplicated list stored
        at loci level (only written when at least one value was found);
      * rename the class blocks ('Polyketide', 'NRP', ...) to lowercase;
      * delete 'database_deposited' / 'databases_deposited' from every compound;
      * drop 'not_in_gbk' from every gene (via del_key);
      * collect each gt_gene's 'sugar_subcluster' into
        saccharide['sugar_subclusters'] (only written when at least one was found).
    """
    # remove 'version'
    del_key("version", data)
    general = data["general_params"]

    # fix /general_params/loci/nucl_acc[]/conn_comp_cluster[]
    merged_conn_comp = []
    for nucl in general["loci"]["nucl_acc"]:
        # rename Accession to accession
        rename_key("Accession", "accession", nucl)
        if "conn_comp_cluster" not in nucl:
            continue
        for evidence in nucl["conn_comp_cluster"]:
            if evidence not in merged_conn_comp:
                merged_conn_comp.append(evidence)
        del nucl["conn_comp_cluster"]
    if merged_conn_comp:
        general["loci"]["conn_comp_cluster"] = merged_conn_comp

    # rename Polyketide, NRP, etc. to its lowercase version
    for class_key in ("Polyketide", "NRP", "RiPP", "Terpene", "Saccharide", "Alkaloid", "Other"):
        rename_key(class_key, class_key.lower(), general)

    # fix /general_params/compounds[]/database_deposited
    # fix /general_params/compounds[]/databases_deposited[]
    for compound in general["compounds"]:
        for stale_key in ("database_deposited", "databases_deposited"):
            if stale_key in compound:
                del compound[stale_key]

    # fix /general_params/genes/gene[]/not_in_gbk
    if "genes" in general and "gene" in general["genes"]:
        for gene in general["genes"]["gene"]:
            del_key("not_in_gbk", gene)

    # fix /general_params/saccharide/gt_genes[]/sugar_subcluster[]
    if "saccharide" in general:
        saccharide = general["saccharide"]
        collected_subclusters = []
        if "gt_genes" in saccharide:
            for gt_gene in saccharide["gt_genes"]:
                if "sugar_subcluster" in gt_gene:
                    collected_subclusters.append(gt_gene.pop("sugar_subcluster"))
        if collected_subclusters:
            saccharide["sugar_subclusters"] = collected_subclusters
    return

if not path.exists("../../preprocessed/json_1.4_phase_5/"):
    makedirs("../../preprocessed/json_1.4_phase_5/")

# Rewrite every phase 3 entry into the phase 5 folder under the same file name.
for json_path in sorted(fetch_mibig_json_filepaths("../../preprocessed/json_1.4_phase_3/")):
    with open(json_path, "r") as json_file:
        json_obj = json.load(json_file)
    match_attributes_to_schema_7(json_obj)
    with open(path.join("../../preprocessed/json_1.4_phase_5/", path.basename(json_path)), "w") as json_file:
        json.dump(json_obj, json_file, indent=4, separators=(',', ': '))

# Index the rewritten entries: property path -> names of the files that contain it.
all_props_phase_5 = {}
for json_path in sorted(fetch_mibig_json_filepaths("../../preprocessed/json_1.4_phase_5/")):
    with open(json_path, "r") as json_file:
        json_obj = json.load(json_file)
    file_name = path.basename(json_path)
    for prop in count_props(json_obj, "", {}):
        all_props_phase_5.setdefault(prop, []).append(file_name)

# Any path still missing from the schema is a conflict; list them with their file counts.
with open("../../outputs/mibig_schema_draft7.json") as json_file:
    json_obj = json.load(json_file)
schema_props = fetch_props_new_schema(json_obj, "", {})
not_in_schema = []
for key in sorted(all_props_phase_5.keys()):
    if key in schema_props:
        continue
    not_in_schema.append(key)
    print("{},{}".format(key, len(all_props_phase_5[key])))
print("Number of conflicts: {}".format(len(not_in_schema)))

Number of conflicts: 0


In [33]:
# 3: validate data using JSON Schema V7 validator
with open("../../outputs/mibig_schema_draft7.json") as json_file:
    schema_obj = json.load(json_file)
validator = Draft7Validator(schema_obj)
errors = {}             # schema path of the failing rule -> {"files": [...], "instances": [...]}
errors_by_message = {}  # validator message -> files in which it occurred
for json_path in sorted(fetch_mibig_json_filepaths("../../preprocessed/json_1.4_phase_5/")):
    # NOTE(review): this unconditional break leaves the loop before any file is
    # validated, so both dicts above stay empty. Remove it to run the validation.
    break
    bgc_id = path.basename(json_path)
    print("Validating {}...".format(bgc_id))
    with open(json_path, "r") as json_file:
        data = json.load(json_file)
    for error in sorted(validator.iter_errors(data), key=str):
        # error by path
        path_err = ".".join(str(step) for step in error.schema_path)
        # only scalar instances are recorded; containers are logged as ""
        inst_err = "" if isinstance(error.instance, (dict, list)) else str(error.instance)
        bucket = errors.setdefault(path_err, {"files": [], "instances": []})
        if bgc_id not in bucket["files"]:
            bucket["files"].append(bgc_id)
        if inst_err not in bucket["instances"]:
            bucket["instances"].append(inst_err)
        # error by message
        files_for_message = errors_by_message.setdefault(error.message, [])
        if bgc_id not in files_for_message:
            files_for_message.append(bgc_id)

#with open("../../preprocessed/phase_5_errors.tsv", "w") as error_list:
#    for error in sorted(errors.keys(), reverse = True):
#        error_list.write("{}\t{}\t{}\t{}\n".format(error, error.split(".")[-1], len(errors[error]["files"]), ";".join(errors[error]["instances"])))

#with open("../../preprocessed/phase_5_errors_by_message.tsv", "w") as error_list:
#    for error in sorted(errors_by_message.keys(), reverse = True):
#        error_list.write("{}\t{}\n".format(error, len(errors_by_message[error])))
            

In [64]:
# 4: fix conflicts, then save to final output folder
def fix_data_new_schema(data, error):
    """Apply a hand-curated repair to `data` for a single schema validation error.

    The function dispatches on ``error.validator`` ("type", "minimum",
    "minItems", "enum", "required", "pattern") and then on the name of the
    offending attribute, and edits `data` in place. Errors for which no rule
    exists are left untouched.

    Args:
        data: the parsed JSON entry being repaired (modified in place).
        error: a validation error exposing ``path`` (a deque of keys/indices
            leading to the offending instance), ``validator``,
            ``validator_value`` and ``instance``. ``error.path`` is consumed
            (emptied with ``popleft``) while the offending node is located.

    Returns:
        None.

    Relies on ``ToDelete``, ``date2iso`` and ``now`` being defined elsewhere in
    the notebook.
    """
    if len(error.path) < 1:
        # problem is in root, need a separate approach
        if error.validator == "required":
            missing_keys = set(error.validator_value) - set(data.keys())
            if "created" in missing_keys:
                data["created"] = date2iso(now)
            if "modified" in missing_keys:
                data["modified"] = date2iso(now)
            if "personal" in missing_keys:
                data["personal"] = {
                    "submitter_name": "mibig.secondarymetabolites.org",
                    "submitter_institution": "MIBiG",
                    "submitter_email": "info@mibig.secondarymetabolites.org"
                }
    else:
        # get problematic parent instance from data (so that we can fix it)
        error_container = data
        error_container_parent = None # for catching grandparent
        error_container_attribute = None
        while len(error.path) > 1:
            if len(error.path) == 2:
                # the node we are about to leave is the parent of the final container
                error_container_parent = error_container
            error_container_attribute = error.path.popleft()
            error_container = error_container[error_container_attribute] # parent node containing the error instance
        error_attribute = error.path.popleft() # attribute from parent node containing the error instance

        # nodes already replaced by a ToDelete marker need no further fixing
        if isinstance(error_container, ToDelete):
            return
        elif isinstance(error_container[error_attribute], ToDelete):
            return

        # function to replace attribute values
        def replace_attr(attr_pairs):
            """Replace the offending value using the first (from, to) pair whose
            'from' equals error.instance; return True if a pair matched."""
            for attr_from, attr_to in attr_pairs:
                if error.instance == attr_from:
                    error_container[error_attribute] = attr_to
                    return True
            return False

        # fix type errors (should be generalizable)
        if error.validator == "type":
            if error.validator_value == "integer":
                try:
                    error_container[error_attribute] = int(error.instance)
                    return
                except (TypeError, ValueError, OverflowError):
                    # value is not convertible; may need tailored fix
                    pass
            elif error.validator_value == "number":
                try:
                    error_container[error_attribute] = float(error.instance)
                    return
                except (TypeError, ValueError, OverflowError):
                    # value is not convertible; may need tailored fix
                    pass
            elif error.validator_value == "string":
                # fix mut_pheno, error list shown that these are None/null values, delete the attribute instead
                if error_attribute == "mut_pheno":
                    del error_container[error_attribute]
                # fix gene_function
                elif error_attribute == "gene_function":
                    error_container[error_attribute] = "Unknown"
                # fix gene_annotation, name, id
                elif error_attribute in ["gene_annotation", "gene_name", "gene_id"]:
                    del error_container[error_attribute]
                # fix comments, gene_comments
                elif error_attribute in ["comments", "gene_comments"]:
                    error_container[error_attribute] = ""

        # fix minimum errors (generalizable, but needs to be careful)
        elif error.validator == "minimum":
            # fix gene.startpos
            if error_attribute == "gene_startpos":
                error_container[error_attribute] = 0
            # fix gene.endpos
            if error_attribute == "gene_endpos":
                error_container[error_attribute] = 0

        # fix minItems errors (generalizable but limited e.g. need to consider 'required')
        elif error.validator == "minItems":
            # fix operon_genes = [], then delete operon
            if error_attribute == "operon_genes":
                del data["general_params"]["genes"]["operon"]
            # fix ripp..gene_id = []
            elif error_attribute == "gene_id":
                del data["general_params"]["ripp"]["precursor_loci"]
                data["general_params"]["minimal"] = True

        # fix enum errors (semi-generalizable, needs to know what to put in place of the wrong value)
        elif error.validator == "enum":
            # NOTE(review): the first three checks below are independent `if`s.
            # After any of them, the chain starting at evidence_genefunction is
            # still evaluated, including its final catch-all `else`, which may
            # overwrite the value again when the instance is "N/A"/"None"/"".
            # Confirm this fall-through is intended.
            # fix biosyn_class
            if error_container_attribute == "biosyn_class":
                attr_pairs = []
                if not replace_attr(attr_pairs):
                    error_container[error_attribute] = "Other"
                    pass
            # fix genes..gene_function
            if error_attribute == "gene_function":
                attr_pairs = [
                    ("", "Unknown"),
                    ("None", "Unknown"),
                    ("Scaffold Biosynthesis", "Scaffold biosynthesis")
                ]
                if not replace_attr(attr_pairs):
                    pass
            # fix genes..tailoring
            if error_attribute == "tailoring":
                attr_pairs = [
                    ("None", "Unknown"),
                ]
                if not replace_attr(attr_pairs):
                    error_container[error_attribute] = "Other"
                    pass
            # fix genes..evidence_genefunction
            if error_container_attribute == "evidence_genefunction":
                # free-text evidence: normalise, then look for known keywords
                evidences = []
                errorinstance = error.instance.replace(" ", "").replace("-", "").lower()
                if "activityassay" in errorinstance:
                    evidences.append("Activity assay")
                if "knockout" in errorinstance:
                    evidences.append("Knock-out")
                if "invivo" in errorinstance:
                    evidences.append("Other in vivo study")
                if "sequence" in errorinstance:
                    evidences.append("Sequence-based prediction")
                if "expression" in errorinstance:
                    evidences.append("Heterologous expression")
                if len(evidences) < 1:
                    evidences.append("Other")
                # the first match replaces the bad item in place ...
                error_container[error_attribute] = evidences[0]
                if len(evidences) > 1:
                    # ... and only the remaining matches are appended, so the
                    # first one is not duplicated in the list
                    error_container.extend(evidences[1:])
            # fix loci..complete
            elif error_attribute == "complete":
                if error.instance == "partial":
                    error_container[error_attribute] = "incomplete"
            # fix loci..conn_comp_cluster
            elif error_container_attribute == "conn_comp_cluster":
                attr_pairs = [
                    ("Proven expression in natural host", "Gene expression correlated with compound production"),
                    ("Knock-outstudies", "Knock-out studies")
                ]
                if not replace_attr(attr_pairs):
                    pass
            # fix compounds..chem_act
            elif error_container_attribute == "chem_act":
                attr_pairs = [
                    ("", "Unknown")
                ]
                if not replace_attr(attr_pairs):
                    # keep the original free text next to the list, on the parent object
                    error_container[error_attribute] = "Other"
                    error_container_parent["other_chem_act"] = error.instance
                    pass
            # fix compounds..chem_moiety
            elif error_attribute == "chem_moiety":
                attr_pairs = []
                if not replace_attr(attr_pairs):
                    error_container[error_attribute] = "Other"
                    error_container["other_chem_moiety"] = error.instance
                    pass
            # fix compounds.mass_ion_type
            elif error_attribute == "mass_ion_type":
                error_container[error_attribute] = "Other"
            # fix polyketide.pks_subclass
            elif error_container_attribute == "pks_subclass":
                attr_pairs = [
                    ("Type I", "Modular type I"), # (aculeximycin)
                    ("Iterative typeI", "Iterative type I"),
                    ("Modular Type I", "Modular type I")
                ]
                if not replace_attr(attr_pairs):
                    pass
            # fix polyketide.starter_unit
            elif error_attribute == "starter_unit":
                attr_pairs = [
                    ("methylmalonate-CoA", "Other"),
                    ("None", "Unknown"),
                    ("4-hydroxyphenylpyruvate", "Other")
                ]
                if not replace_attr(attr_pairs):
                    pass
            # fix polyketide.pks_te_type
            elif error_attribute == "pks_te_type":
                attr_pairs = [
                    ("other", "Other"),
                ]
                if not replace_attr(attr_pairs):
                    pass
            # fix polyketide..pks_domains
            elif error_container_attribute == "pks_domains":
                attr_pairs = [
                    ("AT", "Acyltransferase"),
                    ("DH", "Dehydratase"),
                    ("KR", "Ketoreductase"),
                    ("ACP", "Thiolation (ACP/PCP)"),
                    ("PCP", "Thiolation (ACP/PCP)"),
                    ("T", "Thiolation (ACP/PCP)"),
                    ("CAL", "CoA-ligase"),
                    ("ER", "Enoylreductase"),
                    ("KS", "Ketosynthase")
                ]
                if not replace_attr(attr_pairs):
                    # what to do? TE;PT;SAT;PPTASE;A;E;TE/CLC;CMET;FAAL;ST;C;TR
                    # (lazily) delete?
                    error_container[error_attribute] = ToDelete()
                    pass
            # fix mod_pks_genes..kr_stereochem, A->S->D-OH, B->R->L-OH
            elif error_attribute == "kr_stereochem":
                attr_pairs = [
                    ("A-group", "D-OH"),
                    ("B-group", "L-OH")
                ]
                if not replace_attr(attr_pairs):
                    pass
            # fix mod_pks_genes..at_substr_spec
            elif error_attribute == "at_substr_spec":
                attr_pairs = [
                    ("malonyl-CoA", "Malonyl-CoA"),
                    ("methylmalonyl-CoA", "Methylmalonyl-CoA"),
                    ("Malonyl-CoA/Malonyl-CoA/Malonyl-CoA", "Malonyl-CoA"),
                    ("Methylmalonyl-CoA/Methylmalonyl-CoA", "Methylmalonyl-CoA"),
                    ("N/A", "None")
                ]
                if not replace_attr(attr_pairs):
                    if error.instance == "Acetyl-CoA/Methylmalonyl-CoA":
                        error_container[error_attribute] = "Multiple (promiscuous)"
                        error_container["at_multiple_spec"] = ["Acetyl-CoA", "Methylmalonyl-CoA"]
                    elif error.instance == "Methylmalonyl-CoA/Malonyl-CoA":
                        error_container[error_attribute] = "Multiple (promiscuous)"
                        error_container["at_multiple_spec"] = ["Malonyl-CoA", "Methylmalonyl-CoA"]
                    elif error.instance == "Acetyl-CoA + Malonyl CoA":
                        error_container[error_attribute] = "Multiple (promiscuous)"
                        error_container["at_multiple_spec"] = ["Malonyl-CoA", "Acetyl-CoA"]
                    else:
                        # what to do? 4-hydroxyphenylpyruvate;Various atypical acyl-CoAs;phenylacetate-like;Decanoyl-CoA
                        # set to 'Other'
                        error_container[error_attribute] = "Other"
                    pass
            # fix mod_pks_genes..evidence_at_spec
            elif error_attribute == "evidence_at_spec":
                attr_pairs = [
                    ("Feeding study", "Other")
                ]
                if not replace_attr(attr_pairs):
                    pass
            # fix mod_pks_genes..pks_mod_doms
            elif error_container_attribute == "pks_mod_doms":
                attr_pairs = []
                if not replace_attr(attr_pairs):
                    error_container[error_attribute] = "Other"
                    error_container_parent["pks_other_mod_dom"] = error.instance
                    pass
            # fix nrps_modules..nrps_mod_doms
            elif error_attribute == "nrps_mod_doms":
                attr_pairs = []
                if not replace_attr(attr_pairs):
                    error_container[error_attribute] = "Other"
                    error_container["nrps_other_mod_dom"] = error.instance
                    pass
            # fix nrps_modules..prot_adom_spec
            elif error_attribute == "prot_adom_spec":
                if error.instance == "Asparigine":
                    error_container[error_attribute] = "Asparagine"
            # fix nrp..cdom_subtype
            elif error_attribute == "cdom_subtype":
                attr_pairs = [
                    ("N/A", "Unknown"),
                    ("None", "Unknown")
                ]
                if not replace_attr(attr_pairs):
                    pass
            # fix nrp..nonprot_adom_spec
            elif error_attribute == "nonprot_adom_spec":
                # a bit too random, consider freeforming this?
                error_container[error_attribute] = "Other"
                error_container["other_spec"] = error.instance
            # fix ripp..ripp_subclass
            elif error_attribute == "ripp_subclass":
                attr_pairs = [
                    ("Lantipeptide", "Lanthipeptide"),
                    ("Head-To-Tail Cyclized Peptide", "Head-to-tailcyclized peptide"),
                    ("Lap", "LAP"),
                    ("Lap / Microcin", "LAP"),
                    ("Lasso Peptide", "Lassopeptide"),
                    ("None", "Other")
                ]
                if not replace_attr(attr_pairs):
                    pass
            # fix ripp..lin_cycl_ripp
            elif error_attribute == "lin_cycl_ripp":
                attr_pairs = [
                    ("linear", "Linear")
                ]
                if not replace_attr(attr_pairs):
                    pass
            # fix saccharide..saccharide_subclass
            elif error_attribute == "saccharide_subclass":
                if error.instance == "hyrbid/tailoring":
                    error_container[error_attribute] = "hybrid/tailoring"
            # fix saccharide..gt_specificity
            elif error_attribute == "gt_specificity":
                if error.instance == "None":
                    error_container[error_attribute] = "Unknown"
                else:
                    error_container[error_attribute] = "Other"
                    error_container["other_gt_spec"] = error.instance
            # fix saccharide..evidence_gt_spec
            elif error_attribute == "evidence_gt_spec":
                if error.instance == "structure-based inference":
                    error_container[error_attribute] = "Structure-based inference"
            # fix other.other_subclass
            elif error_attribute == "other_subclass":
                error_container[error_attribute] = "Other"
            ## try to fix "None"/"N/A"/"" --> "Unknown"
            else:
                attr_to_use = "Unknown"
                # attributes for which the placeholder value is "Other" instead of "Unknown"
                use_others_instead = [
                    "crosslink_type",
                    "evidence_a_spec",
                    "nrps_evidence_skip_iter",
                    "terpene_subclass",
                    "terpene_c_len",
                    "terpene_precursor",
                    "pk_subclass",
                    "subclass",
                    "pks_evidence_skip_iter"
                ]
                if error_attribute in use_others_instead:
                    attr_to_use = "Other"
                attr_pairs = [
                    ("N/A", attr_to_use),
                    ("None", attr_to_use),
                    ("", attr_to_use)
                ]
                if not replace_attr(attr_pairs):
                    pass

        # fix requirement errors (needs hand-on approach)
        elif error.validator == "required":
            missing_keys = set(error.validator_value) - set(error_container[error_attribute].keys())
            # fix saccharide subclasses
            if "saccharide" in missing_keys:
                # if PKS+Saccharide, assume it is a tailoring GT
                if "Polyketide" in data["general_params"]["biosyn_class"]:
                    data["general_params"]["saccharide"] = { "saccharide_subclass": "hybrid/tailoring" }
                # if NR+Saccharide, assume it is a tailoring GT
                elif "NRP" in data["general_params"]["biosyn_class"]:
                    data["general_params"]["saccharide"] = { "saccharide_subclass": "hybrid/tailoring" }
            # fix other subclasses
            if "other" in missing_keys:
                data["general_params"]["other"] = {"other_subclass": "Other"}
            # fix nrp subclasses
            if "nrp" in missing_keys:
                data["general_params"]["nrp"] = {}
            # fix polyketide subclasses
            if "polyketide" in missing_keys:
                data["general_params"]["polyketide"] = {}
            # fix publication
            if "publications" in missing_keys:
                # publications are looked up by the accession of the first locus
                ncbi_acc = data["general_params"]["loci"]["nucl_acc"][0]["accession"]
                def fix_publications(acc_pub_pairs):
                    """Set 'publications' from the first pair listing ncbi_acc;
                    return True if one was found."""
                    for ncbi_accs, pubs in acc_pub_pairs:
                        if ncbi_acc in ncbi_accs:
                            error_container[error_attribute]["publications"] = pubs
                            return True
                    return False
                acc_pub_pairs = [
                    (["AY510455"], ["16108793"]), # aflatoxin
                    (["AY092402"], ["19537208"]), # aflatoxin/sterigmatocystin
                    (["GP697151"], ["patent:US7595187"]), # Elaiophylin
                    (["FM173265"], ["19025863"]), # lasalocid
                    (["BD420675"], ["patent:CN1896226B"]), # Midecamycin
                    (["AB363939"], ["3372359"]), # nemadectin (LL-F28249)
                    (["CP000850"], ["19474814"]), # rifamycins
                    (["FN565166"], ["20140934"]), # chrysomycin
                    (["EU220288", "EU232693"], ["18802638"]), # anglomycin
                    (["AF141924", "AF141925"], ["10334994"]), # lovastatin
                    (["AY228175"], ["10.1021/ja00275a058"]), # kinamycin
                    (["AF323753"], ["11683270"]), # nogalamycin
                    (["BD251846"], ["patent:JP2002528068A"]), # another nogalamycin?
                    (["EF151801"], ["25677666"]), # pradimicin
                    (["DQ266254"], ["17381736"]), # prodigiosin
                    (["FN565485"], ["20140934"]), # ravidomycin
                    (["AF293355"], ["26433383"]), # rubrinomycin
                    # unfilled placeholder rows (they only match an empty accession)
                    ([""], [""]), #
                    ([""], [""]), #
                    ([""], [""]), #
                    ([""], [""]), #
                    ([""], [""]), #
                    ([""], [""]), #
                    ([""], [""]), #
                    ([""], [""]), #
                    ([""], [""]), #
                    ([""], [""]), #
                    ([""], [""]), #
                    ([""], [""]), #
                    (["FJ719113"], ["19621341"]) # erdacin
                ]
                if not fix_publications(acc_pub_pairs):
                    print("Missing publication: {}".format(ncbi_acc))
                    # for now, fill with empty string (let's tackle other issues first)
                    error_container[error_attribute]["publications"] = [""]
                if ncbi_acc in ["AF141924", "AF141925"]: # lovastatin (2 of them?), note: accession is obsolete! replace with AH007774
                    data["general_params"]["loci"]["nucl_acc"][0]["accession"] = "AH007774"
                elif ncbi_acc == "AY228175": # kinamycin accession is obsolete! replace with AH012623
                    data["general_params"]["loci"]["nucl_acc"][0]["accession"] = "AH012623"

            # fix compounds.evidence_struct, set to 'Other'
            if "evidence_struct" in missing_keys:
                error_container[error_attribute]["evidence_struct"] = ["Other"]
            # fix compounds.chem_act, set to 'Unknown'
            if "chem_act" in missing_keys:
                error_container[error_attribute]["chem_act"] = ["Unknown"]
            # fix compounds.other_chem_act, set to ''
            if "other_chem_act" in missing_keys:
                error_container[error_attribute]["other_chem_act"] = ""
            # fix compounds..other_chem_moiety
            if "other_chem_moiety" in missing_keys:
                error_container[error_attribute]["other_chem_moiety"] = ""
            # fix loci.conn_comp_cluster, set to 'Other'
            if "conn_comp_cluster" in missing_keys:
                error_container[error_attribute]["conn_comp_cluster"] = ["Other"]
            # fix loci.complete, set to 'unknown'
            if "complete" in missing_keys:
                error_container[error_attribute]["complete"] = "unknown"
            # fix genes..tailoring
            # NOTE(review): this writes gene_function, not tailoring. Presumably
            # resetting gene_function lifts a conditional requirement on
            # tailoring; confirm it is not a copy-paste slip.
            if "tailoring" in missing_keys:
                error_container[error_attribute]["gene_function"] = "Unknown"
            # fix genes..gene_function
            if "gene_function" in missing_keys:
                error_container[error_attribute]["gene_function"] = "Unknown"
            # fix genes..evidence_genefunction
            if "evidence_genefunction" in missing_keys:
                if "mut_pheno" in error_container[error_attribute]:
                    error_container[error_attribute]["evidence_genefunction"] = ["Knock-out"]
                else:
                    error_container[error_attribute]["evidence_genefunction"] = ["Other"]
            # fix genes..evidence_operon
            if "evidence_operon" in missing_keys:
                error_container[error_attribute]["evidence_operon"] = "Other"
            # fix saccharide..evidence_gt_spec
            if "evidence_gt_spec" in missing_keys:
                error_container[error_attribute]["evidence_gt_spec"] = "Other"
            # fix saccharide..gt_gene
            if "gt_gene" in missing_keys:
                # replace the whole containing node with a ToDelete marker
                error_container_parent[error_container_attribute] = ToDelete()
            # fix ripp..precursor_loci by setting minimal=True (no good solution otherwise)
            if "precursor_loci" in missing_keys:
                data["general_params"]["minimal"] = True

        # fix pattern errors (needs hand-on approach)
        elif error.validator == "pattern":
            # fix pks and nrp module_nr
            if error_attribute == "module_nr":
                # X/x, let's delete it for now (?)
                del error_container[error_attribute]
    return

# Phase 5 driver: validate each phase-5 JSON against the draft-7 schema, run
# fix_data_new_schema() on every reported error, re-validate, and write the
# result to ../../outputs/json_2.0/.
#
# `last_error` is a resume marker: files whose numeric BGC id is below it are
# skipped, and it is set to the id of the first file that still fails
# validation after fixing. Default it to 0 when no cell has assigned it yet,
# so this cell also works on a fresh kernel (Restart & Run All).
last_error = globals().get("last_error", 0)

if not path.exists("../../outputs/json_2.0/"):
    makedirs("../../outputs/json_2.0/")

# Load the schema and release its file handle before the long-running loop.
with open("../../outputs/mibig_schema_draft7.json") as schema_file:
    schema_obj = json.load(schema_file)
validator = Draft7Validator(schema_obj)
errors = {}

for json_path in sorted(fetch_mibig_json_filepaths("../../preprocessed/json_1.4_phase_5/")):
    bgc_id = path.basename(json_path)
    # strip the 3-char prefix and the 5-char suffix, e.g. "BGC0001445.json" -> 1445
    id_int = int(bgc_id[3:-5])
    if id_int < last_error:
        # below the resume marker: skip
        continue

    with open(json_path, "r") as json_file:
        data = json.load(json_file)

    # First pass: try to fix every validation error. sorted() materializes the
    # full error list before `data` is mutated by the fixes.
    error_counts_before = 0
    for error in sorted(validator.iter_errors(data), key=str):
        fix_data_new_schema(data, error)
        error_counts_before += 1
    lazily_deletes(data)

    # Second pass: report whatever is still invalid after the fixes.
    error_counts_after = 0
    for error in sorted(validator.iter_errors(data), key=str):
        print(error.path)
        error_counts_after += 1
    print("Validated and fixed {}... Before {} error(s), After: {} error(s)".format(bgc_id, error_counts_before, error_counts_after))

    # The (possibly still invalid) result is written out in either case.
    with open(path.join("../../outputs/json_2.0/", bgc_id), "w") as jo:
        json.dump(data, jo, indent=4, separators=(',', ': '))

    if error_counts_after > 0:
        # Remember where we stopped so a re-run of this cell resumes here.
        last_error = id_int
        exit(1)

print("All data validated!")

Validated and fixed BGC0001445.json... Before 27 error(s), After: 0 error(s)
Validated and fixed BGC0001446.json... Before 11 error(s), After: 0 error(s)
Missing publication: AHBX01000216
Validated and fixed BGC0001447.json... Before 9 error(s), After: 0 error(s)
Validated and fixed BGC0001448.json... Before 7 error(s), After: 0 error(s)
Validated and fixed BGC0001449.json... Before 11 error(s), After: 0 error(s)
Validated and fixed BGC0001450.json... Before 7 error(s), After: 0 error(s)
Missing publication: NZ_LGTG01000643
Validated and fixed BGC0001451.json... Before 8 error(s), After: 0 error(s)
Validated and fixed BGC0001452.json... Before 5 error(s), After: 0 error(s)
Missing publication: LT989883
Validated and fixed BGC0001453.json... Before 8 error(s), After: 0 error(s)
Missing publication: LT989884
Validated and fixed BGC0001454.json... Before 8 error(s), After: 0 error(s)
Missing publication: LT989885
Validated and fixed BGC0001455.json... Before 9 error(s), After: 0 error(s)


Validated and fixed BGC0001548.json... Before 7 error(s), After: 0 error(s)
Validated and fixed BGC0001549.json... Before 7 error(s), After: 0 error(s)
Validated and fixed BGC0001550.json... Before 7 error(s), After: 0 error(s)
Validated and fixed BGC0001551.json... Before 7 error(s), After: 0 error(s)
Validated and fixed BGC0001552.json... Before 7 error(s), After: 0 error(s)
Validated and fixed BGC0001553.json... Before 8 error(s), After: 0 error(s)
Validated and fixed BGC0001554.json... Before 7 error(s), After: 0 error(s)
Validated and fixed BGC0001555.json... Before 7 error(s), After: 0 error(s)
Validated and fixed BGC0001556.json... Before 8 error(s), After: 0 error(s)
Validated and fixed BGC0001557.json... Before 8 error(s), After: 0 error(s)
Validated and fixed BGC0001558.json... Before 9 error(s), After: 0 error(s)
Validated and fixed BGC0001559.json... Before 9 error(s), After: 0 error(s)
Validated and fixed BGC0001560.json... Before 8 error(s), After: 0 error(s)
Validated an

Validated and fixed BGC0001655.json... Before 7 error(s), After: 0 error(s)
Validated and fixed BGC0001656.json... Before 7 error(s), After: 0 error(s)
Validated and fixed BGC0001657.json... Before 7 error(s), After: 0 error(s)
Validated and fixed BGC0001658.json... Before 7 error(s), After: 0 error(s)
Validated and fixed BGC0001659.json... Before 7 error(s), After: 0 error(s)
Validated and fixed BGC0001660.json... Before 7 error(s), After: 0 error(s)
Validated and fixed BGC0001661.json... Before 7 error(s), After: 0 error(s)
Validated and fixed BGC0001662.json... Before 7 error(s), After: 0 error(s)
Validated and fixed BGC0001663.json... Before 9 error(s), After: 0 error(s)
Validated and fixed BGC0001664.json... Before 7 error(s), After: 0 error(s)
Validated and fixed BGC0001665.json... Before 9 error(s), After: 0 error(s)
Validated and fixed BGC0001666.json... Before 7 error(s), After: 0 error(s)
Validated and fixed BGC0001667.json... Before 20 error(s), After: 0 error(s)
Validated a

Validated and fixed BGC0001762.json... Before 9 error(s), After: 0 error(s)
Validated and fixed BGC0001763.json... Before 8 error(s), After: 0 error(s)
Validated and fixed BGC0001764.json... Before 7 error(s), After: 0 error(s)
Validated and fixed BGC0001765.json... Before 7 error(s), After: 0 error(s)
Validated and fixed BGC0001766.json... Before 7 error(s), After: 0 error(s)
Validated and fixed BGC0001767.json... Before 7 error(s), After: 0 error(s)
Validated and fixed BGC0001768.json... Before 7 error(s), After: 0 error(s)
Validated and fixed BGC0001769.json... Before 7 error(s), After: 0 error(s)
Validated and fixed BGC0001770.json... Before 8 error(s), After: 0 error(s)
Validated and fixed BGC0001771.json... Before 7 error(s), After: 0 error(s)
Validated and fixed BGC0001772.json... Before 7 error(s), After: 0 error(s)
Validated and fixed BGC0001773.json... Before 8 error(s), After: 0 error(s)
Validated and fixed BGC0001774.json... Before 7 error(s), After: 0 error(s)
Validated an

In [35]:
# Reset the resume marker read by the validation loop: files whose numeric BGC id
# is below `last_error` are skipped there, so 0 makes the next run process every file.
last_error = 0