In [None]:
###### phase 6: convert data from phase 5 to fit a manually restructured draft7 schema ########

In [None]:
## common imports ##
from os import path, makedirs
import glob
import json
from jsonschema.validators import Draft7Validator
from sys import exit
import datetime
import time

In [None]:
## common functions ##
def fetch_mibig_json_filepaths(dir_path):
    """Return the paths of all ``BGC*.json`` files located directly in ``dir_path``.

    The paths come back in whatever order ``glob`` yields them; callers that
    need a stable order sort the result themselves.
    """
    pattern = path.join(dir_path, "BGC*.json")
    return glob.glob(pattern)

def count_props(input_dict, cur_path, result):
    """Count how often each property path occurs in a JSON-like object.

    Paths are built as ``/key/key`` with ``[]`` appended for list levels.
    Dicts themselves are not counted; a list is counted once under its
    ``path[]`` key in addition to each of its non-dict members, and every
    scalar is counted under the path leading to it.

    ``result`` is updated in place and also returned.
    """
    if isinstance(input_dict, dict):
        for name, child in input_dict.items():
            count_props(child, "{}/{}".format(cur_path, name), result)
        return result

    leaf_path = cur_path
    if isinstance(input_dict, list):
        leaf_path = "{}[]".format(cur_path)
        for element in input_dict:
            count_props(element, leaf_path, result)

    # Lists and scalars both land here; dicts returned above.
    result[leaf_path] = result.get(leaf_path, 0) + 1
    return result


def fetch_props_new_schema(input_dict, cur_path, result):
    """Flatten a draft7 JSON schema into a dict keyed by property path.

    Paths use the same notation as ``count_props`` (``/key`` per object
    property, ``[]`` per array level). Every node that has no ``properties``
    entry (scalars and arrays) is registered with the placeholder value
    ``False``; the value itself carries no meaning.

    ``result`` is updated in place and also returned.
    """
    node_type = input_dict["type"] if "type" in input_dict else None
    key_path = cur_path

    if node_type == "object":
        for name, subschema in input_dict["properties"].items():
            fetch_props_new_schema(subschema, "{}/{}".format(cur_path, name), result)
    elif node_type == "array":
        key_path = "{}[]".format(cur_path)
        fetch_props_new_schema(input_dict["items"], key_path, result)
    # any other (or missing) type is a leaf registered under cur_path itself

    if "properties" not in input_dict and key_path not in result:
        result[key_path] = False  # can't really use this
    return result


def search_and_delete(key, input_dict):
    """Remove every occurrence of ``key`` from a nested dict/list structure, in place.

    Lists are traversed member by member, and dict values that are dicts or
    lists are searched recursively. Values stored under ``key`` are dropped
    without being searched. Anything that is neither a dict nor a list is
    ignored. Returns ``None``.
    """
    if isinstance(input_dict, list):
        for member in input_dict:
            search_and_delete(key, member)
        # Stop here: the dict handling below must not run on a list.
        return
    if not isinstance(input_dict, dict):
        return

    input_dict.pop(key, None)
    for value in input_dict.values():
        if isinstance(value, (dict, list)):
            search_and_delete(key, value)

        
def rename_key(from_key, to_key, parent_dict):
    """Move the value stored under ``from_key`` to ``to_key`` inside ``parent_dict``.

    Does nothing when ``from_key`` is absent; an existing ``to_key`` value is
    overwritten.
    """
    if from_key not in parent_dict:
        return
    moved_value = parent_dict[from_key]
    parent_dict[to_key] = moved_value
    del parent_dict[from_key]

def del_key(key, parent_dict):
    """Delete ``key`` from ``parent_dict`` if it is present; otherwise do nothing."""
    if key not in parent_dict:
        return
    del parent_dict[key]
        
import time
def date2iso(thedate):
    """Format ``thedate`` as ``YYYY-MM-DDTHH:MM:SS`` followed by the local UTC offset.

    The offset (``+HH:MM`` or ``-HH:MM``) is the machine's *current* local
    offset taken from ``time.localtime().tm_gmtoff``; it is not derived from
    ``thedate`` itself.

    Always returns a string, for negative offsets too.
    """
    strdate = thedate.strftime("%Y-%m-%dT%H:%M:%S")
    offset_seconds = time.localtime().tm_gmtoff
    sign = "-" if offset_seconds < 0 else "+"
    # Split the absolute offset so that e.g. -03:30 is not rendered as -04:30.
    hours, minutes = divmod(abs(offset_seconds) // 60, 60)
    return "{}{}{:02d}:{:02d}".format(strdate, sign, hours, minutes)
    
class ToDelete():
    """dummy class for lazy deletion of list members"""
    pass

def lazily_deletes(input_dict):
    """Strip every ``ToDelete`` marker from a nested list/dict structure.

    Lists are rebuilt: the returned list holds the cleaned version of every
    member that is not a marker, so callers must use the return value.
    Dicts are cleaned in place (keys whose value is a marker are deleted,
    the remaining values are replaced by their cleaned version) and the same
    dict object is returned. Any other value is returned unchanged.
    """
    if isinstance(input_dict, list):
        # Keep the *cleaned* child, otherwise markers in nested lists survive.
        return [lazily_deletes(node) for node in input_dict if not isinstance(node, ToDelete)]
    if isinstance(input_dict, dict):
        marked_keys = [key for key, value in input_dict.items() if isinstance(value, ToDelete)]
        for key in marked_keys:
            del input_dict[key]
        for key in input_dict:
            # Re-assigning existing keys does not resize the dict, so this is safe while iterating.
            input_dict[key] = lazily_deletes(input_dict[key])
    return input_dict


In [None]:
# Flatten the phase-6 schema into a lookup keyed by property path
# (see fetch_props_new_schema); match_attributes_to_schema checks every
# property path of a transformed file against these keys.
schema_props = {}
with open("../../inputs/mibig_schema_phase_6.json") as json_file:
    json_obj = json.load(json_file)
    schema_props = fetch_props_new_schema(json_obj, "", {})

In [None]:
def transform_data_genes(data):
    """Restructure ``data["cluster"]["genes"]`` in place; does nothing when it is absent.

    * ``gene[]`` is split into ``extra_genes[]`` (entries flagged
      ``not_in_gbk``) and ``annotations[]`` (entries carrying
      ``evidence_genefunction``); the ``gene`` list is then removed.
    * ``operon[]`` becomes ``operons[]`` with ``operon_genes`` renamed to
      ``genes`` and ``evidence_operon`` renamed to ``evidence`` (wrapped in a
      list when present).

    Raises ``KeyError`` when a placeholder gene id has no ``gene_name`` to fall
    back on, when a located gene lacks ``gene_endpos``/``strand``, or when an
    annotated gene lacks ``gene_function``.
    """
    # /general_params/genes/gene[]
    if "genes" not in data["cluster"]:
        return
    genes = data["cluster"]["genes"]

    if "gene" in genes:
        genes["extra_genes"] = []
        genes["annotations"] = []
        for gene in genes["gene"]:
            if gene.get("not_in_gbk"):
                # gene_id is optional here; the placeholder is replaced by the gene name
                if gene.get("gene_id") == "No protein ID":
                    gene["gene_id"] = gene["gene_name"]
                extra_gene = {}
                if gene.get("gene_id") is not None:
                    extra_gene["id"] = gene["gene_id"]
                if "gene_startpos" in gene:
                    extra_gene["location"] = {
                        "exons": [{"start": gene["gene_startpos"], "end": gene["gene_endpos"]}],
                        "strand": gene["strand"]
                    }
                if "aa_seq" in gene:
                    extra_gene["translation"] = gene["aa_seq"]
                if extra_gene:
                    genes["extra_genes"].append(extra_gene)
            if "evidence_genefunction" in gene:
                annotation = {}
                if gene.get("gene_name") is not None:
                    annotation["name"] = gene["gene_name"]
                if gene.get("gene_id") is not None:
                    annotation["id"] = gene["gene_id"]
                annotation["functions"] = [
                    {"category": gene["gene_function"], "evidence": gene["evidence_genefunction"]}
                ]
                if "gene_annotation" in gene:
                    annotation["product"] = gene["gene_annotation"]
                if "tailoring" in gene:
                    annotation["tailoring"] = [gene["tailoring"]]
                if "gene_pubs" in gene:
                    annotation["publications"] = gene["gene_pubs"]
                if "gene_comments" in gene:
                    annotation["comments"] = gene["gene_comments"]
                genes["annotations"].append(annotation)
        del genes["gene"]

    if "operon" in genes:
        genes["operons"] = []
        for operon in genes["operon"]:
            rename_key("operon_genes", "genes", operon)
            rename_key("evidence_operon", "evidence", operon)
            # evidence may be missing (e.g. stripped as "None"/"" upstream); only wrap what exists
            if "evidence" in operon:
                operon["evidence"] = [operon["evidence"]]
            genes["operons"].append(operon)
        del genes["operon"]
    return

In [None]:
# Start the compound-id report from scratch with its header row;
# transform_data_compounds appends one row per compound to this file.
with open("../../preprocessed/reports/p6-compounds_ids.tsv", "w") as o:
    o.write("bgc_id\tcompound\tpubchem\tchebi\tchembl\tchemspider\n")

def transform_data_compounds(data):
    """Restructure every entry of ``data["cluster"]["compounds"]`` in place.

    For each compound this:

    * collects positive ``pubchem_id`` / ``chebi_id`` / ``chembl_id`` /
      ``chemspider_id`` values into ``database_id`` as ``"<db>:<id>"`` strings
      and removes the collected keys (non-positive ids are left untouched);
    * appends one row with the collected ids to ``p6-compounds_ids.tsv``;
    * renames ``mass_ion_type``, ``evidence_struct`` and ``chem_act``;
    * turns ``chem_target`` into ``chem_targets``, a list of
      ``{"target": ..., "publications": []}`` objects;
    * renames the fields of each ``chem_moieties`` entry, drops entries without
      a moiety name and removes the list altogether when nothing is left.

    Raises ``KeyError`` when ``compounds``, ``mibig_accession`` or a compound's
    ``compound`` name is missing.
    """
    #/general_params/compounds[]/
    for compound in data["cluster"]["compounds"]:
        # database_id: one report column per database, empty when no usable id
        compound["database_id"] = []
        id_columns = []
        for database in ("pubchem", "chebi", "chembl", "chemspider"):
            id_key = "{}_id".format(database)
            if id_key in compound and compound[id_key] > 0:
                compound["database_id"].append("{}:{}".format(database, compound[id_key]))
                id_columns.append(str(compound[id_key]))
                del compound[id_key]
            else:
                id_columns.append("")
        with open("../../preprocessed/reports/p6-compounds_ids.tsv", "a") as o:
            o.write("{}\t{}\t{}\n".format(data["cluster"]["mibig_accession"], compound["compound"], "\t".join(id_columns)))
        # mass_spec_ion_type
        rename_key("mass_ion_type", "mass_spec_ion_type", compound)
        # evidence
        rename_key("evidence_struct", "evidence", compound)
        # chem_acts
        rename_key("chem_act", "chem_acts", compound)
        # chem_targets
        if "chem_target" in compound:
            compound["chem_targets"] = [
                {"target": chem_target, "publications": []}
                for chem_target in compound["chem_target"]
            ]
            del compound["chem_target"]
        # chem_moieties
        if "chem_moieties" in compound:
            moieties = []
            for chem_moiety in compound["chem_moieties"]:
                rename_key("chem_moiety", "moiety", chem_moiety)
                rename_key("moiety_subcluster", "subcluster", chem_moiety)
                if len(chem_moiety.get("moiety", "")) > 0:
                    if len(chem_moiety.get("subcluster", [])) < 1:
                        chem_moiety.pop("subcluster", None)
                    moieties.append(chem_moiety)
            if moieties:
                compound["chem_moieties"] = moieties
            else:
                # nothing usable left: drop the key rather than keep an empty list
                del compound["chem_moieties"]
    return

In [None]:
def _build_pks_module(gene_id, mod):
    """Convert one ``pks_module`` entry of gene ``gene_id`` into the new module layout."""
    module = {"genes": [gene_id]}
    if "module_nr" in mod:
        module["module_number"] = mod["module_nr"]
    module["domains"] = mod.get("pks_domains", [])
    module["at_specificities"] = [mod.get("at_substr_spec", "Unknown")]
    module["at_specificities"].extend(mod.get("at_multiple_spec", []))
    if mod.get("evidence_at_spec", "") in ("Sequence-based prediction", "Structure-based inference", "Feeding study", "Activity assay"):
        module["evidence"] = mod["evidence_at_spec"]
    module["kr_stereochem"] = "Unknown"
    if mod.get("kr_stereochem") in ("Inactive", "L-OH", "D-OH"):
        module["kr_stereochem"] = mod["kr_stereochem"]
    # optional like the other per-module fields; copied only when present
    if "pks_mod_doms" in mod:
        module["pks_mod_doms"] = mod["pks_mod_doms"]
    module["comments"] = mod.get("comments", "")
    if "pks_evidence_skip_iter" in mod:
        skip_iter = mod.get("pks_mod_skip_iter")
        non_canonical = {
            "skipped": skip_iter == "Skipped",
            "non_elongating": skip_iter == "Non-elongating",
            "iterated": skip_iter == "Iterated",
        }
        if mod.get("pks_evidence_skip_iter") in ("Sequence-based prediction", "Structure-based inference", "Activity assay"):
            non_canonical["evidence"] = [mod["pks_evidence_skip_iter"]]
        module["non_canonical"] = non_canonical
    return module


def transform_data_polyketide(data):
    """Restructure ``data["cluster"]["polyketide"]`` in place; does nothing when it is absent.

    Class-level fields are renamed (``pk_subclass`` -> ``subclasses``,
    ``lin_cycl_pk`` -> boolean ``cyclic``, ``pks_release_type`` ->
    ``release_type``). Synthase-related fields (genes, subclass, PUFA
    modification domains, iteration info, trans-AT genes, thioesterases and
    modules) are gathered into a single synthase object, which is stored as
    ``synthases = [synthase]`` only when at least one gene was collected;
    otherwise the gathered fields are discarded. ``cyclases`` is left in
    place and only contributes to the synthase gene list.

    The gene list is sorted so that repeated runs write identical output.
    """
    # /general_params/polyketide
    if "polyketide" not in data["cluster"]:
        return
    polyketide = data["cluster"]["polyketide"]

    if "pk_subclass" in polyketide:
        polyketide["subclasses"] = [polyketide.pop("pk_subclass")]
    if "lin_cycl_pk" in polyketide:
        polyketide["cyclic"] = polyketide.pop("lin_cycl_pk") == "Cyclic"
    if "pks_release_type" in polyketide:
        polyketide["release_type"] = [polyketide.pop("pks_release_type")]

    synthase = {}
    gene_ids = set()
    if "pks_genes" in polyketide:
        gene_ids.update(polyketide.pop("pks_genes"))
    if "pks_subclass" in polyketide:
        synthase["subclass"] = polyketide.pop("pks_subclass")
    if "pufa_mod_doms" in polyketide:
        synthase["pufa_modification_domains"] = polyketide.pop("pufa_mod_doms")
    if "nr_iterations" in polyketide:
        iterative = {"nr_iterations": polyketide.pop("nr_iterations")}
        # the subtype may be missing; copy it only when present
        if "iterative_subtype" in polyketide:
            iterative["subtype"] = polyketide.pop("iterative_subtype")
        iterative["cyclization_type"] = polyketide.pop("iter_cycl_type", "Unknown")
        synthase["iterative"] = iterative

    trans_at = polyketide.pop("trans_at", [])
    if len(trans_at) > 0:
        synthase["trans_at"] = {"genes": trans_at}
        gene_ids.update(trans_at)
    if "cyclases" in polyketide:
        gene_ids.update(polyketide["cyclases"])

    if "pks_thioesterase" in polyketide:
        te_type = polyketide.get("pks_te_type")
        if te_type not in ("Type I", "Type II"):
            te_type = "Unknown"
        synthase["thioesterases"] = [
            {"gene": gene, "thioesterase_type": te_type}
            for gene in polyketide["pks_thioesterase"]
        ]
        gene_ids.update(polyketide["pks_thioesterase"])
    polyketide.pop("pks_thioesterase", None)
    polyketide.pop("pks_te_type", None)

    if "mod_pks_genes" in polyketide:
        synthase["modules"] = []
        for mod_gene in polyketide["mod_pks_genes"]:
            for mod in mod_gene.get("pks_module", []):
                # a gene only counts towards the synthase when it has a module
                gene_ids.add(mod_gene["mod_pks_gene"])
                synthase["modules"].append(_build_pks_module(mod_gene["mod_pks_gene"], mod))
        del polyketide["mod_pks_genes"]

    if gene_ids:
        # set order is arbitrary; sort so the written JSON is reproducible
        synthase["genes"] = sorted(gene_ids, key=str)
        polyketide["synthases"] = [synthase]
    return

In [None]:
def _transform_nrps_module(module):
    """Restructure a single NRPS module dict in place."""
    rename_key("module_nr", "module_number", module)

    # Read the flag once, before removing it: both "active" and the
    # non_canonical flags below are derived from it.
    skip_iter = module.pop("nrps_mod_skip_iter", None)
    # NOTE(review): "active" is False only when the flag equals "Neither";
    # this looks inverted but is kept as it was - confirm against the schema.
    module["active"] = skip_iter != "Neither"

    if "a_substr_spec" in module:
        old_spec = module["a_substr_spec"]
        old_spec.pop("aa_type", None)
        a_spec = {}
        if "prot_adom_spec" in old_spec:
            a_spec["proteinogenic"] = [old_spec["prot_adom_spec"]]
        if "nonprot_adom_spec" in old_spec:
            a_spec["nonproteinogenic"] = [old_spec["nonprot_adom_spec"]]
        if "a_multiple_spec" in old_spec: # put into proteinogenic, so that it will trigger validator errors
            a_spec.setdefault("proteinogenic", []).extend(old_spec["a_multiple_spec"].split(","))
        if len(a_spec.get("proteinogenic", [])) + len(a_spec.get("nonproteinogenic", [])) > 0:
            if old_spec.get("evidence_a_spec", "Unknown") in ("Sequence-based prediction", "Structure-based inference", "Feeding study", "Activity assay"):
                a_spec["evidence"] = [old_spec["evidence_a_spec"]]
            a_spec["epimerized"] = old_spec.get("epimerized", False)
            a_spec["aa_subcluster"] = old_spec.get("aa_subcluster", [])
            module["a_substr_spec"] = a_spec
        else:
            module.pop("a_substr_spec", None)

    rename_key("cdom_subtype", "c_dom_subtype", module)
    if "nrps_mod_doms" in module:
        module["modification_domains"] = [module.pop("nrps_mod_doms")]

    if "nrps_evidence_skip_iter" in module:
        evidence = module.pop("nrps_evidence_skip_iter")
        non_canonical = {
            "skipped": skip_iter == "Skipped",
            "non_elongating": skip_iter == "Non-elongating",
            "iterated": skip_iter == "Iterated",
        }
        if evidence in ("Sequence-based prediction", "Structure-based inference", "Activity assay"):
            non_canonical["evidence"] = [evidence]
        module["non_canonical"] = non_canonical


def transform_data_nrp(data):
    """Restructure ``data["cluster"]["nrp"]`` in place; does nothing when it is absent.

    * ``lin_cycl_nrp`` becomes the boolean ``cyclic``.
    * ``nrps_thioesterase`` / ``nrps_te_type`` become ``thioesterases``
      (any type other than "Type I"/"Type II" is stored as "Unknown").
    * ``nrps_release_type`` becomes ``release_type`` wrapped in a list.
    * every entry of ``nrps_genes`` gets ``gene_id`` / ``modules`` and each
      module is converted by ``_transform_nrps_module``.
    """
    if "nrp" not in data["cluster"]:
        return
    nrp = data["cluster"]["nrp"]

    if "lin_cycl_nrp" in nrp:
        nrp["cyclic"] = nrp.pop("lin_cycl_nrp") == "Cyclic"

    if "nrps_thioesterase" in nrp:
        te_type = nrp.get("nrps_te_type")
        if te_type not in ("Type I", "Type II"):
            te_type = "Unknown"
        nrp["thioesterases"] = [
            {"gene": gene, "thioesterase_type": te_type}
            for gene in nrp.pop("nrps_thioesterase")
        ]
    nrp.pop("nrps_te_type", None)

    rename_key("nrps_release_type", "release_type", nrp)
    if "release_type" in nrp:
        nrp["release_type"] = [nrp["release_type"]]

    for nrps in nrp.get("nrps_genes", []):
        rename_key("nrps_gene", "gene_id", nrps)
        rename_key("nrps_module", "modules", nrps)
        for module in nrps.get("modules", []):
            _transform_nrps_module(module)
    return

In [None]:
def transform_data_ripp(data):
    """Restructure ``data["cluster"]["ripp"]`` in place; does nothing when it is absent.

    * ``ripp_subclass`` -> ``subclass``; ``lin_cycl_ripp`` -> boolean ``cyclic``.
    * ``precursor_loci`` -> ``precursor_genes``; for every precursor the
      ``gene_id`` list is joined with commas, the peptide fields are renamed
      and its ``peptidase`` entries are moved to the cluster-wide
      ``peptidases`` list (sorted, without duplicates).
    * crosslinks without a ``crosslink_type`` are dropped; the two positions
      are kept only when both are non-negative; an emptied ``crosslinks``
      list is removed.

    Raises ``KeyError`` when a precursor has no ``gene_id``.
    """
    if "ripp" not in data["cluster"]:
        return
    ripp = data["cluster"]["ripp"]

    rename_key("ripp_subclass", "subclass", ripp)

    if "lin_cycl_ripp" in ripp:
        ripp["cyclic"] = ripp.pop("lin_cycl_ripp") == "Cyclic"

    peptidases = set()

    rename_key("precursor_loci", "precursor_genes", ripp)
    for precursor in ripp.get("precursor_genes", []):
        precursor["gene_id"] = ",".join(precursor["gene_id"])
        rename_key("core_pept_aa", "core_sequence", precursor)
        rename_key("lead_pept_len", "leader_sequence", precursor)
        rename_key("foll_pept_len", "follower_sequence", precursor)
        rename_key("recogn_motif", "recognition_motif", precursor)
        peptidases.update(precursor.pop("peptidase", []))
        if "crosslinks" in precursor:
            crosslinks = []
            for crosslink in precursor["crosslinks"]:
                rename_key("AA_pos_1", "first_AA", crosslink)
                rename_key("AA_pos_2", "second_AA", crosslink)
                if len(crosslink.get("crosslink_type", "")) < 1:
                    continue
                converted = {"crosslink_type": crosslink["crosslink_type"]}
                if crosslink.get("first_AA", -1) >= 0 and crosslink.get("second_AA", -1) >= 0:
                    converted["first_AA"] = crosslink["first_AA"]
                    converted["second_AA"] = crosslink["second_AA"]
                crosslinks.append(converted)
            if crosslinks:
                precursor["crosslinks"] = crosslinks
            else:
                del precursor["crosslinks"]

    # set order is arbitrary; sort so the written JSON is reproducible
    ripp["peptidases"] = sorted(peptidases, key=str)

    return

In [None]:
def transform_data_terpene(data):
    """Rename the terpene fields of ``data["cluster"]`` in place, if a terpene section exists."""
    cluster = data["cluster"]
    if "terpene" in cluster:
        terpene = cluster["terpene"]
        renames = (
            ("terpene_subclass", "structural_subclass"),
            ("terpene_c_len", "carbon_count_subclass"),
            ("prenyl_transf", "prenyltransferases"),
        )
        for old_name, new_name in renames:
            rename_key(old_name, new_name, terpene)

In [None]:
def transform_data_saccharide(data):
    """Restructure ``data["cluster"]["saccharide"]`` in place, if present.

    ``saccharide_subclass`` becomes ``subclass`` and ``gt_genes`` becomes
    ``glycosyltransferases``. Each glycosyltransferase gets ``gene_id`` and
    ``specificity``, and its ``evidence_gt_spec`` is replaced by a one-item
    ``evidence`` list in which a missing or "Unknown" value is stored as
    "Sequence-based prediction".
    """
    cluster = data["cluster"]
    if "saccharide" not in cluster:
        return
    saccharide = cluster["saccharide"]

    rename_key("saccharide_subclass", "subclass", saccharide)
    rename_key("gt_genes", "glycosyltransferases", saccharide)

    for transferase in saccharide.get("glycosyltransferases", []):
        rename_key("gt_gene", "gene_id", transferase)
        rename_key("gt_specificity", "specificity", transferase)
        reported = transferase.get("evidence_gt_spec", "Unknown")
        transferase["evidence"] = ["Sequence-based prediction" if reported == "Unknown" else reported]
        transferase.pop("evidence_gt_spec", None)

In [None]:
def transform_data_alkaloid(data):
    """Rename ``alkaloid_subclass`` to ``subclass`` in the alkaloid section, if present."""
    cluster = data["cluster"]
    if "alkaloid" in cluster:
        rename_key("alkaloid_subclass", "subclass", cluster["alkaloid"])

In [None]:
def transform_data_other(data):
    """Rename ``other_subclass`` to ``subclass`` in the ``other`` section, if present."""
    cluster = data["cluster"]
    if "other" in cluster:
        rename_key("other_subclass", "subclass", cluster["other"])

In [None]:
def fix_none_or_unknown(data):
    """Return a cleaned copy of a nested dict/list structure.

    Dict entries whose value is ``"None"`` or ``""`` are dropped; list
    members equal to ``"None"``, ``"Unknown"`` or ``""`` are dropped. Note
    that ``"Unknown"`` is kept as a dict value. Everything that survives is
    cleaned recursively; non-container values are returned as they are.
    """
    if isinstance(data, dict):
        return {
            key: fix_none_or_unknown(value)
            for key, value in data.items()
            if value not in ("None", "")
        }
    if isinstance(data, list):
        return [
            fix_none_or_unknown(member)
            for member in data
            if member not in ("None", "Unknown", "")
        ]
    return data

In [None]:
# Start the submitter report from scratch with its header row;
# transform_data appends one row per processed entry to this file.
with open("../../outputs/submitter.tsv", "w") as o:
    o.write("bgc_id\tsubmitter_name\tsubmitter_institution\tsubmitter_email\n")

def transform_data(data):
    """Convert one phase-5 entry to the phase-6 layout, in place, and return it.

    Steps, in order: changelog comments are split into lists, the
    ``general_params`` section becomes ``cluster``, the loci evidence is
    filtered to the accepted values, the submitter details are appended to
    ``submitter.tsv`` and removed from the entry, and finally every
    section-specific ``transform_data_*`` function is applied.
    """
    # /changelogs/* -> /changelog/*, each "comment" split on ";" into "comments"
    rename_key("changelogs", "changelog", data)
    for entry in data["changelog"]:
        collected = entry["comments"] = []
        for piece in entry["comment"].split(";"):
            text = piece.strip()
            # "Submitted" on accessions numbered below 1831 is relabelled
            if text == "Submitted" and int(data["general_params"]["mibig_accession"][3:]) < 1831:
                text = "Migrated from v1.4"
            collected.append(text)
        entry.pop("comment", None)

    # /general_params/* -> /cluster/*
    rename_key("general_params", "cluster", data)
    cluster = data["cluster"]
    loci = cluster["loci"]
    rename_key("complete", "completeness", loci)
    accepted_evidence = (
        "Sequence-based prediction",
        "Gene expression correlated with compound production",
        "Knock-out studies",
        "Enzymatic assays",
        "Heterologous expression",
    )
    loci["evidence"] = [item for item in loci.get("conn_comp_cluster", []) if item in accepted_evidence]
    loci.pop("conn_comp_cluster", None)
    cluster.setdefault("minimal", False)
    # both coordinates negative: drop the pair
    if loci.get("start_coord", 1) < 0 and loci.get("end_coord", 1) < 0:
        loci.pop("start_coord", None)
        loci.pop("end_coord", None)

    # remove submitter information, store it in a separate text file
    with open("../../outputs/submitter.tsv", "a") as o:
        personal = data.get("personal", {"submitter_name":"MIBiG", "submitter_institution":"", "submitter_email":"mibig@secondarymetabolites.org"})
        row = (
            cluster["mibig_accession"],
            personal["submitter_name"],
            personal["submitter_institution"],
            personal["submitter_email"],
        )
        o.write("{}\t{}\t{}\t{}\n".format(*row))
    data.pop("personal", None)

    section_transforms = (
        transform_data_genes,
        transform_data_compounds,
        transform_data_polyketide,
        transform_data_saccharide,
        transform_data_nrp,
        transform_data_ripp,
        transform_data_terpene,
        transform_data_alkaloid,
        transform_data_other,
    )
    for transform in section_transforms:
        transform(data)

    return data

In [None]:
def match_attributes_to_schema(data):
    """Transform ``data`` in place and check its property paths against the schema.

    After ``transform_data`` has run, every property path found in ``data``
    must be a key of ``schema_props``. The first path that is not is printed
    and the run is stopped with ``SystemExit`` (status 1).
    """
    # Only `exit` is imported from sys at the top of the notebook, so the
    # module itself has to be brought into scope here.
    import sys

    transform_data(data)
    this_file_props = count_props(data, "", {})
    for prop in this_file_props:
        if prop not in schema_props:
            print(prop)
            sys.exit(1)
    return


In [None]:
# Make sure the output folder for the transformed files exists.
if not path.exists("../../preprocessed/p6-json/"):
    makedirs("../../preprocessed/p6-json/")
    
# Transform every phase-5 file (in sorted path order) and write the result
# under the same file name into the p6-json folder.
for json_path in sorted(fetch_mibig_json_filepaths("../../preprocessed/p5-json/")):
    json_obj = None
    with open(json_path, "r") as json_file:
        json_obj = json.load(json_file)
        print(json_path)
        # returns a cleaned copy, so the name has to be rebound
        json_obj = fix_none_or_unknown(json_obj)
        # transforms json_obj in place; stops the run at the first property
        # path that is not in schema_props
        match_attributes_to_schema(json_obj)
        # NOTE: this inner `json_file` rebinds the name of the still-open input handle
        with open(path.join("../../preprocessed/p6-json/", path.basename(json_path)), "w") as json_file:
            json.dump(json_obj, json_file, indent=4, separators=(',', ': '), sort_keys=True)

print("All data transformed!")

In [None]:
# Numeric id of the last entry that failed validation. The validation cell
# skips every file whose id is lower, so re-running that cell resumes at the
# failing entry; re-running this cell resets it to start from the beginning.
last_error = 0

In [None]:
def fix_errors(data, error):
    """Try to repair, in place, the value in ``data`` that a validation error points at.

    ``error`` is expected to provide ``path`` (the sequence of keys/indexes
    leading to the offending value), ``validator``, ``validator_value``,
    ``instance`` and ``message``. Only ``type`` errors are handled:

    * ``integer`` / ``number``: the instance is converted with ``int`` /
      ``float``; values that cannot be converted are left as they are;
    * ``boolean``: the strings ``"true"`` / ``"false"`` become booleans;
    * any other expected type: the error message is printed.

    Errors located at the root (empty path) are only printed. ``error.path``
    is read, not consumed. Returns ``None``.
    """
    path_items = list(error.path)  # work on a copy so the error object stays intact
    if not path_items:
        # problem is in root, need a separate approach
        print(error.path)
        return

    # walk down to the parent node containing the error instance
    error_container = data
    for step in path_items[:-1]:
        error_container = error_container[step]
    error_attribute = path_items[-1]  # key/index of the error instance inside its parent

    # nodes already marked for deletion need no fixing
    if isinstance(error_container, ToDelete):
        return
    if isinstance(error_container[error_attribute], ToDelete):
        return

    if error.validator != "type":
        return

    if error.validator_value == "integer":
        try:
            error_container[error_attribute] = int(error.instance)
        except (TypeError, ValueError, OverflowError):
            pass  # not convertible: leave it for the second validation pass to report
    elif error.validator_value == "number":
        try:
            error_container[error_attribute] = float(error.instance)
        except (TypeError, ValueError, OverflowError):
            pass  # not convertible: leave it for the second validation pass to report
    elif error.validator_value == "boolean":
        if error_container[error_attribute] == "true":
            error_container[error_attribute] = True
        elif error_container[error_attribute] == "false":
            error_container[error_attribute] = False
    else:
        print(error.message)

    return

In [None]:
# retired: file name -> set of reasons why the entry has to be retired
# structures_14: file name -> list of (compound name, structure, database ids)
retired = {}
structures_14 = {}

with open("../../inputs/mibig_schema_phase_6.json") as json_file:
    schema_obj = json.load(json_file)
    validator = Draft7Validator(schema_obj)
    errors = {}
    for json_path in sorted(fetch_mibig_json_filepaths("../../preprocessed/p6-json/")):
        bgc_id = path.basename(json_path)
        id_int = int(bgc_id[3:-5])
        # resume support: skip everything before the last failing entry
        if id_int < last_error:
            continue
        with open(json_path, "r") as data_file:
            data = json.load(data_file)
            structures_14[bgc_id] = []
            for compound in data["cluster"]["compounds"]:
                structures_14[bgc_id].append((compound["compound"], compound.get("chem_struct", ""), ";".join(compound.get("database_id", []))))
            error_counts_before = 0
            error_counts_after = 0
            # first pass: try to repair every reported error in place
            for error in sorted(validator.iter_errors(data), key=str):
                fix_errors(data, error)
                error_counts_before += 1
            error_counts = 0
            # second pass: classify what is still wrong
            for error in sorted(validator.iter_errors(data), key=str):
                # Pad the path so that looking at its last three elements can
                # never raise IndexError on short (e.g. root-level) paths.
                path_items = list(error.path)
                third_last, second_last, last = ([None] * 3 + path_items)[-3:]
                second = path_items[1] if len(path_items) > 1 else None
                retire_reason = None
                if error.message in ("'ncbi_tax_id' is a required property",
                                     "'organism_name' is a required property"):
                    # will be taken care of in phase 7
                    continue
                elif second_last == "evidence" or last == "evidence":
                    if third_last == "loci" or second_last == "loci":
                        retire_reason = "loci.evidence"
                    elif second == "genes":
                        # will be taken care of in phase 7/8
                        continue
                elif last == "nr_iterations":
                    retire_reason = "nr_iterations"
                elif last == "module_number":
                    retire_reason = "module_number"
                elif second_last == "proteinogenic":
                    retire_reason = "a_substr_spec.proteinogenic"
                elif last in ["leader_sequence", "follower_sequence"]:
                    retire_reason = "ripp.leader/follower_sequence"
                elif last == "subcluster" and third_last == "chem_moieties":
                    retire_reason = "chem_moieties.subcluster"
                if retire_reason is not None:
                    # retire this bgc, put in a list
                    retired.setdefault(bgc_id, set()).add(retire_reason)
                    continue
                error_counts_after += 1
            print("Validated and fixed {}... Before {} error(s), After: {} error(s)".format(bgc_id, error_counts_before, error_counts_after))
            if error_counts_after > 0:
                # remember where we stopped so a re-run resumes at this entry
                last_error = id_int
                exit(1)
            with open(path.join("../../preprocessed/p6-json/", bgc_id), "w") as jo:
                json.dump(data, jo, indent=4, separators=(',', ': '), sort_keys=True)
    print("All data validated!")

In [None]:
# One row per retired entry: id (file name up to the first ".") and its reasons.
# The reasons are kept in a set, so sort them to get the same file on every run.
with open("../../preprocessed/reports/p6-retired_list.tsv", "w") as o:
    for bgc_id in retired:
        o.write("{}\t{}\n".format(bgc_id.split(".")[0], ";".join(sorted(retired[bgc_id]))))

In [None]:
# Dump the compound data collected during validation, one row per compound:
# entry id, compound name, structure and ";"-joined database ids.
# No header row is written; bgc_id[:-5] strips the ".json" suffix of the file name.
with open("../../outputs/bgc_structures_14.tsv", "w") as o:
    for bgc_id in structures_14:
        for compound in structures_14[bgc_id]:
            o.write("{}\t{}\t{}\t{}\n".format(bgc_id[:-5], compound[0], compound[1], compound[2]))