In [1]:
# phase 0: compare old schema vs old data, output the summary in an excel file* to help decision making
# phase 1: transform old schema to match (correct version) of old data
# phase 2: compare phase 1 schema vs old data, output the summary in an excel file* to help decision making
# phase 3: transform old data to match schema from phase 1 (doesn't count dependencies/required keywords)
# phase 4: transform schema from phase 1 to match JSON Schema draft v7 (we will call it 'new schema')
# phase 5: transform data from phase 3 to match the new schema (include all dependencies/required keywords)

In [2]:
## common imports ##
from os import path, makedirs
import glob
import json
from jsonschema.validators import Draft7Validator
from sys import exit
import datetime
import time

In [3]:
## common functions ##
def fetch_mibig_json_filepaths(dir_path):
    """Return the paths of every ``BGC*.json`` file directly inside ``dir_path``.

    The result is whatever ``glob.glob`` yields (unsorted); callers sort it.
    """
    bgc_pattern = path.join(dir_path, "BGC*.json")
    return glob.glob(bgc_pattern)

def count_props(input_dict, cur_path, result):
    """Tally the property paths found in a parsed JSON value.

    Dict keys extend the path with ``/key``; a list appends ``[]`` to the
    path.  Every non-dict node (scalars and the lists themselves) adds one to
    the counter stored under its path in ``result``.

    Returns the same ``result`` mapping (path -> count), updated in place.
    """
    if isinstance(input_dict, dict):
        # dicts are never counted themselves, only their descendants
        for name, value in input_dict.items():
            result = count_props(value, "{}/{}".format(cur_path, name), result)
        return result

    leaf_path = cur_path
    if isinstance(input_dict, list):
        leaf_path = "{}[]".format(cur_path)
        for member in input_dict:
            result = count_props(member, leaf_path, result)

    # a list is counted once under its own "[]" path, after its members
    result[leaf_path] = result.get(leaf_path, 0) + 1
    return result


def fetch_props_old_schema(input_dict, cur_path, result):
    """Flatten an old-style schema into ``{property path: required flag}``.

    Objects contribute their ``properties`` (path extended with ``/name``),
    arrays append ``[]`` to the path and descend into ``items``.  Every node
    without a ``properties`` key is recorded, unless its path is already in
    ``result``; the value is True only when the node carries
    ``"required": True``.

    Returns the same ``result`` mapping, updated in place.
    """
    node_type = input_dict["type"] if "type" in input_dict else None
    key_path = cur_path

    if node_type == "object":
        for name, sub_schema in input_dict["properties"].items():
            result = fetch_props_old_schema(sub_schema, "{}/{}".format(cur_path, name), result)
    elif node_type == "array":
        key_path = "{}[]".format(cur_path)
        result = fetch_props_old_schema(input_dict["items"], key_path, result)

    # the first writer of a path wins (e.g. array items are recorded before the array)
    if "properties" not in input_dict and key_path not in result:
        result[key_path] = "required" in input_dict and input_dict["required"] == True
    return result


def fetch_props_new_schema(input_dict, cur_path, result):
    """Flatten a draft-7 style schema into ``{property path: False}``.

    Objects contribute their ``properties`` (path extended with ``/name``),
    arrays append ``[]`` to the path and descend into ``items``.  Every node
    without a ``properties`` key is recorded once.  The stored value is always
    ``False``: draft-7 keeps ``required`` as a list on the parent object, so
    there is no per-property flag to report here.

    Returns the same ``result`` mapping, updated in place.
    """
    key_path = cur_path
    if ("type" not in input_dict) or (input_dict["type"] not in ["object", "array"]):
        key_path = "{}".format(cur_path) # string / etc.
    elif input_dict["type"] == "object":
        for key in input_dict["properties"]:
            # recurse into *this* function (the original called the old-schema
            # variant, which stored old-style boolean 'required' flags instead)
            result = fetch_props_new_schema(input_dict["properties"][key], "{}/{}".format(key_path, key), result)
    elif input_dict["type"] == "array":
        key_path = "{}[]".format(cur_path)
        result = fetch_props_new_schema(input_dict["items"], "{}".format(key_path), result)

    if key_path not in result and "properties" not in input_dict:
        result[key_path] = False # can't really use this
    return result


def search_and_delete(key, input_dict):
    """Delete every occurrence of ``key`` from a nested dict structure, in place.

    ``input_dict`` may be a dict, a list (each member is processed), or any
    other value (ignored).  Inside a dict only dict-valued entries are
    descended into; lists stored as dict values are left untouched, which is
    the behaviour this function has always had.

    Returns None.
    """
    if isinstance(input_dict, list):
        for member in input_dict:
            search_and_delete(key, member)
        # stop here: the dict handling below would index the list with its own
        # members and raise TypeError
        return
    if not isinstance(input_dict, dict):
        return

    input_dict.pop(key, None)
    for value in input_dict.values():
        if isinstance(value, dict):
            search_and_delete(key, value)

        
def rename_key(from_key, to_key, parent_dict):
    """Rename ``from_key`` to ``to_key`` in ``parent_dict``, in place.

    Does nothing when ``from_key`` is absent.  An existing ``to_key`` entry is
    overwritten.  Renaming a key to itself keeps the entry (the previous
    assign-then-delete sequence removed it).
    """
    if from_key in parent_dict:
        # pop before assigning so that from_key == to_key is a harmless no-op
        parent_dict[to_key] = parent_dict.pop(from_key)

def del_key(key, parent_dict):
    """Remove ``key`` from ``parent_dict`` in place; a missing key is ignored."""
    parent_dict.pop(key, None)
        
import time
def date2iso(thedate):
    """Format ``thedate`` as ``YYYY-MM-DDTHH:MM:SS±HH:MM``.

    The UTC offset appended is the machine's *current* local offset
    (``time.localtime().tm_gmtoff``), not an offset derived from ``thedate``.

    Always returns a string.  The previous version returned None for negative
    offsets and mis-computed offsets such as -03:30.
    """
    strdate = thedate.strftime("%Y-%m-%dT%H:%M:%S")
    offset_seconds = time.localtime().tm_gmtoff
    sign = "-" if offset_seconds < 0 else "+"
    # split the magnitude so that negative half-hour offsets come out right
    hours, minutes = divmod(abs(offset_seconds) // 60, 60)
    return "{}{}{:02d}:{:02d}".format(strdate, sign, hours, minutes)
    
class ToDelete():
    """Marker class: put an instance where a list/dict member should be removed later."""
    pass

def lazily_deletes(input_dict):
    """Recursively drop every member that is a ``ToDelete`` instance.

    * list  -> returns a NEW list with the surviving, cleaned members
      (the surviving slots of the input list are also updated in place);
    * dict  -> marked keys are deleted in place and the same dict is returned;
    * other -> returned unchanged.

    Callers must use the return value, because a top-level list is not
    shortened in place.
    """
    if isinstance(input_dict, list):
        new_list = []
        for i, node in enumerate(input_dict):
            if not isinstance(node, ToDelete):
                cleaned = lazily_deletes(node)
                input_dict[i] = cleaned
                # append the cleaned value: for a nested list the recursion
                # returns a new object, so appending `node` kept the markers
                new_list.append(cleaned)
        return new_list
    elif isinstance(input_dict, dict):
        key_to_dels = []
        for key in input_dict:
            if not isinstance(input_dict[key], ToDelete):
                input_dict[key] = lazily_deletes(input_dict[key])
            else:
                # deleting while iterating is not allowed, so collect first
                key_to_dels.append(key)
        for key in key_to_dels:
            del input_dict[key]
    return input_dict


In [4]:
###### phase 0: compare old schema vs old data, output the summary in an excel file* to help decision making ########

In [5]:
# map every property path seen in the 1.4 data to the list of files that contain it
all_props = {}
for json_path in sorted(fetch_mibig_json_filepaths("../../inputs/json_1.4/")):
    with open(json_path, "r") as json_file:
        json_obj = json.load(json_file)
        this_file_props = count_props(json_obj, "", {})
        file_name = path.basename(json_path)
        for prop in this_file_props:
            all_props.setdefault(prop, []).append(file_name)

In [6]:
# create the report folder; exist_ok makes this safe to re-run and avoids the
# check-then-create race of a separate path.exists() test
makedirs("../../preprocessed/reports/", exist_ok=True)

In [7]:
# write two CSV reports: the flattened old schema, and the data paths the schema lacks
with open("../../inputs/mibig_schema.json") as json_file:
    json_obj = json.load(json_file)
    schema_props = fetch_props_old_schema(json_obj, "", {})
    with open("../../preprocessed/reports/p0-old_schema_properties.csv", "w") as o:
        o.writelines("{},{}\n".format(key, schema_props[key]) for key in sorted(schema_props))
        print("File written: {}".format(o.name))
    with open("../../preprocessed/reports/p0-old_data_vs_old_schema.csv", "w") as o:
        not_in_schema = [(key, all_props[key]) for key in sorted(all_props) if key not in schema_props]
        # most widespread offenders first
        for prop_path, bgc_files in sorted(not_in_schema, key=lambda x: len(x[1]), reverse = True):
            o.write("{},{}\n".format(prop_path, len(bgc_files)))
        print("File written: {}".format(o.name))

File written: ../../preprocessed/reports/p0-old_schema_properties.csv
File written: ../../preprocessed/reports/p0-old_data_vs_old_schema.csv


In [8]:
###### phase 1: transform old schema to match (correct version) of old data ######

In [9]:
# Phase 1 has no code: the schema was edited by hand and this cell only prints
# where the edited file is expected to live.
# (everything is manually done) -- TODO: should write hardcoded scripts to make it reproducible
# Manual edits made to the schema:
# update all comma-separated based properties into arrays
# gene_pubs: integer --> gene_pubs: array
print("File location: ../../inputs/mibig_schema_phase_1.json")

File location: ../../inputs/mibig_schema_phase_1.json


In [10]:
##### phase 2: compare phase 1 schema vs old data, output the summary in an excel file* to help decision making ####

In [11]:
# relies on all_props built in phase 0
with open("../../inputs/mibig_schema_phase_1.json") as json_file:
    json_obj = json.load(json_file)
    schema_props = fetch_props_old_schema(json_obj, "", {})

    # report 1: flattened phase-1 schema
    with open("../../preprocessed/reports/p2-schema_phase_1_properties.csv", "w") as o:
        o.writelines("{},{}\n".format(key, schema_props[key]) for key in sorted(schema_props))
        print("File written: {}".format(o.name))

    not_in_schema = [(key, all_props[key]) for key in sorted(all_props) if key not in schema_props]

    # report 2: data paths missing from the schema, most frequent first
    with open("../../preprocessed/reports/p2-old_data_vs_schema_phase_1.csv", "w") as o:
        for prop_path, bgc_files in sorted(not_in_schema, key=lambda x: len(x[1]), reverse = True):
            o.write("{},{}\n".format(prop_path, len(bgc_files)))
        print("File written: {}".format(o.name))

    # report 3: per file, the offending paths it contains
    with open("../../preprocessed/reports/p2-bgc_to_fix.csv", "w") as o:
        bgc_to_fix = {}
        for prop_path, bgc_files in not_in_schema:
            for bgc in bgc_files:
                bgc_to_fix.setdefault(bgc, []).append(prop_path)
        for bgc, prop_paths in bgc_to_fix.items():
            o.write("{},{}\n".format(bgc, ";".join(prop_paths)))
        print("File written: {}".format(o.name))

File written: ../../preprocessed/reports/p2-schema_phase_1_properties.csv
File written: ../../preprocessed/reports/p2-old_data_vs_schema_phase_1.csv
File written: ../../preprocessed/reports/p2-bgc_to_fix.csv


In [12]:
#### phase 3: transform old data to match schema from phase 1 (doesn't count dependencies/required keywords) ####
def _split_commas(holder, field):
    """If holder[field] is a string, replace it in place with its comma-separated, stripped parts."""
    if field in holder and isinstance(holder[field], str):
        holder[field] = [s.strip() for s in holder[field].split(",")]


def _first_leaf(value):
    """Follow the first element of nested lists until a non-list value is reached."""
    while isinstance(value, list):
        value = value[0]
    return value


def match_attributes_to_schema_phase_1(data):
    """Rewrite one old MIBiG entry in place so its attributes fit the phase-1 schema.

    Comma-separated strings become lists, a few keys are renamed or moved, and
    nested ``evidence_genefunction`` lists are flattened to their first leaf.
    ``data["general_params"]`` with ``loci.nucl_acc`` and ``compounds`` must be
    present (KeyError otherwise).  Returns None.
    """
    # fix /Comments
    rename_key("Comments", "comments", data)
    general = data["general_params"]

    # fix /general_params/loci/nucl_acc[]/conn_comp_cluster
    for nuac in general["loci"]["nucl_acc"]:
        _split_commas(nuac, "conn_comp_cluster")

    # fix /general_params/Polyketide/Saccharide (move it one level up)
    if "Polyketide" in general and "Saccharide" in general["Polyketide"]:
        general["Saccharide"] = general["Polyketide"].pop("Saccharide")

    if "Saccharide" in general:
        sac = general["Saccharide"]
        # fix /general_params/Saccharide/Sugar_subclass
        if "Sugar_subclass" in sac:
            rename_key("Sugar_subclass", "saccharide_subclass", sac)
        # fix /general_params/Saccharide/gt_genes[]/sugar_subcluster
        if "gt_genes" in sac:
            for gtg in sac["gt_genes"]:
                _split_commas(gtg, "sugar_subcluster")

    for comp in general["compounds"]:
        # fix /general_params/compounds[]/chem_target
        _split_commas(comp, "chem_target")
        # fix /general_params/compounds[]/chem_moieties[]/subcluster
        if "chem_moieties" in comp:
            for moi in comp["chem_moieties"]:
                if "subcluster" in moi and moi["subcluster"] == "unknown":
                    del moi["subcluster"]

    # fix /general_params/genes/gene[]/evidence_genefunction[][]**
    if "genes" in general and "gene" in general["genes"]:
        for gen in general["genes"]["gene"]:
            if "evidence_genefunction" in gen:
                for i, evgen in enumerate(gen["evidence_genefunction"]):
                    gen["evidence_genefunction"][i] = _first_leaf(evgen)

    # fix /general_params/Other/biosyn_class[] (keep only the first entry, under a new name)
    if "Other" in general and "biosyn_class" in general["Other"]:
        other = general["Other"]
        clas = other["biosyn_class"][0]
        del other["biosyn_class"]
        other["other_subclass"] = clas

    # fix /general_params/publications
    _split_commas(general, "publications")

    # Polyketide
    if "Polyketide" in general:
        pol = general["Polyketide"]
        # fix /general_params/Polyketide/{cyclases,pks_genes,pufa_mod_doms}
        for field in ("cyclases", "pks_genes", "pufa_mod_doms"):
            _split_commas(pol, field)
        # fix /general_params/Polyketide/mod_pks_genes[]/pks_module[]/pks_mod_doms
        if "mod_pks_genes" in pol:
            for modpk in pol["mod_pks_genes"]:
                if "pks_module" in modpk:
                    for pkmod in modpk["pks_module"]:
                        _split_commas(pkmod, "pks_mod_doms")

    # RiPP
    if "RiPP" in general:
        rip = general["RiPP"]
        if "precursor_loci" in rip:
            for ploc in rip["precursor_loci"]:
                # fix /general_params/RiPP/precursor_loci[]/{cleavage_recogn_site,core_pept_aa}
                for field in ("cleavage_recogn_site", "core_pept_aa"):
                    _split_commas(ploc, field)

# create the phase-3 output folder; exist_ok makes the cell safe to re-run
makedirs("../../preprocessed/p3-json/", exist_ok=True)

# convert every 1.4 entry and write it under the same file name in p3-json/
for json_path in sorted(fetch_mibig_json_filepaths("../../inputs/json_1.4/")):
    json_obj = None
    with open(json_path, "r") as json_file:
        json_obj = json.load(json_file)
    # the input is closed before writing; the output handle gets its own name
    # instead of shadowing json_file
    match_attributes_to_schema_phase_1(json_obj)
    with open(path.join("../../preprocessed/p3-json/", path.basename(json_path)), "w") as out_file:
        json.dump(json_obj, out_file, indent=4, separators=(',', ': '))

In [13]:
# verify that all data matched schema
# same tally as phase 0, this time over the converted files
all_props_phase_3 = {}
for json_path in sorted(fetch_mibig_json_filepaths("../../preprocessed/p3-json/")):
    with open(json_path, "r") as json_file:
        json_obj = json.load(json_file)
        this_file_props = count_props(json_obj, "", {})
        file_name = path.basename(json_path)
        for prop in this_file_props:
            all_props_phase_3.setdefault(prop, []).append(file_name)

In [14]:
# list every converted-data path that the phase-1 schema still does not know
with open("../../inputs/mibig_schema_phase_1.json") as json_file:
    json_obj = json.load(json_file)
    schema_props = fetch_props_old_schema(json_obj, "", {})
    not_in_schema = []
    for key in sorted(all_props_phase_3):
        if key in schema_props:
            continue
        not_in_schema.append((key, all_props_phase_3[key]))
        print(key)
    print("Number of conflicts: {}".format(len(not_in_schema)))

Number of conflicts: 0


In [15]:
###### phase 4: transform schema from phase 1 to match JSON Schema draft v7 (we will call it 'new schema') ######

In [16]:
# start the new schema from the phase-1 schema; later cells edit it in place
new_schema = None
with open("../../inputs/mibig_schema_phase_1.json") as json_file:
    new_schema = json.loads(json_file.read()) # pre-load with old schema

In [17]:
# 1: convert 'required' to Json Schema draft 7 style
def fix_required(input_dict):
    """Convert per-property ``"required": True`` flags to draft-7 ``required`` lists, in place.

    For an object, the names of the properties flagged ``required == True``
    are gathered into the object's own ``required`` list (the key is removed
    again when the list is empty).  Every non-object node loses its
    ``required`` key; arrays are additionally followed into ``items``.
    """
    schema_type = input_dict["type"] if "type" in input_dict else None

    if schema_type != "object":
        if schema_type == "array":
            fix_required(input_dict["items"])
        input_dict.pop("required", None)
        return

    required_names = []
    input_dict["required"] = required_names
    for name, child in input_dict["properties"].items():
        # read the child's flag before recursing: the recursion removes/replaces it
        if "required" in child and child["required"] == True:
            required_names.append(name)
        fix_required(child)
    if not required_names:
        del input_dict["required"]
# rewrites new_schema in place (the function returns nothing)
fix_required(new_schema)

In [18]:
# 2: convert 'dependencies' to Json Schema draft 7 style
def fix_dependencies(input_dict):
    """Convert string-valued per-property ``dependencies`` to an object-level mapping, in place.

    For an object, each property whose ``dependencies`` is a string naming a
    sibling property is appended to ``dependencies[<that sibling>]`` on the
    object; an unknown name only prints an error.  The object's
    ``dependencies`` key is removed again when nothing was collected.  Every
    non-object node loses its ``dependencies`` key; arrays are additionally
    followed into ``items``.
    """
    schema_type = input_dict["type"] if "type" in input_dict else None

    if schema_type != "object":
        if schema_type == "array":
            fix_dependencies(input_dict["items"])
        input_dict.pop("dependencies", None)
        return

    collected = {}
    input_dict["dependencies"] = collected
    siblings = input_dict["properties"]
    for name, child in siblings.items():
        # read the child's value before recursing: the recursion removes/replaces it
        target = child["dependencies"] if "dependencies" in child else None
        if isinstance(target, str):
            if target in siblings:
                collected.setdefault(target, []).append(name)
            else:
                print("Error: {} not found".format(target))
        fix_dependencies(child)
    if not collected:
        del input_dict["dependencies"]
# rewrites new_schema in place; prints "Error: ... not found" for unknown dependency names
fix_dependencies(new_schema)

In [19]:
# 3: make sure 'enum' contain unique items, and remove all trailing white spaces
def fix_enum(input_dict):
    """Clean every ``enum`` list in the schema, in place.

    String members are stripped of surrounding whitespace, then duplicates are
    removed while keeping the first-seen order, so the generated schema is
    identical from run to run (``list(set(...))`` reordered the values
    arbitrarily).  Non-string members (e.g. integers) are kept as they are
    instead of raising AttributeError.  Members must be hashable.
    """
    if "type" in input_dict and input_dict["type"] == "object":
        for child in input_dict["properties"].values():
            fix_enum(child)
    elif "type" in input_dict and input_dict["type"] == "array":
        fix_enum(input_dict["items"])

    if "enum" in input_dict:
        stripped = [item.strip() if isinstance(item, str) else item for item in input_dict["enum"]]
        # dict.fromkeys de-duplicates and preserves insertion order
        input_dict["enum"] = list(dict.fromkeys(stripped))
# rewrites the enum lists of new_schema in place
fix_enum(new_schema)

In [20]:
# 4: manual (but reproducible) curations

In [21]:
# stamp the schema with its draft, version and creation time
now = datetime.datetime.now()
new_schema["$schema"] = "http://json-schema.org/draft-07/schema#"
new_schema["$schema_version"] = "2.0"
new_schema["$schema_created"] = date2iso(now)

top_level_props = new_schema["properties"]

# remove version, replace with changelogs
del top_level_props["version"]
changelog_entry = {
    "type": "object",
    "properties": {
        "version": {
            "type": "string", # or integer?
            "pattern": "^\\d+(\\.\\d+)*$"
        },
        "comment": {
            "type": "string"
        }
    }
}
top_level_props["changelogs"] = {"type": "array", "items": changelog_entry}

# remove "embargo" (we will track it in the database and not in the json schema)
del top_level_props["embargo"]

# require "changelogs", "general_params", "personal"
new_schema["required"] = ["changelogs", "general_params", "personal"]

# require "mibig_accession", "biosyn_class", "compounds", "publications"
general_params_schema = top_level_props["general_params"]
general_params_schema["required"] = ["mibig_accession", "biosyn_class", "compounds", "publications"]

# delete properties we don't need anymore (i.e. ones meant for AlpacaJS forms)
del top_level_props["personal"]["properties"]["submitter_institution"]["format"]
for form_only_keyword in ("default", "description"): # we should add the proper descriptions later
    search_and_delete(form_only_keyword, new_schema)

# rename Polyketide, NRP, RiPP, Terpene, Saccharide, Alkaloid, Other, to lowercases
general_params_props = general_params_schema["properties"]
for class_name in ("Polyketide", "NRP", "RiPP", "Terpene", "Saccharide", "Alkaloid", "Other"):
    rename_key(class_name, class_name.lower(), general_params_props)

# fix publications.pattern (cover pubmed id, (google) patent id, doi, and url)
publication_item = general_params_props["publications"]["items"]
del publication_item["pattern"]
publication_item["oneOf"] = [
    {"pattern": "^pubmed:(\\d+)$"},
    {"pattern": "^doi:10\\.\\d{4,9}/[-\\._;()/:a-zA-Z0-9]+$"},
    {"pattern": "^patent:(.+)$"},
    {"pattern": "^url:https?:\\/\\/(www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\+.~#?&//=]*)$"}
]

In [22]:
### general_params.compounds ###
compound_items = new_schema["properties"]["general_params"]["properties"]["compounds"]["items"]
compound_props = compound_items["properties"]

# remove database_deposited and databases_deposited. we can always infer it from their respective accession ids
for inferable_prop in ("database_deposited", "databases_deposited"):
    del compound_props[inferable_prop]
# delete old dependencies format for compounds
del compound_items["dependencies"]
# required = ["compound"]
compound_items["required"] = ["compound"]
# if chem_act contains "Other", require other_chem_act
compound_items["allOf"] = [
    {
        "if": {
            "properties": {"chem_act": {"contains": {"enum": ["Other"]}}},
            "required": ["chem_act"]
        },
        "then": {
            "required": ["other_chem_act"]
        }
    }
]

# delete old dependencies format for chem_moieties
moiety_items = compound_props["chem_moieties"]["items"]
del moiety_items["dependencies"]
# if chem_moiety == "Other", require other_chem_moiety
moiety_items["allOf"] = [
    {
        "if": {
            "properties": {"chem_moiety": {"enum": ["Other"]}},
            "required": ["chem_moiety"]
        },
        "then": {
            "required": ["other_chem_moiety"]
        }
    }
]

In [23]:
### general_params.loci ###
loci_schema = new_schema["properties"]["general_params"]["properties"]["loci"]
loci_props = loci_schema["properties"]

# this is a big change: make sure that BGC have ONLY 1 loci (move everything from 'nucl_acc' to the 'loci' level)
loci_props.update(loci_props["nucl_acc"]["items"]["properties"])
del loci_props["nucl_acc"]

# change 'Accession' to 'accession'
rename_key("Accession", "accession", loci_props)
# accession pattern -- now also accept 'MIBIG:BGCXXXXXXX': this will cover sequences not in NCBI (we host it ourself)
# The alternatives share one group so that ^ and $ anchor BOTH of them. Previously
# "^A|B$" meant "starts with A" OR "ends with B", which let any string with three
# valid leading characters through.
# NOTE(review): entries that only passed because of the loose anchoring will now fail validation.
loci_props["accession"]["pattern"] = "^([A-Za-z0-9\\._]{3,}|MIBIG:BGC\\d{7})$"

# loci.required = ["complete", "accession", "conn_comp_cluster"]
loci_schema["required"] = ["complete", "accession", "conn_comp_cluster"]


In [24]:
### general_params.genes ###
genes_props = new_schema["properties"]["general_params"]["properties"]["genes"]["properties"]

# operon.required = ["operon_genes", "evidence_operon"]
genes_props["operon"]["items"]["required"] = ["operon_genes", "evidence_operon"]

gene_items = genes_props["gene"]["items"]
# gene.required = ["gene_function"]
gene_items["required"] = ["gene_function"]
# delete old dependencies format, replace with allOf (filled in below)
del gene_items["dependencies"]
gene_items["allOf"] = []

# add gene.strand and gene.aa_seq
gene_items["properties"]["strand"] = {
    "type": "integer",
    "enum": [ -1, 0, 1 ]
}
gene_items["properties"]["aa_seq"] = {
    "type": "string"
}

gene_items["allOf"].extend([
    # if gene_id is not supplied, require gene_name
    {
        "if": {
            "not": {
                "required": ["gene_id"]
            }
        },
        "then": {
            "required": ["gene_name"]
        }
    },
    # if gene_function != Unknown, require evidence_genefunction
    {
        "if": {
            "not": {"properties": {"gene_function": {"enum": ["Unknown"]}}},
            "required": ["gene_function"]
        },
        "then": {
            "required": ["evidence_genefunction"]
        }
    },
    # if gene_function == Tailoring, require tailoring
    {
        "if": {
            "properties": {"gene_function": {"enum": ["Tailoring"]}},
            "required": ["gene_function"]
        },
        "then": {
            "required": ["tailoring"]
        }
    }
])


In [25]:
### general_params.polyketide ###
polyketide_schema = new_schema["properties"]["general_params"]["properties"]["polyketide"]

# for now, just remove all dependencies and requirements
for obsolete_keyword in ("required", "dependencies"):
    search_and_delete(obsolete_keyword, polyketide_schema)

pks_module_props = polyketide_schema["properties"]["mod_pks_genes"]["items"]["properties"]["pks_module"]["items"]["properties"]

# add evidence_at_spec = "None"
pks_module_props["evidence_at_spec"]["enum"].append("None")

# update pks_domains enum
# These should be allowed:
# "KS", "SAT", "AT", "CAL", "DH", "KR", "ER", "T", "C", "A", "E", "MT", "TE", "TD", "ST", "PT", "PPT"
# NOTE(review): "Adenlyation" and "Thioeter reductase" look like misspellings of
# "Adenylation" / "Thioester reductase", and "Thiol reductase" is tagged #PPT a second
# time -- confirm against the data conversion before changing any of these values.
pks_module_props["pks_domains"]["items"]["enum"] = [
    "Ketosynthase", #KS
    "ACP transacylase", #SAT
    "Acyltransferase", #AT
    "CoA-ligase", #CAL
    "Dehydratase", #DH
    "Ketoreductase", #KR
    "Enoylreductase", #ER
    "Thiolation (ACP/PCP)", #T
    "Condensation", #C
    "Adenlyation", #A
    "Epimerization", #E
    "Methyltransferase", #MT
    "Thioesterase", #TE
    "Thioeter reductase", #TD
    "Sulfotransferase", #ST
    "Product Template domain", #PT
    "Phosphopantetheinyl transferase", #PPT
    "Thiol reductase" #PPT
]

if False: #TODO: fix Polyketide schema (requirements, dependencies)
    # disabled draft of the polyketide requirements
    polyketide_schema["required"] = ["pk_subclass", "pks_subclass", "pks_te_type", "lin_cycl_pk", "starter_unit", "ketide_length"]

    # delete old dependencies format, replace with allOf
    # NOTE(review): "dependencies" was already removed by search_and_delete above,
    # so this del would raise KeyError if the block were enabled as is.
    del polyketide_schema["dependencies"]
    polyketide_schema["allOf"] = [
        {
            "if": {
                "properties": {"pks_subclass": {"enum": ["Modular type I"]}},
                "required": ["pks_subclass"]
            },
            "then": {
                "required": ["mod_pks_genes"],
                "properties": {"mod_pks_genes": {"minItems": 1}}
            }
        },
        {
            "if": {
                "properties": {"pks_subclass": {"enum": ["Trans-AT type I"]}},
                "required": ["pks_subclass"]
            },
            "then": {
                "required": ["trans_at"],
                "properties": {"trans_at": {"minItems": 1}}
            }
        },
        {
            "if": {
                "properties": {"pks_subclass": {"enum": ["Iterative type I"]}},
                "required": ["pks_subclass"]
            },
            "then": {
                "required": ["iterative_subtype", "nr_iterations", "iter_cycl_type"]
            }
        },
        {
            "if": {
                "properties": {"pks_subclass": {"enum": ["PUFA synthase or related"]}},
                "required": ["pks_subclass"]
            },
            "then": {
                "required": ["pufa_mod_doms"],
                "properties": {"pufa_mod_doms": {"minItems": 1}}
            }
        },
        {
            "if": {
                "not": { "properties": {"pks_te_type": {"enum": ["None"]}} },
                "required": ["pks_te_type"]
            },
            "then": {
                "required": ["pks_thioesterase"],
                "properties": {"pks_thioesterase": {"minItems": 1}}
            }
        }
    ]
# --- todo: assert dependencies of pks_subtype --> requirements
# --- todo: apply dependencies for mod_pks_genes

In [26]:
### general_params.nrp ###
nrp_schema = new_schema["properties"]["general_params"]["properties"]["nrp"]

# for now, just remove all dependencies and requirements
for obsolete_keyword in ("required", "dependencies"):
    search_and_delete(obsolete_keyword, nrp_schema)

if False: #TODO: fix NRP schema (requirements, dependencies)
    # delete old dependencies format, replace with allOf
    # NOTE(review): "dependencies" was already removed by search_and_delete above,
    # so this del would raise KeyError if the block were enabled as is.
    del nrp_schema["dependencies"]
    nrp_schema["allOf"] = [
        {
            "if": {
                "not": { "properties": {"nrps_te_type": {"enum": ["None"]}} },
                "required": ["nrps_te_type"]
            },
            "then": {
                "required": ["nrps_thioesterase"],
                "properties": {"nrps_thioesterase": {"minItems": 1}}
            }
        },
        {
            "if": {
                "properties": {"subclass": {"enum": ["Other lipopeptide"]}},
                "required": ["subclass"]
            },
            "then": {
                "required": ["lipid_moiety"]
            }
        }
    ]

    nrps_module_items = nrp_schema["properties"]["nrps_genes"]["items"]["properties"]["nrps_module"]["items"]
    # nrps_module -- required = ["module_nr", "cdom_subtype", "nrps_mod_doms"]
    nrps_module_items["required"] = ["module_nr", "cdom_subtype", "nrps_mod_doms"]
    # nrps_module -- delete old dependencies, replace with allOf
    del nrps_module_items["dependencies"]
    nrps_module_items["allOf"] = [
        {
            "if": {
                "properties": {"nrps_mod_doms": {"enum": ["Other"]}},
                "required": ["nrps_mod_doms"]
            },
            "then": {
                "required": ["nrps_other_mod_dom"]
            }
        },
        {
            "if": {
                "not": {"properties": {"nrps_mod_skip_iter": {"enum": ["Neither"]}}},
                "required": ["nrps_mod_skip_iter"]
            },
            "then": {
                "required": ["nrps_evidence_skip_iter"]
            }
        }
    ]

## todo -- assert dependencies

In [27]:
### general_params.saccharide ###
saccharide_props = new_schema["properties"]["general_params"]["properties"]["saccharide"]["properties"]
gt_gene_items = saccharide_props["gt_genes"]["items"]

# delete old dependencies format, replace with allOf
del gt_gene_items["dependencies"]
gt_gene_items["allOf"] = [
    # if gt_specificity == Other, require other_gt_spec
    {
        "if": {
            "properties": {"gt_specificity": {"enum": ["Other"]}},
            "required": ["gt_specificity"]
        },
        "then": {
            "required": ["other_gt_spec"]
        }
    },
    # if gt_specificity != Unknown, require evidence_gt_spec
    {
        "if": {
            "not": { "properties": {"gt_specificity": {"enum": ["Unknown"]}} },
            "required": ["gt_specificity"]
        },
        "then": {
            "required": ["evidence_gt_spec"]
        }
    }
]

# move 'gt_genes.sugar_subcluster' to sugar_subclusters, we can infer the specificities from the list of gene ids
saccharide_props["sugar_subclusters"] = {
    "title": "Sub-clusters for sugar biosynthesis",
    "type": "array",
    # the old per-gene definition becomes the item definition of the new list
    "items": gt_gene_items["properties"].pop("sugar_subcluster")
}


In [28]:
### for all 'required' fields, if it is an array, specify 'minItems = 1' so that it can't be empty
def apply_array_required(input_dict):
    """Give every required array property ``minItems = 1``, recursively and in place.

    For a schema node that has ``properties``: each name listed in its
    ``required`` list whose property schema has ``"type": "array"`` and no
    ``minItems`` yet gets ``minItems = 1``; every property schema is then
    visited recursively. A node without ``properties`` but with ``items``
    is followed into ``items``.

    Names in ``required`` that have no entry under ``properties`` are
    skipped (JSON Schema allows that), and nodes that are not dicts
    (e.g. boolean schemas or a list of item schemas) are left untouched.

    Args:
        input_dict: a schema node (dict); modified in place.

    Returns:
        None.
    """
    if not isinstance(input_dict, dict):
        return
    if "properties" in input_dict:
        properties = input_dict["properties"]
        for req in input_dict.get("required", []):
            prop_schema = properties.get(req)
            # 'required' may name a key that is not described under 'properties'
            if not isinstance(prop_schema, dict):
                continue
            if prop_schema.get("type") == "array" and "minItems" not in prop_schema:
                prop_schema["minItems"] = 1
        for key in properties:
            apply_array_required(properties[key])
    elif "items" in input_dict:
        apply_array_required(input_dict["items"])
                                
# run over the whole draft-7 schema; the helper edits new_schema in place and returns nothing
apply_array_required(new_schema)

### for all 'type' = 'string', if no 'pattern' or 'format' specified, specify 'minLength' = 1
def apply_string_minlength(input_dict):
    """Give unconstrained string schemas ``minLength = 1``, recursively and in place.

    A node with ``"type": "string"`` that has none of ``pattern``, ``format``
    or ``minLength`` gets ``minLength = 1`` so that empty strings are rejected.
    The check is made on every visited node, so it reaches string properties
    and string array items (which carry neither ``properties`` nor ``items``).
    Traversal follows ``properties`` when present, otherwise ``items``.

    Args:
        input_dict: a schema node (dict); modified in place. Non-dict nodes
            are ignored.

    Returns:
        None.
    """
    if not isinstance(input_dict, dict):
        return
    if input_dict.get("type") == "string":
        if "pattern" not in input_dict and "format" not in input_dict and "minLength" not in input_dict:
            input_dict["minLength"] = 1
    if "properties" in input_dict:
        for key in input_dict["properties"]:
            apply_string_minlength(input_dict["properties"][key])
    elif "items" in input_dict:
        apply_string_minlength(input_dict["items"])
        
# run over the whole draft-7 schema; the helper edits new_schema in place and returns nothing
apply_string_minlength(new_schema)

In [29]:
# if class = "Polyketide", requires "polyketide" attribute, etc. except if minimal = true
general_params_schema = new_schema["properties"]["general_params"]

# the old 'dependencies' keyword goes away; one if/then rule per class is collected under allOf
del general_params_schema["dependencies"]
general_params_schema["allOf"] = []

# (value expected inside biosyn_class, name of the class-specific attribute)
prop_attr_pairs = [
    ("NRP", "nrp"),
    ("Polyketide", "polyketide"),
    ("RiPP", "ripp"),
    ("Terpene", "terpene"),
    ("Saccharide", "saccharide"),
    ("Alkaloid", "alkaloid"),
    ("Other", "other")
]
for class_label, class_attr in prop_attr_pairs:
    consequence = { "required": [class_attr] }
    class_schema = general_params_schema["properties"][class_attr]
    if "required" in class_schema:
        # the class-specific 'required' list is moved out of the sub-schema into the
        # 'then' part, so it only applies when the rule's 'if' matches
        consequence["properties"] = {class_attr: {"required": class_schema.pop("required")}}
    general_params_schema["allOf"].append({
        "if": {
            # rule is switched off when 'minimal' is present and true
            "not": {"properties": {"minimal": {"const": True}}, "required": ["minimal"]},
            "properties": {"biosyn_class": {"contains":{"enum": [class_label]}}}
          },
          "then": consequence
    })

In [30]:
# 5: save new schema
# (serialised straight into the file handle, 4-space indented)
with open("../../preprocessed/p4-mibig_schema_draft7.json", "w") as schema_out:
    json.dump(new_schema, schema_out, indent=4, separators=(',', ': '))

In [None]:
#### phase 5: transform data from phase 3 to match the new schema (include all dependencies/required keywords) ######

In [None]:
# 1: check data vs new schema to get quick overview of changed structures
# use all_props from phase 3
with open("../../preprocessed/p4-mibig_schema_draft7.json") as json_file:
    json_obj = json.load(json_file)
schema_props = fetch_props_new_schema(json_obj, "", {})

# report 1: every property path found in the draft-7 schema, one per line
# (only the path is written; the values held in schema_props are not part of this file)
with open("../../preprocessed/reports/p5-schema_draft7_properties.csv", "w") as o:
    o.writelines("{}\n".format(prop_path) for prop_path in sorted(schema_props.keys()))
    print("File written: {}".format(o.name))

# property paths present in the phase-3 data but absent from the schema,
# each paired with the list stored for it in all_props_phase_3
not_in_schema = [
    (prop_path, all_props_phase_3[prop_path])
    for prop_path in sorted(all_props_phase_3.keys())
    if prop_path not in schema_props
]

# report 2: unknown path and the length of its list, longest lists first
with open("../../preprocessed/reports/p5-data_phase_3_vs_schema_draft7.csv", "w") as o:
    for prop_path, users in sorted(not_in_schema, key=lambda x: len(x[1]), reverse = True):
        o.write("{},{}\n".format(prop_path, len(users)))
    print("File written: {}".format(o.name))

# report 3: the same information turned around -- per list member, the unknown paths it appears under
with open("../../preprocessed/reports/p5-bgc_to_fix.csv", "w") as o:
    bgc_to_fix = {}
    for prop_path, users in not_in_schema:
        for bgc in users:
            bgc_to_fix.setdefault(bgc, []).append(prop_path)
    for bgc, prop_paths in bgc_to_fix.items():
        o.write("{},{}\n".format(bgc, ";".join(prop_paths)))
    print("File written: {}".format(o.name))

In [None]:
# 2: fix data and assert that there is no more unrecognized attributes present
def match_attributes_to_schema_7(data):
    """Rename, move and drop attributes of one entry, in place.

    Steps, in order:
      * 'embargo' and 'version' are passed to del_key on the root object;
      * every item of general_params/loci/nucl_acc gets 'Accession' passed to
        rename_key as 'accession', and its 'conn_comp_cluster' values are
        collected (first occurrence kept, order preserved) and stored once
        under general_params/loci/conn_comp_cluster when any were found;
      * the class attributes (Polyketide, NRP, ...) are passed to rename_key
        with their lowercase names;
      * 'database_deposited' / 'databases_deposited' are removed from every
        compound;
      * each gt_genes[]/sugar_subcluster under saccharide is moved into the
        list saccharide/sugar_subclusters (one element per gene that had it).

    Args:
        data: the entry (dict); must contain general_params with loci/nucl_acc
            and compounds.

    Returns:
        None.
    """
    # remove 'embargo' and 'version'
    for obsolete_key in ("embargo", "version"):
        del_key(obsolete_key, data)

    general_params = data["general_params"]

    merged_conn_comp = []
    for nucl_acc in general_params["loci"]["nucl_acc"]:
        # rename Accession to accession
        rename_key("Accession", "accession", nucl_acc)
        if "conn_comp_cluster" not in nucl_acc:
            continue
        for evidence in nucl_acc["conn_comp_cluster"]:
            if evidence not in merged_conn_comp:
                merged_conn_comp.append(evidence)
        del nucl_acc["conn_comp_cluster"]
    # fix /general_params/loci/nucl_acc[]/conn_comp_cluster[]
    if merged_conn_comp:
        general_params["loci"]["conn_comp_cluster"] = merged_conn_comp

    # rename Polyketide, NRP, etc. to its lowercase version
    for class_key in ("Polyketide", "NRP", "RiPP", "Terpene", "Saccharide", "Alkaloid", "Other"):
        rename_key(class_key, class_key.lower(), general_params)

    # fix /general_params/compounds[]/database_deposited and .../databases_deposited[]
    for compound in general_params["compounds"]:
        for obsolete_key in ("database_deposited", "databases_deposited"):
            if obsolete_key in compound:
                del compound[obsolete_key]

    # fix /general_params/saccharide/gt_genes[]/sugar_subcluster[]
    if "saccharide" in general_params:
        saccharide = general_params["saccharide"]
        sugar_subclusters = []
        if "gt_genes" in saccharide:
            for gt_gene in saccharide["gt_genes"]:
                if "sugar_subcluster" in gt_gene:
                    sugar_subclusters.append(gt_gene["sugar_subcluster"])
                    del gt_gene["sugar_subcluster"]
        if sugar_subclusters:
            saccharide["sugar_subclusters"] = sugar_subclusters
    return

p5_json_dir = "../../preprocessed/p5-json/"
if not path.exists(p5_json_dir):
    makedirs(p5_json_dir)

# rewrite every phase-3 entry with match_attributes_to_schema_7 and store it under p5-json/
for json_path in sorted(fetch_mibig_json_filepaths("../../preprocessed/p3-json/")):
    with open(json_path, "r") as source_file:
        json_obj = json.load(source_file)
        match_attributes_to_schema_7(json_obj)
    with open(path.join(p5_json_dir, path.basename(json_path)), "w") as target_file:
        json.dump(json_obj, target_file, indent=4, separators=(',', ': '))

# property path -> names of the p5 files in which that path occurs
all_props_phase_5 = {}
for json_path in sorted(fetch_mibig_json_filepaths(p5_json_dir)):
    with open(json_path, "r") as json_file:
        json_obj = json.load(json_file)
    file_name = path.basename(json_path)
    for prop in count_props(json_obj, "", {}):
        all_props_phase_5.setdefault(prop, []).append(file_name)

# list every path still used by the data that the draft-7 schema does not define
with open("../../preprocessed/p4-mibig_schema_draft7.json") as json_file:
    json_obj = json.load(json_file)
schema_props = fetch_props_new_schema(json_obj, "", {})
not_in_schema = []
for key in sorted(all_props_phase_5.keys()):
    if key in schema_props:
        continue
    not_in_schema.append(key)
    print("{},{}".format(key, len(all_props_phase_5[key])))
print("Number of conflicts: {}".format(len(not_in_schema)))

In [None]:
# 3: validate data using JSON Schema V7 validator
with open("../../preprocessed/p4-mibig_schema_draft7.json") as json_file:
    schema_obj = json.load(json_file)
validator = Draft7Validator(schema_obj)

# errors:            schema path of the violated keyword -> {"files": [...], "instances": [...]}
# errors_by_message: validator message -> files that produced it
errors = {}
errors_by_message = {}
# every file is validated: the loop body must run for the two reports below to have content
for json_path in sorted(fetch_mibig_json_filepaths("../../preprocessed/p5-json/")):
    bgc_id = path.basename(json_path)
    print("Validating {}...".format(bgc_id))
    with open(json_path, "r") as data_file:
        data = json.load(data_file)
    for error in sorted(validator.iter_errors(data), key=str):
        # error by path
        path_err = ".".join([str(i) for i in list(error.schema_path)])
        # only scalar offending values are recorded; dicts and lists are left as ""
        inst_err = ""
        if not isinstance(error.instance, (dict, list)):
            inst_err = str(error.instance)
        by_path = errors.setdefault(path_err, {"files": [], "instances": []})
        if bgc_id not in by_path["files"]:
            by_path["files"].append(bgc_id)
        if inst_err not in by_path["instances"]:
            by_path["instances"].append(inst_err)
        # error by message
        files_for_message = errors_by_message.setdefault(error.message, [])
        if bgc_id not in files_for_message:
            files_for_message.append(bgc_id)

# columns: schema path, last path element (the keyword), number of files, ';'-joined offending values
with open("../../preprocessed/reports/p5-errors.tsv", "w") as error_list:
    for error in sorted(errors.keys(), reverse = True):
        error_list.write("{}\t{}\t{}\t{}\n".format(error, error.split(".")[-1], len(errors[error]["files"]), ";".join(errors[error]["instances"])))

# columns: validator message, number of files
with open("../../preprocessed/reports/p5-errors_by_message.tsv", "w") as error_list:
    for error in sorted(errors_by_message.keys(), reverse = True):
        error_list.write("{}\t{}\n".format(error, len(errors_by_message[error])))
            

In [None]:
# track validated files so that we don't need to rerun them
# NOTE(review): none of the cells shown here reads last_error; presumably a later cell uses it as the
# position to resume from -- confirm before relying on it
last_error = 0

In [None]:
# start from empty report files (opening with "w" truncates whatever an earlier run left behind);
# fix_data_new_schema appends to the first two
for report_path in (
    # file to store list of accessions without publications
    "../../preprocessed/reports/p5-accession_without_publications.tsv",
    # track genes not in gbk, but without any accession nor location information
    "../../preprocessed/reports/p5-accession_with_lost_genes.tsv",
    # track mibig data without accessions
    "../../preprocessed/reports/p5-accession_with_lost_nucl_accs.tsv",
):
    with open(report_path, "w") as acc_list:
        acc_list.write("")
    
import re
# open excel list of publications
def fetch_publications(xls_file):
    """Read the accession/publication spreadsheet into a dict.

    The first sheet of the workbook is read from its second row on (row
    index 1), stopping at the first row whose first column is missing or
    empty. Column 0 supplies the key; column 2, when present and not empty,
    is split on ';' into the list of publications.

    Args:
        xls_file: path of the workbook, handed to xlrd.open_workbook.

    Returns:
        dict mapping the column-0 value to a tuple ``(pubs, new_accs)``;
        ``new_accs`` is always an empty list here.
    """
    import xlrd  # local import: no other visible code uses xlrd
    worksheet = xlrd.open_workbook(xls_file).sheet_by_index(0)

    def value_exist(i, j):
        """Return True when cell (row i, column j) exists and is not empty."""
        try:
            return worksheet.cell(i, j).value != xlrd.empty_cell.value
        except IndexError:
            # row or column index lies outside the sheet
            return False

    pubs_and_new_accs = {}
    i = 1
    while value_exist(i, 0):
        mibig_acc = worksheet.cell(i, 0).value
        pubs = []
        # same existence probe as for column 0, so a sheet without a third column yields no publications
        if value_exist(i, 2):
            pubs = worksheet.cell(i, 2).value.split(";")
        new_accs = []
        pubs_and_new_accs[mibig_acc] = (pubs, new_accs)
        i += 1
    return pubs_and_new_accs
# NOTE(review): the spreadsheet lookup is switched off (call left commented out); with this empty dict
# update_pub_and_accs never finds an accession in it and always cleans the entry's own publications
pubs_and_new_accs = {}#fetch_publications("../../docs/accession_and_publications.xls")

def update_pub_and_accs(data, pubs_and_new_accs):
    """Normalise ``data["general_params"]["publications"]`` in place.

    The raw references come from ``pubs_and_new_accs[mibig_accession][0]``
    when the entry's accession is a key of that dict; otherwise from the
    entry's own ``publications`` list (missing -> empty), where each item is
    split on ';' and ',', stripped of surrounding whitespace/single quotes
    and inner spaces, and relieved of a leading "doi:", "PMC" or "doi.org/".
    Empty pieces, "-" and "unpublished" are discarded.

    Each raw reference is then prefixed by kind:
      digits only            -> "pubmed:<ref>"
      DOI-shaped ("10.x/..") -> "doi:<ref>"
      capitals/digits only   -> "patent:<ref>"
      http(s) URL            -> "url:<ref>"
    References matching none of these are dropped.

    Args:
        data: the entry (dict) with general_params/mibig_accession.
        pubs_and_new_accs: dict accession -> (publications, new accessions);
            only the publications part is used.

    Returns:
        None; the entry's publication list is replaced.
    """
    general_params = data["general_params"]
    mibig_acc = general_params["mibig_accession"]

    if mibig_acc in pubs_and_new_accs:
        pubs, _new_accs = pubs_and_new_accs[mibig_acc]
        raw_pubs = list(pubs)
    else:
        raw_pubs = []
        for pub in general_params.get("publications", []):
            for chunk in pub.split(";"):
                for ref in chunk.split(","):
                    ref = ref.strip().strip("'").replace(" ", "")
                    # drop a known prefix and nothing else: the rest of the reference,
                    # including its last character, is kept
                    # NOTE(review): a "PMC" number ends up all-digits and is labelled "pubmed:" below;
                    # confirm that is intended
                    for prefix in ("doi:", "PMC", "doi.org/"):
                        if ref.startswith(prefix):
                            ref = ref[len(prefix):]
                    if ref in ("", "-", "unpublished"):
                        continue
                    raw_pubs.append(ref)

    new_pub_list = []
    for pub in raw_pubs:
        if re.match("^(\\d+)$", pub):
            new_pub_list.append("pubmed:{}".format(pub))
        elif re.match("^10\\.\\d{4,9}/[-\\._;()/:a-zA-Z0-9]+$", pub):
            new_pub_list.append("doi:{}".format(pub))
        elif re.match("^([A-Z0-9]+)$", pub):
            new_pub_list.append("patent:{}".format(pub))
        elif re.match("^https?:\\/\\/(www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\+.~#?&//=]*)$", pub):
            new_pub_list.append("url:{}".format(pub))

    general_params["publications"] = new_pub_list

    
def update_positional_offset(data): # check which offset (zero-based? one-based) this entry used, then adjust to genbank style
    """Placeholder for the offset adjustment: currently does nothing to ``data`` and returns None."""
    return None

In [None]:
# start with an empty list of entries that got a MIBiG accession instead of an NCBI one
# (update_ncbi_accs appends to this file)
with open("../../preprocessed/reports/p5-non_ncbi_accessions.tsv", "w") as non_ncbi_report:
    non_ncbi_report.write("")

import urllib.request
def update_ncbi_accs(data):
    """Rebuild ``data["general_params"]["loci"]["nucl_acc"]`` from the entry's final.gbk.

    mibig old json data is "a bit" messed up, e.g. many BGCs semi-predicted by
    antiSMASH didn't retain their positional information (look for the ones
    with startpos == -1 and endpos == -1), and some BGCs didn't have any NCBI
    accession available; therefore the data is re-fetched from the final.gbk.
    todo: check for MIxS compliance

    The gbk is downloaded from mibig.secondarymetabolites.org and searched
    (with spaces and line breaks removed) for "GenBank ID <accession>."
    mentions, optionally preceded by "the region between <start> - <end> nt
    from". Each mention becomes ``{"accession": ...}`` plus ``start_coord`` /
    ``end_coord`` when the region is given.

    When no such mention exists, every "VERSION <mibig id>.<n>" found yields
    ``{"accession": "MIBIG:<mibig id>.<n>"}`` and a line "<mibig id>.<n>" is
    appended to reports/p5-non_ncbi_accessions.tsv. If the download fails, or
    for BGC0000299, the text "VERSION<mibig id>.1" stands in for the gbk, so
    the result is the single accession "MIBIG:<mibig id>.1".

    Args:
        data: the entry (dict) with general_params/mibig_accession and
            general_params/loci.

    Returns:
        None; nucl_acc is replaced (possibly by an empty list).
    """
    mibig_id = data["general_params"]["mibig_accession"]
    fallback_gbk = "VERSION{}.1".format(mibig_id)
    whole_gbk = fallback_gbk
    # Alterochromides (BGC0000299): ncbi accession is updated, so the download is skipped
    # and the mibig accession is used directly
    if mibig_id != "BGC0000299":
        try:
            with urllib.request.urlopen("https://mibig.secondarymetabolites.org/repository/{}/{}.1.final.gbk".format(mibig_id, mibig_id)) as response:
                # spaces and line breaks are removed so that mentions wrapped over several lines still match
                whole_gbk = response.read().decode("utf-8", errors="replace").replace(" ", "").replace("\n", "")
        except Exception:
            # best effort: any download problem falls back to the mibig accession
            whole_gbk = fallback_gbk

    new_accessions = []
    match_gbks = re.findall(r"(theregionbetween(\d+)-(\d+)ntfrom){0,1}GenBankID([A-Z0-9_\.]+)\.", whole_gbk)
    if len(match_gbks) > 0:
        for region, start_coord, end_coord, accession in match_gbks:
            loci = { "accession": accession }
            if len(region) > 0:
                loci["start_coord"] = int(start_coord)
                loci["end_coord"] = int(end_coord)
            new_accessions.append(loci)
    else: # use mibig gbk
        # (\d+) captures the whole version number, not only its last digit
        for version in re.findall(r"VERSION{}\.(\d+)".format(mibig_id), whole_gbk):
            loci = { "accession": "MIBIG:{}.{}".format(mibig_id, version) }
            with open("../../preprocessed/reports/p5-non_ncbi_accessions.tsv", "a") as acc_list:
                acc_list.write("{}.{}\n".format(mibig_id, version))
            new_accessions.append(loci)
    data["general_params"]["loci"]["nucl_acc"] = new_accessions
            

In [None]:
# start with an empty multi-loci report (opening with "w" truncates an existing file)
with open("../../preprocessed/reports/p5-multi_loci_bgcs.tsv", "w") as multi_loci_report:
    multi_loci_report.write("")

In [None]:
# 4: fix conflicts, then save to final output folder
def fix_data_new_schema(data, error):
    if len(error.path) < 1:
        # problem is in root, need a separate approach
        if error.validator == "required":
            missing_keys = set(error.validator_value) - set(data.keys())
            if "personal" in missing_keys:
                data["personal"] = {
                    "submitter_name": "mibig.secondarymetabolites.org",
                    "submitter_institution": "MIBiG",
                    "submitter_email": "info@mibig.secondarymetabolites.org"
                }
            if "changelogs" in missing_keys:
                data["changelogs"] = [{
                    "version": "2.0",
                    "comment": "Submitted"
                }]
    else:        
        # get problematic parent instance from data (so that we can fix it)
        error_container = data
        error_container_parent = None # for catching grandparent
        error_container_attribute = None
        while len(error.path) > 1:
            if len(error.path) == 2:
                error_container_parent = error_container
            error_container_attribute = error.path.popleft()
            error_container = error_container[error_container_attribute] # parent node containing the error instance
        error_attribute = error.path.popleft() # attribute from parent node containing the error instance

        if isinstance(error_container, ToDelete):
            return
        elif isinstance(error_container[error_attribute], ToDelete):
            return
        
        # function to replace attribute values
        def replace_attr(attr_pairs):
            for attr_from, attr_to in attr_pairs:
                if error.instance == attr_from:
                    error_container[error_attribute] = attr_to
                    return True
            return False
        
        # fix type errors (should be generalizable)
        if error.validator == "type":
            if error.validator_value == "integer":
                try:
                    error_container[error_attribute] = int(error.instance)
                    return
                except:
                    # may need tailored fix
                    pass
            elif error.validator_value == "number":
                try:
                    error_container[error_attribute] = float(error.instance)
                    return
                except:
                    # may need tailored fix
                    pass
            elif error.validator_value == "string":
                # fix mut_pheno, error list shown that these are None/null values, delete the attribute instead
                if error_attribute == "mut_pheno":
                    del error_container[error_attribute]
                # fix gene_annotation, name, id
                elif error_attribute == "gene_id":
                    if len(set(["gene_name"]).intersection(error_container.keys())) == 1:
                        # it is a custom annotation
                        del error_container[error_attribute]
                    else:
                        mibig_acc = data["general_params"]["mibig_accession"]
                        with open("../../preprocessed/reports/p5-accession_with_lost_genes.tsv", "a") as acc_list:
                            acc_list.write("1\t{}\t{}\n".format(mibig_acc, ", ".join(["{}:{}".format(k, v) for k, v in error_container_parent[error_container_attribute].items() if isinstance(v, str)])))
                        # for now, delete the gene (?)
                        error_container_parent[error_container_attribute] = ToDelete()
                elif error_attribute == "gene_annotation":
                    del error_container[error_attribute]                    
                elif error_attribute == "gene_name":
                    if "gene_id" in error_container:
                        del error_container[error_attribute]
                    else:
                        mibig_acc = data["general_params"]["mibig_accession"]
                        with open("../../preprocessed/reports/p5-accession_with_lost_genes.tsv", "a") as acc_list:
                            acc_list.write("3\t{}\t{}\n".format(mibig_acc, ", ".join(["{}:{}".format(k, v) for k, v in error_container_parent[error_container_attribute].items() if isinstance(v, str)])))
                        # for now, delete the gene (?)
                        error_container_parent[error_container_attribute] = ToDelete()
                elif error_attribute == "gene_function":
                    error_container[error_attribute] = "Unknown"
                    if "evidence_genefunction" in error_container:
                        error_container["evidence_genefunction"] = ToDelete()
                # fix comments, gene_comments
                elif error_attribute in ["comments", "gene_comments"]:
                    error_container[error_attribute] = ""

        # fix minimum errors (generalizable, but needs to be careful)
        elif error.validator == "minimum":
            # fix gene.startpos
            if error_attribute == "gene_startpos":
                error_container[error_attribute] = ToDelete()
                if "gene_endpos" in error_container:
                    error_container["gene_endpos"] = ToDelete()
            # fix gene.endpos
            if error_attribute == "gene_endpos":
                error_container[error_attribute] = ToDelete()
                if "gene_startpos" in error_container:
                    error_container["gene_startpos"] = ToDelete()

        # fix minItems errors (generalizable but limited e.g. need to consider 'required')
        elif error.validator == "minItems":
            # fix operon_genes = [], then delete operon
            if error_attribute == "operon_genes":
                del data["general_params"]["genes"]["operon"]
            # fix ripp..gene_id = []
            elif error_attribute == "gene_id":
                del data["general_params"]["ripp"]["precursor_loci"]
                data["general_params"]["minimal"] = True
            # fix publications
            elif error_attribute == "publications":
                mibig_acc = data["general_params"]["mibig_accession"]
                print("Missing publication: {}".format(mibig_acc))
                with open("../../preprocessed/reports/p5-accession_without_publications.tsv", "a") as acc_list:
                    acc_list.write("{}\t{}\n".format(mibig_acc, data["general_params"]["compounds"][0]["compound"]))
                # for now, fill with empty string (let's tackle other issues first)
                error_container[error_attribute] = ["pubmed:0"]

                
        # fix enum errors (semi-generalizable, needs to know what to put in place of the wrong value)
        elif error.validator == "enum":
            # fix biosyn_class
            if error_container_attribute == "biosyn_class":
                error_container[error_attribute] = "Other"
                if "other" not in data["general_params"]:
                    data["general_params"]["other"] = {}
                data["general_params"]["other"]["other_subclass"] = error.instance
            # fix genes..gene_function
            if error_attribute == "gene_function":
                attr_pairs = [
                    ("", "Unknown"),
                    ("None", "Unknown"),
                    ("Scaffold Biosynthesis", "Scaffold biosynthesis")
                ]
                if not replace_attr(attr_pairs):
                    error_container[error_attribute] = "Unknown"
                    pass
                if error_container[error_attribute] == "Unknown":       
                    if "evidence_genefunction" in error_container:
                        error_container["evidence_genefunction"] = ToDelete()
            # fix genes..tailoring
            if error_attribute == "tailoring":
                if error_container.get("gene_function") == "Tailoring":
                    attr_pairs = [
                        ("None", "Unknown"),
                    ]
                    if not replace_attr(attr_pairs):
                        # let it be, we will update the schema instead
                        pass
                else: # we don't need "tailoring"
                    error_container[error_attribute] = ToDelete()
            # fix genes..evidence_genefunction
            if error_container_attribute == "evidence_genefunction":
                evidences = []
                errorinstance = error.instance.replace(" ", "").replace("-", "").lower()
                if "activityassay" in errorinstance:
                    evidences.append("Activity assay")
                if "knockout" in errorinstance:
                    evidences.append("Knock-out")
                if "invivo" in errorinstance:
                    evidences.append("Other in vivo study")
                if "sequence" in errorinstance:
                    evidences.append("Sequence-based prediction")
                if "expression" in errorinstance:
                    evidences.append("Heterologous expression")
                if len(evidences) < 1:
                    evidences.append(error.instance) # let it be, we will update the schema instead
                error_container[error_attribute] = evidences[0]
                if len(evidences) > 1:
                    error_container.extend(evidences)
            # fix loci..complete
            elif error_attribute == "complete":
                if error.instance == "partial":
                    error_container[error_attribute] = "incomplete"
            # fix loci..conn_comp_cluster
            elif error_container_attribute == "conn_comp_cluster":
                attr_pairs = [
                    ("Proven expression in natural host", "Gene expression correlated with compound production"),
                    ("Knock-outstudies", "Knock-out studies")
                ]
                if not replace_attr(attr_pairs):
                    pass
            # fix compounds..chem_act
            elif error_container_attribute == "chem_act":
                attr_pairs = [
                    ("", "Unknown")
                ]
                if not replace_attr(attr_pairs):
                    error_container[error_attribute] = "Other"
                    error_container_parent["other_chem_act"] = error.instance # later on, we will update this
                    pass
            # fix compounds..chem_moiety
            elif error_attribute == "chem_moiety":
                attr_pairs = []
                if not replace_attr(attr_pairs):
                    error_container[error_attribute] = "Other"
                    error_container["other_chem_moiety"] = error.instance # later on, we will update this
                    pass                
            # fix compounds.mass_ion_type
            elif error_attribute == "mass_ion_type":
                # let it be, we will update the schema instead
                pass
            # fix polyketide.pks_subclass
            elif error_container_attribute == "pks_subclass":
                attr_pairs = [
                    ("Type I", "Modular type I"), # (aculeximycin)
                    ("Iterative typeI", "Iterative type I"),
                    ("Modular Type I", "Modular type I")
                ]
                if not replace_attr(attr_pairs):
                    # should be empty
                    pass
            # fix polyketide.starter_unit
            elif error_attribute == "starter_unit":
                if error.instance == "None":
                    error_container[error_attribute] = ToDelete()
                # else, we will keep for schema update
            # fix polyketide.pks_te_type
            elif error_attribute == "pks_te_type":
                attr_pairs = [
                    ("other", "Other"),
                ]
                if not replace_attr(attr_pairs):
                    # should be empty
                    pass
            # fix polyketide..pks_domains
            elif error_container_attribute == "pks_domains":
                attr_pairs = [
                    ("AT", "Acyltransferase"),
                    ("DH", "Dehydratase"),
                    ("KR", "Ketoreductase"),
                    ("ACP", "Thiolation (ACP/PCP)"),
                    ("PCP", "Thiolation (ACP/PCP)"),
                    ("T", "Thiolation (ACP/PCP)"),
                    ("CAL", "CoA-ligase"),
                    ("ER", "Enoylreductase"),
                    ("KS", "Ketosynthase"),
                    ("FAAL", "CoA-ligase"),
                    ("CMET", "Methyltransferase"),
                    ("TE/CLC", "Thioesterase"),
                    ("TE", "Thioesterase"),
                    ("SulphurT", "Sulfotransferase"),
                    ("ST", "Sulfotransferase"),
                    ("PT", "Product Template domain"),
                    ("C", "Condensation"),
                    ("A", "Adenlyation"),
                    ("E", "Epimerization"),
                    ("PPTASE", "Phosphopantetheinyl transferase"),
                    ("SAT", "ACP transacylase"),
                    ("TR", "Thiol reductase")
                ]
                if not replace_attr(attr_pairs):
                    #should be empty
                    pass
            # fix mod_pks_genes..kr_stereochem, A->S->D-OH, B->R->L-OH
            elif error_attribute == "kr_stereochem":
                attr_pairs = [
                    ("A-group", "D-OH"),
                    ("B-group", "L-OH")
                ]
                if not replace_attr(attr_pairs):
                    #should be empty
                    pass
            # fix mod_pks_genes..at_substr_spec
            elif error_attribute == "at_substr_spec":
                attr_pairs = [
                    ("malonyl-CoA", "Malonyl-CoA"),
                    ("methylmalonyl-CoA", "Methylmalonyl-CoA"),
                    ("Malonyl-CoA/Malonyl-CoA/Malonyl-CoA", "Malonyl-CoA"),
                    ("Methylmalonyl-CoA/Methylmalonyl-CoA", "Methylmalonyl-CoA"),
                    ("N/A", "None")
                ]
                if not replace_attr(attr_pairs):
                    if error.instance == "Acetyl-CoA/Methylmalonyl-CoA":
                        error_container[error_attribute] = "Multiple (promiscuous)"
                        error_container["at_multiple_spec"] = ["Acetyl-CoA", "Methylmalonyl-CoA"]
                    elif error.instance == "Methylmalonyl-CoA/Malonyl-CoA":
                        error_container[error_attribute] = "Multiple (promiscuous)"
                        error_container["at_multiple_spec"] = ["Malonyl-CoA", "Methylmalonyl-CoA"]
                    elif error.instance == "Acetyl-CoA + Malonyl CoA":
                        error_container[error_attribute] = "Multiple (promiscuous)"
                        error_container["at_multiple_spec"] = ["Malonyl-CoA", "Acetyl-CoA"]
                    else:
                        # what to do? 4-hydroxyphenylpyruvate;Various atypical acyl-CoAs;phenylacetate-like;Decanoyl-CoA
                        # let it be, we'll update the schema instead
                        pass
            # fix mod_pks_genes..evidence_at_spec
            elif error_attribute == "evidence_at_spec":
                attr_pairs = [
                    #("Feeding study", "Other")
                ]
                if not replace_attr(attr_pairs):
                    # let it be, we'll update the schema instead
                    pass
            # fix mod_pks_genes..pks_mod_doms
            elif error_container_attribute == "pks_mod_doms":
                attr_pairs = []
                if not replace_attr(attr_pairs):
                    error_container[error_attribute] = "Other"
                    error_container_parent["pks_other_mod_dom"] = error.instance # we'll update the schema later
                    pass
            # fix nrps_modules..nrps_mod_doms
            elif error_attribute == "nrps_mod_doms":
                attr_pairs = []
                if not replace_attr(attr_pairs):
                    error_container[error_attribute] = "Other"
                    error_container["nrps_other_mod_dom"] = error.instance # we'll update the schema later
                    pass
            # fix nrps_modules..prot_adom_spec
            elif error_attribute == "prot_adom_spec":
                if error.instance == "Asparigine":
                    error_container[error_attribute] = "Asparagine"
            # fix nrp..cdom_subtype
            elif error_attribute == "cdom_subtype":
                if error.instance in ["N/A", "None"]:
                    error_container[error_attribute] = ToDelete()
            # fix nrp..nonprot_adom_spec
            elif error_attribute == "nonprot_adom_spec":
                error_container[error_attribute] = "Other"
                error_container["other_spec"] = error.instance # we'll fix the schema later on
            # fix ripp..ripp_subclass
            elif error_attribute == "ripp_subclass":
                attr_pairs = [
                    ("Lantipeptide", "Lanthipeptide"),
                    ("Head-To-Tail Cyclized Peptide", "Head-to-tailcyclized peptide"),
                    ("Lap", "LAP"),
                    ("Lap / Microcin", "LAP"),
                    ("Lasso Peptide", "Lassopeptide"),
                    ("None", "Unknown") ## this is not available atm, but we'll fix that later
                ]
                if not replace_attr(attr_pairs):
                    # should be empty
                    pass
            # fix ripp..lin_cycl_ripp
            elif error_attribute == "lin_cycl_ripp":
                attr_pairs = [
                    ("linear", "Linear")
                ]
                if not replace_attr(attr_pairs):
                    # should be empty
                    pass
            # fix saccharide..saccharide_subclass
            elif error_attribute == "saccharide_subclass":
                if error.instance == "hyrbid/tailoring":
                    error_container[error_attribute] = "hybrid/tailoring"
            # fix saccharide..gt_specificity
            elif error_attribute == "gt_specificity":
                if error.instance == "None":
                    error_container[error_attribute] = "Unknown"
                else:
                    error_container[error_attribute] = "Other"
                    error_container["other_gt_spec"] = error.instance # we'll fix this later
            # fix saccharide..evidence_gt_spec
            elif error_attribute == "evidence_gt_spec":
                if error.instance == "structure-based inference":
                    error_container[error_attribute] = "Structure-based inference"                
            # fix other.other_subclass
            elif error_attribute == "other_subclass":
                error_container[error_attribute] = "Unknown" # it's either other/none, basically they have no idea what the class is
            ## try to fix "None"/"N/A"/"" --> "Unknown"
            else:
                use_others_instead = [
                    "crosslink_type",
                    "evidence_a_spec",
                    "nrps_evidence_skip_iter",
                    "terpene_subclass",
                    "terpene_c_len",
                    "terpene_precursor",
                    "pk_subclass",
                    "subclass",
                    "pks_evidence_skip_iter"
                ]
                if error_attribute in use_others_instead:
                    # keep them, we'll fix the schema instead
                    pass
                else:
                    attr_to_use = "Unknown"
                    attr_pairs = [
                        ("N/A", attr_to_use),
                        ("None", attr_to_use),
                        ("", attr_to_use)
                    ]
                    if not replace_attr(attr_pairs):
                        # keep them, we'll fix the schema instead
                        pass
                
        # fix requirement errors (needs hand-on approach)        
        elif error.validator == "required":
            missing_keys = set(error.validator_value) - set(error_container[error_attribute].keys())
            # fix loci (only fix BGCs with a single loci. for multi-loci, put them into the "retired" list)
            if "accession" in missing_keys:
                if len(data["general_params"]["loci"]["nucl_acc"]) > 1:
                    mibig_acc = data["general_params"]["mibig_accession"]
                    all_locis = []
                    for loci in data["general_params"]["loci"]["nucl_acc"]:
                        acc_and_loc = loci["accession"]
                        if "start_coord" in loci and "end_coord" in loci:
                            acc_and_loc = "{}({}-{})".format(acc_and_loc, loci["start_coord"], loci["end_coord"])
                        all_locis.append(acc_and_loc)
                    with open("../../preprocessed/reports/p5-multi_loci_bgcs.tsv", "a") as of:
                        of.write("{}\t{}\n".format(bgc_id, "; ".join(all_locis)))
                for nkey in data["general_params"]["loci"]["nucl_acc"][0]:
                    data["general_params"]["loci"][nkey] = data["general_params"]["loci"]["nucl_acc"][0][nkey]
                del data["general_params"]["loci"]["nucl_acc"]
                    
                    
            # fix saccharide subclasses
            if "saccharide" in missing_keys:
                # if PKS+Saccharide, assume it is a tailoring GT
                if "Polyketide" in data["general_params"]["biosyn_class"]:
                    data["general_params"]["saccharide"] = { "saccharide_subclass": "hybrid/tailoring" }
                # if NR+Saccharide, assume it is a tailoring GT
                elif "NRP" in data["general_params"]["biosyn_class"]:
                    data["general_params"]["saccharide"] = { "saccharide_subclass": "hybrid/tailoring" }
            # fix other subclasses
            if "other" in missing_keys:
                data["general_params"]["other"] = {"other_subclass": "Unknown"} # we'll fix that later
            # fix nrp subclasses
            if "nrp" in missing_keys:
                # set minimal = true
                data["general_params"]["minimal"] = True
            # fix polyketide subclasses
            if "polyketide" in missing_keys:
                # set minimal = true
                data["general_params"]["minimal"] = True
            # fix publication
            if "publications" in missing_keys:
                if "accession" in data["general_params"]["loci"]: # otherwise, don't bother
                    ncbi_acc = data["general_params"]["loci"]["accession"]
                    def fix_publications(acc_pub_pairs):
                        for ncbi_accs, pubs in acc_pub_pairs:
                            if ncbi_acc in ncbi_accs:
                                error_container[error_attribute]["publications"] = pubs
                                return True
                        return False
                    acc_pub_pairs = []
                    if not fix_publications(acc_pub_pairs):
                        mibig_acc = data["general_params"]["mibig_accession"]
                        print("Missing publication: {}".format(mibig_acc))
                        with open("../../preprocessed/reports/p5-accession_without_publications.tsv", "a") as acc_list:
                            acc_list.write("{}\t{}\n".format(mibig_acc, data["general_params"]["compounds"][0]["compound"]))
                        # for now, fill with empty publication
                        error_container[error_attribute]["publications"] = ["pubmed:0"]
                    
            # fix compounds.other_chem_act, remove the entry of "Other" from "chem_act"
            if "other_chem_act" in missing_keys:
                if "chem_act" in error_container[error_attribute]:
                    for i, act in enumerate(error_container[error_attribute]["chem_act"]):
                        if act == "Other":
                            error_container[error_attribute]["chem_act"][i] = ToDelete()
                error_container[error_attribute]["other_chem_act"] = ToDelete()
            # fix compounds..other_chem_moiety
            if "other_chem_moiety" in missing_keys:
                error_container[error_attribute] = ToDelete()
            # fix loci.evidence_struct, set to 'Other'
            if "conn_comp_cluster" in missing_keys:
                error_container[error_attribute]["conn_comp_cluster"] = ["Unknown"] # we'll fix this later
            # fix loci.complete, set to 'unknown'
            if "complete" in missing_keys:
                error_container[error_attribute]["complete"] = "unknown"
            # fix genes..tailoring
            if "tailoring" in missing_keys:
                error_container[error_attribute]["tailoring"] = "Unknown"
            # fix genes..gene_function
            if "gene_function" in missing_keys:
                error_container[error_attribute]["gene_function"] = "Unknown"
                if "evidence_genefunction" in error_container[error_attribute]:
                    error_container[error_attribute]["evidence_genefunction"] = ToDelete()
            # fix genes..evidence_genefunction
            if "evidence_genefunction" in missing_keys:
                if "mut_pheno" in error_container[error_attribute]:
                    error_container[error_attribute]["evidence_genefunction"] = ["Knock-out"]
                else:
                    error_container[error_attribute]["evidence_genefunction"] = ["Unknown"]
            # fix genes..evidence_operon
            if "evidence_operon" in missing_keys:
                error_container[error_attribute]["evidence_operon"] = "Unknown" # should be fixed later
            # fix saccharide..evidence_gt_spec
            if "evidence_gt_spec" in missing_keys:
                error_container[error_attribute]["evidence_gt_spec"] = "Unknown" # should be fixed later
            # fix saccharide..gt_gene
            if "gt_gene" in missing_keys:
                error_container_parent[error_container_attribute] = ToDelete()
            # fix ripp..precursor_loci by setting minimal=True (no good solution otherwise)
            if "precursor_loci" in missing_keys:
                data["general_params"]["minimal"] = True
                
        # fix pattern errors (needs hand-on approach)        
        elif error.validator == "pattern":
            # fix pks and nrp module_nr
            if error_attribute == "module_nr":
                # X/x, let's delete it for now (?)
                error_container[error_attribute] = ToDelete()
            elif error_attribute == "accession": # ncbi accession
                accession = error_container[error_attribute]
                attr_pairs = {
                    "    KF899892": "KF899892",
                    "KE145356.1 ": "KE145356.1",
                    "MQUP01000022 ": "MQUP01000022",
                    "MG837518.1 ": "MG837518.1",
                    "MG837519.1 ": "MG837519.1",
                    "MG837520.1 ": "MG837520.1",
                    "MG837521.1 ": "MG837521.1",
                    "MG837522.1 ": "MG837522.1",
                    "MG837524.1 ": "MG837524.1",
                    "NZ_PKFQ01000001 ": "NZ_PKFQ01000001",
                    "MG266907 ": "MG266907",
                    "MG459168 ": "MG459168"
                }
                if accession in attr_pairs:
                    error_container[error_attribute] = attr_pairs[accession]
                else:
                    # mark as missing, fill placeholder string in place
                    with open("../../preprocessed/reports/p5-accession_with_lost_nucl_accs.tsv", "a") as acc_list:
                        acc_list.write("{}\t{}\n".format(data["general_params"]["mibig_accession"], accession))
                    error_container[error_attribute]["accession"] = "MIBIG:{}".format(data["general_params"]["mibig_accession"])
                    pass
    return

In [None]:
def fetch_enum(input_dict, cur_path, result):
    """Collect every ``enum`` declared in a JSON-schema fragment.

    The schema is walked recursively: ``object`` nodes descend into each of
    their ``properties`` (path extended with ``/<name>``), ``array`` nodes
    descend into ``items`` (path extended with ``[]``), anything else is a leaf.

    For each visited node that carries an ``enum`` keyword and whose path is
    not yet a key of ``result``, ``result[path]`` becomes a dict mapping every
    enum value to a fresh empty list.

    ``result`` is modified in place and also returned.
    """
    node_type = input_dict.get("type")

    if node_type == "object":
        key_path = cur_path
        properties = input_dict["properties"]
        for prop_name in properties:
            child_path = "{}/{}".format(key_path, prop_name)
            result = fetch_enum(properties[prop_name], child_path, result)
    elif node_type == "array":
        key_path = "{}[]".format(cur_path)
        result = fetch_enum(input_dict["items"], key_path, result)
    else:
        # string / etc. (or a node without a usable "type")
        key_path = "{}".format(cur_path)

    if "enum" in input_dict and key_path not in result:
        result[key_path] = {enum_val: [] for enum_val in input_dict["enum"]}
    return result

In [None]:
# (Re)create the report of enum values missing from the schema, starting with
# its header row; update_schema_enum appends one line per offending value.
with open("../../preprocessed/reports/p5-unknown_enum_options.tsv", "w") as report_file:
    report_file.write("bgc_id\tpath\tadded_enum\tdecision\n")
    
def update_schema_enum(bgc_id, schema_obj, error):
    """Decide what to do with a value that is missing from a schema enum.

    Only validation errors raised by the ``enum`` keyword are handled. The enum
    list is located inside ``schema_obj`` by following ``error.schema_path``.
    When ``error.instance`` is not yet part of that list, a line
    ``bgc_id, schema path, value, decision`` is appended to
    ``p5-unknown_enum_options.tsv`` and:

    * ``"None"`` / ``"Unknown"`` are approved: the value is appended to the
      enum list inside ``schema_obj`` (in place) and ``True`` is returned;
    * every other value is marked ``retire`` and ``False`` is returned.

    ``None`` is returned when the error is not an enum error or the value is
    already allowed.

    Parameters:
        bgc_id: identifier written to the first column of the report.
        schema_obj: the (mutable) schema the error was produced from.
        error: object exposing ``validator``, ``schema_path`` and ``instance``.
    """
    if error.validator != "enum":
        return None

    # Walk a snapshot of the path: popping from error.schema_path itself would
    # empty the deque owned by the error object.
    path_parts = list(error.schema_path)
    # Path elements can be integers (indices into e.g. "allOf"), so they are
    # stringified before joining.
    schema_path_string = ".".join(str(part) for part in path_parts)

    enum_container = schema_obj
    for part in path_parts:
        enum_container = enum_container[part]

    # update the schema enum
    if error.instance in enum_container:
        return None

    val = error.instance
    decision = "approve" if val in ["None", "Unknown"] else "retire"

    with open("../../preprocessed/reports/p5-unknown_enum_options.tsv", "a") as o:
        o.write("{}\t{}\t{}\t{}\n".format(bgc_id, schema_path_string, val, decision))

    if decision == "approve":
        enum_container.append(val)
        return True
    return False

In [None]:
# Property paths at which an "Other"-like value was encountered; filled in by
# check_and_remove_other.
data_contain_others = list()
# Truncate (or create) the matching report so it starts out empty.
with open("../../preprocessed/reports/p5-enum_contain_others.tsv", "w") as report_file:
    report_file.write("")
def check_and_remove_other(keys, input_dict, attribute, parent, grandparent):
    """Recursively replace "Other"-style placeholder values inside a JSON-like structure.

    Parameters:
        keys: slash-separated property path of ``input_dict`` (list elements
            share the path of their list).
        input_dict: the node being visited (dict, list or scalar).
        attribute: key or index under which ``input_dict`` is stored in ``parent``.
        parent: the container holding ``input_dict``.
        grandparent: the container that was ``parent`` one recursion level up.

    Scalars are compared via their lower-cased, whitespace-stripped string form:

    * ``other`` / ``others``: the path is recorded once in the module-level
      ``data_contain_others`` list; for a fixed set of paths the value is
      replaced by the matching free-text companion field (default
      ``"Unknown"``), three paths are left alone, everything else becomes
      ``"Unknown"``;
    * ``n/a`` and ``unknown`` become ``"Unknown"``;
    * ``none`` becomes ``"None"``.

    After the children of a dict were visited, the free-text companion keys
    are removed from that dict. Everything is modified in place; nothing is
    returned.
    """
    if isinstance(input_dict, list):
        for index, element in enumerate(input_dict):
            check_and_remove_other(keys, element, index, input_dict, parent)
        return

    if isinstance(input_dict, dict):
        for child_key in input_dict:
            child_path = "{}/{}".format(keys, child_key)
            check_and_remove_other(child_path, input_dict[child_key], child_key, input_dict, parent)
        # the companions are dropped only after the children could read them
        companions = ("nrps_other_mod_dom", "other_chem_act", "other_chem_moiety",
                      "other_gt_spec", "pks_other_mod_dom", "other_spec")
        for companion in companions:
            input_dict.pop(companion, None)
        return

    normalized = str(input_dict).lower().strip()

    if normalized in ["other", "others"]:
        if keys not in data_contain_others:
            data_contain_others.append(keys)

        # paths whose replacement text sits next to the value (same container)
        companion_in_parent = {
            "/general_params/nrp/nrps_genes/nrps_module/nrps_mod_doms": "nrps_other_mod_dom",
            "/general_params/compounds/chem_moieties/chem_moiety": "other_chem_moiety",
            "/general_params/saccharide/gt_genes/gt_specificity": "other_gt_spec",
            "/general_params/nrp/nrps_genes/nrps_module/a_substr_spec/nonprot_adom_spec": "other_spec",
        }
        # paths whose replacement text sits one container further up
        companion_in_grandparent = {
            "/general_params/compounds/chem_act": "other_chem_act",
            "/general_params/polyketide/mod_pks_genes/pks_module/pks_mod_doms": "pks_other_mod_dom",
        }
        # paths where the placeholder is kept as it is
        left_untouched = [
            "/general_params/polyketide/pks_subclass",
            "/general_params/polyketide/pk_subclass",
            "/general_params/biosyn_class",
        ]

        if keys in companion_in_parent:
            parent[attribute] = parent.get(companion_in_parent[keys], "Unknown")
        elif keys in companion_in_grandparent:
            parent[attribute] = grandparent.get(companion_in_grandparent[keys], "Unknown")
        elif keys in left_untouched:
            pass
        else:
            parent[attribute] = "Unknown"
    elif normalized in ["n/a", "unknown"]:
        parent[attribute] = "Unknown"
    elif normalized == "none":
        parent[attribute] = "None"

In [None]:
def remove_other_from_schema(schema):
    """Remove the free-text "other" properties and the "Other" enum options from ``schema``.

    The schema is modified in place. The function is a no-op when the
    ``other_spec`` property of ``a_substr_spec`` is already gone, which is
    used as the marker that the clean-up has been applied before.

    Steps:

    1. delete the companion properties ``nrps_other_mod_dom``,
       ``other_chem_act``, ``other_chem_moiety``, ``other_gt_spec``,
       ``pks_other_mod_dom`` and ``other_spec`` together with the ``allOf``
       entries deleted alongside them;
    2. walk the remaining schema and, for every ``enum`` list, drop the
       ``"Other"`` option and rename ``"unknown"`` to ``"Unknown"``.
       Sub-trees stored under ``allOf``, ``pks_subclass``, ``pk_subclass``
       and ``biosyn_class`` are not touched.

    Raises:
        KeyError: if one of the expected schema locations is missing.
    """
    general_props = schema["properties"]["general_params"]["properties"]
    nrps_module_props = general_props["nrp"]["properties"]["nrps_genes"]["items"]["properties"]["nrps_module"]["items"]["properties"]
    if "other_spec" not in nrps_module_props["a_substr_spec"]["properties"]:
        return # is done

    # ["nrps_other_mod_dom", "other_chem_act", "other_chem_moiety", "other_gt_spec", "pks_other_mod_dom", "other_spec"]
    del nrps_module_props["nrps_other_mod_dom"]
    compound_items = general_props["compounds"]["items"]
    del compound_items["properties"]["other_chem_act"]
    del compound_items["allOf"]
    chem_moiety_items = compound_items["properties"]["chem_moieties"]["items"]
    del chem_moiety_items["properties"]["other_chem_moiety"]
    del chem_moiety_items["allOf"]
    gt_gene_items = general_props["saccharide"]["properties"]["gt_genes"]["items"]
    del gt_gene_items["properties"]["other_gt_spec"]
    del gt_gene_items["allOf"][0]
    del general_props["polyketide"]["properties"]["mod_pks_genes"]["items"]["properties"]["pks_module"]["items"]["properties"]["pks_other_mod_dom"]
    del nrps_module_props["a_substr_spec"]["properties"]["other_spec"]

    def remove_other_from_enum(node):
        """Rewrite every ``enum`` list below ``node`` (dicts only are descended into)."""
        if not isinstance(node, dict):
            return
        for key in node:
            if key == "allOf":
                continue
            if key in ["pks_subclass", "pk_subclass", "biosyn_class"]:
                continue
            if key == "enum" and isinstance(node[key], list):
                options = node[key]
                if "Other" in options or "unknown" in options:
                    new_enums = []
                    for option in options:
                        if option == "Other":
                            continue
                        if option == "unknown":
                            option = "Unknown"
                        # renaming must not introduce a duplicate "Unknown"
                        if option not in new_enums:
                            new_enums.append(option)
                    # assignment, not comparison: the cleaned list has to be
                    # stored back (re-binding an existing key is safe while iterating)
                    node[key] = new_enums
            else:
                remove_other_from_enum(node[key])

    remove_other_from_enum(schema)

In [None]:
# Resume marker for the phase-5 driver cell: entries whose numeric BGC id is
# lower than this value are skipped there, and the driver stores the id of the
# entry it stopped on. Kept in its own cell so re-running the driver does not reset it.
last_error = 0

In [None]:
# Phase-5 driver: validate each BGC JSON against the draft-7 schema, repair the
# data, adjust the schema, and write both back to disk.
# NOTE(review): entries are read from and written back to the same folder
# ("../../preprocessed/p5-json/"), so the folder has to be populated beforehand.
if not path.exists("../../preprocessed/p5-json/"):
    makedirs("../../preprocessed/p5-json/")
    
with open("../../preprocessed/p4-mibig_schema_draft7.json") as json_file:
    schema_obj = json.load(json_file)
    validator = Draft7Validator(schema_obj)
    errors = {}  # NOTE(review): not read anywhere in this cell
    for json_path in sorted(fetch_mibig_json_filepaths("../../preprocessed/p5-json/")):
        bgc_id = path.basename(json_path)
        # numeric part of the file name: drop the "BGC" prefix and the ".json" suffix
        id_int = int(bgc_id[3:-5])
        # resume support: skip everything below the entry a previous run stopped on
        if (id_int < last_error) and True:
            continue
        # NOTE: re-uses the name `json_file` of the (already consumed) schema handle
        with open(json_path, "r") as json_file:
            data = json.load(json_file)
            error_counts_before = 0
            error_counts_after = 0
            # helpers and `pubs_and_new_accs` are defined outside this cell
            update_pub_and_accs(data, pubs_and_new_accs)
            update_ncbi_accs(data)
            # pass 1: let fix_data_new_schema handle every reported error (in a stable order)
            for error in sorted(validator.iter_errors(data), key=str):
                fix_data_new_schema(data, error)
                error_counts_before += 1
            lazily_deletes(data)
            error_counts = 0  # NOTE(review): not read anywhere in this cell
            # replace "Other"-like values in the data, then strip them from the schema
            check_and_remove_other("", data, None, None, None)
            remove_other_from_schema(schema_obj)
            # the validator is rebuilt after each in-place change of schema_obj
            validator = Draft7Validator(schema_obj)
            for error in sorted(validator.iter_errors(data), key=str):
                # fix remaining enum errors by updating the schema
                update_schema_enum(bgc_id, schema_obj, error)
            validator = Draft7Validator(schema_obj)
            # pass 3: count what is left; enum errors are deliberately not counted
            for error in sorted(validator.iter_errors(data), key=str):
                if error.validator != "enum":
                    error_counts_after += 1
            print("Validated and fixed {}... Before {} error(s), After: {} error(s)".format(bgc_id, error_counts_before, error_counts_after))
            # the (possibly still invalid) entry is written back before the error check below
            with open(path.join("../../preprocessed/p5-json/", bgc_id), "w") as jo:
                json.dump(data, jo, indent=4, separators=(',', ': '))
            if error_counts_after > 0:
                # remember where we stopped, then abort via sys.exit (imported as `exit`)
                last_error = id_int
                exit(1)
    
    # only reached when every entry validated: persist the adjusted schema
    with open("../../preprocessed/p5-mibig_schema_draft7.json", "w") as o:
        o.write(json.dumps(schema_obj, indent=4, separators=(',', ': ')))
        
    print("All data validated!")

In [None]:
# Dump every property path at which an "Other"-like value was found, one per line.
with open("../../preprocessed/reports/p5-enum_contain_others.tsv", "w") as o:
    # The loop variable must not be called `path`: a module-level `for` target
    # stays bound after the loop and would replace the `os.path` module that
    # this notebook imports as `path`, breaking any cell that is (re-)run later.
    for others_key_path in data_contain_others:
        o.write("{}\n".format(others_key_path))

In [None]:
# merge all retired BGCs into one tsv
# retired_bgcs maps a BGC id to the set of reasons for which it is retired.
retired_bgcs = {}
with open("../../preprocessed/reports/p5-accession_without_publications.tsv", "r") as o:
    for line in o:
        bgc_id = line.split("\t")[0]
        retired_bgcs.setdefault(bgc_id, set()).add("no_publication")
with open("../../preprocessed/reports/p5-multi_loci_bgcs.tsv", "r") as o:
    for line in o:
        # first column holds a file name; keep only the part before the first "."
        bgc_id = line.split("\t")[0].split(".")[0]
        retired_bgcs.setdefault(bgc_id, set()).add("multi_loci")
with open("../../preprocessed/reports/p5-unknown_enum_options.tsv", "r") as o:
    for line in o:
        # The second column is deliberately NOT unpacked into a variable named
        # `path`: that would rebind the `os.path` module imported as `path`.
        # The header row is harmless here, its "decision" column never equals "retire".
        bgc_id, enum_schema_path, added_enum, decision = line.rstrip().split("\t")
        if decision == "retire":
            bgc_id = bgc_id.split(".")[0]
            retired_bgcs.setdefault(bgc_id, set()).add("enum")

with open("../../preprocessed/reports/p5-retired_list.tsv", "w") as o:
    for bgc_id in retired_bgcs:
        # reasons are sorted so the report is identical from run to run
        # (iteration order of a set of strings is not stable across interpreter runs)
        o.write("{}\t{}\n".format(bgc_id, ";".join(sorted(retired_bgcs[bgc_id]))))