In [1]:
# phase 0: compare old schema vs old data, output the summary in an excel file* to help decision making
# phase 1: transform old schema to match (correct version) of old data
# phase 2: compare phase 1 schema vs old data, output the summary in an excel file* to help decision making
# phase 3: transform old data to match schema from phase 1 (doesn't count dependencies/required keywords)
# phase 4: transform schema from phase 1 to match JSON Schema draft v7 (we will call it 'new schema')
# phase 5: transform data from phase 3 to match the new schema (include all dependencies/required keywords)

In [2]:
## common imports ##
from os import path
import glob
import json

In [3]:
## common functions ##
def fetch_mibig_json_filepaths(dir_path):
    """Return the paths of the "BGC*.json" files located directly in dir_path.

    The paths come back in whatever order glob.glob yields them, so callers
    that need a stable order have to sort the result themselves.
    """
    bgc_pattern = path.join(dir_path, "BGC*.json")
    return glob.glob(bgc_pattern)

def count_props(input_dict, cur_path, result):
    """Walk a parsed JSON value and tally every property path found in it.

    Dict keys extend the path with "/<key>". A list extends the path with
    "[]", and each of its elements is tallied under that same "[]" path.
    Every visited node adds one to the tally of its own path after its
    children have been visited. `result` is updated in place and returned.
    """
    if isinstance(input_dict, dict):
        node_path = cur_path
        for name, value in input_dict.items():
            count_props(value, "{}/{}".format(node_path, name), result)
    elif isinstance(input_dict, list):
        node_path = "{}[]".format(cur_path)
        for element in input_dict:
            count_props(element, node_path, result)
    else:
        # scalar leaf: counted under the path it was reached by
        node_path = cur_path

    result[node_path] = result.get(node_path, 0) + 1
    return result


def fetch_props_old_schema(input_dict, cur_path = "", result = None):
    """Map every property path of an old-style schema to its `required` flag.

    Paths are built the same way as for the data: "/<key>" for each entry of
    an object's "properties" and a "[]" suffix for an array. The value stored
    for a path is True only when the schema node carries `"required": True`.
    For an array, the flag stored under "<path>[]" is the one found on its
    "items" schema; the array's own flag is used only when it has no "items".

    Args:
        input_dict: the schema node (a dict) to inspect.
        cur_path: path of this node; "" for the schema root.
        result: dict to fill in. A fresh dict is created when omitted, so
            separate calls never share state.

    Returns:
        The `result` dict, mapping path -> bool.
    """
    # A `{}` default would be created once and shared by every call, leaking
    # the paths of one schema into the result for the next one.
    if result is None:
        result = {}

    key_path = cur_path
    node_type = input_dict.get("type")
    if node_type == "object":
        # an object without "properties" simply has no children to visit
        for key, child in input_dict.get("properties", {}).items():
            fetch_props_old_schema(child, "{}/{}".format(key_path, key), result)
    elif node_type == "array":
        key_path = "{}[]".format(cur_path)
        if "items" in input_dict:
            fetch_props_old_schema(input_dict["items"], key_path, result)
    # any other (or missing) type is a leaf and keeps cur_path as its key

    if key_path not in result:
        result[key_path] = "required" in input_dict and input_dict["required"] == True
    return result

In [4]:
###### phase 0: compare old schema vs old data, output the summary in an excel file* to help decision making ########

In [5]:
# For every property path seen in the 1.4 data, collect the names of the
# files in which that path occurs.
all_props = {}
for json_path in sorted(fetch_mibig_json_filepaths("../../inputs/json_1.4/")):
    with open(json_path, "r") as json_file:
        json_obj = json.load(json_file)
        this_file_props = count_props(json_obj, "", {})
    file_name = path.basename(json_path)
    for prop in this_file_props:
        all_props.setdefault(prop, []).append(file_name)

In [6]:
# Phase 0 report: dump the property paths of the old schema, then list every
# property path found in the data that the old schema does not declare,
# ordered by the number of files using it (largest first).
with open("../../inputs/mibig_schema.json") as json_file:
    json_obj = json.load(json_file)
    schema_props = fetch_props_old_schema(json_obj)
    with open("../../preprocessed/old_schema_properties.csv", "w") as o:
        o.writelines("{},{}\n".format(key, schema_props[key]) for key in sorted(schema_props))
        print("File written: {}".format(o.name))
    with open("../../preprocessed/old_data_vs_old_schema.csv", "w") as o:
        not_in_schema = [
            (key, all_props[key])
            for key in sorted(all_props)
            if key not in schema_props
        ]
        by_file_count = sorted(not_in_schema, key=lambda rep: len(rep[1]), reverse=True)
        for prop_path, file_names in by_file_count:
            o.write("{},{}\n".format(prop_path, len(file_names)))
        print("File written: {}".format(o.name))

File written: ../../preprocessed/old_schema_properties.csv
File written: ../../preprocessed/old_data_vs_old_schema.csv


In [7]:
###### phase 1: transform old schema to match (correct version) of old data ######

In [8]:
# Phase 1 has no code: the schema was edited by hand.
#   - all comma-separated properties were changed into arrays
#   - gene_pubs: integer --> gene_pubs: array
# The print below only records where the hand-edited schema is stored.
print("File written: {}".format("../../preprocessed/mibig_schema_phase_1.json"))

File written: ../../preprocessed/mibig_schema_phase_1.json


In [9]:
##### phase 2: compare phase 1 schema vs old data, output the summary in an excel file* to help decision making ####

In [10]:
# Phase 2 report, reusing all_props gathered in phase 0. Three files are written:
#   1. the property paths of the phase 1 schema,
#   2. the data property paths missing from that schema, with file counts,
#   3. per data file, the ';'-joined list of its property paths that are missing.
with open("../../preprocessed/mibig_schema_phase_1.json") as json_file:
    json_obj = json.load(json_file)
    schema_props = fetch_props_old_schema(json_obj)
    with open("../../preprocessed/schema_phase_1_properties.csv", "w") as o:
        o.writelines("{},{}\n".format(key, schema_props[key]) for key in sorted(schema_props))
        print("File written: {}".format(o.name))
    not_in_schema = [
        (key, all_props[key])
        for key in sorted(all_props)
        if key not in schema_props
    ]
    with open("../../preprocessed/old_data_vs_schema_phase_1.csv", "w") as o:
        by_file_count = sorted(not_in_schema, key=lambda rep: len(rep[1]), reverse=True)
        for prop_path, file_names in by_file_count:
            o.write("{},{}\n".format(prop_path, len(file_names)))
        print("File written: {}".format(o.name))
    with open("../../preprocessed/bgc_to_fix_phase_2.csv", "w") as o:
        bgc_to_fix = {}
        for prop_path, file_names in not_in_schema:
            for bgc in file_names:
                bgc_to_fix.setdefault(bgc, []).append(prop_path)
        for bgc, prop_paths in bgc_to_fix.items():
            o.write("{},{}\n".format(bgc, ";".join(prop_paths)))
        print("File written: {}".format(o.name))

File written: ../../preprocessed/schema_phase_1_properties.csv
File written: ../../preprocessed/old_data_vs_schema_phase_1.csv
File written: ../../preprocessed/bgc_to_fix_phase_2.csv


In [11]:
#### phase 3: transform old data to match schema from phase 1 (doesn't count dependencies/required keywords) ####
# The data files were copied and mostly fixed by hand. The one automated step
# replaces /general_params/Other/biosyn_class[] with
# /general_params/Other/other_subclass, keeping only the first list element.
for json_path in sorted(fetch_mibig_json_filepaths("../../preprocessed/json_1.4_phase_3/")):
    json_obj = None
    with open(json_path, "r") as json_file:
        json_obj = json.load(json_file)
    general_params = json_obj["general_params"]
    if "Other" not in general_params:
        continue
    other_params = general_params["Other"]
    if "biosyn_class" not in other_params:
        continue
    clas = other_params["biosyn_class"][0]
    del other_params["biosyn_class"]
    other_params["other_subclass"] = clas
    # the file is rewritten in place, so a converted file is skipped on later runs
    with open(json_path, "w") as json_file:
        json.dump(json_obj, json_file)
        print("Wrote {}".format(json_path)) # will only be called once ever

In [12]:
# Verification, step 1: gather the property paths of the phase 3 data files,
# each mapped to the names of the files that contain it.
all_props_phase_3 = {}
for json_path in sorted(fetch_mibig_json_filepaths("../../preprocessed/json_1.4_phase_3/")):
    with open(json_path, "r") as json_file:
        json_obj = json.load(json_file)
        this_file_props = count_props(json_obj, "", {})
    file_name = path.basename(json_path)
    for prop in this_file_props:
        all_props_phase_3.setdefault(prop, []).append(file_name)

In [13]:
# Verification, step 2: count the phase 3 data property paths that the
# phase 1 schema does not declare.
with open("../../preprocessed/mibig_schema_phase_1.json") as json_file:
    json_obj = json.load(json_file)
    schema_props = fetch_props_old_schema(json_obj)
    not_in_schema = [
        (key, all_props_phase_3[key])
        for key in sorted(all_props_phase_3)
        if key not in schema_props
    ]
    print("Number of conflicts: {}".format(len(not_in_schema)))

Number of conflicts: 0


In [14]:
###### phase 4: transform schema from phase 1 to match JSON Schema draft v7 (we will call it 'new schema') ######

In [94]:
# The new schema starts out as a copy of the phase 1 schema loaded from disk;
# the following cells then modify this object in place.
new_schema = None
with open("../../preprocessed/mibig_schema_phase_1.json") as phase_1_file:
    new_schema = json.load(phase_1_file)

In [95]:
# 1: fix 'required'
def fix_required(input_dict):
    """Turn per-property `"required": True` flags into the parent's `required` list.

    The schema is modified in place and nothing is returned:
      * every object node gets a `required` list naming the properties whose
        schema carries `"required": True` (no key at all when the list is empty);
      * the `required` key is removed from every array and leaf node;
      * array nodes are followed through their "items" schema.

    Running the function again on an already converted schema leaves it
    unchanged: an object's existing `required` list is kept and only extended,
    never rebuilt from the (by then removed) boolean flags.

    Args:
        input_dict: the schema node to convert; non-dict values are ignored.
    """
    if not isinstance(input_dict, dict):
        return

    node_type = input_dict.get("type")
    if node_type == "object":
        current = input_dict.get("required")
        # keep a list from an earlier run; a boolean flag belongs to the parent
        required = list(current) if isinstance(current, list) else []
        # an object without "properties" has nothing to collect
        for prop, child in input_dict.get("properties", {}).items():
            # read the flag before recursing: the recursion removes it
            if isinstance(child, dict) and child.get("required") == True and prop not in required:
                required.append(prop)
            fix_required(child)
        if required:
            input_dict["required"] = required
        else:
            input_dict.pop("required", None)
    else:
        if node_type == "array" and "items" in input_dict:
            fix_required(input_dict["items"])
        input_dict.pop("required", None)
# apply the conversion to new_schema (the dict is modified in place)
fix_required(new_schema)

In [96]:
# 2: fix 'dependencies'
def fix_dependencies(input_dict):
    """Turn per-property string `dependencies` into the parent's `dependencies` map.

    The schema is modified in place and nothing is returned. For every object
    node, each property whose schema has `"dependencies": "<other>"` is added
    to the list stored under "<other>" in the object's own `dependencies`
    dict, provided "<other>" is one of the object's properties; otherwise an
    error line is printed and the entry is dropped. The `dependencies` key is
    removed from every array and leaf node, and from objects that end up with
    no entries. Array nodes are followed through their "items" schema.

    Running the function again on an already converted schema leaves it
    unchanged: an object's existing `dependencies` dict is kept and only
    extended, never rebuilt from the (by then removed) string values.

    NOTE(review): JSON Schema draft-07 reads `dependencies: {"a": ["b"]}` as
    "if a is present, b must be present". The map built here lists each
    property under the name it pointed to -- confirm that this direction is
    the intended one.

    Args:
        input_dict: the schema node to convert; non-dict values are ignored.
    """
    if not isinstance(input_dict, dict):
        return

    node_type = input_dict.get("type")
    if node_type == "object":
        # an object without "properties" has nothing to collect
        properties = input_dict.get("properties", {})
        current = input_dict.get("dependencies")
        # keep a dict from an earlier run; a string value belongs to the parent
        dependencies = dict(current) if isinstance(current, dict) else {}
        for prop, child in properties.items():
            # read the value before recursing: the recursion replaces/removes it
            target = child.get("dependencies") if isinstance(child, dict) else None
            if isinstance(target, str):
                if target in properties:
                    dependents = dependencies.setdefault(target, [])
                    # a pre-existing non-list entry (schema dependency) is left alone
                    if isinstance(dependents, list) and prop not in dependents:
                        dependents.append(prop)
                else:
                    print("Error: {} not found".format(target))
            fix_dependencies(child)
        if dependencies:
            input_dict["dependencies"] = dependencies
        else:
            input_dict.pop("dependencies", None)
    else:
        if node_type == "array" and "items" in input_dict:
            fix_dependencies(input_dict["items"])
        input_dict.pop("dependencies", None)
# apply the conversion to new_schema (the dict is modified in place)
fix_dependencies(new_schema)

In [97]:
# 3: make sure 'enum' contain unique items
def fix_enum(input_dict):
    """Remove duplicate entries from every `enum` list in the schema, in place.

    Object nodes are followed through their "properties" and array nodes
    through their "items". Within each `enum` the first occurrence of a value
    is kept and the original order is preserved, so the result is the same on
    every run. Values are compared with `==`, which also allows unhashable
    members such as lists or dicts.

    Args:
        input_dict: the schema node to clean; non-dict values are ignored.
    """
    if not isinstance(input_dict, dict):
        return

    node_type = input_dict.get("type")
    if node_type == "object":
        # an object without "properties" has no children to visit
        for child in input_dict.get("properties", {}).values():
            fix_enum(child)
    elif node_type == "array" and "items" in input_dict:
        fix_enum(input_dict["items"])

    if "enum" in input_dict:
        unique_values = []
        for value in input_dict["enum"]:
            if value not in unique_values:
                unique_values.append(value)
        input_dict["enum"] = unique_values
# apply the clean-up to new_schema (the dict is modified in place)
fix_enum(new_schema)

In [98]:
# manual curations: stamp the schema with its draft URI, version and creation date
new_schema.update({
    "$schema": "http://json-schema.org/draft-07/schema#",
    "$schema_version": "2.0",
    "$schema_created": "20-03-2019",
})

# Disabled draft (not executed): replace the generated general_params
# dependencies with an allOf rule keyed on biosyn_class.
#del new_schema["properties"]["general_params"]["dependencies"]
#new_schema["properties"]["general_params"]["allOf"] = [
#    {
#        "properties": {
#            "biosyn_class": {"enum": ["Polyketide"] }
#        },
#        "required": ["Polyketide"]
#    }
#]

In [99]:
# save new schema as pretty-printed JSON (4-space indent)
with open("../../outputs/mibig_schema_draft7.json", "w") as o:
    serialized_schema = json.dumps(new_schema, indent=4, separators=(',', ': '))
    o.write(serialized_schema)