In [None]:
import glob
import json
import re
import shutil
import sys
from os import path, makedirs

from jsonschema import validate, Draft7Validator

In [None]:
# Accumulators filled by this cell and the cells that follow.
reasons = set()        # every retirement reason seen in any retired list
bgcs = {}              # bgc_id -> set of retirement reasons for that entry
todos = {}             # filled when the todo list is read
todos_reasons = set()  # filled when the todo list is read

# Each retired list is a TSV: column 0 is the BGC id, column 1 a
# ';'-separated list of reasons. Reasons accumulate across phases.
for phase in (5, 6, 7):
    retired_list_path = "../../preprocessed/reports/p{}-retired_list.tsv".format(phase)
    with open(retired_list_path, "r") as retired_list:
        for row in retired_list:
            fields = row.strip().split("\t")
            entry_reasons = bgcs.setdefault(fields[0], set())
            row_reasons = fields[1].split(";")
            entry_reasons.update(row_reasons)
            reasons.update(row_reasons)

In [None]:
# Apply the "fixed" lists: every problem listed as fixed is removed from the
# retirement reasons collected so far for that BGC.
# Each row is a TSV: column 0 is the BGC id, column 1 a ';'-separated list of
# fixed problems.
for phase in [7]:
    with open("../../preprocessed/reports/p{}-fixed_list.tsv".format(phase), "r") as pl:
        for line in pl:
            cols = line.strip().split("\t")
            bgc_id = cols[0]
            fixed_problems = cols[1].split(";")
            for fixed_problem in fixed_problems:
                try:
                    bgcs[bgc_id].remove(fixed_problem)
                except KeyError:
                    # Raised when the BGC has no entry in `bgcs`, or when the
                    # problem is not among its recorded reasons. Report the id
                    # and carry on; any other exception is a real error and
                    # must not be swallowed.
                    print(bgc_id)

In [None]:
# Read the todo lists. Each row is a TSV: column 0 is the BGC id, column 1 the
# todo reason, column 2 a ';'-separated list of ids. Only the number of ids is
# kept, stored as todos[bgc_id][reason].
for phase in (7,):
    todo_list_path = "../../preprocessed/reports/p{}-todo_list.tsv".format(phase)
    with open(todo_list_path, "r") as todo_list:
        for row in todo_list:
            fields = row.strip().split("\t")
            bgc_id = fields[0]
            reason = fields[1]
            affected_count = len(fields[2].split(";"))
            todos.setdefault(bgc_id, {})[reason] = affected_count
            todos_reasons.add(reason)

In [None]:
def append_addprop(input_dict):
    """Recursively add ``"additionalProperties": False`` to a nested structure.

    Every dict that has a ``"properties"`` key but no ``"additionalProperties"``
    key gets ``"additionalProperties": False``. The structure is modified in
    place. Dict values and list elements are visited recursively, except the
    value stored under an ``"allOf"`` key, which is skipped entirely.

    Args:
        input_dict: A dict, a list, or any other value (other values are
            ignored).

    Returns:
        None.
    """
    if isinstance(input_dict, list):
        for element in input_dict:
            append_addprop(element)
        return
    if not isinstance(input_dict, dict):
        return

    if "properties" in input_dict:
        # setdefault leaves an explicit additionalProperties value untouched.
        input_dict.setdefault("additionalProperties", False)
    for name, child in input_dict.items():
        if name != "allOf":
            append_addprop(child)

def add_rules_additionalProperties(schema):
    """Add ``additionalProperties: False`` rules throughout ``schema``.

    Thin wrapper around ``append_addprop``; ``schema`` is modified in place
    and nothing is returned.
    """
    append_addprop(schema)

In [None]:
final_schema = "../../inputs/mibig_schema_phase_6.json"
output_schema = "../../outputs/mibig_2.0_schema.json"
schema_obj = None
validator = None

# Load the schema, add the additionalProperties rules to it, build the
# Draft 7 validator used by later cells, and write the modified schema out.
with open(final_schema, "r") as schema_in:
    schema_obj = json.load(schema_in)
    add_rules_additionalProperties(schema_obj)
    validator = Draft7Validator(schema_obj)
    schema_text = json.dumps(schema_obj, indent=4, separators=(',', ': '), sort_keys=True)
    with open(output_schema, "w") as schema_out:
        schema_out.write(schema_text)

In [None]:
def check_and_remove_unknowns(input_dict, path, schema):
    """Drop "unknown"/"none" placeholder strings that the schema has no enum for.

    A placeholder is a string equal to "unknown" or "none", compared
    case-insensitively.

    * For a dict, each key's schema entry is found by walking ``schema`` along
      ``path + ["properties", key]``. A placeholder value is dropped together
      with its key unless that entry contains an ``"enum"``. All other values
      are processed recursively. A new dict is returned.
    * For a list, the schema entry at ``path + ["items"]`` is consulted. If it
      has no ``"enum"``, a new list without placeholder strings is returned;
      otherwise the list is returned unchanged. List elements are not
      descended into, so dicts nested inside a list are left as they are.
    * Any other value is returned unchanged.

    Args:
        input_dict: The value to clean (dict, list, or scalar).
        path: List of schema keys leading to the schema entry for
            ``input_dict``; ``[]`` for the document root.
        schema: The full schema dict; every walk starts from its root.

    Returns:
        The cleaned value. The input itself is not modified.

    Raises:
        KeyError: If a walked path does not exist in ``schema``.
        AssertionError: If a walked path ends on something that is not a dict.
    """
    # don't need to discard others, it was already handled in phase_5
    placeholders = ("unknown", "none")

    def is_placeholder(value):
        return isinstance(value, str) and value.lower() in placeholders

    def schema_entry_at(steps):
        entry = schema
        for step in steps:
            entry = entry[step]
        assert isinstance(entry, dict)
        return entry

    if isinstance(input_dict, dict):
        kept = {}
        for key, val in input_dict.items():
            path_to_key = path + ["properties", key]
            # The schema is only walked for placeholder values.
            if is_placeholder(val) and "enum" not in schema_entry_at(path_to_key):
                continue  # discard this attribute
            kept[key] = check_and_remove_unknowns(val, path_to_key, schema)
        return kept

    if isinstance(input_dict, list):
        if "enum" not in schema_entry_at(path + ["items"]):
            return [val for val in input_dict if not is_placeholder(val)]

    return input_dict

In [None]:
def check_and_remove_empty_values(input_dict):
    """Recursively strip empty values from a nested structure.

    Empty strings, ``None`` values, and dicts or lists that are empty (or
    become empty once their contents are stripped) are removed. Other falsy
    values such as ``0`` and ``False`` are kept.

    Args:
        input_dict: A dict, list, string, or any other value.

    Returns:
        A new dict or list with the empty members removed, the unchanged input
        for non-empty strings and other values, or ``None`` when the input
        itself is empty after cleaning.
    """
    if isinstance(input_dict, dict):
        pruned = {}
        for key, raw in input_dict.items():
            val = check_and_remove_empty_values(raw)
            # Identity test: only a true None means "removed"; 0 / False stay.
            if val is not None:
                pruned[key] = val
        return pruned or None

    if isinstance(input_dict, list):
        cleaned = (check_and_remove_empty_values(raw) for raw in input_dict)
        pruned = [val for val in cleaned if val is not None]
        return pruned or None

    if isinstance(input_dict, str) and not input_dict:  # ""
        return None

    return input_dict

In [None]:
def clean_data(data, schema):
    """Clean one entry before validation.

    Two passes are applied in order:

    1. ``check_and_remove_unknowns`` removes "Unknown"/"None" values that are
       not specified in the schema's enum.
    2. ``check_and_remove_empty_values`` removes empty arrays/dicts/values.
       If this causes a validation error, the data needs to be fixed first at
       prior phases.

    Args:
        data: The parsed JSON entry.
        schema: The schema dict, passed to the first pass.

    Returns:
        The cleaned data as returned by the second pass.
    """
    without_unknowns = check_and_remove_unknowns(data, [], schema)
    return check_and_remove_empty_values(without_unknowns)

In [None]:
def validate_data(data, validator):
    """Validate ``data`` and abort the run if any validation error is found.

    All errors reported by ``validator.iter_errors(data)`` are printed, sorted
    by their string form, and then ``SystemExit`` is raised with status 1 so
    that a failed validation is not reported as a success.

    Args:
        data: The value to validate.
        validator: An object exposing ``iter_errors(data)`` that yields errors
            with a ``message`` attribute (e.g. a jsonschema validator).

    Returns:
        None when ``data`` has no validation errors.

    Raises:
        SystemExit: With code 1 when at least one validation error exists.
    """
    errors = sorted(validator.iter_errors(data), key=str)
    if not errors:
        return
    for error in errors:
        print(error.message)
    sys.exit(1)

In [None]:
# Locations read and written by the export cells.
input_folder = "../../preprocessed/p7-json/"
output_folder = "../../outputs/json_2.0/"
retired_folder = "../../outputs/retired/"

summary_file = "../../outputs/summary.tsv"
todo_file = "../../outputs/todos.tsv"

# bgc_id -> list of (compound, chem_struct, database ids) tuples
structures_20 = {}

# Start from empty output directories: anything left from an earlier run is
# deleted before the folders are recreated.
for folder in (output_folder, retired_folder):
    if path.exists(folder):
        shutil.rmtree(folder)
    makedirs(folder)

# Write the retirement summary (one row per BGC, one 0/1 column per reason) and
# export every entry: entries with no remaining retirement reason are cleaned,
# validated and written to the output folder; the rest are written unchanged
# to the retired folder.
with open(summary_file, "w") as sf:
    # sorted() gives the reason columns a stable order; iterating a set of
    # strings can yield a different order on every run (hash randomization).
    reasons_sorted = sorted(reasons)
    sf.write("bgc_id\t{}\n".format("\t".join(reasons_sorted)))
    for json_path in sorted(glob.glob(path.join(input_folder, "BGC*.json"))):
        bgc_id = path.basename(json_path).split(".")[0]
        with open(json_path, "r") as json_file:
            data = json.load(json_file)

        # Collected from the raw (uncleaned) data for the structures table.
        structures_20[bgc_id] = [
            (
                compound["compound"],
                compound.get("chem_struct", ""),
                ";".join(compound.get("database_id", [])),
            )
            for compound in data["cluster"]["compounds"]
        ]

        retirement_reasons = bgcs.get(bgc_id, set())
        flags = [str(int(reason in retirement_reasons)) for reason in reasons_sorted]
        sf.write("{}\t{}\n".format(bgc_id, "\t".join(flags)))

        if len(retirement_reasons) == 0:
            print("Copying {}".format(bgc_id))
            data = clean_data(data, schema_obj)
            validate_data(data, validator)
            target_folder = output_folder
        else:
            print("Retiring {}".format(bgc_id))
            print(retirement_reasons)
            target_folder = retired_folder

        with open(path.join(target_folder, "{}.json".format(bgc_id)), "w") as o:
            o.write(json.dumps(data, indent=4, separators=(',', ': '), sort_keys=True))
    print("Done!")
    
# Write the todo table: one row per input BGC file, one column per todo
# reason, each cell holding the count recorded in `todos` (0 when absent).
with open(todo_file, "w") as sf:
    # One sorted list drives both the header and the rows, so the columns
    # always line up and keep the same order from run to run.
    reasons_sorted = sorted(todos_reasons)
    sf.write("bgc_id\t{}\n".format("\t".join(reasons_sorted)))
    for json_path in sorted(glob.glob(path.join(input_folder, "BGC*.json"))):
        # Only the file name is needed here; the JSON content is not used,
        # so the file is not opened or parsed.
        bgc_id = path.basename(json_path).split(".")[0]
        counts = todos.get(bgc_id, {})
        row = [bgc_id] + [str(counts.get(reason, 0)) for reason in reasons_sorted]
        sf.write("\t".join(row) + "\n")
    print("Done!")

In [None]:
# Dump the collected compound tuples as a 4-column TSV:
# bgc_id, then the three fields stored for each compound in structures_20.
row_template = "{}\t{}\t{}\t{}\n"
with open("../../outputs/bgc_structures_20.tsv", "w") as structures_out:
    for bgc_id, compounds in structures_20.items():
        for compound in compounds:
            structures_out.write(row_template.format(bgc_id, compound[0], compound[1], compound[2]))