In [1]:
import glob
import json
import re
import shutil
import sys
from os import path, makedirs

from jsonschema import validate, Draft7Validator

In [2]:
# Module-level accumulators shared by the cells below.
reasons = set()        # every retirement reason seen in any retired list
bgcs = {}              # bgc_id -> set of retirement reasons for that entry
todos = {}             # bgc_id -> per-reason todo counts (filled in a later cell)
todos_reasons = set()  # every todo reason seen (filled in a later cell)

# Each retired list is a TSV file: column 0 is the BGC id, column 1 is a
# ';'-separated list of reasons.
for phase in [5, 6, 7, 9, 10]:
    retired_list = "../../preprocessed/reports/p{}-retired_list.tsv".format(phase)
    with open(retired_list, "r") as pl:
        for line in pl:
            cols = line.strip().split("\t")
            bgc_id = cols[0]
            bgc_reasons = bgcs.setdefault(bgc_id, set())
            row_reasons = cols[1].split(";")
            bgc_reasons.update(row_reasons)
            reasons.update(row_reasons)

In [3]:
# Remove every problem that a fixed list reports as solved from the retirement
# reasons recorded in `bgcs`. Each fixed list is a TSV file: column 0 is the
# BGC id, column 1 is a ';'-separated list of fixed problems.
for phase in [7, 8, 9, 10]:
    with open("../../preprocessed/reports/p{}-fixed_list.tsv".format(phase), "r") as pl:
        for line in pl:
            cols = line.strip().split("\t")
            bgc_id = cols[0]
            fixed_problems = cols[1].split(";")
            for fixed_problem in fixed_problems:
                try:
                    bgcs[bgc_id].remove(fixed_problem)
                except KeyError:
                    # Raised when the BGC has no recorded reasons at all, or when
                    # this problem was never recorded for it. Report the id and
                    # carry on; anything else (malformed row, interrupt, ...) is
                    # no longer silently swallowed.
                    print(bgc_id)

BGC0001331


In [4]:
# Collect the todo counts. Each todo list is a TSV file: column 0 is the BGC
# id, column 1 a single reason, column 2 a ';'-separated list of ids. Only the
# number of ids is kept, and a later phase overwrites an earlier one for the
# same (bgc_id, reason) pair.
for phase in [7, 9]:
    todo_list = "../../preprocessed/reports/p{}-todo_list.tsv".format(phase)
    with open(todo_list, "r") as pl:
        for line in pl:
            cols = line.strip().split("\t")
            bgc_id, reason = cols[0], cols[1]
            ids = cols[2].split(";")
            todos.setdefault(bgc_id, {})[reason] = len(ids)
            todos_reasons.add(reason)

In [5]:
def append_addprop(input_dict):
    """Recursively forbid undeclared properties in a schema fragment, in place.

    Every dict that has a "properties" key and no "additionalProperties" key
    gets "additionalProperties" set to False. All dict values and list
    elements are visited recursively, except values stored under an "allOf"
    key, which are skipped. Anything that is neither a dict nor a list is
    ignored.

    Args:
        input_dict: the schema node to process (dict, list or scalar).

    Returns:
        None; dicts are modified in place.
    """
    if isinstance(input_dict, list):
        for element in input_dict:
            append_addprop(element)
        return
    if not isinstance(input_dict, dict):
        return

    if "properties" in input_dict:
        # setdefault leaves an explicit additionalProperties setting untouched.
        input_dict.setdefault("additionalProperties", False)
    for name, child in input_dict.items():
        if name != "allOf":
            append_addprop(child)

def add_rules_additionalProperties(schema):
    """Apply `append_addprop` to the whole schema, modifying it in place.

    Args:
        schema: the loaded schema object.
    """
    append_addprop(schema)

In [6]:
final_schema = "../../inputs/mibig_schema_phase_6.json"
output_schema = "../../outputs/mibig_2.0_schema.json"
schema_obj = None
validator = None

# Read the input schema.
with open(final_schema, "r") as json_file:
    schema_obj = json.load(json_file)

# Tighten it in place and build the validator from the tightened version.
add_rules_additionalProperties(schema_obj)
validator = Draft7Validator(schema_obj)

# Write out the exact schema the validator was built from.
with open(output_schema, "w") as o:
    json.dump(schema_obj, o, indent=4, separators=(',', ': '), sort_keys=True)

In [7]:
def check_and_remove_unknowns(input_dict, path, schema):
    """Drop "unknown"/"none" placeholder strings that the schema does not list as an enum.

    The comparison is case-insensitive. For a dict, a placeholder-valued key is
    dropped when the schema node reached through `path + ["properties", key]`
    has no "enum" entry; every kept value is processed recursively. For a
    list, placeholder strings are filtered out when the schema node at
    `path + ["items"]` has no "enum" entry.

    NOTE(review): list elements are not processed recursively, so placeholders
    nested inside a list of objects are left alone -- confirm this is intended.

    Args:
        input_dict: the data node to clean (dict, list or scalar).
        path: list of keys leading from `schema` to the node describing
            `input_dict`.
        schema: the root schema object.

    Returns:
        A new dict or list when something may have been filtered; otherwise
        the input object itself.

    Raises:
        KeyError: if a required schema node cannot be reached by walking `path`.
    """
    def is_placeholder(value):
        return isinstance(value, str) and value.lower() in ("unknown", "none")

    def schema_node(steps):
        # Walk down from the schema root, one key at a time.
        node = schema
        for step in steps:
            node = node[step]
        assert isinstance(node, dict)
        return node

    if isinstance(input_dict, dict):
        cleaned = {}
        for key, val in input_dict.items():
            key_path = path + ["properties", key]
            # The schema is only consulted for placeholder values; other
            # placeholders were already handled in phase_5.
            if is_placeholder(val) and "enum" not in schema_node(key_path):
                continue  # discard this attribute
            cleaned[key] = check_and_remove_unknowns(val, key_path, schema)
        return cleaned

    if isinstance(input_dict, list):
        if "enum" in schema_node(path + ["items"]):
            # Enumerated items are left exactly as they are.
            return input_dict
        return [val for val in input_dict if not is_placeholder(val)]

    return input_dict

In [8]:
def check_and_remove_empty_values(input_dict):
    """Recursively prune empty values from a JSON-like structure.

    A value counts as empty when it is None, an empty string, or a dict/list
    that is empty once its own children have been pruned. Empty dict values
    and list elements are dropped. Falsy but meaningful values such as 0 and
    False are kept, as are whitespace-only strings.

    Args:
        input_dict: the node to prune (dict, list, str or any other scalar).

    Returns:
        The pruned copy for dicts and lists, the value itself for scalars, or
        None when the node is empty.
    """
    if isinstance(input_dict, dict):
        pruned = {}
        for key, raw in input_dict.items():
            val = check_and_remove_empty_values(raw)
            # Identity test: only a real None marks an empty child. An
            # equality test would also drop any value that compares equal
            # to None.
            if val is not None:
                pruned[key] = val
        return pruned or None

    if isinstance(input_dict, list):
        kept = []
        for raw in input_dict:
            val = check_and_remove_empty_values(raw)
            if val is not None:
                kept.append(val)
        return kept or None

    if isinstance(input_dict, str) and not input_dict:  # ""
        return None

    return input_dict

In [9]:
def clean_data(data, schema):
    """Clean one entry before validation.

    Two passes, in this order:
      1. `check_and_remove_unknowns` drops "Unknown"/"None" placeholder
         strings that the schema does not list as an enum.
      2. `check_and_remove_empty_values` drops empty arrays/dicts/values.
         If that causes a validation error, the data has to be fixed at an
         earlier phase.

    Args:
        data: the loaded entry.
        schema: the root schema object.

    Returns:
        The cleaned entry.
    """
    without_placeholders = check_and_remove_unknowns(data, [], schema)
    return check_and_remove_empty_values(without_placeholders)

In [10]:
def validate_data(data, validator):
    """Validate `data` and stop the run on the first reported error.

    The errors yielded by `validator.iter_errors(data)` are sorted by their
    string form; the message of the first one is printed and the process
    exits with status 1. Nothing happens when there are no errors.

    Args:
        data: the entry to validate.
        validator: an object exposing `iter_errors(data)`, whose items have a
            `message` attribute.

    Raises:
        SystemExit: with code 1 when at least one validation error is found.
    """
    errors = sorted(validator.iter_errors(data), key=str)
    if errors:
        print(errors[0].message)
        # Non-zero status: a validation failure must not look like success.
        sys.exit(1)

In [11]:
input_folder = "../../preprocessed/p10-json/"
output_folder = "../../outputs/json_2.0/"
retired_folder = "../../outputs/retired/"

summary_file = "../../outputs/summary.tsv"
todo_file = "../../outputs/todos.tsv"

# Recreate both output folders from scratch so files written by a previous
# run never linger.
for folder in (output_folder, retired_folder):
    if path.exists(folder):
        shutil.rmtree(folder)
    makedirs(folder)

# summary.tsv: one row per entry with a 0/1 column per retirement reason.
# Entries without remaining reasons are cleaned, validated and copied to the
# output folder; the others are written unchanged to the retired folder.
with open(summary_file, "w") as sf:
    # Sorted so the column order is reproducible between runs (a plain
    # list(set) follows the set's arbitrary iteration order).
    reasons_sorted = sorted(reasons)
    sf.write("bgc_id\t{}\n".format("\t".join(reasons_sorted)))
    for json_path in sorted(glob.glob(path.join(input_folder, "BGC*.json"))):
        bgc_id = path.basename(json_path).split(".")[0]
        with open(json_path, "r") as json_file:
            data = json.load(json_file)
        retirement_reasons = bgcs.get(bgc_id, set())
        flags = [str(int(reason in retirement_reasons)) for reason in reasons_sorted]
        sf.write("{}\t{}\n".format(bgc_id, "\t".join(flags)))
        if not retirement_reasons:
            print("Copying {}".format(bgc_id))
            data = clean_data(data, schema_obj)
            validate_data(data, validator)
            target_folder = output_folder
        else:
            print("Retiring {}".format(bgc_id))
            target_folder = retired_folder
        with open(path.join(target_folder, "{}.json".format(bgc_id)), "w") as o:
            o.write(json.dumps(data, indent=4, separators=(',', ': '), sort_keys=True))
    print("Done!")

# todos.tsv: one row per entry with the todo count per reason (0 when absent).
with open(todo_file, "w") as sf:
    # Header and rows share this one sorted list, so the columns always line up.
    reasons_sorted = sorted(todos_reasons)
    sf.write("bgc_id\t{}\n".format("\t".join(reasons_sorted)))
    # Only the file names are needed here; the JSON content is not read again.
    for json_path in sorted(glob.glob(path.join(input_folder, "BGC*.json"))):
        bgc_id = path.basename(json_path).split(".")[0]
        bgc_todos = todos.get(bgc_id, {})
        sf.write(bgc_id)
        for reason in reasons_sorted:
            sf.write("\t")
            sf.write(str(bgc_todos.get(reason, 0)))
        sf.write("\n")
    print("Done!")

Copying BGC0000001
Copying BGC0000002
Copying BGC0000003
Copying BGC0000004
Copying BGC0000005
Copying BGC0000006
Copying BGC0000007
Copying BGC0000008
Copying BGC0000009
Copying BGC0000010
Copying BGC0000011
Copying BGC0000012
Copying BGC0000013
Copying BGC0000014
Retiring BGC0000015
Copying BGC0000016
Copying BGC0000017
Copying BGC0000018
Copying BGC0000019
Copying BGC0000020
Copying BGC0000021
Copying BGC0000022
Copying BGC0000023
Copying BGC0000024
Copying BGC0000025
Copying BGC0000026
Copying BGC0000027
Copying BGC0000028
Copying BGC0000029
Copying BGC0000030
Copying BGC0000031
Copying BGC0000032
Copying BGC0000033
Copying BGC0000034
Copying BGC0000035
Copying BGC0000036
Copying BGC0000037
Copying BGC0000038
Copying BGC0000039
Copying BGC0000040
Copying BGC0000041
Copying BGC0000042
Copying BGC0000043
Copying BGC0000044
Copying BGC0000045
Copying BGC0000046
Copying BGC0000047
Copying BGC0000048
Retiring BGC0000049
Copying BGC0000050
Copying BGC0000051
Copying BGC0000052
Copying BG

Copying BGC0000446
Copying BGC0000447
Copying BGC0000448
Copying BGC0000449
Copying BGC0000450
Copying BGC0000451
Copying BGC0000452
Copying BGC0000453
Copying BGC0000454
Copying BGC0000455
Retiring BGC0000456
Copying BGC0000457
Copying BGC0000458
Copying BGC0000459
Copying BGC0000460
Copying BGC0000461
Retiring BGC0000462
Copying BGC0000463
Copying BGC0000464
Copying BGC0000465
Copying BGC0000466
Copying BGC0000467
Copying BGC0000468
Copying BGC0000469
Copying BGC0000470
Copying BGC0000471
Copying BGC0000472
Copying BGC0000473
Copying BGC0000474
Copying BGC0000475
Copying BGC0000476
Copying BGC0000477
Copying BGC0000478
Copying BGC0000479
Copying BGC0000480
Copying BGC0000481
Copying BGC0000482
Copying BGC0000483
Copying BGC0000484
Copying BGC0000485
Copying BGC0000486
Copying BGC0000487
Copying BGC0000488
Copying BGC0000489
Copying BGC0000490
Copying BGC0000491
Copying BGC0000492
Copying BGC0000493
Copying BGC0000494
Copying BGC0000495
Copying BGC0000496
Copying BGC0000497
Copying BG

Copying BGC0000915
Copying BGC0000916
Copying BGC0000917
Copying BGC0000918
Copying BGC0000919
Copying BGC0000920
Copying BGC0000921
Copying BGC0000922
Copying BGC0000923
Copying BGC0000924
Copying BGC0000925
Copying BGC0000926
Retiring BGC0000927
Copying BGC0000928
Copying BGC0000929
Copying BGC0000930
Copying BGC0000931
Copying BGC0000932
Copying BGC0000933
Copying BGC0000934
Copying BGC0000935
Copying BGC0000936
Copying BGC0000937
Copying BGC0000938
Copying BGC0000939
Copying BGC0000940
Copying BGC0000941
Copying BGC0000942
Copying BGC0000943
Copying BGC0000944
Retiring BGC0000945
Copying BGC0000946
Copying BGC0000947
Copying BGC0000948
Copying BGC0000949
Copying BGC0000950
Copying BGC0000951
Retiring BGC0000952
Copying BGC0000953
Copying BGC0000954
Copying BGC0000955
Copying BGC0000956
Copying BGC0000957
Copying BGC0000958
Copying BGC0000959
Copying BGC0000960
Copying BGC0000961
Copying BGC0000962
Copying BGC0000963
Copying BGC0000964
Copying BGC0000965
Copying BGC0000966
Copying B

Copying BGC0001376
Copying BGC0001377
Copying BGC0001378
Copying BGC0001379
Copying BGC0001380
Copying BGC0001381
Retiring BGC0001382
Copying BGC0001383
Copying BGC0001384
Retiring BGC0001385
Copying BGC0001386
Copying BGC0001387
Copying BGC0001388
Copying BGC0001389
Copying BGC0001390
Copying BGC0001391
Copying BGC0001392
Copying BGC0001393
Copying BGC0001394
Copying BGC0001395
Copying BGC0001396
Copying BGC0001397
Retiring BGC0001398
Copying BGC0001399
Copying BGC0001400
Retiring BGC0001401
Copying BGC0001402
Copying BGC0001403
Copying BGC0001404
Copying BGC0001405
Copying BGC0001406
Copying BGC0001407
Copying BGC0001408
Copying BGC0001409
Copying BGC0001410
Copying BGC0001411
Copying BGC0001412
Copying BGC0001413
Copying BGC0001414
Copying BGC0001415
Copying BGC0001416
Copying BGC0001417
Copying BGC0001418
Copying BGC0001419
Copying BGC0001420
Copying BGC0001421
Copying BGC0001422
Copying BGC0001423
Copying BGC0001424
Copying BGC0001425
Copying BGC0001426
Copying BGC0001427
Copying 

Copying BGC0001829
Copying BGC0001830
Done!
Done!
