In [1]:
###### phase 7: re-check gene annotations, comparing it to the GBK, filter out ones without any new information added ######
#### it also populate organism and taxon properties from mibig finalgbk ####

In [2]:
from os import path, makedirs
import glob
import json
from jsonschema import validate, Draft7Validator
from tempfile import TemporaryDirectory
from Bio import SeqIO, Entrez
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Alphabet import generic_protein
from Bio.SeqRecord import SeqRecord
import sys
import datetime
import time
import re
import urllib3
from urllib.parse import quote
import certifi
from xml.dom import minidom
http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
from jsonschema.validators import Draft7Validator
from shutil import copy2

In [3]:
def get_ncbi_tax_ids(organism_name, email):
    tax_ids = []
    
    Entrez.email = email
    num_try = 1
    while num_try < 6:
        try:
            organism_name_escaped = "\"{}\"".format(organism_name)
            dom = minidom.parse(Entrez.esearch(db="taxonomy", term=organism_name_escaped))
            ids = dom.getElementsByTagName('Id')
            if len(ids) < 1:
                raise Exception()
            for tax_id in ids:
                tax_ids.append(tax_id.firstChild.nodeValue)
            break
        except:
            pass
        num_try += 1
        time.sleep(5)
    
    return tax_ids

def fetch_gbk(nucl_acc, email, clean_cache = False):
    cache_folder = "../../preprocessed/cache/cached_gbks/"
    cache_path = path.join(cache_folder, "{}.gbk".format(nucl_acc))

    if not path.exists(cache_folder):
        makedirs(cache_folder)
    
    if clean_cache or not path.exists(cache_path) or path.getsize(cache_path) < 100:
        if nucl_acc.startswith("MIBIG.BGC"):
            [_, bgc_id, cl_id] = nucl_acc.split(".")
            mibig_final_file_path = "../../inputs/cached_mibig_finalgbks/{}.1.final.gbk".format(bgc_id)
            with open(mibig_final_file_path, "r") as mibig_final_file:
                with open(cache_path, "w", encoding="utf-8") as cache_gbk_file:
                    write_to_file = False
                    for line in mibig_final_file:
                        if line.startswith("LOCUS"):
                            gbk_accession = line.split(" ")[1]
                            if gbk_accession == "{}.{}".format(bgc_id, cl_id):
                                write_to_file = True
                        if write_to_file:
                            cache_gbk_file.write(line)
                        if line.rstrip() == "//":
                            break
        else: # (re)download from ncbi
            Entrez.email = email
            handle = Entrez.efetch(db="nucleotide", id=nucl_acc, rettype="gbwithparts", retmode="text")
            if not path.exists(cache_folder):
                makedirs(cache_folder)
            with open(cache_path, "w") as gbk_file:
                gbk_file.write(handle.read())
            
    return open(cache_path, "r")

# from antismash
def get_aa_translation(seq_record, feature):
    """Obtain content for translation qualifier for specific CDS feature in sequence record"""
    extracted = feature.extract(seq_record.seq).ungap('-')

    # ensure the extracted section is a multiple of three by trimming any excess
    if len(extracted) % 3 != 0:
        extracted = extracted[:-(len(extracted) % 3)]

    fasta_seq = extracted.translate(to_stop=True)
    if len(fasta_seq) == 0:
        print("Retranslating {} with stop codons".format(feature.id))
        fasta_seq = extracted.translate()

    # replace ambiguous aminos with an explicit unknown
    string_version = str(fasta_seq)
    for bad in "*BJOUZ":
        string_version = string_version.replace(bad, "X")

    # and remove any gaps
    string_version = string_version.replace("-", "")
    fasta_seq = Seq(string_version, generic_protein)

    return fasta_seq

def get_aa_sequence(feature, to_stop=False):
    """Extract sequence from specific CDS feature in sequence record"""
    fasta_seq = feature.qualifiers['translation'][0]
    if "*" in fasta_seq:
        if to_stop:
            fasta_seq = fasta_seq.split('*')[0]
        else:
            fasta_seq = fasta_seq.replace("*","X")
    if "-" in fasta_seq:
        fasta_seq = fasta_seq.replace("-","")
    return fasta_seq


def count_props(input_dict, cur_path, result):
    """given a (mibig?) json, construct a list of property paths
    along with its presence count in the json object"""
    key_path = cur_path
    
    if isinstance(input_dict, dict):
        for key in input_dict.keys():
            result = count_props(input_dict[key], "{}/{}".format(key_path, key), result)
    elif isinstance(input_dict, list):
        key_path = "{}[]".format(key_path)
        for node in input_dict:
            result = count_props(node, "{}".format(key_path), result)

    if not isinstance(input_dict, dict):
        if key_path not in result:
            result[key_path] = 0
        result[key_path] += 1
    
    return result


def fetch_props_new_schema(input_dict, cur_path, result):
    """given a (mibig?) json draft7 schema, construct a list of property paths
    along with either required == True for each properties"""
    key_path = cur_path
    if ("type" not in input_dict) or (input_dict["type"] not in ["object", "array"]):
        key_path = "{}".format(cur_path) # string / etc.
    elif input_dict["type"] == "object":
        for key in input_dict["properties"]:
            result = fetch_props_new_schema(input_dict["properties"][key], "{}/{}".format(key_path, key), result)
    elif input_dict["type"] == "array":
        key_path = "{}[]".format(cur_path)
        result = fetch_props_new_schema(input_dict["items"], "{}".format(key_path), result)
    
    if key_path not in result and "properties" not in input_dict:
        result[key_path] = False # can't really use this
    return result

def sanitise_gene_name(name):
    if name is None:
        return None
    name = str(name)
    illegal_chars = set("!\"#$%&()*+,:; \r\n\t=>?@[]^`'{|}/ ")
    for char in set(name).intersection(illegal_chars):
        name = name.replace(char, "_")
    return name

In [4]:
def fetch_mibig_final_gbk(bgc_id, clean_cache = False):
    return open("../../inputs/cached_mibig_finalgbks/{}.1.final.gbk".format(bgc_id), "r")

In [5]:
def _fetch_val_to_ref(input_dict, parent_dict, key, all_references):
    if isinstance(input_dict, dict):
        for key in input_dict.keys():
            _fetch_val_to_ref(input_dict[key], input_dict, key, all_references)
    elif isinstance(input_dict, list):
        for i, node in enumerate(input_dict):
            _fetch_val_to_ref(node, input_dict, i, all_references)
    elif isinstance(input_dict, str):
        if parent_dict != None:
            valu = input_dict.upper()
            if valu not in all_references:
                all_references[valu] = []
            all_references[valu].append((parent_dict, key))
    return all_references

with open("../../preprocessed/reports/p7-updated_cases.tsv", "w") as uc:
    uc.write("")
    
def _make_gene_id_match_reference(input_dict, reference_ids, bgc_id):
    all_references = _fetch_val_to_ref(input_dict, None, None, {})
    all_rids = {}
    for rid in reference_ids:
        if len(rid) > 0:
            all_rids[rid.upper()] = rid
    matches = set(all_references.keys()).intersection(set(all_rids.keys()))    
    for key in matches:
        for ar in all_references[key]:
            if ar[0][ar[1]] != all_rids[key]:
                with open("../../preprocessed/reports/p7-updated_cases.tsv", "a") as uc:
                    uc.write("{}\t{}\t{}\n".format(bgc_id, ar[0][ar[1]], all_rids[key]))
                ar[0][ar[1]] = all_rids[key]

def make_gene_id_match_reference(data, reference_ids):
    bgc_id = data["cluster"]["mibig_accession"]
    _make_gene_id_match_reference(data["cluster"]["compounds"], reference_ids, bgc_id)
    _make_gene_id_match_reference(data["cluster"].get("genes", {}), reference_ids, bgc_id)
    _make_gene_id_match_reference(data["cluster"].get("polyketide", {}), reference_ids, bgc_id)
    _make_gene_id_match_reference(data["cluster"].get("nrp", {}), reference_ids, bgc_id)
    _make_gene_id_match_reference(data["cluster"].get("ripp", {}), reference_ids, bgc_id)
    _make_gene_id_match_reference(data["cluster"].get("terpene", {}), reference_ids, bgc_id)
    _make_gene_id_match_reference(data["cluster"].get("saccharide", {}), reference_ids, bgc_id)
    
    return data

In [6]:
removed_extra_genes = {}
removed_annotations = {}
removed_operons = {}
no_taxid = {}
added_mibig_genes = {}
no_gbk = {}
fixed = {}
ripps_to_fix = {}

nt_length_and_cds_count = {}

def check_gene_annotations(data):
    loci = data["cluster"]["loci"]
    annots = data["cluster"].get("genes", {})
    gbk_acc = loci["accession"].upper()
    bgc_id = data["cluster"]["mibig_accession"]
    email = "mibig@secondarymetabolites.org"
    gbk_record = None
    if True:
        num_try = 1
        clean_cache = False
        while num_try < 6:
            try:
                with fetch_gbk(gbk_acc, email, clean_cache) as gbk_handle:
                    seq_record = SeqIO.read(gbk_handle, "genbank") # the gbk should contains only 1 file
                    if len(seq_record.seq) < 1:
                        raise Exception("Empty sequence record {}".format(gbk_acc))
                    gbk_record = seq_record
                    break
            except:
                print("Error...")
                clean_cache = True
            num_try += 1
            time.sleep(5)
        if not isinstance(gbk_record, SeqRecord): # failed to download NCBI data
            print("{} Failed to download: {}".format(data["cluster"]["mibig_accession"], gbk_acc))
            no_gbk[bgc_id] = gbk_acc
            return {}

    # fill up taxid
    tax_ids = get_ncbi_tax_ids(gbk_record.annotations["organism"], email)
    if len(tax_ids) != 1:
        no_taxid[bgc_id] = gbk_record.annotations["organism"]
        print(tax_ids)
    else:
        data["cluster"]["ncbi_tax_id"] = tax_ids[0]
        data["cluster"]["organism_name"] = gbk_record.annotations["organism"]
         
    # fetch all CDS inside the cluster
    cluster_cds = {}
    cluster_cds_ids = set(["", "No protein ID"]) # quick lookup for existing IDs
    for feature in seq_record.features:
        if feature.type == "CDS":
            if ("start_coord" not in loci) or (feature.location.start >= loci["start_coord"]-1 and feature.location.end <= loci["end_coord"]):
                if "gene" in feature.qualifiers:
                    cluster_cds_ids.add(feature.qualifiers["gene"][0])
                    cluster_cds[feature.qualifiers["gene"][0]] = feature
                if "protein_id" in feature.qualifiers:
                    cluster_cds_ids.add(feature.qualifiers["protein_id"][0])
                    cluster_cds[feature.qualifiers["protein_id"][0]] = feature
                if "locus_tag" in feature.qualifiers:
                    cluster_cds_ids.add(feature.qualifiers["locus_tag"][0])
                    cluster_cds[feature.qualifiers["locus_tag"][0]] = feature
                
    # fetch all CDS in MIBiG finalgbk
    mibig_extra_genes = []
    with fetch_mibig_final_gbk(bgc_id) as mibig_gbk_handle:
        mibig_seq_records = SeqIO.parse(mibig_gbk_handle, "genbank")
        for mibig_seq_record in mibig_seq_records:
            # look for gene information
            match = re.search("(between(?P<start>\d+)-(?P<end>\d+)ntfrom){0,1}GenBankID(?P<acc>[A-Z0-9_\.]+)\.", mibig_seq_record.annotations['comment'].replace(" ", "").replace("\n", ""))
            if match:
                gbk_acc_mibig = match.group("acc").upper()
                if gbk_acc_mibig == gbk_acc:
                    offset = 0
                    if match.group("start"):
                        offset = int(match.group("start")) - 1
                    for feature in mibig_seq_record.features:
                        if feature.type == "CDS":
                            ids = []
                            if "gene" in feature.qualifiers:
                                if feature.qualifiers["gene"][0].upper() in [sanitise_gene_name(gid.upper()) for gid in cluster_cds_ids]:
                                    continue
                                ids.append(feature.qualifiers["gene"][0])
                                cluster_cds[feature.qualifiers["gene"][0]] = feature
                            if "protein_id" in feature.qualifiers:
                                if feature.qualifiers["protein_id"][0].upper() in [sanitise_gene_name(gid.upper()) for gid in cluster_cds_ids]:
                                    continue
                                ids.append(feature.qualifiers["protein_id"][0])
                                cluster_cds[feature.qualifiers["protein_id"][0]] = feature
                            if "locus_tag" in feature.qualifiers:
                                if feature.qualifiers["locus_tag"][0].upper() in [sanitise_gene_name(gid.upper()) for gid in cluster_cds_ids]:
                                    continue
                                ids.append(feature.qualifiers["locus_tag"][0])
                                cluster_cds[feature.qualifiers["locus_tag"][0]] = feature
                            cluster_cds_ids.update(set(ids))
                            mibig_extra_genes.append(feature._shift(offset)) 
                            if bgc_id not in added_mibig_genes:
                                added_mibig_genes[bgc_id] = []
                            added_mibig_genes[bgc_id].append("/".join([gid for gid in ids]))
                            
    # add mibig extra genes
    for feature in mibig_extra_genes:
        if "extra_genes" not in annots:
            annots["extra_genes"] = []
        if "annotations" not in annots:
            annots["annotations"] = []
        gene_id = feature.qualifiers.get("locus_tag", feature.qualifiers.get("protein_id", feature.qualifiers.get("gene", [None])))[0]
        if gene_id != None:
            extra_gene = {
                "id": gene_id,
                "location": {
                    "exons": [{"start": location.start + 1, "end": location.end} for location in feature.location.parts],
                    "strand": feature.location.strand
                }
            }
            #if "translation" in feature.qualifiers:
            #    extra_gene["translation"] = feature.qualifiers["translation"][0]
            annots["extra_genes"].append(extra_gene)
            annot = {
                "id": gene_id
            }
            if "product" in feature.qualifiers:
                annot["product"] = feature.qualifiers["product"][0]
            if "note" in feature.qualifiers:
                annot["comments"] = feature.qualifiers["note"][0]
            if len(annot.keys()) > 1:
                annots["annotations"].append(annot)

    # check extra genes annotation
    extra_genes = []
    for i, extra_gene in enumerate(annots.get("extra_genes", [])):
        gene_id = extra_gene["id"]
        if gene_id.upper() not in [gid.upper() for gid in cluster_cds_ids] or gene_id.upper() in [gid.upper() for gid in "/".join(added_mibig_genes.get(bgc_id, "")).split("/")]:
            # check location, if outside the cluster, remove the locus information
            if "location" in extra_gene:
                exons = extra_gene["location"].get("exons", [])
                if len(exons) > 0:
                    cds_start = len(gbk_record.seq)
                    cds_end = 0
                    for exon in exons:
                        if exon["start"] < cds_start:
                            cds_start = exon["start"]
                        if exon["end"] > cds_end:
                            cds_end = exon["end"]
                    if "start_coord" in loci and (cds_start < loci["start_coord"] or cds_end > loci["end_coord"]):
                        extra_gene.pop("location", None)
                else:
                    extra_gene.pop("location", None)
            extra_genes.append(extra_gene)
            cluster_cds_ids.add(gene_id)
        else:
            if bgc_id not in removed_extra_genes:
                removed_extra_genes[bgc_id] = []
            removed_extra_genes[bgc_id].append(str(i))
            
    if len(extra_genes) > 0:
        annots["extra_genes"] = extra_genes
    else:
        annots.pop("extra_genes", None)
        
    if ("start_coord" not in loci):
        nt_length = len(gbk_record.seq)
    else:
        nt_length = loci["end_coord"] - loci["start_coord"]
    gene_counts = len(cluster_cds.keys()) + len(extra_genes)
    nt_length_and_cds_count[bgc_id] = (nt_length, gene_counts)
            
    # remove junk ids
    cluster_cds_ids.discard("")
    cluster_cds_ids.discard(None)
    
    # check annotation, remove if no added info
    gene_annots = []
    for i, annot in enumerate(annots.get("annotations", [])):
        approve = False
        found_id = None
        for gid in cluster_cds_ids:
            if annot.get("id", "").upper() == gid.upper():
                found_id = gid
                annot["id"] = gid
                break
            elif annot.get("name", "").upper() == gid.upper():
                found_id = gid
                annot["name"] = gid
                break
        ids = "id='{}',name='{}'".format(annot.get("id", ""), annot.get("name", ""))
        if True:
            if len(annot.get("name", "")) < 1:
                annot.pop("name", None)
            if len(annot.get("product", "")) < 1:
                annot.pop("product", None)
            if len(annot.get("mut_pheno", "")) < 1:
                annot.pop("mut_pheno", None)
            if len(annot.get("comments", "")) < 1:
                annot.pop("comments", None)
            g_functions = []
            for g_function in annot.get("functions", []):
                if len(g_function.get("category", "")) < 1:
                    continue
                elif len(g_function.get("evidence", [])) < 1:
                    continue
                elif len(set(g_function["evidence"]) - set(["Sequence-based prediction", "Other in vivo study", "Heterologous expression", "Knock-out", "Activity assay"])) > 0:
                    continue
                else:
                    g_functions.append(g_function)
            if len(g_functions) > 0:
                annot["functions"] = g_functions
            else:
                annot.pop("functions", None)
            if len(annot.get("tailoring", [])) < 1:
                annot.pop("tailoring", None)
            if len(annot.get("publications", [])) < 1:
                annot.pop("publications", None)
                
            if found_id == None:
                annot = {} # uncomment to add to extra genes when not found                
                #if len(annot.get("id", "")) < 1:
                #    if len(annot.get("name", "")) < 1:
                #        annot = {}
                #    else:
                #        annot["id"] = annot["name"]
                #if len(annot.get("id", "")) > 0:
                #    if "extra_genes" not in annots:
                #        annots["extra_genes"] = []
                #    annots["extra_genes"].append({"id": annot["id"]})
                #    cluster_cds_ids.add(annot["id"])
                    
            if len(annot.keys()) > 1: # 1 is the id
                approve = True
        if approve:
            gene_annots.append(annot)
        else:
            if bgc_id not in removed_annotations:
                removed_annotations[bgc_id] = []
            removed_annotations[bgc_id].append(ids)
    if len(gene_annots) > 0:
        annots["annotations"] = gene_annots
    else:
        annots.pop("annotations", None)
                
    # check operons
    operons = []
    for i, operon in enumerate(annots.get("operons", [])):
        o_genes = set([gid.upper() for gid in operon.get("genes", [])])
        o_evidence = set(operon.get("evidence", [])).intersection(set(["Sequence-based prediction", "RACE", "ChIPseq", "RNAseq"]))
        if len(o_genes) > 0 and len(o_genes - cluster_cds_ids) < 1:
            if len(o_evidence) > 0:
                operons.append({
                    "genes": list(o_genes),
                    "evidence": list(o_evidence)
                })
                continue
        if bgc_id not in removed_operons:
            removed_operons[bgc_id] = []
        removed_operons[bgc_id].append(str(i))
        
    if len(operons) > 0:
        annots["operons"] = operons
    else:
        annots.pop("operons", None)

    # try to fix ripp leader/follower sequences
    if "ripp" in data["cluster"]:
        fixed_precursors = []
        unfixable_precursors = []
        for i, precursor in enumerate(data["cluster"]["ripp"].get("precursor_genes", [])):
            if "gene_id" in precursor:
                cds_feature = None
                for gid in cluster_cds_ids:
                    if precursor["gene_id"].upper() == gid.upper():
                        cds_feature = cluster_cds[gid]
                        break
                if cds_feature != None:
                    if "translation" in cds_feature.qualifiers:
                        aa_sequence = str(cds_feature.qualifiers["translation"][0])
                        leader_length = precursor.get("leader_sequence", 0)
                        follower_length = precursor.get("follower_sequence", 0)
                        if leader_length < 1 or leader_length > len(aa_sequence) or follower_length > len(aa_sequence):
                            print("[ripp] Wrong leader/follower length information {}".format(precursor["gene_id"]))
                            unfixable_precursors.append(i)
                        elif len(precursor.get("core_sequence", [])) < 1:
                            print("[ripp] No core sequence {}".format(precursor["gene_id"]))
                            unfixable_precursors.append(i)
                        else:
                            cores_checked = True
                            for core in precursor["core_sequence"]:
                                if len(core) < 1:
                                    cores_checked = False
                                    break
                                core_start = aa_sequence.find(core)
                                if core_start < 0:
                                    # can't find core
                                    cores_checked = False
                                    break
                                elif core_start < leader_length or core_start + len(core) >= len(aa_sequence) - follower_length:
                                    # core overlaps with leader/follower
                                    cores_checked = False
                                    break
                            if cores_checked:
                                precursor["leader_sequence"] = aa_sequence[0:leader_length]
                                if follower_length > 0:
                                    precursor["follower_sequence"] = aa_sequence[follower_length - 1:]
                                else:
                                    precursor.pop("follower_sequence", None)
                                fixed_precursors.append(i)
                            else:
                                print("[ripp] Wrong sequence information for {}".format(precursor["gene_id"]))
                                unfixable_precursors.append(i)
                    else:
                        print("[ripp] Can't find AA sequence for {}".format(precursor["gene_id"]))
                        unfixable_precursors.append(i)
                else:
                    print("[ripp] Can't find CDS feature for {}".format(precursor["gene_id"]))
                    unfixable_precursors.append(i)
                    
        if len(fixed_precursors) > 0 and len(fixed_precursors) == len(data["cluster"]["ripp"]["precursor_genes"]):
            # remove the retired flag from ripp.leader/follower_sequence
            print("[ripp] solved precursors {}".format(bgc_id))
            if bgc_id not in fixed:
                fixed[bgc_id] = []
            fixed[bgc_id].append("ripp.leader/follower_sequence")
        elif len(unfixable_precursors) > 0:
            ripps_to_fix[bgc_id] = unfixable_precursors
            # temporarily remove the precursors, and remove the retired flag from ripp.leader/follower_sequence
            precursors = []
            for i, precursor in enumerate(data["cluster"]["ripp"]["precursor_genes"]):
                if i not in unfixable_precursors:
                    precursors.append(precursor)
            if len(precursors) > 0:
                data["cluster"]["ripp"]["precursor_genes"] = precursors
            else:
                data["cluster"]["ripp"].pop("precursor_genes", None)
            if bgc_id not in fixed:
                fixed[bgc_id] = []
            fixed[bgc_id].append("ripp.leader/follower_sequence")            
            
        
    # update gene ids in specific_infos
    data = make_gene_id_match_reference(data, cluster_cds_ids)
        
    if len(annots.keys()) > 0:
        data["cluster"]["genes"] = annots
    else:
        data["cluster"].pop("genes", None)

In [7]:
def validate_data(data, validator, schema_props):
    for error in sorted(validator.iter_errors(data), key=str):
        try:
            if error.path[-2] == "evidence" or error.path[-1] == "evidence":
                if error.path[-3] == "loci" or error.path[-2] == "loci":
                    continue
            elif error.path[-1] == "nr_iterations":
                continue
            elif error.path[-1] == "module_number":
                continue
            elif error.path[-2] == "proteinogenic":
                continue
            elif error.path[-1] in ["leader_sequence", "follower_sequence"]:
                continue
            elif error.path[-1] == "organism":
                if data["cluster"]["mibig_accession"] in no_gbk:
                    continue
            elif error.path[-1] == "subcluster" and error.path[-3] == "chem_moieties":
                continue
        except:
            pass
        print(error.message)
        #sys.exit(0)
    this_file_props = count_props(data, "", {})
    for prop in this_file_props:
        if prop not in schema_props.keys():
            print(prop)
            sys.exit(0)

In [8]:
input_path = "../../preprocessed/p6-json/"
output_folder = "../../preprocessed/p7-json/"

if not path.exists(output_folder):
    makedirs(output_folder)

validator = None
schema_props = {}
with open("../../inputs/mibig_schema_phase_6.json") as json_file:
    schema_obj = json.load(json_file)
    validator = Draft7Validator(schema_obj)
    schema_props = fetch_props_new_schema(schema_obj, "", {})


for json_path in sorted(glob.glob(path.join(input_path, "BGC*.json"))):
    with open(json_path, "r") as json_file:
        bgc_id = path.basename(json_path).split(".")[0]
        data = json.load(json_file)
        print("Scanning {}".format(bgc_id))
        check_gene_annotations(data)
        validate_data(data, validator, schema_props)
        with open(path.join(output_folder, "{}.json".format(bgc_id)), "w") as o:
            o.write(json.dumps(data, indent=4, separators=(',', ': '), sort_keys=True))
            
print("All data fetched!")

Scanning BGC0000001
Scanning BGC0000002
Scanning BGC0000003
Scanning BGC0000004
Scanning BGC0000005
Scanning BGC0000006
Scanning BGC0000007
Scanning BGC0000008
Scanning BGC0000009
Scanning BGC0000010
Scanning BGC0000011
Scanning BGC0000012
Scanning BGC0000013
Scanning BGC0000014
Scanning BGC0000015
Scanning BGC0000016
Scanning BGC0000017
Scanning BGC0000018
Scanning BGC0000019
Scanning BGC0000020
Scanning BGC0000021
Scanning BGC0000022
Scanning BGC0000023
Scanning BGC0000024
Scanning BGC0000025
Scanning BGC0000026
Scanning BGC0000027
Scanning BGC0000028
Scanning BGC0000029
Scanning BGC0000030
Scanning BGC0000031
Scanning BGC0000032
Scanning BGC0000033
Scanning BGC0000034
Scanning BGC0000035
Scanning BGC0000036
Scanning BGC0000037
Scanning BGC0000038
Scanning BGC0000039
Scanning BGC0000040
Scanning BGC0000041
Scanning BGC0000042
Scanning BGC0000043
Scanning BGC0000044
Scanning BGC0000045
Scanning BGC0000046
Scanning BGC0000047
Scanning BGC0000048
Scanning BGC0000049
Scanning BGC0000050


Scanning BGC0000407
Scanning BGC0000408
Scanning BGC0000409
Scanning BGC0000410
Scanning BGC0000411
Scanning BGC0000412
Scanning BGC0000413
Scanning BGC0000414
Scanning BGC0000415
Scanning BGC0000416
Scanning BGC0000417
Scanning BGC0000418
Scanning BGC0000419
Scanning BGC0000420
Scanning BGC0000421
Scanning BGC0000422
Scanning BGC0000423
Scanning BGC0000424
Scanning BGC0000425
Scanning BGC0000426
Scanning BGC0000427
Scanning BGC0000428
Scanning BGC0000429
Scanning BGC0000430
Scanning BGC0000431
Scanning BGC0000432
Scanning BGC0000433
Scanning BGC0000434
Scanning BGC0000435
Scanning BGC0000436
Scanning BGC0000437
Scanning BGC0000438
Scanning BGC0000439
Scanning BGC0000440
Scanning BGC0000441
Scanning BGC0000442
Scanning BGC0000443
Scanning BGC0000444
Scanning BGC0000445
Scanning BGC0000446
Scanning BGC0000447
Scanning BGC0000448
Scanning BGC0000449
Scanning BGC0000450
Scanning BGC0000451
Scanning BGC0000452
Scanning BGC0000453
Scanning BGC0000454
Scanning BGC0000455
Scanning BGC0000456


Scanning BGC0000703
Scanning BGC0000704
Scanning BGC0000705
Scanning BGC0000706
Scanning BGC0000707
Scanning BGC0000708
Scanning BGC0000709
Scanning BGC0000710
Scanning BGC0000711
Scanning BGC0000712
Scanning BGC0000713
Scanning BGC0000714
Scanning BGC0000715
Scanning BGC0000716
Scanning BGC0000717
Scanning BGC0000718
Scanning BGC0000719
Scanning BGC0000720
Scanning BGC0000721
Scanning BGC0000722
Scanning BGC0000723
Scanning BGC0000724
Scanning BGC0000725
Scanning BGC0000726
Scanning BGC0000727
Scanning BGC0000728
Scanning BGC0000729
Scanning BGC0000730
Scanning BGC0000731
Scanning BGC0000732
Scanning BGC0000733
Scanning BGC0000734
Scanning BGC0000735
Scanning BGC0000736
Scanning BGC0000737
Scanning BGC0000738
Scanning BGC0000739
Scanning BGC0000740
Scanning BGC0000741
Scanning BGC0000742
Scanning BGC0000743
Scanning BGC0000744
Scanning BGC0000745
Scanning BGC0000746
Scanning BGC0000747
Scanning BGC0000748
Scanning BGC0000749
Scanning BGC0000750
Scanning BGC0000751
Scanning BGC0000752


Scanning BGC0001110
Scanning BGC0001111
Scanning BGC0001112
Scanning BGC0001113
Scanning BGC0001114
Scanning BGC0001115
Scanning BGC0001116
Scanning BGC0001117
Scanning BGC0001118
Scanning BGC0001119
Scanning BGC0001120
Scanning BGC0001121
Scanning BGC0001122
Scanning BGC0001123
Scanning BGC0001124
Scanning BGC0001125
Scanning BGC0001126
Scanning BGC0001127
Scanning BGC0001128
Scanning BGC0001129
Error...
Error...
Error...
Error...
Error...
BGC0001129 Failed to download: MIBIG.BGC0001129.1
'ncbi_tax_id' is a required property
'organism_name' is a required property
Scanning BGC0001130
Scanning BGC0001131
Scanning BGC0001132
Scanning BGC0001133
Scanning BGC0001134
Scanning BGC0001135
Scanning BGC0001136
Scanning BGC0001137
Scanning BGC0001138
Scanning BGC0001140
Scanning BGC0001141
Scanning BGC0001142
Scanning BGC0001143
Scanning BGC0001144
Scanning BGC0001145
[ripp] Can't find CDS feature for wp_004943468
Scanning BGC0001146
[ripp] Can't find CDS feature for aid54693
Scanning BGC0001147

Scanning BGC0001435
Scanning BGC0001436
Scanning BGC0001437
Scanning BGC0001438
Scanning BGC0001439
Scanning BGC0001440
Scanning BGC0001441
Scanning BGC0001442
Scanning BGC0001443
Scanning BGC0001444
Scanning BGC0001445
Scanning BGC0001446
Scanning BGC0001447
Scanning BGC0001448
Scanning BGC0001449
[]
'ncbi_tax_id' is a required property
'organism_name' is a required property
Scanning BGC0001450
Scanning BGC0001451
Scanning BGC0001452
Scanning BGC0001453
Scanning BGC0001454
Scanning BGC0001455
Scanning BGC0001456
Scanning BGC0001457
Scanning BGC0001458
Scanning BGC0001459
Scanning BGC0001460
Scanning BGC0001461
Scanning BGC0001462
Scanning BGC0001463
Scanning BGC0001464
Scanning BGC0001465
Scanning BGC0001466
Scanning BGC0001467
Scanning BGC0001468
Scanning BGC0001469
Scanning BGC0001470
Scanning BGC0001471
[ripp] Wrong sequence information for wp_091614358.1
Scanning BGC0001472
Scanning BGC0001473
[ripp] Wrong sequence information for pyc67624.1
Scanning BGC0001474
[ripp] solved precu

In [9]:
retired_bgcs = {}
todo_list = {}

with open("../../preprocessed/reports/p7-removed_extra_genes.tsv", "w") as o:
    o.write("bgc_id\tgene_ids\n")
    for bgc_id in removed_extra_genes:
        o.write("{}\t{}\n".format(bgc_id, ",".join(removed_extra_genes[bgc_id])))
        # uncommented: we don't need to retire BGCs because we trimmed out extra_genes
        #if bgc_id not in retired_bgcs:
        #    retired_bgcs[bgc_id] = set()
        #retired_bgcs[bgc_id].add("extra_genes_removed")
    
with open("../../preprocessed/reports/p7-removed_annotations.tsv", "w") as o:
    o.write("bgc_id\tgene_ids\n")
    for bgc_id in removed_annotations:
        o.write("{}\t{}\n".format(bgc_id, ";".join(removed_annotations[bgc_id])))
        if bgc_id not in retired_bgcs:
            retired_bgcs[bgc_id] = set()
        retired_bgcs[bgc_id].add("annotations_removed")
    
with open("../../preprocessed/reports/p7-removed_operons.tsv", "w") as o:
    o.write("bgc_id\tindexes\n")
    for bgc_id in removed_operons:
        o.write("{}\t{}\n".format(bgc_id, ",".join(removed_operons[bgc_id])))
        if bgc_id not in retired_bgcs:
            retired_bgcs[bgc_id] = set()
        retired_bgcs[bgc_id].add("operons_removed")

with open("../../preprocessed/reports/p7-no_gbk.tsv", "w") as o:
    o.write("bgc_id\taccession\n")
    for bgc_id in no_gbk:
        o.write("{}\t{}\n".format(bgc_id, no_gbk[bgc_id]))
        if bgc_id not in retired_bgcs:
            retired_bgcs[bgc_id] = set()
        retired_bgcs[bgc_id].add("no_gbk")
    
with open("../../preprocessed/reports/p7-no_taxonomy.tsv", "w") as o:
    o.write("bgc_id\torganism\n")
    for bgc_id in no_taxid:
        o.write("{}\t{}\n".format(bgc_id, no_taxid[bgc_id]))
        if bgc_id not in retired_bgcs:
            retired_bgcs[bgc_id] = set()
        retired_bgcs[bgc_id].add("no_taxonomy")
        
        
with open("../../preprocessed/reports/p7-retired_list.tsv", "w") as o:
    for bgc_id in retired_bgcs:
        o.write("{}\t{}\n".format(bgc_id, ";".join(retired_bgcs[bgc_id])))
    
with open("../../preprocessed/reports/p7-added_mibig_genes.tsv", "w") as o:
    o.write("bgc_id\tgenes\n")
    for bgc_id in added_mibig_genes:
        o.write("{}\t{}\n".format(bgc_id, ",".join(added_mibig_genes[bgc_id])))

with open("../../preprocessed/reports/p7-ripps_precursors_to_fix.tsv", "w") as o:
    for bgc_id in ripps_to_fix:
        o.write("{}\t{}\n".format(bgc_id, ",".join([str(idx) for idx in ripps_to_fix[bgc_id]])))
        if bgc_id not in todo_list:
            todo_list[bgc_id] = {}
        todo_list[bgc_id]["ripp_precursors"] = ripps_to_fix[bgc_id]


In [10]:
### update fixed list ###
with open("../../preprocessed/reports/p7-fixed_list.tsv", "w") as o:
    for bgc_id in fixed:
        o.write("{}\t{}\n".format(bgc_id, ";".join(fixed[bgc_id])))
        
with open("../../preprocessed/reports/p7-todo_list.tsv", "w") as o:
    for bgc_id in todo_list:
        for issue in todo_list[bgc_id]:
            o.write("{}\t{}\t{}\n".format(bgc_id, issue, ";".join([str(idx) for idx in todo_list[bgc_id][issue]])))

In [11]:
with open("../../preprocessed/p7-gene_counts_and_loci_length.tsv", "w") as o:
    o.write("bgc_id\tnt_length\tcds_count\n")
    for bgc_id in nt_length_and_cds_count:
        nt_length, gene_counts = nt_length_and_cds_count[bgc_id]
        o.write("{}\t{}\t{}\n".format(bgc_id, nt_length, gene_counts))