In [None]:
###### phase 7: re-check gene annotations against the GBK and filter out the ones that add no new information ######
#### it also populates the organism and taxonomy properties from the MIBiG final GBK ####

In [None]:
from os import path, makedirs
import glob
import json
from jsonschema import validate, Draft7Validator
from tempfile import TemporaryDirectory
from Bio import SeqIO, Entrez
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Alphabet import generic_protein
from Bio.SeqRecord import SeqRecord
import sys
import datetime
import time
import re
import urllib3
import certifi
http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
from jsonschema.validators import Draft7Validator

In [None]:
def fetch_gbk(nucl_acc, email, clean_cache = False):
    """Return an open text handle on the GenBank file for `nucl_acc`.

    Accessions shaped like ``MIBIG:BGC<7 digits>.<cluster number>`` are
    downloaded from the MIBiG repository; every other accession is fetched
    from the NCBI nucleotide database through Entrez. Downloads are cached in
    ``../../preprocessed/cache/cached_gbks/`` and re-used on later calls.

    Args:
        nucl_acc: accession to fetch.
        email: contact address assigned to ``Entrez.email`` for NCBI requests.
        clean_cache: when True, download again even if a cached file exists.

    Returns:
        A file object opened in text read mode; the caller must close it.

    Raises:
        IOError: the MIBiG repository answered with a non-200 status.
    """
    cache_folder = "../../preprocessed/cache/cached_gbks/"
    cache_path = path.join(cache_folder, "{}.gbk".format(nucl_acc.replace(":", ".")))

    makedirs(cache_folder, exist_ok=True)

    # files smaller than 100 bytes are treated as broken downloads
    if clean_cache or not path.exists(cache_path) or path.getsize(cache_path) < 100:
        if re.match(r"^MIBIG:BGC\d{7}\.\d+$", nucl_acc):
            # the pattern guarantees exactly one "." after the colon
            bgc_id, cl_id = nucl_acc.split(":")[1].split(".")
            mibig_url = "https://mibig.secondarymetabolites.org/repository/{}/{}.1.cluster{:03d}.gbk".format(bgc_id, bgc_id, int(cl_id))
            resp = http.request('GET', mibig_url)
            if resp.status != 200:
                # never cache an HTTP error page as if it were a GenBank file
                raise IOError("Failed to download {} (HTTP status {})".format(mibig_url, resp.status))
            gbk_text = resp.data.decode('utf-8', 'ignore')
        else: # (re)download from ncbi
            Entrez.email = email
            handle = Entrez.efetch(db="nucleotide", id=nucl_acc, rettype="gbwithparts", retmode="text")
            try:
                gbk_text = handle.read()
            finally:
                handle.close()
        # only touch the cache once the full payload is in memory
        with open(cache_path, "w") as gbk_file:
            gbk_file.write(gbk_text)

    return open(cache_path, "r")

# from antismash
def get_aa_translation(seq_record, feature):
    """Translate the nucleotides of `feature` (taken from `seq_record`) into a protein Seq.

    Gaps are stripped, incomplete trailing codons are dropped, and the symbols
    ``*``, ``B``, ``J``, ``O``, ``U`` and ``Z`` are replaced by ``X`` in the result.
    """
    nucleotides = feature.extract(seq_record.seq).ungap('-')

    # drop trailing bases that do not make up a complete codon
    overhang = len(nucleotides) % 3
    if overhang:
        nucleotides = nucleotides[:len(nucleotides) - overhang]

    protein = nucleotides.translate(to_stop=True)
    if not len(protein):
        # stopping at the first stop codon left nothing: translate straight through
        print("Retranslating {} with stop codons".format(feature.id))
        protein = nucleotides.translate()

    # map ambiguous / stop symbols to an explicit unknown and skip gap characters
    cleaned = "".join(
        "X" if residue in "*BJOUZ" else residue
        for residue in str(protein)
        if residue != "-"
    )
    return Seq(cleaned, generic_protein)

def get_aa_sequence(feature, to_stop=False):
    """Return the first 'translation' qualifier of `feature` as a cleaned string.

    With `to_stop` the sequence is cut at the first ``*``; otherwise every ``*``
    becomes ``X``. Gap characters (``-``) are removed in both cases.
    """
    protein = feature.qualifiers['translation'][0]
    if to_stop:
        protein = protein.partition("*")[0]
    else:
        protein = protein.replace("*", "X")
    return protein.replace("-", "")


def count_props(input_dict, cur_path, result):
    """Tally how often each property path occurs in a (nested) JSON object.

    Dict keys extend the path with ``/key`` and lists extend it with ``[]``.
    Every non-dict node (scalars and the lists themselves) adds one to the
    count stored under its path. `result` is updated in place and returned.
    """
    if isinstance(input_dict, dict):
        for name, child in input_dict.items():
            result = count_props(child, "{}/{}".format(cur_path, name), result)
        return result

    node_path = cur_path
    if isinstance(input_dict, list):
        node_path = "{}[]".format(cur_path)
        for element in input_dict:
            result = count_props(element, node_path, result)

    # the node itself is counted after its children have been visited
    result[node_path] = result.get(node_path, 0) + 1
    return result


def fetch_props_new_schema(input_dict, cur_path, result):
    """Collect the property paths described by a JSON schema node.

    Objects are descended through their "properties" (``/name``) and arrays
    through their "items" (``[]``). Every visited node that has no
    "properties" entry is registered in `result` with the value False.
    `result` is updated in place and returned.
    """
    schema_type = input_dict.get("type")
    node_path = cur_path

    if schema_type == "object":
        for name, sub_schema in input_dict["properties"].items():
            result = fetch_props_new_schema(sub_schema, "{}/{}".format(cur_path, name), result)
    elif schema_type == "array":
        node_path = "{}[]".format(cur_path)
        result = fetch_props_new_schema(input_dict["items"], node_path, result)

    if "properties" not in input_dict and node_path not in result:
        result[node_path] = False # can't really use this
    return result


In [None]:
def fetch_mibig_final_gbk(bgc_id, clean_cache = False):
    """Return an open text handle on the MIBiG ``<bgc_id>.1.final.gbk`` file.

    The file is downloaded from the MIBiG repository on first use and cached
    in ``../../preprocessed/cache/cached_mibig_finalgbks/``.

    Args:
        bgc_id: MIBiG accession used to build the repository URL.
        clean_cache: when True, download again even if a cached file exists.

    Returns:
        A file object opened in text read mode; the caller must close it.

    Raises:
        IOError: the MIBiG repository answered with a non-200 status.
    """
    cache_folder = "../../preprocessed/cache/cached_mibig_finalgbks/"
    cache_path = path.join(cache_folder, "{}.1.final.gbk".format(bgc_id))

    # make sure the cache folder exists (fetch_gbk does the same for its own cache)
    makedirs(cache_folder, exist_ok=True)

    # files smaller than 100 bytes are treated as broken downloads
    if clean_cache or not path.exists(cache_path) or path.getsize(cache_path) < 100:
        mibig_url = "https://mibig.secondarymetabolites.org/repository/{}/{}.1.final.gbk".format(bgc_id, bgc_id)
        resp = http.request('GET', mibig_url)
        if resp.status != 200:
            # never cache an HTTP error page as if it were a GenBank file
            raise IOError("Failed to download {} (HTTP status {})".format(mibig_url, resp.status))
        with open(cache_path, "w") as cf:
            cf.write(resp.data.decode('utf-8', 'ignore'))

    return open(cache_path, "r")

In [None]:
# per-BGC bookkeeping filled by check_gene_annotations() and written out as reports later
removed_extra_genes = {}  # bgc_id -> indexes (as str) of dropped "extra_genes" entries
removed_annotations = {}  # bgc_id -> indexes (as str) of dropped "annotations" entries
removed_operons = {}      # bgc_id -> indexes (as str) of dropped "operons" entries
added_mibig_genes = {}    # bgc_id -> "/"-joined ids of CDS taken over from the MIBiG final GBK
no_gbk = {}               # bgc_id -> accession whose GenBank record could not be loaded

def check_gene_annotations(data):
    """Re-check the gene annotations of one MIBiG entry against its GenBank record.

    `data` is modified in place:
      * ``loci["organism"]`` / ``loci["taxonomy"]`` are filled from the MIBiG final GBK;
      * CDS that exist only in the MIBiG final GBK are added as extra genes
        (and as annotations when they carry a product or a note);
      * extra genes whose id is already known, annotations that add nothing
        beyond an id, and operons with unknown genes or without accepted
        evidence are removed.
    Everything added or removed is recorded in the module-level dicts above.

    Args:
        data: parsed MIBiG JSON entry (needs ``cluster.loci.accession`` and
            ``cluster.mibig_accession``).

    Returns:
        ``{}`` when the GenBank record could not be loaded after 5 attempts
        (the entry is then recorded in ``no_gbk`` and left untouched),
        otherwise None.
    """
    loci = data["cluster"]["loci"]
    annots = data["cluster"].get("genes", {})
    gbk_acc = loci["accession"].upper()
    bgc_id = data["cluster"]["mibig_accession"]
    email = "mibig@secondarymetabolites.org"
    gbk_record = None
    num_try = 1
    clean_cache = False
    while num_try < 6:
        try:
            with fetch_gbk(gbk_acc, email, clean_cache) as gbk_handle:
                seq_record = SeqIO.read(gbk_handle, "genbank") # the gbk should contains only 1 file
                if len(seq_record.seq) < 1:
                    raise Exception("Empty sequence record {}".format(gbk_acc))
                gbk_record = seq_record
                break
        except Exception as err:
            # not a bare except: KeyboardInterrupt / SystemExit must still stop the run
            print("Error... {}".format(err))
            clean_cache = True # force a fresh download on the next attempt
        num_try += 1
        time.sleep(5)
    if not isinstance(gbk_record, SeqRecord): # failed to download NCBI data
        print("{} Failed to download: {}".format(data["cluster"]["mibig_accession"], gbk_acc))
        no_gbk[bgc_id] = gbk_acc
        return {}

    # fetch all CDS inside the cluster
    cluster_cds = []
    cluster_cds_ids = set(["", "No protein ID"]) # quick lookup for existing IDs
    for feature in gbk_record.features:
        if feature.type == "CDS":
            if ("start_coord" not in loci) or (feature.location.start >= loci["start_coord"]-1 and feature.location.end <= loci["end_coord"]):
                cluster_cds.append(feature)
                if "gene" in feature.qualifiers:
                    cluster_cds_ids.add(feature.qualifiers["gene"][0])
                if "protein_id" in feature.qualifiers:
                    cluster_cds_ids.add(feature.qualifiers["protein_id"][0])
                if "locus_tag" in feature.qualifiers:
                    cluster_cds_ids.add(feature.qualifiers["locus_tag"][0])

    # fetch all CDS in MIBiG finalgbk
    mibig_extra_genes = []
    with fetch_mibig_final_gbk(bgc_id) as mibig_gbk_handle:
        mibig_seq_records = SeqIO.parse(mibig_gbk_handle, "genbank")
        for mibig_seq_record in mibig_seq_records:
            # add organism information
            loci["organism"] = mibig_seq_record.annotations["organism"]
            loci["taxonomy"] = mibig_seq_record.annotations["taxonomy"]
            # look for gene information; a record without COMMENT simply cannot match
            comment = mibig_seq_record.annotations.get('comment', '')
            match = re.search(r"(between(?P<start>\d+)-(?P<end>\d+)ntfrom){0,1}GenBankID(?P<acc>[A-Z0-9_\.]+)\.", comment.replace(" ", "").replace("\n", ""))
            if match:
                gbk_acc_mibig = match.group("acc").upper()
                if gbk_acc_mibig == gbk_acc:
                    # coordinates in the final GBK are relative to the excised region
                    offset = 0
                    if match.group("start"):
                        offset = int(match.group("start")) - 1
                    for feature in mibig_seq_record.features:
                        if feature.type == "CDS":
                            # skip the CDS as soon as one of its ids is already known
                            ids = []
                            if "gene" in feature.qualifiers:
                                if feature.qualifiers["gene"][0] in cluster_cds_ids:
                                    continue
                                ids.append(feature.qualifiers["gene"][0])
                            if "protein_id" in feature.qualifiers:
                                if feature.qualifiers["protein_id"][0] in cluster_cds_ids:
                                    continue
                                ids.append(feature.qualifiers["protein_id"][0])
                            if "locus_tag" in feature.qualifiers:
                                if feature.qualifiers["locus_tag"][0] in cluster_cds_ids:
                                    continue
                                ids.append(feature.qualifiers["locus_tag"][0])
                            cluster_cds_ids.update(set(ids))
                            # NOTE(review): _shift is a private Biopython method - confirm it exists in the pinned version
                            mibig_extra_genes.append(feature._shift(offset))
                            if bgc_id not in added_mibig_genes:
                                added_mibig_genes[bgc_id] = []
                            added_mibig_genes[bgc_id].append("/".join(ids))

    # add mibig extra genes
    for feature in mibig_extra_genes:
        if "extra_genes" not in annots:
            annots["extra_genes"] = []
        if "annotations" not in annots:
            annots["annotations"] = []
        # preferred id: locus_tag, then protein_id, then gene
        gene_id = feature.qualifiers.get("locus_tag", feature.qualifiers.get("protein_id", feature.qualifiers.get("gene", [None])))[0]
        if gene_id is not None:
            extra_gene = {
                "id": gene_id,
                "location": {
                    # start + 1: the JSON stores 1-based, inclusive start coordinates
                    "exons": [{"start": location.start + 1, "end": location.end} for location in feature.location.parts],
                    "strand": feature.location.strand
                }
            }
            if "translation" in feature.qualifiers:
                extra_gene["translation"] = feature.qualifiers["translation"][0]
            annots["extra_genes"].append(extra_gene)
            annot = {
                "id": gene_id
            }
            if "product" in feature.qualifiers:
                annot["product"] = feature.qualifiers["product"][0]
            if "note" in feature.qualifiers:
                annot["comments"] = feature.qualifiers["note"][0]
            if len(annot.keys()) > 1:
                annots["annotations"].append(annot)

    # check extra genes annotation
    # ids that were just taken over from the MIBiG final GBK must survive the filter below
    mibig_added_ids = set("/".join(added_mibig_genes.get(bgc_id, "")).split("/"))
    extra_genes = []
    for i, extra_gene in enumerate(annots.get("extra_genes", [])):
        gene_id = extra_gene["id"]
        if gene_id not in cluster_cds_ids or gene_id in mibig_added_ids:
            extra_genes.append(extra_gene)
            cluster_cds_ids.add(gene_id)
        else:
            if bgc_id not in removed_extra_genes:
                removed_extra_genes[bgc_id] = []
            removed_extra_genes[bgc_id].append(str(i))
    if len(extra_genes) > 0:
        annots["extra_genes"] = extra_genes
    else:
        annots.pop("extra_genes", None)

    # check annotation, remove if no added info
    gene_annots = []
    for i, annot in enumerate(annots.get("annotations", [])):
        gene_id = annot.get("id", annot.get("name", None))
        approve = False
        if gene_id in cluster_cds_ids:
            # drop empty fields first, then see whether anything besides the id is left
            if len(annot.get("name", "")) < 1:
                annot.pop("name", None)
            if len(annot.get("product", "")) < 1:
                annot.pop("product", None)
            if len(annot.get("mut_pheno", "")) < 1:
                annot.pop("mut_pheno", None)
            if len(annot.get("comments", "")) < 1:
                annot.pop("comments", None)
            g_functions = []
            for g_function in annot.get("functions", []):
                if len(g_function.get("category", "")) < 1:
                    continue
                elif len(g_function.get("evidence", [])) < 1:
                    continue
                elif len(set(g_function["evidence"]) - set(["Sequence-based prediction", "Other in vivo study", "Heterologous expression", "Knock-out", "Activity assay"])) > 0:
                    continue
                else:
                    g_functions.append(g_function)
            if len(g_functions) > 0:
                annot["functions"] = g_functions
            else:
                annot.pop("functions", None)
            if len(annot.get("tailoring", [])) < 1:
                annot.pop("tailoring", None)
            if len(annot.get("publications", [])) < 1:
                annot.pop("publications", None)
            if len(annot.keys()) > 1: # 1 is the id
                approve = True
        if approve:
            gene_annots.append(annot)
        else:
            if bgc_id not in removed_annotations:
                removed_annotations[bgc_id] = []
            removed_annotations[bgc_id].append(str(i))
    if len(gene_annots) > 0:
        annots["annotations"] = gene_annots
    else:
        annots.pop("annotations", None)

    # check operons, remove if no added info
    operons = []
    for i, operon in enumerate(annots.get("operons", [])):
        o_genes = set(operon.get("genes", []))
        o_evidence = set(operon.get("evidence", [])).intersection(set(["Sequence-based prediction", "RACE", "ChIPseq", "RNAseq"]))
        # keep an operon only if all its genes are known and some accepted evidence remains
        if len(o_genes) > 0 and len(o_genes - cluster_cds_ids) < 1:
            if len(o_evidence) > 0:
                operons.append({
                    "genes": list(o_genes),
                    "evidence": list(o_evidence)
                })
                continue
        if bgc_id not in removed_operons:
            removed_operons[bgc_id] = []
        removed_operons[bgc_id].append(str(i))

    if len(operons) > 0:
        annots["operons"] = operons
    else:
        annots.pop("operons", None)

    if len(annots.keys()) > 0:
        data["cluster"]["genes"] = annots

In [None]:
def validate_data(data, validator, schema_props):
    """Print the schema violations of `data` and stop on properties unknown to the schema.

    A fixed set of known violations is not reported: evidence errors below
    "loci", errors on "nr_iterations", "module_number", "leader_sequence" and
    "follower_sequence", errors inside "proteinogenic", and organism errors of
    entries listed in the module-level ``no_gbk`` dict.

    Args:
        data: parsed MIBiG JSON entry.
        validator: object exposing ``iter_errors(data)`` (a jsonschema validator).
        schema_props: property paths known to the schema, as produced by
            ``fetch_props_new_schema``.

    Raises:
        SystemExit: `data` contains a property path that is missing from
            `schema_props` (the offending path is printed first).
    """
    for error in sorted(validator.iter_errors(data), key=str):
        # left-pad so that looking at the last three path elements never raises
        # IndexError for errors reported at (or close to) the document root
        padded_path = [None, None, None] + list(error.path)
        third_last, second_last, last = padded_path[-3:]
        # a missing required property is reported on its parent object, so an
        # absent "organism" arrives as a "required" error whose path ends at "loci"
        organism_missing = (
            last == "loci"
            and getattr(error, "validator", None) == "required"
            and error.message.startswith("'organism' ")
        )
        if second_last == "evidence" or last == "evidence":
            if third_last == "loci" or second_last == "loci":
                continue
        elif last == "nr_iterations":
            continue
        elif last == "module_number":
            continue
        elif second_last == "proteinogenic":
            continue
        elif last in ["leader_sequence", "follower_sequence"]:
            continue
        elif last == "organism" or organism_missing:
            # organism can only be filled in when the GenBank record was available
            if data["cluster"]["mibig_accession"] in no_gbk:
                continue
        print(error.message)
        #sys.exit(0)
    this_file_props = count_props(data, "", {})
    for prop in this_file_props:
        if prop not in schema_props.keys():
            print(prop)
            sys.exit(0)

In [None]:
# Build the phase-7 schema: the phase-6 schema plus the "organism" (required)
# and "taxonomy" properties on cluster.loci.
with open("../../inputs/mibig_schema_phase_6.json") as schema_6:
    schema_obj = json.load(schema_6)

loci_schema = schema_obj["properties"]["cluster"]["properties"]["loci"]
loci_schema["properties"]["organism"] = {
    "title": "Organism's name",
    "type": "string"
}
loci_schema["properties"]["taxonomy"] = {
    "title": "Organism's Taxonomy",
    "type": "array",
    "items": {
        "type": "string"
    }
}
loci_schema["required"].append("organism")

with open("../../preprocessed/p7-mibig_schema_draft7.json", "w") as schema_7:
    json.dump(schema_obj, schema_7, indent=4, separators=(',', ': '), sort_keys=True)

In [None]:
# Run phase 7 over every phase-6 JSON file and store the results as phase-7 JSON.
input_path = "../../preprocessed/p6-json/"
output_folder = "../../preprocessed/p7-json/"

if not path.exists(output_folder):
    makedirs(output_folder)

# validator and known property paths both come from the phase-7 schema
with open("../../preprocessed/p7-mibig_schema_draft7.json") as json_file:
    schema_obj = json.load(json_file)
validator = Draft7Validator(schema_obj)
schema_props = fetch_props_new_schema(schema_obj, "", {})

for json_path in sorted(glob.glob(path.join(input_path, "BGC*.json"))):
    bgc_id = path.basename(json_path).partition(".")[0]
    with open(json_path, "r") as json_file:
        data = json.load(json_file)
    print("Scanning {}".format(bgc_id))
    check_gene_annotations(data)  # updates `data` in place
    validate_data(data, validator, schema_props)
    output_path = path.join(output_folder, "{}.json".format(bgc_id))
    with open(output_path, "w") as o:
        o.write(json.dumps(data, indent=4, separators=(',', ': '), sort_keys=True))

print("All data fetched!")

In [None]:
# Write the phase-7 reports (one TSV per kind of change) and the list of BGCs
# that were touched, together with the reasons.
report_folder = "../../preprocessed/reports/"
makedirs(report_folder, exist_ok=True)  # the folder may not exist on a fresh checkout

retired_bgcs = {}  # bgc_id -> set of reasons


def write_report(file_name, header, entries, retire_reason=None):
    """Write `entries` as a two-column TSV report into the reports folder.

    Args:
        file_name: name of the report file inside ``report_folder``.
        header: header line, without the trailing newline.
        entries: dict mapping bgc_id to a string, or to a list of strings
            that is joined with ",".
        retire_reason: when given, it is added to ``retired_bgcs`` for every
            bgc_id in `entries`.
    """
    with open(path.join(report_folder, file_name), "w") as o:
        o.write("{}\n".format(header))
        for bgc_id, value in entries.items():
            if not isinstance(value, str):
                value = ",".join(value)
            o.write("{}\t{}\n".format(bgc_id, value))
            if retire_reason is not None:
                retired_bgcs.setdefault(bgc_id, set()).add(retire_reason)


write_report("p7-removed_extra_genes.tsv", "bgc_id\tgene_ids", removed_extra_genes, "extra_genes_removed")
write_report("p7-removed_annotations.tsv", "bgc_id\tgene_ids", removed_annotations, "annotations_removed")
write_report("p7-removed_operons.tsv", "bgc_id\tindexes", removed_operons, "operons_removed")
write_report("p7-no_gbk.tsv", "bgc_id\taccession", no_gbk, "no_gbk")

with open(path.join(report_folder, "p7-retired_list.tsv"), "w") as o:
    for bgc_id in retired_bgcs:
        # sorted: joining a set directly yields a different order from run to run
        o.write("{}\t{}\n".format(bgc_id, ";".join(sorted(retired_bgcs[bgc_id]))))

write_report("p7-added_mibig_genes.tsv", "bgc_id\tgenes", added_mibig_genes)
