In [34]:
import pandas as pd
from Bio import SeqIO

In [36]:
# Path of the GenBank file that the SeqIO.parse cell reads.
# NOTE(review): `fin` is reassigned further down to the emapper annotations
# path, so re-running cells out of order can point this at the wrong file.
fin = "final_gbk/D45mix_em_lvl.genbank"

In [63]:
# Replace the /product of features that are still labelled
# "hypothetical protein" with the values of their `anno_*` qualifiers,
# then write all records back out in GenBank format.
#
# NOTE(review): `fout` is not assigned in any cell visible here; presumably it
# is set in a cell that is not shown. Confirm before a Restart & Run All.
rcs = list(SeqIO.parse(fin, "genbank"))

for rc in rcs:
    for feature in rc.features:
        product = feature.qualifiers.get('product')
        if not (product and "hypothetical protein" in product):
            continue

        # First value of every qualifier whose key contains 'anno_'.
        # Empty qualifier values are skipped instead of raising IndexError.
        anno = [
            values[0]
            for key, values in feature.qualifiers.items()
            if 'anno_' in key and values
        ]
        anno = [x for x in anno if x != 'hypothetical protein']
        if anno:
            # Store a one-element list, like every other qualifier value in
            # this cell, so a later `qualifiers['product'][0]` returns the
            # whole text rather than its first character. The written
            # GenBank output is the same as with a bare string.
            feature.qualifiers['product'] = [';'.join(anno)]

with open(fout, 'w') as fh:
    SeqIO.write(rcs, fh, "genbank")

In [1]:
import pandas as pd
import re

def pretty_theader(theader):
    """Extract readable function names from a multi-entry target header.

    The header is split on the ``\\x01`` separator. For each entry, everything
    from the first ``[`` onward is dropped and the leading token (letters,
    digits, ``_`` and ``.`` followed by one whitespace character) is removed.
    For the first entry, any prefix ending in ``": "`` is stripped as well.

    Parameters
    ----------
    theader : str
        Raw header text. Any non-string value (e.g. NaN for a missing field)
        yields an empty list.

    Returns
    -------
    list of str
        The primary annotation first, followed by the distinct alternative
        annotations that do not contain the primary one, in order of first
        appearance. Entries containing "hypothetical protein" are removed.
    """
    if not isinstance(theader, str):
        return []

    # Raw-string patterns: "\[", "\w" and "\s" are invalid escape sequences in
    # ordinary string literals and trigger warnings on recent Python versions.
    annotations = [re.sub(r"\[.*", "", x) for x in theader.split("\x01")]
    anno1 = re.sub(r"^[.\w]+\s", "", annotations[0], count=1)
    rst_anno = [re.sub(r"^.*: ", "", anno1).strip()]
    if len(annotations) > 1:
        anno_opt = [re.sub(r"^[.\w]+\s", "", x, count=1).strip() for x in annotations[1:]]
        anno_opt = [x for x in anno_opt if rst_anno[0] not in x]
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # result is reproducible between runs (set() ordering is not).
        rst_anno += list(dict.fromkeys(anno_opt))

    # choose functions that are not assigned as hypothetical protein
    return [x for x in rst_anno if "hypothetical protein" not in x]



def main(fin, fout):
    """Summarise a tab-separated hit table into a compact function table.

    Reads ``fin`` (no header row; columns are named from the list below),
    derives annotations from the ``theader`` column with ``pretty_theader``,
    and writes the selected columns to ``fout`` as a tab-separated file
    without the index.

    Parameters
    ----------
    fin : path of the input hit table.
    fout : path of the output table.
    """
    header = [
        'query', 'target', 'evalue', 'gapopen', 'pident', 'fident',
        'nident', 'qstart', 'qend', 'qlen', 'tstart', 'tend', 'tlen',
        'bits', 'qheader', 'theader', 'mismatch', 'qcov', 'tcov', 'taxid',
        'taxname', 'taxlineage',
    ]
    hits = pd.read_csv(fin, sep="\t", names=header)

    # One list of annotations per hit; the first element is the main function,
    # the remainder are alternatives joined with ';;'.
    parsed = hits['theader'].map(pretty_theader)
    hits = hits.assign(
        annotation=parsed,
        function=parsed.map(lambda names: names[0] if len(names) > 0 else ''),
        function_opt=parsed.map(
            lambda names: ';;'.join(names[1:]) if len(names) > 1 else ''
        ),
    )

    # Keep only the reporting columns and save.
    keep = ['query', 'target', 'pident', 'bits', 'evalue', 'taxid', 'function', 'function_opt']
    hits[keep].to_csv(fout, sep="\t", index=False)

In [27]:
# Load the emapper annotations table. Lines starting with '#' are skipped via
# `comment="#"`, and the column names are supplied explicitly.
# NOTE(review): pandas drops the remainder of a line at ANY '#', so a '#'
# inside a field would truncate that row. Confirm the data has none.
fin = "emapper/output_emapper.emapper.annotations"
colnames = [
    "query", "seed_ortholog", "evalue", "score", "eggNOG_OGs",
    "max_annot_lvl", "COG_category", "Description", "Preferred_name",
    "GOs", "EC", "KEGG_ko", "KEGG_Pathway", "KEGG_Module",
    "KEGG_Reaction", "KEGG_rclass", "BRITE", "KEGG_TC", "CAZy",
    "BiGG_Reaction", "PFAMs",
]
df = pd.read_csv(fin, names=colnames, sep='\t', comment="#")

# Cells holding exactly '-' become missing values (pd.NA).
df2 = df.replace(to_replace='-', value=pd.NA)

In [18]:
# Write df2 to "em.tsv" as a tab-separated file, without the index column.
# NOTE(review): this cell's execution count (18) is lower than that of the
# cell defining df2 (27), so it was run out of order. Re-check on a fresh kernel.
df2.to_csv("em.tsv", index=False, sep="\t")

In [32]:
# Preview rows 30-69 (by position) and the first 10 columns of df2.
df2.iloc[30:70, :10]

Unnamed: 0,query,seed_ortholog,evalue,score,eggNOG_OGs,max_annot_lvl,COG_category,Description,Preferred_name,GOs
30,D45mix_1_45,1429794.A0A0A0Q3E7_9CAUD,2.54e-88,270.0,"4QC8N@10239|Viruses,4QUR8@35237|dsDNA viruses ...",10662|Myoviridae,,,,
31,D45mix_1_46,1204539.J9QL03_9CAUD,1.38e-18,85.1,"4QGGS@10239|Viruses,4QZHP@35237|dsDNA viruses ...",10699|Siphoviridae,,,,
32,D45mix_1_49,1206558.I7KR00_9CAUD,2.31e-79,240.0,"4QB6J@10239|Viruses,4QVZD@35237|dsDNA viruses ...",10662|Myoviridae,S,dCTP diphosphatase activity,,
33,D45mix_1_50,700939.E5FID4_9CAUD,5.0600000000000005e-22,97.1,"4QFBV@10239|Viruses,4QUVM@35237|dsDNA viruses ...",10662|Myoviridae,,,,
34,D45mix_1_52,1206558.I7J3V8_9CAUD,3.24e-170,491.0,"4QBY0@10239|Viruses,4QW35@35237|dsDNA viruses ...",10662|Myoviridae,S,AAA domain,,
35,D45mix_1_54,1206558.I7KQZ5_9CAUD,3.2100000000000003e-103,305.0,"4QAJX@10239|Viruses,4QUZ8@35237|dsDNA viruses ...",10662|Myoviridae,S,Pfam:DUF5051,,
36,D45mix_1_56,45406.Q06EW8_BPR32,7.79e-16,71.6,"4QF5C@10239|Viruses,4QWBG@35237|dsDNA viruses ...",10662|Myoviridae,,,,
37,D45mix_1_57,1211280.J7I494_9CAUD,2.31e-11,67.4,"4QHGH@10239|Viruses,4QZS6@35237|dsDNA viruses ...",10662|Myoviridae,,,,
38,D45mix_1_62,760939.E5EPE9_9CAUD,1.0899999999999999e-282,790.0,"4QDCP@10239|Viruses,4QVJZ@35237|dsDNA viruses ...",10662|Myoviridae,S,DNA gyrase B,,"GO:0005575,GO:0032991"
39,D45mix_1_64,1087482.M1EB38_9CAUD,1.99e-102,335.0,"4QBG4@10239|Viruses,4QUZA@35237|dsDNA viruses ...",10662|Myoviridae,,,,
