# This notebook contains code that can be used to create a custom taxonomy from a phylogenetic tree that can be used for Kraken or emboss

In [1]:
# Load the phylogenetic tree into memory
from ete3 import Tree
from os import chdir, path, makedirs, system
from Bio.SeqIO import index as fasta_index,write

chdir("/mnt/research/germs/shane/hgt/data")
prot="Cas1_98_AVX2"
phyloTree = Tree("trees/RAxML_bestTree."+prot)
chdir("/mnt/research/germs/shane/antibioticResistance/data")

In [2]:
# Generate the custom taxonomy for Kraken based on the loaded phylogenetic tree
specificDB = "dbs/"+prot.replace('C','c')+"DB/taxonomy"
if not path.exists(specificDB): makedirs(specificDB);print("Made "+specificDB)
nodesTracker={}
names= open(specificDB+"/names.dmp",'w')
nodes= open(specificDB+"/nodes.dmp",'w')
root = phyloTree.get_tree_root()

for taxID,node in enumerate(phyloTree.traverse("preorder")):
    nodesTracker[node]=str(taxID+1)
    if not node.is_leaf():
        node.name = str(taxID+1)
        if node ==root:nodes.write("\t|\t".join([str(taxID+1),'1',"no rank","-"])+'\n') # Root is its own parent
        else: nodes.write("\t|\t".join([str(taxID+1),nodesTracker[node.up],"genus","-"])+'\n')
    else: nodes.write("\t|\t".join([str(taxID+1),nodesTracker[node.up],"species","-"])+'\n')
    names.write("\t|\t".join([str(taxID+1),"Cas1"+node.name,"-","scientific name",""])+'\n')
print(len(phyloTree))
names.close()
nodes.close()

Made dbs/cas1_98_AVX2DB/taxonomy
2184


In [3]:
# Rewrite the fasta file with the taxonomy IDs generated in Cell 2 so that Kraken can link taxonomy to the sequences
allSeqs = fasta_index("assemblies/Cas1_Class2_Coding.fa","fasta")
missing,good=0,0
with open("assemblies/Cas1-CodingSeqsCleaned.fa",'w') as fh:
    for leaf in phyloTree.get_leaves():
        if leaf.name not in allSeqs: missing+=1;continue
        rec=allSeqs[leaf.name]
        rec.id = rec.id + "|kraken:taxid|" + nodesTracker[leaf]+ " Cas1 ORF Region"
        rec.name = rec.id
        rec.description = ""
        write(rec,fh,"fasta")
        good+=1
missing,good

(0, 2184)

In [4]:
%%bash 
#Add the sequences that with the added taxIDs generated by this module to a custom database
kraken-build --add-to-library assemblies/Cas1-CodingCleaned.fa --db dbs/cas1_98_AVX2DB

Added "assemblies/Cas1-CodingCleaned.fa" to library (dbs/cas1_98_AVX2DB)


In [None]:
%%bash
#Build the Kraken database
kraken-build --threads 15 --jellyfish-hash-size 19200M --max-db-size 100 --build --db dbs/cas1_98_AVX2DB

In [None]:
%%bash
# Build the bracken database with kmer lengths=31 and mean read length of my data is 100bp (-l 100)
~/bin/Bracken-2.5/bracken-build -k 31 -l 100 -d dbs/cas1_98_AVX2DB -t 10