# This notebook builds a custom taxonomy from a phylogenetic tree, for use with Kraken or EMBOSS

In [1]:
# Load the phylogenetic tree from the RAxML result file
from ete3 import Tree
from os import chdir, path, makedirs, system

# Enter the relative data directory so the tree file can be addressed by a short path.
chdir("data")

# Protein name; it selects the tree file here and the database directory in the next cell.
prot = "Cas2"
treeFile = "trees/RAxML_result.{}".format(prot)
phyloTree = Tree(treeFile)

# Switch to the absolute data directory; the later cells use paths relative to it.
chdir("/mnt/research/germs/shane/antibioticResistance/data")

In [2]:
# Generate the custom taxonomy for Kraken based on the loaded phylogenetic tree.
# Two dump files are written under <specificDB>:
#   nodes.dmp : taxID, parent taxID, rank, "-"
#   names.dmp : taxID, node name, "-", "scientific name"
# Fields are joined with "\t|\t"; taxIDs are 1-based positions in a preorder walk.
specificDB = "dbs/"+prot.replace('C','c')+"DB/taxonomy"
if not path.exists(specificDB):
    makedirs(specificDB)
    print("Made "+specificDB)

# Maps each tree node to its taxID string; the FASTA-rewriting cell reads it too.
nodesTracker = {}

# The root is looked up on phyloTree, the tree loaded in the first cell.
root = phyloTree.get_tree_root()

# Context managers guarantee both files are closed even if the traversal raises.
with open(specificDB+"/names.dmp",'w') as names, open(specificDB+"/nodes.dmp",'w') as nodes:
    # Preorder visits a parent before its children, so node.up is always
    # already present in nodesTracker when a non-root node is reached.
    for taxID, node in enumerate(phyloTree.traverse("preorder")):
        tax_id = str(taxID+1)
        nodesTracker[node] = tax_id

        # Root is its own parent. Checking this first also covers a tree whose
        # root is a leaf, where node.up is None and has no tracker entry.
        if node == root:
            parent_id = '1'
        else:
            parent_id = nodesTracker[node.up]

        if node.is_leaf():
            rank = "subspecies"
        else:
            rank = "no rank"
            # Internal nodes are renamed to their taxID; leaves keep their own name.
            node.name = tax_id

        nodes.write("\t|\t".join([tax_id, parent_id, rank, "-"])+'\n')
        names.write("\t|\t".join([tax_id, node.name, "-", "scientific name"])+'\n')

# len() of an ete3 tree is its number of leaves.
print(len(phyloTree))

1968


In [11]:
# Rewrite the fasta file with the taxonomy IDs generated in Cell 2 so that Kraken can link taxonomy to the sequences.
# Each kept record gets "|kraken:taxid|<taxID>" appended to its ID; tree leaves
# with no matching sequence are counted in `missing` and skipped.
# `write` is imported alongside `index` because the loop below calls it.
from Bio.SeqIO import index as fasta_index, write

# NOTE(review): the file names and header label below are hard-coded for Cas12,
# while the first cell sets prot="Cas2" -- confirm which protein this run targets.
allSeqs = fasta_index("assemblies/Cas12Coding.fa","fasta")
missing,good=0,0
with open("assemblies/Cas12-CodingCleaned.fa",'w') as fh:
    for leaf in phyloTree.get_leaves():
        if leaf.name not in allSeqs:
            missing += 1
            continue
        rec = allSeqs[leaf.name]
        rec.id = rec.id + "|kraken:taxid|" + nodesTracker[leaf]+ " Cas12 ORF Region"
        rec.name = rec.id
        # Clear the description so the header carries only the rewritten ID.
        rec.description = ""
        write(rec,fh,"fasta")
        good += 1
# Display (leaves without a sequence, records written).
missing,good

(131, 1827)

In [None]:
%%bash
# This step should NOT be launched from a notebook because it can take hours to build and can suck a LOT of memory.
# (This warning is kept off the %%bash line: text after the magic name is handed to bash as arguments.)

# Add the sequences with the taxIDs generated by this notebook to a custom database
kraken-build --add-to-library assemblies/Cas12-CodingCleaned.fa --db dbs/cas12DB

# Build the Kraken database
kraken-build --threads 30 --jellyfish-hash-size 19200M --max-db-size 100 --build --db dbs/cas12DB