In [1]:
import os
import re
import shutil
import subprocess
import sys

import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from Bio import Phylo
from Bio import SeqIO
mpl.rcParams['pdf.fonttype'] = 42
%matplotlib inline



In [2]:
def checkSlash(directory):
    """Return `directory` terminated by a single trailing '/'.

    Paths in this notebook are assembled by plain string concatenation, so
    every directory component has to end with a slash.

    directory -- path string; returned unchanged if it already ends in '/'.
                 An empty string is also returned unchanged: appending '/'
                 to it would silently turn "no directory" into the
                 filesystem root.
    """
    if directory and not directory.endswith('/'):
        directory = directory + '/'
    return directory

In [5]:
# Project root and the folder holding the per-subgroup parsnp SNP output.
# Both are normalised to end in '/' because later cells build paths by
# string concatenation.
coreDir = '/home/ubuntu/proc/sjspence/170105_PSE/'
snpDir = '11_parsnp_subgroups_snps'
coreDir = checkSlash(coreDir)
snpDir = checkSlash(snpDir)

#Identify subgroup names
# NOTE(review): subDir is resolved relative to the current working directory,
# not coreDir -- confirm the notebook is always launched from the project root.
subDir = '10_subgroups/'
# os.walk() recurses into every level; only the immediate children of subDir
# are subgroups, so take the directory list of the first (top-level) entry.
# A missing subDir yields no entries, which leaves the list empty.
subgroups = sorted(next(os.walk(subDir), (subDir, [], []))[1])

**ANCESTRAL RECONSTRUCTION WITH FASTML**

In [47]:
#Removal of branch supports
#A support value is a purely numeric label sitting directly after a closing
#parenthesis and followed by ':', ',', ')', ';' or end of line.
#Internal node IDs must therefore not be purely numeric.
_BRANCH_SUPPORT_RE = re.compile(
    r'\)(?:[0-9]+(?:\.[0-9]*)?|\.[0-9]+)(?:[eE][-+]?[0-9]+)?(?=\s*(?:[:,);]|$))')


def removeBranchSupports(tree):
    """Strip numeric branch-support labels from a Newick string.

    tree -- Newick text (one line of a tree file).

    Returns the same text with every numeric label that directly follows a
    ')' removed, e.g. '(A:1,B:2)0.900:0.5' becomes '(A:1,B:2):0.5'.
    Leaf names, branch lengths, non-numeric internal labels and internal
    nodes that carry no support value are left exactly as they were.
    """
    return _BRANCH_SUPPORT_RE.sub(')', tree)

#First, edit tree files to remove non-standard quote characters
def editTree(treeFile, treeEdit):
    """Write a cleaned copy of a Newick tree file.

    treeFile -- path of the tree file to read.
    treeEdit -- path of the cleaned tree file to write (overwritten).

    Each line has its single quotes removed, the '_contigs.fa.ref' and
    '_contigs.fa' suffixes shortened to '_ref' and '' respectively, and is
    then passed through removeBranchSupports().
    """
    # 'with' closes both handles even if a line fails to process.
    with open(treeFile, 'r') as treeIn, open(treeEdit, 'w') as treeOut:
        for line in treeIn:
            line = line.replace('\'', '')
            # The longer '.ref' suffix must be rewritten first; otherwise the
            # shorter pattern would consume it and leave a bare '.ref'.
            line = line.replace('_contigs.fa.ref', '_ref')
            line = line.replace('_contigs.fa', '')
            line = removeBranchSupports(line)
            treeOut.write(line)

#Edit variant file to match tree file leaf IDs
def editVariant(variantFile, variantEdit):
    """Write a copy of a multi-FASTA file with shortened record names.

    variantFile -- path of the FASTA file to read.
    variantEdit -- path of the edited FASTA file to write (overwritten).

    On every line containing '>', '_contigs.fa.ref' is replaced by '_ref'
    and '_contigs.fa' is removed; all other lines are copied unchanged.
    """
    # 'with' closes both handles even if reading or writing fails part-way.
    with open(variantFile, 'r') as variantIn, open(variantEdit, 'w') as variantOut:
        for line in variantIn:
            if '>' in line:
                # Rewrite the longer '.ref' suffix first so the shorter
                # pattern cannot consume it and leave a bare '.ref'.
                line = line.replace('_contigs.fa.ref', '_ref')
                line = line.replace('_contigs.fa', '')
            variantOut.write(line)

In [49]:
#Edit tree and variant files for ancestral reconstruction
fastmlPath = '/home/ubuntu/tools/FastML.v3.1/www/fastml/FastML_Wrapper.pl'
for s in subgroups:
    s = checkSlash(s)
    treeFile = coreDir + snpDir + s + 'parsnp.tree'
    treeEdit = treeFile.replace('.tree', '_edit.tree')
    editTree(treeFile, treeEdit)
    variantFile = coreDir + snpDir + s + 'variants.mfa'
    variantEdit = variantFile.replace('.mfa', '_edit.mfa')
    editVariant(variantFile, variantEdit)
    outDir = coreDir + snpDir + s + 'fastml/'
    # Start from an empty output directory on every run.
    if os.path.exists(outDir):
        shutil.rmtree(outDir)
    os.makedirs(outDir)
    # Arguments are passed as a list (no shell), so paths containing spaces
    # or shell metacharacters reach the wrapper unaltered.
    exitCode = subprocess.call(['perl', fastmlPath,
                                '--MSA_File', variantEdit,
                                '--seqType', 'nuc',
                                '--outDir', outDir,
                                '--Tree', treeEdit,
                                '--jointReconstruction', 'no',
                                '--indelReconstruction', 'ML'])
    # Report failures instead of ignoring them, but keep going so that one
    # bad subgroup does not stop the remaining ones.
    if exitCode != 0:
        print('FastML exited with status ' + str(exitCode) + ' for ' + s)

**NOTES**  
FastML stalled on subK and could not finish computing.  
subM also stalled,  
as did subN.  

All three are small groups with suspicious tree structures and long branch lengths (i.e., many SNPs).

In [6]:
# File names noted for the FastML output directory (only tree.newick.txt is
# read below):
#tree.ancestor.txt
#tree.newick.txt
#seq.marginal.txt
for s in subgroups:
    # Subgroup names carry no trailing slash; without checkSlash() the path
    # would collapse to '<subgroup>fastml/' and never exist.
    fastmlDir = coreDir + snpDir + checkSlash(s) + 'fastml/'
    # Skip subgroups for which FastML produced no tree file.
    if not os.path.exists(fastmlDir + 'tree.newick.txt'):
        continue
    tree = Phylo.read(fastmlDir + 'tree.newick.txt', 'newick')
    Phylo.draw_ascii(tree)

    #START HERE
########################################################

In [6]:
# Load the parsnp tree of subgroup subA as a test case. The path is anchored
# on coreDir, like the other cells, so the cell does not depend on the
# kernel's working directory.
testTree = Phylo.read(coreDir + snpDir + 'subA/parsnp.tree', 'newick')
Phylo.draw_ascii(testTree)

        _____________________ D17-102050_contigs.fa.ref
      _|
     | |_________________________ D17-102044_contigs.fa
     |
  ___|        ____________________________________ D17-102045_contigs.fa
 |   |    ___|
 |   |   |   |___________________________ D17-102046_contigs.fa
 |   |___|
 |       |   ________________________ D17-102047_contigs.fa
 |       |__|
 |          |   ___________________________ D17-102040_contigs.fa
 |          |__|
 |             |______________________________ D17-102043_contigs.fa
 |
 |     _______________________ D17-102037_contigs.fa
_| ___|
 ||   |________________________________________ D17-102048_contigs.fa
 ||
 ||    _____________________________ D17-102042_contigs.fa
 ||___|
 |    |________________________ D17-102065_contigs.fa
 |
 |       ______________________________ D17-102036_contigs.fa
 |  ____|
 | |    |______________________ D17-102051_contigs.fa
 | |
 |_|        ____________________ D17-102039_contigs.fa
   | ______|
   ||      |__________

In [65]:
def getSNPs(contigFile, variantFile=None):
    """Return the sequence of one record from a multi-FASTA variant file.

    contigFile  -- record ID to look for (compared against each record's id).
    variantFile -- optional path of the FASTA file to search. When omitted,
                   the path is built from the notebook globals as
                   snpDir + subDir + 'variants.mfa'.

    Returns the matching record's sequence. If no record matches, prints an
    error message and returns None.
    """
    if variantFile is None:
        # NOTE(review): this default depends on whatever `subDir` currently
        # holds in the kernel -- confirm it names the intended subgroup
        # folder, or pass variantFile explicitly.
        variantFile = snpDir + subDir + 'variants.mfa'
    for r in SeqIO.parse(variantFile, 'fasta'):
        if r.id == contigFile:
            return r.seq
    print('Error, wrong file referenced')
    return None

def compareTerminals(terminals):
    """Fetch the variant sequence of every leaf, one group at a time.

    terminals -- iterable of lists of leaf names, one inner list per clade.

    For each inner list the sequences returned by getSNPs() are gathered
    into a list; nothing is done with that list yet and nothing is returned.
    """
    for leafNames in terminals:
        commonClade = [getSNPs(leafName) for leafName in leafNames]
        

In [66]:
def compareSNPs(clade):
    """Recursively visit a clade and compare the leaf sets of its children.

    clade -- tree node providing is_terminal(), .clades and get_terminals().

    Terminal clades are ignored. For a clade with two or more children, the
    leaf names under each child are collected, printed, handed to
    compareTerminals(), and each child is then visited in turn. A
    non-terminal clade with fewer than two children prints an error.
    """
    if clade.is_terminal():
        return
    children = clade.clades
    if len(children) < 2:
        print('Error, check tree configuration')
        return
    # One list of leaf names per direct child.
    terminals = [[leaf.name for leaf in child.get_terminals()]
                 for child in children]
    print(terminals)
    compareTerminals(terminals)
    for child in children:
        compareSNPs(child)

In [67]:
compareSNPs(testTree.root)

[['D17-102050_contigs.fa.ref', 'D17-102044_contigs.fa', 'D17-102045_contigs.fa', 'D17-102046_contigs.fa', 'D17-102047_contigs.fa', 'D17-102040_contigs.fa', 'D17-102043_contigs.fa'], ['D17-102037_contigs.fa', 'D17-102048_contigs.fa', 'D17-102042_contigs.fa', 'D17-102065_contigs.fa'], ['D17-102036_contigs.fa', 'D17-102051_contigs.fa', 'D17-102039_contigs.fa', 'D17-102038_contigs.fa', 'D17-102041_contigs.fa', 'D17-102049_contigs.fa']]
748
748
748
748
748
748
748
748
748
748
748
748
748
748
748
748
748
[['D17-102050_contigs.fa.ref', 'D17-102044_contigs.fa'], ['D17-102045_contigs.fa', 'D17-102046_contigs.fa', 'D17-102047_contigs.fa', 'D17-102040_contigs.fa', 'D17-102043_contigs.fa']]
748
748
748
748
748
748
748
[['D17-102050_contigs.fa.ref'], ['D17-102044_contigs.fa']]
748
748
[['D17-102045_contigs.fa', 'D17-102046_contigs.fa'], ['D17-102047_contigs.fa', 'D17-102040_contigs.fa', 'D17-102043_contigs.fa']]
748
748
748
748
748
[['D17-102045_contigs.fa'], ['D17-102046_contigs.fa']]
748
748
[['D

In [33]:
# Exploration aid: display the names of all callable attributes of the root
# clade of testTree. The two commented-out lines below are earlier variants
# of the same inspection.
#[method for method in dir(testTree) if callable(getattr(testTree, method))]
#globals(testTree)
[method for method in dir(testTree.root) if callable(getattr(testTree.root, method))]

['__bool__',
 '__class__',
 '__delattr__',
 '__format__',
 '__getattribute__',
 '__getitem__',
 '__hash__',
 '__init__',
 '__iter__',
 '__len__',
 '__new__',
 '__nonzero__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '_filter_search',
 '_get_color',
 '_set_color',
 'collapse',
 'collapse_all',
 'common_ancestor',
 'count_terminals',
 'depths',
 'distance',
 'find_any',
 'find_clades',
 'find_elements',
 'get_nonterminals',
 'get_path',
 'get_terminals',
 'is_bifurcating',
 'is_monophyletic',
 'is_parent_of',
 'is_preterminal',
 'is_terminal',
 'ladderize',
 'prune',
 'split',
 'total_branch_length',
 'trace']

In [None]:
def pop_list(nodes=None, parent=None, node_list=None):
    """Collect groups of sibling nodes while climbing towards the root.

    nodes     -- iterable of dicts, each with an 'id' and a 'parent' key.
    parent    -- id at which to start; None means there is nothing to do.
    node_list -- list to append to; a new list is created when omitted.

    Starting at `parent`, a list of all nodes whose 'parent' equals the
    current id is appended to `node_list`; the current id is then replaced
    by the 'parent' of the node whose 'id' matches it, and the step repeats.
    The climb stops when that value is None or when no node has the current
    id.

    Returns `node_list` (returned as passed in when `parent` is None).
    """
    if parent is None:
        return node_list
    if nodes is None:
        nodes = []
    if node_list is None:
        node_list = []
    node_list.append([])
    # Stays None when no node carries the id `parent`, which ends the climb
    # instead of failing on an unbound name.
    next_parent = None
    for node in nodes:
        if node['parent'] == parent:
            node_list[-1].append(node)
        if node['id'] == parent:
            next_parent = node['parent']

    pop_list(nodes, next_parent, node_list)
    return node_list

In [None]:
# Print the name of every clade yielded by testTree.find_clades().
# The call form of print gives the same output on Python 2 and does not
# break on Python 3.
for clade in testTree.find_clades():
    print(clade.name)