In [1]:
import os
import re
import shutil
import subprocess
import sys

import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from Bio import Phylo
from Bio import SeqIO
mpl.rcParams['pdf.fonttype'] = 42
%matplotlib inline



In [2]:
def checkSlash(directory):
    """Return `directory` with a '/' appended unless its last character already is '/'.

    The last character is read by index, so an empty string raises IndexError.
    """
    endsWithSlash = directory[-1] == '/'
    return directory if endsWithSlash else directory + '/'

In [3]:
#Project root and SNP directory name, both normalised to end with '/'
coreDir = checkSlash('/home/ubuntu/proc/sjspence/170105_PSE/')
snpDir = checkSlash('11_parsnp_subgroups_snps')

#Identify subgroup names
#subDir is relative, so it is resolved against the current working directory.
#os.walk descends recursively: directory names from every level are collected.
subDir = '10_subgroups/'
subgroups = []
for _walkRoot, dirNames, _fileNames in os.walk(subDir):
    subgroups.extend(dirNames)
subgroups.sort()

**ANCESTRAL RECONSTRUCTION WITH FASTML**

In [47]:
#Removal of branch supports
#Only a numeric label that directly follows a closing parenthesis is stripped,
#so leaf IDs are never touched, whatever their length or content
def removeBranchSupports(tree):
    """Strip numeric support values from the internal nodes of a Newick string.

    A support value is a number (integer, decimal or exponent form) written
    immediately after a ')' and followed by ':', ',', ')', ';' or the end of
    the string. Everything else, including branch lengths, leaf names,
    non-numeric internal labels and trailing whitespace, is returned unchanged.

    Parameters:
        tree: Newick text, e.g. one line read from a tree file.

    Returns:
        The same text with the support values removed.
    """
    #The lookahead keeps labels that merely start with digits (e.g. ')12abc:')
    supportPattern = r'\)[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?(?=[:,;)]|\s*$)'
    return re.sub(supportPattern, ')', tree)

#First, edit tree files to remove non-standard quote characters
def editTree(treeFile, treeEdit):
    """Write a cleaned copy of the tree file `treeFile` to `treeEdit`.

    Each line has its single quotes removed, '_contigs.fa.ref' rewritten to
    '_ref', any remaining '_contigs.fa' removed, and is then passed through
    removeBranchSupports before being written.

    Both files are opened in a `with` block so they are closed even if an
    error is raised while a line is being processed.

    Parameters:
        treeFile: path of the tree file to read.
        treeEdit: path of the edited tree file to write (overwritten).
    """
    with open(treeFile, 'r') as treeIn, open(treeEdit, 'w') as treeOut:
        for line in treeIn:
            line = line.replace('\'', '')
            #Order matters: the longer '.ref' suffix must be handled before
            #the generic '_contigs.fa' removal would consume part of it
            line = line.replace('_contigs.fa.ref', '_ref')
            line = line.replace('_contigs.fa', '')
            treeOut.write(removeBranchSupports(line))

#Edit variant file to match tree file leaf IDs
def editVariant(variantFile, variantEdit):
    """Write a copy of `variantFile` to `variantEdit` with renamed record IDs.

    Lines containing '>' have '_contigs.fa.ref' rewritten to '_ref' and any
    remaining '_contigs.fa' removed; all other lines are copied unchanged.

    Both files are opened in a `with` block so they are closed even if an
    error is raised part-way through.

    Parameters:
        variantFile: path of the multi-FASTA file to read.
        variantEdit: path of the edited file to write (overwritten).
    """
    with open(variantFile, 'r') as variantIn, open(variantEdit, 'w') as variantOut:
        for line in variantIn:
            if '>' in line:
                #Order matters: handle the '.ref' suffix before the generic one
                line = line.replace('_contigs.fa.ref', '_ref')
                line = line.replace('_contigs.fa', '')
            variantOut.write(line)

In [49]:
#Edit tree and variant files for ancestral reconstruction
fastmlPath = '/home/ubuntu/tools/FastML.v3.1/www/fastml/FastML_Wrapper.pl'
for s in subgroups:
    s = checkSlash(s)
    groupDir = coreDir + snpDir + s
    treeFile = groupDir + 'parsnp.tree'
    treeEdit = treeFile.replace('.tree', '_edit.tree')
    editTree(treeFile, treeEdit)
    variantFile = groupDir + 'variants.mfa'
    variantEdit = variantFile.replace('.mfa', '_edit.mfa')
    editVariant(variantFile, variantEdit)
    outDir = groupDir + 'fastml/'
    #Start every run from an empty output directory
    if os.path.exists(outDir):
        shutil.rmtree(outDir)
    os.makedirs(outDir)
    #Arguments are passed as a list (no shell), so paths containing spaces or
    #shell metacharacters reach perl unchanged
    fastmlCommand = ['perl', fastmlPath,
                     '--MSA_File', variantEdit,
                     '--seqType', 'nuc',
                     '--outDir', outDir,
                     '--Tree', treeEdit,
                     '--jointReconstruction', 'no',
                     '--indelReconstruction', 'ML']
    status = subprocess.call(fastmlCommand)
    #Report failures instead of discarding the exit status; the loop continues
    #so the remaining subgroups are still processed
    if status != 0:
        print('FastML exited with status ' + str(status) + ' for ' + s)

**NOTES**  
FastML stalled on subK and could not finish computing.  
subM also stalled,  
as did subN.  

All three are small groups with suspicious tree structures and long branch lengths (i.e., many SNPs).

In [13]:
def compareSeqs(seq1, seq2):
    """Return the positions at which `seq1` and `seq2` differ.

    Positions are taken from `seq1`; `seq2` is indexed at each of them, so a
    shorter `seq2` raises IndexError and any extra tail of `seq2` is ignored.
    """
    return [pos for pos, _ in enumerate(seq1) if seq1[pos] != seq2[pos]]

def recursion(clade, seqDict, switches, homoplasy):
    """Walk the tree below `clade`, recording repeated site changes per leaf.

    For a terminal clade, the de-duplicated contents of `homoplasy` are stored
    in the module-level dict `allHomoplasies` under the clade's name.
    For an internal clade, its sequence is compared with each child's; a
    differing position already present in `switches` is appended to
    `homoplasy`, otherwise it is appended to `switches`. The child is then
    visited recursively.

    Parameters:
        clade: node exposing `name`, `clades` and `is_terminal()`.
        seqDict: mapping from clade name to its sequence.
        switches: list of positions already seen to change (mutated in place).
        homoplasy: list of positions seen to change again (mutated in place).
    """
    if clade.is_terminal():
        allHomoplasies[clade.name] = list(set(homoplasy))
        return
    # NOTE(review): `switches` and `homoplasy` are the same list objects for
    # every branch, so a leaf records everything accumulated so far in
    # traversal order, not only changes on its own lineage. Confirm intended.
    parentSeq = seqDict[clade.name]
    for child in clade.clades:
        childSeq = seqDict[child.name]
        for site in compareSeqs(parentSeq, childSeq):
            target = homoplasy if site in switches else switches
            target.append(site)
        recursion(child, seqDict, switches, homoplasy)

In [14]:
#tree.ancestor.txt
#tree.newick.txt
#seq.marginal.txt
#Maps each leaf name to the list of positions stored for it by recursion()
allHomoplasies = {}
for s in subgroups:
    s = checkSlash(s)
    fastmlDir = coreDir + snpDir + s + 'fastml/'
    marginalPath = fastmlDir + 'seq.marginal.txt'
    #Subgroups without a seq.marginal.txt file are skipped
    if os.path.exists(marginalPath):
        tree = Phylo.read(fastmlDir + 'tree.newick.txt', 'newick')
        records = list(SeqIO.parse(marginalPath, 'fasta'))
        #Replace '-' with '_' in the record IDs before they become dict keys
        for r in records:
            r.id = r.id.replace('-', '_')
        seqDict = SeqIO.to_dict(records)
        #Fresh accumulators for every subgroup
        switches = []
        homoplasy = []
        recursion(tree.root, seqDict, switches, homoplasy)

In [22]:
#Maps each sequence ID to the subset of its allHomoplasies positions that
#fall on sites with exactly two distinct bases within its subgroup
dimorphicHomoplasies = {}
#NEED TO EDIT TO MAKE SURE ONLY EDITING READS FROM ONE SUBGROUP
for s in subgroups:
    s = checkSlash(s)
    fastmlDir = coreDir + snpDir + s + 'fastml/'
    if not os.path.exists(fastmlDir + 'seq.marginal.txt'):
        continue
    ##########################
    #GET SET OF SEQ IDS TO CONSIDER FOR THIS SUBGROUP
    print(s)
    #Anchored at coreDir, matching where variants_edit.mfa is written, so the
    #lookup no longer depends on the current working directory
    records = list(SeqIO.parse(coreDir + snpDir + s + 'variants_edit.mfa', 'fasta'))
    recordSet = set()
    for r in records:
        r.id = r.id.replace('-', '_')
        recordSet.add(r.id)
    ##########################
    #DETERMINE DIMORPHIC SITES
    #variableSites: position -> set of bases observed there across all records
    variableSites = {}
    for r in records:
        for i, base in enumerate(r.seq):
            variableSites.setdefault(i, set()).add(base)
    recordDict = SeqIO.to_dict(records)
    dimorphic = []
    for site in variableSites:
        if len(variableSites[site]) == 2:
            dimorphic.append(site)
    #Set copy for constant-time membership tests in the filter below
    dimorphicSet = set(dimorphic)
    ##########################
    #KEEP ONLY DIMORPHIC SITES IN HOMOPLASIES
    for seqID in recordSet:
        dimorphicList = []
        for hIndex in allHomoplasies[seqID]:
            if hIndex in dimorphicSet:
                dimorphicList.append(hIndex)
        dimorphicHomoplasies[seqID] = dimorphicList
    ##########################
    #CHECK WORK
    #Printed per sequence: sequence length, homoplasy count, dimorphic homoplasy count
    for seqID in recordSet:
        print(seqID + ': ' + str(len(recordDict[seqID])) + ',' + str(len(allHomoplasies[seqID])) + ',' + \
              str(len(dimorphicHomoplasies[seqID])))
    
    #START HERE!! Didn't trim??

subA/
D17_102043: 748,152,137
D17_102039: 748,321,292
D17_102038: 748,338,307
D17_102037: 748,181,163
D17_102036: 748,289,263
D17_102050_ref: 748,3,3
D17_102049: 748,367,332
D17_102044: 748,7,6
D17_102045: 748,41,36
D17_102046: 748,58,53
D17_102047: 748,85,78
D17_102040: 748,120,110
D17_102041: 748,355,322
D17_102042: 748,240,218
D17_102065: 748,262,238
D17_102048: 748,213,193
D17_102051: 748,307,280
subB/
D17_102237: 1253,341,279
D17_102158: 1253,417,346
D17_102197: 1253,247,200
D17_102194: 1253,357,290
D17_102195: 1253,288,233
D17_102202: 1253,507,430
D17_102061: 1253,220,175
D17_102206: 1253,466,391
D17_102207: 1253,387,317
D17_102208: 1253,536,453
D17_102209: 1253,437,364
D17_102085: 1253,599,507
D17_102086: 1253,568,481
D17_102205_ref: 1253,184,148
D17_102058: 1253,134,104
D17_102052: 1253,264,214
D17_102057: 1253,488,412
D17_102054: 1253,19,14
D17_102211: 1253,444,370
D17_102210: 1253,406,336
D17_102072: 1253,41,31
D17_102141: 1253,0,0
D17_102074: 1253,90,69
D17_102095: 1253,554,

In [None]:
########################################################
####START HERE
########################################################

In [None]:
def getSNPs(contigFile):
    """Return the sequence of the record whose ID equals `contigFile`.

    Records are read from 'variants.mfa' under snpDir + subDir. If no record
    matches, an error message is printed and None is returned implicitly.
    """
    # NOTE(review): the path is built from the module-level snpDir and subDir
    # and is not anchored at coreDir. Confirm this is the intended file.
    variantPath = snpDir + subDir + 'variants.mfa'
    for record in SeqIO.parse(variantPath, 'fasta'):
        if record.id != contigFile:
            continue
        return record.seq
    print('Error, wrong file referenced')

def compareTerminals(terminals):
    """Fetch the SNP sequence of every leaf name in each sub-list of `terminals`.

    The sequences gathered for a sub-list are held in `commonClade`, which is
    not used further: nothing is returned or stored.
    """
    for leafNames in terminals:
        commonClade = [getSNPs(leafName) for leafName in leafNames]

def compareSNPs(clade):
    """Recursively print and compare the leaf names under each child of `clade`.

    Terminal clades are ignored. A non-terminal clade with fewer than two
    children prints an error message. Otherwise the leaf names below each
    child are collected into one list per child, printed, passed to
    compareTerminals, and each child is then visited in turn.
    """
    if clade.is_terminal():
        return
    if len(clade.clades) < 2:
        print('Error, check tree configuration')
        return
    #One list of leaf names per direct child
    terminals = [[leaf.name for leaf in child.get_terminals()]
                 for child in clade.clades]
    print(terminals)
    compareTerminals(terminals)
    for child in clade.clades:
        compareSNPs(child)

In [33]:
#[method for method in dir(testTree) if callable(getattr(testTree, method))]
#globals(testTree)
# NOTE(review): testTree is not assigned in any cell shown above. Confirm it
# is defined before this cell runs on a fresh kernel.
#Print the name of every clade in the tree
for clade in testTree.find_clades():
    print(clade.name)
#List the callable attributes of the root clade (not the last expression in
#the cell, so the notebook does not display it)
[method for method in dir(testTree.root) if callable(getattr(testTree.root, method))]
compareSNPs(testTree.root)

['__bool__',
 '__class__',
 '__delattr__',
 '__format__',
 '__getattribute__',
 '__getitem__',
 '__hash__',
 '__init__',
 '__iter__',
 '__len__',
 '__new__',
 '__nonzero__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '_filter_search',
 '_get_color',
 '_set_color',
 'collapse',
 'collapse_all',
 'common_ancestor',
 'count_terminals',
 'depths',
 'distance',
 'find_any',
 'find_clades',
 'find_elements',
 'get_nonterminals',
 'get_path',
 'get_terminals',
 'is_bifurcating',
 'is_monophyletic',
 'is_parent_of',
 'is_preterminal',
 'is_terminal',
 'ladderize',
 'prune',
 'split',
 'total_branch_length',
 'trace']