In [1]:
import os
import re

import pandas as pd
import skbio
from Bio import SeqIO
# Root of the project checkout. Defaults to the original author's local path;
# set the GCMP_WORKDIR environment variable to run the notebook elsewhere.
# os.path.join(..., "") guarantees the trailing separator that the string
# concatenations throughout the notebook rely on.
workdir = os.path.join(
    os.environ.get("GCMP_WORKDIR", "/Volumes/McMinds/git_repos/gcmp_stan_symportal_2019/"),
    "",
)
# Directory that receives every file this notebook generates.
outdir = workdir + "output/symbio_phylo/"

In [3]:
input_file = "all_seqs.fasta"
#input_file = "72_DBV_20190609_2019-06-10_01-30-10.537772.seqs.fasta"
discard_unnamed = True

# Bucket every FASTA record by clade.
#   * ids without an underscore: the clade is the first character of the id
#   * ids with an underscore: skipped when discard_unnamed is set, otherwise
#     the clade is the second "_"-separated field of the id
clades = {}
for record in SeqIO.parse(workdir + "raw_data/symbio_phylo/" + input_file, "fasta"):
    if "_" not in record.id:
        clade_name = record.id[0]
    elif discard_unnamed:
        continue
    else:
        clade_name = record.id.split("_")[1]
    clades.setdefault(clade_name, []).append(record)

# Report how many sequences landed in each clade (in first-seen order).
clade_sizes = {name: len(records) for name, records in clades.items()}
for name, size in clade_sizes.items():
    print(f"{name} {size}")

# Clade names ordered from the smallest clade to the largest.
clades_sorted = sorted(clade_sizes, key=clade_sizes.get)

A 247
B 76
C 706
F 55
D 119
E 1
G 28
H 9
I 4


In [None]:
for x in clades_sorted:
    prefix = outdir + "its2_clade_" + x
    nseqs = len(clades[x])

    print('Writing ' + str(nseqs) + ' clade ' + x + ' seq(s) to a file\n')
    SeqIO.write(clades[x], prefix + ".fasta", "fasta-2line")
    
    if nseqs > 1:
        print('Aligning clade ' + x + ':')
        !mafft --thread 4 --maxiterate 1000 --ep 0 --genafpair {prefix + ".fasta"} > {prefix + "_aligned.fasta"}
        
        #print('Building phylogeny for clade ' + x + ':')
        #!iqtree -s {prefix + "_aligned.fasta"}

In [None]:
seeds = ""
addseqs = "/dev/null"
inds = []
for x in clades_sorted:
    if len(clades[x]) > 1:
        seeds = seeds + "--seed " + outdir + "its2_clade_" + x + "_aligned.fasta "
    else:
        inds.append(clades[x])

if len(inds) > 0:
    if len(inds) == 1:
        inds = inds[0]
    print('Writing isolate clades to a file\n')
    addseqs = outdir + "its2_concat_isolates.fasta"
    SeqIO.write(inds, addseqs, "fasta-2line")       
        
!mafft --thread 4 --maxiterate 1000 --ep 0 --genafpair {seeds}{addseqs} > {outdir + "its2_all_clades_aligned.fasta"}

In [None]:
# Write the NEXUS partition file handed to iqtree via -spp: one charset for the
# cleaned ITS2 alignment plus one charset per additional gene alignment.
partitions_file = outdir + "iqtree_partitions.nex"
genes_dir = workdir + "raw_data/symbio_phylo/"

# The context manager closes (and flushes) the file even if a write fails.
with open(partitions_file, "w") as nexus:
    nexus.write("#nexus\n")
    nexus.write("begin sets;\n")
    nexus.write("    charset ITS2 = " + outdir + "its2_all_clades_aligned_clean.fasta:*;\n")
    for gene in ("28S", "23S", "cob", "coi", "elf", "psba"):
        # NOTE(review): these file names contain spaces ("forindividual tree <gene>.fas");
        # confirm iqtree parses unquoted paths with spaces in a charset line.
        nexus.write("    charset " + gene + " = " + genes_dir + gene + "/forindividual tree " + gene + ".fas:*;\n")
    nexus.write("end;\n")

In [None]:
# Remove every "_seed_" substring from the combined alignment and save the
# result as its2_all_clades_aligned_clean.fasta, the file the ITS2 charset in
# the partitions file points at. (Presumably "_seed_" is the tag mafft puts on
# sequences supplied through --seed -- confirm.)
!sed 's/_seed_//g' {outdir + "its2_all_clades_aligned.fasta"} > {outdir + "its2_all_clades_aligned_clean.fasta"}
# Run iqtree on the partitions listed in partitions_file (defined in the
# previous cell); all outputs are written with the prefix <outdir>/allgenes.
!iqtree -nt AUTO -ntmax 4 -pre {outdir + "allgenes"} -mrate E,I,G,I+G,R,H,I+H -mfreq F,FO -spp {partitions_file}

In [5]:
# Read the tab-separated absolute-abundance profile table, using its first
# column as the row index, and discard the first remaining data column.
abundance = workdir + "raw_data/symbio_phylo/72_DBV_20190609_2019-06-10_01-30-10.537772.profiles.absolute.txt"
profiles_raw = pd.read_csv(abundance, sep="\t", index_col=0)
df = profiles_raw.drop(columns=[profiles_raw.columns[0]])

# Column label -> value of the 'ITS2 type profile' row.
profs = dict(df.loc['ITS2 type profile'])

In [7]:
# Build a (profile x defining sequence) abundance table from the profile metadata rows.
# For every profile column, the 'ITS2 type profile' row names the defining
# sequences (separated by '-' or '/') and the 'Average defining sequence
# proportions and [stdev]' row gives one '-'-separated "proportion[stdev]"
# entry per sequence.

# Raw string: the original non-raw '\[.*\]' relied on invalid escape sequences.
stdev_pattern = re.compile(r'\[.*\]')

profile_abunds = {}
for string, profdef, names in zip(df.loc['Average defining sequence proportions and [stdev]'], df.loc['ITS2 type profile'], df.columns):
    divs = profdef.replace('/', '-').split('-')
    # need integer values for some reason! provided decimal values were at precision of 1e-3
    # NOTE(review): the product is still a float and may not be exactly
    # integral; consider round() if downstream code truly needs integers.
    profile_abunds[names] = {
        div: float(stdev_pattern.sub('', substr)) * 1000
        for substr, div in zip(string.split('-'), divs)
    }

# Rows = profiles, columns = defining sequences; a sequence absent from a
# profile gets abundance 0.
abunds = pd.DataFrame.from_dict(profile_abunds, orient='index').fillna(0)

In [8]:
# Load the tree written by the iqtree run, re-root it at its midpoint, and save
# the rooted copy next to it. The write call stays last so the cell still
# displays its return value.
rooted_tree_path = outdir + "allgenes_rooted.treefile"
unrooted_tree = skbio.TreeNode.read(outdir + "allgenes.treefile")
divTree = unrooted_tree.root_at_midpoint()
divTree.write(file=rooted_tree_path)

'/Volumes/McMinds/git_repos/gcmp_stan_symportal_2019/output/symbio_phylo/allgenes_rooted.treefile'

In [9]:
wu = skbio.diversity.beta_diversity('weighted_unifrac', abunds.to_numpy(), ids=list(abunds.index), tree = divTree, otu_ids=list(abunds.columns))
f = open(outdir + "profile_WU_Distances.txt", 'w')
f.write(str(len(wu.data)) + '\n')
for name,dat in zip(abunds.index,wu.data):
    f.write(name) #f.write(df.loc['ITS2 type profile',name]) #
    for dist in dat:
        f.write('\t' + f'{dist:.19f}')
    f.write('\n')
f.close()
!fastme -i {outdir + "profile_WU_Distances.txt"} -o {outdir + "profile_WU_fastmeBal.tree"} -I {outdir + "profile_WU_fastmeBal_stats.txt"} -f 18 -m B -s -n B
profTree = skbio.TreeNode.read(outdir + "profile_WU_fastmeBal.tree").root_at_midpoint()
profTree.write(file = outdir + "profile_WU_fastmeBal_rooted.tree")
model = 'correlated'
collapseMultis = False
!Rscript {workdir + 'gcmp_stan_symportal/phylogenetics/chronogram.r'} {collapseMultis} {model} {outdir + "profile_WU_fastmeBal_rooted.tree"} {outdir + "profile_WU_fastmeBal_" + model + "_chronos.tree"}



#  Analysing dataset 1

 . Computing tree...

 . Performing NNI...

 . Performing SPR...

 . Time used 0h00m01s

Setting initial dates...
Fitting in progress... get a first set of estimates
         Penalised log-lik = -1007718 
Optimising rates... dates... -1007718 
Optimising rates... dates... -214723.3 
Optimising rates... dates... -211324.4 
Optimising rates... dates... -209131.5 
Optimising rates... dates... -206191.5 
Optimising rates... dates... -204620.6 
Optimising rates... dates... -203356.9 
Optimising rates... dates... -201833.6 
Optimising rates... dates... -200493.9 
Optimising rates... dates... -199514 
Optimising rates... dates... -198930.7 
Optimising rates... dates... -198478.3 
Optimising rates... dates... -198090.5 
Optimising rates... dates... -197308.7 
Optimising rates... dates... -197070.3 
Optimising rates... dates... -196535.1 
Optimising rates... dates... -196304.3 
Optimising rates... dates... -196020.6 
Optimising rates... dates... -195846.3 
Optimising r

In [29]:
uwu = skbio.diversity.beta_diversity('unweighted_unifrac', abunds.to_numpy(), ids=list(abunds.index), tree = divTree, otu_ids=list(abunds.columns))
f = open(outdir + "profile_UWU_Distances.txt", 'w')
f.write(str(len(uwu.data)) + '\n')
for name,dat in zip(abunds.index,uwu.data):
    f.write(df.loc['ITS2 type profile',name]) #f.write(name)
    for dist in dat:
        f.write('\t' + f'{dist:.19f}')
    f.write('\n')
f.close()
!fastme -i {outdir + "profile_UWU_Distances.txt"} -o {outdir + "profile_UWU_fastmeBal.tree"} -I {outdir + "profile_UWU_fastmeBal_stats.txt"} -f 18 -m B -s -n B
profTree = skbio.TreeNode.read(outdir + "profile_UWU_fastmeBal.tree").root_at_midpoint()
profTree.write(file = outdir + "profile_UWU_fastmeBal_rooted.tree")
model = 'correlated'
collapseMultis = False
!Rscript {workdir + 'gcmp_stan_symportal/phylogenetics/chronogram.r'} {collapseMultis} {model} {outdir + "profile_UWU_fastmeBal_rooted.tree"} {outdir + "profile_UWU_fastmeBal_" + model + "_chronos.tree"}



#  Analysing dataset 1

 . Computing tree...

 . Performing NNI...

 . Performing SPR...

 . Time used 0h00m01s

Setting initial dates...
Fitting in progress... get a first set of estimates
         Penalised log-lik = -665052.3 
Optimising rates... dates... -665052.3 
Optimising rates... dates... -95814.27 
Optimising rates... dates... -88566.07 
Optimising rates... dates... -85133.02 
Optimising rates... dates... -83609.17 
Optimising rates... dates... -82593.86 
Optimising rates... dates... -81779.21 
Optimising rates... dates... -81091.25 
Optimising rates... dates... -80620.64 
Optimising rates... dates... -80574.76 
Optimising rates... dates... -80544.01 
Optimising rates... dates... -80522.02 
Optimising rates... dates... -80510.94 
Optimising rates... dates... -80299.12 
Optimising rates... dates... -80284.97 
Optimising rates... dates... -80279.37 
Optimising rates... dates... -80275.54 
Optimising rates... dates... -80273.04 
Optimising rates... dates... -80272.75 
Optimisi