# Demo of summary statistics of phylogenetic distances
A demo notebook that computes summary statistics for a dataframe of phylogenetic distances.

It also plots a histogram for each distance metric.

This allows easy comparison across distance calculation methods.

In [1]:
# import package and relevant dependencies
import toytree #toytree to construct phylogenetic trees
import numpy as np #numpy to do statistical operations
import pandas as pd #pandas to manipulate dataframe
import itertools #itertools to iterate efficiently
import os #to get filepaths
import matplotlib.pyplot as plt #for plotting
import toyplot #plotting

import distmetric #Scarlet's package in dev

### Generating random trees and comparing quartet distances

In [2]:
# Build a tree generator configured with ntrees=20, ntips=10 and treeheight=1.0,
# then draw the random trees from it and keep them in `randomtrees`.
# NOTE(review): no random seed is set in this cell, so each run presumably
# yields different trees (and therefore different distances below) -- confirm
# whether distmetric exposes a seed option.
TREES = distmetric.Generator(ntrees=20, ntips=10, treeheight=1.0)
randomtrees = TREES.get_randomtrees()

# Set up the quartet-distance calculation over the random trees using
# sampmethod="pairwise".
quart = distmetric.Quartets(trees=randomtrees, sampmethod="pairwise")
# run() is called for its side effect (its return value is discarded);
# the results are read back from `quart.output` afterwards.
quart.run()
# Left as the last expression so the notebook displays the results table.
quart.output

Unnamed: 0,trees,Quartet_intersection
0,"0, 1",0.809524
1,"1, 2",0.77619
2,"2, 3",0.8
3,"3, 4",0.738095
4,"4, 5",0.942857
5,"5, 6",0.942857
6,"6, 7",0.942857
7,"7, 8",0.795238
8,"8, 9",0.928571
9,"9, 10",0.928571


### Calculating summary statistics for quartet method

In [10]:
#saving dataframe of quartet results produced by quart.run()
df = quart.output

#select the numeric distance column explicitly instead of reducing the whole
#frame: `df.mean()[0]` also tries to average the string column `trees`
#(a TypeError on pandas >= 2.0) and relies on positional indexing into a
#label-indexed Series, which breaks silently if the column order changes
quartet_dist = df["Quartet_intersection"]

#getting mean (rounded to 4 decimals for display)
mean = round(quartet_dist.mean(), 4)
#getting sample standard deviation (pandas default, ddof=1), rounded likewise
sd = round(quartet_dist.std(), 4)
#printing
print('mean quartet distance:',mean,' standard deviation:',sd)

mean quartet distance: 0.8481  standard deviation: 0.0956


### Visualizing histogram with mean and std labelled

In [25]:
#Creating bar plot histogram for quartet intersection

#number of histogram bins, defined once so the comment and the code cannot drift
n_bins = 10

#summary statistics for the annotation, computed from the plotted column itself
#so the labels always match the data shown (the trees are random, so any
#hard-coded value goes stale as soon as the notebook is re-run)
quartet_values = df.Quartet_intersection
mu = quartet_values.mean()
sigma = quartet_values.std()

#setting plot parameters
canvas = toyplot.Canvas(width=600, height=400)
#making sure axes are cartesian coordinates and labelling axes
axes = canvas.cartesian(label="Quartet distance histogram",
                        xlabel="Quartet Distance",
                        ylabel="Frequency")
#show axes ticks
axes.x.ticks.show = True
axes.y.ticks.show = True
#Binning values into n_bins bins using np.histogram, and coloring them orange
bars = axes.bars(
    np.histogram(quartet_values, bins=n_bins),
    style={"fill":"orange"}
)
#adding text for mean and standard deviation (3 decimals, canvas pixel coordinates)
canvas.text(350, 100, "sigma={:.3f}".format(sigma), style={"font-size":"12px"});
canvas.text(350, 75, "mu={:.3f}".format(mu), style={"font-size":"16px"});