## Imports

In [1]:
import h5py
import numba
import toytree
import toyplot
import numpy as np
import pandas as pd
import ipyparallel as ipp
import ipyrad.analysis as ipa
from scipy.stats import norm

### Data

In [2]:
## tetrad result files: a tree and a database
simtree = "./analysis-tetrad/cli.nhx"
simdata = "./analysis-tetrad/cli.output.h5"

In [3]:
## peek at the invariants matrix database.
## The first matrix is stored in the 'invariants' array (order 5,8,10,11)
## the correct order is stored in the 'quartets' array (order 5,11,8,10)
## it will be rearranged by the Hils script to the correct matrix
idx = 478
with h5py.File(simdata) as io5:
    print 'quartet:', io5["quartets"][idx]
    print '\nmatrix:'
    print io5["invariants"]["boot0"][idx]
    arr = io5["invariants"]["boot0"][idx]

quartet: [ 5 11  8 10]

matrix:
[[ 0 13 17 14 12  0  0  0 11  0  0  0 15  0  0  1]
 [ 7  0  0  0  5 26  0  0  0  0  0  0  0  0  0  0]
 [11  0  0  0  0  0  0  0  0  0 26  0  0  0  0  0]
 [11  0  0  2  0  0  0  0  0  0  0  0  1  1  0 30]
 [25  1  0  0  1 13  0  0  0  0  0  0  0  0  0  0]
 [ 0 15  0  0  8  0 16 12  0  5  1  0  0  4  0  1]
 [ 0  0  0  0  0 11  1  0  0  3 26  0  0  0  0  0]
 [ 0  0  0  0  1 12  0  3  0  0  0  0  1  1  0 26]
 [15  0  3  0  1  0  0  0  1  0 11  0  0  0  0  0]
 [ 0  0  0  0  0 34  2  0  0  0  9  0  0  0  0  0]
 [ 1  0 10  0  0  1 12  0 14  9  0 12  0  0  6  0]
 [ 0  0  0  0  0  0  0  0  0  0 11  1  0  1  7 32]
 [22  0  0  0  0  0  0  0  0  0  0  0  1  0  0  8]
 [ 0  0  0  0  0 28  0  3  0  0  0  0  0  1  0 14]
 [ 0  0  0  0  0  0  0  0  0  0 28  4  0  0  1  6]
 [ 1  0  0  8  0  0  0 15  0  0  0 13 11 18 14  0]]


### plot grid

In [4]:
canvas, table = toyplot.matrix(arr, width=300, height=300, margin=10)
table.body.gaps.columns[...] = 1
table.body.gaps.rows[...] = 1

### Functions

In [2]:
# %load /home/deren/local/src/ipyrad/ipyrad/analysis/hils.py



class Hils(object):
    """
    A Class to calculate the Hils statistic given a matrix of invariants.
    """
    def __init__(self, database, boot=0, tree=None, root=None):
        ## open file handles for accessing database
        self._open = True
        self._boot = boot
        self.hdf5 = h5py.File(database)
        self.matrix = self.hdf5["invariants"]["boot{}".format(self._boot)]
        self.quartets = self.hdf5["quartets"]
        self.nquartets = self.quartets.shape[0]
        self.tree = tree
        self.root = root
        if self.tree:
            self.snames = sorted(self.tree.get_tip_labels())
            self.sidx = {i:j for i,j in enumerate(snames)}
        

    def close_db(self):
        """close the database file"""
        self.hdf5.close()
    
    

    def get_counts_by_idx(self, idx, altmat=None):
        """
        Return site counts for a given index (quartet). Chooses the 
        'correct' matrix based on the name order in self.quartets. 
        But this can be overridden during testing by entering a 
        altmat index.
        """

        ## the matrix is stored in default order format (e.g., 0,1|2,3)
        mat = self.matrix[idx, :, :]

        ## the correct quartet is stored separate (e.g., 0,3|1,2)
        qrt = self.quartets[idx]
        
        ## the matrix needs to be arranged to be in the right order.
        ## if taxon 1 is the second lowest (e.g., 0,1|2,3) then no reorder
        ## if taxon 1 is the third lowest (e.g., 0,2|1,3) then reorder mat1
        ## if taxon 1 is the highest (e.g., 0,3|1,2) then reorder to mat2
        if isinstance(altmat, int):
            assert altmat in [0, 1, 2], "altmat must be an index in [0,1,2]"
            mat = alt_mats(mat, altmat)
        else:
            if qrt[1] > qrt[2]:
                if qrt[1] > qrt[3]:
                    mat = alt_mats(mat, 2)
                else:
                    mat = alt_mats(mat, 1)
            
        ## return counts as a dataframe with column names
        df = pd.DataFrame(
            data=count_snps(mat), 
            index=["aabb", "abba", "baba", "aaab"], 
            columns=[idx]).T
        return df
    

    
    def get_h_by_idx(self, idx, altmat=None):
        """
        calculate Hils. This could be numba-fied, but you'd have to work
        with arrays instead of dataframes. This is fine for now.
        """

        ## get counts and convert to site frequencies
        df = self.get_counts_by_idx(idx, altmat)
        nsites = df.sum(axis=1).values[0]
        pdf = df/nsites
        pdf.columns = ["p"+i for i in df.columns]
        data = pd.concat([df, pdf], axis=1)

        ## avoid zero div errors
        if data.pabba.equals(data.pbaba):
            H = 0.0
            f1 = 1.0
            f2 = 0.0

        else:
            ## get H and f1 and f2 for these data
            H, f1, f2 = calc_h(data, nsites)

            ## f1 and f2 measure differences/distances, should be positive
            f1, f2 = [abs(i) for i in (f1, f2)]

        ## return as a dataframe 
        res = pd.DataFrame(
             {"Hils":H,
              "gamma": 1. - (f1/(f1+f2)),
              "pval": norm.pdf(H, 0, 1)}, 
             index=[idx],
             )
        return pd.concat([df, pdf, res], axis=1)



    def run(self):
        """calculate Hils and return table for all idxs in database"""
        stats = pd.concat([self.get_h_by_idx(idx) for idx in xrange(self.nquartets)])
        qrts = ["{},{}|{},{}".format(*i) for i in self.quartets[:]]
        qrts = pd.DataFrame(np.array(qrts), columns=["qrts"])
        return pd.concat([stats, qrts], axis=1)




    def svds(self, idx):
        """
        returns the svd scores for the three resolutions of the matrix
        as calculated by tetrad. 
        """
        mats = np.zeros((3, 16, 16), dtype=np.uint32)
        mats[0] = self.matrix[idx]
        mats[1] = alt_mats(mats[0], 1)
        mats[2] = alt_mats(mats[0], 2)

        svds = np.zeros((3, 16), dtype=np.float64)
        scor = np.zeros(3, dtype=np.float64)
        rank = np.zeros(3, dtype=np.float64)

        ## why svd and rank?
        for test in range(3):
            svds[test] = np.linalg.svd(mats[test].astype(np.float64))[1]
            rank[test] = np.linalg.matrix_rank(mats[test].astype(np.float64))

        ## get minrank, or 11
        minrank = int(min(11, rank.min()))
        for test in range(3):
            scor[test] = np.sqrt(np.sum(svds[test, minrank:]**2))

        ## sort to find the best qorder
        return scor

    

def calc_h(data, nsites):
    """ 
    Calculate Hils statistic from site counts/frequencies.
    """

    f1 = data.paabb - data.pbaba
    f2 = data.pabba - data.pbaba           

    sigmaf1 = (1. / nsites) * (data.paabb * (1. - data.paabb) \
        + data.pbaba * (1. - data.pbaba) \
        + 2. * data.paabb * data.pbaba)

    sigmaf2 = (1. / nsites) * (data.pabba * (1. - data.pabba) \
        + data.pbaba * (1. - data.pbaba) \
        + 2. * data.pabba * data.pbaba)

    covf1f2 = (1. / nsites) * (data.pabba * (1. - data.paabb) \
        + data.paabb * data.pbaba \
        + data.pabba * data.pbaba \
        + data.pbaba * (1. - data.pbaba)) 

    num = f2 * ((f1 / f2) - 0.)
    p1 = (sigmaf2 * (f1/f2)**2)
    p2 = ((2. * covf1f2 * (f1/f2) + sigmaf1))
    denom = p1 - p2

    ## calculate hils
    H = num/np.sqrt(abs(denom))
    return H, f1, f2

    

@numba.jit(nopython=True)   
def alt_mats(mat, idx):
    """ return alternate rearrangements of matrix"""
    mats = np.zeros((3, 16, 16), dtype=np.uint32)
    mats[0] = mat
    x = np.uint8(0)
    for y in np.array([0, 4, 8, 12], dtype=np.uint8):
        for z in np.array([0, 4, 8, 12], dtype=np.uint8):
            mats[1, y:y+np.uint8(4), z:z+np.uint8(4)] = mats[0, x].reshape(4, 4)
            mats[2, y:y+np.uint8(4), z:z+np.uint8(4)] = mats[0, x].reshape(4, 4).T
            x += np.uint8(1)
    return mats[idx]
        
        

@numba.jit(nopython=True)
def count_snps(mat):
    """JIT func to return counts quickly"""
    ## array to store results
    snps = np.zeros(4, dtype=np.uint16)

    ## get concordant (aabb) pis sites
    snps[0] = np.uint16(\
           mat[0, 5] + mat[0, 10] + mat[0, 15] + \
           mat[5, 0] + mat[5, 10] + mat[5, 15] + \
           mat[10, 0] + mat[10, 5] + mat[10, 15] + \
           mat[15, 0] + mat[15, 5] + mat[15, 10])

    ## get discordant (baba) sites
    for i in range(16):
        if i % 5:
            snps[1] += mat[i, i]

    ## get discordant (abba) sites
    snps[2] = mat[1, 4] + mat[2, 8] + mat[3, 12] +\
              mat[4, 1] + mat[6, 9] + mat[7, 13] +\
              mat[8, 2] + mat[9, 6] + mat[11, 14] +\
              mat[12, 3] + mat[13, 7] + mat[14, 11]

    ## get autapomorphy sites
    snps[3] = (mat.sum() - (snps[0] + np.diag(mat).sum() + snps[2]))
    return snps

### Calculating Hils

In [6]:
## Tree for the simdata 
tre = toytree.tree(simtree)
tre.root(wildcard="3")
snames = sorted(tre.get_tip_labels())
for node in tre.tree.traverse():
    if node.is_leaf():
        node.name = "{} - {}".format(snames.index(node.name), node.name)
tre.draw(width=300);

In [7]:
## initialize a Hils object
hils = Hils(simdata)

In [8]:
## calculate for all idxs
result = hils.run()

## print first 10 results
result.head(10)

Unnamed: 0,aabb,abba,baba,aaab,paabb,pabba,pbaba,paaab,Hils,gamma,pval,qrts
0,75,9,6,551,0.117,0.014,0.009,0.86,0.814,0.042,0.286,"0,1|2,3"
1,74,9,7,622,0.104,0.013,0.01,0.874,0.517,0.029,0.349,"0,1|2,4"
2,73,8,6,633,0.101,0.011,0.008,0.879,0.552,0.029,0.342,"0,1|2,5"
3,70,8,7,614,0.1,0.011,0.01,0.878,0.263,0.016,0.385,"0,1|2,6"
4,68,9,7,618,0.097,0.013,0.01,0.88,0.518,0.032,0.349,"0,1|2,7"
5,65,8,6,669,0.087,0.011,0.008,0.894,0.555,0.033,0.342,"0,1|2,8"
6,66,10,7,679,0.087,0.013,0.009,0.891,0.772,0.048,0.296,"0,1|2,9"
7,59,7,7,671,0.079,0.009,0.009,0.902,0.0,0.0,0.399,"0,1|2,10"
8,56,7,6,680,0.075,0.009,0.008,0.908,0.283,0.02,0.383,"0,1|2,11"
9,107,2,2,622,0.146,0.003,0.003,0.849,0.0,0.0,0.399,"0,1|3,4"


In [9]:
## print just the ten most significant results
result.sort_values(by="Hils", ascending=False).head(10)

Unnamed: 0,aabb,abba,baba,aaab,paabb,pabba,pbaba,paaab,Hils,gamma,pval,qrts
456,55,7,1,681,0.074,0.009,0.001,0.915,2.529,0.1,0.016,"4,10|8,9"
181,129,0,6,622,0.17,0.0,0.008,0.822,2.379,0.047,0.024,"1,2|5,7"
172,43,6,1,734,0.055,0.008,0.001,0.936,2.288,0.106,0.029,"1,2|3,11"
297,40,9,3,753,0.05,0.011,0.004,0.935,2.257,0.14,0.031,"2,3|5,11"
365,55,8,2,670,0.075,0.011,0.003,0.912,2.251,0.102,0.032,"2,10|8,9"
494,58,4,15,577,0.089,0.006,0.023,0.882,2.231,0.204,0.033,"8,9|10,11"
421,58,8,2,664,0.079,0.011,0.003,0.907,2.226,0.097,0.034,"3,10|8,9"
161,60,8,2,679,0.08,0.011,0.003,0.907,2.211,0.094,0.035,"0,10|8,9"
193,72,4,0,718,0.091,0.005,0.0,0.904,2.178,0.053,0.037,"1,2|7,10"
94,91,0,5,665,0.12,0.0,0.007,0.874,2.167,0.055,0.038,"0,3|6,7"


### Plot results

In [10]:
## distribution of Hils across all quartet edges in dataset
canvas = toyplot.Canvas(width=650, height=300)

for idx, val in enumerate(["Hils", "gamma"]):
    axes = canvas.cartesian(
        grid=(1, 2, idx),
        xlabel=val,
        ylabel="Frequency")

    mark = axes.bars(
        np.histogram(
            result[val][~result[val].isnull()],
            density=True,
            ),
        )

## style axes
axes.x.ticks.show = True
axes.y.ticks.show = True

## Apply to empirical data
Pedicularis data set assembled in ipyrad and then run through tetrad. 

In [11]:
## parallel client **(requires an ipcluster instance to be running)**
ipyclient = ipp.Client()

## ipyrad output files for ped assembly
fphy = "/home/deren/local/src/ipyrad/tests/analysis-ipyrad/pedicularis_outfiles/pedicularis.snps.phy"
fmap = "/home/deren/local/src/ipyrad/tests/analysis-ipyrad/pedicularis_outfiles/pedicularis.snps.map"

## init tetrad object with data
tet = ipa.tetrad(
    name="pedicularis",
    data=fphy,
    mapfile=fmap,           ## <- to sample unlinked SNPs
    save_invariants=True,   ## <- need this
    nboots=100,             ## <- several replicates
    )

## run tetrad inference
tet.run(ipyclient=ipyclient, force=True)

loading seq array [13 taxa x 172383 bp]
max unlinked SNPs per quartet (nloci): 39385
inferring 715 quartet tree sets
host compute node: [40 cores] on sacra
[####################] 100% generating q-sets | 0:00:00 |  
[####################] 100% initial tree      | 0:00:00 |  
[####################] 100% bootstrap trees   | 0:00:53 |  
[####################] 100% calculating stats | 0:00:00 |  


In [13]:
## parse the tree
ptre = toytree.tree(tet.trees.nhx)
ptre.root(wildcard="prz")

## convert names back into indexes
snames = sorted(ptre.get_tip_labels())
for node in ptre.tree.traverse():
    if node.is_leaf():
        node.name = "{} -- {}".format(snames.index(node.name), node.name)

## plot the tree         
ptre.draw(
    node_labels=ptre.get_node_values("support"),
    height=300, 
    width=350);

In [14]:
## create a Hils object from the tetrad output database
hils = Hils(tet.database.output)

In [15]:
## run Hils inference
res = hils.run()

### Distribution of Hils results across all tests in database

In [16]:
## distribution of Hils across all quartet edges in dataset
canvas = toyplot.Canvas(width=650, height=300)

## plot distributions
for idx, val in enumerate(["Hils", "gamma"]):
    axes = canvas.cartesian(
        grid=(1, 2, idx),
        xlabel=val,
        ylabel="Frequency")
    mark = axes.bars(
        np.histogram(
            res[val][~res[val].isnull()],
            density=True,
            ))
## style axes
axes.x.ticks.show = True
axes.y.ticks.show = True

In [17]:
## get most significant results
res.sort_values(by="pval", ascending=True).head(10)

Unnamed: 0,aabb,abba,baba,aaab,paabb,pabba,pbaba,paaab,Hils,gamma,pval,qrts
268,62,110,146,5387,0.011,0.019,0.026,0.944,-33.164,0.3,5.833e-240,"1,3|4,8"
515,152,96,59,5505,0.026,0.017,0.01,0.947,32.96,0.285,5.092e-237,"3,4|6,10"
514,138,87,54,4781,0.027,0.017,0.011,0.945,32.803,0.282,8.854999999999999e-235,"3,4|6,9"
143,308,189,114,6309,0.045,0.027,0.016,0.912,27.804,0.279,5.359999999999999e-169,"0,4|6,7"
21,272,129,221,6172,0.04,0.019,0.033,0.908,27.803,0.643,5.491e-169,"0,4|1,7"
428,372,228,144,7678,0.044,0.027,0.017,0.912,15.457,0.269,5.220999999999999e-53,"2,4|6,7"
146,321,178,102,6400,0.046,0.025,0.015,0.914,12.602,0.258,1.303e-35,"0,4|6,10"
233,127,228,297,6535,0.018,0.032,0.041,0.909,-12.535,0.289,3.014e-35,"1,2|4,9"
250,447,408,381,12299,0.033,0.03,0.028,0.909,12.084,0.29,7.766e-33,"1,2|7,8"
232,141,264,345,7536,0.017,0.032,0.042,0.909,-11.416,0.284,2.0059999999999998e-29,"1,2|4,8"


### bootstrap distribution for a highly significant test
This shows elevated BABA relationship between thamnophila subspecies. The mean $\gamma$ value of approximately 0.5 suggests that the samples "30556_thamno" really is a nearly perfect hybrid species between "40578_rex" and "33413_thamno". Cool!! 

In [18]:
## get H for some test across many bootstraps
idx = 234

## concat bootstrap reps
reps = []
for boot in range(tet.params.nboots):
    hils = Hils(tet.database.output, boot=boot, tree=ptre)
    reps.append(hils.get_h_by_idx(idx))
bootsarr = pd.concat(reps)
bootsarr.index = range(tet.params.nboots)

## print quartet and plot 
print "{}, {} | {}, {}".format(*[hils.sidx[idx] for idx in hils.quartets[idx]])
toyplot.bars(
    np.histogram(bootsarr.Hils, density=True), 
    width=300, height=300, xlabel="Hils");
toyplot.bars(
    np.histogram(bootsarr.gamma, density=True), 
    width=300, height=300, xlabel="gamma");
toyplot.bars(
    np.histogram(bootsarr.pval, density=True), 
    width=300, height=300, xlabel="p-value");

## show head of boots array
bootsarr.head(10)

30556_thamno, 40578_rex | 30686_cyathophylla, 33413_thamno


Unnamed: 0,aabb,abba,baba,aaab,paabb,pabba,pbaba,paaab,Hils,gamma,pval
0,331,150,259,7584,0.04,0.018,0.031,0.911,6.314,0.602,8.79e-10
1,337,167,260,7656,0.04,0.02,0.031,0.909,3.83,0.547,0.0002604
2,345,169,260,7539,0.042,0.02,0.031,0.907,3.553,0.517,0.0007249
3,357,157,268,7722,0.042,0.018,0.032,0.908,4.887,0.555,2.595e-06
4,361,157,256,7732,0.042,0.018,0.03,0.909,3.896,0.485,0.0002015
5,343,173,247,7604,0.041,0.021,0.03,0.909,2.761,0.435,0.008817
6,377,154,231,7525,0.045,0.019,0.028,0.908,3.084,0.345,0.003433
7,348,180,264,7750,0.041,0.021,0.031,0.907,3.127,0.5,0.003004
8,374,159,257,7644,0.044,0.019,0.03,0.906,3.779,0.456,0.000316
9,350,154,221,7659,0.042,0.018,0.026,0.914,2.709,0.342,0.01016


In [19]:
## close database handle
hils.close_db()