### Sandbox document for getting Hils results via Tetrad

## Imports

In [1]:
import ipyrad as ip
import ipyrad.analysis as ipa
import toytree 
import h5py
import ipyparallel as ipp
import numpy as np
import math
## ipcluster start -n20

In [None]:
## conda install ipyrad -c ipyrad
## conda install toytree -c eaton-lab

In [2]:
## up-to-date versions 
print 'ip', ipa.__version__
print 'toytree', toytree.__version__

ip 0.7.14
toytree 0.1.4


In [8]:
data = ip.load_json("/Users/pmckenz1/Desktop/projects/quartet_proj/analysis-ipyrad/min4.json")

loading Assembly: min4
from saved path: ~/Desktop/projects/intro_python/analysis-ipyrad/min4.json


In [9]:
## init a tetrad analysis object
tet = ipa.tetrad(
    name=data.name,
    data=data.outfiles.snpsphy,
    mapfile=data.outfiles.snpsmap,
    nboots=10,
    save_invariants=True   ## <- new option to save the arrays
    )

loading seq array [13 taxa x 173131 bp]
max unlinked SNPs per quartet (nloci): 39634


In [10]:
ipyclient = ipp.Client()
ipyclient.ids

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [11]:
tet.run(ipyclient)

inferring 715 quartet tree sets
host compute node: [20 cores] on Patricks-MacBook-Pro.local
[####################] 100% generating q-sets | 0:00:00 |  
[####################] 100% initial tree      | 0:00:01 |  
[####################] 100% bootstrap trees   | 0:00:11 |  
[####################] 100% calculating stats | 0:00:01 |  


In [16]:
## a 16x16 matrix for one quartet
with h5py.File(tet.database.output) as db:
    idx = 0
    qrt = db['quartets'][idx]
    arr = db['invariants/boot0']
    print 'inferred quartet:', qrt
    print 'matrix for ordered set:\n', arr[idx, :, :]
    
with h5py.File(tet.database.output) as db:

inferred quartet: [0 2 1 3]
matrix for ordered set:
[[  0 145 557 251  16   3   0   1  60   1  12   0  25   0   0   0]
 [ 42  26   1   2   3  42   0   1   0   0   1   0   0   1   0   0]
 [155   1  75   2   0   0   0   0  12   0 106   1   0   0   1   0]
 [ 65   0   0  15   0   0   0   1   0   0   0   0   1   0   1  30]
 [ 16   0   0   1   8  68   1   3   0   0   0   0   0   0   0   0]
 [  2  35   1   2 157   0 112 481   0  21   3   2   2 111   1   8]
 [  0   0   0   0   0  46  13   2   0   1  11   0   0   0   0   0]
 [  0   1   0   0   4 201   7  70   0   0   0   0   0   9   1  68]
 [ 49   0  10   2   1   0   0   0  55   3 189   2   0   0   0   0]
 [  0   0   0   0   0  24   0   1   2  11  32   0   0   0   0   0]
 [  9   1  99   1   0   1  15   0 496 118   0 129   0   1  39   0]
 [  0   0   2   0   0   0   0   0   3   3  71  11   0   1   2  17]
 [ 42   0   0   3   0   3   0   0   0   0   0   0  26   1   3  51]
 [  0   0   0   0   0 127   0  10   0   0   0   0   1  56   2 113]
 [  0   0 

In [19]:
f = h5py.File(tet.database.output, 'r')

In [158]:
arr = f['invariants']['boot0'][0]

mats = np.zeros((3, 16, 16), dtype=np.uint32)
mats[0] = arr
x = np.uint8(0)
for y in np.array([0, 4, 8, 12], dtype=np.uint8):
    for z in np.array([0, 4, 8, 12], dtype=np.uint8):
        mats[1, y:y+np.uint8(4), z:z+np.uint8(4)] = mats[0, x].reshape(4, 4)
        #mats[2, y:y+np.uint8(4), z:z+np.uint8(4)] = mats[0, x].reshape(4, 4).T
        x += np.uint8(1)
x = np.uint8(0)
for z in np.array([0,1,2,3]):
    for y in np.array([0,4,8,12]):
        mats[2,:,x] = mats[0,:,(y+z)]
        x += np.uint8(1)

[calcHils(mats[0]),calcHils(mats[1]),calcHils(mats[2])]

['Parental taxa are more closely related than hybrid. Discard this.',
 '0.401917492623',
 '0.397271047622']

In [None]:
class Hils(object):
    """
    A Class to calculate the Hils statistic given a matrix of invariants.
    """
    def __init__(self, database, boot=0):
        ## open file handles for accessing database
        self._open = True
        self._boot = boot
        self.hdf5 = h5py.File(database)
        self.matrix = self.hdf5["invariants"]
        self.quartets = self.hdf5["quartets"]
        self.nquartets = self.quartets.shape[0]
    
    
    def close_db(self):
        """close the database file"""
        self.hdf5.close()
    
    
    def get_counts_by_idx(self, idx):
        """return site counts for a given index (quartet)"""
        ## get matrix
        mat = self.matrix["boot{}".format(self._boot)][idx, :, :]
        qrt = self.quartets[idx]
        
        ## arrange matrix
        if qrt[1] > qrt[3]:
            mat = alt_mats(mat, 2)
        elif qrt[1] > qrt[2]:
            mat = alt_mats(mat, 1)
            
        ## get counts and format
        df = pd.DataFrame(
            data=count_snps(mat), 
            index=["aabb", "abba", "baba", "aaab"], 
            columns=[idx]).T
        return df
    
    
    def get_h_by_idx(self, idx):
        """
        calculate Hils. This could be numba-fied, but you'd have to work
        with arrays instead of dataframes. This is fine for now.
        """
        ## get site frequencies
        df = self.get_counts_by_idx(idx)
        nsites = df.sum(axis=1).values[0]
        pdf = df/nsites
        pdf.columns = ["p"+i for i in df.columns]
        data = pd.concat([df, pdf], axis=1)
        
        ## choose invariant pattern
        f1 = data.paabb - data.pbaba
        f2 = data.pabba - data.pbaba
        ratio = f1 / f2
        
        ## calculate var, covar
        var_f1 = (1. / nsites) * (
                    data.paabb * (1. - data.paabb) \
                  + data.pbaba * (1. - data.pbaba) \
                  + 2. * data.paabb * data.pbaba)

        var_f2 = (1. / nsites) * (
                    data.pabba * (1. - data.pabba) \
                  + data.pbaba * (1. - data.pbaba) \
                  + 2. * data.pabba * data.pbaba)

        cov_f1_f2 = (1. / nsites) * (
                   -data.paabb * data.pabba \
                  + data.paabb * data.pbaba \
                  + data.pabba * data.pbaba \
                  + data.pbaba * (1. - data.pbaba))

        ## calculate hils
        num = abs(f2 * ratio)
        denom = np.sqrt(var_f2 * (ratio**2) - (2 * cov_f1_f2 * ratio + var_f1))
        H = pd.DataFrame({"Hils":num/denom, "gamma":(f1/f1+f2)}, index=[idx])

        data = pd.concat([df, pdf, H], axis=1)
        return data
    
    
    def run(self):
        """calculate Hils and return table for all idxs in database"""
        stats = pd.concat([self.get_h_by_idx(idx) for idx in xrange(self.nquartets)])
        qrts = ["{},{}|{},{}".format(*i) for i in self.quartets[:]]
        qrts = pd.DataFrame(np.array(qrts), columns=["qrts"])
        return pd.concat([stats, qrts], axis=1)
    
    
@numba.jit(nopython=True)   
def alt_mats(mat, idx):
    """ return alternate rearrangement of matrix"""
    mats = np.zeros((3, 16, 16), dtype=np.uint32)
    mats[0] = arr
    x = np.uint8(0)
    for y in np.array([0, 4, 8, 12], dtype=np.uint8):
        for z in np.array([0, 4, 8, 12], dtype=np.uint8):
            mats[1, y:y+np.uint8(4), z:z+np.uint8(4)] = mats[0, x].reshape(4, 4)
            mats[2, y:y+np.uint8(4), z:z+np.uint8(4)] = mats[0, x].reshape(4, 4).T
            x += np.uint8(1)
    return mats[idx]
        
        
@numba.jit(nopython=True)
def count_snps(mat):
    """JIT func to return counts quickly"""
    ## array to store results
    snps = np.zeros(4, dtype=np.uint16)

    ## get concordant (aabb) pis sites
    snps[0] = np.uint16(\
           mat[0, 5] + mat[0, 10] + mat[0, 15] + \
           mat[5, 0] + mat[5, 10] + mat[5, 15] + \
           mat[10, 0] + mat[10, 5] + mat[10, 15] + \
           mat[15, 0] + mat[15, 5] + mat[15, 10])

    ## get discordant (baba) sites
    for i in range(16):
        if i % 5:
            snps[1] += mat[i, i]

    ## get discordant (abba) sites
    snps[2] = mat[1, 4] + mat[2, 8] + mat[3, 12] +\
              mat[4, 1] + mat[6, 9] + mat[7, 13] +\
              mat[8, 2] + mat[9, 6] + mat[11, 14] +\
              mat[12, 3] + mat[13, 7] + mat[14, 11]

    ## get autapomorphy sites
    snps[3] = (mat.sum() - np.diag(mat).sum()) - snps[2]
    return snps


In [155]:
def calcHils(invmat, Nreq = 10, returnf = False, returnp = False, returnall = False,returnnum = False):
    invmat = invmat.astype(float)
    comb_dict = dict(zip([00,01,02,03,10,11,12,13,20,21,22,23,30,31,32,33], [0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15]))
    num_iijj = (invmat[comb_dict[00],comb_dict[11]] + invmat[comb_dict[00],comb_dict[22]] +
        invmat[comb_dict[00],comb_dict[33]] + invmat[comb_dict[11],comb_dict[00]] + invmat[comb_dict[11],comb_dict[22]] +
        invmat[comb_dict[11],comb_dict[33]] + invmat[comb_dict[22],comb_dict[00]] + invmat[comb_dict[22],comb_dict[11]] +
        invmat[comb_dict[22],comb_dict[33]] + invmat[comb_dict[33],comb_dict[00]] + invmat[comb_dict[33],comb_dict[11]] +
        invmat[comb_dict[33],comb_dict[22]])

    num_ijji = (invmat[comb_dict[01],comb_dict[10]] + invmat[comb_dict[02],comb_dict[20]] +
        invmat[comb_dict[03],comb_dict[30]] + invmat[comb_dict[10],comb_dict[01]] + invmat[comb_dict[12],comb_dict[21]] +
        invmat[comb_dict[13],comb_dict[31]] + invmat[comb_dict[20],comb_dict[02]] + invmat[comb_dict[21],comb_dict[12]] +
        invmat[comb_dict[23],comb_dict[32]] + invmat[comb_dict[30],comb_dict[03]] + invmat[comb_dict[31],comb_dict[13]] +
        invmat[comb_dict[32],comb_dict[23]])

    num_ijij = (invmat[comb_dict[01],comb_dict[01]] + invmat[comb_dict[02],comb_dict[02]] +
        invmat[comb_dict[03],comb_dict[03]] + invmat[comb_dict[10],comb_dict[10]] + invmat[comb_dict[12],comb_dict[12]] +
        invmat[comb_dict[13],comb_dict[13]] + invmat[comb_dict[20],comb_dict[20]] + invmat[comb_dict[21],comb_dict[21]] +
        invmat[comb_dict[23],comb_dict[23]] + invmat[comb_dict[30],comb_dict[30]] + invmat[comb_dict[31],comb_dict[31]] +
        invmat[comb_dict[32],comb_dict[32]])
    [num_iijj,num_ijji,num_ijij]
    if (num_ijij == 0 and num_ijji == 0):
        return("No ijij or ijji are present in data (not enough data)")
    N = sum(map(sum, invmat))
    if (N <= Nreq):
        return("Not enough snps.")
    # calculate probability, add .05 to counts in case some of them are 0
    p_iijj = (num_iijj + .05)/N
    p_ijji = (num_ijji + .05)/N
    p_ijij = (num_ijij + .05)/N
    
    if (p_ijij > max([p_iijj,p_ijji])):
        return("Parental taxa are more closely related than hybrid. Discard this.")
    
    f1 = p_iijj - p_ijij
    f2 = p_ijji - p_ijij
    if not(f2):
        p_ijji = (num_ijji + 1. + .05)/N
        f2 = p_ijji - p_ijij
    rat_f1_f2 = f1/f2

    var_f1 = (1./N) * ( p_iijj*(1.-p_iijj) + p_ijij*(1.-p_ijij) + 2.*p_iijj*p_ijij )
    var_f2 = (1./N) * ( p_ijji*(1.-p_ijji) + p_ijij*(1.-p_ijij) + 2.*p_ijji*p_ijij )

    cov_f1_f2 = (1./N) * ( -p_iijj*p_ijji + p_iijj*p_ijij + p_ijji*p_ijij + p_ijij*(1.-p_ijij))

    H = abs(f2 * rat_f1_f2) / math.sqrt( var_f2*(rat_f1_f2**2.) - 2.*cov_f1_f2*rat_f1_f2 + var_f1 )
    if returnf:
        return [H, f1, f2];
    if returnp:
        return [H, p_iijj,p_ijji,p_ijij];
    if returnall:
        return [H, f1, f2, p_iijj,p_ijji,p_ijij];
    if returnnum:
        return [num_iijj,num_ijji,num_ijij];
    if(num_ijij-num_ijji == 0):
        return('*'+str(H))
    else:
        return str(H);
def calcp(z):
    p = st.norm.sf(abs(z))*2
    return p;
def isfloat(value):
  try:
    float(value)
    return True
  except ValueError:
    return False

In [156]:
arr = f['invariants']['boot0'][0]


mats = np.zeros((3, 16, 16), dtype=np.uint32)
mats[0] = arr
x = np.uint8(0)
for y in np.array([0, 4, 8, 12], dtype=np.uint8):
    for z in np.array([0, 4, 8, 12], dtype=np.uint8):
        mats[1, y:y+np.uint8(4), z:z+np.uint8(4)] = mats[0, x].reshape(4, 4)
        #mats[2, y:y+np.uint8(4), z:z+np.uint8(4)] = mats[0, x].reshape(4, 4).T
        x += np.uint8(1)
x = np.uint8(0)
for z in np.array([0,1,2,3]):
    for y in np.array([0,4,8,12]):
        mats[2,:,x] = mats[0,:,(y+z)]
        x += np.uint8(1)

[calcHils(mats[0]),calcHils(mats[1]),calcHils(mats[2])]

['Parental taxa are more closely related than hybrid. Discard this.',
 '0.401917492623',
 '0.397271047622']

In [157]:
sum(sum(mats[0]))

6108

In [146]:
mats[0,:,0]

array([  0,  79, 235,  96,  55,   2,   0,   1, 213,   1,   5,   0, 106,
         1,   0,   1], dtype=uint32)

In [104]:
mats = np.zeros((3, 16, 16), dtype=np.uint32)
mats[0] = arr
x = np.uint8(0)

In [115]:
mats[0,0].reshape(4,4).T

array([[  0,  55, 211,  82],
       [ 65,   3,   1,   0],
       [219,   0,  12,   1],
       [ 90,   0,   0,   2]], dtype=uint32)

## New Hils 

In [3]:
import h5py
import numba
import toytree
import toyplot
import numpy as np
import pandas as pd
import ipyparallel as ipp
import ipyrad.analysis as ipa
from scipy.stats import norm

In [4]:
# %load /home/deren/local/src/ipyrad/ipyrad/analysis/hils.py



class Hils(object):
    """
    A Class to calculate the Hils statistic given a matrix of invariants.
    """
    def __init__(self, database, boot=0, tree=None, root=None):
        ## open file handles for accessing database
        self._open = True
        self._boot = boot
        self.hdf5 = h5py.File(database)
        self.matrix = self.hdf5["invariants"]["boot{}".format(self._boot)]
        self.quartets = self.hdf5["quartets"]
        self.nquartets = self.quartets.shape[0]
        self.tree = tree
        self.root = root
        if self.tree:
            self.snames = sorted(self.tree.get_tip_labels())
            self.sidx = {i:j for i,j in enumerate(snames)}
        

    def close_db(self):
        """close the database file"""
        self.hdf5.close()
    
    

    def get_counts_by_idx(self, idx, altmat=None):
        """
        Return site counts for a given index (quartet). Chooses the 
        'correct' matrix based on the name order in self.quartets. 
        But this can be overridden during testing by entering a 
        altmat index.
        """

        ## the matrix is stored in default order format (e.g., 0,1|2,3)
        mat = self.matrix[idx, :, :]

        ## the correct quartet is stored separate (e.g., 0,3|1,2)
        qrt = self.quartets[idx]
        
        ## the matrix needs to be arranged to be in the right order.
        ## if taxon 1 is the second lowest (e.g., 0,1|2,3) then no reorder
        ## if taxon 1 is the third lowest (e.g., 0,2|1,3) then reorder mat1
        ## if taxon 1 is the highest (e.g., 0,3|1,2) then reorder to mat2
        if isinstance(altmat, int):
            assert altmat in [0, 1, 2], "altmat must be an index in [0,1,2]"
            mat = alt_mats(mat, altmat)
        else:
            if qrt[1] > qrt[2]:
                if qrt[1] > qrt[3]:
                    mat = alt_mats(mat, 2)
                else:
                    mat = alt_mats(mat, 1)
            
        ## return counts as a dataframe with column names
        df = pd.DataFrame(
            data=count_snps(mat), 
            index=["aabb", "abba", "baba", "aaab"], 
            columns=[idx]).T
        return df
    

    
    def get_h_by_idx(self, idx, altmat=None):
        """
        calculate Hils. This could be numba-fied, but you'd have to work
        with arrays instead of dataframes. This is fine for now.
        """

        ## get counts and convert to site frequencies
        df = self.get_counts_by_idx(idx, altmat)
        nsites = df.sum(axis=1).values[0]
        pdf = df/nsites
        pdf.columns = ["p"+i for i in df.columns]
        data = pd.concat([df, pdf], axis=1)

        ## avoid zero div errors
        if data.pabba.equals(data.pbaba):
            H = 0.0
            f1 = 1.0
            f2 = 0.0

        else:
            ## get H and f1 and f2 for these data
            H, f1, f2 = calc_h(data, nsites)

            ## f1 and f2 measure differences/distances, should be positive
            f1, f2 = [abs(i) for i in (f1, f2)]

        ## return as a dataframe 
        res = pd.DataFrame(
             {"Hils":H,
              "gamma": 1. - (f1/(f1+f2)),
              "pval": norm.pdf(H, 0, 1)}, 
             index=[idx],
             )
        return pd.concat([df, pdf, res], axis=1)



    def run(self):
        """calculate Hils and return table for all idxs in database"""
        stats = pd.concat([self.get_h_by_idx(idx) for idx in xrange(self.nquartets)])
        qrts = ["{},{}|{},{}".format(*i) for i in self.quartets[:]]
        qrts = pd.DataFrame(np.array(qrts), columns=["qrts"])
        return pd.concat([stats, qrts], axis=1)




    def svds(self, idx):
        """
        returns the svd scores for the three resolutions of the matrix
        as calculated by tetrad. 
        """
        mats = np.zeros((3, 16, 16), dtype=np.uint32)
        mats[0] = self.matrix[idx]
        mats[1] = alt_mats(mats[0], 1)
        mats[2] = alt_mats(mats[0], 2)

        svds = np.zeros((3, 16), dtype=np.float64)
        scor = np.zeros(3, dtype=np.float64)
        rank = np.zeros(3, dtype=np.float64)

        ## why svd and rank?
        for test in range(3):
            svds[test] = np.linalg.svd(mats[test].astype(np.float64))[1]
            rank[test] = np.linalg.matrix_rank(mats[test].astype(np.float64))

        ## get minrank, or 11
        minrank = int(min(11, rank.min()))
        for test in range(3):
            scor[test] = np.sqrt(np.sum(svds[test, minrank:]**2))

        ## sort to find the best qorder
        return scor

    

def calc_h(data, nsites):
    """ 
    Calculate Hils statistic from site counts/frequencies.
    """

    f1 = data.paabb - data.pbaba
    f2 = data.pabba - data.pbaba           

    sigmaf1 = (1. / nsites) * (data.paabb * (1. - data.paabb) \
        + data.pbaba * (1. - data.pbaba) \
        + 2. * data.paabb * data.pbaba)

    sigmaf2 = (1. / nsites) * (data.pabba * (1. - data.pabba) \
        + data.pbaba * (1. - data.pbaba) \
        + 2. * data.pabba * data.pbaba)

    covf1f2 = (1. / nsites) * (data.pabba * (1. - data.paabb) \
        + data.paabb * data.pbaba \
        + data.pabba * data.pbaba \
        + data.pbaba * (1. - data.pbaba)) 

    num = f2 * ((f1 / f2) - 0.)
    p1 = (sigmaf2 * (f1/f2)**2)
    p2 = ((2. * covf1f2 * (f1/f2) + sigmaf1))
    denom = p1 - p2

    ## calculate hils
    H = num/np.sqrt(abs(denom))
    return H, f1, f2

    

@numba.jit(nopython=True)   
def alt_mats(mat, idx):
    """ return alternate rearrangements of matrix"""
    mats = np.zeros((3, 16, 16), dtype=np.uint32)
    mats[0] = mat
    x = np.uint8(0)
    for y in np.array([0, 4, 8, 12], dtype=np.uint8):
        for z in np.array([0, 4, 8, 12], dtype=np.uint8):
            mats[1, y:y+np.uint8(4), z:z+np.uint8(4)] = mats[0, x].reshape(4, 4)
            mats[2, y:y+np.uint8(4), z:z+np.uint8(4)] = mats[0, x].reshape(4, 4).T
            x += np.uint8(1)
    #x = np.uint8(0)
    #for z in np.array([0,1,2,3]):
    #    for y in np.array([0,4,8,12]):
    #        mats[2,:,x] = mats[0,:,(y+z)]
    #        x += np.uint8(1)
    return mats[idx]
        
        

@numba.jit(nopython=True)
def count_snps(mat):
    """JIT func to return counts quickly"""
    ## array to store results
    snps = np.zeros(4, dtype=np.uint16)

    ## get concordant (aabb) pis sites
    snps[0] = np.uint16(\
           mat[0, 5] + mat[0, 10] + mat[0, 15] + \
           mat[5, 0] + mat[5, 10] + mat[5, 15] + \
           mat[10, 0] + mat[10, 5] + mat[10, 15] + \
           mat[15, 0] + mat[15, 5] + mat[15, 10])

    ## get discordant (baba) sites
    for i in range(16):
        if i % 5:
            snps[1] += mat[i, i]

    ## get discordant (abba) sites
    snps[2] = mat[1, 4] + mat[2, 8] + mat[3, 12] +\
              mat[4, 1] + mat[6, 9] + mat[7, 13] +\
              mat[8, 2] + mat[9, 6] + mat[11, 14] +\
              mat[12, 3] + mat[13, 7] + mat[14, 11]

    ## get autapomorphy sites
    snps[3] = (mat.sum() - (snps[0] + np.diag(mat).sum() + snps[2]))
    return snps

In [5]:
## parallel client **(requires an ipcluster instance to be running)**
ipyclient = ipp.Client()

## ipyrad output files for ped assembly
fphy = "/Users/pmckenz1/Desktop/projects/intro_python/analysis-ipyrad/min4_outfiles/min4.snps.phy"
fmap = "/Users/pmckenz1/Desktop/projects/intro_python/analysis-ipyrad/min4_outfiles/min4.snps.map"

## init tetrad object with data
tet = ipa.tetrad(
    name="pedicularis",
    data=fphy,
    mapfile=fmap,           ## <- to sample unlinked SNPs
    save_invariants=True,   ## <- need this
    nboots=100,             ## <- several replicates
    )

## run tetrad inference
tet.run(ipyclient=ipyclient, force=True)

loading seq array [13 taxa x 173131 bp]
max unlinked SNPs per quartet (nloci): 39634
inferring 715 quartet tree sets
host compute node: [10 cores] on Patricks-MacBook-Pro.local
[####################] 100% generating q-sets | 0:00:05 |  
[####################] 100% initial tree      | 0:00:33 |  
[####################] 100% bootstrap trees   | 0:01:55 |  
[####################] 100% calculating stats | 0:00:00 |  


In [6]:
hils = Hils(tet.database.output)
## calculate for all idxs
result = hils.run()

In [7]:
## parse the tree
ptre = toytree.tree(tet.trees.nhx)
ptre.root(wildcard="prz")

## convert names back into indexes
snames = sorted(ptre.get_tip_labels())
for node in ptre.tree.traverse():
    if node.is_leaf():
        node.name = "{} -- {}".format(snames.index(node.name), node.name)

## plot the tree         
ptre.draw(
    node_labels=ptre.get_node_values("support"),
    height=300, 
    width=350);

In [8]:
## distribution of Hils across all quartet edges in dataset
canvas = toyplot.Canvas(width=650, height=300)

for idx, val in enumerate(["Hils", "gamma"]):
    axes = canvas.cartesian(
        grid=(1, 2, idx),
        xlabel=val,
        ylabel="Frequency")

    mark = axes.bars(
        np.histogram(
            result[val][~result[val].isnull()],
            density=True,
            ),
        )

## style axes
axes.x.ticks.show = True
axes.y.ticks.show = True

In [9]:
## get most significant results
result.sort_values(by="Hils", ascending=False).head(10)

Unnamed: 0,aabb,abba,baba,aaab,paabb,pabba,pbaba,paaab,Hils,gamma,pval,qrts
336,191,162,142,8470,0.021,0.018,0.016,0.945,29.845,0.29,1.543e-194,"1,5|7,9"
428,380,228,135,7711,0.045,0.027,0.016,0.912,23.81,0.275,3.1320000000000002e-124,"2,4|6,7"
143,302,197,113,6333,0.043,0.028,0.016,0.912,12.453,0.308,8.425e-35,"0,4|6,7"
146,316,176,102,6458,0.045,0.025,0.014,0.916,12.115,0.257,5.367e-33,"0,4|6,10"
515,154,96,62,5543,0.026,0.016,0.011,0.947,9.77,0.27,7.492999999999999e-22,"3,4|6,10"
318,349,249,152,7874,0.04,0.029,0.018,0.913,9.118,0.33,3.533e-19,"1,7|4,12"
317,364,271,171,8418,0.039,0.029,0.019,0.913,7.968,0.341,6.501e-15,"1,7|4,11"
512,150,101,57,5507,0.026,0.017,0.01,0.947,7.217,0.321,1.96e-12,"3,4|6,7"
326,356,274,166,8584,0.038,0.029,0.018,0.915,7.178,0.362,2.587e-12,"1,10|4,11"
513,148,97,68,5529,0.025,0.017,0.012,0.946,6.976,0.266,1.077e-11,"3,4|6,8"


In [10]:
whichone = []
for i in range(len(result.gamma)):
    if (abs(result.gamma[i]-.5) == min(abs(result.gamma-.5))):
        whichone.append(i)
whichone

[351]

In [11]:
## get H for some test across many bootstraps
idx = 22

## concat bootstrap reps
reps = []
for boot in range(tet.params.nboots):
    hils = Hils(tet.database.output, boot=boot, tree=ptre)
    reps.append(hils.get_h_by_idx(idx))
bootsarr = pd.concat(reps)
bootsarr.index = range(tet.params.nboots)

## print quartet and plot 
print "{}, {} | {}, {}".format(*[hils.sidx[idx] for idx in hils.quartets[idx]])
toyplot.bars(
    np.histogram(bootsarr.Hils, density=True), 
    width=300, height=300, xlabel="Hils");
toyplot.bars(
    np.histogram(bootsarr.gamma, density=True), 
    width=300, height=300, xlabel="gamma");
toyplot.bars(
    np.histogram(bootsarr.pval, density=True), 
    width=300, height=300, xlabel="p-value");

## show head of boots array
bootsarr.head(10)

29154_superba, 30556_thamno | 33413_thamno, 38362_rex


Unnamed: 0,aabb,abba,baba,aaab,paabb,pabba,pbaba,paaab,Hils,gamma,pval
0,118,273,221,6197,0.017,0.04,0.032,0.91,-1.726,0.335,0.09
1,123,286,236,6287,0.018,0.041,0.034,0.907,-1.654,0.307,0.102
2,149,258,218,6263,0.022,0.037,0.032,0.909,-1.33,0.367,0.165
3,120,287,208,6201,0.018,0.042,0.031,0.91,-2.359,0.473,0.025
4,108,278,196,6237,0.016,0.041,0.029,0.915,-2.478,0.482,0.019
5,119,259,220,6226,0.017,0.038,0.032,0.912,-1.38,0.279,0.154
6,133,299,212,6246,0.019,0.043,0.031,0.907,-2.491,0.524,0.018
7,124,274,219,6130,0.018,0.041,0.032,0.909,-1.785,0.367,0.081
8,107,283,203,6369,0.015,0.041,0.029,0.915,-2.431,0.455,0.021
9,120,279,199,6148,0.018,0.041,0.029,0.911,-2.39,0.503,0.023


## To do:

*  Get all taxa implicated as hybrids, show this on tree
*  Compare to results from D and partitioned D

In [12]:
result.pval

0      3.989e-01
1      3.724e-01
2      3.001e-01
3      3.329e-01
4      3.967e-01
5      3.801e-01
6      3.784e-01
7      3.465e-01
8      1.239e-01
9      1.447e-01
10     3.453e-01
11     3.886e-01
12     2.763e-01
13     3.970e-01
14     2.585e-01
15     2.480e-01
16     3.814e-01
17     1.049e-01
18     6.367e-02
19     3.011e-01
20     3.244e-01
21     9.163e-07
22     8.988e-02
23     9.792e-02
24     7.192e-07
25     3.213e-01
26     2.337e-01
27     1.486e-01
28     3.914e-01
29     9.470e-02
         ...    
685    2.786e-02
686    3.217e-01
687    3.986e-01
688    3.986e-01
689    3.714e-01
690    3.787e-01
691    5.596e-02
692    3.331e-02
693    2.986e-01
694    2.997e-01
695    2.419e-01
696    2.026e-01
697    2.085e-01
698    3.989e-01
699    3.989e-01
700    4.064e-02
701    3.041e-01
702    2.309e-01
703    3.519e-01
704    3.395e-01
705    2.296e-03
706    3.985e-01
707    3.985e-01
708    2.096e-01
709    2.649e-01
710    3.724e-01
711    3.811e-01
712    3.989e-

In [14]:
signif_indices = [result.index[i] for i in range(len(result)) if (result.pval[i] < (.05/(715*2)))]
signif_indices

[21,
 24,
 34,
 143,
 144,
 145,
 146,
 232,
 233,
 234,
 269,
 302,
 317,
 318,
 321,
 322,
 324,
 325,
 326,
 327,
 329,
 331,
 332,
 336,
 428,
 431,
 512,
 513,
 515]

In [99]:
for q in signif_indices:
    ## get H for some test across many bootstraps
    idx = q

    ## concat bootstrap reps
    reps = []
    for boot in range(tet.params.nboots):
        hils = Hils(tet.database.output, boot=boot, tree=ptre)
        reps.append(hils.get_h_by_idx(idx))
    bootsarr = pd.concat(reps)
    bootsarr.index = range(tet.params.nboots)
    
    # print quartet and plot 
    print "{}, {} | {}, {}".format(*[hils.sidx[idx] for idx in hils.quartets[idx]])
    toyplot.bars(
        np.histogram(bootsarr.Hils, density=True), 
        width=300, height=300, xlabel="Hils");
    toyplot.bars(
        np.histogram(bootsarr.gamma, density=True), 
        width=300, height=300, xlabel="gamma");
    toyplot.bars(
        np.histogram(bootsarr.pval, density=True), 
        width=300, height=300, xlabel="p-value");

    # show head of boots array
    bootsarr.head(10)

29154_superba, 33413_thamno | 30556_thamno, 35855_rex
29154_superba, 33413_thamno | 30556_thamno, 40578_rex
29154_superba, 35855_rex | 30556_thamno, 35236_rex
29154_superba, 33413_thamno | 35236_rex, 35855_rex
29154_superba, 33413_thamno | 35236_rex, 38362_rex
29154_superba, 33413_thamno | 35236_rex, 39618_rex
29154_superba, 33413_thamno | 35236_rex, 40578_rex
30556_thamno, 30686_cyathophylla | 33413_thamno, 38362_rex
30556_thamno, 30686_cyathophylla | 33413_thamno, 39618_rex
30556_thamno, 40578_rex | 30686_cyathophylla, 33413_thamno
30556_thamno, 39618_rex | 32082_przewalskii, 33413_thamno
30556_thamno, 35855_rex | 33413_thamno, 33588_przewalskii
30556_thamno, 35855_rex | 33413_thamno, 41478_cyathophylloides
30556_thamno, 35855_rex | 33413_thamno, 41954_cyathophylloides
30556_thamno, 41478_cyathophylloides | 33413_thamno, 38362_rex
30556_thamno, 41954_cyathophylloides | 33413_thamno, 38362_rex
30556_thamno, 41478_cyathophylloides | 33413_thamno, 39618_rex
30556_thamno, 41954_cyathophy

In [48]:
Davg = []
hilsavg = []
for q in signif_indices:
    idx = q
    reps = []
    for boot in range(tet.params.nboots):
        hils = Hils(tet.database.output, boot=boot, tree=ptre)
        reps.append(hils.get_h_by_idx(idx))
    bootsarr = pd.concat(reps)
    bootsarr.index = range(tet.params.nboots)
    numer = [float(bootsarr.abba[i])-float(bootsarr.baba[i]) for i in range(len(bootsarr))]
    denomer = [float(bootsarr.abba[i])+float(bootsarr.baba[i]) for i in range(len(bootsarr))]
    
    hilsavg.append(sum(bootsarr.Hils)/len(bootsarr.Hils))
    Ds = [(numer[i] / denomer[i]) for i in range(len(numer))]
    Davg.append(sum(Ds)/len(Ds))

In [55]:
toyplot.bars(Davg, 
        width=300, height=300, xlabel="index")
toyplot.bars(hilsavg, width=300, height=300, xlabel="index")

(<toyplot.canvas.Canvas at 0x1138a1e10>,
 <toyplot.coordinates.Cartesian at 0x11428d390>,
 <toyplot.mark.BarMagnitudes at 0x11428d2d0>)

In [63]:
meanDs = sum(Ds)/len(Ds)

In [73]:
sum([(Ds[i] - meanDs)**2 for i in range(len(Ds))])/(len(Ds)-1)**.5 #standard dev

0.07488673040234464

In [86]:
norm.cdf()

0.99999971334842808

In [40]:
numer = [float(bootsarr.abba[i])-float(bootsarr.baba[i]) for i in range(len(bootsarr))]
denomer = [float(bootsarr.abba[i])+float(bootsarr.baba[i]) for i in range(len(bootsarr))]

Ds = [(numer[i] / denomer[i]) for i in range(len(numer))]
sum(Ds)/len(Ds)

-0.22365691859230638

In [87]:
## create a baba object linked to a data file and newick tree
bb = ipa.baba(data="/Users/pmckenz1/Desktop/projects/intro_python/analysis-ipyrad/min4_outfiles/min4.loci", newick="/Users/pmckenz1/Desktop/projects/quartet_proj/analysis-tetrad/pedicularis.tree")

## generate all possible abba-baba tests meeting a set of constraints
bb.generate_tests_from_tree(
    constraint_dict={
        "p4": ["32082_przewalskii", "33588_przewalskii"],
        "p3": ["33413_thamno"],
    })

## run all tests linked to bb 
bb.run(ipyclient)

## save the results table to a csv file
#bb.results_table.to_csv("bb.abba-baba.csv", sep="\t")

## show the results table in notebook
bb.results_table

44 tests generated from tree
[####################] 100%  calculating D-stats  | 0:05:52 |  


Unnamed: 0,dstat,bootmean,bootstd,Z,ABBA,BABA,nloci
0,0.016,0.016,0.028,0.587,376.969,364.844,9392
1,-0.012,-0.014,0.032,0.382,335.812,344.125,9006
2,0.023,0.024,0.031,0.734,359.562,343.312,9114
3,0.075,0.076,0.032,2.329,395.062,340.125,9197
4,0.062,0.063,0.035,1.773,323.125,285.188,7907
5,0.071,0.072,0.033,2.141,380.125,329.938,9062
6,0.172,0.171,0.033,5.222,413.375,292.062,8905
7,0.15,0.15,0.038,3.912,375.25,277.625,8569
8,0.178,0.178,0.037,4.831,398.625,278.25,8679
9,0.225,0.225,0.036,6.227,435.688,275.688,8735


In [88]:
bb.plot(height=800, 
        pct_tree_y=0.15,  
        tree_style='c',
        ewidth=2, 
        alpha=4.,
        style_test_labels={"font-size":"10px"},
        );

In [111]:
bb = ipa.baba(data="/Users/pmckenz1/Desktop/projects/intro_python/analysis-ipyrad/min4_outfiles/min4.loci", newick="/Users/pmckenz1/Desktop/projects/quartet_proj/analysis-tetrad/pedicularis.tree")

## generate all possible abba-baba tests meeting a set of constraints
bb.generate_tests_from_tree(
    constraint_dict={
        "p4": ["32082_przewalskii", "33588_przewalskii"],
        "p3": ["33413_thamno"],
    })

## run all tests linked to bb 
bb.run(ipyclient)

## save the results table to a csv file
#bb.results_table.to_csv("bb.abba-baba.csv", sep="\t")

## show the results table in notebook
bb.results_table

44 tests generated from tree
[####################] 100%  calculating D-stats  | 0:05:04 |  


Unnamed: 0,dstat,bootmean,bootstd,Z,ABBA,BABA,nloci
0,0.016,0.017,0.029,0.564,376.969,364.844,9392
1,-0.012,-0.012,0.031,0.395,335.812,344.125,9006
2,0.023,0.023,0.032,0.723,359.562,343.312,9114
3,0.075,0.076,0.031,2.373,395.062,340.125,9197
4,0.062,0.062,0.035,1.757,323.125,285.188,7907
5,0.071,0.07,0.033,2.132,380.125,329.938,9062
6,0.172,0.172,0.034,5.037,413.375,292.062,8905
7,0.15,0.149,0.037,4.089,375.25,277.625,8569
8,0.178,0.178,0.036,4.981,398.625,278.25,8679
9,0.225,0.226,0.037,6.099,435.688,275.688,8735


In [94]:
bb.plot()

(<toyplot.canvas.Canvas at 0x114476c90>,
 <toyplot.coordinates.Cartesian at 0x114321b10>,
 <ipyrad.plotting.baba_panel_plot.Panel at 0x114476b10>)

In [95]:
aa = ipa.baba(data="/Users/pmckenz1/Desktop/projects/intro_python/analysis-ipyrad/min4_outfiles/min4.loci")
#bb = aa.copy()
cc = aa.copy()

In [96]:
cc.tests = [
    {
     "p4": ["32082_przewalskii", "33588_przewalskii"],
     "p3": ["41954_cyathophylloides"], 
     "p2": ["33413_thamno"], 
     "p1": ["40578_rex"],
    },
    {
     "p4": ["32082_przewalskii", "33588_przewalskii"],
     "p3": ["41478_cyathophylloides"], 
     "p2": ["33413_thamno"], 
     "p1": ["40578_rex"],
    },
]

In [97]:
cc.run(ipyclient)

[####################] 100%  calculating D-stats  | 0:00:16 |  


In [100]:
cc.results_table

cc.tests

[{'p1': ['40578_rex'],
  'p2': ['33413_thamno'],
  'p3': ['41954_cyathophylloides'],
  'p4': ['32082_przewalskii', '33588_przewalskii']},
 {'p1': ['40578_rex'],
  'p2': ['33413_thamno'],
  'p3': ['41478_cyathophylloides'],
  'p4': ['32082_przewalskii', '33588_przewalskii']}]

In [112]:
bb.tests

[{'p1': ['35855_rex', '40578_rex'],
  'p2': ['30556_thamno', '35236_rex'],
  'p3': ['33413_thamno'],
  'p4': ['32082_przewalskii', '33588_przewalskii']},
 {'p1': ['35855_rex'],
  'p2': ['30556_thamno', '35236_rex'],
  'p3': ['33413_thamno'],
  'p4': ['32082_przewalskii', '33588_przewalskii']},
 {'p1': ['40578_rex'],
  'p2': ['30556_thamno', '35236_rex'],
  'p3': ['33413_thamno'],
  'p4': ['32082_przewalskii', '33588_przewalskii']},
 {'p1': ['39618_rex', '38362_rex'],
  'p2': ['30556_thamno', '35236_rex'],
  'p3': ['33413_thamno'],
  'p4': ['32082_przewalskii', '33588_przewalskii']},
 {'p1': ['39618_rex'],
  'p2': ['30556_thamno', '35236_rex'],
  'p3': ['33413_thamno'],
  'p4': ['32082_przewalskii', '33588_przewalskii']},
 {'p1': ['38362_rex'],
  'p2': ['30556_thamno', '35236_rex'],
  'p3': ['33413_thamno'],
  'p4': ['32082_przewalskii', '33588_przewalskii']},
 {'p1': ['35855_rex', '40578_rex'],
  'p2': ['30556_thamno'],
  'p3': ['33413_thamno'],
  'p4': ['32082_przewalskii', '33588_prz

In [122]:
def _loci_to_arr(loci, taxdict, mindict):
    """
    return a frequency array from a loci file for all loci with taxa from 
    taxdict and min coverage from mindict. 
    """

    ## make the array (4 or 5) and a mask array to remove loci without cov
    nloci = len(loci)
    keep = np.zeros(nloci, dtype=np.bool_)
    arr = np.zeros((nloci, 4, 300), dtype=np.float64)
    if len(taxdict) == 5:
        arr = np.zeros((nloci, 6, 300), dtype=np.float64)

    ## if not mindict, make one that requires 1 in each taxon
    if isinstance(mindict, int):
        mindict = {i: mindict for i in taxdict}
    elif isinstance(mindict, dict):
        mindict = {i: mindict[i] for i in taxdict}
    else:
        mindict = {i: 1 for i in taxdict}

    ## raise error if names are not 'p[int]' 
    allowed_names = ['p1', 'p2', 'p3', 'p4', 'p5']
    if any([i not in allowed_names for i in taxdict]):
        raise IPyradError(\
            "keys in taxdict must be named 'p1' through 'p4' or 'p5'")

    ## parse key names
    keys = sorted([i for i in taxdict.keys() if i[0] == 'p'])
    outg = keys[-1]

    ## grab seqs just for the good guys
    for loc in xrange(nloci):

        ## parse the locus
        lines = loci[loc].split("\n")[:-1]
        names = [i.split()[0] for i in lines]
        seqs = np.array([list(i.split()[1]) for i in lines])

        ## check that names cover the taxdict (still need to check by site)
        covs = [sum([j in names for j in taxdict[tax]]) >= mindict[tax] \
                for tax in taxdict]

        ## keep locus
        if all(covs):
            keep[loc] = True

            ## get the refseq
            refidx = np.where([i in taxdict[outg] for i in names])[0]
            refseq = seqs[refidx].view(np.uint8)
            ancestral = np.array([reftrick(refseq, GETCONS2)[:, 0]])

            ## freq of ref in outgroup
            iseq = _reffreq2(ancestral, refseq, GETCONS2)
            arr[loc, -1, :iseq.shape[1]] = iseq 

            ## enter 4-taxon freqs
            if len(taxdict) == 4:
                for tidx, key in enumerate(keys[:-1]):

                    ## get idx of names in test tax
                    nidx = np.where([i in taxdict[key] for i in names])[0]
                    sidx = seqs[nidx].view(np.uint8)
                   
                    ## get freq of sidx
                    iseq = _reffreq2(ancestral, sidx, GETCONS2)
                   
                    ## fill it in 
                    arr[loc, tidx, :iseq.shape[1]] = iseq

            else:

                ## entere p5; and fill it in
                iseq = _reffreq2(ancestral, refseq, GETCONS2) 
                arr[loc, -1, :iseq.shape[1]] = iseq 
                
                ## enter p1
                nidx = np.where([i in taxdict['p1'] for i in names])[0]
                sidx = seqs[nidx].view(np.uint8)
                iseq = _reffreq2(ancestral, sidx, GETCONS2)
                arr[loc, 0, :iseq.shape[1]] = iseq
                
                ## enter p2
                nidx = np.where([i in taxdict['p2'] for i in names])[0]
                sidx = seqs[nidx].view(np.uint8)
                iseq = _reffreq2(ancestral, sidx, GETCONS2)
                arr[loc, 1, :iseq.shape[1]] = iseq
                
                ## enter p3 with p4 masked, and p4 with p3 masked
                nidx = np.where([i in taxdict['p3'] for i in names])[0]
                nidy = np.where([i in taxdict['p4'] for i in names])[0]
                sidx = seqs[nidx].view(np.uint8)
                sidy = seqs[nidy].view(np.uint8)
                xseq = _reffreq2(ancestral, sidx, GETCONS2)
                yseq = _reffreq2(ancestral, sidy, GETCONS2)
                mask3 = xseq != 0
                mask4 = yseq != 0
                xseq[mask4] = 0
                yseq[mask3] = 0
                arr[loc, 2, :xseq.shape[1]] = xseq
                arr[loc, 3, :yseq.shape[1]] = yseq
                
                ## enter p34 
                nidx = nidx.tolist() + nidy.tolist()
                sidx = seqs[nidx].view(np.uint8)
                iseq = _reffreq2(ancestral, sidx, GETCONS2)
                arr[loc, 4, :iseq.shape[1]] = iseq


    ## size-down array to the number of loci that have taxa for the test
    arr = arr[keep, :, :]

    ## size-down sites to 
    arr = masknulls(arr)

    return arr, keep

@numba.jit(nopython=True)
def _reffreq2(ancestral, iseq, consdict):
    ## empty arrays
    freq = np.zeros((1, iseq.shape[1]), dtype=np.float64)
    amseq = np.zeros((iseq.shape[0]*2, iseq.shape[1]), dtype=np.uint8)
    
    ## fill in both copies
    for seq in xrange(iseq.shape[0]):
        for col in xrange(iseq.shape[1]):  

            ## get this base and check if it is hetero
            base = iseq[seq][col]
            who = consdict[:, 0] == base
            
            ## if not hetero then enter it
            if not np.any(who):
                amseq[seq*2][col] = base
                amseq[seq*2+1][col] = base        
            ## if hetero then enter the 2 resolutions
            else:
                amseq[seq*2][col] = consdict[who, 1][0]
                amseq[seq*2+1][col] = consdict[who, 2][0]

    ## amseq may have N or -, these need to be masked
    for i in xrange(amseq.shape[1]):
        ## without N or -
        reduced = amseq[:, i][amseq[:, i] != 9]
        counts = reduced != ancestral[0][i]
        if reduced.shape[0]:
            freq[:, i] = counts.sum() / reduced.shape[0]
        else:
            freq[:, i] = 9
    return freq
@numba.jit(nopython=True)
def masknulls(arr):
    nvarr = np.zeros(arr.shape[0], dtype=np.int8)
    trimarr = np.zeros(arr.shape, dtype=np.float64)
    for loc in xrange(arr.shape[0]):
        nvars = 0
        for site in xrange(arr.shape[2]):
            col = arr[loc, :, site]
            ## mask cols with 9s
            if not np.any(col == 9):
                ## any non-outgroup shows variation?
                ## todo: check whether BBBBA is ever info?
                if np.any(col[:-1] != col[0]):
                    trimarr[loc, :, nvars] = col
                    nvars += 1
        nvarr[loc] = nvars        
    return trimarr[:, :, :nvarr.max()]

In [118]:
from ipyrad.assemble.write_outfiles import reftrick, GETCONS2

In [124]:
test1 = _loci_to_arr(loci, {'p1': ['35855_rex', '40578_rex'],'p2': ['30556_thamno', '35236_rex'],'p3': ['33413_thamno'],'p4': ['32082_przewalskii', '33588_przewalskii']}, 1)

In [287]:
def dstat(inarr, taxdict, mindict=1, nboots=1000, name=0):
    """ private function to perform a single D-stat test"""

    # ## get data as an array from loci file
    # ## if loci-list then parse arr from loci
    if isinstance(inarr, list):
        arr, _ = _loci_to_arr(inarr, taxdict, mindict)
    
    # ## if it's an array already then go ahead
    # elif isinstance(inarr, np.ndarray):
    #     arr = inarr
    # ## if it's a simulation object get freqs from array
    # elif isinstance(inarr, Sim):
    #     arr = _msp_to_arr(inarr, taxdict)

    #elif isinstance(inarr, types.GeneratorType):
    #    arr = _msp_to_arr(inarr, taxdict)
    #elif isinstance(inarr, list):
    #    arr = _msp_to_arr(inarr, taxdict)
    ## get data from Sim object, do not digest the ms generator
    #else:
    #    raise Exception("Must enter either a 'locifile' or 'arr'")

    ## run tests
    if len(taxdict) == 4:

        ## get results
        res, boots, hilsboots = _get_signif_4(arr, nboots)
    
        ## make res into a nice DataFrame
        res = pd.DataFrame(res, 
            columns=[name],
            index=["Dstat", "bootmean", "bootstd", "Z", "ABBA", "BABA", "nloci","gamma_ratio","Hils"])

    else:
        ## get results
        res, boots = _get_signif_5(arr, nboots)
         ## make int a DataFrame
        res = pd.DataFrame(res,
            index=["p3", "p4", "shared"], 
            columns=["Dstat", "bootmean", "bootstd", "Z", "ABxxA", "BAxxA", "nloci"]
            )

    return res.T, boots, hilsboots
@numba.jit(nopython=True)
def _get_signif_4(arr, nboots):
    """
    returns a list of stats and an array of dstat boots. Stats includes
    z-score and two-sided P-value. 
    """
    abba, baba, dst, gamrat,H = _prop_dstat(arr) #new
    boots, hilsboots = _get_boots(arr, nboots)
    estimate, stddev = (boots.mean(), boots.std())
    zscore = 0.
    if stddev:
        zscore = np.abs(dst) / stddev
    stats = [dst, estimate, stddev, zscore, abba, baba, arr.shape[0],gamrat,H] #new
    return np.array(stats), boots, hilsboots
@numba.jit(nopython=True)
def _prop_dstat(arr):
    
    ## numerator
    abba = ((1.-arr[:, 0]) * (arr[:, 1]) * (arr[:, 2]) * (1.-arr[:, 3]))  
    baba = ((arr[:, 0]) * (1.-arr[:, 1]) * (arr[:, 2]) * (1.-arr[:, 3]))
    bbaa = ((arr[:, 0]) * (arr[:, 1]) * (1.-arr[:, 2]) * (1.-arr[:, 3]))
    top = abba - baba
    bot = abba + baba
    gamma = bbaa - baba #new
    oneminusgamma = abba-baba #new
    
    ## get statistic and avoid zero div  
    sbot = bot.sum()
    if  sbot != 0:
        dst = top.sum() / float(sbot)
    else:
        dst = 0
    if float(oneminusgamma.sum()):
        gamrat = gamma.sum() / float(oneminusgamma.sum()) #new
    else:
        gamrat = 0
    if float(arr.shape[0]):
        pbbaa = float(bbaa.sum())/float(arr.shape[0])
        pabba = float(abba.sum())/float(arr.shape[0])
        pbaba = float(baba.sum())/float(arr.shape[0])

        f1 = pbbaa - pbaba
        f2 = pabba - pbaba           
        if not f2:
            f2 = 1. / float(arr.shape[0])
        sigmaf1 = (1. / float(arr.shape[0])) * (pbbaa * (1. - pbbaa) \
            + pbaba * (1. - pbaba) \
            + 2. * pbbaa * pbaba)

        sigmaf2 = (1. / float(arr.shape[0])) * (pabba * (1. - pabba) \
            + pbaba * (1. - pbaba) \
            + 2. * pabba * pbaba)

        covf1f2 = (1. / float(arr.shape[0])) * (pabba * (1. - pbbaa) \
            + pbbaa * pbaba \
            + pabba * pbaba \
            + pbaba * (1. - pbaba)) 

        num = f2 * ((f1 / f2) - 0.)
        p1 = (sigmaf2 * (f1/f2)**2)
        p2 = ((2. * covf1f2 * (f1/f2) + sigmaf1))
        denom = p1 - p2
        if np.sqrt(abs(denom)):
            ## calculate hils
            H = num/np.sqrt(abs(denom))
        else:
            H = 0
    else:
        H = 0
    
    return abba.sum(), baba.sum(), dst, gamrat, H
@numba.jit(nopython=True)
def _get_boots(arr, nboots):
    """
    return array of bootstrap D-stats
    """
    ## hold results (nboots, [dstat, ])
    boots = np.zeros((nboots,))
    hilsboots = np.zeros((nboots,))
    
    ## iterate to fill boots
    for bidx in xrange(nboots):
        ## sample with replacement
        lidx = np.random.randint(0, arr.shape[0], arr.shape[0])
        tarr = arr[lidx]
        _, _, dst, _, H = _prop_dstat(tarr)
        boots[bidx] = dst
        hilsboots[bidx] = H
    
    ## return bootarr
    return boots, hilsboots

In [289]:
testinghils = dstat(inarr=loci,taxdict={'p1': ['35855_rex', '40578_rex'],'p2': ['30556_thamno', '35236_rex'],'p3': ['33413_thamno'],'p4': ['32082_przewalskii', '33588_przewalskii']} )



In [290]:
toyplot.bars(np.histogram(testinghils[2]))

(<toyplot.canvas.Canvas at 0x113786b50>,
 <toyplot.coordinates.Cartesian at 0x11a77b710>,
 <toyplot.mark.BarMagnitudes at 0x11646b610>)

In [204]:
sum(sum(((1.-test1[0][:, 0]) * (test1[0][:, 1]) * (test1[0][:, 2]) * (1.-test1[0][:, 3])) ))
sum(sum(((test1[0][:, 0]) * (test1[0][:, 1]) * (1.-test1[0][:, 2]) * (1.-test1[0][:, 3])) ))




0.0

In [219]:
(((test1[0][:, 0]) * (test1[0][:, 1]) * (1.-test1[0][:, 2]) * (1.-test1[0][:, 3])) - ((test1[0][:, 0]) * (1.-test1[0][:, 1]) * (test1[0][:, 2]) * (1.-test1[0][:, 3]))).sum()
(((1.-test1[0][:, 0]) * (test1[0][:, 1]) * (test1[0][:, 2]) * (1.-test1[0][:, 3]))   - ((test1[0][:, 0]) * (1.-test1[0][:, 1]) * (test1[0][:, 2]) * (1.-test1[0][:, 3])))




-69.0

In [273]:
_prop_dstat(test1[0])

(333.0, 402.0, -0.09387755102040816, 0.6956521739130435, -1.272700099882565)

In [192]:
sum(sum(((1.-test1[0][:, 0]) * (test1[0][:, 1]) * (test1[0][:, 2]) * (1.-test1[0][:, 3])) +((test1[0][:, 0]) * (1.-test1[0][:, 1]) * (test1[0][:, 2]) * (1.-test1[0][:, 3]))))

735.0

In [None]:
((test1[0][:, 0]) * (1.-test1[0][:, 1]) * (test1[0][:, 2]) * (1.-test1[0][:, 3]))

In [114]:
with open("/Users/pmckenz1/Desktop/projects/intro_python/analysis-ipyrad/min4_outfiles/min4.loci", 'r') as infile:
    loci = infile.read().strip().split("|\n")

In [286]:
_get_boots(test1[0],1000)

ZeroDivisionError: division by zero