## Imports

In [2]:
import h5py
import math
import numpy as np
from operator import itemgetter
import scipy.stats as st
from itertools import compress
import re
import random
import itertools
import sys

## Functions

In [1]:
def calcHils(invmat, Nreq = 10, returnf = False, returnp = False, returnall = False,returnnum = False):
    """Compute the Hils statistic from a 16x16 matrix of site-pattern counts.

    A pair of base codes (a, b), each in 0..3, is addressed as index
    ``4*a + b``; ``invmat[row, col]`` is read with that mapping for both the
    row pair and the column pair.  Three pattern classes are summed over all
    ordered pairs of distinct codes i != j:

    * iijj -- row pair (i, i), column pair (j, j)
    * ijji -- row pair (i, j), column pair (j, i)
    * ijij -- row pair (i, j), column pair (i, j)

    Parameters
    ----------
    invmat : 2-D array supporting ``invmat[r, c]`` indexing (e.g. numpy array)
        Count matrix of shape 16x16.
    Nreq : number
        The total count must be strictly greater than this value.
    returnf, returnp, returnall, returnnum : bool
        Select an alternative list-valued return (checked in this order, and
        only after the early-exit checks below have passed).

    Returns
    -------
    str or list
        * A diagnostic message string when no ijij/ijji patterns exist, when
          the total count is <= ``Nreq``, or when ijij is the most frequent
          pattern class.
        * ``[H, f1, f2]`` if ``returnf``; ``[H, p_iijj, p_ijji, p_ijij]`` if
          ``returnp``; ``[H, f1, f2, p_iijj, p_ijji, p_ijij]`` if
          ``returnall``; ``[num_iijj, num_ijji, num_ijij]`` if ``returnnum``.
        * Otherwise ``str(H)``, prefixed with ``'*'`` when the ijij and ijji
          counts are equal (the case in which f2 had to be nudged off zero).

    Raises
    ------
    ValueError
        From ``math.sqrt`` if the variance expression is negative.
    """
    # Every ordered pair of distinct base codes, in the same order the counts
    # were originally written out by hand.
    distinct = [(i, j) for i in range(4) for j in range(4) if i != j]

    num_iijj = sum(invmat[4*i + i, 4*j + j] for i, j in distinct)
    num_ijji = sum(invmat[4*i + j, 4*j + i] for i, j in distinct)
    num_ijij = sum(invmat[4*i + j, 4*i + j] for i, j in distinct)

    if (num_ijij == 0 and num_ijji == 0):
        return("No ijij or ijji are present in data (not enough data)")
    N = sum(map(sum, invmat))
    if (N <= Nreq):
        return("Not enough snps.")
    # calculate probability, add .05 to counts in case some of them are 0
    p_iijj = (num_iijj + .05)/N
    p_ijji = (num_ijji + .05)/N
    p_ijij = (num_ijij + .05)/N

    if (p_ijij > max([p_iijj,p_ijji])):
        return("Parental taxa are more closely related than hybrid. Discard this.")

    f1 = p_iijj - p_ijij
    f2 = p_ijji - p_ijij
    if not(f2):
        # f2 is exactly zero when the ijji and ijij counts are equal; add one
        # pseudo-observation to ijji so the ratio below is defined.
        p_ijji = (num_ijji + 1 + .05)/N
        f2 = p_ijji - p_ijij
    rat_f1_f2 = f1/f2

    var_f1 = (1./N) * ( p_iijj*(1-p_iijj) + p_ijij*(1-p_ijij) + 2*p_iijj*p_ijij )
    var_f2 = (1./N) * ( p_ijji*(1-p_ijji) + p_ijij*(1-p_ijij) + 2*p_ijji*p_ijij )

    cov_f1_f2 = (1./N) * ( -p_iijj*p_ijji + p_iijj*p_ijij + p_ijji*p_ijij + p_ijij*(1-p_ijij))

    H = abs(f2 * rat_f1_f2) / math.sqrt( var_f2*(rat_f1_f2**2) - 2*cov_f1_f2*rat_f1_f2 + var_f1 )
    if returnf:
        return [H, f1, f2]
    if returnp:
        return [H, p_iijj,p_ijji,p_ijij]
    if returnall:
        return [H, f1, f2, p_iijj,p_ijji,p_ijij]
    if returnnum:
        return [num_iijj,num_ijji,num_ijij]
    if(num_ijij-num_ijji == 0):
        # flag results that relied on the pseudo-observation above
        return('*'+str(H))
    else:
        return str(H)
def calcp(z):
    """Return the two-sided standard-normal p-value for the statistic ``z``."""
    # survival function of |z| is the upper tail; double it for both tails
    upper_tail = st.norm.sf(abs(z))
    return upper_tail * 2
def isfloat(value):
    """Return True if ``float(value)`` succeeds, False if it raises ValueError."""
    try:
        float(value)
    except ValueError:
        return False
    else:
        return True

# Read in snp data

In [3]:
# Load the SNP alignment: one stripped string per line of the .phy file.
fname = "analysis-ipyrad/min4_outfiles/min4.snps.phy"
with open(fname) as f:
    snps = [x.strip() for x in f]
# discard the first line (presumably the PHYLIP header -- confirm against the file)
del snps[0]

# Load the SNP map: tab-separated rows, held as a 2-D array of strings.
fname = "analysis-ipyrad/min4_outfiles/min4.snps.map"
with open(fname) as f:
    snpmap = np.array([row.strip().split('\t') for row in f])
# keep columns 0, 2 and 3 only (drops column 1) and convert them to int
reducedmap = snpmap[:, [0, 2, 3]].astype(int)

# The first 27 characters of each alignment line hold the sample name
# (space padding removed); `namevals` numbers the samples in file order.
names = [line[:27].replace(" ", "") for line in snps]
namevals = range(len(names))
#namealias = dict(zip(namevals, names))

# Everything after the 27-character name field is the sequence itself.
full_snp_seqs = [line[27:] for line in snps]

In [150]:
# Display the sample name stored at index 8 of `names`.
names[8]

'38362_rex'

In [169]:
# Treat each taxon of `tested4` as the outgroup in turn.  For every choice,
# sample one usable SNP per locus, score the three column orderings with
# calcHils, and record the ordering with the best Hils statistic in `results`.
tested4 = [3,1,4,6]
results = []
# Row z lists positions into `tested4` with the outgroup candidate first.
theouts = [[0,1,2,3],[1,0,2,3],[2,0,1,3],[3,0,1,2]]
# Column orders applied to the sampled sites.  When reporting, position 0 is
# the outgroup, position 2 the hybrid and positions 1 and 3 the two parents.
possible_configs = [[0,1,2,3],[0,2,1,3],[0,1,3,2]]
base_codes = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

for z in range(4):
    loop = 0  # the former single-pass inner loop index; kept for the progress print
    allcombns = [x for q,x in enumerate(tested4) if q != z]
    currentcombn = allcombns
    current_outgroup = theouts[z]
    currentarr = [tested4[i] for i in current_outgroup]
    outgroup = currentarr[0]
    tempobj = [full_snp_seqs[theseq] for theseq in currentarr]

    # Pick one site per locus at random among the sites where all four taxa
    # carry an unambiguous base (A/C/G/T) and at least two bases differ.
    ind_samples = []
    for p in range(int(snpmap[:,0][-1])):
        index = p+1
        which_bases = reducedmap[(reducedmap[:,0] == index),2]
        if len(which_bases) == 0:
            # no map rows carry this locus number, so there is nothing to slice
            continue
        snps_at_locus = [seq[(which_bases[0]-1):which_bases[-1]] for seq in tempobj]
        sample_indices = []
        for i in range(len(snps_at_locus[0])):
            site = [seq[i] for seq in snps_at_locus]
            if all(base in base_codes for base in site) and len(set(site)) > 1:
                sample_indices.append(i)
        if sample_indices:
            # size omitted so a scalar is drawn (int() of a 1-element array is deprecated)
            randombase = int(np.random.choice(sample_indices))
            selectedbases = [seq[randombase] for seq in snps_at_locus]
            ind_samples.append(selectedbases)

    allhils = []
    if ind_samples:
        # Encode bases as 0..3.  NOTE: this rebinds `snps`, which previously
        # held the raw alignment lines; kept because the cell always did so.
        snps = np.array([[base_codes[base] for base in row] for row in ind_samples])
        finalsnps = snps

        # One 16x16 count matrix per column order: row index 4*a + b from the
        # first two columns, column index 4*c + d from the last two.
        fullmats = []
        for config in possible_configs:
            arr = finalsnps[:, config]
            fullmat = np.zeros(shape=(16,16))
            np.add.at(fullmat, (4*arr[:,0] + arr[:,1], 4*arr[:,2] + arr[:,3]), 1)
            fullmats.append(fullmat)
        fullmat0123, fullmat0213, fullmat0132 = fullmats
        allhils = [calcHils(fullmat) for fullmat in fullmats]

    # Rank candidates numerically.  Taking max() of the strings themselves
    # orders '10.2' below '9.8'.  Unstarred values still outrank starred ones,
    # as they did under string comparison ('*' sorts before every digit).
    candidates = []
    for w, hils in enumerate(allhils):
        bare = hils.replace('*','')
        if isfloat(bare):
            candidates.append((not hils.startswith('*'), float(bare), w))

    if candidates:
        unstarred, best_value, best_w = max(candidates, key=lambda c: c[:2])
        printed_results = allhils[best_w]
        the_config = [possible_configs[best_w]]
        star = '' if unstarred else '*'
        results.append('Outgroup: '+str(outgroup)+' Hybrid: '+ str(currentcombn[the_config[0][2]-1])+ ' Parents: '+str(currentcombn[the_config[0][1]-1])+' '+str(currentcombn[the_config[0][3]-1])+ ' Hils: '+str(printed_results)+' Pvalue: '+star+ str(calcp(best_value)))
    else:
        # No ordering produced a number (or no site could be sampled); record
        # that instead of failing on an empty configuration list.
        printed_results = ("No answer")
        the_config = []
        results.append('Outgroup: '+str(outgroup)+' Hils: '+printed_results)
    print(loop)


0
0
0
0


In [170]:
# Display the result strings currently held in `results`.
results

['Outgroup: 3 Hybrid: 1 Parents: 4 6 Hils: 3.82204918974 Pvalue: 0.000132347280582',
 'Outgroup: 1 Hybrid: 3 Parents: 4 6 Hils: 3.26605115988 Pvalue: 0.00109058544413',
 'Outgroup: 4 Hybrid: 6 Parents: 3 1 Hils: 4.00274534336 Pvalue: 6.26116846757e-05',
 'Outgroup: 6 Hybrid: 4 Parents: 3 1 Hils: 2.97675288547 Pvalue: 0.00291318716204']

In [168]:
# Display the result strings currently held in `results`.
results

['Outgroup: 3 Hybrid: 1 Parents: 4 6 Hils: 3.27983995851 Pvalue: 0.00103865988604',
 'Outgroup: 1 Hybrid: 3 Parents: 4 6 Hils: 2.98059614159 Pvalue: 0.00287687917311',
 'Outgroup: 4 Hybrid: 6 Parents: 3 1 Hils: 4.21111656955 Pvalue: 2.54111571583e-05',
 'Outgroup: 6 Hybrid: 4 Parents: 3 1 Hils: 3.69646288074 Pvalue: 0.000218624207353']

In [166]:
# Display the result strings currently held in `results`.
results

['Outgroup: 3 Hybrid: 1 Parents: 4 6 Hils: 3.8068659738 Pvalue: 0.000140738965773',
 'Outgroup: 1 Hybrid: 3 Parents: 4 6 Hils: 3.97153271904 Pvalue: 7.14116902682e-05',
 'Outgroup: 4 Hybrid: 6 Parents: 3 1 Hils: 3.83050788597 Pvalue: 0.00012787903467',
 'Outgroup: 6 Hybrid: 4 Parents: 3 1 Hils: 3.88633987512 Pvalue: 0.000101767000059']

In [154]:
# Display the sample names stored at indices 6 through 9 of `names`.
list(itemgetter(6, 7, 8, 9)(names))

['35236_rex', '35855_rex', '38362_rex', '39618_rex']

In [112]:
# Scratch check: show `tested4` with the element at position `i` left out.
tested4 = [1, 6, 7, 9]
i = 2
tested4[:i] + tested4[i + 1:]

[1, 6, 9]

## Name outgroup and tested taxa

In [4]:
#Outgroup: 3, Tested: 1, 4, 6, 7, 8, 9, 10
outgroup = 3
# every 3-element combination of the tested taxa, as a list of tuples
allcombns = list(itertools.combinations((1, 4, 6, 7, 8, 9, 10), 3))

## Run hybridization test

In [92]:
# For each selected combination of three tested taxa (plus the fixed
# `outgroup`), sample one usable SNP per locus, score the three column
# orderings with calcHils, and record the best-scoring ordering in `results`.
results = []
# Column orders applied to the sampled sites.  When reporting, position 0 is
# the outgroup, position 2 the hybrid and positions 1 and 3 the two parents.
possible_configs = [[0,1,2,3],[0,2,1,3],[0,1,3,2]]
base_codes = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
#for loop in range(len(allcombns)):
for loop in [0]:
    currentcombn = allcombns[loop]
    # Built with a comprehension: `[outgroup] + map(...)` fails on Python 3,
    # where map() returns an iterator rather than a list.
    tempobj = [full_snp_seqs[taxon] for taxon in [outgroup] + [int(t) for t in currentcombn]]

    # Pick one site per locus at random among the sites where all four taxa
    # carry an unambiguous base (A/C/G/T) and at least two bases differ.
    ind_samples = []
    for p in range(int(snpmap[:,0][-1])):
        index = p+1
        which_bases = reducedmap[(reducedmap[:,0] == index),2]
        if len(which_bases) == 0:
            # no map rows carry this locus number, so there is nothing to slice
            continue
        snps_at_locus = [seq[(which_bases[0]-1):which_bases[-1]] for seq in tempobj]
        sample_indices = []
        for i in range(len(snps_at_locus[0])):
            site = [seq[i] for seq in snps_at_locus]
            if all(base in base_codes for base in site) and len(set(site)) > 1:
                sample_indices.append(i)
        if sample_indices:
            # size omitted so a scalar is drawn (int() of a 1-element array is deprecated)
            randombase = int(np.random.choice(sample_indices))
            selectedbases = [seq[randombase] for seq in snps_at_locus]
            ind_samples.append(selectedbases)

    allhils = []
    if ind_samples:
        # Encode bases as 0..3.  NOTE: this rebinds `snps`, which previously
        # held the raw alignment lines; kept because the cell always did so.
        snps = np.array([[base_codes[base] for base in row] for row in ind_samples])
        finalsnps = snps

        # One 16x16 count matrix per column order: row index 4*a + b from the
        # first two columns, column index 4*c + d from the last two.
        fullmats = []
        for config in possible_configs:
            arr = finalsnps[:, config]
            fullmat = np.zeros(shape=(16,16))
            np.add.at(fullmat, (4*arr[:,0] + arr[:,1], 4*arr[:,2] + arr[:,3]), 1)
            fullmats.append(fullmat)
        fullmat0123, fullmat0213, fullmat0132 = fullmats
        allhils = [calcHils(fullmat) for fullmat in fullmats]

    # Rank candidates numerically.  Taking max() of the strings themselves
    # orders '10.2' below '9.8'.  Unstarred values still outrank starred ones,
    # as they did under string comparison ('*' sorts before every digit).
    candidates = []
    for w, hils in enumerate(allhils):
        bare = hils.replace('*','')
        if isfloat(bare):
            candidates.append((not hils.startswith('*'), float(bare), w))

    if candidates:
        unstarred, best_value, best_w = max(candidates, key=lambda c: c[:2])
        printed_results = allhils[best_w]
        the_config = [possible_configs[best_w]]
        # A starred statistic cannot go through float() directly, so the
        # p-value is computed from the bare number and carries the star too.
        star = '' if unstarred else '*'
        results.append('Outgroup: '+str(outgroup)+' Hybrid: '+ str(currentcombn[the_config[0][2]-1])+ ' Parents: '+str(currentcombn[the_config[0][1]-1])+' '+str(currentcombn[the_config[0][3]-1])+ ' Hils: '+str(printed_results)+' Pvalue: '+star+ str(calcp(best_value)))
    else:
        # No ordering produced a number (or no site could be sampled); record
        # that instead of failing on an empty configuration list.
        printed_results = ("No answer")
        the_config = []
        results.append('Outgroup: '+str(outgroup)+' Hils: '+printed_results)
    print(loop)

0


## Write to a file

In [326]:
# Write a numbered key of the first 13 sample names ("index: name"), followed
# by one line per entry of `results`.
key = [str(i) + ': ' + name for i, name in enumerate(names[:13])]
# the context manager closes the file even if one of the writes fails
with open('pedic_hybridization.txt', 'w') as thefile:
    for item in key:
        thefile.write("%s\n" % item)
    for item in results:
        thefile.write("%s\n" % item)