## Imports

In [277]:
import h5py
import math
import numpy as np
from operator import itemgetter
import scipy.stats as st
from itertools import compress
import re
import random
import itertools
import sys

## Read in sequence data, organize it

In [278]:
# Load the SNP alignment: the first line is dropped as a header, every other
# non-blank line holds one sample as "<name><whitespace><sequence>".
fname = "analysis-ipyrad/min4_outfiles/min4.snps.phy"
with open(fname) as f:
    # remove whitespace characters like `\n` at the end of each line
    snps = [line.strip() for line in f]
# drop the header line; skip blank rows so they do not turn into nameless samples
snps = [line for line in snps[1:] if line]

#read in map (tab-separated, one row per SNP)
fname = "analysis-ipyrad/min4_outfiles/min4.snps.map"
with open(fname) as f:
    snpmap = [line.strip().split('\t') for line in f if line.strip()]
snpmap = np.array(snpmap)
# get rid of inner column, convert to int
reducedmap = snpmap[:,[0,2,3]].astype(int)

# Separate each row into sample name and SNP string at the first run of
# whitespace. The previous fixed-width slice at column 27 only works when the
# name padding happens to be exactly 27 characters wide.
names = []
full_snp_seqs = []
for line in snps:
    name, seq = line.split(None, 1)
    names.append(name)
    full_snp_seqs.append(seq)

# list of integers corresponding to the names (row positions)
namevals = range(len(names))
#namealias = dict(zip(namevals, names))

In [279]:
# Display the sample names parsed from the alignment, in file order
names

['29154_superba',
 '30556_thamno',
 '30686_cyathophylla',
 '32082_przewalskii',
 '33413_thamno',
 '33588_przewalskii',
 '35236_rex',
 '35855_rex',
 '38362_rex',
 '39618_rex',
 '40578_rex',
 '41478_cyathophylloides',
 '41954_cyathophylloides']

In [20]:
# Positions (into `names` / `full_snp_seqs`) of the five samples forming the quintet
taxa_ids = [0, 2, 4, 6, 12]
# Names of those samples, kept in the same order as taxa_ids
fivetaxa = list(map(names.__getitem__, taxa_ids))

In [165]:
# Display the names of the five selected samples
fivetaxa
# NOTE(review): the label below looks like the taxa_ids values with 12 listed first;
# its exact meaning is not established anywhere in this notebook — confirm.
#12_0_2_4_6

['29154_superba',
 '30686_cyathophylla',
 '33413_thamno',
 '35236_rex',
 '41954_cyathophylloides']

In [26]:
# Collect the SNP strings of the selected taxa, in taxa_ids order
tempobj = []
for taxon_index in taxa_ids:
    tempobj.append(full_snp_seqs[taxon_index])

In [151]:
# get sampled snp from each locus for a list of sequences (tempobj), given reduced snp map
#
# For every locus number 1..N (N = locus number in the last map row) the SNP
# columns belonging to that locus are cut out of each selected sequence. A column
# is eligible when every selected taxon carries one of A/C/G/T there and at least
# two different bases occur. One eligible column is drawn at random per locus;
# loci without an eligible column contribute nothing.
# NOTE: the draw uses NumPy's global RNG, which this cell does not seed, so the
# sampled sites differ between runs.
tempobj = [full_snp_seqs[i] for i in taxa_ids]
valid_bases = {'A', 'G', 'C', 'T'}
ind_samples = []
for p in range(int(snpmap[:,0][-1])):
    index = p+1
    which_bases = reducedmap[(reducedmap[:,0] == index),2]
    if len(which_bases) == 0:
        # this locus number has no rows in the map; nothing to sample
        continue
    # slice from the first to the last SNP position of the locus (map positions are treated as 1-based)
    snps_at_locus = [seq[(which_bases[0]-1):which_bases[-1]] for seq in tempobj]
    sample_indices = []
    for i in range(len(snps_at_locus[0])):
        column = [locus_seq[i] for locus_seq in snps_at_locus]
        # all bases unambiguous AND the site is variable among the selected taxa
        if all(base in valid_bases for base in column) and len(set(column)) > 1:
            sample_indices.append(i)
    if sample_indices:
        # index the 1-element result explicitly instead of calling int() on an array
        randombase = int(np.random.choice(sample_indices, 1)[0])
        selectedbases = [locus_seq[randombase] for locus_seq in snps_at_locus]
        ind_samples.append(selectedbases)
# keep a handle on the raw (letter-coded) samples; later cells rebind ind_samples
ind_samples_reset = ind_samples

In [89]:
# Recode the sampled bases as integers (A=0, C=1, G=2, T=3) in a single lookup
# pass instead of repeatedly mixing strings and ints through np.where.
base_codes = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
ind_samples = np.array(
    [[base_codes[base] for base in site] for site in ind_samples_reset],
    dtype=int,
)

# Split the sites by the base found in column 0 and keep columns 1-4.
# Boolean masks keep each result two-dimensional (shape (0, 4)) even when no
# site has that base in column 0; the previous list-then-index form raised
# IndexError in that case.
first_column = ind_samples[:,0]
Asamples = ind_samples[first_column == 0][:,[1,2,3,4]]
Csamples = ind_samples[first_column == 1][:,[1,2,3,4]]
Gsamples = ind_samples[first_column == 2][:,[1,2,3,4]]
Tsamples = ind_samples[first_column == 3][:,[1,2,3,4]]

## Generate quintet matrices

This takes every quintet arrangement and generates a standard quartet matrix for each quintet where the lone quintet taxon takes either A, C, G, or T.

In [226]:
# Build one 16x16 count matrix for every (arrangement, lone-taxon base, pairing).
#
# allmats[k] : 16x16 float array of site-pattern counts
# info[k]    : [taxa ids in the arrangement's order,
#               integer code (0-3 = A,C,G,T) of the base at the first taxon,
#               ids of the other four taxa in the column order used for the matrix]
# Entries are appended arrangement-major, then base, then pairing, so
# k = 12*arrangement + 3*base + pairing.
allmats = []
info = []
# each entry puts a different taxon into the first position; the rest keep their order
reshuffle_fifth = [[0,1,2,3,4],[1,0,2,3,4],[2,0,1,3,4],[3,0,1,2,4],[4,0,1,2,3]]
# column orders for the remaining four taxa: columns 0,1 give the matrix row, columns 2,3 the matrix column
quartet_orders = [[0,1,2,3],[0,2,1,3],[0,3,1,2]]

# recode A,C,G,T -> 0,1,2,3 once; every arrangement is just a column permutation of this
base_codes = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
coded_samples = np.array(
    [[base_codes[base] for base in site] for site in ind_samples_reset],
    dtype=int,
)

for order in reshuffle_fifth:

    current_taxa_ids = [taxa_ids[i] for i in order]

    ind_samples = coded_samples[:, order]
    first_column = ind_samples[:, 0]

    # sites grouped by the base at the first taxon; masks keep shape (0, 4) for
    # a base that never occurs there, so the matrix for it is simply all zeros
    Asamples, Csamples, Gsamples, Tsamples = snparrays = [
        ind_samples[first_column == code][:, [1,2,3,4]] for code in range(4)
    ]

    for q, samples in enumerate(snparrays):
        for p in quartet_orders:
            arr0123 = samples[:, p]
            # order across matrix is 00,01,02,03,10,11,12,13,20,21,22,23,30,31,32,33
            # i.e. index = 4*first_base + second_base
            rownum = 4*arr0123[:,0] + arr0123[:,1]
            colnum = 4*arr0123[:,2] + arr0123[:,3]
            fullmat0123 = np.zeros(shape=(16,16))
            # unbuffered add so repeated (row, col) pairs are each counted
            np.add.at(fullmat0123, (rownum, colnum), 1)
            allmats.append(fullmat0123)
            info.append([current_taxa_ids,q,np.array(current_taxa_ids[1:5])[p]])


In [221]:
# Display the integer-coded samples with their columns reordered by one arrangement.
# NOTE(review): this cell originally read `reshuffle_fifth[]`, which is a syntax
# error. The arrangement index that was intended cannot be recovered from the
# notebook, so the first entry is used here — confirm.
ind_samples[:,reshuffle_fifth[0]]

array([[2, 2, 2, 1, 2],
       [3, 3, 3, 1, 3],
       [3, 1, 3, 3, 3],
       ..., 
       [0, 0, 2, 2, 2],
       [0, 2, 2, 2, 2],
       [2, 2, 0, 2, 2]])

In [164]:
# Print the bookkeeping entry for the first matrix, then the matrix itself as integer counts
first_info, first_matrix = info[0], allmats[0]
print(first_info)
print(first_matrix.astype(int))

[[0, 2, 4, 6, 12], 0, array([ 2,  4,  6, 12])]
[[  0  36 124  63  33   2   0   0  90   0   1   1  43   0   0   0]
 [ 33   0   0   0  38  18   0   1   0   0   0   0   0   1   0   0]
 [ 88   0   2   0   1   0   0   0 116   0  45   2   0   0   0   0]
 [ 41   0   1   2   0   0   0   0   0   0   0   0  49   0   0  13]
 [ 26   3   0   0   0   1   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   2  68   1   2   0   0   0   0   0   1   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   1   1   0]
 [101   0  12   0   0   0   0   0   1   0   0   0   0   0   0   0]
 [  0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0  12   0 142   1   0   0   0   0]
 [  1   0   0   0   0   0   0   0   0   0   0   0   1   0   1   0]
 [ 38   0   1   4   0   0   0   1   0   0   0   0   2   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  1   0   0  

In [227]:
# Display the bookkeeping entry recorded alongside every matrix in allmats
info

[[[0, 2, 4, 6, 12], 0, array([ 2,  4,  6, 12])],
 [[0, 2, 4, 6, 12], 0, array([ 2,  6,  4, 12])],
 [[0, 2, 4, 6, 12], 0, array([ 2, 12,  4,  6])],
 [[0, 2, 4, 6, 12], 1, array([ 2,  4,  6, 12])],
 [[0, 2, 4, 6, 12], 1, array([ 2,  6,  4, 12])],
 [[0, 2, 4, 6, 12], 1, array([ 2, 12,  4,  6])],
 [[0, 2, 4, 6, 12], 2, array([ 2,  4,  6, 12])],
 [[0, 2, 4, 6, 12], 2, array([ 2,  6,  4, 12])],
 [[0, 2, 4, 6, 12], 2, array([ 2, 12,  4,  6])],
 [[0, 2, 4, 6, 12], 3, array([ 2,  4,  6, 12])],
 [[0, 2, 4, 6, 12], 3, array([ 2,  6,  4, 12])],
 [[0, 2, 4, 6, 12], 3, array([ 2, 12,  4,  6])],
 [[2, 0, 4, 6, 12], 0, array([ 0,  4,  6, 12])],
 [[2, 0, 4, 6, 12], 0, array([ 0,  6,  4, 12])],
 [[2, 0, 4, 6, 12], 0, array([ 0, 12,  4,  6])],
 [[2, 0, 4, 6, 12], 1, array([ 0,  4,  6, 12])],
 [[2, 0, 4, 6, 12], 1, array([ 0,  6,  4, 12])],
 [[2, 0, 4, 6, 12], 1, array([ 0, 12,  4,  6])],
 [[2, 0, 4, 6, 12], 2, array([ 0,  4,  6, 12])],
 [[2, 0, 4, 6, 12], 2, array([ 0,  6,  4, 12])],
 [[2, 0, 4, 6, 12], 

In [229]:
# 57, 54, 51, 48 are true configs
# Grand total of all counts in those four matrices
np.sum([allmats[k] for k in (57, 54, 51, 48)])

5589.0

In [239]:
# Grand total of all counts in matrices 58, 55, 52 and 49
np.sum([allmats[k] for k in (58, 55, 52, 49)])

5589.0

In [238]:
# Grand total of all counts in matrices 45, 42, 39 and 36
np.sum([allmats[k] for k in (45, 42, 39, 36)])

5589.0

In [205]:
# Square root of the summed squares of singular values at positions 10-14 of matrix 57.
# NOTE(review): a 16x16 matrix has 16 singular values, so the slice 10:15 leaves
# out the last one — confirm whether 10:16 was intended.
singular_values = np.linalg.svd(allmats[57])[1]
math.sqrt(np.sum(np.square(singular_values[10:15])))

1.4143611334878379

In [241]:
# Same score as for the single matrix, computed for each of the first 60 matrices.
# NOTE(review): slice 10:15 skips the 16th singular value — confirm that is intended.
tail_scores = []
for matrix_index in range(60):
    singular_values = np.linalg.svd(allmats[matrix_index])[1]
    tail_scores.append(math.sqrt(np.sum(np.square(singular_values[10:15]))))
tail_scores

[1.7320921829917224,
 1.7321343096335553,
 0.015677839488573025,
 1.7221593457235986,
 1.7232261592208047,
 0.035684433709404335,
 0.00012072328131176196,
 0.01177764544159291,
 0.016814304722510654,
 1.4143611334878379,
 1.4144046946387008,
 0.0347707504542551,
 0.9993054332650212,
 0.9994424312829834,
 0.0011637667783448854,
 0.156579913188151,
 0.15101072833279816,
 0.024156781336025227,
 1.0008157657997336,
 1.0006507827546314,
 7.097372254211233e-15,
 1.4095374505869671,
 0.9998696263717481,
 0.06202794632802843,
 0.0744512764822138,
 0.2750822134166317,
 0.8245842867838327,
 0.00011514335777699925,
 0.04080083479513208,
 0.030167812939555855,
 0.009867092655924396,
 0.1730820462507168,
 0.11382751486828117,
 0.019737589691062027,
 0.03749094882069752,
 0.041805753928708146,
 0.01545715592691087,
 0.23039635948486478,
 0.09377886816603721,
 0.054905721694836394,
 0.007037149510397565,
 0.6998464143715751,
 0.008857773547582269,
 0.010971601562657793,
 0.00038108664226244266,
 0.01

In [271]:
# Score every matrix once (previously all 60 SVDs were recomputed for each of the
# five groups), then print three ratios per group of 12 consecutive matrices.
# NOTE(review): slice 10:15 skips the 16th singular value — confirm that is intended.
svd_scores = [math.sqrt(np.sum(np.square(np.linalg.svd(allmats[i])[1][10:15]))) for i in range(60)]

# Within a group, position j holds pairing j % 3 (positions j, j+3, j+6, j+9 are
# the same pairing for the four bases). For each pairing k the printed value is
#   (sum of scores of the other two pairings) / (sum of scores of pairing k).
for group_start in range(0, 60, 12):
    testing = svd_scores[group_start:group_start + 12]
    for k in range(3):
        other_positions = [j for j in range(12) if j % 3 != k]
        pairing_positions = [j for j in range(12) if j % 3 == k]
        print(sum([testing[j] for j in other_positions])/sum([testing[j] for j in pairing_positions]))

1.02377553722
1.01846504444
94.7113086697
0.908049757783
1.1595105378
76.9012926263
14.7530493538
2.11709312663
0.624145167891
15.7574802277
2.30388724337
0.568255031858
386.060794088
1.00637671408
1.00398577383


In [276]:
# Score every matrix once (previously all 60 SVDs were recomputed for each of the
# five groups), then print one list of three ratios per group of 12 matrices.
# NOTE(review): slice 10:15 skips the 16th singular value — confirm that is intended.
svd_scores = [math.sqrt(np.sum(np.square(np.linalg.svd(allmats[i])[1][10:15]))) for i in range(60)]

# Within a group, position j holds pairing j % 3 (positions j, j+3, j+6, j+9 are
# the same pairing for the four bases). Entry k of each printed list is
#   (sum of scores of the other two pairings) / (sum of scores of pairing k).
for group_start in range(0, 60, 12):
    testing = svd_scores[group_start:group_start + 12]
    ratios = []
    for k in range(3):
        other_positions = [j for j in range(12) if j % 3 != k]
        pairing_positions = [j for j in range(12) if j % 3 == k]
        ratios.append(sum([testing[j] for j in other_positions])/sum([testing[j] for j in pairing_positions]))
    print(ratios)

[1.0237755372208388, 1.0184650444444696, 94.71130866967927]
[0.9080497577834, 1.15951053779888, 76.90129262629839]
[14.753049353793648, 2.117093126629455, 0.62414516789139]
[15.757480227702116, 2.3038872433715274, 0.5682550318578424]
[386.06079408847046, 1.006376714080884, 1.0039857738327647]


In [272]:
# Display the bookkeeping entries of the first 12 matrices
info[0:12]

[[[0, 2, 4, 6, 12], 0, array([ 2,  4,  6, 12])],
 [[0, 2, 4, 6, 12], 0, array([ 2,  6,  4, 12])],
 [[0, 2, 4, 6, 12], 0, array([ 2, 12,  4,  6])],
 [[0, 2, 4, 6, 12], 1, array([ 2,  4,  6, 12])],
 [[0, 2, 4, 6, 12], 1, array([ 2,  6,  4, 12])],
 [[0, 2, 4, 6, 12], 1, array([ 2, 12,  4,  6])],
 [[0, 2, 4, 6, 12], 2, array([ 2,  4,  6, 12])],
 [[0, 2, 4, 6, 12], 2, array([ 2,  6,  4, 12])],
 [[0, 2, 4, 6, 12], 2, array([ 2, 12,  4,  6])],
 [[0, 2, 4, 6, 12], 3, array([ 2,  4,  6, 12])],
 [[0, 2, 4, 6, 12], 3, array([ 2,  6,  4, 12])],
 [[0, 2, 4, 6, 12], 3, array([ 2, 12,  4,  6])]]