In [92]:
import numpy as np
#import ipyrad as ip
#import ipyrad.analysis as ipa
#import ipyparallel as ipp
import h5py
import re
import random
from itertools import compress
import itertools
import math
from operator import itemgetter
import sys

In [118]:
# Integer code assigned to each unambiguous base; any other character
# (N, gaps, ambiguity codes, lowercase, ...) is treated as unusable.
_BASE_CODES = {'A': 0, 'G': 1, 'C': 2, 'T': 3}

# The three ways of splitting a quartet into two pairs. The label is the
# dataset name suffix; the tuple is the column order applied to the quartet.
_QUARTET_CONFIGS = (('0123', (0, 1, 2, 3)),
                    ('0213', (0, 2, 1, 3)),
                    ('0312', (0, 3, 1, 2)))


def _read_stripped_lines(path):
    """Return every line of `path` with leading/trailing whitespace removed."""
    with open(path) as handle:
        return [line.strip() for line in handle]


def _pair_count_matrix(codes, order):
    """Tally site patterns of one quartet pairing into a 16x16 float matrix.

    `codes` is an (n_sites, 4) integer array of base codes in 0..3 and
    `order` gives the column order (first pair, then second pair).
    Row/column index of a pair (x, y) is 4*x + y, i.e. the order across the
    matrix is 00,01,02,03,10,11,...,33.
    """
    first, second, third, fourth = order
    rows = codes[:, first] * 4 + codes[:, second]
    cols = codes[:, third] * 4 + codes[:, fourth]
    counts = np.zeros(shape=(16, 16))
    # np.add.at accumulates correctly when the same (row, col) cell occurs
    # more than once, unlike `counts[rows, cols] += 1`.
    np.add.at(counts, (rows, cols), 1)
    return counts


def save_quartet_matrices(snpsfile, mapfile, outputfile, name_width=27, seed=None):
    """Write 16x16 site-pattern count matrices for every quartet of samples.

    One SNP column is drawn at random from each locus. For every combination
    of four samples, the loci at which all four sampled bases are one of
    A/G/C/T are tallied into three 16x16 matrices, one per way of pairing
    the quartet, and stored in an HDF5 file as
    ``"<i>_<j>_<k>_<l>/0123"``, ``".../0213"`` and ``".../0312"`` where
    i<j<k<l are the zero-based positions of the samples in `snpsfile`.

    Parameters
    ----------
    snpsfile : str
        Path to the SNP alignment. The first line is a header and is
        discarded; on every other non-blank line the first `name_width`
        characters are the sample name and the remainder is the sequence.
    mapfile : str
        Path to a tab-separated map with at least four columns. Column 1 is
        the 1-based locus index and column 4 is the 1-based position of the
        SNP in the sequence string. The positions of one locus are assumed
        to be listed in increasing, contiguous order (only the first and
        last are used to delimit the locus).
    outputfile : str
        Path of the HDF5 file to create (an existing file is overwritten).
    name_width : int, optional
        Number of leading characters reserved for the sample name.
    seed : optional
        If given, a private ``random.Random(seed)`` drives the per-locus
        draw so the output is reproducible. If None, the global `random`
        module state is used.

    Returns
    -------
    None
    """
    # --- read the alignment: drop the header, ignore blank lines
    sample_lines = [line for line in _read_stripped_lines(snpsfile)[1:] if line]
    sequences = [line[name_width:] for line in sample_lines]
    n_samples = len(sequences)

    # --- read the map: only the locus index and the SNP position are needed
    map_rows = [line.split('\t') for line in _read_stripped_lines(mapfile) if line]
    locus_ids = np.array([int(row[0]) for row in map_rows])
    positions = np.array([int(row[3]) for row in map_rows])

    rng = random if seed is None else random.Random(seed)

    # --- sample one SNP column per locus and encode it as integers
    coded_rows = []
    for locus in range(1, int(locus_ids[-1]) + 1):
        locus_positions = positions[locus_ids == locus]
        if len(locus_positions) == 0:
            # locus index not present in the map: nothing to sample
            continue
        start, stop = locus_positions[0] - 1, locus_positions[-1]
        offset = rng.randint(0, len(sequences[0][start:stop]) - 1)
        coded_rows.append([_BASE_CODES.get(seq[start:stop][offset], -1)
                           for seq in sequences])
    site_codes = np.array(coded_rows, dtype=int)

    # number of 4-combinations, computed without materialising them
    num_quartets = (n_samples * (n_samples - 1) * (n_samples - 2)
                    * (n_samples - 3)) // 24

    # The context manager guarantees the file is closed even if a write fails.
    with h5py.File(outputfile, "w") as all_matrices:
        quartets = itertools.combinations(range(n_samples), 4)
        for q, quartet in enumerate(quartets):
            quartet_codes = site_codes[:, list(quartet)]
            # keep only loci where all four sampled bases are A/G/C/T
            usable = (quartet_codes >= 0).all(axis=1)
            quartet_codes = quartet_codes[usable]

            group = '_'.join([str(i) for i in quartet])
            for label, order in _QUARTET_CONFIGS:
                all_matrices.create_dataset(
                    group + '/' + label,
                    data=_pair_count_matrix(quartet_codes, order),
                    chunks=True)

            # write out progress
            sys.stdout.write('\r' + "{0:.2f}".format(float(q) * 100 / float(num_quartets)) + '%')

    sys.stdout.write('\r' + 'Done.')

In [None]:
# Build the quartet count matrices for the min4 assembly and store them in
# all_matrices.hdf5 (paths are relative to the notebook's working directory).
save_quartet_matrices(
    "analysis-ipyrad/min4_outfiles/min4.snps.phy",  # snpsfile
    "analysis-ipyrad/min4_outfiles/min4.snps.map",  # mapfile
    "all_matrices.hdf5",                            # outputfile
)