In [1]:
def dump_qmc(quartets, tempfiledir):
    """
    Write quartets to ``<tempfiledir>/quartets.txt`` as input for QMC.

    Each quartet ``(a, b, c, d)`` becomes one line of the form ``a,b|c,d``.
    Quartets that had no information available (i.e., no SNPs) are stored
    as the placeholder 0,0,0,0 and are excluded here from the output.

    Parameters
    ----------
    quartets : iterable of length-4 sequences
        The quartets to write, in output order.
    tempfiledir : str
        Directory in which ``quartets.txt`` is created (overwritten if it
        already exists).
    """
    qmc_input = os.path.join(tempfiledir, "quartets.txt")

    ## format for qmc, dropping the all-zero "no information" placeholders
    lines = ["{},{}|{},{}".format(*quart) for quart in quartets if any(quart)]

    with open(qmc_input, 'w') as qdump:
        ## terminate every line individually so that an empty quartet set
        ## yields an empty file rather than a single blank line
        qdump.write("".join(line + "\n" for line in lines))


def _run_qmc(tempfiledir, tempfilename, treename, tipnames):
    """
    Run quartet max-cut (QMC) on a quartets file and save the renamed tree.

    QMC is invoked as ``<ip.bins.qmc> qrtt=<tempfilename> otre=<tmp tree>``
    where the temporary tree is ``<tempfiledir>/tmptre.phy``. That tree is
    parsed with ete3, every leaf label ``k`` is replaced by
    ``tipnames[int(k) - 1]`` (labels are treated as 1-based indices), and
    the result is written with ``format=9`` to
    ``<tempfiledir>/<treename>.tree``.

    Parameters
    ----------
    tempfiledir : str
        Directory holding the temporary and the final tree file.
    tempfilename : str
        Path of the quartets file passed to QMC.
    treename : str
        Base name of the output file; ".tree" is appended.
    tipnames : sequence of str
        Names indexed by (leaf label - 1).

    Raises
    ------
    RuntimeError
        If QMC exits with a non-zero status. The message carries QMC's
        combined stdout/stderr.
    """

    ## build command
    thetmptree = os.path.join(tempfiledir, "tmptre.phy")
    cmd = [ip.bins.qmc, "qrtt="+tempfilename, "otre="+thetmptree]

    ## run it; stderr is folded into stdout so one stream holds all output
    proc = subprocess.Popen(cmd, stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
    qmc_output, _ = proc.communicate()

    ## stop here on failure: otherwise a tmptre.phy left over from an earlier
    ## run would be parsed as if it were the result of this one
    if proc.returncode:
        raise RuntimeError(
            "QMC failed with exit code {}:\n{}".format(proc.returncode, qmc_output))

    ## parse tmp file written by qmc into a tree and rename its tips
    with open(thetmptree, 'r') as intree:
        newick = intree.read().strip()
    tre = ete3.Tree(newick)
    for leaf in tre.get_leaves():
        leaf.name = tipnames[int(leaf.name) - 1]
    tmptre = tre.write(format=9)

    ## save the tree to file
    treepath = os.path.join(tempfiledir, treename+".tree")
    with open(treepath, 'w') as outtree:
        outtree.write(tmptre)
#@contextlib.contextmanager
#def nostdout():
#    save_stdout = sys.stdout
#    sys.stdout = io.BytesIO()
#    yield
#    sys.stdout = save_stdout

# The three ways to split four taxa into two pairs; entry (a, b, c, d) means
# "a with b versus c with d".
_QUARTET_CONFIGS = ([0, 1, 2, 3], [0, 2, 1, 3], [0, 3, 1, 2])

# Integer code of each unambiguous base.
_BASE_CODES = {'A': 0, 'C': 1, 'G': 2, 'T': 3}


def _sample_informative_sites(genes_alltaxa, fourtaxa):
    """Return, for each gene that has one, the first informative site found in a random site order."""
    valid_bases = set(_BASE_CODES)
    chosen = []
    for gene in genes_alltaxa:
        # walk the sites of this gene in random order and keep the first one
        # where all four taxa carry A/C/G/T and not all four are identical
        for site_idx in np.random.permutation(len(gene)):
            bases = [gene[site_idx][taxon] for taxon in fourtaxa]
            observed = set(bases)
            if observed <= valid_bases and len(observed) > 1:
                chosen.append(bases)
                break
    return chosen


def _config_feature_vectors(sites):
    """Return one flattened, max-normalised 16x16 pattern-count matrix per quartet configuration."""
    # reshape keeps the (n, 4) layout even when no site was selected
    codes = np.array([[_BASE_CODES[base] for base in site] for site in sites],
                     dtype=int).reshape(-1, 4)
    vectors = []
    for a, b, c, d in _QUARTET_CONFIGS:
        # a pair of bases (i, j) maps to index 4*i + j, i.e. the order
        # 00,01,02,03,10,...,33; the first pair selects the row, the second
        # pair the column
        counts = np.zeros(shape=(16, 16))
        np.add.at(counts,
                  (4 * codes[:, a] + codes[:, b], 4 * codes[:, c] + codes[:, d]),
                  1)
        peak = counts.max()
        if peak > 0:
            # leave an all-zero matrix untouched instead of producing 0/0 NaNs
            counts /= peak
        vectors.append(counts.flatten())
    return vectors


def run_mammal_inf(totalseqs_path,
                     snpmap_path,
                     output_path,
                     writing_interval = 200,
                     starting_combo = 0,
                     model_path = "download_simseqs/saved_mo.ckpt"
                    ):
    """
    Infer a split for every combination of four taxa and append it to a file.

    For each 4-taxon combination, one randomly chosen informative site per
    gene is collected, turned into three 256-element feature vectors (one
    per possible pairing of the four taxa) and fed to a softmax model
    restored from ``model_path``. The pairing whose prediction has the
    largest value in output column 0 is recorded as a row ``a b c d`` of
    taxon indices.
    NOTE(review): presumably column 0 is the "correct topology" class --
    confirm against the training code.

    Parameters
    ----------
    totalseqs_path : str
        Alignment file read with ``np.genfromtxt(..., dtype='str')``; each
        row is indexed per taxon and ``len(row 0)`` is taken as the number
        of taxa.
    snpmap_path : str
        File read with ``np.loadtxt``; row 0 holds the start and row 1 the
        end (exclusive) row index of every gene in the alignment.
    output_path : str
        Result file. It is created/emptied when ``starting_combo`` is 0 and
        appended to otherwise.
    writing_interval : int
        Number of combinations processed between two appends to
        ``output_path``.
    starting_combo : int
        Index of the first combination to process (to resume a run).
    model_path : str
        TensorFlow checkpoint to restore.

    Raises
    ------
    ValueError
        If ``writing_interval`` is smaller than 1.
    """
    if writing_interval < 1:
        raise ValueError("writing_interval must be at least 1")

    totalseqs = np.genfromtxt(totalseqs_path,dtype='str')
    snpmap = np.loadtxt(snpmap_path).astype(int)

    alltipcombns = np.array(list(itertools.combinations(range(len(totalseqs[0])), 4)))
    alltipcombns = alltipcombns.astype(int)

    genes_alltaxa = [totalseqs[start:stop] for start, stop in zip(snpmap[0], snpmap[1])]

    if not starting_combo:
        # fresh run: start from an empty output file
        combocounter = 0
        np.savetxt(output_path, np.empty(shape = (0,4)))
    else:
        combocounter = starting_combo
    targetlen = len(alltipcombns)

    # Build the model once. The variables are created in this order without
    # explicit names, so their auto-generated names stay the same as before.
    tf.reset_default_graph()
    x = tf.placeholder(tf.float32, [None, 256])
    W = tf.Variable(tf.zeros([256, 2]))
    b = tf.Variable(tf.zeros([2]))
    y = tf.nn.softmax(tf.matmul(x, W) + b)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        # Restore variables from disk once for the whole run.
        saver.restore(sess, model_path)

        while combocounter < targetlen:
            batch_end = min(combocounter + writing_interval, targetlen)
            batch = []
            for fourtaxa in alltipcombns[combocounter:batch_end]:
                sites = _sample_informative_sites(genes_alltaxa, fourtaxa)
                predictions = sess.run(y, feed_dict={x: _config_feature_vectors(sites)})
                print(predictions)
                # first configuration with the highest column-0 prediction
                max_index = int(np.argmax(predictions[:, 0]))
                batch.append([fourtaxa[i] for i in _QUARTET_CONFIGS[max_index]])
            combocounter = batch_end

            # flush this batch so an interrupted run can be resumed
            with open(output_path, "a") as f_handle:
                np.savetxt(f_handle, np.array(batch).astype(int))
    return


In [5]:
run_mammal_inf(totalseqs_path = "download_simseqs/concat_mammal_genes.gz",
                 snpmap_path = "download_simseqs/concat_mammal_map.gz",
                 output_path = "download_simseqs/testingfunc.txt",
                 writing_interval = 200,
                 starting_combo = 0)

In [None]:
allpredictedquarts = np.loadtxt("download_simseqs/testingfunc.txt")
with open("download_simseqs/song-mammalian-bio_completely_processed/taxa_dict.txt") as f:
    test = f.readlines()
test = [x.strip() for x in test]
nameskey = [test[i].split("\t") for i in range(len(test))]
dump_qmc(quartets = (allpredictedquarts.astype(int)+1),tempfiledir= "download_simseqs/")
_run_qmc(tempfiledir = "download_simseqs/", 
         tempfilename="download_simseqs/quartets.txt",
         treename="treeFULLagain.phy",
         tipnames=[i[0] for i in nameskey])

### I think this is outdated...

In [None]:
def plot_mammal_quarts(totalseqs_path,
                     snpmap_path,
                     output_path,
                     writing_interval = 200,
                     starting_combo = 0
                    ):
    """
    Plot the three pattern-count matrices of one randomly chosen quartet.

    A random combination of four taxa is drawn, one randomly chosen
    informative site per gene is collected for it, and for each of the three
    possible pairings of the four taxa a 16x16 matrix of site-pattern counts
    (scaled by its maximum) is passed to ``toyplot.matrix``.

    Parameters
    ----------
    totalseqs_path : str
        Alignment file read with ``np.genfromtxt(..., dtype='str')``;
        ``len(row 0)`` is taken as the number of taxa.
    snpmap_path : str
        File read with ``np.loadtxt``; row 0 holds the start and row 1 the
        end (exclusive) row index of every gene in the alignment.
    output_path : str
        NOTE(review): when ``starting_combo`` is 0 this file is overwritten
        with an empty table although nothing is written to it afterwards --
        looks like a copy/paste leftover; confirm before removing.
    writing_interval : int
        Unused; kept so existing calls keep working.
    starting_combo : int
        Only decides whether ``output_path`` is emptied (see above).
    """
    totalseqs = np.genfromtxt(totalseqs_path,dtype='str')
    snpmap = np.loadtxt(snpmap_path).astype(int)

    alltipcombns = np.array(list(itertools.combinations(range(len(totalseqs[0])), 4)))
    alltipcombns = alltipcombns.astype(int)

    genes_alltaxa = [totalseqs[start:stop] for start, stop in zip(snpmap[0], snpmap[1])]

    if not starting_combo:
        np.savetxt(output_path, np.empty(shape = (0,4)))

    # set your current combination of four taxa
    fourtaxa = alltipcombns[np.random.choice(len(alltipcombns))]

    # get one snp at each locus: walk the sites of each gene in random order
    # and keep the first one where all four taxa carry A/C/G/T and not all
    # four are identical
    base_codes = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    valid_bases = set(base_codes)
    coded_sites = []
    for gene in genes_alltaxa:
        for site_idx in np.random.permutation(len(gene)):
            bases = [gene[site_idx][taxon] for taxon in fourtaxa]
            observed = set(bases)
            if observed <= valid_bases and len(observed) > 1:
                coded_sites.append([base_codes[base] for base in bases])
                break
    # reshape keeps the (n, 4) layout even when no site was selected
    arr0123 = np.array(coded_sites, dtype=int).reshape(-1, 4)

    # a pair of bases (i, j) maps to index 4*i + j, i.e. the order
    # 00,01,02,03,10,...,33; the first pair of a configuration selects the
    # row, the second pair the column of the 16x16 matrix
    three_possible = []
    for a, b, c, d in ([0,1,2,3], [0,2,1,3], [0,3,1,2]):
        counts = np.zeros(shape=(16,16))
        np.add.at(counts,
                  (4 * arr0123[:, a] + arr0123[:, b], 4 * arr0123[:, c] + arr0123[:, d]),
                  1)
        peak = counts.max()
        if peak > 0:
            # leave an all-zero matrix untouched instead of producing 0/0 NaNs
            counts /= peak
        three_possible.append(counts.flatten())

    print("Four taxa: " + str(fourtaxa))
    for flat in three_possible:
        toyplot.matrix(flat.reshape(16,16))
    return

In [None]:
def plot_mammal_quarts(totalseqs_path,
                     snpmap_path,
                     output_path,
                     writing_interval = 200,
                     starting_combo = 0
                    ):
    """
    Plot the three pattern-count matrices of one randomly chosen quartet.

    A random combination of four taxa is drawn, one randomly chosen
    informative site per gene is collected for it, and for each of the three
    possible pairings of the four taxa a 16x16 matrix of site-pattern counts
    (scaled by its maximum) is passed to ``toyplot.matrix``.

    Parameters
    ----------
    totalseqs_path : str
        Alignment file read with ``np.genfromtxt(..., dtype='str')``;
        ``len(row 0)`` is taken as the number of taxa.
    snpmap_path : str
        File read with ``np.loadtxt``; row 0 holds the start and row 1 the
        end (exclusive) row index of every gene in the alignment.
    output_path : str
        NOTE(review): when ``starting_combo`` is 0 this file is overwritten
        with an empty table although nothing is written to it afterwards --
        looks like a copy/paste leftover; confirm before removing.
    writing_interval : int
        Unused; kept so existing calls keep working.
    starting_combo : int
        Only decides whether ``output_path`` is emptied (see above).
    """
    totalseqs = np.genfromtxt(totalseqs_path,dtype='str')
    snpmap = np.loadtxt(snpmap_path).astype(int)

    alltipcombns = np.array(list(itertools.combinations(range(len(totalseqs[0])), 4)))
    alltipcombns = alltipcombns.astype(int)

    genes_alltaxa = [totalseqs[start:stop] for start, stop in zip(snpmap[0], snpmap[1])]

    if not starting_combo:
        np.savetxt(output_path, np.empty(shape = (0,4)))

    # set your current combination of four taxa
    fourtaxa = alltipcombns[np.random.choice(len(alltipcombns))]

    # get one snp at each locus: walk the sites of each gene in random order
    # and keep the first one where all four taxa carry A/C/G/T and not all
    # four are identical
    base_codes = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    valid_bases = set(base_codes)
    coded_sites = []
    for gene in genes_alltaxa:
        for site_idx in np.random.permutation(len(gene)):
            bases = [gene[site_idx][taxon] for taxon in fourtaxa]
            observed = set(bases)
            if observed <= valid_bases and len(observed) > 1:
                coded_sites.append([base_codes[base] for base in bases])
                break
    # reshape keeps the (n, 4) layout even when no site was selected
    arr0123 = np.array(coded_sites, dtype=int).reshape(-1, 4)

    # a pair of bases (i, j) maps to index 4*i + j, i.e. the order
    # 00,01,02,03,10,...,33; the first pair of a configuration selects the
    # row, the second pair the column of the 16x16 matrix
    three_possible = []
    for a, b, c, d in ([0,1,2,3], [0,2,1,3], [0,3,1,2]):
        counts = np.zeros(shape=(16,16))
        np.add.at(counts,
                  (4 * arr0123[:, a] + arr0123[:, b], 4 * arr0123[:, c] + arr0123[:, d]),
                  1)
        peak = counts.max()
        if peak > 0:
            # leave an all-zero matrix untouched instead of producing 0/0 NaNs
            counts /= peak
        three_possible.append(counts.flatten())

    print("Four taxa: " + str(fourtaxa))
    for flat in three_possible:
        toyplot.matrix(flat.reshape(16,16))
    return

In [None]:
# Load the concatenated alignment as strings ...
totalseqs = np.genfromtxt(
    "download_simseqs/concat_mammal_genes.gz",
    dtype='str',
)
# ... and the gene map, converted to integers so that its values can be used
# as slice bounds into the alignment.
snpmap = np.loadtxt("download_simseqs/concat_mammal_map.gz")
snpmap = snpmap.astype(int)

In [None]:
def get_patterns(goodbases):
    """
    Recode every row of ``goodbases`` by the rank of its values.

    Within a row the smallest distinct value becomes 0, the next one 1, and
    so on, so ``[0, 0, 2, 2]`` turns into ``[0, 0, 1, 1]`` and
    ``[3, 1, 0, 0]`` into ``[2, 1, 0, 0]``.

    Parameters
    ----------
    goodbases : sequence of length-4 rows
        Rows to recode; the values only need to be hashable and sortable.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(goodbases), 4)``; ``(0, 4)`` for
        empty input.
    """
    # the empty (0, 4) seed fixes the row width and the shape of an empty result
    rows = [np.empty(shape = (0,4))]
    for site in goodbases:
        # sorted() pins the label order; iterating the bare set would make
        # the labels depend on hash order
        ranks = {value: rank for rank, value in enumerate(sorted(set(site)))}
        rows.append([ranks[value] for value in site])
    # stack once at the end instead of re-copying the array for every row
    return np.vstack(rows).astype(int)
@jit
def most_freq_pattern(the_patterns):
    """
    Return the row of ``the_patterns`` that occurs most often.

    Rows are compared as whole units (``np.unique`` along axis 0). When
    several rows share the highest count, the one that comes first among
    the sorted unique rows is returned.
    """
    distinct_rows, counts = np.unique(the_patterns, return_counts=True, axis=0)
    winner = np.argmax(counts)
    return distinct_rows[winner]
@jit
def f(genes_alltaxa,geneidx,fourtaxa):
    """
    Extract the columns of the taxa in ``fourtaxa`` from one gene.

    Returns an array with one row per entry of ``genes_alltaxa[geneidx]``;
    each row holds that entry's values at the indices in ``fourtaxa``, in
    the order given.
    """
    n_sites = len(genes_alltaxa[geneidx])
    per_site = []
    for base in range(n_sites):
        site = genes_alltaxa[geneidx][base]
        per_site.append([site[taxon] for taxon in fourtaxa])
    return np.array(per_site)
@jit
def exclude(fullgene):
    """
    Build a boolean mask of the rows of ``fullgene`` worth keeping.

    A row is kept when its values sum to at most 12 and it contains more
    than one distinct value.
    NOTE(review): presumably rows hold the codes 0-3 for A/C/G/T with any
    other character left as a larger byte value, so the sum test rejects
    rows with non-ACGT entries -- confirm against the recoding step.
    """
    mask = []
    for q in xrange(len(fullgene)):
        row = fullgene[q]
        mask.append(sum(row) <= 12 and len(set(row)) > 1)
    return np.array(mask)

# Exploratory cell: build and plot the three 16x16 pattern-count matrices for
# one randomly drawn quartet, using the integer-coded alignment.

# Reinterpret the alignment as unsigned bytes and replace the ASCII codes of
# A (65), C (67), G (71) and T (84) by 0, 1, 2 and 3. Every other value is
# left as it is.
# NOTE(review): assumes every array element is a single one-byte character --
# confirm the dtype produced by np.genfromtxt.
totalseqs = totalseqs.view(np.uint8)
totalseqs = np.where(totalseqs==65,0,totalseqs)
totalseqs = np.where(totalseqs==67,1,totalseqs)
totalseqs = np.where(totalseqs==71,2,totalseqs)
totalseqs = np.where(totalseqs==84,3,totalseqs)
# one slice of the alignment per gene: rows snpmap[0][i] up to (not including) snpmap[1][i]
genes_alltaxa = [totalseqs[snpmap[0][i]:snpmap[1][i]] for i in range(len(snpmap[0]))]
# every combination of four column indices of the alignment
alltipcombns=np.array(list(itertools.combinations(range(len(totalseqs[0])), 4)))
alltipcombns = alltipcombns.astype(int)

# combocounter and orig_file are not read again below in this cell
combocounter = 0
orig_file = np.empty(shape = (0,4))
#    np.savetxt(output_path,orig_file)

targetlen = len(alltipcombns)



# allpredictedquarts and savecounter are not read again below in this cell
allpredictedquarts = np.empty(shape = (0,4))
savecounter = 0

# draw one combination of four taxa at random
fourtaxa= alltipcombns[np.random.choice(range(targetlen))]

# For every gene: restrict it to the four taxa, keep the rows accepted by
# exclude(), and add to reducedgene
#   * one random row among those showing the gene's most frequent pattern, and
#   * ten rows drawn at random from all accepted rows (np.random.choice draws
#     with replacement by default, so rows can repeat).
# Genes without any accepted row contribute nothing.

reducedgene = np.empty(shape = (0,4))
for geneidx in range(len(genes_alltaxa)):
    fullgene = f(genes_alltaxa,geneidx,fourtaxa)
    goodbases = fullgene[exclude(fullgene)]
    if len(goodbases) > 0:
        the_patterns = get_patterns(goodbases)
        # row numbers whose pattern equals the most frequent pattern
        indices = np.where((the_patterns == most_freq_pattern(the_patterns)).all(axis=1))[0]
        reducedgene = np.vstack([reducedgene,goodbases[int(np.random.choice(indices,1))]])
        reducedgene = np.vstack([reducedgene,goodbases[np.random.choice(range(len(goodbases)),10)]])
    # progress output: index of the gene just processed
    print(geneidx)
# Index matrix for a pair of bases: indexmat[i, j] == 4*i + j, which gives the
# row / column number in the full 16x16 matrix.
indexmat = np.array(range(16))
indexmat.shape=(4,4)

# Order across the 16x16 matrix is 00,01,02,03,10,11,12,13,20,21,22,23,30,31,32,33.
three_possible = []
# the three ways of splitting the four taxa into two pairs
possible_configs = [[0,1,2,3],[0,2,1,3],[0,3,1,2]]
arr0123 = copy.deepcopy(reducedgene)
arr0123 = arr0123.astype(int)
for q in possible_configs:
    # reorder the four columns according to the current configuration
    temp_rearrangement = arr0123[:,q]
    fullmat0123 = np.zeros(shape=(16,16))
    for i in range(len(temp_rearrangement)):
        # row number: taken from the first two values of the row
        rownum = int(indexmat[temp_rearrangement[i][0:2][0],temp_rearrangement[i][0:2][1]])
        # column number: taken from the last two values of the row
        colnum = int(indexmat[temp_rearrangement[i][2:4][0],temp_rearrangement[i][2:4][1]])
        fullmat0123[rownum,colnum] = fullmat0123[rownum,colnum] + 1
    # flatten to 256 values and scale by the largest count
    # NOTE(review): if reducedgene is empty the largest count is 0 and this divides by zero
    three_possible.append((fullmat0123.flatten()/max(fullmat0123.flatten())))
print "Four taxa: " + str(fourtaxa)
toyplot.matrix(three_possible[0].reshape(16,16))
toyplot.matrix(three_possible[1].reshape(16,16))
toyplot.matrix(three_possible[2].reshape(16,16))


In [None]:
            # Rebuild the model (a single softmax layer mapping 256 inputs to
            # 2 outputs), restore its variables from the saved checkpoint and
            # run it on the three feature vectors in `three_possible`.
            tf.reset_default_graph()
            x = tf.placeholder(tf.float32, [None, 256])
            # NOTE(review): W and b get auto-generated variable names, so the
            # order in which they are created presumably has to match the
            # graph that wrote the checkpoint -- confirm before reordering.
            W = tf.Variable(tf.zeros([256, 2]))
            b = tf.Variable(tf.zeros([2]))

            y = tf.nn.softmax(tf.matmul(x, W) + b)

            # placeholder for target labels; not used anywhere in this cell
            y_ = tf.placeholder(tf.float32, [None, 2])
            # Add ops to save and restore all the variables.
            saver = tf.train.Saver()
            # Launch the model, use the saver to restore variables from disk,
            # and evaluate it.
            with tf.Session() as sess:
                # Restore variables from disk.
#                with nostdout():
                saver.restore(sess, "download_simseqs/saved_mo.ckpt")
                #print("Model restored.")
                # one row of two softmax outputs per feature vector
                predictions = sess.run(y, feed_dict={x: three_possible[0:3]})
#                prediction = predictions[0];