## Can a machine learning algorithm do a good job of predicting correct quartet configurations?

### imports:

In [19]:
import numpy as np
import toytree
import itertools
import copy
from itertools import compress
import toyplot

### functions:

In [3]:
def decompose_to_quartets(tre):
    """Collect the quartets induced by the internal nodes of a tree.

    Every node that is neither the root nor a tip separates the tips into
    those found under it and all the remaining ones. Each quartet is a
    4-tuple whose first two names come from the remaining tips and whose
    last two names come from the tips under the node.

    Parameters
    ----------
    tre : object exposing ``tre.tree.get_leaf_names()`` and
        ``tre.tree.traverse()``; the traversed nodes must provide
        ``is_root()``, ``is_leaf()`` and ``get_leaf_names()``.

    Returns
    -------
    set of tuple
        One quartet per distinct group of four tip names. When the same
        four names show up again (in any order), only the first quartet
        encountered is kept.
    """
    ## order-insensitive keys of the quartets already seen
    seen_keys = set()
    ## the quartets themselves, kept in their original orientation
    quartets = set()

    ## every tip name in the tree
    all_tips = set(tre.tree.get_leaf_names())

    for node in tre.tree.traverse():
        ## the root and the tips are skipped
        if node.is_root() or node.is_leaf():
            continue

        ## tips under this node versus all the other tips
        inside = set(node.get_leaf_names())
        outside = all_tips - inside

        ## every pairing of 2 outside tips with 2 inside tips
        for quartet in get_all_combs(tre, outside, inside):
            key = tuple(sorted(quartet))
            if key in seen_keys:
                continue
            seen_keys.add(key)
            quartets.add(quartet)

    return quartets
def get_all_combs(self, set1, set2, as_list=False):
    """Pair every 2-combination of ``set1`` with every 2-combination of ``set2``.

    Parameters
    ----------
    self : unused; kept so existing callers can keep passing the tree.
    set1, set2 : iterables to draw the pairs from.
    as_list : bool
        Return a list when True; otherwise return a lazy generator, which
        avoids holding all the quartets in memory at once.

    Returns
    -------
    generator or list of 4-tuples ``(a, b, c, d)`` where ``(a, b)`` is a
    pair from ``set1`` and ``(c, d)`` a pair from ``set2``.
    """
    first_pairs = itertools.combinations(set1, 2)
    second_pairs = itertools.combinations(set2, 2)
    quartets = (
        tuple(left + right)
        for left, right in itertools.product(first_pairs, second_pairs)
    )

    if not as_list:
        return quartets
    return list(quartets)

## Show that we should be able to recognize the true quartet visually

Here, we load a random tree and the sequences simulated on that tree, and then pick a random quartet from the tree. The trees have random branch lengths, and the sequences paired with the trees evolve at varying rates and are of varying lengths.

In [24]:
# Pick a random tree from file.
# NOTE(review): the files below are opened with index ``treenum + 1`` while
# the printed tree number is ``treenum`` -- confirm which one is intended.
treenum = np.random.choice(2000) + 1
print("Tree number: " + str(treenum))
thetree = toytree.tree('random_trees/samp' + str(treenum + 1) + '.tre')

# get all TRUE splits on the tree
treeqrts = list(decompose_to_quartets(thetree))

# Read the sequence file that carries the same index as the tree file.
fname = 'tree_seqs/test' + str(treenum + 1) + '.dat'
with open(fname) as f:
    # remove whitespace characters like `\n` at the end of each line
    sequences = [line.strip() for line in f]
# the first line is discarded (presumably a header -- TODO confirm)
sequences.pop(0)

# separate all sequences and tip names: the first 10 characters of a row are
# used as the tip name, everything after them as the sequence
names = [row[0:10].strip(" ") for row in sequences]
iso_sequences = [row[10:].strip(" ") for row in sequences]

# Pick a random quartet. Without a `size` argument, choice() returns a
# scalar, so int() is never applied to a one-element array.
num = int(np.random.choice(len(treeqrts)))
print("Quartet number = " + str(num))

# Visualize this quartet on the tree: nodes whose label belongs to the
# chosen quartet get the first tree color, all other nodes the second one.
# The labels are fetched once instead of once per node.
node_labels = thetree.get_node_labels()
chosen_qrt = treeqrts[num]
colors = [
    thetree.colors[0] if node_labels[idx] in chosen_qrt else thetree.colors[1]
    for idx in range(len(node_labels))
]
thetree.draw(
    width=300,
    height=490,
    node_labels=False,
    node_color=colors,
    node_size=15,
);


print("the TRUE split is: " + str(treeqrts[num]))

Tree number: 269
Quartet number = 37921
the TRUE split is: ('T36', 'T32', 'T14', 'T25')


Let's pretend we don't know the true quartet and are just taking a random set of four tips and shuffling them three ways:

In [25]:
# Start from the true quartet and shuffle a copy of it, so that the shuffled
# tips can be treated as if the true split were unknown.
qrtnum = num
true_qrt = np.array(treeqrts[qrtnum])
tipnames = copy.deepcopy(true_qrt)
np.random.shuffle(tipnames)

# The three candidate configurations of four positions; in each one the
# first two positions form one pair and the last two the other pair.
split_configs = [[0, 1, 2, 3], [0, 2, 1, 3], [0, 3, 1, 2]]

# The two pairs of the true quartet, compared without regard to order.
true_pairs = (set(true_qrt[0:2]), set(true_qrt[2:4]))

# is correct config of these tips [0123],[0213], or [0312]
# -> 1 for a configuration whose first pair equals one of the true pairs,
#    0 otherwise
correct_config = [
    int(set(tipnames[i] for i in q[0:2]) in true_pairs)
    for q in split_configs
]
print("The true quartet is: " + str(true_qrt))
print("The random configuration of this (i.e. when true quartet unknown) is: " + str(tipnames))
print("The correct configuration of the random configuration is: " + str(split_configs[correct_config.index(1)]))


The true quartet is: ['T36' 'T32' 'T14' 'T25']
The random configuration of this (i.e. when true quartet unknown) is: ['T32' 'T36' 'T25' 'T14']
The correct configuration of the random configuration is: [0, 1, 2, 3]


### Now, we can shuffle our random arrangement of the four tips into the three possible splits: [0,1,2,3],[0,2,1,3], and [0,3,1,2]. 

In [26]:
# the four tip names of interest, e.g. ["t1", "t2", "t3", "t4"]
interestednames = tipnames

# positions in `names` of every tip of interest, in `interestednames` order
taxa_ids = []
for tip in interestednames:
    for position, name in enumerate(names):
        if tip == name:
            taxa_ids.append(position)

# the sequences belonging to those tips
tempobj = [iso_sequences[position] for position in taxa_ids]

# eliminate non-snps: keep only the sites where the selected tips do not all
# carry the same character

ind_samples = []
for site in range(len(tempobj[0])):
    currentbase = [seq[site] for seq in tempobj]
    if len(set(currentbase)) > 1:
        ind_samples.append(currentbase)
ind_samples_reset = ind_samples

# recode the bases as integers: A -> 0, C -> 1, G -> 2, T -> 3

ind_samples = np.array(ind_samples_reset)
for code, base in enumerate('ACGT'):
    ind_samples = np.where(ind_samples == base, code, ind_samples)
ind_samples = ind_samples.astype(int)

# lookup table turning a pair of base codes (a, b) into the index 4*a + b
# order across matrix is 00,01,02,03,10,11,12,13,20,21,22,23,30,31,32,33
indexmat = np.arange(16).reshape(4, 4)

print(correct_config)
for q in [[0, 1, 2, 3], [0, 2, 1, 3], [0, 3, 1, 2]]:
    # count matrix for this configuration: the row is set by the bases of the
    # first two tips, the column by the bases of the last two tips
    fullmat0123 = np.zeros(shape=(16, 16))
    # reorder the tip columns according to the configuration
    arr0123 = ind_samples[:, q]
    for bases in arr0123:
        # get row number
        rownum = int(indexmat[bases[0], bases[1]])
        # get col number
        colnum = int(indexmat[bases[2], bases[3]])
        fullmat0123[rownum, colnum] += 1
    #images.append(fullmat0123.flatten()/max(fullmat0123.flatten()))
    #labels.append(correct_config)
    # scale the counts by the largest count before plotting
    toyplot.matrix(fullmat0123 / fullmat0123.max())

[1, 0, 0]


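We see that it's basically always easy to recognize which is the correct split: the high values tend to line up on columns/rows 0, 5, 10, and 15. These are the indices of the base pairs 00, 11, 22, and 33 (AA, CC, GG, and TT), i.e. the sites at which the two tips grouped together share the same base.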

Run this many times and see -- it's consistently pretty easy to guess which of the three is correct just at a glance.

### *Machine learning is most famously used for image recognition, and here we have a very straightforward image classification test!*