In [9]:
import h5py
import simcat   
import toytree
import random
import numpy as np
import itertools as itt
import future
from copy import deepcopy
from numba import jit
import multiprocessing as mp

In [2]:
def _node_slider(tree):
        """
        Returns a toytree copy with node heights modified while retaining the 
        same topology but not necessarily node branching order. Node heights are
        moved up or down uniformly between their parent and highest child node 
        heights in 'levelorder' from root to tips. The total tree height is 
        retained at 1.0, only relative edge lengths change.
        ## for example run:
        c, a = node_slide(ctree).draw(
            width=400,
            orient='down', 
            node_labels='idx',
            node_size=15,
            tip_labels=False
            );
        a.show = True
        a.x.show = False
        a.y.ticks.show = True
        """
        ctree = deepcopy(tree)
        for node in ctree.tree.traverse():

            ## slide internal nodes 
            if node.up and node.children:

                ## get min and max slides
                minjit = max([i.dist for i in node.children]) * 0.99
                maxjit = (node.up.height * 0.99) - node.height
                newheight = np.random.uniform(low=-minjit, high=maxjit)

                ## slide children
                for child in node.children:
                    child.dist += newheight

                ## slide self to match
                node.dist -= newheight

        ## make max height = 1
        mod = ctree.tree.height
        for node in ctree.tree.traverse():
            node.dist = node.dist / float(mod)

        return ctree
def get_all_admix_edges(ttree):
    """
    Find all possible admixture edges on a tree. Edges are unidirectional, 
    so the source and dest need to overlap in time interval.    
    """
    ## for all nodes map the potential admixture interval
    for snode in ttree.tree.traverse():
        if snode.is_root():
            snode.interval = (None, None)
        else:
            snode.interval = (snode.height, snode.up.height)

    ## for all nodes find overlapping intervals
    intervals = {}
    for snode in ttree.tree.traverse():
        for dnode in ttree.tree.traverse():
            if not any([snode.is_root(), dnode.is_root(), dnode == snode]):
                ## check for overlap
                smin, smax = snode.interval
                dmin, dmax = dnode.interval

                ## find if nodes have interval where admixture can occur
                low_bin = np.max([smin, dmin])
                top_bin = np.min([smax, dmax])
                if top_bin > low_bin:
                    intervals[(snode.idx, dnode.idx)] = (low_bin, top_bin)
    return intervals

In [3]:
## generate a random tree
tree = toytree.rtree.unittree(ntips=5, treeheight=3, seed=12345)
c, a = tree.draw(tree_style='c')

In [28]:
c, a = _node_slider(tree).draw(tree_style='c')

In [85]:
## get observed data w/o migration
observed = simcat.Model(
    tree=tree, 
    admixture_edges=((2, 1, 0.5, 0.7, 0.00001)),
    theta=0.01,
    ntests=1,
    debug=True,
    )

In [79]:
## get observed data w/o migration
observed = simcat.Model(
    tree=tree, 
    admixture_edges=((1, 4, 0.5, 0.7, 0.00001)),
    theta=0.01,
    ntests=1,
    debug=True,
    )

In [99]:
dict = get_all_admix_edges(currtree)

In [114]:
dict[tuple([1,4])]

(0.0, 0.13313371358665282)

In [103]:
dict[(2,1)]

(0.0, 0.1551836516443822)

In [147]:
dict.keys()[np.random.randint(len(dict.keys()))]

(3, 1)

In [154]:
test=dict.keys()[10]

In [155]:
tuple([test[1],test[0]])

(0, 2)

In [156]:
test

(2, 0)

# make training data with all classes:

In [197]:
@jit
def make_data(nruns,tree):
    labels = np.zeros((nruns,2),dtype=np.int32)
    dat = np.zeros((nruns,5,16,16))
    for run in range(nruns):
        currtree = _node_slider(tree)
        edgedict = get_all_admix_edges(currtree)
        edges = edgedict.keys()[np.random.randint(len(edgedict.keys()))]
        edges1 = edges
        if np.random.randint(2):
            edges1 = tuple([edges[1],edges[0]])
        ## get observed data w/o migration
        start,end = np.sort(np.random.uniform(edgedict[tuple(edges)][0],edgedict[tuple(edges)][1],2))
        observed = simcat.Model(
            tree=currtree, 
            admixture_edges=((edges1[0], edges1[1], start, end, np.random.uniform(0,.1))),
            theta=0.01,
            ntests=1,
            debug=False,
            )
        observed.run()
        dat[run,:,:,:] = observed.counts[0]
        labels[run,:] = edges1
        if (run % 1000 == 0):
            print(run)
    return([dat,labels])

In [199]:
dat,labs = make_data(5000,tree)

0

1000

2000

3000

4000



In [222]:
strlabs = [str(labs[i][0])+str(labs[i][1]) for i in range(len(labs))]
flat = [obj.flatten() for obj in dat]
traindat = flat[:4500]
trainlabs = strlabs[:4500]
testdat = flat[4500:]
testlabs = strlabs[4500:]

In [206]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_estimators=300,max_depth=None, random_state=1,warm_start = True)

random_forest.fit(traindat, trainlabs)

from sklearn.metrics import accuracy_score

y_predict = random_forest.predict(testdat)
accuracy_score(testlabs, y_predict)

0.616

In [207]:
dat1,labs1 = make_data(5000,tree)

0

1000

2000

3000

4000



In [208]:
strlabs1 = [str(labs1[i][0])+str(labs1[i][1]) for i in range(len(labs1))]
flat1 = [obj.flatten() for obj in dat1]
traindat1 = flat1
trainlabs1 = strlabs1


In [227]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_estimators=700,max_depth=None, random_state=1,warm_start = True)

random_forest.fit(np.vstack([traindat,traindat1]), np.hstack([np.array(trainlabs),np.array(trainlabs1)]))

from sklearn.metrics import accuracy_score

y_predict = random_forest.predict(testdat)
accuracy_score(testlabs, y_predict)

0.644

In [238]:
from sklearn.svm import SVC
clf = SVC(C=200)
clf.fit(np.vstack([traindat,traindat1]), np.hstack([np.array(trainlabs),np.array(trainlabs1)])) 

SVC(C=200, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [239]:
from sklearn.metrics import accuracy_score

y_predict = clf.predict(testdat)
accuracy_score(testlabs,y_predict)#_conv.ravel(), y_predict)

0.656

In [None]:
from sklearn.metrics import confusion_matrix
import toyplot
ckwargs = {
    "height": 750,
    "width": 750,
}
canvas = toyplot.Canvas(**ckwargs)
table = canvas.matrix(confusion_matrix(testlabs, y_predict,labels = clf.classes_), margin=20)

# Now we'll try a convolutional network

In [244]:
alllabs = np.hstack([np.array(strlabs),np.array(strlabs1)])
alldat = np.vstack([dat,dat1])

In [246]:
import tensorflow as tf
import time
from datetime import timedelta
import math
import random
import numpy as np
import h5py

#Adding Seed so that random initialization is consistent
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(2)


batch_size = 40
img_size = 16
num_channels = 5

In [247]:
classes = alllabs
unique_classes=np.unique(classes)
num_classes = len(unique_classes)

data = alldat

dat_new = np.array([np.stack([data[samp][i] for i in range(len(data[samp]))],axis=2) for samp in range(len(data))])

In [259]:
session = tf.Session()
x = tf.placeholder(tf.float32, shape=[None, img_size,img_size,num_channels], name='x')

## labels
y_true = tf.placeholder(tf.float32, shape=[None, num_classes], name='y_true')
y_true_cls = tf.argmax(y_true, axis=1)



##Network graph params
filter_size_conv1 = 3
num_filters_conv1 = 128

filter_size_conv2 = 3
num_filters_conv2 = 128

filter_size_conv3 = 3
num_filters_conv3 = 256
    
fc_layer_size = 512

def create_weights(shape):
    return tf.Variable(tf.truncated_normal(shape, stddev=0.05))

def create_biases(size):
    return tf.Variable(tf.constant(0.05, shape=[size]))



def create_convolutional_layer(input,
               num_input_channels, 
               conv_filter_size,        
               num_filters):  
    
    ## We shall define the weights that will be trained using create_weights function.
    weights = create_weights(shape=[conv_filter_size, conv_filter_size, num_input_channels, num_filters])
    ## We create biases using the create_biases function. These are also trained.
    biases = create_biases(num_filters)

    ## Creating the convolutional layer
    layer = tf.nn.conv2d(input=input,
                     filter=weights,
                     strides=[1, 1, 1, 1],
                     padding='SAME')

    layer += biases

    ## We shall be using max-pooling.  
    layer = tf.nn.max_pool(value=layer,
                            ksize=[1, 2, 2, 1],
                            strides=[1, 2, 2, 1],
                            padding='SAME')
    ## Output of pooling is fed to Relu which is the activation function for us.
    layer = tf.nn.relu(layer)

    return layer

    

def create_flatten_layer(layer):
    #We know that the shape of the layer will be [batch_size img_size img_size num_channels] 
    # But let's get it from the previous layer.
    layer_shape = layer.get_shape()

    ## Number of features will be img_height * img_width* num_channels. But we shall calculate it in place of hard-coding it.
    num_features = layer_shape[1:4].num_elements()

    ## Now, we Flatten the layer so we shall have to reshape to num_features
    layer = tf.reshape(layer, [-1, num_features])

    return layer


def create_fc_layer(input,          
             num_inputs,    
             num_outputs,
             use_relu=True):
    
    #Let's define trainable weights and biases.
    weights = create_weights(shape=[num_inputs, num_outputs])
    biases = create_biases(num_outputs)

    # Fully connected layer takes input x and produces wx+b.Since, these are matrices, we use matmul function in Tensorflow
    layer = tf.matmul(input, weights) + biases
    if use_relu:
        layer = tf.nn.relu(layer)

    return layer


layer_conv1 = create_convolutional_layer(input=x,
               num_input_channels=num_channels,
               conv_filter_size=filter_size_conv1,
               num_filters=num_filters_conv1)
layer_conv2 = create_convolutional_layer(input=layer_conv1,
               num_input_channels=num_filters_conv1,
               conv_filter_size=filter_size_conv2,
               num_filters=num_filters_conv2)

layer_conv3= create_convolutional_layer(input=layer_conv2,
               num_input_channels=num_filters_conv2,
               conv_filter_size=filter_size_conv3,
               num_filters=num_filters_conv3)
          
layer_flat = create_flatten_layer(layer_conv3)

layer_fc1 = create_fc_layer(input=layer_flat,
                     num_inputs=layer_flat.get_shape()[1:4].num_elements(),
                     num_outputs=fc_layer_size,
                     use_relu=True)

layer_fc2 = create_fc_layer(input=layer_fc1,
                     num_inputs=fc_layer_size,
                     num_outputs=num_classes,
                     use_relu=False) 

y_pred = tf.nn.softmax(layer_fc2,name='y_pred')

y_pred_cls = tf.argmax(y_pred, axis=1)
session.run(tf.global_variables_initializer())
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=layer_fc2,
                                                    labels=y_true)
cost = tf.reduce_mean(cross_entropy)
optimizer = tf.train.AdamOptimizer(learning_rate=1e-4).minimize(cost)
correct_prediction = tf.equal(y_pred_cls, y_true_cls)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))


session.run(tf.global_variables_initializer()) 

In [260]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
# define example
#data = ['cold', 'cold', 'warm', 'cold', 'hot', 'hot', 'warm', 'cold', 'warm', 'hot']
#values = array(data)
#print(values)
# integer encode
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(classes)
print(integer_encoded)
# binary encode
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded).astype(np.int8)

[15  2 30 ...  7 26  8]


In [262]:
def get_batch(arr,classes,number):
    idxs = np.random.choice(len(arr),replace=False,size=number)
    return([arr[idxs],classes[idxs]])

def show_progress(epoch, feed_dict_train, feed_dict_validate, val_loss):
    acc = session.run(accuracy, feed_dict=feed_dict_train)
    val_acc = session.run(accuracy, feed_dict=feed_dict_validate)
    msg = "Training Epoch {0} --- Training Accuracy: {1:>6.1%}, Validation Accuracy: {2:>6.1%},  Validation Loss: {3:.3f}"
    print(msg.format(epoch + 1, acc, val_acc, val_loss))

total_iterations = 0
ticker_idx = 0
saver = tf.train.Saver()
def train(num_iteration):
    global total_iterations
    global ticker_idx
    for i in range(total_iterations,
                   total_iterations + num_iteration):

        #x_batch, y_true_batch, _, cls_batch = data.train.next_batch(batch_size)
        x_batch, y_true_batch = get_batch(dat_new[:9500],onehot_encoded[:9500],batch_size)
        #x_valid_batch, y_valid_batch, _, valid_cls_batch = data.valid.next_batch(batch_size)
        x_valid_batch, y_valid_batch = get_batch(dat_new[9500:],onehot_encoded[9500:],batch_size)
        
        feed_dict_tr = {x: x_batch,
                           y_true: y_true_batch}
        feed_dict_val = {x: x_valid_batch,
                              y_true: y_valid_batch}

        session.run(optimizer, feed_dict=feed_dict_tr)

        if i % int(10) == 0: 
            val_loss = session.run(cost, feed_dict=feed_dict_val)
            epoch = int(i / 10)    
            
            show_progress(epoch, feed_dict_tr, feed_dict_val, val_loss)
            saver.save(session, 'dogs-cats-model')
        ticker_idx += batch_size


    total_iterations += num_iteration

train(num_iteration=2000)

Training Epoch 1 --- Training Accuracy:   0.0%, Validation Accuracy:   0.0%,  Validation Loss: 3.712
Training Epoch 2 --- Training Accuracy:   7.5%, Validation Accuracy:  10.0%,  Validation Loss: 3.622
Training Epoch 3 --- Training Accuracy:   2.5%, Validation Accuracy:   0.0%,  Validation Loss: 3.776
Training Epoch 4 --- Training Accuracy:   2.5%, Validation Accuracy:  10.0%,  Validation Loss: 3.510
Training Epoch 5 --- Training Accuracy:   7.5%, Validation Accuracy:   0.0%,  Validation Loss: 3.579
Training Epoch 6 --- Training Accuracy:   2.5%, Validation Accuracy:   5.0%,  Validation Loss: 3.586
Training Epoch 7 --- Training Accuracy:   2.5%, Validation Accuracy:   5.0%,  Validation Loss: 3.587
Training Epoch 8 --- Training Accuracy:  27.5%, Validation Accuracy:  15.0%,  Validation Loss: 3.523
Training Epoch 9 --- Training Accuracy:  12.5%, Validation Accuracy:   2.5%,  Validation Loss: 3.525
Training Epoch 10 --- Training Accuracy:   2.5%, Validation Accuracy:   2.5%,  Validation L

In [264]:
@jit
def make_data(nruns,tree):
    labels = np.zeros((nruns,2),dtype=np.int32)
    dat = np.zeros((nruns,5,16,16))
    for run in range(nruns):
        currtree = _node_slider(tree)
        edgedict = get_all_admix_edges(currtree)
        edges = edgedict.keys()[np.random.randint(len(edgedict.keys()))]
        edges1 = edges
        if np.random.randint(2):
            edges1 = tuple([edges[1],edges[0]])
        ## get observed data w/o migration
        start,end = np.sort(np.random.uniform(edgedict[tuple(edges)][0],edgedict[tuple(edges)][1],2))
        observed = simcat.Model(
            tree=currtree, 
            admixture_edges=((edges1[0], edges1[1], start, end, np.random.uniform(0,.1))),
            theta=0.01,
            ntests=1,
            debug=False,
            nsnps=10000,
            )
        observed.run()
        dat[run,:,:,:] = observed.counts[0]
        labels[run,:] = edges1
        if (run % 1000 == 0):
            print(run)
    return([dat,labels])

In [265]:
dat,labs = make_data(5000,tree)

0

1000

2000

3000

4000



In [266]:
strlabs = [str(labs[i][0])+str(labs[i][1]) for i in range(len(labs))]
flat = [obj.flatten() for obj in dat]
traindat = flat[:4500]
trainlabs = strlabs[:4500]
testdat = flat[4500:]
testlabs = strlabs[4500:]

In [267]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_estimators=300,max_depth=None, random_state=1,warm_start = True)

random_forest.fit(traindat, trainlabs)

from sklearn.metrics import accuracy_score

y_predict = random_forest.predict(testdat)
accuracy_score(testlabs, y_predict)

0.692

In [269]:

# start a pool
pool = mp.Pool(processes=4)

# this does NOT BLOCK and so the code below is executed while 
# the async job is running on a different processor
asyncr = pool.apply_async(make_data, [5000,tree])
print('job was submitted')
print('waiting for result...')
print('executing print statements in this notebook while waiting...')

# collect result
asyncr.get()
print('job finished')

job was submitted
waiting for result...
executing print statements in this notebook while waiting...
0

1000

2000

3000

4000

job finished


In [272]:
dat1, labs1 = asyncr.get()

In [275]:
strlabs1 = [str(labs1[i][0])+str(labs1[i][1]) for i in range(len(labs1))]
flat1 = [obj.flatten() for obj in dat1]
traindat1 = flat1
trainlabs1 = strlabs1


In [276]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_estimators=700,max_depth=None, random_state=1,warm_start = True)

random_forest.fit(np.vstack([traindat,traindat1]), np.hstack([np.array(trainlabs),np.array(trainlabs1)]))

from sklearn.metrics import accuracy_score

y_predict = random_forest.predict(testdat)
accuracy_score(testlabs, y_predict)

0.706

### save our spot

In [283]:
#db_save = h5py.File("saved_10000snp.hdf5","w")

In [284]:
#db_save.create_dataset(data=np.vstack([dat,dat1]),name="counts")

<HDF5 dataset "counts": shape (10000, 5, 16, 16), type "<f8">

In [285]:
#db_save.create_dataset(data=np.hstack([np.array(strlabs),np.array(strlabs1)]),name="source_dest")

<HDF5 dataset "source_dest": shape (10000,), type "|S2">

In [286]:
#db_save.close()

# Run that CNN!

In [287]:
alllabs = np.hstack([np.array(strlabs),np.array(strlabs1)])
alldat = np.vstack([dat,dat1])

In [288]:
import tensorflow as tf
import time
from datetime import timedelta
import math
import random
import numpy as np
import h5py

#Adding Seed so that random initialization is consistent
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(2)


batch_size = 40
img_size = 16
num_channels = 5

In [289]:
classes = alllabs
unique_classes=np.unique(classes)
num_classes = len(unique_classes)

data = alldat

dat_new = np.array([np.stack([data[samp][i] for i in range(len(data[samp]))],axis=2) for samp in range(len(data))])

In [290]:
session = tf.Session()
x = tf.placeholder(tf.float32, shape=[None, img_size,img_size,num_channels], name='x')

## labels
y_true = tf.placeholder(tf.float32, shape=[None, num_classes], name='y_true')
y_true_cls = tf.argmax(y_true, axis=1)



##Network graph params
filter_size_conv1 = 3
num_filters_conv1 = 128

filter_size_conv2 = 3
num_filters_conv2 = 128

filter_size_conv3 = 3
num_filters_conv3 = 256
    
fc_layer_size = 512

def create_weights(shape):
    return tf.Variable(tf.truncated_normal(shape, stddev=0.05))

def create_biases(size):
    return tf.Variable(tf.constant(0.05, shape=[size]))



def create_convolutional_layer(input,
               num_input_channels, 
               conv_filter_size,        
               num_filters):  
    
    ## We shall define the weights that will be trained using create_weights function.
    weights = create_weights(shape=[conv_filter_size, conv_filter_size, num_input_channels, num_filters])
    ## We create biases using the create_biases function. These are also trained.
    biases = create_biases(num_filters)

    ## Creating the convolutional layer
    layer = tf.nn.conv2d(input=input,
                     filter=weights,
                     strides=[1, 1, 1, 1],
                     padding='SAME')

    layer += biases

    ## We shall be using max-pooling.  
    layer = tf.nn.max_pool(value=layer,
                            ksize=[1, 2, 2, 1],
                            strides=[1, 2, 2, 1],
                            padding='SAME')
    ## Output of pooling is fed to Relu which is the activation function for us.
    layer = tf.nn.relu(layer)

    return layer

    

def create_flatten_layer(layer):
    #We know that the shape of the layer will be [batch_size img_size img_size num_channels] 
    # But let's get it from the previous layer.
    layer_shape = layer.get_shape()

    ## Number of features will be img_height * img_width* num_channels. But we shall calculate it in place of hard-coding it.
    num_features = layer_shape[1:4].num_elements()

    ## Now, we Flatten the layer so we shall have to reshape to num_features
    layer = tf.reshape(layer, [-1, num_features])

    return layer


def create_fc_layer(input,          
             num_inputs,    
             num_outputs,
             use_relu=True):
    
    #Let's define trainable weights and biases.
    weights = create_weights(shape=[num_inputs, num_outputs])
    biases = create_biases(num_outputs)

    # Fully connected layer takes input x and produces wx+b.Since, these are matrices, we use matmul function in Tensorflow
    layer = tf.matmul(input, weights) + biases
    if use_relu:
        layer = tf.nn.relu(layer)

    return layer


layer_conv1 = create_convolutional_layer(input=x,
               num_input_channels=num_channels,
               conv_filter_size=filter_size_conv1,
               num_filters=num_filters_conv1)
layer_conv2 = create_convolutional_layer(input=layer_conv1,
               num_input_channels=num_filters_conv1,
               conv_filter_size=filter_size_conv2,
               num_filters=num_filters_conv2)

layer_conv3= create_convolutional_layer(input=layer_conv2,
               num_input_channels=num_filters_conv2,
               conv_filter_size=filter_size_conv3,
               num_filters=num_filters_conv3)
          
layer_flat = create_flatten_layer(layer_conv3)

layer_fc1 = create_fc_layer(input=layer_flat,
                     num_inputs=layer_flat.get_shape()[1:4].num_elements(),
                     num_outputs=fc_layer_size,
                     use_relu=True)

layer_fc2 = create_fc_layer(input=layer_fc1,
                     num_inputs=fc_layer_size,
                     num_outputs=num_classes,
                     use_relu=False) 

y_pred = tf.nn.softmax(layer_fc2,name='y_pred')

y_pred_cls = tf.argmax(y_pred, axis=1)
session.run(tf.global_variables_initializer())
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=layer_fc2,
                                                    labels=y_true)
cost = tf.reduce_mean(cross_entropy)
optimizer = tf.train.AdamOptimizer(learning_rate=1e-4).minimize(cost)
correct_prediction = tf.equal(y_pred_cls, y_true_cls)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))


session.run(tf.global_variables_initializer()) 

In [291]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
# define example
#data = ['cold', 'cold', 'warm', 'cold', 'hot', 'hot', 'warm', 'cold', 'warm', 'hot']
#values = array(data)
#print(values)
# integer encode
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(classes)
print(integer_encoded)
# binary encode
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded).astype(np.int8)

[28 10 15 ... 13 39 19]


In [292]:
def get_batch(arr,classes,number):
    idxs = np.random.choice(len(arr),replace=False,size=number)
    return([arr[idxs],classes[idxs]])

def show_progress(epoch, feed_dict_train, feed_dict_validate, val_loss):
    acc = session.run(accuracy, feed_dict=feed_dict_train)
    val_acc = session.run(accuracy, feed_dict=feed_dict_validate)
    msg = "Training Epoch {0} --- Training Accuracy: {1:>6.1%}, Validation Accuracy: {2:>6.1%},  Validation Loss: {3:.3f}"
    print(msg.format(epoch + 1, acc, val_acc, val_loss))

total_iterations = 0
ticker_idx = 0
saver = tf.train.Saver()
def train(num_iteration):
    global total_iterations
    global ticker_idx
    for i in range(total_iterations,
                   total_iterations + num_iteration):

        #x_batch, y_true_batch, _, cls_batch = data.train.next_batch(batch_size)
        x_batch, y_true_batch = get_batch(dat_new[:9500],onehot_encoded[:9500],batch_size)
        #x_valid_batch, y_valid_batch, _, valid_cls_batch = data.valid.next_batch(batch_size)
        x_valid_batch, y_valid_batch = get_batch(dat_new[9500:],onehot_encoded[9500:],batch_size)
        
        feed_dict_tr = {x: x_batch,
                           y_true: y_true_batch}
        feed_dict_val = {x: x_valid_batch,
                              y_true: y_valid_batch}

        session.run(optimizer, feed_dict=feed_dict_tr)

        if i % int(10) == 0: 
            val_loss = session.run(cost, feed_dict=feed_dict_val)
            epoch = int(i / 10)    
            
            show_progress(epoch, feed_dict_tr, feed_dict_val, val_loss)
            saver.save(session, 'dogs-cats-model')
        ticker_idx += batch_size


    total_iterations += num_iteration

train(num_iteration=2000)

Training Epoch 1 --- Training Accuracy:   2.5%, Validation Accuracy:   0.0%,  Validation Loss: 3.725
Training Epoch 2 --- Training Accuracy:   0.0%, Validation Accuracy:   0.0%,  Validation Loss: 3.642
Training Epoch 3 --- Training Accuracy:   2.5%, Validation Accuracy:   2.5%,  Validation Loss: 3.597
Training Epoch 4 --- Training Accuracy:  10.0%, Validation Accuracy:   2.5%,  Validation Loss: 3.578
Training Epoch 5 --- Training Accuracy:   2.5%, Validation Accuracy:   7.5%,  Validation Loss: 3.554
Training Epoch 6 --- Training Accuracy:  10.0%, Validation Accuracy:   7.5%,  Validation Loss: 3.602
Training Epoch 7 --- Training Accuracy:   7.5%, Validation Accuracy:  20.0%,  Validation Loss: 3.446
Training Epoch 8 --- Training Accuracy:  22.5%, Validation Accuracy:  15.0%,  Validation Loss: 3.455
Training Epoch 9 --- Training Accuracy:  17.5%, Validation Accuracy:  15.0%,  Validation Loss: 3.372
Training Epoch 10 --- Training Accuracy:  27.5%, Validation Accuracy:  12.5%,  Validation L

# I kind of think we might benefit from excluding some of the ridiculous simulations that would be hard to detect

In [4]:
#@jit
def make_data_easy(nruns,tree):
    labels = np.zeros((nruns,2),dtype=np.int32)
    dat = np.zeros((nruns,5,16,16))
    for run in range(nruns):
        currtree = _node_slider(tree)
        edgedict = get_all_admix_edges(currtree)
        start,end = [0,0]
        while (end-start < .05):
            edges = edgedict.keys()[np.random.randint(len(edgedict.keys()))]
            edges1 = edges
            if np.random.randint(2):
                edges1 = tuple([edges[1],edges[0]])
            ## get observed data w/o migration
            start,end = np.sort(np.random.uniform(edgedict[tuple(edges)][0],edgedict[tuple(edges)][1],2))
        observed = simcat.Model(
            tree=currtree, 
            admixture_edges=((edges1[0], edges1[1], start, end, np.random.uniform(0.05,.1))),
            theta=0.01,
            ntests=1,
            debug=False,
            )
        observed.run()
        dat[run,:,:,:] = observed.counts[0]
        labels[run,:] = edges1
        if (run % 1000 == 0):
            print(run)
    return([dat,labels])

In [14]:
dat,labs = make_data_easy(nruns=5000,tree=tree)

In [15]:
strlabs = [str(labs[i][0])+str(labs[i][1]) for i in range(len(labs))]
flat = [obj.flatten() for obj in dat]
traindat = flat[:4500]
trainlabs = strlabs[:4500]
testdat = flat[4500:]
testlabs = strlabs[4500:]

In [16]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_estimators=300,max_depth=None, random_state=1,warm_start = True)

random_forest.fit(traindat, trainlabs)

from sklearn.metrics import accuracy_score

y_predict = random_forest.predict(testdat)
accuracy_score(testlabs, y_predict)

0.596

Hmm. That's surprising.

Maybe we train a model on hard data first, then try it on this "easy" data

In [17]:
db_save = h5py.File("saved_10000snp.hdf5","r+")

In [18]:
db_save.keys()

[u'counts', u'source_dest']

In [22]:
dat1= db_save['counts']
flat1 = [obj.flatten() for obj in dat1]

In [20]:
labs1= db_save['source_dest']

In [25]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_estimators=50,max_depth=None, random_state=1,warm_start = True)

random_forest.fit(flat1, labs1)

from sklearn.metrics import accuracy_score

y_predict = random_forest.predict(flat)
accuracy_score(strlabs, y_predict)

0.5372

# Let's make hard training data for the model, then evaluate on the same easy data (same number snps)

In [26]:
@jit
def make_data(nruns,tree):
    labels = np.zeros((nruns,2),dtype=np.int32)
    dat = np.zeros((nruns,5,16,16))
    for run in range(nruns):
        currtree = _node_slider(tree)
        edgedict = get_all_admix_edges(currtree)
        edges = edgedict.keys()[np.random.randint(len(edgedict.keys()))]
        edges1 = edges
        if np.random.randint(2):
            edges1 = tuple([edges[1],edges[0]])
        ## get observed data w/o migration
        start,end = np.sort(np.random.uniform(edgedict[tuple(edges)][0],edgedict[tuple(edges)][1],2))
        observed = simcat.Model(
            tree=currtree, 
            admixture_edges=((edges1[0], edges1[1], start, end, np.random.uniform(0,.1))),
            theta=0.01,
            ntests=1,
            debug=False,
            )
        observed.run()
        dat[run,:,:,:] = observed.counts[0]
        labels[run,:] = edges1
        if (run % 1000 == 0):
            print(run)
    return([dat,labels])

In [27]:
dat1,labs1 = make_data(5000,tree)



0

1000

2000

3000

4000



In [31]:
flat1 = [obj.flatten() for obj in dat1]
strlabs1 = [str(labs1[i][0])+str(labs1[i][1]) for i in range(len(labs1))]

In [32]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_estimators=300,max_depth=None, random_state=1,warm_start = True)

random_forest.fit(flat1, strlabs1)

from sklearn.metrics import accuracy_score

y_predict = random_forest.predict(flat)
accuracy_score(strlabs, y_predict)

0.5642

This is dumb. Maybe it's overloaded? Let's make test data with just a little introgression and see how this changes...

In [42]:
#@jit
def make_data_hard(nruns,tree):
    labels = np.zeros((nruns,2),dtype=np.int32)
    dat = np.zeros((nruns,5,16,16))
    for run in range(nruns):
        currtree = _node_slider(tree)
        edgedict = get_all_admix_edges(currtree)
        start,end = [0,0]
        while (end-start < .08):
            edges = edgedict.keys()[np.random.randint(len(edgedict.keys()))]
            edges1 = edges
            if np.random.randint(2):
                edges1 = tuple([edges[1],edges[0]])
            ## get observed data w/o migration
            start,end = np.sort(np.random.uniform(edgedict[tuple(edges)][0],edgedict[tuple(edges)][1],2))
        observed = simcat.Model(
            tree=currtree, 
            admixture_edges=((edges1[0], edges1[1], start, end, np.random.uniform(0,.3))),
            theta=0.01,
            ntests=1,
            debug=False,
            )
        observed.run()
        dat[run,:,:,:] = observed.counts[0]
        labels[run,:] = edges1
        if (run % 1000 == 0):
            print(run)
    return([dat,labels])

In [43]:
dat,labs = make_data_hard(nruns=5000,tree=tree)

0
1000
2000
3000
4000


In [44]:
strlabs = [str(labs[i][0])+str(labs[i][1]) for i in range(len(labs))]
flat = [obj.flatten() for obj in dat]

In [48]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_estimators=300,max_depth=None, random_state=1)

random_forest.fit(flat1, strlabs1)

from sklearn.metrics import accuracy_score

y_predict = random_forest.predict(flat)
accuracy_score(strlabs, y_predict)

0.5496

In [49]:
#@jit
def make_data_hard(nruns,tree):
    labels = np.zeros((nruns,2),dtype=np.int32)
    dat = np.zeros((nruns,5,16,16))
    for run in range(nruns):
        currtree = _node_slider(tree)
        edgedict = get_all_admix_edges(currtree)
        start,end = [0,0]
        while (end-start < .15):
            edges = edgedict.keys()[np.random.randint(len(edgedict.keys()))]
            edges1 = edges
            if np.random.randint(2):
                edges1 = tuple([edges[1],edges[0]])
            ## get observed data w/o migration
            start,end = np.sort(np.random.uniform(edgedict[tuple(edges)][0],edgedict[tuple(edges)][1],2))
        observed = simcat.Model(
            tree=currtree, 
            admixture_edges=((edges1[0], edges1[1], start, end, np.random.uniform(0,.3))),
            theta=0.01,
            ntests=1,
            debug=False,
            )
        observed.run()
        dat[run,:,:,:] = observed.counts[0]
        labels[run,:] = edges1
        if (run % 1000 == 0):
            print(run)
    return([dat,labels])

In [50]:
dat,labs = make_data_hard(nruns=5000,tree=tree)

0
1000
2000
3000
4000


In [51]:
strlabs = [str(labs[i][0])+str(labs[i][1]) for i in range(len(labs))]
flat = [obj.flatten() for obj in dat]

In [52]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_estimators=300,max_depth=None, random_state=1)

random_forest.fit(flat1, strlabs1)

from sklearn.metrics import accuracy_score

y_predict = random_forest.predict(flat)
accuracy_score(strlabs, y_predict)

0.5136

Maybe adjust timing relative to nodes instead...

In [53]:
#@jit
def make_data_hard(nruns,tree):
    labels = np.zeros((nruns,2),dtype=np.int32)
    dat = np.zeros((nruns,5,16,16))
    for run in range(nruns):
        currtree = _node_slider(tree)
        edgedict = get_all_admix_edges(currtree)
        start,end = [0,0]
        while (end-start < .01):
            edges = edgedict.keys()[np.random.randint(len(edgedict.keys()))]
            edges1 = edges
            if np.random.randint(2):
                edges1 = tuple([edges[1],edges[0]])
            ## get observed data w/o migration
            if (edgedict[tuple(edges)][1] - edgedict[tuple(edges)][0] > .2):
                start,end = np.sort(np.random.uniform(edgedict[tuple(edges)][0]+.1,edgedict[tuple(edges)][1],2)-.1)
        observed = simcat.Model(
            tree=currtree, 
            admixture_edges=((edges1[0], edges1[1], start, end, np.random.uniform(0,.2))),
            theta=0.01,
            ntests=1,
            debug=False,
            )
        observed.run()
        dat[run,:,:,:] = observed.counts[0]
        labels[run,:] = edges1
        if (run % 1000 == 0):
            print(run)
    return([dat,labels])

In [54]:
dat,labs = make_data_hard(nruns=5000,tree=tree)

0
1000
2000
3000
4000


In [55]:
strlabs = [str(labs[i][0])+str(labs[i][1]) for i in range(len(labs))]
flat = [obj.flatten() for obj in dat]

In [56]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_estimators=300,max_depth=None, random_state=1)

random_forest.fit(flat1, strlabs1)

from sklearn.metrics import accuracy_score

y_predict = random_forest.predict(flat)
accuracy_score(strlabs, y_predict)

0.534

Overnight runs:

In [430]:
@jit
def make_data(nruns,tree):
    labels = np.zeros((nruns,2),dtype=np.int32)
    dat = np.zeros((nruns,5,16,16))
    for run in range(nruns):
        currtree = _node_slider(tree)
        edgedict = get_all_admix_edges(currtree)
        edges = edgedict.keys()[np.random.randint(len(edgedict.keys()))]
        edges1 = edges
        if np.random.randint(2):
            edges1 = tuple([edges[1],edges[0]])
        ## get observed data w/o migration
        start,end = np.sort(np.random.uniform(edgedict[tuple(edges)][0],edgedict[tuple(edges)][1],2))
        observed = simcat.Model(
            tree=currtree, 
            admixture_edges=((edges1[0], edges1[1], start, end, np.random.uniform(0,.1))),
            theta=0.01,
            ntests=1,
            debug=False,
            nsnps=10000,
            )
        observed.run()
        dat[run,:,:,:] = observed.counts[0]
        labels[run,:] = edges1
        if (run % 1000 == 0):
            print(run)
    return([dat,labels])

In [431]:
dat,labs = make_data(10000,tree)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000


In [438]:
strlabs = [str(labs[i][0])+str(labs[i][1]) for i in range(len(labs))]

In [434]:
db_save1 = h5py.File("saved_10000snp1.hdf5","r+")

In [439]:
db_save2 = h5py.File("saved_10000snp2.hdf5","w")

In [440]:
db_save2.create_dataset(data=np.vstack([db_save1['counts'],dat]),name='counts')

<HDF5 dataset "counts": shape (30000, 5, 16, 16), type "<f8">

In [442]:
db_save2.create_dataset(data=np.hstack([db_save1['source_dest'],np.array(strlabs)]),name='source_dest')

<HDF5 dataset "source_dest": shape (30000,), type "|S2">

In [443]:
db_save1.close()

In [444]:
db_save2.close()

Now let's see if the extra 10000 helps us...

In [445]:
db_save1 = h5py.File("saved_10000snp2.hdf5","r+")
dat = db_save1['counts']
flat = [obj.flatten() for obj in dat]
strlabs = db_save1['source_dest']

In [455]:
sortedlabs = np.array([''.join(sorted(strlab)) for strlab in strlabs])

In [458]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_estimators=300,max_depth=None, random_state=1)

random_forest.fit(flat[:19500], strlabs[:19500])

from sklearn.metrics import accuracy_score

y_predict = random_forest.predict(flat[19500:20000])
accuracy_score(strlabs[19500:20000], y_predict)

0.732

In [459]:
from sklearn.metrics import confusion_matrix
import toyplot
ckwargs = {
    "height": 750,
    "width": 750,
}
canvas = toyplot.Canvas(**ckwargs)
table = canvas.matrix(confusion_matrix(strlabs[19500:20000], y_predict,labels = random_forest.classes_), margin=20)

import toyplot.pdf
toyplot.pdf.render(canvas, "conf_rf.pdf")

In [469]:
from sklearn.tree import export_graphviz
import os

In [477]:
for tree_in_forest in random_forest.estimators_[0:1]:
    export_graphviz(tree_in_forest,
                    feature_names=range(1280),
                    filled=True,
                    rounded=True)
os.system('dot -Tpng tree.dot -o tree.png')



0

In [96]:
from sklearn.svm import SVC
clf = SVC(C=500,gamma=1)
clf.fit(flat[:19500], sortedlabs[:19500]) 

SVC(C=500, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [97]:
from sklearn.metrics import accuracy_score

y_predict = clf.predict(flat[19500:])
accuracy_score(sortedlabs[19500:], y_predict)

0.856

In [98]:
import tensorflow as tf
import time
from datetime import timedelta
import math
import random
import numpy as np
import h5py

#Adding Seed so that random initialization is consistent
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(2)


batch_size = 40
img_size = 16
num_channels = 5

In [99]:
classes = strlabs
unique_classes=np.unique(classes)
num_classes = len(unique_classes)

data = dat

dat_new = np.array([np.stack([data[samp][i] for i in range(len(data[samp]))],axis=2) for samp in range(len(data))])

In [100]:
session = tf.Session()
x = tf.placeholder(tf.float32, shape=[None, img_size,img_size,num_channels], name='x')

## labels
y_true = tf.placeholder(tf.float32, shape=[None, num_classes], name='y_true')
y_true_cls = tf.argmax(y_true, axis=1)



##Network graph params
filter_size_conv1 = 5
num_filters_conv1 = 32

filter_size_conv2 = 5
num_filters_conv2 = 32

filter_size_conv3 = 5
num_filters_conv3 = 64
    
fc_layer_size = 128

def create_weights(shape):
    return tf.Variable(tf.truncated_normal(shape, stddev=0.05))

def create_biases(size):
    return tf.Variable(tf.constant(0.05, shape=[size]))



def create_convolutional_layer(input,
               num_input_channels, 
               conv_filter_size,        
               num_filters):  
    
    ## We shall define the weights that will be trained using create_weights function.
    weights = create_weights(shape=[conv_filter_size, conv_filter_size, num_input_channels, num_filters])
    ## We create biases using the create_biases function. These are also trained.
    biases = create_biases(num_filters)

    ## Creating the convolutional layer
    layer = tf.nn.conv2d(input=input,
                     filter=weights,
                     strides=[1, 1, 1, 1],
                     padding='SAME')

    layer += biases

    ## We shall be using max-pooling.  
    layer = tf.nn.max_pool(value=layer,
                            ksize=[1, 2, 2, 1],
                            strides=[1, 2, 2, 1],
                            padding='SAME')
    ## Output of pooling is fed to Relu which is the activation function for us.
    layer = tf.nn.relu(layer)

    return layer

    

def create_flatten_layer(layer):
    #We know that the shape of the layer will be [batch_size img_size img_size num_channels] 
    # But let's get it from the previous layer.
    layer_shape = layer.get_shape()

    ## Number of features will be img_height * img_width* num_channels. But we shall calculate it in place of hard-coding it.
    num_features = layer_shape[1:4].num_elements()

    ## Now, we Flatten the layer so we shall have to reshape to num_features
    layer = tf.reshape(layer, [-1, num_features])

    return layer


def create_fc_layer(input,          
             num_inputs,    
             num_outputs,
             use_relu=True):
    
    #Let's define trainable weights and biases.
    weights = create_weights(shape=[num_inputs, num_outputs])
    biases = create_biases(num_outputs)

    # Fully connected layer takes input x and produces wx+b.Since, these are matrices, we use matmul function in Tensorflow
    layer = tf.matmul(input, weights) + biases
    if use_relu:
        layer = tf.nn.relu(layer)

    return layer


layer_conv1 = create_convolutional_layer(input=x,
               num_input_channels=num_channels,
               conv_filter_size=filter_size_conv1,
               num_filters=num_filters_conv1)
layer_conv2 = create_convolutional_layer(input=layer_conv1,
               num_input_channels=num_filters_conv1,
               conv_filter_size=filter_size_conv2,
               num_filters=num_filters_conv2)

layer_conv3= create_convolutional_layer(input=layer_conv2,
               num_input_channels=num_filters_conv2,
               conv_filter_size=filter_size_conv3,
               num_filters=num_filters_conv3)
          
layer_flat = create_flatten_layer(layer_conv3)

layer_fc1 = create_fc_layer(input=layer_flat,
                     num_inputs=layer_flat.get_shape()[1:4].num_elements(),
                     num_outputs=fc_layer_size,
                     use_relu=True)

layer_fc2 = create_fc_layer(input=layer_fc1,
                     num_inputs=fc_layer_size,
                     num_outputs=num_classes,
                     use_relu=False) 

y_pred = tf.nn.softmax(layer_fc2,name='y_pred')

y_pred_cls = tf.argmax(y_pred, axis=1)
session.run(tf.global_variables_initializer())
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=layer_fc2,
                                                    labels=y_true)
cost = tf.reduce_mean(cross_entropy)
optimizer = tf.train.AdamOptimizer(learning_rate=1e-4).minimize(cost)
correct_prediction = tf.equal(y_pred_cls, y_true_cls)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))


session.run(tf.global_variables_initializer()) 

In [101]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
# define example
#data = ['cold', 'cold', 'warm', 'cold', 'hot', 'hot', 'warm', 'cold', 'warm', 'hot']
#values = array(data)
#print(values)
# integer encode
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(classes)
print(integer_encoded)
# binary encode
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded).astype(np.int8)

[28 10 15 ... 30 30 16]


In [103]:
def get_batch(arr,classes,number):
    idxs = np.random.choice(len(arr),replace=False,size=number)
    return([arr[idxs],classes[idxs]])

def show_progress(epoch, feed_dict_train, feed_dict_validate, val_loss):
    acc = session.run(accuracy, feed_dict=feed_dict_train)
    val_acc = session.run(accuracy, feed_dict=feed_dict_validate)
    msg = "Training Epoch {0} --- Training Accuracy: {1:>6.1%}, Validation Accuracy: {2:>6.1%},  Validation Loss: {3:.3f}"
    print(msg.format(epoch + 1, acc, val_acc, val_loss))

total_iterations = 0
ticker_idx = 0
saver = tf.train.Saver()
def train(num_iteration):
    global total_iterations
    global ticker_idx
    for i in range(total_iterations,
                   total_iterations + num_iteration):

        #x_batch, y_true_batch, _, cls_batch = data.train.next_batch(batch_size)
        x_batch, y_true_batch = get_batch(dat_new[:9500],onehot_encoded[:9500],batch_size)
        #x_valid_batch, y_valid_batch, _, valid_cls_batch = data.valid.next_batch(batch_size)
        x_valid_batch, y_valid_batch = get_batch(dat_new[9500:],onehot_encoded[9500:],batch_size)
        
        feed_dict_tr = {x: x_batch,
                           y_true: y_true_batch}
        feed_dict_val = {x: x_valid_batch,
                              y_true: y_valid_batch}

        session.run(optimizer, feed_dict=feed_dict_tr)

        if i % int(10) == 0: 
            val_loss = session.run(cost, feed_dict=feed_dict_val)
            epoch = int(i / 10)    
            
            show_progress(epoch, feed_dict_tr, feed_dict_val, val_loss)
            saver.save(session, 'dogs-cats-model')
        ticker_idx += batch_size


    total_iterations += num_iteration

train(num_iteration=10000)

Training Epoch 1 --- Training Accuracy:  72.5%, Validation Accuracy:  67.5%,  Validation Loss: 1.040
Training Epoch 2 --- Training Accuracy:  77.5%, Validation Accuracy:  67.5%,  Validation Loss: 0.993
Training Epoch 3 --- Training Accuracy:  70.0%, Validation Accuracy:  55.0%,  Validation Loss: 1.000
Training Epoch 4 --- Training Accuracy:  57.5%, Validation Accuracy:  70.0%,  Validation Loss: 1.020
Training Epoch 5 --- Training Accuracy:  67.5%, Validation Accuracy:  57.5%,  Validation Loss: 1.050
Training Epoch 6 --- Training Accuracy:  75.0%, Validation Accuracy:  65.0%,  Validation Loss: 1.113
Training Epoch 7 --- Training Accuracy:  65.0%, Validation Accuracy:  65.0%,  Validation Loss: 0.917
Training Epoch 8 --- Training Accuracy:  62.5%, Validation Accuracy:  65.0%,  Validation Loss: 1.068
Training Epoch 9 --- Training Accuracy:  55.0%, Validation Accuracy:  62.5%,  Validation Loss: 0.955
Training Epoch 10 --- Training Accuracy:  62.5%, Validation Accuracy:  65.0%,  Validation L

# New model

In [118]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import tensorflow as tf

tf.logging.set_verbosity(tf.logging.INFO)


def cnn_model_fn(features, labels, mode):
    """Model function for CNN."""
    # Input Layer
    # Reshape X to 4-D tensor: [batch_size, width, height, channels]
    # MNIST images are 28x28 pixels, and have one color channel
    input_layer = tf.reshape(features['x'], [-1, 16, 16, 5])

    # Convolutional Layer #1
    # Computes 32 features using a 5x5 filter with ReLU activation.
    # Padding is added to preserve width and height.
    # Input Tensor Shape: [batch_size, 28, 28, 1]
    # Output Tensor Shape: [batch_size, 28, 28, 32]
    conv1 = tf.layers.conv2d(
      inputs=input_layer,
      filters=32,
      kernel_size=[5, 5],
      padding="same",
      activation=tf.nn.relu)

    # Pooling Layer #1
    # First max pooling layer with a 2x2 filter and stride of 2
    # Input Tensor Shape: [batch_size, 16, 16, 32]
    # Output Tensor Shape: [batch_size, 8, 8, 32]
    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)

    # Convolutional Layer #2
    # Computes 64 features using a 5x5 filter.
    # Padding is added to preserve width and height.
    # Input Tensor Shape: [batch_size, 14, 14, 32]
    # Output Tensor Shape: [batch_size, 14, 14, 64]
    conv2 = tf.layers.conv2d(
      inputs=pool1,
      filters=64,
      kernel_size=[5, 5],
      padding="same",
      activation=tf.nn.relu)

    # Pooling Layer #2
    # Second max pooling layer with a 2x2 filter and stride of 2
    # Input Tensor Shape: [batch_size, 8, 8, 64]
    # Output Tensor Shape: [batch_size, 4, 4, 64]
    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)

    # Flatten tensor into a batch of vectors
    # Input Tensor Shape: [batch_size, 4, 4, 64]
    # Output Tensor Shape: [batch_size, 4 * 4 * 64]
    pool2_flat = tf.reshape(pool2, [-1, 4 * 4 * 64])

    # Dense Layer
    # Densely connected layer with 1024 neurons
    # Input Tensor Shape: [batch_size, 7 * 7 * 64]
    # Output Tensor Shape: [batch_size, 1024]
    dense = tf.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu)

    # Add dropout operation; 0.6 probability that element will be kept
    dropout = tf.layers.dropout(
      inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN)

    # Logits layer
    # Input Tensor Shape: [batch_size, 1024]
    # Output Tensor Shape: [batch_size, 40]
    logits = tf.layers.dense(inputs=dropout, units=40)

    predictions = {
      # Generate predictions (for PREDICT and EVAL mode)
      "classes": tf.argmax(input=logits, axis=1),
      # Add `softmax_tensor` to the graph. It is used for PREDICT and by the
      # `logging_hook`.
      "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
    }
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Calculate Loss (for both TRAIN and EVAL modes)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

    # Configure the Training Op (for TRAIN mode)
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
        train_op = optimizer.minimize(
            loss=loss,
            global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    # Add evaluation metrics (for EVAL mode)
    eval_metric_ops = {
      "accuracy": tf.metrics.accuracy(
          labels=labels, predictions=predictions["classes"])}
    return tf.estimator.EstimatorSpec(
      mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)


def main(dataset,labels):
    # Load training and eval data
    #mnist = tf.contrib.learn.datasets.load_dataset("mnist")
    train_data = dataset[:19500]  # Returns np.array
    train_labels = labels[:19500]
    eval_data = dataset[19500:] # Returns np.array
    eval_labels = labels[19500:]

    # Create the Estimator
    mnist_classifier = tf.estimator.Estimator(
      model_fn=cnn_model_fn, model_dir="./tensorflow/modeltest1")

    # Set up logging for predictions
    # Log the values in the "Softmax" tensor with label "probabilities"
    tensors_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(
      tensors=tensors_to_log, every_n_iter=50)

    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
      x={"x": train_data},
      y=train_labels,
      batch_size=100,
      num_epochs=None,
      shuffle=True)
    mnist_classifier.train(
      input_fn=train_input_fn,
      steps=20000,
      hooks=[logging_hook])

    # Evaluate the model and print results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
      x={"x": eval_data},
      y=eval_labels,
      num_epochs=1,
      shuffle=False)
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)



In [119]:
integer_encoded = label_encoder.fit_transform(classes)

In [123]:
mod = main(np.array(dat).astype(np.float32), integer_encoded)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_tf_random_seed': 1, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_save_checkpoints_steps': None, '_model_dir': './tensorflow/modeltest1', '_save_summary_steps': 100}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from ./tensorflow/modeltest1/model.ckpt-60000
INFO:tensorflow:Saving checkpoints for 60001 into ./tensorflow/modeltest1/model.ckpt.
INFO:tensorflow:probabilities = [[0.         0.11036393 0.12587115 ... 0.00000011 0.00000001 0.00000062]
 [0.00000259 0.03310552 0.01551652 ... 0.00002701 0.00002927 0.00021206]
 [0.00000001 0.         0.         ... 0.00007793 0.00000034 0.00000011]
 ...
 [0.         0.00000072 0.23157246 ... 0.000023   0.00001209 0.00000006]
 [0.         0.00000347 0.         ... 0.         0.00000005 0.00000065]
 [0.00003578 0.         0.         ...

# Let's load a prediction version of our model

In [162]:
def cnn_model_fn_pred(features,mode):
    """Model function for CNN."""
    # Input Layer
    # Reshape X to 4-D tensor: [batch_size, width, height, channels]
    # MNIST images are 28x28 pixels, and have one color channel
    input_layer = tf.reshape(features['x'], [-1, 16, 16, 5])

    # Convolutional Layer #1
    # Computes 32 features using a 5x5 filter with ReLU activation.
    # Padding is added to preserve width and height.
    # Input Tensor Shape: [batch_size, 28, 28, 1]
    # Output Tensor Shape: [batch_size, 28, 28, 32]
    conv1 = tf.layers.conv2d(
      inputs=input_layer,
      filters=32,
      kernel_size=[5, 5],
      padding="same",
      activation=tf.nn.relu)

    # Pooling Layer #1
    # First max pooling layer with a 2x2 filter and stride of 2
    # Input Tensor Shape: [batch_size, 16, 16, 32]
    # Output Tensor Shape: [batch_size, 8, 8, 32]
    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)

    # Convolutional Layer #2
    # Computes 64 features using a 5x5 filter.
    # Padding is added to preserve width and height.
    # Input Tensor Shape: [batch_size, 14, 14, 32]
    # Output Tensor Shape: [batch_size, 14, 14, 64]
    conv2 = tf.layers.conv2d(
      inputs=pool1,
      filters=64,
      kernel_size=[5, 5],
      padding="same",
      activation=tf.nn.relu)

    # Pooling Layer #2
    # Second max pooling layer with a 2x2 filter and stride of 2
    # Input Tensor Shape: [batch_size, 8, 8, 64]
    # Output Tensor Shape: [batch_size, 4, 4, 64]
    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)

    # Flatten tensor into a batch of vectors
    # Input Tensor Shape: [batch_size, 4, 4, 64]
    # Output Tensor Shape: [batch_size, 4 * 4 * 64]
    pool2_flat = tf.reshape(pool2, [-1, 4 * 4 * 64])

    # Dense Layer
    # Densely connected layer with 1024 neurons
    # Input Tensor Shape: [batch_size, 7 * 7 * 64]
    # Output Tensor Shape: [batch_size, 1024]
    dense = tf.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu)

    # Add dropout operation; 0.6 probability that element will be kept
    dropout = tf.layers.dropout(
      inputs=dense, rate=0.4, training=False)

    # Logits layer
    # Input Tensor Shape: [batch_size, 1024]
    # Output Tensor Shape: [batch_size, 40]
    logits = tf.layers.dense(inputs=dropout, units=40)
    predictions = {
      # Generate predictions (for PREDICT and EVAL mode)
      "classes": tf.argmax(input=logits, axis=1),
      # Add `softmax_tensor` to the graph. It is used for PREDICT and by the
      # `logging_hook`.
      "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
    }
    return tf.estimator.EstimatorSpec(mode=tf.estimator.ModeKeys.PREDICT, predictions=predictions)

In [163]:
mnist_classifier = tf.estimator.Estimator(
model_fn=cnn_model_fn_pred, model_dir="./tensorflow/modeltest1")

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_tf_random_seed': 1, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_save_checkpoints_steps': None, '_model_dir': './tensorflow/modeltest1', '_save_summary_steps': 100}


In [164]:
predict_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": dat[0].astype(np.float32)},
    num_epochs=1,
    shuffle=False)

In [165]:
predict_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": dat[0].astype(np.float32)},
    num_epochs=1,
    shuffle=False)
predictions = list(mnist_classifier.predict(input_fn=predict_input_fn))

INFO:tensorflow:Restoring parameters from ./tensorflow/modeltest1/model.ckpt-80000


In [166]:
predictions

[{'classes': 28,
  'probabilities': array([1.50456403e-09, 1.12037268e-09, 4.45148302e-03, 6.31341618e-03,
         2.29570107e-03, 1.00808268e-06, 3.83659815e-09, 1.63813326e-13,
         5.40724088e-09, 6.65748034e-09, 1.19852302e-07, 3.37088157e-09,
         1.13630626e-10, 7.52562431e-15, 3.17103496e-11, 5.54883071e-13,
         2.27754859e-09, 6.13497725e-11, 9.85714723e-06, 1.13124600e-10,
         1.96008140e-12, 6.13012885e-09, 3.05936396e-08, 4.26581119e-06,
         6.14165385e-10, 2.41652943e-13, 9.41949274e-09, 1.25152226e-08,
         9.85912740e-01, 2.78220468e-05, 9.00902630e-10, 7.04370555e-04,
         1.58494245e-07, 4.67399952e-10, 6.12809492e-09, 2.59411437e-10,
         5.54109065e-05, 1.26491941e-04, 9.71220943e-05, 2.03518677e-08],
        dtype=float32)}]

In [174]:
label_encoder.classes_[28]

'50'

# Now let's train a random forest

In [175]:
db_save1 = h5py.File("saved_10000snp1.hdf5","r+")
dat = db_save1['counts']
flat = [obj.flatten() for obj in dat]
strlabs = db_save1['source_dest']

In [176]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_estimators=300,max_depth=None, random_state=1)

random_forest.fit(flat, strlabs)

from sklearn.metrics import accuracy_score

y_predict = random_forest.predict(flat[19500:])
accuracy_score(strlabs[19500:], y_predict)

1.0

Now let's see if the models mess up on the same trees

In [397]:
@jit
def make_data(nruns,tree):
    labels = np.zeros((nruns,2),dtype=np.int32)
    dat = np.zeros((nruns,5,16,16))
    for run in range(nruns):
        currtree = _node_slider(tree)
        edgedict = get_all_admix_edges(currtree)
        vals=edgedict.values()
        #print(max([i[1]-i[0] for i in vals]))
        edges = edgedict.keys()[np.argmin([i[1]-i[0] for i in vals])]
        #edges = edgedict.keys()[np.random.randint(len(edgedict.keys()))]
        edges1 = edges
        if np.random.randint(2):
            edges1 = tuple([edges[1],edges[0]])
        ## get observed data w/o migration
        start,end = np.sort(np.random.uniform(edgedict[tuple(edges)][0],edgedict[tuple(edges)][1],2))
        if (end-start) > .03:
            end = start +.03
        observed = simcat.Model(
            tree=currtree, 
            admixture_edges=((edges1[0], edges1[1], start, end, np.random.uniform(0.8,.1))),
            theta=0.01,
            ntests=1,
            debug=False,
            nsnps=10000,
            )
        observed.run()
        dat[run,:,:,:] = observed.counts[0]
        labels[run,:] = edges1
        if (run % 1000 == 0):
            print(run)
    return([dat,labels])

In [398]:
testing_dat, testing_labs = make_data(100,tree)

0


In [399]:
rf_predict = random_forest.predict([obj.flatten() for obj in testing_dat])

In [400]:
predict_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": testing_dat.astype(np.float32)},
    num_epochs=1,
    shuffle=False)
cnn_predictions = list(mnist_classifier.predict(input_fn=predict_input_fn))

INFO:tensorflow:Restoring parameters from ./tensorflow/modeltest1/model.ckpt-80000


In [401]:
int_cnn_preds= [i['classes'] for i in cnn_predictions]

In [402]:
cnn_predict= np.array([label_encoder.classes_[i] for i in int_cnn_preds])

In [403]:
strlabs = np.array([str(testing_labs[i][0])+str(testing_labs[i][1]) for i in range(len(testing_labs))])

In [404]:
rfprobs = random_forest.predict_proba([obj.flatten() for obj in testing_dat])

In [405]:
cnnprobs =[i['probabilities'] for i in cnn_predictions]

In [406]:
mult=cnnprobs*rfprobs

In [407]:
maxmult = [np.argmax(i) for i in mult]

In [408]:
predlabs = np.array([label_encoder.classes_[i] for i in maxmult])

In [409]:
counter = 0
for i in range(len(strlabs)):
    if strlabs[i] == cnn_predict[i]:
        counter += 1
    elif strlabs[i] == rf_predict[i]:
        counter += 1

In [410]:
counter

83

In [411]:
sum(cnn_predict == strlabs)

68

In [412]:
sum(rf_predict == strlabs)

79

In [413]:
sum(np.array(["".join(sorted(i)) for i in cnn_predict]) == np.array(["".join(sorted(i)) for i in strlabs]))

82

In [414]:
sum(np.array(["".join(sorted(i)) for i in rf_predict]) == np.array(["".join(sorted(i)) for i in strlabs]))

86

In [478]:
random_forest

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)