In [58]:
import h5py
import simcat
import numpy as np
import tensorflow as tf
from numba import jit
import toytree
from copy import deepcopy

Define some basic functions:

In [56]:
# sample new nodes
def _node_slider(tree):
        ctree = deepcopy(tree)
        for node in ctree.tree.traverse():

            ## slide internal nodes 
            if node.up and node.children:

                ## get min and max slides
                minjit = max([i.dist for i in node.children]) * 0.99
                maxjit = (node.up.height * 0.99) - node.height
                newheight = np.random.uniform(low=-minjit, high=maxjit)

                ## slide children
                for child in node.children:
                    child.dist += newheight

                ## slide self to match
                node.dist -= newheight

        ## make max height = 1
        mod = ctree.tree.height
        for node in ctree.tree.traverse():
            node.dist = node.dist / float(mod)

        return ctree

# find all possible admixture edges on the tree
def get_all_admix_edges(ttree):
    """Map every ordered pair of co-existing branches to their shared time window.

    Each non-root node is annotated in place with
    `interval = (node.height, node.up.height)`; the root receives
    `(None, None)`. Two distinct non-root nodes form a candidate edge when
    their intervals overlap by a strictly positive amount.

    Parameters
    ----------
    ttree : object
        Tree wrapper whose `.tree` attribute offers `traverse()`; nodes must
        expose `is_root()`, `height`, `up` and `idx`.

    Returns
    -------
    dict
        `{(source_idx, dest_idx): (window_start, window_end)}`. Both
        orderings of a pair are present.
    """
    ## annotate every node with the time span of the branch above it
    for node in ttree.tree.traverse():
        node.interval = (
            (None, None) if node.is_root() else (node.height, node.up.height)
        )

    ## the root has no branch above it, so it can never take part in an edge
    candidates = [node for node in ttree.tree.traverse() if not node.is_root()]

    overlaps = {}
    for source in candidates:
        src_start, src_stop = source.interval
        for dest in candidates:
            if dest == source:
                continue
            dst_start, dst_stop = dest.interval

            ## shared window = latest start .. earliest stop
            window_start = np.max([src_start, dst_start])
            window_stop = np.min([src_stop, dst_stop])
            if window_stop > window_start:
                overlaps[(source.idx, dest.idx)] = (window_start, window_stop)
    return overlaps

# Create a 5-tip topology

In [35]:
## generate a random 5-tip tree of total height 3; the fixed seed keeps the
## topology the same on every run
tree = toytree.rtree.unittree(ntips=5, treeheight=3, seed=12345)
## draw the tree; the two returned objects are presumably (canvas, axes) -- TODO confirm
c, a = tree.draw(tree_style='c')

# Example simcat model:

In [43]:
# Build one example model with a single admixture edge between nodes 1 and 4.
# NOTE(review): `((1, 4, 0.5, 0.7, 0.00001))` is a plain 5-tuple -- the outer
# parentheses do not create a tuple of tuples. If simcat expects a sequence of
# edges this should be `[(1, 4, 0.5, 0.7, 0.00001)]` -- confirm against simcat.
# The five values are presumably (source, dest, start, end, rate) -- TODO confirm.
observed = simcat.Model(
    tree=tree, 
    admixture_edges=((1, 4, 0.5, 0.7, 0.00001)),
    theta=0.01,
    ntests=1,
    debug=True,
    )

In [44]:
# Run the simulation; with debug=True the model prints the demographic events
# shown below. Presumably this also populates `observed.counts` -- TODO confirm.
observed.run()

demog div: (1500000, 2, 0)
demog div: (750000, 1, 0)
demog div: (1000000, 3, 2)
demog div: (500000, 4, 3)
demog mig: (250000.0, 350000.0) 0.0 (1, 4) 250000.0


In [46]:
# Shape of the simulated count array; the leading axis presumably indexes the
# `ntests` replicates -- TODO confirm.
observed.counts.shape

(1, 5, 16, 16)

Plot the top layer of the simulated site count matrix:

In [52]:
import toyplot
# counts[0][0] selects the first 16x16 matrix of the first replicate; the
# trailing `;` suppresses the textual repr of the plot object.
toyplot.matrix(observed.counts[0][0]);

# simulate training data (run previously)

In [31]:
def make_data(nruns,tree):
    """Simulate `nruns` labelled count matrices with one random admixture edge each.

    For every run the node heights of `tree` are jittered, one pair of
    co-existing branches is picked uniformly at random, its direction is
    flipped with probability 1/2, an admixture window is drawn uniformly
    inside the pair's shared interval, and a `simcat.Model` is simulated with
    a rate drawn from U(0, 0.1).

    The numba `@jit` decorator was removed: the loop only manipulates Python
    objects (tree copies, dicts, simcat models), which numba cannot compile,
    so it added nothing under object-mode fallback and raises a compilation
    error on numba releases where `@jit` defaults to nopython mode.

    Parameters
    ----------
    nruns : int
        Number of simulations.
    tree : toytree tree
        Base topology; it is copied, never modified, by `_node_slider`.

    Returns
    -------
    list
        `[dat, labels]` where `dat` is a float array of shape
        (nruns, 5, 16, 16) and `labels` is an int32 array of shape (nruns, 2)
        holding the two node indices of the simulated edge in the order
        passed to simcat.

    Raises
    ------
    ValueError
        If the jittered tree offers no pair of overlapping branches.
    """
    labels = np.zeros((nruns,2),dtype=np.int32)
    dat = np.zeros((nruns,5,16,16))
    for run in range(nruns):
        currtree = _node_slider(tree)
        edgedict = get_all_admix_edges(currtree)

        ## list() keeps this working on Python 3, where dict views cannot be indexed
        pairs = list(edgedict.keys())
        if not pairs:
            raise ValueError("tree has no overlapping branches to place an admixture edge on")
        edges = pairs[np.random.randint(len(pairs))]

        ## flip the direction of the edge half of the time
        edges1 = edges
        if np.random.randint(2):
            edges1 = (edges[1], edges[0])

        ## draw the admixture window inside the interval shared by both branches
        low, high = edgedict[edges]
        start,end = np.sort(np.random.uniform(low, high, 2))

        ## simulate data WITH migration along the chosen edge
        observed = simcat.Model(
            tree=currtree,
            admixture_edges=((edges1[0], edges1[1], start, end, np.random.uniform(0,.1))),
            theta=0.01,
            ntests=1,
            debug=False,
            nsnps=10000,
            )
        observed.run()
        dat[run,:,:,:] = observed.counts[0]
        labels[run,:] = edges1
        if (run % 5000 == 0):
            print(run)
    return([dat,labels])

In [None]:
# Simulate 40000 training examples; make_data returns [count matrices, edge labels].
dat,labs = make_data(40000,tree)

In [None]:
# Collapse each (first, second) node-index pair into one string label, e.g. (1, 4) -> "14".
# The encoding is only unambiguous while every node index is a single digit.
strlabs = [str(first) + str(second) for first, second in labs]

In [None]:
# Open the output file in "w" mode, which truncates any existing file of that name.
db_save1 = h5py.File("saved_40000.hdf5","w")

In [None]:
# Store the count matrices and their string labels.
db_save1.create_dataset(data=dat,name='counts')
# Fixed-width byte strings: h5py cannot store NumPy unicode ('U') arrays, which
# is what np.array(list_of_str) yields on Python 3. On Python 2 this is the
# same dtype as before.
db_save1.create_dataset(data=np.array(strlabs, dtype='S'),name='source_dest')
# Close the handle so the data is flushed to disk and the file is released.
db_save1.close()

## Read data:

In [12]:
# Open a previously saved simulation file.
# NOTE(review): "r+" opens read/write although the cells below only read from
# it; "r" would rule out accidental modification -- confirm nothing writes later.
# NOTE(review): this file name differs from the "saved_40000.hdf5" written above.
db_save1 = h5py.File("saved_10000snp2.hdf5","r+")


In [13]:
# Load the first 20000 examples into memory.
dat = db_save1['counts'][:20000]
# One flat feature vector per example, as needed by the random forest.
flat = [obj.flatten() for obj in dat]
# String labels in the same order as `dat`.
strlabs = db_save1['source_dest'][:20000]

# Optimize random forest

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Compare forest sizes: train on the first 19500 examples, score on the last 500.
# NOTE(review): the same 500 held-out examples are later reported as test
# accuracy, so the reported score is tuned on the test set; a separate
# validation split would avoid that.
for num_estimators in [50,100,150,200,250,300,350]:
    # random_state is fixed so that the forests differ only in size
    random_forest = RandomForestClassifier(n_estimators=num_estimators,max_depth=None, random_state=1)

    random_forest.fit(flat[:19500], strlabs[:19500])

    y_predict = random_forest.predict(flat[19500:])
    print(accuracy_score(strlabs[19500:], y_predict))

0.71
0.714
0.73
0.738
0.738
0.732
0.734


Accuracy peaks at 0.738 for both 200 and 250 estimators, so the smaller model with 200 estimators will be used.

In [63]:
# Refit the chosen configuration (200 trees) on the 19500 training examples;
# this is the model reused by the later cells.
random_forest = RandomForestClassifier(n_estimators=200,max_depth=None, random_state=1)
random_forest.fit(flat[:19500], strlabs[:19500])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

# Neural Network Training (run previously)

Adapted from Tensorflow website, https://www.tensorflow.org/tutorials/layers

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import tensorflow as tf

# Show INFO-level TensorFlow log messages (training progress, checkpoint restores).
tf.logging.set_verbosity(tf.logging.INFO)


def cnn_model_fn(features, labels, mode):
    """Model function for a two-convolution CNN used with tf.estimator.Estimator.

    Architecture: conv(32, 5x5) -> maxpool(2) -> conv(64, 5x5) -> maxpool(2)
    -> dense(1024) -> dropout(0.4) -> dense(40 logits).

    Args:
        features: dict with key 'x' holding the input tensor; it is reshaped
            to [batch_size, 16, 16, 5].
        labels: integer class ids, used by the sparse softmax cross-entropy
            loss (ignored in PREDICT mode).
        mode: a tf.estimator.ModeKeys value.

    Returns:
        tf.estimator.EstimatorSpec carrying predictions (PREDICT), loss and
        train_op (TRAIN), or loss and an accuracy metric (EVAL).

    The layers are created with default names, so the order in which they are
    built determines the variable names stored in checkpoints.
    """
    # Input Layer
    # Reshape X to a 4-D tensor: [batch_size, 16, 16, 5]
    # NOTE(review): the arrays built in this notebook have shape (n, 5, 16, 16).
    # tf.reshape does not move axes, so this reinterprets the buffer instead of
    # producing channels-last images; tf.transpose(x, [0, 2, 3, 1]) would.
    # Changing it requires retraining and the same change in cnn_model_fn_pred.
    input_layer = tf.reshape(features['x'], [-1, 16, 16, 5])

    # Convolutional Layer #1
    # Computes 32 features using a 5x5 filter with ReLU activation.
    # Padding is added to preserve width and height.
    # Input Tensor Shape: [batch_size, 16, 16, 5]
    # Output Tensor Shape: [batch_size, 16, 16, 32]
    conv1 = tf.layers.conv2d(
      inputs=input_layer,
      filters=32,
      kernel_size=[5, 5],
      padding="same",
      activation=tf.nn.relu)

    # Pooling Layer #1
    # First max pooling layer with a 2x2 filter and stride of 2
    # Input Tensor Shape: [batch_size, 16, 16, 32]
    # Output Tensor Shape: [batch_size, 8, 8, 32]
    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)

    # Convolutional Layer #2
    # Computes 64 features using a 5x5 filter.
    # Padding is added to preserve width and height.
    # Input Tensor Shape: [batch_size, 8, 8, 32]
    # Output Tensor Shape: [batch_size, 8, 8, 64]
    conv2 = tf.layers.conv2d(
      inputs=pool1,
      filters=64,
      kernel_size=[5, 5],
      padding="same",
      activation=tf.nn.relu)

    # Pooling Layer #2
    # Second max pooling layer with a 2x2 filter and stride of 2
    # Input Tensor Shape: [batch_size, 8, 8, 64]
    # Output Tensor Shape: [batch_size, 4, 4, 64]
    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)

    # Flatten tensor into a batch of vectors
    # Input Tensor Shape: [batch_size, 4, 4, 64]
    # Output Tensor Shape: [batch_size, 4 * 4 * 64]
    pool2_flat = tf.reshape(pool2, [-1, 4 * 4 * 64])

    # Dense Layer
    # Densely connected layer with 1024 neurons
    # Input Tensor Shape: [batch_size, 4 * 4 * 64]
    # Output Tensor Shape: [batch_size, 1024]
    dense = tf.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu)

    # Add dropout operation; 0.6 probability that element will be kept.
    # Dropout is only active in TRAIN mode.
    dropout = tf.layers.dropout(
      inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN)

    # Logits layer
    # Input Tensor Shape: [batch_size, 1024]
    # Output Tensor Shape: [batch_size, 40]
    # NOTE(review): 40 output classes is hard-coded -- confirm it matches the
    # number of distinct labels in the training data.
    logits = tf.layers.dense(inputs=dropout, units=40)

    predictions = {
      # Generate predictions (for PREDICT and EVAL mode)
      "classes": tf.argmax(input=logits, axis=1),
      # Add `softmax_tensor` to the graph. It is used for PREDICT and by the
      # `logging_hook`.
      "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
    }
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Calculate Loss (for both TRAIN and EVAL modes)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

    # Configure the Training Op (for TRAIN mode): plain SGD, learning rate 0.001
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
        train_op = optimizer.minimize(
            loss=loss,
            global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    # Add evaluation metrics (for EVAL mode)
    eval_metric_ops = {
      "accuracy": tf.metrics.accuracy(
          labels=labels, predictions=predictions["classes"])}
    return tf.estimator.EstimatorSpec(
      mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)


def main(dataset, labels, n_train=19500, model_dir="./tensorflow/modeltest1", steps=20000):
    """Train the CNN on the first `n_train` examples and evaluate on the rest.

    Args:
        dataset: array of inputs, indexed by example along the first axis.
        labels: integer class ids aligned with `dataset`.
        n_train: number of leading examples used for training; everything
            after them is used for evaluation.
        model_dir: directory where the Estimator reads and writes checkpoints.
            Training resumes from the latest checkpoint found there.
        steps: number of training steps to run in this call.

    Returns:
        The trained tf.estimator.Estimator. Previously nothing was returned,
        so `mod = main(...)` bound None. The evaluation metrics are still
        printed.
    """
    # Split into training and evaluation data
    train_data = dataset[:n_train]
    train_labels = labels[:n_train]
    eval_data = dataset[n_train:]
    eval_labels = labels[n_train:]

    # Create the Estimator
    classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn, model_dir=model_dir)

    # Set up logging for predictions
    # Log the values in the "softmax_tensor" tensor with label "probabilities"
    tensors_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(
        tensors=tensors_to_log, every_n_iter=50)

    # Train the model; num_epochs=None repeats the data until `steps` is reached
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data},
        y=train_labels,
        batch_size=100,
        num_epochs=None,
        shuffle=True)
    classifier.train(
        input_fn=train_input_fn,
        steps=steps,
        hooks=[logging_hook])

    # Evaluate the model on a single ordered pass and print results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data},
        y=eval_labels,
        num_epochs=1,
        shuffle=False)
    eval_results = classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)

    return classifier


In [23]:
from sklearn.preprocessing import LabelEncoder
# Distinct string labels and their count; neither name is used again in the
# visible part of the notebook.
unique_classes=np.unique(strlabs)
num_classes = len(unique_classes)
# Map each string label to an integer class id for the CNN's sparse
# cross-entropy loss; `label_encoder.classes_` maps ids back to strings.
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(strlabs)

### Runs the neural network for 20000 steps:

In [None]:
# Cast the count matrices to float32 and train/evaluate the CNN on the
# integer-encoded labels.
mod = main(np.array(dat).astype(np.float32), integer_encoded)

# Now load the model back up and make predictions:

Redefine the model function so that it only generates predictions:

In [17]:
def cnn_model_fn_pred(features,mode):
    """Prediction-only model function for the CNN.

    Builds the same layer stack as cnn_model_fn (conv 32 -> pool -> conv 64 ->
    pool -> dense 1024 -> dropout -> dense 40), with dropout disabled, and
    always returns a PREDICT-mode EstimatorSpec.

    Args:
        features: dict with key 'x' holding the input tensor; it is reshaped
            to [batch_size, 16, 16, 5].
        mode: accepted as part of the model_fn signature; its value is not
            used.

    Returns:
        tf.estimator.EstimatorSpec with "classes" and "probabilities"
        predictions.

    The layers are created with default names, so the order in which they are
    built must stay the same as in cnn_model_fn for its checkpoints to load.
    """
    # Input Layer
    # Reshape X to a 4-D tensor: [batch_size, 16, 16, 5]
    # NOTE(review): the arrays built in this notebook have shape (n, 5, 16, 16).
    # tf.reshape does not move axes, so this reinterprets the buffer instead of
    # producing channels-last images. It must stay identical to the reshape
    # used in cnn_model_fn for the trained weights to be meaningful.
    input_layer = tf.reshape(features['x'], [-1, 16, 16, 5])

    # Convolutional Layer #1
    # Computes 32 features using a 5x5 filter with ReLU activation.
    # Padding is added to preserve width and height.
    # Input Tensor Shape: [batch_size, 16, 16, 5]
    # Output Tensor Shape: [batch_size, 16, 16, 32]
    conv1 = tf.layers.conv2d(
      inputs=input_layer,
      filters=32,
      kernel_size=[5, 5],
      padding="same",
      activation=tf.nn.relu)

    # Pooling Layer #1
    # First max pooling layer with a 2x2 filter and stride of 2
    # Input Tensor Shape: [batch_size, 16, 16, 32]
    # Output Tensor Shape: [batch_size, 8, 8, 32]
    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)

    # Convolutional Layer #2
    # Computes 64 features using a 5x5 filter.
    # Padding is added to preserve width and height.
    # Input Tensor Shape: [batch_size, 8, 8, 32]
    # Output Tensor Shape: [batch_size, 8, 8, 64]
    conv2 = tf.layers.conv2d(
      inputs=pool1,
      filters=64,
      kernel_size=[5, 5],
      padding="same",
      activation=tf.nn.relu)

    # Pooling Layer #2
    # Second max pooling layer with a 2x2 filter and stride of 2
    # Input Tensor Shape: [batch_size, 8, 8, 64]
    # Output Tensor Shape: [batch_size, 4, 4, 64]
    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)

    # Flatten tensor into a batch of vectors
    # Input Tensor Shape: [batch_size, 4, 4, 64]
    # Output Tensor Shape: [batch_size, 4 * 4 * 64]
    pool2_flat = tf.reshape(pool2, [-1, 4 * 4 * 64])

    # Dense Layer
    # Densely connected layer with 1024 neurons
    # Input Tensor Shape: [batch_size, 4 * 4 * 64]
    # Output Tensor Shape: [batch_size, 1024]
    dense = tf.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu)

    # Dropout layer kept so the graph matches the training graph; with
    # training=False it passes its input through unchanged.
    dropout = tf.layers.dropout(
      inputs=dense, rate=0.4, training=False)

    # Logits layer
    # Input Tensor Shape: [batch_size, 1024]
    # Output Tensor Shape: [batch_size, 40]
    logits = tf.layers.dense(inputs=dropout, units=40)
    predictions = {
      # Predicted integer class id per example
      "classes": tf.argmax(input=logits, axis=1),
      # Per-class probabilities, exposed under the tensor name `softmax_tensor`
      "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
    }
    return tf.estimator.EstimatorSpec(mode=tf.estimator.ModeKeys.PREDICT, predictions=predictions)

In [20]:
# Build an Estimator around the prediction-only model function, pointing at the
# model_dir used for training so the saved checkpoint is restored when predicting.
mnist_classifier = tf.estimator.Estimator(
model_fn=cnn_model_fn_pred, model_dir="./tensorflow/modeltest1")

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_tf_random_seed': 1, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_save_checkpoints_steps': None, '_model_dir': './tensorflow/modeltest1', '_save_summary_steps': 100}


In [21]:
# Feed the held-out examples (index 19500 onward) once, in order, so that the
# predictions line up with strlabs[19500:].
predict_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": dat[19500:].astype(np.float32)},
    num_epochs=1,
    shuffle=False)
# predict() yields one dict per example; list() materialises all of them.
cnn_predictions = list(mnist_classifier.predict(input_fn=predict_input_fn))

INFO:tensorflow:Restoring parameters from ./tensorflow/modeltest1/model.ckpt-80000


In [24]:
# Predicted integer class id for each example ...
int_cnn_preds= [i['classes'] for i in cnn_predictions]
# ... mapped back to the original string labels through the fitted encoder.
cnn_predict= np.array([label_encoder.classes_[i] for i in int_cnn_preds])

### Get cnn score on the 500 test values:

In [28]:
# Same metric as used for the random forest; unlike a hard-coded `/500` it
# stays correct if the size of the held-out set changes.
accuracy_score(strlabs[19500:], cnn_predict)

0.704

# Now test under short introgression lengths

In [81]:
def make_data(nruns,tree,max_length=.03):
    """Simulate `nruns` labelled count matrices with short admixture windows.

    Variant of the training-data generator: the admixture window drawn inside
    the shared interval of the two branches is truncated so that it is never
    longer than `max_length`, and progress is printed every 100 runs.

    The numba `@jit` decorator was removed: the loop only manipulates Python
    objects (tree copies, dicts, simcat models), which numba cannot compile,
    so it added nothing under object-mode fallback and raises a compilation
    error on numba releases where `@jit` defaults to nopython mode.

    Parameters
    ----------
    nruns : int
        Number of simulations.
    tree : toytree tree
        Base topology; it is copied, never modified, by `_node_slider`.
    max_length : float, optional
        Upper bound on `end - start` of the admixture window (default 0.03).

    Returns
    -------
    list
        `[dat, labels]` where `dat` is a float array of shape
        (nruns, 5, 16, 16) and `labels` is an int32 array of shape (nruns, 2)
        holding the two node indices of the simulated edge in the order
        passed to simcat.

    Raises
    ------
    ValueError
        If the jittered tree offers no pair of overlapping branches.
    """
    labels = np.zeros((nruns,2),dtype=np.int32)
    dat = np.zeros((nruns,5,16,16))
    for run in range(nruns):
        currtree = _node_slider(tree)
        edgedict = get_all_admix_edges(currtree)

        ## list() keeps this working on Python 3, where dict views cannot be indexed
        pairs = list(edgedict.keys())
        if not pairs:
            raise ValueError("tree has no overlapping branches to place an admixture edge on")
        edges = pairs[np.random.randint(len(pairs))]

        ## flip the direction of the edge half of the time
        edges1 = edges
        if np.random.randint(2):
            edges1 = (edges[1], edges[0])

        ## draw the admixture window inside the interval shared by both branches
        low, high = edgedict[edges]
        start,end = np.sort(np.random.uniform(low, high, 2))

        ## truncate the window so the introgression period stays short
        if (end-start) > max_length:
            end = start + max_length

        observed = simcat.Model(
            tree=currtree,
            admixture_edges=((edges1[0], edges1[1], start, end, np.random.uniform(0,.1))),
            theta=0.01,
            ntests=1,
            debug=False,
            nsnps=10000,
            )
        observed.run()
        dat[run,:,:,:] = observed.counts[0]
        labels[run,:] = edges1
        if (run % 100 == 0):
            print(run)
    return([dat,labels])

Now create 500 samples with short periods of introgression:

In [82]:
# 500 fresh simulations from the short-window generator defined above.
shortintro_dat, shortintro_labs = make_data(500,tree)

0

100

200

300

400



In [83]:
# String label per example, built from its two node indices, e.g. (1, 4) -> "14".
strlabs_short = [str(first) + str(second) for first, second in shortintro_labs]
# One flat feature vector per example for the random forest.
flat_short = [matrix.flatten() for matrix in shortintro_dat]

The random forest performs well here

In [84]:
# Score the already-fitted 200-tree forest on the short-introgression examples.
y_predict = random_forest.predict(flat_short)
accuracy_score(strlabs_short, y_predict)

0.762


The CNN scores lower than the random forest here (0.684 versus 0.762), slightly below its 0.704 on the original test set

In [85]:
# Feed the short-introgression examples once, in order, so that the
# predictions line up with strlabs_short.
predict_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": shortintro_dat.astype(np.float32)},
    num_epochs=1,
    shuffle=False)
cnn_predictions = list(mnist_classifier.predict(input_fn=predict_input_fn))
# Integer class ids -> original string labels
int_cnn_preds= [i['classes'] for i in cnn_predictions]
cnn_predict= np.array([label_encoder.classes_[i] for i in int_cnn_preds])
# Same metric as used for the random forest; unlike a hard-coded `/500` it
# stays correct if the number of simulated examples changes.
accuracy_score(strlabs_short, cnn_predict)

INFO:tensorflow:Restoring parameters from ./tensorflow/modeltest1/model.ckpt-80000


0.684

# Now run it without regard to direction:

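By definition this can only increase the scores: every prediction that is correct with direction is still correct once direction is ignored, and predictions with the right pair of edges but the wrong direction now count as correct too.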

In [86]:
# Sorting the characters of each label makes "41" and "14" identical, i.e. the
# direction of the edge is discarded. This relies on single-character node indices.
# `sorted_strlabs` is not used again in the visible part of the notebook.
sorted_strlabs = np.array([''.join(sorted(strlab)) for strlab in strlabs])
sorted_strlabs_short = np.array([''.join(sorted(strlab)) for strlab in strlabs_short])

Scoring without regard for direction with random forest:

In [87]:
# Apply the same direction-free encoding to the random forest predictions
# before scoring them against the direction-free true labels.
y_predict_sorted = np.array([''.join(sorted(strlab)) for strlab in y_predict])
accuracy_score(sorted_strlabs_short, y_predict_sorted)

0.856

Scoring without direction for CNN:

In [88]:
# Apply the same direction-free encoding to the CNN predictions.
cnn_predict_sorted = np.array([''.join(sorted(strlab)) for strlab in cnn_predict])
# Same metric as used for the random forest; unlike a hard-coded `/500` it
# stays correct if the number of simulated examples changes.
accuracy_score(sorted_strlabs_short, cnn_predict_sorted)

0.812