### make a database of very simple simulations

In [1]:
import h5py
import simcat
import numpy as np
import tensorflow as tf
from numba import jit
import toytree
from copy import deepcopy
import ipyparallel as ipp
import toyplot
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn import svm

  from ._conv import register_converters as _register_converters


In [2]:
## generate a random tree
tree = toytree.rtree.unittree(ntips=6, treeheight=3, seed=12345)
c, a = tree.draw(tree_style='c',node_labels=tree.get_node_values('name',show_root=True,show_tips=True))

In [3]:
## init a database
db1 = simcat.DataBase(
    name="simple_sims", 
    workdir="./databases", 
    tree=tree, 
    nedges=1,
    ntrees=100,
    ntests=1,
    nreps=2,
    edge_function=None,
    force=True)

stored 9200 labels to /Volumes/My Passport/sims/databases/simple_sims.hdf5


In [4]:
ipyclient = ipp.Client()

In [5]:
ipyclient

<ipyparallel.client.client.Client at 0x1048b7690>

In [6]:
db1.run(ipyclient)

host compute node: [4 cores] on Patricks-MBP.fios-router.home
[                    ]   0% | 0:00:00 | simulating count matrices

submitting jobs


[                    ]   0% | 0:08:53 | simulating count matrices

Done with round: 0 of 3


[                    ]   0% | 0:20:07 | simulating count matrices

Done with round: 1 of 3


[                    ]   0% | 0:25:39 | simulating count matrices

Done with round: 2 of 3


In [7]:
dat=h5py.File("databases/simple_sims.hdf5")

In [8]:
@jit
def dstat(mat):
    '''
    input a 2d square site count matrix, output a d statistic (float)
    '''
    abba = (mat[1,4]+mat[2,8]+mat[3,12]+mat[4,1]+mat[6,9]+mat[7,13]+
            mat[8,2]+mat[9,6]+mat[11,14]+mat[12,3]+mat[13,7]+mat[14,11])
    baba = (mat[1,1]+mat[2,2]+mat[3,3]+mat[4,4]+mat[6,6]+mat[7,7]+
            mat[8,8]+mat[9,9]+mat[11,11]+mat[12,12]+mat[13,13]+mat[14,14])
    return(float(abba-baba)/(abba+baba))
def dstat3d(simcat_arr):
    dstats = np.zeros((len(simcat_arr)))
    counter = 0
    for layer in simcat_arr:
        try:
            dstats[counter] = dstat(layer)
        except:
            # if division by zero error, then return an array of zeros that you can search for later
            dstats = np.zeros((len(simcat_arr)))
            break
        counter += 1
    return(dstats)

In [9]:
all_dstats = np.zeros(dat['counts'].shape[0:2])
for _ in range(len(dat['counts'])):
    all_dstats[_]=dstat3d(dat['counts'][_])
failed = list()
for _ in range(len(all_dstats)):
    if np.alltrue(all_dstats[_] == 0):
        failed.append(_)
np.array(failed)
ind = np.ones((len(all_dstats)), bool)
ind[failed] = False
sources=dat['admix_sources'][:][ind]
targets=dat['admix_targets'][:][ind]
combo=np.hstack([sources,targets])

In [11]:
class_ids=np.array([str(x) for x in combo])
id_dict=dict(enumerate(np.unique(class_ids)))
inv_dict = {v: k for k, v in id_dict.iteritems()}
class_ids_int = np.zeros(class_ids.shape,dtype=np.int32)
counter = 0
for str_id in class_ids:
    class_ids_int[counter] = inv_dict[str_id]
    counter += 1

In [12]:
X = all_dstats[ind]
y = class_ids_int

In [52]:
toyplot.matrix(X[np.random.choice(len(y),100)])

(<toyplot.canvas.Canvas at 0x1535b18590>,
 <toyplot.coordinates.Table at 0x15353e0c90>)

In [18]:
same_ind = X[y==41]
toyplot.matrix(same_ind, label="A matrix");

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

In [22]:
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=500, random_state=0)  
classifier.fit(X_train, y_train)  
y_pred = classifier.predict(X_test)  

In [23]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred))  

[[ 4  0  0 ...  1  0  4]
 [ 0 21  0 ...  0  0  0]
 [ 0  0 22 ...  0  0  0]
 ...
 [ 0  0  0 ...  8  0  2]
 [ 0  0  0 ...  0 16  2]
 [ 0  0  0 ...  1  1  7]]
             precision    recall  f1-score   support

          0       0.27      0.16      0.20        25
          1       1.00      1.00      1.00        21
          2       1.00      1.00      1.00        22
          3       1.00      1.00      1.00        23
          4       1.00      1.00      1.00        19
          5       0.19      0.26      0.22        19
          6       0.94      1.00      0.97        16
          7       1.00      0.95      0.98        21
          8       1.00      1.00      1.00        23
          9       1.00      1.00      1.00        21
         10       1.00      1.00      1.00        12
         11       0.96      0.96      0.96        25
         12       0.46      0.46      0.46        24
         13       1.00      1.00      1.00        17
         14       1.00      1.00      1.00      

In [36]:
from sklearn import svm
lin_clf = svm.LinearSVC(C=100,max_iter=100000)
lin_clf.fit(X_train, y_train)

LinearSVC(C=100, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=100000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [38]:
y_pred = lin_clf.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred))  

[[ 0  0  0 ... 10  0  0]
 [ 0 20  0 ...  0  0  0]
 [ 0  0 22 ...  0  0  0]
 ...
 [ 0  0  0 ...  8  0  0]
 [ 0  0  0 ...  1  0  6]
 [ 0  0  0 ...  9  0  2]]
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        25
          1       0.95      0.95      0.95        21
          2       1.00      1.00      1.00        22
          3       1.00      1.00      1.00        23
          4       1.00      1.00      1.00        19
          5       0.12      0.32      0.17        19
          6       1.00      1.00      1.00        16
          7       1.00      1.00      1.00        21
          8       1.00      1.00      1.00        23
          9       1.00      1.00      1.00        21
         10       0.50      0.92      0.65        12
         11       0.55      0.88      0.68        25
         12       0.46      0.25      0.32        24
         13       1.00      1.00      1.00        17
         14       1.00      1.00      1.00      

Now try it on the data alone...

In [8]:
def flattendb(counts):
    newshape=reduce(lambda x,y: x*y, counts[0].shape)
    X = np.zeros((counts.shape[0],newshape))
    for i in range(len(counts)):
        X[i] = counts[i].ravel()
    return(X)

In [9]:
flatdat = flattendb(dat['counts'])

In [10]:
sources=dat['admix_sources'][:]
targets=dat['admix_targets'][:]
combo=np.hstack([sources,targets])
class_ids=np.array([str(x) for x in combo])
id_dict=dict(enumerate(np.unique(class_ids)))
inv_dict = {v: k for k, v in id_dict.iteritems()}
class_ids_int = np.zeros(class_ids.shape,dtype=np.int32)
counter = 0
for str_id in class_ids:
    class_ids_int[counter] = inv_dict[str_id]
    counter += 1

In [11]:
X = flatdat
y = class_ids_int
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

In [12]:
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=500, random_state=0)  
classifier.fit(X_train, y_train)  
y_pred = classifier.predict(X_test)  

In [13]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred))  

[[ 9  0  0 ...  0  0  0]
 [ 0 18  0 ...  0  0  0]
 [ 0  0 14 ...  0  0  0]
 ...
 [ 0  0  0 ... 29  0  0]
 [ 0  0  0 ...  0 28  0]
 [ 0  0  0 ...  0  0 10]]
             precision    recall  f1-score   support

          0       0.41      0.69      0.51        13
          1       1.00      1.00      1.00        18
          2       1.00      1.00      1.00        14
          3       1.00      1.00      1.00        17
          4       1.00      1.00      1.00        26
          5       0.69      0.41      0.51        22
          6       1.00      1.00      1.00        29
          7       1.00      1.00      1.00        16
          8       1.00      1.00      1.00        15
          9       1.00      1.00      1.00        22
         10       1.00      1.00      1.00        19
         11       1.00      1.00      1.00        24
         12       0.45      0.36      0.40        14
         13       1.00      1.00      1.00        22
         14       1.00      1.00      1.00      

In [14]:
toyplot.matrix(confusion_matrix(y_test,y_pred))

(<toyplot.canvas.Canvas at 0x1a31dd3d90>,
 <toyplot.coordinates.Table at 0x1a283a0250>)

In [53]:
np.random.uniform(.1,.5,20)

array([0.19530895, 0.20854184, 0.38809278, 0.2576944 , 0.41588968,
       0.34458987, 0.4142651 , 0.34473771, 0.2943015 , 0.43159406,
       0.36998616, 0.30115784, 0.20834401, 0.12289911, 0.30299163,
       0.42930299, 0.16571187, 0.43851701, 0.34616644, 0.12368682])

In [18]:
from sklearn import svm
lin_clf = svm.LinearSVC(C=.1,max_iter=100000)
lin_clf.fit(X_train, y_train)

LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=100000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [19]:
y_pred = lin_clf.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred))  

[[ 8  0  0 ...  0  0  0]
 [ 0 18  0 ...  0  0  0]
 [ 0  0 14 ...  0  0  0]
 ...
 [ 0  0  0 ... 28  1  0]
 [ 0  0  0 ...  0 27  1]
 [ 0  0  0 ...  3  1  5]]
             precision    recall  f1-score   support

          0       0.38      0.62      0.47        13
          1       1.00      1.00      1.00        18
          2       1.00      1.00      1.00        14
          3       1.00      1.00      1.00        17
          4       1.00      1.00      1.00        26
          5       0.64      0.41      0.50        22
          6       1.00      1.00      1.00        29
          7       1.00      1.00      1.00        16
          8       1.00      1.00      1.00        15
          9       1.00      1.00      1.00        22
         10       1.00      1.00      1.00        19
         11       1.00      1.00      1.00        24
         12       0.44      0.50      0.47        14
         13       1.00      1.00      1.00        22
         14       1.00      1.00      1.00      

In [20]:
toyplot.matrix(confusion_matrix(y_test,y_pred))

(<toyplot.canvas.Canvas at 0x1a31dd3b10>,
 <toyplot.coordinates.Table at 0x1a43a97f50>)

In [21]:
inv_dict

{'[0 1]': 0,
 '[0 2]': 1,
 '[0 3]': 2,
 '[0 4]': 3,
 '[0 5]': 4,
 '[1 0]': 5,
 '[1 2]': 6,
 '[1 3]': 7,
 '[1 4]': 8,
 '[1 5]': 9,
 '[2 0]': 10,
 '[2 1]': 11,
 '[2 3]': 12,
 '[2 4]': 13,
 '[2 5]': 14,
 '[3 0]': 15,
 '[3 1]': 16,
 '[3 2]': 17,
 '[3 4]': 18,
 '[3 5]': 19,
 '[4 0]': 20,
 '[4 1]': 21,
 '[4 2]': 22,
 '[4 3]': 23,
 '[4 5]': 24,
 '[4 6]': 25,
 '[4 7]': 26,
 '[5 0]': 27,
 '[5 1]': 28,
 '[5 2]': 29,
 '[5 3]': 30,
 '[5 4]': 31,
 '[5 6]': 32,
 '[5 7]': 33,
 '[6 4]': 34,
 '[6 5]': 35,
 '[6 7]': 36,
 '[6 9]': 37,
 '[7 4]': 38,
 '[7 5]': 39,
 '[7 6]': 40,
 '[7 9]': 41,
 '[8 9]': 42,
 '[9 6]': 43,
 '[9 7]': 44,
 '[9 8]': 45}

In [23]:
import matplotlib.pyplot as plt

from scipy.ndimage import convolve
from sklearn import linear_model, datasets, metrics
from sklearn.model_selection import train_test_split
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline
from sklearn.base import clone

In [45]:
# Models we will use
logistic = linear_model.LogisticRegression(solver='lbfgs', max_iter=10000,
                                           multi_class='multinomial')
rbm = BernoulliRBM(random_state=0, verbose=True)

rbm_features_classifier = Pipeline(
    steps=[('rbm', rbm), ('logistic', logistic)])

In [46]:
# Hyper-parameters. These were set by cross-validation,
# using a GridSearchCV. Here we are not performing cross-validation to
# save time.
rbm.learning_rate = 0.05
rbm.n_iter = 500
rbm.batch_size = 100

# More components tend to give better prediction performance, but larger
# fitting time
rbm.n_components = 1000
logistic.C = 6000

In [47]:
# Training RBM-Logistic Pipeline
rbm_features_classifier.fit(X_train, y_train)

[BernoulliRBM] Iteration 1, pseudo-likelihood = -486.80, time = 22.55s
[BernoulliRBM] Iteration 2, pseudo-likelihood = -285.12, time = 26.94s
[BernoulliRBM] Iteration 3, pseudo-likelihood = -264.02, time = 31.03s
[BernoulliRBM] Iteration 4, pseudo-likelihood = -294.98, time = 36.77s
[BernoulliRBM] Iteration 5, pseudo-likelihood = -258.70, time = 32.32s
[BernoulliRBM] Iteration 6, pseudo-likelihood = -272.56, time = 29.06s
[BernoulliRBM] Iteration 7, pseudo-likelihood = -224.69, time = 27.77s
[BernoulliRBM] Iteration 8, pseudo-likelihood = -235.28, time = 31.47s
[BernoulliRBM] Iteration 9, pseudo-likelihood = -255.79, time = 33.51s
[BernoulliRBM] Iteration 10, pseudo-likelihood = -227.85, time = 30.66s
[BernoulliRBM] Iteration 11, pseudo-likelihood = -198.20, time = 31.39s
[BernoulliRBM] Iteration 12, pseudo-likelihood = -234.74, time = 34.48s
[BernoulliRBM] Iteration 13, pseudo-likelihood = -214.27, time = 30.80s
[BernoulliRBM] Iteration 14, pseudo-likelihood = -251.35, time = 29.32s
[

Pipeline(memory=None,
     steps=[('rbm', BernoulliRBM(batch_size=100, learning_rate=0.05, n_components=1000,
       n_iter=500, random_state=0, verbose=True)), ('logistic', LogisticRegression(C=6000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=10000, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False))])

In [48]:
# Training the Logistic regression classifier directly on the pixel
raw_pixel_classifier = clone(logistic)
raw_pixel_classifier.C = 100.
raw_pixel_classifier.fit(X_train, y_train)

LogisticRegression(C=100.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=10000, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [49]:
# Evaluation

y_pred = rbm_features_classifier.predict(X_test)
print("Logistic regression using RBM features:\n%s\n" % (
    metrics.classification_report(y_test, y_pred)))

y_pred = raw_pixel_classifier.predict(X_test)
print("Logistic regression using raw pixel features:\n%s\n" % (
    metrics.classification_report(y_test, y_pred)))

Logistic regression using RBM features:
             precision    recall  f1-score   support

          0       0.41      0.69      0.51        13
          1       1.00      1.00      1.00        18
          2       1.00      1.00      1.00        14
          3       1.00      1.00      1.00        17
          4       1.00      1.00      1.00        26
          5       0.69      0.41      0.51        22
          6       1.00      1.00      1.00        29
          7       1.00      1.00      1.00        16
          8       1.00      1.00      1.00        15
          9       1.00      1.00      1.00        22
         10       0.95      1.00      0.97        19
         11       1.00      0.96      0.98        24
         12       0.38      0.36      0.37        14
         13       1.00      1.00      1.00        22
         14       1.00      1.00      1.00        27
         15       1.00      1.00      1.00        17
         16       1.00      1.00      1.00        18
     

In [50]:
from sklearn.ensemble import GradientBoostingClassifier

In [51]:
gbc=GradientBoostingClassifier()

In [52]:
gbc.fit(X_train, y_train)
y_pred = gbc.predict(X_test)
print("Logistic regression using raw pixel features:\n%s\n" % (
    metrics.classification_report(y_test, y_pred)))

KeyboardInterrupt: 