In [16]:
import h5py
import simcat
import numpy as np
import tensorflow as tf
from numba import jit
import toytree
from copy import deepcopy
import ipyparallel as ipp

In [17]:
## generate a random tree
tree = toytree.rtree.unittree(ntips=4, treeheight=3, seed=12345)
c, a = tree.draw(tree_style='c',node_labels=tree.get_node_values('name',show_root=True,show_tips=True))

In [28]:
## init a database
db1 = simcat.DataBase(
    name="bounded_05_4", 
    workdir="./databases", 
    tree=tree, 
    nedges=1,
    ntrees=500,
    ntests=1,
    nreps=2,
    edge_function="node_slider",
    constrained_times=2,
    mig_rate_bounds = [.05,.4],
    force=True)

stored 18000 labels to /Volumes/My Passport/sims/databases/bounded_05_4.hdf5


In [29]:
ipyclient = ipp.Client()

In [30]:
ipyclient

<ipyparallel.client.client.Client at 0x182cb14850>

In [31]:
db1.run(ipyclient)

host compute node: [4 cores] on Patricks-MBP.fios-router.home
[                    ]   0% | 0:00:00 | simulating count matrices

submitting jobs


[                    ]   0% | 0:03:04 | simulating count matrices

Done with round: 0 of 5


[                    ]   0% | 0:05:44 | simulating count matrices

Done with round: 1 of 5


[                    ]   0% | 0:08:32 | simulating count matrices

Done with round: 2 of 5


[                    ]   0% | 0:11:46 | simulating count matrices

Done with round: 3 of 5


[                    ]   0% | 0:13:44 | simulating count matrices

Done with round: 4 of 5


In [27]:
2+2

4

In [8]:
dat=h5py.File("databases/starting_sims.hdf5")

In [9]:
dat['counts'].shape

(460000, 15, 16, 16)

In [26]:
mat = dat['counts'][0][0]

In [27]:
abba = mat[1,4]+mat[2,8]+mat[3,12]+mat[4,1]+mat[6,9]+mat[7,13]+mat[8,2]+mat[9,6]+mat[11,14]+mat[12,3]+mat[13,7]+mat[14,11]
baba = mat[1,1]+mat[2,2]+mat[3,3]+mat[4,4]+mat[6,6]+mat[7,7]+mat[8,8]+mat[9,9]+mat[11,11]+mat[12,12]+mat[13,13]+mat[14,14]

In [28]:
float(abba-baba)/(abba+baba)

0.07692308547581485

In [11]:
dat['counts'][0][0]

array([[0.7113402 , 0.11340206, 0.12371134, 0.12371134, 0.11340206,
        0.19587629, 0.        , 0.        , 0.11340206, 0.        ,
        0.0927835 , 0.        , 0.13402061, 0.        , 0.        ,
        0.13402061],
       [0.12371134, 0.        , 0.        , 0.        , 0.        ,
        0.13402061, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.08247422, 0.        , 0.01030928, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.15463917, 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.16494845, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.02061856, 0.        , 0.        ,
        0.11340206],
       [0.03092784, 0.        , 0.        , 0.        , 0.        ,
        0.06185567, 0.        , 

In [35]:
@jit
def dstat(mat):
    '''
    input a 2d square site count matrix, output a d statistic (float)
    '''
    abba = (mat[1,4]+mat[2,8]+mat[3,12]+mat[4,1]+mat[6,9]+mat[7,13]+
            mat[8,2]+mat[9,6]+mat[11,14]+mat[12,3]+mat[13,7]+mat[14,11])
    baba = (mat[1,1]+mat[2,2]+mat[3,3]+mat[4,4]+mat[6,6]+mat[7,7]+
            mat[8,8]+mat[9,9]+mat[11,11]+mat[12,12]+mat[13,13]+mat[14,14])
    return(float(abba-baba)/(abba+baba))

In [75]:
def dstat3d(simcat_arr):
    dstats = np.zeros((len(simcat_arr)))
    counter = 0
    for layer in simcat_arr:
        try:
            dstats[counter] = dstat(layer)
        except:
            # if division by zero error, then return an array of zeros that you can search for later
            dstats = np.zeros((len(simcat_arr)))
            break
        counter += 1
    return(dstats)

In [78]:
all_dstats = np.zeros(dat['counts'].shape[0:2])

In [79]:
for _ in range(len(dat['counts'])):
    all_dstats[_]=dstat3d(dat['counts'][_])

In [81]:
failed = list()
for _ in range(len(all_dstats)):
    if np.alltrue(all_dstats[_] == 0):
        failed.append(_)

In [84]:
np.array(failed)

array([    76,    153,    243, ..., 459851, 459905, 459906])

In [97]:
ind = np.ones((len(all_dstats)), bool)
ind[failed] = False

In [98]:
ind

array([ True,  True,  True, ...,  True,  True,  True])

In [108]:
sources=dat['admix_sources'][:][ind]
targets=dat['admix_targets'][:][ind]
combo=np.hstack([sources,targets])

In [123]:
class_ids=np.array([str(x) for x in combo])

In [131]:
id_dict=dict(enumerate(np.unique(class_ids)))

In [133]:
inv_dict = {v: k for k, v in id_dict.iteritems()}

In [138]:
class_ids_int = np.zeros(class_ids.shape,dtype=np.int32)
counter = 0
for str_id in class_ids:
    class_ids_int[counter] = inv_dict[str_id]
    counter += 1

In [147]:
X = all_dstats[ind]
y = class_ids_int

In [154]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

In [155]:
sc = StandardScaler()  
X_train = sc.fit_transform(X_train)  
X_test = sc.transform(X_test)  

In [159]:
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=100, random_state=0)  
classifier.fit(X_train, y_train)  
y_pred = classifier.predict(X_test)  

In [160]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred))  

[[ 97  22  16 ...   5   5  33]
 [  6 431 147 ...   0   1   7]
 [  3 125 469 ...   2   1   3]
 ...
 [ 13   7   5 ...  14   7  27]
 [ 24   8  11 ...   5  10  24]
 [ 50  26  16 ...  13  15  38]]
             precision    recall  f1-score   support

          0       0.11      0.10      0.10       973
          1       0.31      0.44      0.36       980
          2       0.35      0.49      0.41       952
          3       0.63      0.85      0.72      1024
          4       0.00      0.00      0.00       163
          5       0.14      0.04      0.06       215
          6       0.09      0.08      0.09       980
          7       0.28      0.44      0.34       975
          8       0.32      0.46      0.37       951
          9       0.57      0.75      0.65       981
         10       0.61      0.74      0.67       991
         11       0.09      0.03      0.05       538
         12       0.12      0.02      0.04       323
         13       0.24      0.32      0.27       985
         14 

In [162]:
from sklearn import svm
lin_clf = svm.LinearSVC()
lin_clf.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [164]:
y_pred = lin_clf.predict(X_test)

In [165]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred))  

[[  0  53  33 ...   0   0   0]
 [  0 309 198 ...   0   0   1]
 [  0  63 529 ...   0   0   0]
 ...
 [  0  16  26 ...   0   0   0]
 [  0  35  32 ...   0   0   0]
 [  0  46  41 ...   0   0   2]]
             precision    recall  f1-score   support

          0       0.00      0.00      0.00       973
          1       0.17      0.32      0.22       980
          2       0.22      0.56      0.31       952
          3       0.48      0.91      0.63      1024
          4       0.00      0.00      0.00       163
          5       0.05      0.00      0.01       215
          6       0.00      0.00      0.00       980
          7       0.16      0.36      0.23       975
          8       0.20      0.56      0.30       951
          9       0.44      0.80      0.57       981
         10       0.44      0.81      0.57       991
         11       0.00      0.00      0.00       538
         12       0.00      0.00      0.00       323
         13       0.06      0.05      0.06       985
         14 

  'precision', 'predicted', average, warn_for)
