# Second stage model training:

The outcome probabilities produced by the first model may be hierarchically related to one another (because of non-independence imposed by the species tree).  
This second model is trained on the first model's predicted probabilities in order to improve inference.

In [1]:
import simcat
import toytree
import toyplot
import toyplot.svg
import numpy as np
import pandas as pd
import h5py
import ipcoal
import pandas as pd
import csv

from keras.models import Sequential,load_model
from keras.layers import Dense
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import normalize

Using TensorFlow backend.


### Load up our trained model from the first stage of training.

In [2]:
# Location of the network saved by the first stage of training.
first_stage_path = "../models/bal_10tip_2mil/firststage_mod.h5"
nn_model = load_model(first_stage_path)

### Load up the original simulated training data again!

In [4]:
# Settings used to re-load the simulated dataset (values unchanged).
# NOTE(review): the mask_* options presumably filter weak and sister-edge
# admixture events -- confirm against the simcat documentation.
analysis_kwargs = dict(
    name="cleaned",
    workdir="../merged/",
    mask_admixture_min=0.04,
    mask_sisters=True,
    scale=1,
)
mod = simcat.Analysis(**analysis_kwargs)

[init] cleaned
[load] (63740, 210, 16, 16)
[filter] (63740, 210, 16, 16)
[vectorize] (63740, 53760)
[train/test] (42705, 53760)/(21035, 53760)


In [5]:
# Re-split the data, replacing the train/test split made when `mod` was built.
# NOTE(review): `prop=0.1` appears to be the held-out test fraction (the printed
# row counts change from 42705/21035 to 57366/6374) -- confirm in simcat.
mod.train_test_split(prop=0.1)

[train/test] (57366, 53760)/(6374, 53760)


### Re-encode the labels for use by the neural network.

In [6]:
# encode labels as ints: class index -> label, and the reverse label -> class index
unique_labs = np.unique(mod.y)
onehot_dict = dict(enumerate(unique_labs))
inv_onehot_dict = {label: index for index, label in enumerate(unique_labs)}

In [7]:
# number of distinct labels (classes) found in the data
# NOTE(review): this count appears to include the "NaN" label that is looked up
# and filtered out later in the notebook, so the number of non-sister admixture
# scenarios would be one fewer -- confirm.
len(onehot_dict)

177

In [8]:
# one-hot encode training labels
y_idxs = [inv_onehot_dict[label] for label in np.array(mod.y_train)]
y = np.zeros((len(y_idxs), len(onehot_dict)))
# place a single 1 in each row, at the column given by that row's class index
y[np.arange(len(y_idxs)), np.asarray(y_idxs, dtype=int)] = 1

In [9]:
# one-hot encode test labels
y_test_idxs = [inv_onehot_dict[label] for label in np.array(mod.y_test)]
y_test = np.zeros((len(y_test_idxs), len(onehot_dict)))
# place a single 1 in each row, at the column given by that row's class index
y_test[np.arange(len(y_test_idxs)), np.asarray(y_test_idxs, dtype=int)] = 1

#### Again, we want to exclude introgression between sister edges.

In [10]:
# for excluding NaN from the analysis -- find the integer class index whose label is "NaN"
label_to_index = {label: index for index, label in onehot_dict.items()}
nanval = label_to_index["NaN"]

## Now, let's make predictions from the simulated training data with the first model stage.  
### These predictions are the training data for our second stage model.

In [11]:
# Boolean row masks: True where the true label is NOT the "NaN" class.
train_keep = np.argmax(y, axis=1) != nanval
test_keep = np.argmax(y_test, axis=1) != nanval

# Run the first-stage network on the kept rows; its outputs are the inputs
# (features) for the second-stage model, paired with the matching one-hot labels.
training_preds = nn_model.predict(mod.X_train[train_keep])
training_y = y[train_keep]
testing_preds = nn_model.predict(mod.X_test[test_keep])
testing_y = y_test[test_keep]

In [12]:
# shape of the first-stage predictions for the filtered test rows
testing_preds.shape

(5649, 177)

In [13]:
# shape of the one-hot test labels; the row count should match testing_preds
testing_y.shape

(5649, 177)

In [14]:
# shape of the first-stage predictions for the filtered training rows
training_preds.shape

(50808, 177)

In [15]:
# shape of the one-hot training labels; the row count should match training_preds
training_y.shape

(50808, 177)

## Define the second stage model

In [16]:
# Neural network architecture: a single 500-unit ReLU hidden layer followed by
# a softmax output with one unit per label column.
n_inputs = training_preds.shape[1]
n_outputs = training_y.shape[1]
model = Sequential([
    Dense(500, input_dim=n_inputs, activation='relu'),
    # a second Dense(500, activation='relu') hidden layer is left disabled here
    Dense(n_outputs, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

## Train the model, reporting results on a separate test dataset along the way.

In [17]:
num_epochs = 300
epoch_accuracies = []  # training accuracy reported by Keras after each epoch
test_accuracies = []   # accuracy on the held-out test predictions after each epoch

# The true test labels never change, so decode the one-hot rows to class
# indices once here rather than on every epoch.
test_labels = np.argmax(testing_y, axis=1)

for epoch in range(num_epochs):
    print("~~~~~~~~~~~~~~ Training epoch "+ str(epoch) + ": ~~~~~~~~~~~~~~")
    # Fit exactly one epoch per iteration so that test accuracy can be
    # measured between epochs.
    history = model.fit(training_preds,
                        training_y,
                        epochs=1,
                        batch_size=512,
                        verbose=False)
    acc = history.history['accuracy']
    print("training accuracy: " + str(round(acc[0],2)))
    epoch_accuracies.append(acc[0])

    # now make predictions on the test data and convert each row of class
    # probabilities to its most probable class index
    y_pred = model.predict(testing_preds)
    pred_labels = np.argmax(y_pred, axis=1)

    # accuracy_score takes (y_true, y_pred); the value is the same either way,
    # but the documented order keeps the call unambiguous
    a = accuracy_score(test_labels, pred_labels)
    print("test accuracy: "+str(round(a,2)))
    test_accuracies.append(a)
    

~~~~~~~~~~~~~~ Training epoch 0: ~~~~~~~~~~~~~~
training accuracy: 0.67
test accuracy: 0.73
~~~~~~~~~~~~~~ Training epoch 1: ~~~~~~~~~~~~~~
training accuracy: 0.77
test accuracy: 0.8
~~~~~~~~~~~~~~ Training epoch 2: ~~~~~~~~~~~~~~
training accuracy: 0.8
test accuracy: 0.81
~~~~~~~~~~~~~~ Training epoch 3: ~~~~~~~~~~~~~~
training accuracy: 0.81
test accuracy: 0.81
~~~~~~~~~~~~~~ Training epoch 4: ~~~~~~~~~~~~~~
training accuracy: 0.81
test accuracy: 0.81
~~~~~~~~~~~~~~ Training epoch 5: ~~~~~~~~~~~~~~
training accuracy: 0.81
test accuracy: 0.82
~~~~~~~~~~~~~~ Training epoch 6: ~~~~~~~~~~~~~~
training accuracy: 0.81
test accuracy: 0.82
~~~~~~~~~~~~~~ Training epoch 7: ~~~~~~~~~~~~~~
training accuracy: 0.81
test accuracy: 0.82
~~~~~~~~~~~~~~ Training epoch 8: ~~~~~~~~~~~~~~
training accuracy: 0.81
test accuracy: 0.82
~~~~~~~~~~~~~~ Training epoch 9: ~~~~~~~~~~~~~~
training accuracy: 0.82
test accuracy: 0.82
~~~~~~~~~~~~~~ Training epoch 10: ~~~~~~~~~~~~~~
training accuracy: 0.82
test accu

KeyboardInterrupt: 

#### You can see that the model levels off at roughly 82–83% accuracy on the test data. (The run shown above was interrupted by hand after about 10 epochs, with test accuracy at 0.82.)

## Save the model

In [19]:
# Write the trained second-stage network to "secondstage_mod.h5"; the path is
# relative, so the file lands in the current working directory.
# NOTE(review): the first-stage model was loaded from ../models/bal_10tip_2mil/ --
# confirm whether this file is meant to be saved alongside it.
model.save("secondstage_mod.h5")