# Using simcat for network inference
## Three-step process: 
1) Training database simulation  
2) Model training  
3) Model application  

## 1) Training database simulation

### Imports

In [None]:
import ipcoal
import toytree
import simcat

### Read in your species tree:

In [None]:
# The species tree is assumed to be available already as a newick string.
# The literal is split across adjacent string pieces (one clade opening per
# line) purely for readability; the pieces concatenate to a single string.
newick = (
    '(beebalm:8.57143e+06,'
    '(whale_shark:7.14286e+06,'
    '(coelacanth:5.71429e+06,'
    '(spotted_salamander:4.28571e+06,'
    '(hamster:2.85714e+06,'
    '(dragon:1.42857e+06,kinglet:1.42857e+06)'
    '0:1.42857e+06)0:1.42857e+06)0:1.42857e+06)0:1.42857e+06)0:1.42857e+06);'
)
newick

In [None]:
# Convert the newick string to toytree format. The resulting `tree` object is
# reused later when building the training database and when simulating data.
tree = toytree.tree(newick)

# Look at it to check the topology was read as intended. The trailing
# semicolon keeps the notebook from echoing the call's return value.
tree.draw();

In [None]:
# Each edge is named by its tip-ward node number.
# Draw the tree with node index labels shown, and relabel the seven tips
# with the strings '0' through '6'.
tree.draw(
    node_labels='idx',
    node_sizes=15,
    tip_labels=[str(i) for i in range(7)],
);

### Use simcat to automatically construct an empty database

For the purposes of the tutorial we will make the training database small -- just 500 simulations with 5000 SNPs each.

In [None]:
# Construct an empty training database called "tutorial" in 'tutorial_data/',
# based on the species tree loaded above.
db = simcat.Database(
    "tutorial",
    'tutorial_data/',
    tree,
    # kept small for the tutorial: 500 simulations with 5000 SNPs each
    nrows=500,
    nsnps=5000,
    # how much should Ne vary on the branches?
    Ne_min=10000,
    Ne_max=50000,
    # how much should the magnitude of the admixture event vary?
    admix_prop_min=0.3,
    admix_prop_max=0.5,
    # how much should the timing of the admixture event vary?
    # (both bounds are 0.5 here)
    admix_edge_min=0.5,
    admix_edge_max=0.5,
    # do we want to exclude introgression between sister taxa?
    exclude_sisters=True,
    # how much do we want internal nodes to shift around?
    node_slide_prop=0.1,
    # do we want to assume any existing edges? (none here)
    existing_admix_edges=[],
)

### Fill the empty database with simulations

In [None]:
# Point the simulator at the "tutorial" database in tutorial_data/ -- the same
# name and directory that were passed to simcat.Database.
simulator = simcat.Simulator("tutorial","tutorial_data/")  # inits the simulator
# 500 matches the nrows value the database was created with.
simulator.run(500,auto=True) # runs as many simulations as we specify, automatically detects available cores

## 2) Model Training

### Imports

In [None]:
# training automation
from simcat import BatchTrain
# for defining the model
from keras.models import Sequential, load_model
from keras.layers import Dense

#### Define the parameters for training:

* Where is the data?
* What do we want to name the outputs?
* What proportion do we want to split into training vs. testing?
* Do we want to exclude scenarios with introgression between sister taxa?
* Do we want to exclude scenarios where introgression is really low?
* Do we want to make a "zero" category that includes all remaining simulations with magnitude under some number?

In [None]:
# Set up the training run on top of the simulated database.
tutorial_model = BatchTrain(
    # use the name of the training database
    input_name='tutorial',
    # this is the name for model-related files
    output_name='tutorial_model',
    # point to the same directory as the training database
    directory='tutorial_data/',
    # how much of the data should be used for training (vs testing)?
    prop_training=0.9,
    # do we want to exclude any sister-taxon introgression scenarios?
    exclude_sisters=True,
    # do we want to exclude events below a certain magnitude?
    exclude_magnitude=0.1,
    # do we want to label events below a certain magnitude as "zero"?
    to_zero_magnitude=0,
)

#### An "analysis.h5" file has been saved as output. It contains indices for simulations in the training vs testing dataset, as well as some metadata about the training. 
#### A "onehot_dict.csv" file has also been saved, to convert between integer codes and the literal string labels.

### Define a neural network

In [None]:
# Neural network architecture defined with Keras tools: a single hidden
# Dense layer of 100 ReLU units, followed by a softmax output layer sized
# by tutorial_model.num_classes.
model = Sequential([
    Dense(100, input_dim=tutorial_model.input_shape, activation='relu'),
    Dense(tutorial_model.num_classes, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

### Initialize the network model for the BatchTrain object -- which will also save the model as a file

In [None]:
# Hand the compiled Keras model to the BatchTrain object; according to this
# tutorial, doing so also saves the model as a file.
tutorial_model.init_model(model)

### Now designate the batch size and the number of epochs, and train!

In [None]:
# Train with a batch size of 10 for 5 epochs (small values for the tutorial).
tutorial_model.train(batch_size=10, num_epochs=5)

#### The model is automatically saved to disk after each epoch.

## 3) Model application

#### We would normally have empirical data -- but here we will simulate some sequence data with introgression from branch 0 to branch 3.

In [None]:
# Simulation model on the same tree with one admixture edge from branch 0
# into branch 3. The last two tuple entries (0.5, 0.4) are presumably the
# edge timing and the admixture proportion -- confirm against the ipcoal docs.
dat_mod = ipcoal.Model(tree, admixture_edges=[(0, 3, 0.5, 0.4)])

In [None]:
# simulate an alignment of 5000 SNPs
# (the same SNP count as the nsnps=5000 used for the training database)
dat_mod.sim_snps(5000)

#### Here is our SNP alignment -- an array of (ntaxa x nsnps)

In [None]:
# Display the simulated SNP alignment; this same attribute is what gets
# passed to the trained model in the final step.
dat_mod.seqs

In [None]:
# the rows of the SNP alignment correspond to the alphanumeric ordering of the tree's tip names
# (displayed here so each row of dat_mod.seqs can be matched to a taxon)
dat_mod.alpha_ordered_names

#### Now we can load the model:

In [None]:
# Specify the BatchTrain object once again, with the same names and
# directory used for training, but indicate that it already exists.
tutorial_model = BatchTrain(
    input_name='tutorial',
    output_name='tutorial_model',
    directory='tutorial_data/',
    # specifies that the saved model already exists
    exists=True,
)

# load the keras model into the object
tutorial_model.load_model()

In [None]:
# pass our sequence data to the model to get a prediction
# (dat_mod.seqs is the SNP alignment simulated with ipcoal in this section)
tutorial_model.pass_alignment_to_model(dat_mod.seqs)