# Simcat demonstration

### Imports

In [1]:
import simcat
import toytree
import os

2023-02-25 19:08:51.466489: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /cm/shared/apps/slurm/20.11.9/lib64/slurm:/cm/shared/apps/slurm/20.11.9/lib64
2023-02-25 19:08:51.466537: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


### Designate a directory for holding the results

In [2]:
# get current working directory
cwd = os.getcwd()
# assign a name to our simcat run; the same name is passed to the
# database, simulator and training objects further down
name = "simcat_demo"
# path of the directory that will hold the results
db_path = os.path.join(cwd, name)
# create the results directory; exist_ok=True keeps this cell re-runnable
# (a plain os.mkdir raises FileExistsError once the directory exists)
os.makedirs(db_path, exist_ok=True)

In [3]:
# display the full path of the results directory built in the previous cell
db_path

'/moto/eaton/users/pfm2119/projects/simcatfinalizing/simcat_demo'

### Provide our starting species tree

In [4]:
# starting species tree: 5 tips, built with toytree's `imbtree` generator
# and a total tree height of 5e6
t = toytree.rtree.imbtree(
    5,
    treeheight=5e6,
)

### Create the empty database

In [5]:
# Create the (still empty) simcat database for species tree `t`, stored
# under `db_path` and identified by `name`.
db = simcat.Database(
    name,
    db_path,
    t,
    # how many total entries (training + test) to simulate
    nrows=10_000,
    # how many unlinked SNPs per entry
    nsnps=5_000,
    # range over which Ne varies on the branches
    Ne_min=50_000,
    Ne_max=150_000,
    # range over which the magnitude of the admixture event varies
    admix_prop_min=0.3,
    admix_prop_max=0.4,
    # range over which the timing of the admixture event varies
    admix_edge_min=0.3,
    admix_edge_max=0.7,
    # exclude introgression between sister taxa
    exclude_sisters=True,
    # how much internal nodes are allowed to shift around
    node_slide_prop=0.9,
    # admixture edges assumed to exist already (none here)
    existing_admix_edges=[],
)

10000 labels to be stored in: ../simcatfinalizing/simcat_demo/simcat_demo.labels.h5


# Simulating

### (Fill the database)

In [6]:
# initialise the simulator on the database identified by `name` / `db_path`
simulator = simcat.Simulator(name, db_path)

# run as many simulations as we specify (10,000, the same count as the
# database's nrows); auto=True automatically detects available cores.
# The saved run of this cell took 5:13:09.
simulator.run(10_000, auto=True)

Box(children=(HTML(value="<span style='font-size:14px; font-family:monospace'>Establishing parallel connection…

Box(children=(HTML(value="<span style='font-size:14px; font-family:monospace'>Parallelization: <i>t083</i>: 24…

completed 10000 simulations in 5:13:09.


# Training

### Start a new model in the same directory as before

In [7]:
# Set up training on the simulated database, writing model files into the
# same directory.
mod = simcat.BatchTrain(
    # name of the training database
    input_name=name,
    # name used for model-related files
    output_name=name,
    # same directory as the training database
    directory=db_path,
    # proportion of the data used for training (the rest is for testing)
    prop_training=0.95,
    # exclude sister-taxon introgression scenarios
    exclude_sisters=True,
    # exclude events below this magnitude
    exclude_magnitude=0,
    # label events below this magnitude as "zero"
    to_zero_magnitude=0,
)

10000 total simulations.
10000 total simulations compatible with parameters.
Data split into 9500 training and 500 testing simulations.

Onehot dictionary file saved to /moto/eaton/users/pfm2119/projects/simcatfinalizing/simcat_demo/simcat_demo.onehot_dict.csv

Analysis reference file saved to /moto/eaton/users/pfm2119/projects/simcatfinalizing/simcat_demo/simcat_demo.analysis.h5


### Load the model into the notebook

In [8]:
# load the network into the notebook; in the saved run this call reported
# writing a new neural network to simcat_demo.model.h5 in the results directory
mod.load_model()

2023-02-25 14:24:16.509151: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /cm/shared/apps/slurm/20.11.9/lib64/slurm:/cm/shared/apps/slurm/20.11.9/lib64
2023-02-25 14:24:16.509252: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2023-02-25 14:24:16.509296: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (t083): /proc/driver/nvidia/version does not exist
2023-02-25 14:24:16.510060: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


New neural network saved to: /moto/eaton/users/pfm2119/projects/simcatfinalizing/simcat_demo/simcat_demo.model.h5


### Train the model

In [9]:
# train the network with the batch size, epoch count and worker count below
mod.train(
    batch_size=20,
    num_epochs=25,
    workers=8,
)

2023-02-25 14:24:19.419741: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2023-02-25 14:24:19.432194: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2600000000 Hz




# Give the model simulated data and infer the source and destination branches

In [10]:
# software for simulating data
import ipcoal

In [11]:
# use the same species tree: rebuild it from the newick string stored on the
# training object (this rebinds `t`, defined earlier in the notebook)
t = toytree.tree(mod.newick)
# draw the tree with the 'p' tree style; the trailing `;` suppresses the
# return value's repr in the cell output
t.draw(ts='p');

### Define simulation parameters

In [12]:
# Mutation rate handed to ipcoal.Model as `mut`.
mut = 1e-8

# Effective population size handed to ipcoal.Model as `Ne`; it lies inside
# the Ne_min/Ne_max range (50,000-150,000) used for the training database.
Ne = 100_000

# Introgression event to simulate. `source` and `dest` are what we are
# trying to infer with the trained model. `time` (0.5) and `magnitude`
# (0.35) fall inside the admix_edge and admix_prop ranges used for the
# training database.
source, dest = 5, 3
time, magnitude = 0.5, 0.35

# bundle the event in the order (source, dest, time, magnitude)
admix = (source, dest, time, magnitude)

### Simulate

In [13]:
# Assemble the ipcoal Model from the species tree and the parameters defined
# above; the single admixture event is passed as a one-element list.
ipmod = ipcoal.Model(tree=t, Ne=Ne, mut=mut, admixture_edges=[admix])

In [14]:
# run the simulation: 5000 SNPs, the same count as the nsnps=5_000 used
# when the training database was created
ipmod.sim_snps(5000)

### Get the prediction from the model

In [15]:
# load the predictions DataFrame for the simulated alignment; in the saved
# output it has a single row (index 0) and one column per 'source,dest' label
pred = mod.predict_from_alignment(ipmod.seqs)
# display the predictions
pred

1,"0,2","0,3","0,4","1,2","1,3","1,4","2,0","2,1","2,3","2,4",...,"3,5","4,0","4,1","4,2","4,3","4,5","4,6","5,3","5,4","6,4"
0,7.083513000000001e-22,0.999888,1.6404370000000002e-23,3.886871e-07,6.891427e-28,8.560573999999999e-19,2.962834e-21,1.822131e-10,8.201354e-31,1.1681259999999999e-24,...,3.691465e-18,4.627179e-28,6.081071e-17,3.505266e-16,1.8854939999999998e-19,5.835578e-19,1.136955e-18,1.1384e-11,1.092091e-24,1.653196e-15


In [16]:
# show the 'source,dest' label with the highest predicted value in row 0
# NOTE(review): the saved output here is '0,3', while the simulated event
# used source=5, dest=3 -- confirm how node indices in the labels map onto
# the tree before reading this as a correct or incorrect call.
pred.loc[0].idxmax()

'0,3'