# Model identification tests
### Can DISMaL correctly identify the model under which the simulated data were generated?

In [1]:
! pip install ..
from dismal.simulate import MsprimeSimulation
from dismal.demography import DemographicModel
from dismal.metrics import likelihood_ratio_test
import pandas as pd
import random
from scipy.stats import poisson

Processing /Users/s2341012/Dropbox/DISMaL_chapter/DISMaL
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: dismal
  Building wheel for dismal (setup.py) ... [?25ldone
[?25h  Created wheel for dismal: filename=dismal-0.1-py3-none-any.whl size=21643 sha256=c00dede3c636f034d1fe6e69b5895d717d9e1e2f62117d07520800b11a48db3c
  Stored in directory: /private/var/folders/qk/10crn8pd5_d038f0tw_7z2800000gp/T/pip-ephem-wheel-cache-mn8v8fhi/wheels/10/c9/59/9e87bfb89308645a07e4be9051b0645d49425b51da0dd9ddc7
Successfully built dismal
Installing collected packages: dismal
  Attempting uninstall: dismal
    Found existing installation: dismal 0.1
    Uninstalling dismal-0.1:
      Successfully uninstalled dismal-0.1
Successfully installed dismal-0.1


In [2]:
def four_model_lrt(iso_model, iim_model, sc_model, gim_model, multiple_test_corr="Bonferroni"):
    """Choose one of four fitted models using five LRTs and one AIC comparison.

    Comparisons performed:
        1. IIM better than ISO?  (LRT)
        2. SC better than ISO?   (LRT)
        3. GIM better than ISO?  (LRT)
        4. GIM better than IIM?  (LRT)
        5. GIM better than SC?   (LRT)
        6. SC lower AIC than IIM?

    Decision rule:
        * "ISO" if none of IIM, SC or GIM is significantly better than ISO.
        * Otherwise IIM (resp. SC) is "adequate" if it is significantly
          better than ISO and GIM is not significantly better than it.
          If exactly one of them is adequate it is returned; if both are,
          the one with the lower AIC is returned.
        * "GIM" in every remaining case.

    Args:
        iso_model, iim_model, sc_model, gim_model: fitted models. Each is
            passed to ``likelihood_ratio_test``; ``iim_model`` and
            ``sc_model`` must also expose an ``aic`` attribute.
        multiple_test_corr: multiple-testing correction applied to the LRT
            significance level. Only "Bonferroni" is supported.

    Returns:
        One of "ISO", "IIM", "SC" or "GIM".

    Raises:
        NotImplementedError: if ``multiple_test_corr`` is not "Bonferroni".
    """
    if multiple_test_corr != "Bonferroni":
        raise NotImplementedError("Only Bonferroni correction implemented")

    # Only the five LRTs are significance tests; the AIC comparison is not.
    n_lrts = 5
    alpha = 0.05 / n_lrts

    def alt_is_better(null_model, alt_model):
        # True when the LRT rejects the simpler (null) model.
        return bool(likelihood_ratio_test(null_model, alt_model, alpha=alpha)["Reject H0"])

    # Tests 1-3: each richer model against ISO.
    iim_beats_iso = alt_is_better(iso_model, iim_model)
    sc_beats_iso = alt_is_better(iso_model, sc_model)
    gim_beats_iso = alt_is_better(iso_model, gim_model)

    # Tests 4-5: GIM against each intermediate model.
    gim_beats_iim = alt_is_better(iim_model, gim_model)
    gim_beats_sc = alt_is_better(sc_model, gim_model)

    # Test 6: IIM and SC are compared by AIC rather than by an LRT.
    sc_lower_aic = iim_model.aic > sc_model.aic

    # any(), not the truthiness of a slice: a non-empty list is always truthy.
    if not any((iim_beats_iso, sc_beats_iso, gim_beats_iso)):
        return "ISO"

    iim_adequate = iim_beats_iso and not gim_beats_iim
    sc_adequate = sc_beats_iso and not gim_beats_sc

    if iim_adequate and sc_adequate:
        return "SC" if sc_lower_aic else "IIM"
    if iim_adequate:
        return "IIM"
    if sc_adequate:
        return "SC"
    return "GIM"

def test_model_identification(S, true_model):
    """Fit the ISO, IIM, SC and GIM models to ``S`` and report the selected one.

    Args:
        S: data passed unchanged to ``DemographicModel`` as its first argument.
        true_model: label of the model the data were generated under; it is
            only echoed back in the result.

    Returns:
        dict with keys "True model" (the ``true_model`` argument) and
        "Inferred model" (the label returned by ``four_model_lrt``).
    """
    model_names = ["ISO", "IIM", "SC", "GIM"]

    # Fit in this fixed order: four_model_lrt takes its models positionally
    # as (iso, iim, sc, gim).
    fitted_models = [
        DemographicModel(S, model=name).infer_parameters(verbose=False)
        for name in model_names
    ]

    inferred_model_type = four_model_lrt(*fitted_models)

    return {"True model": true_model, "Inferred model": inferred_model_type}

## 1. Data generated under isolation model

#### Scenarios
* A: equal population sizes, t1/v corresponding to initial values
* B: random variation in population sizes, t1/v corresponding to initial values
* C: equal population sizes, values of t1/v far from the initial values

#### A: equal population sizes

In [4]:
# Scenario A: five simulated datasets with all thetas equal to 2 and every
# migration rate set to zero.
iso_a = [
    simulate_msprime(
        theta0=2, theta1=2, theta2=2, theta1_prime=2, theta2_prime=2,
        t1=5, v=5,
        m1_star=0, m2_star=0, m1_prime_star=0, m2_prime_star=0,
        Ne=1e5, block_len=200, num_replicates=2000,
    )
    for _ in range(5)
]

# Run model selection on each dataset; the generating model is ISO.
for sim in iso_a:
    print(test_model_identification(sim, "ISO"))

{'True model': 'ISO', 'Inferred model': 'ISO'}


  np.exp(beta*t1) * (poisson.cdf(s, (t1*(beta+rel_mu))) - poisson.cdf(s, (t0*(beta+rel_mu)))))
  betas.append((beta/(beta+rel_mu)) * ((rel_mu/(beta+rel_mu))**s) *
  np.exp(gamma*t0) * poisson.cdf(s, (t0*(gamma+rel_mu))))
  gammas.append((gamma/(gamma+rel_mu)) * ((rel_mu/(gamma+rel_mu))**s) *


{'True model': 'ISO', 'Inferred model': 'ISO'}


  + pij1[i, 0:3] @ cc[0:3, 0:3] @ beta_matrix(beta=beta, s_vals=s_vals, t1=t1, t0=t0) +
  negll = np.sum(ll_matrix * S)


{'True model': 'ISO', 'Inferred model': 'ISO'}
{'True model': 'ISO', 'Inferred model': 'ISO'}
{'True model': 'ISO', 'Inferred model': 'ISO'}


#### B: random variation in population sizes

In [5]:
# Scenario B: five simulated datasets; every theta is drawn afresh from
# random.uniform(0.1, 10) for each dataset and all migration rates are zero.
# NOTE(review): the first argument is passed positionally (presumably theta0;
# confirm against simulate_msprime's signature).
# NOTE(review): Ne=1e-5 here, whereas scenario A uses Ne=1e5. Confirm this
# value is intended.
iso_b = [
    simulate_msprime(
        random.uniform(0.1, 10),
        theta1=random.uniform(0.1, 10),
        theta2=random.uniform(0.1, 10),
        theta1_prime=random.uniform(0.1, 10),
        theta2_prime=random.uniform(0.1, 10),
        t1=5, v=5,
        m1_star=0, m2_star=0, m1_prime_star=0, m2_prime_star=0,
        Ne=1e-5, block_len=200, num_replicates=2000,
    )
    for _ in range(5)
]

# Run model selection on each dataset; the generating model is ISO.
for sim in iso_b:
    print(test_model_identification(sim, "ISO"))

#### C: values for t1 and v far from starting values

In [9]:
# Scenario C: the fitting procedure's default values for t1 and v are 5, so
# the data are simulated with t1=0.1 and v=10 to sit far from those defaults.
# NOTE(review): Ne=1e-6 here, whereas scenario A uses Ne=1e5. Confirm this
# value is intended.
iso_c = [
    simulate_msprime(
        theta0=2, theta1=2, theta2=2, theta1_prime=2, theta2_prime=2,
        t1=0.1, v=10,
        m1_star=0, m2_star=0, m1_prime_star=0, m2_prime_star=0,
        Ne=1e-6, block_len=200, num_replicates=2000,
    )
    for _ in range(5)
]

# Run model selection on each dataset; the generating model is ISO.
for sim in iso_c:
    print(test_model_identification(sim, "ISO"))

{'True model': 'ISO', 'Inferred model': 'ISO'}
{'True model': 'ISO', 'Inferred model': 'ISO'}
{'True model': 'ISO', 'Inferred model': 'ISO'}
{'True model': 'ISO', 'Inferred model': 'ISO'}


  np.exp(beta*t1) * (poisson.cdf(s, (t1*(beta+rel_mu))) - poisson.cdf(s, (t0*(beta+rel_mu)))))
  betas.append((beta/(beta+rel_mu)) * ((rel_mu/(beta+rel_mu))**s) *


{'True model': 'ISO', 'Inferred model': 'ISO'}


## 2. Data generated under IIM model
#### (Random variation in migration rates, constant population sizes)

In [3]:
# Five simulated datasets with all thetas equal to 1. m1_star and m2_star are
# drawn afresh from random.uniform(0.5, 10) for each dataset, while
# m1_prime_star and m2_prime_star are fixed at zero.
# NOTE(review): Ne=1e-6 here, whereas the isolation scenario A uses Ne=1e5.
# Confirm this value is intended.
iim = [
    simulate_msprime(
        theta0=1, theta1=1, theta2=1, theta1_prime=1, theta2_prime=1,
        t1=5, v=5,
        m1_star=random.uniform(0.5, 10),
        m2_star=random.uniform(0.5, 10),
        m1_prime_star=0, m2_prime_star=0,
        Ne=1e-6, block_len=200, num_replicates=2000,
    )
    for _ in range(5)
]

# Run model selection on each dataset; the generating model is IIM.
for sim in iim:
    print(test_model_identification(sim, "IIM"))

  np.exp(gamma*t0) * poisson.cdf(s, (t0*(gamma+rel_mu))))
  gammas.append((gamma/(gamma+rel_mu)) * ((rel_mu/(gamma+rel_mu))**s) *


{'True model': 'IIM', 'Inferred model': 'ISO'}


  np.exp(beta*t1) * (poisson.cdf(s, (t1*(beta+rel_mu))) - poisson.cdf(s, (t0*(beta+rel_mu)))))
  betas.append((beta/(beta+rel_mu)) * ((rel_mu/(beta+rel_mu))**s) *
  + pij1[i, 0:3] @ cc[0:3, 0:3] @ beta_matrix(beta=beta, s_vals=s_vals, t1=t1, t0=t0) +
  negll = np.sum(ll_matrix * S)


KeyboardInterrupt: 

In [10]:
# Fit the IIM model and then the ISO model to the first IIM-simulated dataset
# so that their estimates can be inspected side by side.
iim_mod, iso_mod = (
    DemographicModel(iim[0], model_name).infer_parameters(verbose=False)
    for model_name in ("IIM", "ISO")
)

In [11]:
# Display the result of fitting the IIM model to iim[0].
iim_mod

{'theta0': 1.033558414566036, 'theta1': 1.0361544758410404, 'theta2': 1.5723587714406568, 'theta1_prime': 0.9849842926354715, 'theta2_prime': 1.0200873794211849, 't1': 5.582202231859374, 'v': 5.0730788388636086, 'm1_star': 0.047424179849636136, 'm2_star': 0.13065236166727914, 'm1_prime_star': 0.0, 'm2_prime_star': 0.0, '-lnL': 10857.700711008725, 'aic': 21733.40142201745}

In [12]:
# Display the result of fitting the ISO model to iim[0].
iso_mod

{'theta0': 1.3935010677055433, 'theta1': 1.277929947708059, 'theta2': 0.07147083742840482, 'theta1_prime': 1.2488476618493227, 'theta2_prime': 1.3068798532260029, 't1': 4.952638209220554, 'v': 4.98841067213646, 'm1_star': 0.0, 'm2_star': 0.0, 'm1_prime_star': 0.0, 'm2_prime_star': 0.0, '-lnL': 10843.403548701262, 'aic': 21700.807097402525}

## 4. Data generated under GIM model

#### 4A: using default parameters of CWH

In [18]:
# Single simulation with all thetas equal to 5 and t1 = v = 5.
# NOTE(review): this section is headed "GIM", yet all four migration rates
# are 0 here. Confirm the intended migration parameters.
gim4a = MsprimeSimulation(
    theta0=5, theta1=5, theta2=5, theta1_prime=5, theta2_prime=5,
    t1=5, v=5,
    m1_star=0, m2_star=0, m1_prime_star=0, m2_prime_star=0,
    Ne=1e5, block_len=200, num_replicates=2000,
)

In [25]:
# Fit the ISO model to the gim4a simulation's S attribute and display the result.
DemographicModel(gim4a.S, "ISO").infer_parameters(verbose=False)

Estimated pi for population 1: 5.0655
Estimated pi for population 2: 4.994


{'theta0': 22.62582893488037, 'theta1': 4.7283997290598645, 'theta2': 13.905942005908699, 'theta1_prime': 23.95224131196548, 'theta2_prime': 23.61528466407819, 't1': 44.306306900385586, 'v': 5.965067054244172, 'm1_star': 0.0, 'm2_star': 0.0, 'm1_prime_star': 0.0, 'm2_prime_star': 0.0, '-lnL': 17963.59030068641, 'aic': 35941.18060137282}

In [12]:
# Five simulated datasets with all thetas equal to 1 and all four migration
# rates equal to 2.
gim2 = [
    simulate_msprime(
        theta0=1, theta1=1, theta2=1, theta1_prime=1, theta2_prime=1,
        t1=5, v=5,
        m1_star=2, m2_star=2, m1_prime_star=2, m2_prime_star=2,
        Ne=1000, block_len=200, num_replicates=2000,
    )
    for _ in range(5)
]

# Display the first simulated dataset.
gim2[0]

array([[729, 382, 314, 181, 120,  89,  59,  39,  30,  26,  10,   9,   5,
          1,   1,   1,   2,   2,   0],
       [716, 448, 267, 181, 133,  94,  51,  36,  28,  10,  15,  11,   3,
          3,   1,   1,   1,   0,   0],
       [582, 445, 288, 216, 143, 117,  74,  41,  34,  17,  12,   7,   4,
          9,   2,   3,   2,   4,   0]])

In [13]:
# Run model selection on each gim2 dataset; the generating model is GIM.
for sim in gim2:
    print(test_model_identification(sim, "GIM"))

  np.exp(beta*t1) * (poisson.cdf(s, (t1*(beta+rel_mu))) - poisson.cdf(s, (t0*(beta+rel_mu)))))
  betas.append((beta/(beta+rel_mu)) * ((rel_mu/(beta+rel_mu))**s) *
  negll = np.sum(ll_matrix * S)


{'True model': 'GIM', 'Inferred model': 'ISO'}
{'True model': 'GIM', 'Inferred model': 'ISO'}
{'True model': 'GIM', 'Inferred model': 'ISO'}
{'True model': 'GIM', 'Inferred model': 'ISO'}
{'True model': 'GIM', 'Inferred model': 'ISO'}


In [14]:
# Fit the ISO model to the first gim2 dataset and display the result.
DemographicModel(gim2[0], "ISO").infer_parameters(verbose=False)

{'theta0': 8.464286432760588, 'theta1': 4.070538141491049, 'theta2': 4.001012527593325, 'theta1_prime': 0.10262002242750565, 'theta2_prime': 0.39280556168823066, 't1': 0.00020920072733951066, 'v': 0.10361232351271003, 'm1_star': 0.0, 'm2_star': 0.0, 'm1_prime_star': 0.0, 'm2_prime_star': 0.0, '-lnL': 11541.003258280909, 'aic': 23096.006516561818}

In [15]:
# Fit the GIM model to the first gim2 dataset and display the result.
DemographicModel(gim2[0], "GIM").infer_parameters(verbose=False)

{'theta0': 0.3763866107862913, 'theta1': 1.5955742847055123, 'theta2': 1.0620768722925729, 'theta1_prime': 1.5963794566523049, 'theta2_prime': 1.5747715644678062, 't1': 3.9608984296347454, 'v': 8.229730324161585, 'm1_star': 1.6132310769091536, 'm2_star': 0.9595441581331854, 'm1_prime_star': 3.9233019026254365, 'm2_prime_star': 3.6224422581833617, '-lnL': 11538.928677637858, 'aic': 23099.857355275715}

In [16]:
# Fit the IIM model to the first gim2 dataset and display the result.
DemographicModel(gim2[0], "IIM").infer_parameters(verbose=False)

{'theta0': 2.1879677025003876, 'theta1': 1.189630887889246, 'theta2': 1.1740726216649815, 'theta1_prime': 2.4551059735659115, 'theta2_prime': 2.4989209182539804, 't1': 0.0, 'v': 5.098515306734724, 'm1_star': 4.040647211955948, 'm2_star': 3.689321741635956, 'm1_prime_star': 0.0, 'm2_prime_star': 0.0, '-lnL': 11539.633748412165, 'aic': 23097.26749682433}

In [17]:
# Fit the SC model to the first gim2 dataset and display the result.
DemographicModel(gim2[0], "SC").infer_parameters(verbose=False)

{'theta0': 2.654682767149195, 'theta1': 1.4625812438682075, 'theta2': 1.2554091887814323, 'theta1_prime': 1.357204131372435, 'theta2_prime': 1.5554795392816083, 't1': 4.789249253748401, 'v': 0.5037914966772996, 'm1_star': 0.0, 'm2_star': 0.0, 'm1_prime_star': 3.678526961179659, 'm2_prime_star': 3.140640751946219, '-lnL': 11539.660675529092, 'aic': 23097.321351058185}