In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import resample as rp
import intervals as ci
import simulation as sim
import summary as sm

from time import time
from datetime import datetime
from multiprocessing import Pool, cpu_count

## Site Diversity

### Resample over sites (change the number of observed sites)
#### 1. Bootstrap 
#### 2. Delete-one Jackknife 
#### 3. Delete-mj Jackknife (random split)

In [29]:
%%time
max_sites = np.linspace(1000, 5000, 5, dtype=int)
result = []
n_experiment = 3
n_replicate = 2

for exp in range(n_experiment):
    pop_ts = sim.sim_one_population(
        diploid_size=200,
        seq_len=1e9,
        rec_rate=1e-8,
        mut_rate=1e-8
    )

    pop_ts_diversity = pop_ts.diversity(span_normalise = False, windows = 'sites').mean()
    
    print('Population site diveristy:', pop_ts_diversity)
    print('Experiment:', exp)
    
    for num_sites in max_sites:
        print(f'The shape of observed population (inds, sites) is: {(num_sites)} \n')
        start = datetime.now()
        print('start_time:', start)

        inputs = [(sim.observe(ts=pop_ts, num_inds=200, max_sites=num_sites)) for _ in range(n_replicate)]

        pool = Pool(cpu_count())

        bt_vals = pool.map(rp.bt_resample_sites, inputs)
        jk_delete_one_vals = pool.map(rp.jk_delete_one_sites, inputs)
        jk_delete_mj_vals = pool.map(rp.jk_delete_mj_sites, inputs)

        coverage_rate = {'bt_standard': np.zeros((n_replicate, ), dtype=int), 
                         'jk_delete_one': np.zeros((n_replicate, ), dtype=int),
                         'jk_delete_mj': np.zeros((n_replicate, ), dtype=int)}

        for i, (bt, jk_one, (jk_mj, sizes)) in enumerate(zip(bt_vals, jk_delete_one_vals, jk_delete_mj_vals)):
            obs_ts = inputs[i]
            obs_ts_diversity = obs_ts.diversity(span_normalise = False, windows = 'sites').mean()

            bt_CI = ci.bt_standard(bt, 0.95, obs_ts_diversity)
            jk_one_CI = ci.jk_delete_one(jk_one, 0.95, obs_ts_diversity)
            jk_mj_CI = ci.jk_delete_mj(jk_mj, 0.95, obs_ts_diversity, sizes)

            if bt_CI[0] < pop_ts_diversity < bt_CI[1]:
                coverage_rate['bt_standard'][i] = 1

            if jk_one_CI[0] < pop_ts_diversity < jk_one_CI[1]:
                coverage_rate['jk_delete_one'][i] = 1

            if jk_mj_CI[0] < pop_ts_diversity < jk_mj_CI[1]:
                coverage_rate['jk_delete_mj'][i] = 1

        print('Run time:', datetime.now() - start)
        result.append([exp, num_sites, pop_ts_diversity, coverage_rate['bt_standard'].mean(),
                    coverage_rate['jk_delete_one'].mean(), coverage_rate['jk_delete_mj'].mean()])

# save the results to csv file
result_df = pd.DataFrame(result)
result_df.columns = ['experiment', 'num_sites', 'pop_ts_diversity', 
                     'bt_standard_normal', 'jk_delete_one', 'jk_delete_mj']

#filename = datetime.now().strftime("%m%d") + 'over_sites_site_diversity.csv'
#result_df.to_csv(f'../data/{filename}', index=False)

Population site diveristy: 0.1458479941890861
Experiment: 0
The shape of observed population (inds, sites) is: 1000 

start_time: 2021-09-04 15:48:46.354337
Run time: 0:00:03.415927
The shape of observed population (inds, sites) is: 2000 

start_time: 2021-09-04 15:48:49.770536
Run time: 0:00:01.171352
The shape of observed population (inds, sites) is: 3000 

start_time: 2021-09-04 15:48:50.942205
Run time: 0:00:01.279362
The shape of observed population (inds, sites) is: 4000 

start_time: 2021-09-04 15:48:52.221889
Run time: 0:00:01.533724
The shape of observed population (inds, sites) is: 5000 

start_time: 2021-09-04 15:48:53.755851
Run time: 0:00:01.709504
Population site diveristy: 0.14973514115842126
Experiment: 1
The shape of observed population (inds, sites) is: 1000 

start_time: 2021-09-04 15:48:56.992651
Run time: 0:00:00.886473
The shape of observed population (inds, sites) is: 2000 

start_time: 2021-09-04 15:48:57.879428
Run time: 0:00:00.993621
The shape of observed pop

### Resample over inds (change the number of observed sites)
#### 1. Bootstrap 
#### 2. Delete-one Jackknife 
#### 3. Delete-mj Jackknife (random split)

In [2]:
# Simulate a single population and keep its tree sequence for the
# individual-resampling cells below.
pop_ts = sim.sim_one_population(
    diploid_size=200, seq_len=1e9, rec_rate=1e-8, mut_rate=1e-8
)

# Mean of the per-site, non-span-normalised diversity values.
pop_ts_diversity = pop_ts.diversity(
    span_normalise=False,
    windows='sites',
).mean()
pop_ts_diversity


0.15096368856612855

In [4]:
# Ten independent observations of the simulated population, each requested
# with num_inds=200 and max_sites=1000.
inputs = [
    sim.observe(ts=pop_ts, num_inds=200, max_sites=1000)
    for _ in range(10)
]

In [5]:
%%time
bt_inds_vals = list(map(rp.bt_resample_inds, inputs))

CPU times: user 2.79 s, sys: 3.24 s, total: 6.03 s
Wall time: 3min 1s


In [8]:
%%time
def jk_delete_one_inds(ts):
    '''Delete-one jackknife resample over inds.

    Builds one task per sample index: the full sample index array with that
    single entry removed, paired with all site indices, and evaluates
    ``sm.get_diversity_general`` on each task in a process pool.

    Parameters
    ----------
    ts : object exposing ``num_samples`` and ``num_sites``
        Observed tree sequence (presumably the output of ``sim.observe`` --
        confirm against the caller).

    Returns
    -------
    numpy.ndarray
        One entry per left-out sample, in sample-index order, holding whatever
        ``sm.get_diversity_general`` returns for that task.
    '''
    samples_index = np.arange(ts.num_samples)
    sites_index = np.arange(ts.num_sites)

    # Named `tasks` rather than `inputs` so the notebook-level `inputs` list
    # is not shadowed inside the function.
    tasks = [(ts, np.delete(samples_index, i), sites_index) for i in samples_index]

    # The context manager shuts the workers down when the map is finished;
    # without it every call leaked a full pool of processes.
    with Pool(cpu_count()) as pool:
        jk_vals = np.array(pool.map(sm.get_diversity_general, tasks))

    return jk_vals

# One array of delete-one jackknife values per observed tree sequence.
jk_delete_on_inds_vals = [jk_delete_one_inds(obs_ts) for obs_ts in inputs]

CPU times: user 2.53 s, sys: 3.25 s, total: 5.78 s
Wall time: 2min 13s


In [12]:
%%time

def jk_delete_mj_inds(ts):
    '''Delete-mj jackknife resampling over inds with unequal block sizes.

    The sample indices are cut into ``int(sqrt(num_samples))`` contiguous
    blocks of random size (each at least 1). For every block, the remaining
    samples together with all sites are passed to
    ``sm.get_diversity_general``, evaluated in a process pool.

    Parameters
    ----------
    ts : object exposing ``num_samples`` and ``num_sites``
        Observed tree sequence (presumably the output of ``sim.observe`` --
        confirm against the caller).

    Returns
    -------
    numpy.ndarray
        One entry per deleted block, in block order.

    Notes
    -----
    NOTE(review): only the jackknife values are returned, not the block
    sizes. ``ci.jk_delete_mj`` is called elsewhere in this notebook with a
    ``sizes`` argument -- confirm how sizes are meant to be obtained for the
    inds variant.
    '''
    samples_index = np.arange(ts.num_samples)
    sites_index = np.arange(ts.num_sites)

    n_samples = ts.num_samples
    n_fold = int(np.sqrt(n_samples))

    # Block sizes: reserve one sample per block, then spread the remaining
    # n_samples - n_fold samples with a uniform multinomial draw.
    block_sizes = np.random.multinomial(
        n=n_samples - n_fold,
        pvals=np.ones(n_fold) / (n_fold),
    )
    block_sizes += 1
    cutoff = np.cumsum(block_sizes)

    # Blocks of sample indices. cutoff[-1] == n_samples, so np.split also
    # returns a trailing empty piece, which the slice discards.
    blocks = np.split(samples_index, cutoff)[0:n_fold]

    # Named `tasks` rather than `inputs` so the notebook-level `inputs` list
    # is not shadowed inside the function.
    tasks = [(ts, np.delete(samples_index, block), sites_index) for block in blocks]

    # The context manager shuts the workers down when the map is finished;
    # without it every call leaked a full pool of processes.
    with Pool(cpu_count()) as pool:
        jk_vals = np.array(pool.map(sm.get_diversity_general, tasks))

    return jk_vals


# One array of delete-mj jackknife values per observed tree sequence.
jk_delete_mj_inds_vals = [jk_delete_mj_inds(obs_ts) for obs_ts in inputs]

CPU times: user 1.64 s, sys: 2.04 s, total: 3.69 s
Wall time: 34.9 s


In [10]:
# Sanity check: the list is built by mapping over `inputs`, so its length
# should equal len(inputs).
len(jk_delete_on_inds_vals)

10