In [2]:
import tskit
import tszip
import numpy as np
import pandas as pd
from numpy.random import default_rng

In [3]:
def get_local_ancestry(ts, admixture_time, per_batch):
    # target_samples are at time==0
    target_samples = np.intersect1d(
                ts.samples(),
                np.where(ts.tables.nodes.asdict()['time']==0)[0]
            )
    # target ancestors are at time==admixture_time
    target_ancestors = np.where(ts.tables.nodes.asdict()['time']==admixture_time)[0]
    
    nsample = len(target_samples)
    l = [x for x in range(0, nsample, per_batch)]
    r = [x for x in range(per_batch, nsample+per_batch, per_batch)]
    #print(l, r)
    dfs = []
    for i in range(len(l)):
        local = ts.tables.link_ancestors(          
            samples = target_samples[l[i]:r[i]],
            ancestors = target_ancestors
        )

        local_df = pd.DataFrame({
            'left': local.left, 
            'right': local.right,
            'parent': local.parent,
            'child': local.child
        })
        
        dfs.append(local_df)
        
    local_ancestry_df = pd.concat(dfs)
    pop_of_node = dict(zip(range(len(ts.tables.nodes)), ts.tables.nodes.population))
    # local ancestry population
    local_ancestry_df['localpop'] = [pop_of_node[x] for x in local_ancestry_df['parent']]
    # sampling population
    local_ancestry_df['samplepop'] = [pop_of_node[x] for x in local_ancestry_df['child']]
    local_ancestry_df = local_ancestry_df.sort_values(['samplepop', 'child', 'left']).reset_index(drop=True)
    return(local_ancestry_df)

In [6]:
ts = tszip.decompress('/home/kele/Documents/lai/lai-sim/results/simulations/AmericanAdmixture_4B11/AA_42/test_anal_1.sample.tsz')

print(len(ts.samples()))
ts

175114


Tree Sequence,Unnamed: 1
Trees,342987
Sequence Length,2565228.0
Sample Nodes,175114
Total Size,90.2 MiB
Metadata,No Metadata

Table,Rows,Size,Has Metadata
Edges,1693922,45.2 MiB,
Individuals,233370,14.2 MiB,✅
Migrations,0,4 Bytes,
Mutations,75840,2.1 MiB,
Nodes,445586,14.4 MiB,✅
Populations,4,444 Bytes,✅
Provenances,4,17.4 KiB,
Sites,75840,1.2 MiB,


In [12]:
[s.position for s in ts.sites()]

[1.2742944359779358,
 42.79837908688933,
 73.14186565345153,
 77.70543927233666,
 166.39498272514902,
 212.2924223949667,
 229.60578837851062,
 239.33020239695907,
 253.95831123087555,
 312.8965358028654,
 453.7063127136789,
 455.2432009463664,
 458.3674761727452,
 529.4313842137344,
 545.71736064367,
 599.1509334724396,
 646.1419532359578,
 686.763868689537,
 695.3500957749784,
 746.4046758699697,
 771.8510902135167,
 799.5224416851997,
 821.7596545377746,
 931.0459000070114,
 943.0218509614933,
 985.9024206802715,
 1025.9012150326744,
 1206.3463911088184,
 1281.6836734134704,
 1289.653668596875,
 1297.2256690599024,
 1338.5502100337762,
 1345.5442794265691,
 1351.3077791603282,
 1360.7574570556171,
 1369.024188055424,
 1382.8181520248763,
 1388.6979620996863,
 1423.6273282323964,
 1515.3275527425576,
 1523.703059512889,
 1553.7801657780074,
 1599.2533378293738,
 1638.2902904902585,
 1710.5155942104757,
 1742.9543086886406,
 1867.660406040959,
 1961.7500392948277,
 1965.4399013072252,

In [4]:
ts = tszip.decompress('/home/kele/Documents/lai/lai-sim/results/simulations/AmericanAdmixture_4B11/AA_42/test_anal_1.sample.filter.tsz')

print(len(ts.samples()))
ts

400


Tree Sequence,Unnamed: 1
Trees,65403
Sequence Length,2565228.0
Sample Nodes,400
Total Size,9.3 MiB
Metadata,No Metadata

Table,Rows,Size,Has Metadata
Edges,252513,6.7 MiB,
Individuals,202,12.6 KiB,✅
Migrations,0,4 Bytes,
Mutations,1000,28.3 KiB,
Nodes,17852,591.5 KiB,✅
Populations,4,444 Bytes,✅
Provenances,5,17.9 KiB,
Sites,1000,16.6 KiB,


In [67]:
la = get_local_ancestry(ts, admixture_time=20, per_batch=12)
la

34


Unnamed: 0,left,right,parent,child,localpop,samplepop
0,0.0,1032662.0,17588,0,0,0
1,1032662.0,1502117.0,18959,0,0,0
2,1502117.0,1502437.0,2220,0,0,0
3,1502437.0,2065843.0,18959,0,0,0
4,2065843.0,2066010.0,14009,0,0,0
...,...,...,...,...,...,...
1940,73718.0,681222.0,61532,399,1,3
1941,681222.0,681617.0,57704,399,1,3
1942,681617.0,707838.0,61532,399,1,3
1943,707838.0,708216.0,57704,399,1,3


In [70]:
la.to_hdf('./temp.hdf', key = 'local_ancestry', mode = 'w', complib = 'blosc:lz4', format ='fixed')

/home/kele/Documents/lai/lai-sim/workflow/notebooks


ImportError: Missing optional dependency 'tables'.  Use pip or conda to install tables.

In [52]:
len(np.where(ts.tables.nodes.asdict()['time']==0)[0])

400

In [53]:
ts.tables.nodes.asdict()['time'][ts.samples()]

array([ 0.,  0.,  0., ..., 20., 20., 20.])

In [54]:
len(ts.samples())

175114

In [30]:
random_seed = 42


rng = default_rng(random_seed)
seeds = rng.bit_generator._seed_seq.spawn(10)

In [32]:
type(seeds)

list

In [27]:
rng = np.random.default_rng(seed)
rng.choice(range(9), 3, replace=False)

array([1, 3, 0])

In [28]:
rng.choice(range(9), 3, replace=False)

array([4, 3, 0])

In [29]:
rng.choice(range(9), 3, replace=False)

array([3, 0, 8])

In [3]:
ts = tszip.decompress('/home/kele/Documents/lai/lai-sim/results/sims/AmericanAdmixture_4B11/AA_42.full.tsz')

In [4]:
ts

Tree Sequence,Unnamed: 1
Trees,556279
Sequence Length,2565228.0
Sample Nodes,466740
Total Size,156.9 MiB
Metadata,No Metadata

Table,Rows,Size,Has Metadata
Edges,3004827,80.2 MiB,
Individuals,233370,14.2 MiB,✅
Migrations,0,4 Bytes,
Mutations,127028,3.5 MiB,
Nodes,1044829,33.9 MiB,✅
Populations,4,444 Bytes,✅
Provenances,3,16.9 KiB,
Sites,127028,2.1 MiB,


In [22]:
take = np.array([], dtype = int)
per_pop = 14
for pop in ts.populations():
    samples = ts.samples(population = pop.id)[:per_pop]
    take = np.concatenate([take, samples])

In [23]:
take

array([     0,      1,      2,      3,      4,      5,      6,      7,
            8,      9,     10,     11,     12,     13,  28948,  28949,
        28950,  28951,  28952,  28953,  28954,  28955,  28956,  28957,
        28958,  28959,  28960,  28961,  91804,  91805,  91806,  91807,
        91808,  91809,  91810,  91811,  91812,  91813,  91814,  91815,
        91816,  91817, 174744, 174745, 174746, 174747, 174748, 174749,
       174750, 174751, 174752, 174753, 362754, 362755, 362756, 362757])

In [27]:
take[::2]

array([     0,      2,      4,      6,      8,     10,     12,  28948,
        28950,  28952,  28954,  28956,  28958,  28960,  91804,  91806,
        91808,  91810,  91812,  91814,  91816, 174744, 174746, 174748,
       174750, 174752, 362754, 362756])

In [24]:
simp_ts = ts.simplify(
    samples=take, 
    map_nodes=True, 
    filter_populations=False
)

In [26]:
simp_ts[0]

Tree Sequence,Unnamed: 1
Trees,28525
Sequence Length,2565228.0
Sample Nodes,56
Total Size,4.1 MiB
Metadata,No Metadata

Table,Rows,Size,Has Metadata
Edges,102078,2.7 MiB,
Individuals,28,1.8 KiB,✅
Migrations,0,4 Bytes,
Mutations,8435,238.9 KiB,
Nodes,7653,252.9 KiB,✅
Populations,4,444 Bytes,✅
Provenances,4,17.4 KiB,
Sites,8435,140.0 KiB,


In [57]:
[1,1,3,4][-1:]

[4]

In [47]:
from numpy.random import default_rng
random_seed = 42
rng = default_rng(random_seed)

def sample_inds(ts, pop_id, nind):
    """return the haploid sample ids representing sampling nind individuals from pop_id in ts"""
    hap_samples = ts.samples(population = pop.id)
    # sample from the first haploids of each ind
    take = rng.choice(hap_samples[::2], nind, replace=False)
    samples = np.empty(nind*2, dtype=int)
    samples[0::2] = take
    samples[1::2] = take+1
    return(samples)
    

sample_inds(ts, 3, 18), sample_inds(ts, 2, 18)

(array([371680, 371681, 437354, 437355, 376066, 376067, 372022, 372023,
        443206, 443207, 441892, 441893, 408372, 408373, 464192, 464193,
        417488, 417489, 407764, 407765, 416116, 416117, 372536, 372537,
        439250, 439251, 430796, 430797, 452014, 452015, 383692, 383693,
        435252, 435253, 444488, 444489]),
 array([409088, 409089, 441564, 441565, 441766, 441767, 379922, 379923,
        428418, 428419, 451974, 451975, 433238, 433239, 455616, 455617,
        391518, 391519, 435580, 435581, 463678, 463679, 443688, 443689,
        369806, 369807, 448788, 448789, 382984, 382985, 369378, 369379,
        400588, 400589, 399606, 399607]))

In [61]:
a = sample_inds(ts, 2, 18)
a.sort()
a

array([378580, 378581, 379510, 379511, 380026, 380027, 387572, 387573,
       394098, 394099, 402364, 402365, 409136, 409137, 410760, 410761,
       414278, 414279, 414454, 414455, 414836, 414837, 425890, 425891,
       428288, 428289, 433842, 433843, 435144, 435145, 437292, 437293,
       456378, 456379, 460274, 460275])

In [None]:
c = np.empty((a.size + b.size), dtype=a.dtype)
c[0::2] = a
c[1::2] = b

In [50]:
import os 
import pandas as pd

In [28]:
np.empty(4)

array([4.66610009e-310, 0.00000000e+000, 6.92363728e-310, 6.92353203e-310])

In [52]:
os.chdir('/home/kele/Documents/lai/lai-sim')

In [53]:
config= dict()
config["simulations"] = 'config/simulations.tsv'
config["analyses"] =  'config/analyses.tsv'

In [55]:
# this file has one line per base simulation run
# up through the recap and mutate step
simulations = pd.read_csv(config["simulations"], sep="\t")
analyses = pd.read_csv(config["analyses"], sep="\t")
units = analyses.merge(simulations, on=['sim_name'])
units

Unnamed: 0,anal_name,sim_name,nsamp_admixed,nsamp_ref,MAC_filter,max_snps,model_name,random_seed,slim_script_path,ancestral_Ne,mutation_rate,chr,chr_len
0,test_anal_1,AA_42,50,50,5,10000,AmericanAdmixture_4B11,42,workflow/scripts/sim/SLiM/AmericanAdmixture_4B...,7310,1.44e-08,chr22,0.25


In [44]:
next(simulations.itertuples()).sim_name

'AA_42'

In [45]:
simulations

Unnamed: 0_level_0,sim_name,model_name,random_seed,slim_script_path,ancestral_Ne,mutation_rate,chr,chr_len
sim_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AA_42,AA_42,AmericanAdmixture_4B11,42,../workflow/scripts/sim/SliM/AmericanAdmixture...,7310,1.44e-08,chr22,0.25
AA_52,AA_52,AmericanAdmixture_4B11,52,workflow/scripts/sim/SliM/AmericanAdmixture_4B...,7310,1.44e-08,chr22,0.25


In [46]:
[f'results/sims/{s.model_name}/{s.sim_name}.full.tsz' for s in simulations.itertuples()]

['results/sims/AmericanAdmixture_4B11/AA_42.full.tsz',
 'results/sims/AmericanAdmixture_4B11/AA_52.full.tsz']

In [30]:
#units = analyses.merge(simulations, on = ['sim_name'])
#units = units.set_index(['sim_name', 'anal_name'])
units

Unnamed: 0,anal_name,sim_name,nsamp_admixed,nsamp_ref,MAC_filter,max_snps,model_name,random_seed,slim_script_path,ancestral_Ne,mutation_rate,chr,chr_len
0,test_anal_1,AA_42,50,50,5,10000,AmericanAdmixture_4B11,42,workflow/scripts/sim/SliM/AmericanAdmixture_4B...,7310,1.44e-08,chr22,0.25


In [25]:
simulations

Unnamed: 0_level_0,model_name,random_seed,slim_script_path,ancestral_Ne,mutation_rate,chr,chr_len
sim_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AA_42,AmericanAdmixture_4B11,42,workflow/scripts/sim/SliM/AmericanAdmixture_4B...,7310,1.44e-08,chr22,0.25
AA_52,AmericanAdmixture_4B11,52,workflow/scripts/sim/SliM/AmericanAdmixture_4B...,7310,1.44e-08,chr22,0.25


In [28]:
analyses

Unnamed: 0_level_0,sim_name,nsamp_admixed,nsamp_ref,MAC_filter,max_snps
anal_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
test_anal_1,AA_42,50,50,5,10000


In [48]:
units

NameError: name 'units' is not defined

In [23]:
units.loc[('AA_42', 'test_anal_1')]

nsamp_admixed                                                      50
nsamp_ref                                                          50
MAC_filter                                                          5
max_snps                                                        10000
model_name                                     AmericanAdmixture_4B11
random_seed                                                        42
slim_script_path    workflow/scripts/sim/SliM/AmericanAdmixture_4B...
ancestral_Ne                                                     7310
mutation_rate                                                     0.0
chr                                                             chr22
chr_len                                                          0.25
Name: (AA_42, test_anal_1), dtype: object

Unnamed: 0_level_0,Unnamed: 1_level_0,nsamp_admixed,nsamp_ref,MAC_filter,max_snps,model_name,random_seed,slim_script_path,ancestral_Ne,mutation_rate,chr,chr_len
sim_name,anal_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AA_42,test_anal_1,50,50,5,10000,AmericanAdmixture_4B11,42,workflow/scripts/sim/SliM/AmericanAdmixture_4B...,7310,1.44e-08,chr22,0.25


In [14]:
[f'results/sims/{u.model_name}/{u.sim_name}.full.tsz' for u in units.itertuples()]

['results/sims/AmericanAdmixture_4B11/AA_42.full.tsz']

In [7]:
	.set_index("sim_name", drop=False)
	.sort_index()
)

# this file has one line per analysis run
# each line should reference a simulation
# specifies the sampling and filtering
# may also specify a limited genomic span?? (probably not worth it)
# we will have to see how best to specify the LAI parameters

	.set_index("anal_name", drop=False)
	.sort_index()
)

ValueError: 'sim_name' is both an index level and a column label, which is ambiguous.