In [1]:
import numpy as np
import msprime
import os
import pandas as pd

# Name of the dataset; it is reused in the output directory, the parameter
# table file name and every per-replicate file name.
dataset_name = 'data_test'

# Settings shared by every scenario. They are passed to msprime and also
# copied into each row of the scenario parameter table.
simu_settings = dict(
    n_samples=5,
    n_replicates=6,
    segment_length=2e4,
    recombination_rate=1e-8,
    mutation_rate=5e-7,
)

# Output locations: root directory of the dataset and the CSV file that
# will hold one row of parameters per scenario.
outdir = f'simulations_hX09jsbdc89fez/{dataset_name}/'
scenario_params_path = os.path.join(
    outdir,
    f"{dataset_name}_params.csv",
)

# Accumulates one dict of parameters per simulated scenario.
list_params = []

print(f'Simulations will be saved in {outdir}')

# Fixed seed so that both the drawn parameters and the simulated data are
# reproducible on a fresh kernel; change SEED to generate a different dataset.
SEED = 42
rng = np.random.default_rng(SEED)

n_scenarios = 3
for scenario in range(n_scenarios):
    print(scenario)
    # Draw the effective population size of this scenario
    Ne = rng.uniform(100, 1000)

    # Store simulation parameters: one row per scenario, made of the drawn
    # values followed by the settings shared by all scenarios
    list_params.append({'scenario_idx': scenario, 'Ne': Ne, **simu_settings})

    # Simulate all replicates of the scenario.
    # msprime requires random_seed to lie in [1, 2**32 - 1].
    tree_sequence = list(msprime.simulate(
        sample_size=simu_settings['n_samples'],
        num_replicates=simu_settings['n_replicates'],
        length=simu_settings['segment_length'],
        recombination_rate=simu_settings['recombination_rate'],
        mutation_rate=simu_settings['mutation_rate'],
        Ne=Ne,
        random_seed=int(rng.integers(1, 2**32)),
    ))

    # Create the scenario directory once instead of once per replicate
    os.makedirs(os.path.join(outdir, f"scenario_{scenario}"), exist_ok=True)

    # Save data in DNADNA format
    for replicate, ts in enumerate(tree_sequence):
        # Transpose the tskit (sites x samples) matrix so rows are samples
        snps = ts.genotype_matrix().T.astype(np.uint8)
        # NOTE(review): rounding to whole base pairs can give the same
        # position to two distinct sites -- confirm downstream code accepts it.
        pos = np.round(ts.tables.sites.position).astype(int)
        # Default layout (can be changed)
        filename = f"scenario_{scenario}/{dataset_name}_{scenario}_{replicate}.npz"
        fullpath = os.path.join(outdir, filename)
        if os.path.isfile(fullpath):
            print(f"Warning: overwriting {fullpath}.")

        np.savez_compressed(fullpath, SNP=snps, POS=pos)

# Make sure the output directory exists even if no replicate was written,
# and build the table without indexing list_params[0] so that an empty run
# does not raise IndexError.
os.makedirs(outdir, exist_ok=True)
scenario_par = pd.DataFrame(list_params)
scenario_par.to_csv(scenario_params_path)

Simulations will be saved in simulations_hX09jsbdc89fez/data_test/
0
1
2


In [2]:
# Display the scenario parameter table built by the simulation cell
# (one row per scenario) to check it before relying on the CSV file.
scenario_par

Unnamed: 0,scenario_idx,Ne,n_samples,n_replicates,segment_length,recombination_rate,mutation_rate
0,0,324.31713,5,6,20000.0,1e-08,5e-07
1,1,982.083469,5,6,20000.0,1e-08,5e-07
2,2,589.529747,5,6,20000.0,1e-08,5e-07


In [3]:
! less {scenario_params_path}

,scenario_idx,Ne,n_samples,n_replicates,segment_length,recombination_rate,mutation_rate
0,0,324.3171297251008,5,6,20000.0,1e-08,5e-07
1,1,982.0834692427014,5,6,20000.0,1e-08,5e-07
2,2,589.529747274963,5,6,20000.0,1e-08,5e-07
[K[7m(END)[m[K09jsbdc89fez/data_test/data_test_params.csv (END)[m[K

## Check that one of the .npz files loads in the dnadna environment

In [5]:
# Sanity check: load one saved replicate through dnadna's own reader.
from dnadna.snp_sample import SNPSample
# `fullpath` is not defined in this cell: it is the loop variable left over
# from the simulation cell, so this loads whichever file was written last.
# The simulation cell must therefore have been run in the same kernel.
SNPSample.from_file(fullpath)


SNPSample(
    snp=tensor([[1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1,
                 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1,
                 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1,
                 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1],
                [1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1,
                 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1,
                 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1,
                 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1],
                [0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0,
                 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0,
                 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0,
                 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 