## Generate reads from the reference

We generate reads with NanoSim.
We first extract the pre-trained NanoSim read error model into `runs/nanosim_models`. This only needs to be done once.

In [8]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell executes, so edits to project code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

import logging
from pathlib import Path

from simreaduntil.shared_utils.logging_utils import add_comprehensive_stream_handler_to_logger
# Show log records at INFO level and above in the notebook output.
# NOTE(review): passing None presumably attaches the handler to the root logger — confirm in logging_utils.
add_comprehensive_stream_handler_to_logger(None, level=logging.INFO)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# --- Run configuration -------------------------------------------------------
# All paths are relative to the repository root (checked by the assert below).
n_procs = 4
perfect = True  # forwarded as `perfect=` when the NanoSim command is built
use_slurm = True  # requested setting; the command-building cell turns it off when not on the cluster
on_cluster = False  # selects between the cluster-sized and the small local run

# NanoSim checkout and the location its pre-trained model is extracted to.
nanosim_dir = Path("external/ont_nanosim/")
nanosim_model_dir = Path("runs/nanosim_models")
nanosim_model_prefix = nanosim_model_dir / "human_NA12878_DNA_FAB49712_guppy/training"

# Where the simulated reads go, and the reference genome they are generated from.
reads_output_dir = "runs/enrich_usecase/nanosim_reads"
ref_genome_path = "runs/enrich_usecase/data/chm13v2.0_normalized1000000firsttwo.fa.gz"

# Fail early, and say what is missing and where we looked, if the notebook
# was started from the wrong working directory.
assert nanosim_dir.exists(), (
    f"NanoSim directory '{nanosim_dir}' not found from '{Path.cwd()}'; "
    "run this notebook from the repository root"
)

In [3]:
# only necessary once
!mkdir runs/nanosim_models
!tar -xvzf external/ont_nanosim/pre-trained_models/human_NA12878_DNA_FAB49712_guppy.tar.gz -C "{nanosim_model_dir}"

x human_NA12878_DNA_FAB49712_guppy/
x human_NA12878_DNA_FAB49712_guppy/training_unaligned_length.pkl
x human_NA12878_DNA_FAB49712_guppy/training_reads_alignment_rate
x human_NA12878_DNA_FAB49712_guppy/training_model_profile
x human_NA12878_DNA_FAB49712_guppy/training_aligned_region.pkl
x human_NA12878_DNA_FAB49712_guppy/training_first_match.hist
x human_NA12878_DNA_FAB49712_guppy/training_strandness_rate
x human_NA12878_DNA_FAB49712_guppy/training_gap_length.pkl
x human_NA12878_DNA_FAB49712_guppy/training_error_markov_model
x human_NA12878_DNA_FAB49712_guppy/training_aligned_reads.pkl
x human_NA12878_DNA_FAB49712_guppy/training_chimeric_info
x human_NA12878_DNA_FAB49712_guppy/training_ht_ratio.pkl
x human_NA12878_DNA_FAB49712_guppy/training_match_markov_model
x human_NA12878_DNA_FAB49712_guppy/training_ht_length.pkl
x human_NA12878_DNA_FAB49712_guppy/training_error_rate.tsv


In [11]:
from simreaduntil.shared_utils.utils import print_cmd_and_run
from simreaduntil.usecase_helpers.utils import get_gen_nanosim_reads_cmd

# Choose the simulation size: a large run on the cluster, a tiny one locally.
if not on_cluster:
    # Alternative local size used previously: 160_000
    n_reads_per_sim = 10
    use_slurm = False  # off-cluster runs override the configured slurm setting
else:
    n_reads_per_sim = 1_000_000

nanosim_command = get_gen_nanosim_reads_cmd(
    nanosim_dir,
    nanosim_model_prefix,
    ref_genome_path,
    reads_dir=reads_output_dir,
    n_reads_per_sim=n_reads_per_sim,
    perfect=perfect,
    use_slurm=use_slurm,
)

# Dry run: the generated script is logged but not executed.
# TODO(todo2): print_cmd_and_run(nanosim_command, dry=False) only works from the
# command line, not in the notebook, for some reason.
print_cmd_and_run(nanosim_command, dry=True)

2023-07-27 20:20:19,776 - Dry run, so not executing the command:
#!/usr/bin/bash
seed=1
conda run -n nanosim python -c "import HTSeq; print(HTSeq.__version__)"

# cd <correct_dir>
conda run -n nanosim \
    python "external/ont_nanosim/src/simulator.py" genome \
    --model_prefix "runs/nanosim_models/human_NA12878_DNA_FAB49712_guppy/training" \
    --ref_g "runs/enrich_usecase/data/chm13v2.0_normalized1000000firsttwo.fa.gz" \
    -dna_type linear \
    --output "runs/enrich_usecase/nanosim_reads/perfect_reads_seed$seed" \
    --number 10 \
    --seed "$seed" \
    --strandness 0.5 \
    --basecaller guppy \
    --aligned_rate "100%" \
    --num_threads "4" \
    --perfect \
    --no_error_profile \
    --no_flanking


**Paste the command printed above into a terminal to run it.**

Once the reads have been generated, modify the simulator config file so that it takes them as input by adding a line `reads_file = <path>`.