In [1]:
import sys
sys.path.append('../centroFlye_repo/scripts')

import os
import pandas as pd
import numpy as np
import scipy as sc
import bisect
import matplotlib
%matplotlib inline 
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import seaborn as sns
from collections import defaultdict, Counter
import heapq
from copy import deepcopy
from itertools import accumulate
import edlib
import re


from utils.bio import read_bio_seq, read_bio_seqs, write_bio_seqs, compress_homopolymer
from utils.os_utils import smart_makedirs
from read_kmer_cloud import get_reads_kmer_clouds, filter_reads_kmer_clouds
from ncrf_parser import NCRF_Report
from utils.various import find_all_overlap, take_closest, get_kmers
from utils.os_utils import smart_makedirs
from cloud_contig import CloudContig, map_reads, map_reads_fast
from distance_based_kmer_recruitment import get_kmer_freqs_from_ncrf_report, get_kmer_dist_map, filter_dist_tuples


%load_ext autoreload
%autoreload 2

In [2]:
# Default size for every matplotlib figure created in this notebook.
plt.rcParams['figure.figsize'] = [10, 8]


# Simulate repeats and resolve them

In [3]:
# Locations used by the simulation pipeline (all relative to the notebook).
data_dir = "../data"
simulations_dir = "../data/simulations"
centroFlye_scripts_dir = "../centroFlye_repo/scripts"
centroFlye_results_dir = "../data/centroFlye_results/"

# Make sure the directory that holds the simulated data sets is available.
smart_makedirs(simulations_dir)

In [4]:
def simulate_repeat(multiplicity, div_rate, outdir, seed,
                    centroFlye_scripts_dir=centroFlye_scripts_dir,
                    centroFlye_results_dir=centroFlye_results_dir,
                    threads=30, coverage=43):
    !python {centroFlye_scripts_dir}/simulate_tandem_repeat.py \
        --unit {centroFlye_results_dir}/DXZ1_star/DXZ1_rc_star.fasta  \
        --multiplicity {multiplicity} \
        --div-rate {div_rate} \
        -o {outdir} \
       --seed 468273
    
    !~/soft/NanoSim/src/simulator.py linear \
        --seed 468273 \
        -r {outdir}/flanked_tandem_repeat.fasta \
        -c ../data/nanosim_training/training \
        -n 3000
    !mkdir {outdir}/nanosim
    !mv simulated_error_profile {outdir}/nanosim/.
    !mv simulated.log {outdir}/nanosim/.
    !mv simulated_reads.fasta {outdir}/nanosim/.
    
    !python {centroFlye_scripts_dir}/run_ncrf_parallel.py \
        --reads {outdir}/nanosim/simulated_reads.fasta \
        -t {threads} \
        --outdir {outdir}/nanosim/NCRF \
        --repeat {centroFlye_results_dir}/DXZ1_star/DXZ1_rc_star.fasta
    
    !python {centroFlye_scripts_dir}/distance_based_kmer_recruitment.py \
        --ncrf {outdir}/nanosim/NCRF/report.ncrf \
        --coverage 43 \
        --"min-coverage" 4 \
        --outdir {outdir}/nanosim/recruited_unique_kmers_k19
    
    
    !python {centroFlye_scripts_dir}/read_placer.py \
        --ncrf {outdir}/nanosim/NCRF/report.ncrf \
        --n-motif 1 \
        --genomic-kmers {outdir}/nanosim/recruited_unique_kmers_k19/unique_kmers_min_edge_cov_4.txt \
        --outdir {outdir}/nanosim/tr_resolution \
        --k-cloud 19
    
    !python /Poppy/abzikadze/tandem_flye/py/eltr_polisher.py \
        --read-placement {outdir}/nanosim/tr_resolution/read_positions.csv \
        --outdir {outdir}/nanosim/polishing \
        --output-progress \
        --error-mode nano \
        --num-iters 4 \
        --num-threads {threads} \
        --unit {centroFlye_results_dir}/DXZ1_star/DXZ1_rc_star.fasta \
        --ncrf {outdir}/nanosim/NCRF/report.ncrf

In [5]:
# Seed handed to simulate_repeat() by the (commented-out) driver cell.
seed = 468273
# Divergence rates to simulate and, position by position, the suffix used in
# the matching output directory name (sim_m500_d<suffix>).
div_rates = [0.01, 0.005, 0.0001]
div_rates_fns = ['1', '05', '001']

To replicate the simulation results, uncomment and run the following cell (note that it can take up to 24 hours):

In [6]:
#for div_rate, fn in zip(div_rates, div_rates_fns):
#    simulate_repeat(multiplicity=500, div_rate=div_rate, outdir=f"../data/simulations/sim_m500_d{fn}", seed=seed)
    

# Benchmarking with divergence 1%

In [7]:
# From here on `simulations_dir` points at the 1%-divergence data set
# (this rebinds the name that was set in the configuration cell).
simulations_dir = "../data/simulations/sim_m500_d1/"

# FASTA file with the simulated (ground-truth) tandem repeat.
reference_tandem_repeat_fn = os.path.join(simulations_dir, 'tandem_repeat.fasta')

In [8]:
# Show the first two lines of the simulation log (the recorded output lists
# full_tr_len and total_n_mut).
! head -n 2 {simulations_dir}/simulation.log

full_tr_len = 1027000
total_n_mut = 10390


In [9]:
# Load the simulated reference repeat; it is used below with len() and slicing.
reference_tandem_repeat = read_bio_seq(reference_tandem_repeat_fn)

In [10]:
def get_all_kmers(seq, k=19):
    """Count every length-``k`` window of ``seq``.

    Parameters
    ----------
    seq
        Sliceable sequence (e.g. a string) to scan.
    k
        Window length; defaults to 19.

    Returns
    -------
    collections.defaultdict
        Maps each k-mer to its number of occurrences, in order of first
        appearance. Being a ``defaultdict(int)``, looking up an absent k-mer
        yields 0 and inserts that key.
    """
    counts = defaultdict(int)
    # Walk over window end positions; the window is seq[end - k:end].
    for end in range(k, len(seq) + 1):
        counts[seq[end - k:end]] += 1
    return counts

In [11]:
# Count all 19-mers (the default k) of the reference repeat ...
ref_kmers = get_all_kmers(reference_tandem_repeat)
# ... and keep the ones that occur exactly once.
unique_ref_kmers = {kmer for kmer, count in ref_kmers.items() if count == 1}

# Unique k-mers in the reference

In [12]:
# Number of k-mers that occur exactly once in the reference.
len(unique_ref_kmers)

52738

# Simulated reads

In [13]:
# Count lines of the simulated reads file that contain "_unaligned".
! grep -c "_unaligned" {simulations_dir}/nanosim/simulated_reads.fasta

1150


In [14]:
# Count lines of the simulated reads file that contain "_aligned".
! grep -c "_aligned" {simulations_dir}/nanosim/simulated_reads.fasta

1850


In [15]:
# Load the simulated reads; the result is used below as a mapping keyed by read id.
reads = read_bio_seqs(os.path.join(simulations_dir, 'nanosim', 'simulated_reads.fasta'))

In [16]:
# Sum the integer stored in the 7th "_"-separated field of every read id that
# contains "_aligned".
# NOTE(review): presumably that field is the aligned length encoded in the
# simulated read name — confirm against the read naming scheme.
total_length_aligned_reads = sum(
    int(read_id.split('_')[6])
    for read_id, _seq in reads.items()
    if '_aligned' in read_id
)

In [17]:
# Display the total computed above.
total_length_aligned_reads

60796193

In [18]:
# Coverage estimate: total aligned read length divided by the reference length
# plus an extra 400000 bases.
# NOTE(review): 400000 is presumably the combined length of the flanks in
# flanked_tandem_repeat.fasta — confirm and consider naming the constant.
coverage = total_length_aligned_reads / (len(reference_tandem_repeat) + 400000)
# Two separate statements: `print(...), print(...)` built a tuple, so the cell
# also echoed a meaningless `(None, None)` as its result.
print(coverage)
print(round(coverage))

42.604199719691664
43


(None, None)

In [19]:
# Path of the NCRF report produced for the simulated reads.
ncrf_report_fn = os.path.join(simulations_dir, 'nanosim', 'NCRF', 'report.ncrf')

In [20]:
# Count the distinct first (space-separated) fields among the report lines
# that contain "flanked".
! grep "flanked" {ncrf_report_fn} | cut -d ' ' -f 1 | sort | uniq | wc -l

1380


In [21]:
# Parse the NCRF report into an NCRF_Report object.
ncrf_reads_report = NCRF_Report(ncrf_report_fn)

In [22]:
# Number of records held by the parsed report.
len(ncrf_reads_report.records)

1147

# Unique 19-mers

In [23]:
# Collect 19-mer frequencies from the parsed NCRF report.
all_kmers = get_kmer_freqs_from_ncrf_report(
    ncrf_reads_report,
    k=19,
    verbose=False,
    max_nonuniq=3,
)

In [24]:
# Number of distinct k-mers collected from the report.
len(all_kmers)

16949930

In [25]:
# Work with the integer coverage from here on (re-running is harmless:
# rounding an already rounded value leaves it unchanged).
coverage = round(coverage)
bottom, top = 0.9, 3
mean_survival_rate = 0.34

# Accepted frequency window: [bottom, top] times coverage * mean_survival_rate.
# The bounds do not depend on the k-mer, so they are computed once up front.
kmer_count_lower = bottom*coverage*mean_survival_rate
kmer_count_upper = top*coverage*mean_survival_rate

# Keep only the k-mers whose frequency falls inside that window.
filtered_kmers = {
    kmer: count
    for kmer, count in all_kmers.items()
    if kmer_count_lower <= count <= kmer_count_upper
}

In [26]:
# Number of k-mers that pass the frequency window.
len(filtered_kmers)

140780

In [27]:
# File written by distance_based_kmer_recruitment.py (the same path that
# simulate_repeat() hands to read_placer.py as --genomic-kmers).
recruited_unique_kmers_fn = \
    os.path.join(simulations_dir, 'nanosim', 'recruited_unique_kmers_k19', 'unique_kmers_min_edge_cov_4.txt')

In [28]:
# Load the recruited k-mers; the result is used as a set below (`&`, `-`).
recruited_unique_kmers = get_kmers(recruited_unique_kmers_fn)

In [29]:
# Number of recruited k-mers.
len(recruited_unique_kmers)

34717

In [30]:
# Recruited k-mers that are also unique in the reference.
len(recruited_unique_kmers & unique_ref_kmers)

31927

In [31]:
# Recruited k-mers that are NOT unique in the reference (either absent from it
# or present more than once), and how many there are.
erroneously_recruited_kmers = recruited_unique_kmers - unique_ref_kmers
len(erroneously_recruited_kmers)

2790

In [32]:
# Histogram of reference multiplicities of the erroneously recruited k-mers.
# `ref_kmers` is a defaultdict(int), so k-mers absent from the reference are
# reported as 0 (and the lookup adds them to `ref_kmers` with count 0).
Counter(ref_kmers[kmer] for kmer in erroneously_recruited_kmers)

Counter({0: 1547, 2: 1242, 3: 1})

# Reconstructed sequence

In [33]:
# Parse the NCRF report for the reconstruction checks.
# NOTE(review): this is the same path as `ncrf_report_fn`, which was already
# parsed into `ncrf_reads_report`; the second parse looks redundant — confirm
# before merging the two.
ncrf_reads_fn = os.path.join(simulations_dir, 'nanosim', 'NCRF', 'report.ncrf')
ncrf_reads = NCRF_Report(ncrf_reads_fn)

In [34]:
# Split the report into the three groups returned by classify() and show the
# size of each group.
prefix_reads, internal_reads, suffix_reads = ncrf_reads.classify()
len(prefix_reads), len(internal_reads), len(suffix_reads)

(13, 1127, 7)

In [35]:
# Count lines of the read placement table that contain "None".
# NOTE(review): presumably these are reads without an assigned position — confirm.
!grep -c "None" {simulations_dir}/nanosim/tr_resolution/read_positions.csv

1


In [36]:
# Align the simulated reference repeat against the polished sequence from
# iteration 4; the score is reported on the "#0:" line of the output.
!edlib-aligner -l {simulations_dir}/tandem_repeat.fasta {simulations_dir}/nanosim/polishing/final_sequence_4.fasta



Using NW alignment mode.
Reading queries...
Read 1 queries, 1027000 residues total.
Reading target fasta file...
Read target, 1024904 residues.

Comparing queries to target...
1/1
Scores:
<query number>: <score>, <num_locations>, [(<start_location_in_target>, <end_location_in_target>)]
#0: 7281  1  [ (0, 1024903) ]

Cpu time of searching: 0.408573
