In [8]:
import os
import sys
#split record in the SAM file
import pandas as pd
#check cigar string 
import re

In [9]:
#move to working directory 
os.chdir('/mnt/f/nanopore_data/rDNA/recall_rdna_analysis/SNP/for_new_variant/')

## Example
### From reads to SAM file
Map ONT reads to single copy of rDNA unit sequences with Minimap2  
`minimap2 -ax map-ont cel_5S_rDNA.fa N2EM_5S_rDNA_ONT_reads.fa > sam/N2EM_5S.sam`  

### From BAM file to SAM file  
`samtools view -h bam/N2EM_5S_sort.bam > sam/N2EM_5S.sam`

### Structure of files  
sam/  
------N2EM_5S.sam  
------...  
indel_df/  
------N2EM_5S_bin_2.tsv  
------N2EM_5S_bin_5.tsv  
------N2EM_5S_bin_10.tsv  
------...  
    

In [10]:
#Build functions
#range is optimized for 5S rDNAs
def ReadCigar2df(start, cigar_string, bin_set = 1):
    global indel_df
    for num , IDM in re.findall('(\d+)([IDM])', cigar_string):
        if IDM == 'M':
            start += int(num)
            continue
        #print(num, IDM, start)
        if IDM == "D":
            if (int(num) > int(bin_set)) and (int(num) <100):
                for i in range( start-1 ,start + int(num)-1):
                    indel_df.DEL[i] += 1
            start += int(num)
        if IDM == "I":               
            if (int(num) > int(bin_set)) and (int(num) <100):
                for i in range( start ,start + int(num)):
                    indel_df.INS[i] += 1
#range is optimized for 45S rDNAs
def Read_45SrDNA_Cigar2df(start, cigar_string, bin_set = 2):
    global indel_df
    for num , IDM in re.findall('(\d+)([IDM])', cigar_string):
        if IDM == 'M':
            start += int(num)
            continue
        #print(num, IDM, start)
        if IDM == "D":
            if int(num) > int(bin_set):
                for i in range( start-1 ,start + int(num)-1):
                    indel_df.DEL[i] += 1
            start += int(num)
        if IDM == "I":
            if int(num) > int(bin_set):
                for i in range( start ,start + int(num)):
                    indel_df.INS[i] += 1

In [13]:
# for 5S rDNA 
stage = "cendr_CB4856"
#stage can be N2EM, N2L1, N2YA, CB4856, AF16
rDNA = "5S"
fname = stage + "_" + rDNA + ".sam" 
bin_set = 0
#set_bin = [2,5,10] #minimal length of INDEL (not included)in the cigar string
#for bin_set in set_bin:
counter = 0
#make empty df to load indel with bin size 
indel_df = pd.DataFrame({"pos": list(range(1,1001))})
indel_df["DEL"] = [0]*1000
indel_df["INS"] = [0]*1000
indel_fname =  "indel_df/" + stage + "_" + rDNA + "_bin_" + str(bin_set) + ".tsv" 
with open(fname , "r") as samfile:
    for _ in range(2): #skip the first 2 rows in the SAM file
        next(samfile)
    for line in samfile:   
        sam_content = line.split("\t")
        flag = sam_content[1]
        if flag != 4 :
            counter += 1
            pos = int(sam_content[3])
            cigar = str(sam_content[5])
            ReadCigar2df(pos, cigar , bin_set)
            if counter%500 == 0: 
                sys.stdout.write('\r'+"{} cigar processed".format(counter))
                sys.stdout.flush()
    indel_df.to_csv(indel_fname, index=False, header=True, sep="\t")
print("\nExtracted {} records of cigar to file with INDEL size > {}\n".format(counter , bin_set))

161500 cigar processed
Extracted 161980 records of cigar to file with INDEL size > 0



In [12]:
# for 45S rDNA
stage = "cendr_CB4856"
rDNA = "45S"
fname = stage + "_" + rDNA + ".sam" 
#set_bin = [2,5,10]
bin_set= 0
#for bin_set in set_bin:
counter = 0
#make empty df to load indel with bin size 
indel_df = pd.DataFrame({"pos": list(range(1,7601))})
indel_df["DEL"] = [0] * 7600
indel_df["INS"] = [0] * 7600
indel_fname =  "indel_df/" + stage + "_" + rDNA + "_bin_" + str(bin_set) + ".tsv" 
with open(fname , "r") as samfile:
    for _ in range(2): #skip the first 2 rows in the SAM file
        next(samfile)
    for line in samfile:    
        sam_content = line.split("\t")
        flag = sam_content[1]
        if flag != 4 :
            counter += 1
            pos = int(sam_content[3])
            cigar = str(sam_content[5])
            Read_45SrDNA_Cigar2df(pos, cigar , bin_set)
            if counter%500 == 0: 
                sys.stdout.write('\r'+"{} cigar processed".format(counter))
                sys.stdout.flush()
    indel_df.to_csv(indel_fname, index=False, header=True, sep="\t")
print("\nExtracted {} records of cigar to file with minimal INDEL size {}\n".format(counter , bin_set))

2279000 cigar processed
Extracted 2279419 records of cigar to file with minimal INDEL size 0



The output file could be used for plotting using the provided Rscript in Rstudio.  