In [1]:
import pysam
sam_f = '/home/ytanigaw/data/nanopore/20161008_wgs_caucasian_48hr.10k.bwa.mapq60.20kb.chr11.sam'
ref_f = '/share/PI/mrivas/data/hg19/hg19.fa'
hg19 = pysam.FastaFile(ref_f)

In [2]:
def read_sam(sam_f, sep = None):
    '''
    read a sam file
    Currently assumes that sam file has exactly one line of header
    '''
    with open(sam_f, 'r') as f:
        head = f.readline().strip()
        if(sep != None):
            entries = [line.strip().split(sep) for line in f]
        else:
            entries = [line.strip().split() for line in f]
    return(head, entries)

In [3]:
def parse_cigar(cigar):
    '''
    parse CIGAR string and return them as list of 2-tupples
    '''
    parsed = []
    start = 0
    i = 0
    ref_len = 0
    while(i < len(cigar)):
        while('0' <= cigar[i] <= '9'):
            i = i + 1
        segment_length = int(cigar[start:i])
        segment_type = cigar[i]
        parsed.append((segment_length, segment_type))
        i = i + 1
        start = i
    return(parsed, ref_len)

In [4]:
def retrieve_alignment(seq, ref, cigar_list, gap_char = '_'):
    '''
    Based on parsed CIGAR string and read(seq) and reference(ref),
    reconstruct an alignment and count match/mismatches
    '''
    aln_seq = []
    aln_ref = []
    aln_chr = []
    ptr_seq = 0
    ptr_ref = 0
    stats = {'M':0, 'I':0, 'D':0, 'N':0, 'S':0, 'H':0, 'P':0, '=':0, 'X':0}
    for i in xrange(len(cigar_list)):
        if(cigar_list[i][1] == 'M'):
            for j in xrange(cigar_list[i][0]):
                aln_seq.append(seq[ptr_seq])
                aln_ref.append(ref[ptr_ref])
                if(seq[ptr_seq].upper() == ref[ptr_ref].upper()):
                    aln_chr.append('=')
                    stats['='] = stats['='] + 1
                else:
                    aln_chr.append('X')
                    stats['X'] = stats['X'] + 1                    
                ptr_seq = ptr_seq + 1
                ptr_ref = ptr_ref + 1
        elif(cigar_list[i][1] == 'I'):
            aln_seq.append(seq[ptr_seq : ptr_seq + cigar_list[i][0]])
            ptr_seq = ptr_seq + cigar_list[i][0]
            aln_ref.append(gap_char * cigar_list[i][0])    
            aln_chr.append('I' * cigar_list[i][0])    
            stats['I'] = stats['I'] + cigar_list[i][0]                               
        elif(cigar_list[i][1] == 'D'):
            aln_seq.append(gap_char * cigar_list[i][0])
            aln_ref.append(ref[ptr_ref : ptr_ref + cigar_list[i][0]])
            ptr_ref = ptr_ref + cigar_list[i][0]
            aln_chr.append('D' * cigar_list[i][0])    
            stats['D'] = stats['D'] + cigar_list[i][0]                                           
        else:
            aln_seq.append(seq[ptr_seq : ptr_seq + cigar_list[i][0]])
            ptr_seq = ptr_seq + cigar_list[i][0]
            aln_ref.append(ref[ptr_ref : ptr_ref + cigar_list[i][0]])
            ptr_ref = ptr_ref + cigar_list[i][0]
            aln_chr.append(cigar_list[i][1] * cigar_list[i][0])        
            stats[cigar_list[i][1]] = stats[cigar_list[i][1]] + cigar_list[i][0]
    return(''.join(aln_seq), ''.join(aln_ref), ''.join(aln_chr), ptr_ref, stats)    

In [5]:
def process_entry(e, reference, gap_char = '_'):
    '''
    process one line in a sam file and returns an object of sam_entry class
    '''
    cigar_list, ref_len = parse_cigar(e[5])
    (aln_seq, aln_ref, aln_chr, ptr_ref, counts) = \
    retrieve_alignment(seq = e[9],
                       ref = reference.fetch(reference = e[2],
                                             start = int(e[3]) - 1, 
                                             end = int(e[3]) - 1 + 2 * len(e[9])),
                       cigar_list = cigar_list, 
                       gap_char = gap_char)
    return(sam_entry(e[2], int(e[3]), int(e[3]) + ptr_ref, len(e[9]), counts, aln_seq, aln_ref, aln_chr, e))

In [6]:
class sam_entry:
    def __init__(self, name, aln_start, aln_end, seq_len, count, aln_seq, aln_ref, aln_chr, raw):
        self.name = name
        self.aln_start = aln_start
        self.aln_end = aln_end
        self.seq_len = seq_len
        self.count = count
        self.aln_seq = aln_seq
        self.aln_ref = aln_ref
        self.aln_chr = aln_chr
        self.raw = raw
    def format_aln(self, width = None):
        strs = []
        if(width == None):
            width = len(self.aln_chr)
        for batch in xrange(int((len(self.aln_chr) + width - 1) / width)):            
            strs.append(self.aln_seq[batch * width : (batch + 1) * width])
            strs.append(self.aln_chr[batch * width : (batch + 1) * width])
            strs.append(self.aln_ref[batch * width : (batch + 1) * width])
            strs.append('')
        return('\n'.join(strs))
    def format_count(self):
        return('\t'.join(["{:6d}".format(self.count['=']),
                          "{:6d}".format(self.count['X']),
                          "{:6d}".format(self.count['I']),
                          "{:6d}".format(self.count['D']),
                          "{:6d}".format(self.count['N']),
                          "{:6d}".format(self.count['S']),
                          "{:6d}".format(self.count['H']),                          
                          "{:6d}".format(self.count['P'])]))
            

In [7]:
def format_counts(list):
    strs = []
    strs.append('\t'.join(["{:>6}".format('='),
                           "{:>6}".format('X'),
                           "{:>6}".format('I'),
                           "{:>6}".format('D'),
                           "{:>6}".format('N'),
                           "{:>6}".format('S'),
                           "{:>6}".format('H'),
                           "{:>6}".format('P')]))
    return('\n'.join(strs + [i.format_count() for i in list]))

In [8]:
def get_counts_main(sam_f, reference):
    head, entries = read_sam(sam_f)
    data = [process_entry(e, reference) for e in entries]
    return(head, data)

In [9]:
(head, data) = get_counts_main(sam_f, hg19)

In [10]:
print format_counts(data)

     =	     X	     I	     D	     N	     S	     H	     P
 13810	   753	   247	  1940	     0	 15350	     0	     0
   303	   831	    46	   129	     0	 46745	     0	     0
   470	  1262	    40	   173	     0	 31619	     0	     0
  5104	 14326	   641	  2078	     0	    15	     0	     0
  3794	   496	   157	   485	     0	 21985	     0	     0
  6641	 18278	  1687	  1856	     0	    68	     0	     0
  5288	 14677	   560	  2113	     0	    51	     0	     0


In [11]:
print data[0].format_aln(width = 100)

GTTGTCATCCCTACCCCGACAGGATCACCTCACTATCATTACAAGTCACG_GTTGCTTAATGACAGGGATA_GCTCCGAGAAA__TGTCATTAGACAGTT
GTTGTCATCCCTACCCCGACA_GATCACCACAC_A_CAT___Aagtcacgtgttgcttaatgacagggatacagtctgagaaatgtgtcattagacag_t

TTCGTCACCAT__GAACTTTATAGGGTTC____ACAAACC_AGATGCCGGTAGAGCTTCATATATATGTCTAGCCTGTTGCCTTGGTTGCTAC_AATCGG
ttcgtcaccatgcgaactttatagggttctcatacaaacctagat___ggtagagcct_a_ctatatgtctagcctgttg_ctcctaagctacaaatcgg

TATAGCCCTGTCACCAGCCCCCAGGCCT__TCTTCTACACTT_T__TGGAGCTTCTCGC__CAGCAG_CTCAAGAAT_AAAATGAAGGAATATTGTGCTT
tacagcCCTGTCACCAGCCCCCAGGCCTGCTCTTCTACACTTCTCCTGGAGCTTCCTGCCACAGCAGCCTGAAGAATAAAAATGAAGGAAtactgtgc__

TTGAATACCG_CGGCAACTAAAACACACCGGTAAGTATTTGGGTACCCAAACATCTCAACCTAT_AAAGGTACAGT_AAAACAGAGTATA__C__ATATG
_tgaataccgtaggcaactaaaacacaatggtaagtatttgggtacctaaacatctcaacctataaaaggtacagtaaaaacagagtataatcttatagg

ACCATCATCCCATATAACGGTCCGTCATCGACAAATGTTGTTACGCGAGCACATTTTATGCAG___GTGACCATACACGACACACAGAGGAAATTCAGGG
accatcatcccata_aacagtccgtcactgacaaatgttgttacgc_agcacatTTTATGCAGTGTGTGACCATACACGATACACAG