In [1]:
import pysam
sam_f = '/home/ytanigaw/data/nanopore/20161008_wgs_caucasian_48hr.10k.bwa.mapq60.20kb.chr11.sam'
ref_f = '/share/PI/mrivas/data/hg19/hg19.fa'
hg19 = pysam.FastaFile(ref_f)

In [2]:
def read_sam(sam_f, sep = None):
    '''
    read a sam file
    '''
    with open(sam_f, 'r') as f:
        head = f.readline().strip()
        if(sep != None):
            entries = [line.strip().split(sep) for line in f]
        else:
            entries = [line.strip().split() for line in f]
    return(head, entries)

In [3]:
def parse_cigar(cigar):
    parsed = []
    start = 0
    i = 0
    while(i < len(cigar)):
        while('0' <= cigar[i] <= '9'):
            i = i + 1
        segment_length = int(cigar[start:i])
        segment_type = cigar[i]
        parsed.append((segment_length, segment_type))
        i = i + 1
        start = i
    return(parsed)

In [4]:
def retrieve_alignment(seq, ref, cigar_list, gap_char = '_'):
    aln_seq = []
    aln_ref = []
    aln_chr = []
    ptr_seq = 0
    ptr_ref = 0
    stats = {'M':0, 'I':0, 'D':0, 'N':0, 'S':0, 'H':0, 'P':0, '=':0, 'X':0}
    for i in xrange(len(cigar_list)):
        if(cigar_list[i][1] == 'M'):
            for j in xrange(cigar_list[i][0]):
                aln_seq.append(seq[ptr_seq])
                aln_ref.append(ref[ptr_ref])
                if(seq[ptr_seq].upper() == ref[ptr_ref].upper()):
                    aln_chr.append('=')
                    stats['='] = stats['='] + 1
                else:
                    aln_chr.append('X')
                    stats['X'] = stats['X'] + 1                    
                ptr_seq = ptr_seq + 1
                ptr_ref = ptr_ref + 1
        elif(cigar_list[i][1] == 'I'):
            aln_seq.append(seq[ptr_seq : ptr_seq + cigar_list[i][0]])
            ptr_seq = ptr_seq + cigar_list[i][0]
            aln_ref.append(gap_char * cigar_list[i][0])    
            aln_chr.append('I' * cigar_list[i][0])    
            stats['I'] = stats['I'] + cigar_list[i][0]                               
        elif(cigar_list[i][1] == 'D'):
            aln_seq.append(gap_char * cigar_list[i][0])
            aln_ref.append(ref[ptr_ref : ptr_ref + cigar_list[i][0]])
            ptr_ref = ptr_ref + cigar_list[i][0]
            aln_chr.append('D' * cigar_list[i][0])    
            stats['D'] = stats['D'] + cigar_list[i][0]                                           
        else:
            aln_seq.append(seq[ptr_seq : ptr_seq + cigar_list[i][0]])
            ptr_seq = ptr_seq + cigar_list[i][0]
            aln_ref.append(ref[ptr_ref : ptr_ref + cigar_list[i][0]])
            ptr_ref = ptr_ref + cigar_list[i][0]
            aln_chr.append(cigar_list[i][1] * cigar_list[i][0])        
            stats[cigar_list[i][1]] = stats[cigar_list[i][1]] + cigar_list[i][0]
    return(aln_seq, aln_ref, aln_chr, ptr_ref, stats)    

In [15]:
def process_entry(e, reference, gap_char = '_'):
    (aln_seq, aln_ref, aln_chr, ptr_ref, counts) = \
    retrieve_alignment(seq = e[9],
                       ref = reference.fetch(reference = e[2],
                                             start = int(e[3]) - 1, 
                                             end = int(e[3]) - 1 + len(e[9])),
                       cigar_list = parse_cigar(e[5]), 
                       gap_char = gap_char)
    return(e[2], int(e[3]), int(e[3]) + ptr_ref, len(e[9]), counts, aln_seq, aln_ref, aln_chr)

In [20]:
class sam_entry:
    def __init__(self, name, aln_start, aln_end, seq_len, count, aln_seq, aln_ref, aln_chr):
        self.name = name
        self.aln_start = aln_start
        self.aln_end = aln_end
        self.seq_len = seq_len
        self.count = count
        self.aln_seq = aln_seq
        self.aln_ref = aln_ref
        self.aln_chr = aln_chr
    def format_aln(self, width = None):
        strs = []
        if(width == None):
            width = len(self.aln_chr)
        for batch in xrange(int((len(self.aln_chr) + width - 1) / width)):            
            strs.append(self.aln_seq[batch * width : (batch + 1) * width])
            strs.append(self.aln_chr[batch * width : (batch + 1) * width])
            strs.append(self.aln_ref[batch * width : (batch + 1) * width])
            strs.append('')
        return('\n'.join(strs))
            

In [16]:
head, entries = read_sam(sam_f)

In [17]:
e = entries[0]
reference = hg19

In [18]:
name, aln_start, aln_end, seq_len, count, aln_seq, aln_ref, aln_chr = \
process_entry(entries[0], reference)

In [19]:
print name, aln_start, aln_end, seq_len, count

chr11 9162869 9194722 30160 {'P': 0, 'S': 15350, 'D': 1940, 'X': 753, 'I': 247, 'H': 0, '=': 13810, 'M': 0, 'N': 0}


In [7]:
ref = reference.fetch(reference = e[2], start = int(e[3]) - 1, end = int(e[3]) - 1 + len(e[9]))

In [11]:
cigar_list = parse_cigar(e[5])

In [14]:
(aln_seq, aln_ref, aln_chr, stats) = retrieve_alignment(seq = e[9],
                                                        ref = ref,
                                                        cigar_list = cigar_list, 
                                                        gap_char = '_')

In [15]:
stats

{'=': 13810,
 'D': 1940,
 'H': 0,
 'I': 247,
 'M': 0,
 'N': 0,
 'P': 0,
 'S': 15350,
 'X': 753}

In [16]:
e

['channel_75_80bdccec-0a01-48bb-854d-094281ccc083_qscore_8.2_read_score_-1.8',
 '16',
 'chr11',
 '9162869',
 '60',
 '21M1I11M1I1M1I3M3I8M1D20M1D11M2D13M1I12M2D16M4D7M1D4M3I10M1I1M1I19M1I12M1D34M2D12M1D1M2D13M2D6M1D9M1D20M3I9M1D53M1D11M1D13M2D1M2D19M1I31M1I16M3D35M1I5M1D41M6D43M2D7M1D7M1D8M2D19M2D3M1D3M9D18M1I35M2D13M2D5M2D5M3D4M1D14M1D9M2D15M1D6M1D14M1D8M3D15M1I17M1I5M1D22M1D3M1D4M1I17M2D38M1D7M1D4M1D14M5D19M2I37M3D7M1D13M1D17M4D3M2D12M1D8M5D11M1D8M2D3M2D5M2D4M1D3M1D7M1D9M1D36M1D11M2I3M1D8M1I5M2D19M1D9M2D14M1D18M1D11M6D17M2D18M1D18M3D15M1D2M1D7M1I24M2D14M1I8M1I11M9D29M1D36M1D8M2D11M1D5M1I3M1I9M1D1M1D24M3D2M1D9M4D20M1D15M3D10M1D5M1D2M1D13M1D12M2I3M1D43M4D16M1D10M2D29M2D1M9D3M1D19M3D4M4D5M5D8M2D10M2D10M1I15M1D4M1D35M4D2M7D3M2D6M2D12M1I2M1D23M1D6M8D15M1D14M2D18M1D29M2D23M5D5M1I8M1D12M2D22M2I15M5I35M3D36M1I8M2D6M1D6M1D9M5D2M1D2M1D2M2D4M5D5M2D35M1D28M1I25M1D10M2D20M1I10M1D9M1D1M1D4M2D1M2D3M1D6M1D5M1D1M3D5M2D9M1D24M4D17M1D7M3D16M1D4M1D5M3D8M2D10M2D19M2D31M4D31M1D16M1D7M1D4M1D5M1D1M1D3M2D32M1

In [2]:
def cigar2count(cigar_str, read, ref):
    '''
    parse CIGAR string and return summary stats
    
    need to expand to obtain a list of positions
    '''
    stats = {'M':0, 'I':0, 'D':0, 'N':0, 'S':0, 'H':0, 'P':0, '=':0, 'X':0}
    segment_start = 0
    i = 0
    relative_pos_on_read = 0
    relative_pos_on_ref = 0
    print len(read)
    while(i < len(cigar_str)):
        while('0' <= cigar_str[i] <= '9'):
            i = i + 1
        segment_length = int(cigar_str[segment_start:i])
        segment_type = cigar_str[i]
        relative_pos_on_read = relative_pos_on_read + segment_length
        if(segment_type in "MD"):
            relative_pos_on_ref = relative_pos_on_ref + segment_length
        if(segment_type == 'M'):
            #for aligned_pos in xrange(segment_length):
            #    if(read[relative_pos_on_read - aligned_pos] == 
            #       ref[relative_pos_on_ref - aligned_pos]):
            #        stats['='] = stats['='] + segment_length
            #    else:
            #        stats['X'] = stats['X'] + segment_length
            stats[segment_type] = stats[segment_type] + segment_length
        else:
            stats[segment_type] = stats[segment_type] + segment_length
        i = i + 1
        segment_start = i
    return(stats)

In [103]:
def read_sam(sam_f, sep = None):
    '''
    read a sam file
    '''
    with open(sam_f, 'r') as f:
        head = f.readline().strip()
        if(sep != None):
            entries = [line.strip().split(sep) for line in f]
        else:
            entries = [line.strip().split() for line in f]
    return(head, entries)

In [4]:
def count(e, reference):
    ref = reference.fetch(reference = e[2], start = int(e[3]) - 1, end = int(e[3]) - 1 + len(e[9]))
    return(cigar2count(cigar_str=e[5], read = e[9], ref = ref))

In [5]:
def get_counts_main(sam_f, reference):
    head, entries = read_sam(sam_f)
    counts = [count(e, reference) for e in entries]
    return(head, entries, counts)

In [6]:
(head, entries, counts) = get_counts_main(sam_f=sam_f, reference = hg19)

30160
47925
33391
20086
26432
26674
20576


In [7]:
entries[0][9]

'GTTGTCATCCCTACCCCGACAGGATCACCTCACTATCATTACAAGTCACGGTTGCTTAATGACAGGGATAGCTCCGAGAAATGTCATTAGACAGTTTTCGTCACCATGAACTTTATAGGGTTCACAAACCAGATGCCGGTAGAGCTTCATATATATGTCTAGCCTGTTGCCTTGGTTGCTACAATCGGTATAGCCCTGTCACCAGCCCCCAGGCCTTCTTCTACACTTTTGGAGCTTCTCGCCAGCAGCTCAAGAATAAAATGAAGGAATATTGTGCTTTTGAATACCGCGGCAACTAAAACACACCGGTAAGTATTTGGGTACCCAAACATCTCAACCTATAAAGGTACAGTAAAACAGAGTATACATATGACCATCATCCCATATAACGGTCCGTCATCGACAAATGTTGTTACGCGAGCACATTTTATGCAGGTGACCATACACGACACACAGAGGAAATTCAGGGCTTTCTAGAACCTTTCTAAGGCCCCATCTCCCTAAGGGCACCTGATGACCCCACCCCTGCACTGCACCAGGCCTCCAACACCACCACCATGTCACCGCCAGCACTTGGGCCCTGTCTGCAGTACCTGGCTCCTGCCACAAACTTGCCATCCTTGCCGGAGCAGCCTGGGAGTATTGTTGATTGCAGTGACAACGGCAGAAGTTCCGCTCTTATGCTTTTCTCAGGGACTACTTATTCTTCTCACGTCTCATAATAGGTTGTGCTTTTCTGTGAAGAGAGAAAAGCACAGTGAAGTTTCTCAACAACACCAACCCTGACTCGTCCACAGGTCATTTGCCATGGGCCTTCCGTTCACACACAGGTAACTTAATGAATAGCACAAGTGTTCTGATGTCACTGGTCATGTTTCAAATGATCTCATTTCTATATCCTTATAAATTTATGTATGTGGCCCTCTTGATATAAAGATGTTCTTTTCCCTAGTCTGAAGACAAAAATTTGAGGATGGTTTCTGCATGAAAATTCTCCAG

In [8]:
counts[0]

{'=': 0,
 'D': 1940,
 'H': 0,
 'I': 247,
 'M': 14563,
 'N': 0,
 'P': 0,
 'S': 15350,
 'X': 0}

In [9]:
counts

[{'=': 0,
  'D': 1940,
  'H': 0,
  'I': 247,
  'M': 14563,
  'N': 0,
  'P': 0,
  'S': 15350,
  'X': 0},
 {'=': 0,
  'D': 129,
  'H': 0,
  'I': 46,
  'M': 1134,
  'N': 0,
  'P': 0,
  'S': 46745,
  'X': 0},
 {'=': 0,
  'D': 173,
  'H': 0,
  'I': 40,
  'M': 1732,
  'N': 0,
  'P': 0,
  'S': 31619,
  'X': 0},
 {'=': 0,
  'D': 2078,
  'H': 0,
  'I': 641,
  'M': 19430,
  'N': 0,
  'P': 0,
  'S': 15,
  'X': 0},
 {'=': 0,
  'D': 485,
  'H': 0,
  'I': 157,
  'M': 4290,
  'N': 0,
  'P': 0,
  'S': 21985,
  'X': 0},
 {'=': 0,
  'D': 1856,
  'H': 0,
  'I': 1687,
  'M': 24919,
  'N': 0,
  'P': 0,
  'S': 68,
  'X': 0},
 {'=': 0,
  'D': 2113,
  'H': 0,
  'I': 560,
  'M': 19965,
  'N': 0,
  'P': 0,
  'S': 51,
  'X': 0}]

In [10]:
e = entries[0]

In [12]:
len(e[9])

30160

In [13]:
e[2]

'chr11'

In [14]:
int(e[3]) - 1

9162868

In [15]:
ref = hg19.fetch(reference = e[2], start = int(e[3]) - 1, end = int(e[3]) - 1 + len(e[9]))

## scratch code follows

In [17]:
e[0]

'channel_75_80bdccec-0a01-48bb-854d-094281ccc083_qscore_8.2_read_score_-1.8'

In [18]:
seq = e[9]

In [19]:
pos = int(e[3]) - 1

In [20]:
pos

9162868

In [21]:
cigar = e[5]

In [24]:
cigar2count(cigar_str= cigar,read=seq, ref=ref )

30160


{'=': 0,
 'D': 1940,
 'H': 0,
 'I': 247,
 'M': 14563,
 'N': 0,
 'P': 0,
 'S': 15350,
 'X': 0}

In [25]:
ref[:20]

'GTTGTCATCCCTACCCCGAC'

In [26]:
seq[:20]

'GTTGTCATCCCTACCCCGAC'

In [34]:
def parse_cigar(cigar):
    parsed = []
    start = 0
    i = 0
    relative_pos = 0
    while(i < len(cigar)):
        while('0' <= cigar[i] <= '9'):
            i = i + 1
        segment_length = int(cigar[start:i])
        segment_type = cigar[i]
        relative_pos = relative_pos + segment_length
        parsed.append((relative_pos, segment_length, segment_type))
        i = i + 1
        start = i
    return(parsed)

In [41]:
cigar_list = parse_cigar(cigar)

In [42]:
cigar_list[0], cigar_list[1]

((21, 21, 'M'), (22, 1, 'I'))

In [81]:
aln_seq = []
aln_ref = []
aln_chr = []
gap_char = '_'
ptr_seq = 0
ptr_ref = 0
#i = 0
for i in xrange(len(cigar_list)):
    if(cigar_list[i][2] == 'M'):
        aln_seq.append(seq[ptr_seq : ptr_seq + cigar_list[i][1]])
        ptr_seq = ptr_seq + cigar_list[i][1]
        aln_ref.append(ref[ptr_ref : ptr_ref + cigar_list[i][1]])
        ptr_ref = ptr_ref + cigar_list[i][1]
        aln_chr.append('M' * cigar_list[i][1])
    elif(cigar_list[i][2] == 'I'):
        aln_seq.append(seq[ptr_seq : ptr_seq + cigar_list[i][1]])
        ptr_seq = ptr_seq + cigar_list[i][1]
        aln_ref.append(gap_char * cigar_list[i][1])    
        aln_chr.append('I' * cigar_list[i][1])    
    elif(cigar_list[i][2] == 'D'):
        aln_seq.append(gap_char * cigar_list[i][1])
        aln_ref.append(ref[ptr_ref : ptr_ref + cigar_list[i][1]])
        ptr_ref = ptr_ref + cigar_list[i][1]
        aln_chr.append('D' * cigar_list[i][1])    
    else:
        aln_seq.append(seq[ptr_seq : ptr_seq + cigar_list[i][1]])
        ptr_seq = ptr_seq + cigar_list[i][1]
        aln_ref.append(ref[ptr_ref : ptr_ref + cigar_list[i][1]])
        ptr_ref = ptr_ref + cigar_list[i][1]
        aln_chr.append('?' * cigar_list[i][1])        
    

In [83]:
aln = '\n'.join([''.join(aln_seq), ''.join(aln_chr), ''.join(aln_ref)])

In [84]:
with open('../data/seq1.aln.txt', 'w') as f:
    f.write(aln + '\n')

In [75]:
ptr_ref

21

In [76]:
ptr_seq

43

In [100]:
aln_seq = []
aln_ref = []
aln_chr = []
gap_char = '_'
ptr_seq = 0
ptr_ref = 0
for i in xrange(len(cigar_list)):
    if(cigar_list[i][2] == 'M'):
        for j in xrange(cigar_list[i][1]):
            aln_seq.append(seq[ptr_seq])
            aln_ref.append(ref[ptr_ref])
            if(seq[ptr_seq].upper() == ref[ptr_ref].upper()):
                aln_chr.append('=')
            else:
                aln_chr.append('X')
            ptr_seq = ptr_seq + 1
            ptr_ref = ptr_ref + 1
            
#        aln_seq.append(seq[ptr_seq : ptr_seq + cigar_list[i][1]])
#        ptr_seq = ptr_seq + cigar_list[i][1]
#        aln_ref.append(ref[ptr_ref : ptr_ref + cigar_list[i][1]])
#        ptr_ref = ptr_ref + cigar_list[i][1]
#        aln_chr.append('M' * cigar_list[i][1])
    elif(cigar_list[i][2] == 'I'):
        aln_seq.append(seq[ptr_seq : ptr_seq + cigar_list[i][1]])
        ptr_seq = ptr_seq + cigar_list[i][1]
        aln_ref.append(gap_char * cigar_list[i][1])    
        aln_chr.append('I' * cigar_list[i][1])    
    elif(cigar_list[i][2] == 'D'):
        aln_seq.append(gap_char * cigar_list[i][1])
        aln_ref.append(ref[ptr_ref : ptr_ref + cigar_list[i][1]])
        ptr_ref = ptr_ref + cigar_list[i][1]
        aln_chr.append('D' * cigar_list[i][1])    
    else:
        aln_seq.append(seq[ptr_seq : ptr_seq + cigar_list[i][1]])
        ptr_seq = ptr_seq + cigar_list[i][1]
        aln_ref.append(ref[ptr_ref : ptr_ref + cigar_list[i][1]])
        ptr_ref = ptr_ref + cigar_list[i][1]
        aln_chr.append('?' * cigar_list[i][1])        
    

In [101]:
aln = '\n'.join([''.join(aln_seq), ''.join(aln_chr), ''.join(aln_ref)])

In [102]:
with open('../data/seq1.aln.txt', 'w') as f:
    f.write(aln + '\n')

In [29]:
stats = {'M':0, 'I':0, 'D':0, 'N':0, 'S':0, 'H':0, 'P':0, '=':0, 'X':0}
start = 0
i = 0
relative_pos = 0
while(i < len(cigar)):
    while('0' <= cigar[i] <= '9'):
        i = i + 1
    segment_length = int(cigar[start:i])
    segment_type = cigar[i]
    relative_pos = relative_pos + segment_length
    stats[segment_type] = stats[segment_type] + segment_length
    print(relative_pos, segment_length, segment_type)    
    i = i + 1
    start = i
      

(21, 21, 'M')
(22, 1, 'I')
(33, 11, 'M')
(34, 1, 'I')
(35, 1, 'M')
(36, 1, 'I')
(39, 3, 'M')
(42, 3, 'I')
(50, 8, 'M')
(51, 1, 'D')
(71, 20, 'M')
(72, 1, 'D')
(83, 11, 'M')
(85, 2, 'D')
(98, 13, 'M')
(99, 1, 'I')
(111, 12, 'M')
(113, 2, 'D')
(129, 16, 'M')
(133, 4, 'D')
(140, 7, 'M')
(141, 1, 'D')
(145, 4, 'M')
(148, 3, 'I')
(158, 10, 'M')
(159, 1, 'I')
(160, 1, 'M')
(161, 1, 'I')
(180, 19, 'M')
(181, 1, 'I')
(193, 12, 'M')
(194, 1, 'D')
(228, 34, 'M')
(230, 2, 'D')
(242, 12, 'M')
(243, 1, 'D')
(244, 1, 'M')
(246, 2, 'D')
(259, 13, 'M')
(261, 2, 'D')
(267, 6, 'M')
(268, 1, 'D')
(277, 9, 'M')
(278, 1, 'D')
(298, 20, 'M')
(301, 3, 'I')
(310, 9, 'M')
(311, 1, 'D')
(364, 53, 'M')
(365, 1, 'D')
(376, 11, 'M')
(377, 1, 'D')
(390, 13, 'M')
(392, 2, 'D')
(393, 1, 'M')
(395, 2, 'D')
(414, 19, 'M')
(415, 1, 'I')
(446, 31, 'M')
(447, 1, 'I')
(463, 16, 'M')
(466, 3, 'D')
(501, 35, 'M')
(502, 1, 'I')
(507, 5, 'M')
(508, 1, 'D')
(549, 41, 'M')
(555, 6, 'D')
(598, 43, 'M')
(600, 2, 'D')
(607, 7, 'M')

In [26]:
stats

{'=': 0, 'D': 20, 'H': 0, 'I': 13, 'M': 244, 'N': 0, 'P': 0, 'S': 0, 'X': 0}

In [43]:
for type in ['N', 'S', 'H', 'P', '=', 'X']:
    print(type, type in cigar)

('N', False)
('S', True)
('H', False)
('P', False)
('=', False)
('X', False)


In [40]:
cigar_t.split(['M', 'I', 'D'])

TypeError: expected a character buffer object

## reference sequence
- we can read reference sequence if needed

In [15]:
import pysam

In [17]:
ref_f = '/share/PI/mrivas/data/hg19/hg19.fa'

In [22]:
hg19 = pysam.FastaFile(ref_f)

In [24]:
ref = hg19.fetch(reference = 'chr11', start = pos, end = pos + len(seq))

In [26]:
len(ref)

30160