In [1]:
import numpy as np
import scipy as sc
import pandas as pd
import itertools as it
import subprocess as sp
import logging
import sys
import re
import collections
import enum
import json
import pysam
import pgenlib as pg

## data type to represent nucleotide and mismatch

In [2]:
@enum.unique
class Nucleotide(enum.Enum):
    '''Integer-coded nucleotide letter.

    A, C, G and T take the values 0..3; the letter N takes -1.
    Members are looked up by letter, e.g. Nucleotide['A'].
    '''
    N = -1
    A, C, G, T = range(4)


# One read/reference disagreement:
#   reference_position: coordinate of the base on the reference
#   reference, read:    Nucleotide on the reference and on the read
#   quality:            base call quality of the read base
Mismatch = collections.namedtuple(
    'Mismatch',
    ['reference_position', 'reference', 'read', 'quality'],
)

### specify data file and reference file

In [3]:
# NOTE(review): both paths are absolute and site-specific; adjust them before
# running this notebook on another machine.
# alignment file, opened below with pysam.AlignmentFile in mode 'rb'
bam_file_name = '/share/PI/mrivas/data/nanopore-wgs-consortium/poretools_fastq.12894489.geq12500.bam'
# reference sequence, opened below with pysam.FastaFile
# (presumably chr20 of hg19, judging by the path -- confirm)
fasta_file_name = '/share/PI/mrivas/data/hg19/chr20.fa'

### read with pysam

In [4]:
# open the alignments in binary read mode ('rb'); records are pulled from
# this handle one at a time further down
bamfile = pysam.AlignmentFile(filename = bam_file_name, 
                              mode = 'rb')
# reference handle; read.find_mismatches() calls reference.fetch() on it
# NOTE(review): neither handle is closed anywhere in the visible cells
reference = pysam.FastaFile(fasta_file_name)

## class representing a mapped read and its mismatches against the reference

In [5]:
class read:
    '''class to manage mapped read

    On construction every aligned read/reference base pair of the segment is
    compared against the reference sequence, and each difference is stored
    as a Mismatch in self.mismatches (unfiltered). get_mismatches() applies
    the base call quality filter and drops positions whose reference base
    is N.

    [attributes]
    aligned_segment: the segment passed to the constructor
    mismatches:      list of Mismatch, in alignment order, unfiltered
    query_name, reference_name, reference_start, reference_end:
                     copied from the aligned segment
    length:          aligned_segment.reference_length
    '''
    def __init__(self, aligned_segment, reference):
        '''
        [args]
        pysam.AlignedSegment aligned_segment: mapped fragment
        pysam.FastaFile      reference:       reference sequence
        '''
        # This copy will be removed
        self.aligned_segment = aligned_segment

        # find all mismatches
        self.find_mismatches(aligned_segment, reference)

        # copy useful info
        self.query_name      = aligned_segment.query_name
        self.reference_name  = aligned_segment.reference_name
        self.reference_start = aligned_segment.reference_start
        self.reference_end   = aligned_segment.reference_end
        self.length          = aligned_segment.reference_length

    def get_mismatches(self, quality_threshod = 14):
        '''mismatches whose base call quality is >= quality_threshod and
        whose reference base is not N

        The keyword is spelled "quality_threshod" (sic); the spelling is
        kept so that existing keyword callers continue to work.
        '''
        unknown = Nucleotide['N']
        return [mismatch for mismatch in self.mismatches
                if mismatch.quality >= quality_threshod and
                   mismatch.reference is not unknown]

    def n_mismatch(self, quality_threshod = 14):
        '''number of mismatches returned by get_mismatches()'''
        return len(self.get_mismatches(quality_threshod))

    def n_match(self, quality_threshod = 14):
        '''self.length minus n_mismatch()

        Everything that is not a reported mismatch is counted here, which
        includes mismatches below the quality threshold and positions whose
        reference base is N.
        '''
        return self.length - self.n_mismatch(quality_threshod)

    @staticmethod
    def _to_nucleotide(letter):
        '''Nucleotide for an upper-case letter; any letter that is not a
        member name (e.g. an IUPAC ambiguity code) is mapped to N'''
        try:
            return Nucleotide[letter]
        except KeyError:
            return Nucleotide['N']

    def find_mismatches(self, aligned_segment, reference):
        '''find all mismatches and store them in self.mismatches
        [args]
        pysam.AlignedSegment aligned_segment: mapped fragment
        pysam.FastaFile      reference:       reference sequence
        '''
        # (read position, reference position) for aligned bases only.
        # with_seq is deliberately not requested: it needs an MD tag in the
        # BAM record, and the reference bases are taken from the FASTA anyway.
        aligned_pairs = aligned_segment.get_aligned_pairs(matches_only=True)

        # nothing aligned: no mismatches, and no reference region to fetch
        if not aligned_pairs:
            self.mismatches = []
            return

        # fetch corresponding reference sequence
        reference_start = aligned_segment.reference_start
        reference_str = reference.fetch(reference = aligned_segment.reference_name,
                                        start     = reference_start,
                                        end       = aligned_segment.reference_end).upper()

        # read these once instead of once per aligned position
        query_sequence  = aligned_segment.query_sequence.upper()
        query_qualities = aligned_segment.query_qualities

        # enumerate all the mismatches by comparing nucleotide letters
        mismatches = []
        for read_position, reference_position in aligned_pairs:
            read_letter = query_sequence[read_position]
            # reference_str starts at reference_start, hence the offset
            ref_letter = reference_str[reference_position - reference_start]
            if read_letter != ref_letter:
                mismatches.append(
                    Mismatch(reference_position = int(reference_position),
                             reference = self._to_nucleotide(ref_letter),
                             read      = self._to_nucleotide(read_letter),
                             quality   = query_qualities[read_position]))
        self.mismatches = mismatches

    def __str__(self):
        '''tab-separated: region, n_match(), n_mismatch(), query name'''
        region = '{}:{}-{}'.format(self.reference_name,
                                   self.reference_start,
                                   self.reference_end)
        return '\t'.join([region,
                          str(self.n_match()),
                          str(self.n_mismatch()),
                          self.query_name])

# test the behavior of the code

In [6]:
# pull the next record from the BAM iterator to use as a test case; the
# built-in next() works on both Python 2 and 3, whereas the .next() method
# is the Python 2 iterator protocol only
aligned_segment = next(bamfile)

In [7]:
# build the read object; its constructor runs find_mismatches() right away
r = read(aligned_segment, reference)

In [8]:
# read.__str__ gives tab-separated: region, n_match, n_mismatch, query name
print(r)

chr20:59980-65177	5190	7	33ecb953-edd6-445b-87da-85c4551d8c8c_Basecall_Alignment_template


### this object holds all the mismatches 

In [9]:
# first ten unfiltered entries: low-quality calls and positions where the
# reference base is N are still present here
r.mismatches[:10]

[Mismatch(reference_position=59980, reference=<Nucleotide.N: -1>, read=<Nucleotide.C: 1>, quality=3),
 Mismatch(reference_position=59981, reference=<Nucleotide.N: -1>, read=<Nucleotide.G: 2>, quality=4),
 Mismatch(reference_position=59982, reference=<Nucleotide.N: -1>, read=<Nucleotide.G: 2>, quality=4),
 Mismatch(reference_position=59983, reference=<Nucleotide.N: -1>, read=<Nucleotide.A: 0>, quality=4),
 Mismatch(reference_position=59984, reference=<Nucleotide.N: -1>, read=<Nucleotide.T: 3>, quality=14),
 Mismatch(reference_position=59987, reference=<Nucleotide.N: -1>, read=<Nucleotide.T: 3>, quality=17),
 Mismatch(reference_position=59988, reference=<Nucleotide.N: -1>, read=<Nucleotide.A: 0>, quality=15),
 Mismatch(reference_position=59989, reference=<Nucleotide.N: -1>, read=<Nucleotide.A: 0>, quality=15),
 Mismatch(reference_position=59990, reference=<Nucleotide.N: -1>, read=<Nucleotide.T: 3>, quality=6),
 Mismatch(reference_position=59991, reference=<Nucleotide.N: -1>, read=<Nucleo

### but `get_mismatches()` reports only the mismatches that pass the base call quality threshold and whose reference base is not N

In [10]:
# default quality_threshod = 14; entries below it, or with reference N, are dropped
r.get_mismatches()

[Mismatch(reference_position=60992, reference=<Nucleotide.G: 2>, read=<Nucleotide.T: 3>, quality=14),
 Mismatch(reference_position=61171, reference=<Nucleotide.A: 0>, read=<Nucleotide.T: 3>, quality=14),
 Mismatch(reference_position=61220, reference=<Nucleotide.G: 2>, read=<Nucleotide.A: 0>, quality=17),
 Mismatch(reference_position=63344, reference=<Nucleotide.T: 3>, read=<Nucleotide.A: 0>, quality=15),
 Mismatch(reference_position=63377, reference=<Nucleotide.C: 1>, read=<Nucleotide.A: 0>, quality=16),
 Mismatch(reference_position=63378, reference=<Nucleotide.C: 1>, read=<Nucleotide.A: 0>, quality=16),
 Mismatch(reference_position=63798, reference=<Nucleotide.C: 1>, read=<Nucleotide.T: 3>, quality=15)]