Skip to content

Commit

Permalink
{AH} add min_base_quality option
Browse files Browse the repository at this point in the history
  • Loading branch information
AndreasHeger committed Nov 22, 2017
1 parent 4b87d06 commit eb51ad3
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 16 deletions.
4 changes: 3 additions & 1 deletion pysam/libcalignedsegment.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ cdef class PileupRead:

# factor methods
cdef makeAlignedSegment(bam1_t * src, AlignmentFile alignment_file)
cdef makePileupColumn(bam_pileup1_t ** plp, int tid, int pos, int n_pu, AlignmentFile alignment_file)
cdef makePileupColumn(bam_pileup1_t ** plp, int tid, int pos, int n_pu,
uint32_t min_base_quality,
AlignmentFile alignment_file)
cdef makePileupRead(bam_pileup1_t * src, AlignmentFile alignment_file)
cdef uint32_t get_alignment_length(bam1_t * src)
54 changes: 44 additions & 10 deletions pysam/libcalignedsegment.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ from cpython cimport array as c_array
from libc.stdint cimport INT8_MIN, INT16_MIN, INT32_MIN, \
INT8_MAX, INT16_MAX, INT32_MAX, \
UINT8_MAX, UINT16_MAX, UINT32_MAX
from libc.stdio cimport sprintf
from libc.stdio cimport snprintf

from pysam.libcutils cimport force_bytes, force_str, \
charptr_to_str, charptr_to_bytes
Expand All @@ -86,6 +86,9 @@ cdef bint IS_PYTHON3 = PY_MAJOR_VERSION >= 3
cdef char* CODE2CIGAR= "MIDNSHP=XB"
cdef int NCIGAR_CODES = 10

# dimensioned for 8000 pileup limit (+ insertions/deletions)
cdef uint32_t MAX_PILEUP_BUFFER_SIZE = 10000

if IS_PYTHON3:
CIGAR2CODE = dict([y, x] for x, y in enumerate(CODE2CIGAR))
else:
Expand Down Expand Up @@ -557,8 +560,12 @@ cdef makeAlignedSegment(bam1_t * src, AlignmentFile alignment_file):


cdef class PileupColumn
cdef makePileupColumn(bam_pileup1_t ** plp, int tid, int pos,
int n_pu, AlignmentFile alignment_file):
cdef makePileupColumn(bam_pileup1_t ** plp,
int tid,
int pos,
int n_pu,
uint32_t min_base_quality,
AlignmentFile alignment_file):
'''return a PileupColumn object constructed from pileup in `plp` and
setting additional attributes.
Expand All @@ -570,9 +577,8 @@ cdef makePileupColumn(bam_pileup1_t ** plp, int tid, int pos,
dest.tid = tid
dest.pos = pos
dest.n_pu = n_pu
dest.min_base_quality = 13
MAX_BUFFER_SIZE = 1000
dest.buf = <uint8_t *>calloc(MAX_BUFFER_SIZE, sizeof(uint8_t))
dest.min_base_quality = min_base_quality
dest.buf = <uint8_t *>calloc(MAX_PILEUP_BUFFER_SIZE, sizeof(uint8_t))
if dest.buf == NULL:
raise MemoryError("could not allocate pileup buffer")

Expand Down Expand Up @@ -2590,6 +2596,12 @@ cdef class PileupColumn:
if self.buf is not NULL:
free(self.buf)

def set_min_base_quality(self, min_base_quality):
"""set the minimum base quality for this pileup column.
"""
self.min_base_quality = min_base_quality


property reference_id:
'''the reference sequence number as defined in the header'''
def __get__(self):
Expand Down Expand Up @@ -2677,7 +2689,9 @@ cdef class PileupColumn:
cdef uint8_t * buf = self.buf
cdef bam_pileup1_t * p = NULL
n = 0


# todo: reference sequence to count matches/mismatches
# todo: convert assertions to exceptions
for x from 0 <= x < self.n_pu:
p = &(self.plp[0][x])
if pileup_base_qual_skip(p, self.min_base_quality):
Expand All @@ -2686,12 +2700,15 @@ cdef class PileupColumn:
if p.is_head:
buf[n] = '^'
n += 1
assert n < MAX_PILEUP_BUFFER_SIZE

if p.b.core.qual > 93:
buf[n] = 126
else:
buf[n] = p.b.core.qual + 33
n += 1

assert n < MAX_PILEUP_BUFFER_SIZE

if not p.is_del:
if p.qpos < p.b.core.l_qseq:
cc = <uint8_t>seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos)]
Expand All @@ -2710,6 +2727,7 @@ cdef class PileupColumn:
cc = toupper(cc)
buf[n] = cc
n += 1
assert n < MAX_PILEUP_BUFFER_SIZE
else:
if p.is_refskip:
if bam_is_rev(p.b):
Expand All @@ -2719,11 +2737,17 @@ cdef class PileupColumn:
else:
buf[n] = '*'
n += 1
assert n < MAX_PILEUP_BUFFER_SIZE

if p.indel > 0:
buf[n] = '+'
n += 1
n += sprintf(<char *>&(buf[n]), "%i", p.indel)
assert n < MAX_PILEUP_BUFFER_SIZE
n += snprintf(<char *>&(buf[n]),
MAX_PILEUP_BUFFER_SIZE - n,
"%i",
p.indel)
assert n < MAX_PILEUP_BUFFER_SIZE
for j from 1 <= j <= p.indel:
cc = seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos + j)]
if bam_is_rev(p.b):
Expand All @@ -2732,10 +2756,16 @@ cdef class PileupColumn:
cc = toupper(cc)
buf[n] = cc
n += 1
assert n < MAX_PILEUP_BUFFER_SIZE
elif p.indel < 0:
buf[n] = '-'
n += 1
n += sprintf(<char *>&(buf[n]), "%i", -p.indel)
assert n < MAX_PILEUP_BUFFER_SIZE
n += snprintf(<char *>&(buf[n]),
MAX_PILEUP_BUFFER_SIZE - n,
"%i",
-p.indel)
assert n < MAX_PILEUP_BUFFER_SIZE
for j from 1 <= j <= -p.indel:
# int c = (ref && (int)pos+j < ref_len)? ref[pos+j] : 'N';
cc = 'N'
Expand All @@ -2745,12 +2775,16 @@ cdef class PileupColumn:
cc = toupper(cc)
buf[n] = cc
n += 1
assert n < MAX_PILEUP_BUFFER_SIZE
if p.is_tail:
buf[n] = '$'
n += 1
assert n < MAX_PILEUP_BUFFER_SIZE

buf[n] = ':'
n += 1
assert n < MAX_PILEUP_BUFFER_SIZE

# quicker to ensemble all and split than to encode all separately.
return force_str(PyBytes_FromStringAndSize(<char*>buf, n)).split(":")

Expand Down
1 change: 1 addition & 0 deletions pysam/libcalignmentfile.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ cdef class IteratorColumn:
cdef int pos
cdef int n_plp
cdef int mask
cdef uint32_t min_base_quality
cdef bam_pileup1_t * plp
cdef bam_plp_t pileup_iter
cdef __iterdata iterdata
Expand Down
16 changes: 12 additions & 4 deletions pysam/libcalignmentfile.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -974,6 +974,11 @@ cdef class AlignmentFile(HTSFile):
If set to True, detect if read pairs overlap and only take
the higher quality base. This is the default.
min_base_quality: int
Minimum base quality. Bases below the minimum quality will
not be output.
truncate : bool
By default, the samtools pileup engine outputs all reads
Expand Down Expand Up @@ -2131,6 +2136,7 @@ cdef class IteratorColumn:
self.stepper = kwargs.get("stepper", None)
self.max_depth = kwargs.get("max_depth", 8000)
self.ignore_overlaps = kwargs.get("ignore_overlaps", False)
self.min_base_quality = kwargs.get("min_base_quality", 13)
self.iterdata.seq = NULL
self.tid = 0
self.pos = 0
Expand Down Expand Up @@ -2310,10 +2316,11 @@ cdef class IteratorColumnRegion(IteratorColumn):
if self.pos >= self.stop: raise StopIteration

return makePileupColumn(&self.plp,
self.tid,
self.pos,
self.n_plp,
self.samfile)
self.tid,
self.pos,
self.n_plp,
self.min_base_quality,
self.samfile)


cdef class IteratorColumnAllRefs(IteratorColumn):
Expand Down Expand Up @@ -2345,6 +2352,7 @@ cdef class IteratorColumnAllRefs(IteratorColumn):
self.tid,
self.pos,
self.n_plp,
self.min_base_quality,
self.samfile)

# otherwise, proceed to next reference or stop
Expand Down
2 changes: 1 addition & 1 deletion tests/AlignmentFile_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -1012,7 +1012,7 @@ def checkRange(self, contig, start=None, end=None, truncate=False):
'''compare results from iterator with those from samtools.'''
# check if the same reads are returned and in the same order
for column in self.samfile.pileup(
contig, start, end, truncate=truncate):
contig, start, end, truncate=truncate, min_base_quality=0):
if truncate:
self.assertGreaterEqual(column.reference_pos, start)
self.assertLess(column.reference_pos, end)
Expand Down

0 comments on commit eb51ad3

Please sign in to comment.