Skip to content

Commit

Permalink
Revert to original iterator API by keeping a pointer to the header
Browse files Browse the repository at this point in the history
On reading the SAM header, store a pointer to it in the htsFile
struct for use by the iterators.  Allows the existing iterator API
to work with SAM files, which means programs that use BAM indexes
don't need to be modified for SAM.  The one disadvantage is that
the header may be kept in memory for longer than is expected
(although reading SAM needs it anyway, so it shouldn't be a
huge issue).

The header is currently only kept for SAM files.  CRAM does its
own thing, and some programs (notably samtools sort) expect to
be able to drop the original header after reading it.  As sort
can currently have thousands of BAM files open at once, keeping
all their headers could lead to excessive memory consumption.

To allow a NULL header in bam, the checks on core.tid and
core.mtid in sam_read1 have to be relaxed a little (although
it's no worse than before when the iterator used bam_read1).

Reference counting is used to prevent the header from being
deleted prematurely.
  • Loading branch information
daviesrob committed Feb 13, 2019
1 parent 24b121a commit 4c2c536
Show file tree
Hide file tree
Showing 5 changed files with 86 additions and 119 deletions.
12 changes: 11 additions & 1 deletion hts.c
Original file line number Diff line number Diff line change
Expand Up @@ -946,6 +946,7 @@ int hts_close(htsFile *fp)
}

save = errno;
bam_hdr_destroy(fp->bam_header);
hts_idx_destroy(fp->idx);
free(fp->fn);
free(fp->fn_aux);
Expand Down Expand Up @@ -2174,7 +2175,16 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_re
khint_t k;
bidx_t *bidx;
uint64_t min_off, max_off;
hts_itr_t *iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
hts_itr_t *iter;

// It's possible to call this function with NULL idx iff
// tid is one of the special values HTS_IDX_REST or HTS_IDX_NONE
if (!idx && !(tid == HTS_IDX_REST || tid == HTS_IDX_NONE)) {
errno = EINVAL;
return NULL;
}

iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
if (iter) {
if (tid < 0) {
uint64_t off = hts_itr_off(idx, tid);
Expand Down
3 changes: 3 additions & 0 deletions htslib/hts.h
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,8 @@ typedef struct htsFormat {
struct __hts_idx_t;
typedef struct __hts_idx_t hts_idx_t;

typedef struct bam_hdr_t bam_hdr_t;

// Maintainers note htsFile cannot be an opaque structure because some of its
// fields are part of libhts.so's ABI (hence these fields must not be moved):
// - fp is used in the public sam_itr_next()/etc macros
Expand All @@ -226,6 +228,7 @@ typedef struct {
htsFormat format;
hts_idx_t *idx;
const char *fnidx;
bam_hdr_t *bam_header;
} htsFile;

// A combined thread pool and queue allocation size.
Expand Down
112 changes: 46 additions & 66 deletions htslib/sam.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,14 +51,15 @@ extern "C" {
@field sdict header dictionary
*/

typedef struct {
typedef struct bam_hdr_t {
int32_t n_targets, ignore_sam_err;
uint32_t l_text;
uint32_t *target_len;
int8_t *cigar_tab;
char **target_name;
char *text;
void *sdict;
uint32_t ref_count;
} bam_hdr_t;

/****************************
Expand Down Expand Up @@ -397,94 +398,73 @@ int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthre
@param end End position in target
@return An iterator on success; NULL on failure
Despite the name, this function does not work with SAM files. Use
sam_itr_queryi2() instead.
The following special values (defined in htslib/hts.h)can be used for @p tid.
When using one of these values, @p beg and @p end are ignored.
HTS_IDX_NOCOOR iterates over unmapped reads sorted at the end of the file
HTS_IDX_START iterates over the entire file
HTS_IDX_REST iterates from the current position to the end of the file
HTS_IDX_NONE always returns "no more alignment records"
When using HTS_IDX_REST or HTS_IDX_NONE, NULL can be passed in to @p idx.
*/
hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end);

/// Create a BAM/CRAM iterator
/// Create a SAM/BAM/CRAM iterator
/** @param idx Index
@param hdr Header
@param region Region specification
@return An iterator on success; NULL on failure
Despite the name, this function does not work with SAM files. Use
sam_itr_querys2() instead.
Regions are parsed by hts_parse_reg(), and take one of the following forms:
region | Outputs
--------------- | -------------
REF | All reads with RNAME REF
REF: | All reads with RNAME REF
REF:START | Reads with RNAME REF overlapping START to end of REF
REF:-END | Reads with RNAME REF overlapping start of REF to END
REF:START-END | Reads with RNAME REF overlapping START to END
. | All reads from the start of the file
* | Unmapped reads at the end of the file (RNAME '*' in SAM)
The form `REF:` should be used when the reference name itself contains a colon.
Note that SAM files must be bgzf-compressed for iterators to work.
*/
hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region);

/// Create a multi-region iterator
/** @param idx Index
@param hdr Header
@param reglist Array of regions to iterate over
@param regcount Number of items in reglist
Each @p reglist entry should have the reference name in the `reg` field, an
array of regions for that reference in `intervals` and the number of items
in `intervals` should be stored in `count`. No other fields need to be filled
in.
The iterator will return all reads overlapping the given regions. If a read
overlaps more than one region, it will only be returned once.
*/
hts_itr_multi_t *sam_itr_regions(const hts_idx_t *idx, bam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount);

/// Get the next read from a BAM/CRAM iterator
/// Get the next read from a SAM/BAM/CRAM iterator
/** @param htsfp Htsfile pointer for the input file
@param itr Iterator
@param r Pointer to a bam1_t struct
@return >= 0 on success; -1 when there is no more data; < -1 on error
Despite the name, this function does not work with SAM files. Use
sam_itr_queryi2() instead.
*/
#define sam_itr_next(htsfp, itr, r) hts_itr_next((htsfp)->fp.bgzf, (itr), (r), (htsfp))
#define sam_itr_multi_next(htsfp, itr, r) hts_itr_multi_next((htsfp), (itr), (r))

// Format agnostic version of the above iterators.
// The key difference is availability of the header.

/// Create a SAM/BAM/CRAM iterator
/** @param idx Index
@param hdr Header
@param tid Target id
@param beg Start position in target
@param end End position in target
@return An iterator on success; NULL on failure
*/
hts_itr_t *sam_itr_queryi2(const hts_idx_t *idx, bam_hdr_t *hdr, int tid, int beg, int end);

/// Create a SAM/BAM/CRAM iterator
/** @param idx Index
@param hdr Header
@param region Region specification
@return An iterator on success; NULL on failure

Regions are parsed by hts_parse_reg(), and take one of the following forms:
REF
REF:
REF:START
REF:-END
REF:START-END
.
*
Where:
REF is a reference name, which should appear as the ID in an @SQ header line.
START is a numeric start position
END is a numeric end position
The first two forms will create an iterator over the entire target reference.
The third will output everything from START to the end of the target.
The fourth will output everything from the start end the target to END.
The fifth will output all reads in the given range
The sixth creates an iterator from the start of the file
The seventh creates an iterator over the uplaced reads (i.e. all reads with
reference name '*' in a SAM file)
The region parser searches for the colon separating REF from START right-to-left.
This means reference names that themselves contain a colon can be used as
long as a trailing colon is always appended to the name.
Note that SAM files must be bgzf-compressed for iterators to work.
*/

hts_itr_t *sam_itr_querys2(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region);

/// Get the next read from a SAM/BAM/CRAM iterator
/// Get the next read from a BAM/CRAM multi-iterator
/** @param htsfp Htsfile pointer for the input file
@param hdr Header
@param itr Iterator
@param r Pointer to a bam1_t struct
@return >= 0 on success; -1 when there is no more data; < -1 on error
*/
int sam_itr_next2(htsFile *fp, bam_hdr_t *hdr, hts_itr_t *iter, void *r);
*/
#define sam_itr_multi_next(htsfp, itr, r) hts_itr_multi_next((htsfp), (itr), (r))

/***************
*** SAM I/O ***
Expand Down
74 changes: 24 additions & 50 deletions sam.c
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ void bam_hdr_destroy(bam_hdr_t *h)
{
int32_t i;
if (h == NULL) return;
if (h->ref_count > 0) {
--h->ref_count;
return;
}
if (h->target_name) {
for (i = 0; i < h->n_targets; ++i)
free(h->target_name[i]);
Expand Down Expand Up @@ -758,12 +762,12 @@ struct itr_cd {
bam_hdr_t *h;
};

static int sam_readrec(BGZF *ignored, void *cdv, void *bv, int *tid, int *beg, int *end)
static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg, int *end)
{
struct itr_cd *cd = (struct itr_cd *)cdv;
htsFile *fp = (htsFile *)fpv;
bam1_t *b = bv;
cd->fp->line.l = 0;
int ret = sam_read1(cd->fp, cd->h, b);
fp->line.l = 0;
int ret = sam_read1(fp, fp->bam_header, b);
if (ret >= 0) {
*tid = b->core.tid;
*beg = b->core.pos;
Expand All @@ -773,6 +777,15 @@ static int sam_readrec(BGZF *ignored, void *cdv, void *bv, int *tid, int *beg, i
}

// This is used only with read_rest=1 iterators, so need not set tid/beg/end.
static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg, int *end)
{
htsFile *fp = (htsFile *)fpv;
bam1_t *b = bv;
fp->line.l = 0;
int ret = sam_read1(fp, fp->bam_header, b);
return ret;
}

static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg, int *end)
{
htsFile *fp = fpv;
Expand Down Expand Up @@ -857,29 +870,6 @@ static int64_t bam_ptell(void *fp)
return bgzf_tell(fd);
}

// This is used only with read_rest=1 iterators, so need not set tid/beg/end.
static int sam_bam_cram_readrec(BGZF *bgzfp, void *fpv, void *bv, int *tid, int *beg, int *end)
{
htsFile *fp = fpv;
bam1_t *b = bv;
switch (fp->format.format) {
case bam: return bam_read1(bgzfp, b);
case cram: {
int ret = cram_get_bam_seq(fp->fp.cram, &b);
if (ret < 0)
return cram_eof(fp->fp.cram) ? -1 : -2;

if (bam_tag2cigar(b, 1, 1) < 0)
return -2;
return ret;
}
default:
// TODO Need headers available to implement this for SAM files
hts_log_error("Not implemented for SAM files");
abort();
}
}

hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx)
{
switch (fp->format.format) {
Expand Down Expand Up @@ -969,11 +959,11 @@ hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
{
const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
if (idx == NULL)
return hts_itr_query(NULL, tid, beg, end, sam_bam_cram_readrec);
return hts_itr_query(NULL, tid, beg, end, sam_readrec_rest);
else if (cidx->fmt == HTS_FMT_CRAI)
return cram_itr_query(idx, tid, beg, end, cram_readrec);
return cram_itr_query(idx, tid, beg, end, sam_readrec);
else
return hts_itr_query(idx, tid, beg, end, bam_readrec);
return hts_itr_query(idx, tid, beg, end, sam_readrec);
}

static int cram_name2id(void *fdv, const char *ref)
Expand All @@ -983,31 +973,13 @@ static int cram_name2id(void *fdv, const char *ref)
}

hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region)
{
const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
if (cidx->fmt == HTS_FMT_CRAI)
return hts_itr_querys(idx, region, cram_name2id, cidx->cram, cram_itr_query, cram_readrec);
else
return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr, hts_itr_query, bam_readrec);
}

// Format agnostic version of the above iterators.
// This is implemented using the itr_cd structure as the hts iterator 'data',
// to get access to htsFile instead of BGZF and the bam_hdr_t pointers..
hts_itr_t *sam_itr_querys2(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region)
{
const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr,
cidx->fmt == HTS_FMT_CRAI ? cram_itr_query : hts_itr_query,
sam_readrec);
}

int sam_itr_next2(htsFile *fp, bam_hdr_t *hdr, hts_itr_t *iter, void *r)
{
struct itr_cd cd = {fp, hdr};
return hts_itr_next(fp->fp.bgzf, iter, r, &cd);
}

hts_itr_multi_t *sam_itr_regions(const hts_idx_t *idx, bam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount)
{
const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
Expand Down Expand Up @@ -1232,7 +1204,9 @@ bam_hdr_t *sam_hdr_read(htsFile *fp)
h = sam_hdr_parse(str.l, str.s);
if (!h) goto error;
h->l_text = str.l; h->text = str.s;
return sam_hdr_sanitise(h);
fp->bam_header = sam_hdr_sanitise(h);
fp->bam_header->ref_count = 1;
return fp->bam_header;

error:
bam_hdr_destroy(h);
Expand Down Expand Up @@ -1735,7 +1709,7 @@ int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b)
switch (fp->format.format) {
case bam: {
int r = bam_read1(fp->fp.bgzf, b);
if (r >= 0) {
if (h && r >= 0) {
if (b->core.tid >= h->n_targets || b->core.tid < -1 ||
b->core.mtid >= h->n_targets || b->core.mtid < -1)
return -3;
Expand Down
4 changes: 2 additions & 2 deletions test/test_view.c
Original file line number Diff line number Diff line change
Expand Up @@ -209,11 +209,11 @@ int sam_loop(int argc, char **argv, int optind, struct opts *opts, htsFile *in,
} else {
for (i = optind + 1; i < argc; ++i) {
hts_itr_t *iter;
if ((iter = sam_itr_querys2(idx, h, argv[i])) == 0) {
if ((iter = sam_itr_querys(idx, h, argv[i])) == 0) {
fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, argv[i]);
goto fail;
}
while ((r = sam_itr_next2(in, h, iter, b)) >= 0) {
while ((r = sam_itr_next(in, iter, b)) >= 0) {
if (!opts->benchmark && sam_write1(out, h, b) < 0) {
fprintf(stderr, "Error writing output.\n");
hts_itr_destroy(iter);
Expand Down

0 comments on commit 4c2c536

Please sign in to comment.