Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion htslib/synced_bcf_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,8 @@ typedef enum
BCF_SR_PAIR_LOGIC, // combination of the PAIR_* values above
BCF_SR_ALLOW_NO_IDX, // allow to proceed even if required index is not present (at the user's risk)
BCF_SR_REGIONS_OVERLAP, // include overlapping records with POS outside the regions: 0=no, 1=VCF line overlap, 2=true variant overlap [1]
BCF_SR_TARGETS_OVERLAP // include overlapping records with POS outside the targets: 0=no, 1=VCF line overlap, 2=true variant overlap [0]
BCF_SR_TARGETS_OVERLAP, // include overlapping records with POS outside the targets: 0=no, 1=VCF line overlap, 2=true variant overlap [0]
BCF_SR_AUTO_TARGETS_FROM_REGIONS // route a dense single-base regions file through the streaming-targets code path; sets readers->targets, so incompatible with bcf_sr_set_targets() [off]
}
bcf_sr_opt_t;

Expand Down
106 changes: 106 additions & 0 deletions synced_bcf_reader.c
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ DEALINGS IN THE SOFTWARE. */
#include "htslib/kseq.h"
#include "htslib/khash_str2int.h"
#include "htslib/bgzf.h"
#include "htslib/hfile.h"
#include "htslib/thread_pool.h"
#include "bcf_sr_sort.h"

Expand Down Expand Up @@ -70,6 +71,7 @@ typedef struct
sr_sort_t sort;
int regions_overlap, targets_overlap;
int *closefile; // close htsfile with sync reader close or not
int auto_targets_from_regions; // BCF_SR_AUTO_TARGETS_FROM_REGIONS opt-in
}
aux_t;

Expand Down Expand Up @@ -141,6 +143,10 @@ int bcf_sr_set_opt(bcf_srs_t *readers, bcf_sr_opt_t opt, ...)
if ( readers->targets ) readers->targets->overlap = BCF_SR_AUX(readers)->targets_overlap;
return 0;

case BCF_SR_AUTO_TARGETS_FROM_REGIONS:
BCF_SR_AUX(readers)->auto_targets_from_regions = 1;
return 0;

default:
break;
}
Expand Down Expand Up @@ -187,6 +193,88 @@ static int *init_filters(bcf_hdr_t *hdr, const char *filters, int *nfilters)
return NULL;
}

// Sniff a regions BED/TSV: returns 1 iff (a) the path is a regular local
// file we can safely re-open, (b) the first SNIFF_LINES non-comment entries
// are *all* single-base regions (CHROM\tN-1\tN), and (c) those entries are
// densely packed (average intra-chromosome inter-entry distance below
// DENSITY_BP). Used by bcf_sr_set_regions() when the caller has opted in
// via BCF_SR_AUTO_TARGETS_FROM_REGIONS to auto-promote dense single-base
// BEDs to the streaming-targets path (samtools/bcftools#2557): the default
// per-region `tbx_itr_queryi()` path is 300-500x slower than a sequential
// scan at SNP-panel sizes (84M entries / 23GB VCF didn't complete in 11+
// hours of 100% CPU in production).
//
// The density check rejects sparse BEDs (e.g. ~11 SNPs per chromosome)
// where streaming would do a full-chromosome scan to satisfy a handful of
// records — pre-promotion the tabix seek path is the cheap one there.
#define BCF_SR_REGIONS_SNIFF_LINES 256
#define BCF_SR_REGIONS_DENSITY_BP 10000
static int sniff_regions_singlebase(const char *fname)
{
// Bail on inputs whose bytes are gone after the sniff close: FIFOs,
// stdin, character devices. Each subsequent hts_open() in
// bcf_sr_regions_init() would see a truncated stream (or block) for
// these. Remote URLs reopen freshly per hts_open() and are safe in
// principle, but the extra round-trip outweighs the heuristic value
// here — keep the auto-promote local-files-only.
if ( hisremote(fname) ) return 0;
if ( fname[0]=='-' && fname[1]==0 ) return 0;
struct stat sb;
if ( stat(fname, &sb) != 0 ) return 0;
if ( !S_ISREG(sb.st_mode) ) return 0;

htsFile *fp = hts_open(fname, "r");
if (!fp) return 0;
kstring_t line = {0,0,0};
kstring_t prev_chr = {0,0,0};
hts_pos_t prev_pos = -1;
hts_pos_t total_intra_dist = 0;
int n = 0, n_intra = 0, all_singlebase = 1;
while (n < BCF_SR_REGIONS_SNIFF_LINES)
{
int ret = hts_getline(fp, KS_SEP_LINE, &line);
if (ret < 0) break; // EOF or read error
if (line.l == 0 || line.s[0] == '#') continue; // skip headers/comments
// Parse CHROM\tSTART\tEND[\t...]. BED 1-bp regions have END-START==1.
char *chr = line.s;
char *p = line.s;
while (*p && *p != '\t') p++;
if (*p != '\t') { all_singlebase = 0; break; }
size_t chr_len = p - chr;
char *start_s = ++p;
while (*p && *p != '\t') p++;
if (*p != '\t') { all_singlebase = 0; break; }
char *end_s = ++p;
hts_pos_t start = strtoll(start_s, NULL, 10);
hts_pos_t end = strtoll(end_s, NULL, 10);
if (end - start != 1) { all_singlebase = 0; break; }

if ( prev_chr.l == chr_len && prev_pos >= 0 &&
memcmp(prev_chr.s, chr, chr_len) == 0 )
{
hts_pos_t d = start - prev_pos;
if (d < 0) d = -d;
total_intra_dist += d;
n_intra++;
}
prev_chr.l = 0;
kputsn(chr, chr_len, &prev_chr);
prev_pos = start;
n++;
}
free(line.s);
free(prev_chr.s);
if (hts_close(fp) < 0)
hts_log_error("Error on closing %s", fname);
if ( !all_singlebase || n != BCF_SR_REGIONS_SNIFF_LINES ) return 0;
// Need at least one same-chrom comparison and the sample must be dense.
// (If every entry sits on a different chromosome, the panel is sparse
// by construction — reject.)
if ( n_intra == 0 ) return 0;
if ( total_intra_dist / n_intra > BCF_SR_REGIONS_DENSITY_BP ) return 0;
return 1;
}

int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file)
{
if ( readers->nreaders || readers->regions )
Expand All @@ -197,6 +285,24 @@ int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file)
return 0;
}

// #2557 fastpath: a dense single-base BED (typical SNP panel: HGDP+1kGP
// 84M sites, AADR 1240k, PGS Catalog) hits a 300-500x per-region seek
// overhead in the default path. The streaming-targets code path
// (bcf_sr_set_targets) handles the same workload at near-baseline
// speed. Opt-in via BCF_SR_AUTO_TARGETS_FROM_REGIONS — when set, the
// sniffer decides whether the file qualifies. The opt-in is required
// because this routes regions through readers->targets, which is
// observable to callers and incompatible with a subsequent
// bcf_sr_set_targets() call. The 0/1/2 --regions-overlap semantics
// match --targets-overlap so the user-set value carries over unchanged.
if ( is_file
&& BCF_SR_AUX(readers)->auto_targets_from_regions
&& sniff_regions_singlebase(regions) )
{
BCF_SR_AUX(readers)->targets_overlap = BCF_SR_AUX(readers)->regions_overlap;
return bcf_sr_set_targets(readers, regions, is_file, 0);
}

readers->regions = bcf_sr_regions_init(regions,is_file,0,1,-2);
if ( !readers->regions ) return -1;
readers->explicit_regs = 1;
Expand Down
Loading