Skip to content

Commit

Permalink
mpileup: new -s and -S options to include only selected samples
Browse files Browse the repository at this point in the history
pulling over of pd3/bcftools@cf5c354
adding in `-S,--samples-file` option and exiting if no samples are
read from the file or list

TODO: add exclude logic with `^` prefix as in other bcftools commands

removed `config.h` from `sample.c` as leftover this is in samtools,
but not bcftools at the moment
  • Loading branch information
mcshane committed May 27, 2016
1 parent ec2dca5 commit cddc380
Show file tree
Hide file tree
Showing 8 changed files with 178 additions and 65 deletions.
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@ polysomy.o: polysomy.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(bcftools_
peakfit.o: peakfit.c peakfit.h $(htslib_hts_h) $(HTSDIR)/htslib/kstring.h
consensus.o: consensus.c $(htslib_hts_h) $(HTSDIR)/htslib/kseq.h rbuf.h $(bcftools_h) $(HTSDIR)/htslib/regidx.h
mpileup.o: mpileup.c $(htslib_sam_h) $(htslib_faidx_h) $(htslib_kstring_h) $(htslib_khash_str2int_h) $(htslib_regidx_h) $(bcftools_h) $(call_h) $(bam2bcf_h) $(sample_h)
sample.o: $(sample_h) $(htslib_hts_h) $(HTSDIR)/htslib/khash_str2int.h
version.o: version.h version.c

test/test-rbuf.o: test/test-rbuf.c rbuf.h
Expand Down
6 changes: 6 additions & 0 deletions doc/bcftools.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1360,6 +1360,12 @@ regions listed in the BED file.
*--ff, --excl-flags* 'STR'|'INT'::
Filter flags: skip reads with mask bits set [UNMAP,SECONDARY,QCFAIL,DUP]

*-s, --samples* 'LIST'::
list of sample names. See *<<common_options,Common Options>>*

*-S, --samples-file* 'FILE'::
file of sample names. See *<<common_options,Common Options>>*

*-x, --ignore-overlaps*::
Disable read-pair overlap detection.

Expand Down
69 changes: 54 additions & 15 deletions mpileup.c
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,12 @@ DEALINGS IN THE SOFTWARE. */

typedef struct {
int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag;
int rflag_require, rflag_filter, output_type;
int rflag_require, rflag_filter, output_type, sample_is_file;
int openQ, extQ, tandemQ, min_support; // for indels
double min_frac; // for indels
char *reg, *pl_list, *fai_fname, *output_fname;
char *reg, *pl_list, *fai_fname, *output_fname, *samples_list;
faidx_t *fai;
void *rghash;
void *rghash_exc;
regidx_t *bed;
gvcf_t *gvcf;
int argc;
Expand All @@ -86,6 +86,7 @@ typedef struct {
bam_hdr_t *h;
mplp_ref_t *ref;
const mplp_conf_t *conf;
void *rghash_inc; // whitelist for readgroups on per-bam basis
} mplp_aux_t;

typedef struct {
Expand Down Expand Up @@ -158,6 +159,7 @@ static int mplp_func(void *data, bam1_t *b)
int ret, skip = 0, ref_len;
do {
int has_ref;
skip = 0;
ret = ma->iter? sam_itr_next(ma->fp, ma->iter, b) : sam_read1(ma->fp, ma->h, b);
if (ret < 0) break;
// The 'B' cigar operation is not part of the specification, considering as obsolete.
Expand All @@ -172,9 +174,16 @@ static int mplp_func(void *data, bam1_t *b)
skip = regidx_overlap(ma->conf->bed, ma->h->target_name[b->core.tid],b->core.pos, bam_endpos(b)-1, NULL) ? 0 : 1;
if (skip) continue;
}
if (ma->conf->rghash) { // exclude read groups
if (ma->rghash_inc)
{
// skip unless read group is listed
uint8_t *rg = bam_aux_get(b, "RG");
if ( !rg ) { skip = 1; continue; }
if ( !khash_str2int_has_key(ma->rghash_inc, (const char*)(rg+1)) ) { skip = 1; continue; }
}
if (ma->conf->rghash_exc) { // exclude read groups
uint8_t *rg = bam_aux_get(b, "RG");
skip = (rg && khash_str2int_get(ma->conf->rghash, (const char*)(rg+1), NULL)==0);
skip = (rg && khash_str2int_get(ma->conf->rghash_exc, (const char*)(rg+1), NULL)==0);
if (skip) continue;
}
if (ma->conf->flag & MPLP_ILLUMINA13) {
Expand Down Expand Up @@ -278,7 +287,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
bam_mplp_t iter;
bam_hdr_t *h = NULL; /* header of first file in input list */
char *ref;
void *rghash = NULL;
void *rghash_exc = NULL;

bcf_callaux_t *bca = NULL;
bcf_callret1_t *bcr = NULL;
Expand All @@ -303,6 +312,24 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
exit(EXIT_FAILURE);
}

void *samples_inc = NULL;
if ( conf->samples_list )
{
int nsamples = 0;
char **samples = hts_readlist(conf->samples_list, conf->sample_is_file, &nsamples);
if (!samples) {
fprintf(stderr,"[%s] could not read the samples list %s\n", __func__, conf->samples_list);
exit(EXIT_FAILURE);
}
if ( nsamples ) samples_inc = khash_str2int_init();
for (i=0; i<nsamples; i++)
{
khash_str2int_inc(samples_inc, strdup(samples[i]));
free(samples[i]);
}
free(samples);
}

// read the header of each file in the list and initialize data
for (i = 0; i < n; ++i) {
bam_hdr_t *h_tmp;
Expand All @@ -329,9 +356,10 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
fprintf(stderr,"[%s] fail to read the header of %s\n", __func__, fn[i]);
exit(EXIT_FAILURE);
}
bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text);
if ( samples_inc ) data[i]->rghash_inc = khash_str2int_init();
bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text, samples_inc, data[i]->rghash_inc);
// Collect read group IDs with PL (platform) listed in pl_list (note: fragile, strstr search)
rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list);
rghash_exc = bcf_call_add_rg(rghash_exc, h_tmp->text, conf->pl_list);
if (conf->reg) {
hts_idx_t *idx = sam_index_load(data[i]->fp, fn[i]);
if (idx == NULL) {
Expand Down Expand Up @@ -458,7 +486,6 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
// Initialise the calling algorithm
bca = bcf_call_init(-1., conf->min_baseQ);
bcr = (bcf_callret1_t*) calloc(sm->n, sizeof(bcf_callret1_t));
bca->rghash = rghash;
bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ;
bca->min_frac = conf->min_frac;
bca->min_support = conf->min_support;
Expand Down Expand Up @@ -519,7 +546,8 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
bcf_call2bcf(&bc, bcf_rec, bcr, conf->fmt_flag, 0, 0);
flush_bcf_records(conf, bcf_fp, bcf_hdr, bcf_rec);
// call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring?
if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0)
// check me: rghash in bcf_call_gap_prep() should have no effect, reads mplp_func already excludes them
if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, NULL) >= 0)
{
bcf_callaux_clean(bca, &bc);
for (i = 0; i < gplp.n; ++i)
Expand Down Expand Up @@ -552,15 +580,17 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
bam_smpl_destroy(sm); free(buf.s);
for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]);
free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp);
bcf_call_del_rghash(rghash);
bcf_call_del_rghash(rghash_exc);
bam_mplp_destroy(iter);
bam_hdr_destroy(h);
for (i = 0; i < n; ++i) {
sam_close(data[i]->fp);
if (data[i]->iter) hts_itr_destroy(data[i]->iter);
if ( data[i]->rghash_inc ) khash_str2int_destroy_free(data[i]->rghash_inc);
free(data[i]);
}
free(data); free(plp); free(n_plp);
if ( samples_inc ) khash_str2int_destroy_free(samples_inc);
free(mp_ref.ref[0]);
free(mp_ref.ref[1]);
return ret;
Expand Down Expand Up @@ -689,6 +719,8 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
" --ff, --excl-flags STR|INT filter flags: skip reads with mask bits set\n"
" [%s]\n", tmp_filter);
fprintf(fp,
" -s, --samples LIST comma separated list of samples to include\n"
" -S, --samples-file FILE file of samples to include\n"
" -x, --ignore-overlaps disable read-pair overlap detection\n"
"\n"
"Output options:\n"
Expand Down Expand Up @@ -774,6 +806,8 @@ int bam_mpileup(int argc, char *argv[])
{"min-bq", required_argument, NULL, 'Q'},
{"ignore-overlaps", no_argument, NULL, 'x'},
{"output-type", required_argument, NULL, 'O'},
{"samples", required_argument, NULL, 's'},
{"samples-file", required_argument, NULL, 'S'},
{"output-tags", required_argument, NULL, 't'},
{"ext-prob", required_argument, NULL, 'e'},
{"gap-frac", required_argument, NULL, 'F'},
Expand All @@ -786,7 +820,7 @@ int bam_mpileup(int argc, char *argv[])
{"platforms", required_argument, NULL, 'P'},
{NULL, 0, NULL, 0}
};
while ((c = getopt_long(argc, argv, "Ag:f:r:l:q:Q:C:Bd:L:b:P:po:e:h:Im:F:EG:6O:xt:",lopts,NULL)) >= 0) {
while ((c = getopt_long(argc, argv, "Ag:f:r:l:q:Q:C:Bd:L:b:P:po:e:h:Im:F:EG:6O:xt:s:S:",lopts,NULL)) >= 0) {
switch (c) {
case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break;
case 1 :
Expand Down Expand Up @@ -824,6 +858,11 @@ int bam_mpileup(int argc, char *argv[])
case 'I': mplp.flag |= MPLP_NO_INDEL; break;
case 'E': mplp.flag |= MPLP_REDO_BAQ; break;
case '6': mplp.flag |= MPLP_ILLUMINA13; break;
case 's': mplp.samples_list = optarg; break;
case 'S':
mplp.samples_list = optarg;
mplp.sample_is_file = 1;
break;
case 'O':
switch (optarg[0]) {
case 'b': mplp.output_type = FT_BCF_GZ; break;
Expand Down Expand Up @@ -854,11 +893,11 @@ int bam_mpileup(int argc, char *argv[])
case 'G': {
FILE *fp_rg;
char buf[1024];
mplp.rghash = khash_str2int_init();
mplp.rghash_exc = khash_str2int_init();
if ((fp_rg = fopen(optarg, "r")) == NULL)
fprintf(stderr, "(%s) Fail to open file %s. Continue anyway.\n", __func__, optarg);
while (!feof(fp_rg) && fscanf(fp_rg, "%s", buf) > 0) // this is not a good style, but forgive me...
khash_str2int_inc(mplp.rghash, strdup(buf));
khash_str2int_inc(mplp.rghash_exc, strdup(buf));
fclose(fp_rg);
}
break;
Expand Down Expand Up @@ -907,7 +946,7 @@ int bam_mpileup(int argc, char *argv[])
}
else
ret = mpileup(&mplp, argc - optind, argv + optind);
if (mplp.rghash) khash_str2int_destroy_free(mplp.rghash);
if (mplp.rghash_exc) khash_str2int_destroy_free(mplp.rghash_exc);
free(mplp.reg); free(mplp.pl_list);
if (mplp.fai) fai_destroy(mplp.fai);
if (mplp.bed) regidx_destroy(mplp.bed);
Expand Down
89 changes: 40 additions & 49 deletions sample.c
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/* sample.c -- group data by sample.
Copyright (C) 2010, 2011 Broad Institute.
Copyright (C) 2013 Genome Research Ltd.
Copyright (C) 2013, 2016 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
Expand All @@ -23,67 +23,53 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */

#include <config.h>

#include <stdlib.h>
#include <string.h>
#include <htslib/hts.h>
#include <htslib/khash_str2int.h>
#include "sample.h"
#include "htslib/khash.h"
KHASH_MAP_INIT_STR(sm, int)

bam_sample_t *bam_smpl_init(void)
{
bam_sample_t *s;
s = (bam_sample_t*) calloc(1, sizeof(bam_sample_t));
s->rg2smid = kh_init(sm);
s->sm2id = kh_init(sm);
s->rg2smid = khash_str2int_init();
s->sm2id = khash_str2int_init();
return s;
}

void bam_smpl_destroy(bam_sample_t *sm)
{
int i;
khint_t k;
khash_t(sm) *rg2smid = (khash_t(sm)*)sm->rg2smid;
if (sm == 0) return;
for (i = 0; i < sm->n; ++i) free(sm->smpl[i]);
if ( !sm ) return;
if ( sm->rg2smid ) khash_str2int_destroy_free(sm->rg2smid);
if ( sm->sm2id ) khash_str2int_destroy_free(sm->sm2id);
free(sm->smpl);
for (k = kh_begin(rg2smid); k != kh_end(rg2smid); ++k)
if (kh_exist(rg2smid, k)) free((char*)kh_key(rg2smid, k));
kh_destroy(sm, (khash_t(sm)*) sm->rg2smid);
kh_destroy(sm, (khash_t(sm)*) sm->sm2id);
free(sm);
}

static void add_pair(bam_sample_t *sm, khash_t(sm) *sm2id, const char *key, const char *val)
static void add_pair(bam_sample_t *sm, void *sm2id, const char *readgroup, const char *sample)
{
khint_t k_rg, k_sm;
int ret;
khash_t(sm) *rg2smid = (khash_t(sm)*)sm->rg2smid;
k_rg = kh_get(sm, rg2smid, key);
if (k_rg != kh_end(rg2smid)) return; // duplicated @RG-ID
k_rg = kh_put(sm, rg2smid, strdup(key), &ret);
k_sm = kh_get(sm, sm2id, val);
if (k_sm == kh_end(sm2id)) { // absent
if (sm->n == sm->m) {
sm->m = sm->m? sm->m<<1 : 1;
sm->smpl = (char**) realloc(sm->smpl, sizeof(char*) * sm->m);
}
sm->smpl[sm->n] = strdup(val);
k_sm = kh_put(sm, sm2id, sm->smpl[sm->n], &ret);
kh_val(sm2id, k_sm) = sm->n++;
if ( khash_str2int_has_key(sm->rg2smid,readgroup) ) return; // duplicated @RG-ID

int ismpl;
if ( khash_str2int_get(sm2id, sample, &ismpl) < 0 )
{
// the sample encountered for the first time
ismpl = sm->n++;
hts_expand0(char*,sm->n,sm->m,sm->smpl);
sm->smpl[ismpl] = strdup(sample);
khash_str2int_set(sm2id, sm->smpl[ismpl], ismpl);
}
kh_val(rg2smid, k_rg) = kh_val(sm2id, k_sm);
khash_str2int_set(sm->rg2smid, strdup(readgroup), ismpl);
}

int bam_smpl_add(bam_sample_t *sm, const char *fn, const char *txt)
int bam_smpl_add(bam_sample_t *sm, const char *fn, const char *txt, void *white_list, void *white_hash)
{
const char *p = txt, *q, *r;
kstring_t buf, first_sm;
int n = 0;
khash_t(sm) *sm2id = (khash_t(sm)*)sm->sm2id;
if (txt == 0) {
add_pair(sm, sm2id, fn, fn);
add_pair(sm, sm->sm2id, fn, fn);
return 0;
}
memset(&buf, 0, sizeof(kstring_t));
Expand All @@ -99,36 +85,41 @@ int bam_smpl_add(bam_sample_t *sm, const char *fn, const char *txt)
for (u = (char*)q; *u && *u != '\t' && *u != '\n'; ++u);
for (v = (char*)r; *v && *v != '\t' && *v != '\n'; ++v);
ioq = *u; ior = *v; *u = *v = '\0';
buf.l = 0; kputs(fn, &buf); kputc('/', &buf); kputs(q, &buf);
add_pair(sm, sm2id, buf.s, r);
if ( !first_sm.s )
kputs(r,&first_sm);
if ( !white_list || khash_str2int_has_key(white_list,r) )
{
buf.l = 0; kputs(fn, &buf); kputc('/', &buf); kputs(q, &buf);
add_pair(sm, sm->sm2id, buf.s, r);
if ( !first_sm.s )
kputs(r,&first_sm);
if ( white_list ) khash_str2int_inc(white_hash,strdup(q));
}
*u = ioq; *v = ior;
} else break;
p = q > r? q : r;
++n;
}
if (n == 0) add_pair(sm, sm2id, fn, fn);
if (n == 0) add_pair(sm, sm->sm2id, fn, fn);
// If there is only one RG tag present in the header and reads are not annotated, don't refuse to work but
// use the tag instead.
else if ( n==1 && first_sm.s )
add_pair(sm,sm2id,fn,first_sm.s);
add_pair(sm,sm->sm2id,fn,first_sm.s);
if ( first_sm.s )
free(first_sm.s);

// add_pair(sm, sm2id, fn, fn);
free(buf.s);
return 0;
}

int bam_smpl_rg2smid(const bam_sample_t *sm, const char *fn, const char *rg, kstring_t *str)
{
khint_t k;
khash_t(sm) *rg2smid = (khash_t(sm)*)sm->rg2smid;
if (rg) {
int ismpl;
if ( rg )
{
str->l = 0;
kputs(fn, str); kputc('/', str); kputs(rg, str);
k = kh_get(sm, rg2smid, str->s);
} else k = kh_get(sm, rg2smid, fn);
return k == kh_end(rg2smid)? -1 : kh_val(rg2smid, k);
if ( khash_str2int_get(sm->rg2smid, str->s, &ismpl) < 0 ) return -1;
return ismpl;
}
if ( khash_str2int_get(sm->rg2smid, fn, &ismpl) < 0 ) return -1;
return ismpl;
}
3 changes: 2 additions & 1 deletion sample.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/* sample.h -- group data by sample.
Copyright (C) 2010 Broad Institute.
Copyright (C) 2016 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
Expand Down Expand Up @@ -34,7 +35,7 @@ typedef struct {
} bam_sample_t;

bam_sample_t *bam_smpl_init(void);
int bam_smpl_add(bam_sample_t *sm, const char *abs, const char *txt);
int bam_smpl_add(bam_sample_t *sm, const char *abs, const char *txt, void *white_list, void *white_hash);
int bam_smpl_rg2smid(const bam_sample_t *sm, const char *fn, const char *rg, kstring_t *str);
void bam_smpl_destroy(bam_sample_t *sm);

Expand Down
Loading

0 comments on commit cddc380

Please sign in to comment.