Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ as it resolves non-python dependencies and uses pre-configured
compilation options. Especially for OS X this will potentially save a
lot of trouble.

The current version of pysam wraps 3rd-party code from htslib-1.18, samtools-1.18, and bcftools-1.18.
The current version of pysam wraps 3rd-party code from htslib-1.20, samtools-1.20, and bcftools-1.20.

Pysam is available through `pypi
<https://pypi.python.org/pypi/pysam>`_. To install, type::
Expand Down
27 changes: 26 additions & 1 deletion bcftools/LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ the INSTALL document), the use of this software is governed by the GPL license.

The MIT/Expat License

Copyright (C) 2012-2023 Genome Research Ltd.
Copyright (C) 2012-2024 Genome Research Ltd.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -772,3 +772,28 @@ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-----------------------------------------------------------------------------

License for edlib.[ch]

The MIT License (MIT)

Copyright (c) 2014 Martin Šošić

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
20 changes: 14 additions & 6 deletions bcftools/abuf.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* The MIT License

Copyright (c) 2021-2023 Genome Research Ltd.
Copyright (c) 2021-2024 Genome Research Ltd.

Author: Petr Danecek <pd3@sanger.ac.uk>

Expand Down Expand Up @@ -411,13 +411,21 @@ static void _split_table_set_info(abuf_t *buf, bcf_info_t *info, merge_rule_t mo
buf->tmp2 = dst.s;
ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, dst.l, type);
}
if ( ret!=0 ) error("An error occurred while updating INFO/%s\n",tag);
if ( ret!=0 ) error("An error occurred while updating INFO/%s (errcode=%d)\n",tag,ret);
}
}
static void _split_table_set_history(abuf_t *buf)
{
int i,j;
int i,j,ret;
bcf1_t *rec = buf->split.rec;

// Don't update if the tag already exists. This is to prevent -a from overwriting -m
int m = 0;
char *tmp = NULL;
ret = bcf_get_info_string(buf->hdr,rec,buf->split.info_tag,&tmp,&m);
free(tmp);
if ( ret>0 ) return;

buf->tmps.l = 0;
ksprintf(&buf->tmps,"%s|%"PRIhts_pos"|%s|",bcf_seqname(buf->hdr,rec),rec->pos+1,rec->d.allele[0]);
for (i=1; i<rec->n_allele; i++)
Expand All @@ -441,8 +449,8 @@ static void _split_table_set_history(abuf_t *buf)
kputc(',',&buf->tmps);
}
buf->tmps.s[--buf->tmps.l] = 0;
if ( (bcf_update_info_string(buf->out_hdr, out, buf->split.info_tag, buf->tmps.s))!=0 )
error("An error occurred while updating INFO/%s\n",buf->split.info_tag);
if ( (ret=bcf_update_info_string(buf->out_hdr, out, buf->split.info_tag, buf->tmps.s))!=0 )
error("An error occurred while updating INFO/%s (errcode=%d)\n",buf->split.info_tag,ret);
}
}
static void _split_table_set_gt(abuf_t *buf)
Expand Down Expand Up @@ -668,7 +676,7 @@ static void _split_table_set_format(abuf_t *buf, bcf_fmt_t *fmt, merge_rule_t mo
#undef BRANCH
ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, 3*(1+star_allele)*nsmpl, type);
}
if ( ret!=0 ) error("An error occurred while updating FORMAT/%s\n",tag);
if ( ret!=0 ) error("An error occurred while updating FORMAT/%s (errcode=%d)\n",tag,ret);
}
}
static inline int _is_acgtn(char *seq)
Expand Down
20 changes: 14 additions & 6 deletions bcftools/abuf.c.pysam.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

/* The MIT License

Copyright (c) 2021-2023 Genome Research Ltd.
Copyright (c) 2021-2024 Genome Research Ltd.

Author: Petr Danecek <pd3@sanger.ac.uk>

Expand Down Expand Up @@ -413,13 +413,21 @@ static void _split_table_set_info(abuf_t *buf, bcf_info_t *info, merge_rule_t mo
buf->tmp2 = dst.s;
ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, dst.l, type);
}
if ( ret!=0 ) error("An error occurred while updating INFO/%s\n",tag);
if ( ret!=0 ) error("An error occurred while updating INFO/%s (errcode=%d)\n",tag,ret);
}
}
static void _split_table_set_history(abuf_t *buf)
{
int i,j;
int i,j,ret;
bcf1_t *rec = buf->split.rec;

// Don't update if the tag already exists. This is to prevent -a from overwriting -m
int m = 0;
char *tmp = NULL;
ret = bcf_get_info_string(buf->hdr,rec,buf->split.info_tag,&tmp,&m);
free(tmp);
if ( ret>0 ) return;

buf->tmps.l = 0;
ksprintf(&buf->tmps,"%s|%"PRIhts_pos"|%s|",bcf_seqname(buf->hdr,rec),rec->pos+1,rec->d.allele[0]);
for (i=1; i<rec->n_allele; i++)
Expand All @@ -443,8 +451,8 @@ static void _split_table_set_history(abuf_t *buf)
kputc(',',&buf->tmps);
}
buf->tmps.s[--buf->tmps.l] = 0;
if ( (bcf_update_info_string(buf->out_hdr, out, buf->split.info_tag, buf->tmps.s))!=0 )
error("An error occurred while updating INFO/%s\n",buf->split.info_tag);
if ( (ret=bcf_update_info_string(buf->out_hdr, out, buf->split.info_tag, buf->tmps.s))!=0 )
error("An error occurred while updating INFO/%s (errcode=%d)\n",buf->split.info_tag,ret);
}
}
static void _split_table_set_gt(abuf_t *buf)
Expand Down Expand Up @@ -670,7 +678,7 @@ static void _split_table_set_format(abuf_t *buf, bcf_fmt_t *fmt, merge_rule_t mo
#undef BRANCH
ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, 3*(1+star_allele)*nsmpl, type);
}
if ( ret!=0 ) error("An error occurred while updating FORMAT/%s\n",tag);
if ( ret!=0 ) error("An error occurred while updating FORMAT/%s (errcode=%d)\n",tag,ret);
}
}
static inline int _is_acgtn(char *seq)
Expand Down
120 changes: 114 additions & 6 deletions bcftools/bam2bcf.c
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/* bam2bcf.c -- variant calling.

Copyright (C) 2010-2012 Broad Institute.
Copyright (C) 2012-2022 Genome Research Ltd.
Copyright (C) 2012-2024 Genome Research Ltd.

Author: Heng Li <lh3@sanger.ac.uk>

Expand Down Expand Up @@ -249,6 +249,10 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
{
int i, n, ref4, is_indel, ori_depth = 0;

#ifdef GLF_DEBUG
fprintf(stderr, "Call GLFGEN\n");
#endif

// clean from previous run
r->ori_depth = 0;
r->mq0 = 0;
Expand All @@ -268,6 +272,15 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases);
}

// Detect if indel occurs anywhere in this sample
int indel_in_sample = 0;
if (bca->edlib) {
for (i = n = 0; i < _n; ++i) {
const bam_pileup1_t *p = pl + i;
if (p->indel) indel_in_sample = 1;
}
}

// fill the bases array
double nqual_over_60 = bca->nqual / 60.0;
int ADR_ref_missed[4] = {0};
Expand Down Expand Up @@ -298,7 +311,19 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
b = p->aux>>16&0x3f; // indel type
seqQ = q = (p->aux & 0xff); // mp2 + builtin indel-bias

if ( !bca->indels_v20 )
if (bca->edlib) {
if (indel_in_sample) {
seqQ = q = (p->aux & 0xff); // mp2 + builtin indel-bias
} else if (p->aux & 0xff) {
// An indel in another sample, but not this. So just use
// basic sequence confidences.
q = bam_get_qual(p->b)[p->qpos];
if (q > bca->max_baseQ) q = bca->max_baseQ;
seqQ = 99;
}
}

if ( !bca->indels_v20 && !bca->edlib )
{
/*
This heuristics was introduced by e4e161068 and claims to fix #1446. However, we obtain
Expand Down Expand Up @@ -330,6 +355,10 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
}
}

#ifdef GLF_DEBUG
fprintf(stderr, "GLF %s\t%d\t%d\n", bam_get_qname(p->b),
bca->indel_types[b], q);
#endif
if (q < bca->min_baseQ)
{
if (!p->indel && b < 4) // not an indel read
Expand All @@ -341,6 +370,50 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
}
continue;
}

#ifndef MIN
#define MIN(a,b) ((a)<(b)?(a):(b))
#endif

#ifndef MAX
#define MAX(a,b) ((a)>(b)?(a):(b))
#endif

#if 1 // TEST 6
if (bca->edlib) {
// Deeper data should rely more heavily on counts of data
// than quality, as quality can be unreliable and prone to
// miscalculations through BAQ, STR analysis, etc.
// So we put a cap on how good seqQ can be.
//
// Is it simply the equivalent of increasing -F filter?
// Not quite, as the latter removes many real variants upfront.
// This calls them and then post-adjusts quality, potentially
// dropping it later or changing genotype. So we still get
// calls, but lower qual.
seqQ = MIN(seqQ, bca->seqQ_offset-(MIN(20,_n)*5));

if (indel_in_sample && p->indel == 0 && b != 0) {
// This read doesn't contain an indel in CIGAR, but it
// is assigned to an indel now (b != 0), These are
// reads we've corrected with realignment, but they're
// also enriched for FPs so at high depth we reduce their
// confidence and let the depth do the talking. If it's
// real and deep, then we don't need every read aligning.
// We also reduce base quality too to reflect the
// chance of our realignment being incorrect.

seqQ = MIN(seqQ, seqQ/2 + 5); // q2p5

// Finally reduce indel quality.
// This is a blend of indelQ and base QUAL.
q = MIN((int)bam_get_qual(p->b)[p->qpos]/4+10, q/4+1);
}
}
#endif

// Note baseQ changes some output fields such as I16, but has no
// significant affect on "call".
baseQ = p->aux>>8&0xff;
}
else
Expand Down Expand Up @@ -375,6 +448,11 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
}
mapQ = p->b->core.qual < 255? p->b->core.qual : DEF_MAPQ; // special case for mapQ==255
if ( !mapQ ) r->mq0++;
#ifdef GLF_DEBUG
fprintf(stderr, "GLF2 %s\t%d\t%d\t%d,%d\n",
bam_get_qname(p->b), b, q,
seqQ, mapQ);
#endif
if (q > seqQ) q = seqQ;
mapQ = mapQ < bca->capQ? mapQ : bca->capQ;
if (q > mapQ) q = mapQ;
Expand Down Expand Up @@ -478,9 +556,19 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
for (i=0; i<4; i++) r->ADF[i] += lroundf((float)dp_ambig * r->ADF[i]/dp);
}

// Else consider downgrading bca->bases[] scores by AD vs AD_ref_missed
// ratios. This is detrimental on Illumina, but beneficial on PacBio CCS.
// It's possibly related to the homopolyer error likelihoods or overall
// Indel accuracy. Maybe tie this in to the -h option?

r->ori_depth = ori_depth;
// glfgen
errmod_cal(bca->e, n, 5, bca->bases, r->p); // calculate PL of each genotype

// TODO: account for the number of unassigned reads. If depth is 50,
// but AD is 5,7 then it may look like a variant but it probably
// should be low quality.

return n;
}

Expand Down Expand Up @@ -1147,10 +1235,30 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag,
if ( bc->ori_ref < 0 )
{
bcf_update_info_flag(hdr, rec, "INDEL", NULL, 1);
if ( fmt_flag&B2B_INFO_IDV )
bcf_update_info_int32(hdr, rec, "IDV", &bca->max_support, 1);
if ( fmt_flag&B2B_INFO_IMF )
bcf_update_info_float(hdr, rec, "IMF", &bca->max_frac, 1);
uint32_t idv = bca->max_support;
if ( fmt_flag&B2B_INFO_IMF) {
float max_frac;
// Recompute IDV and IMF based on alignment results for more
// accurate counts, but only when in new "--indels-cns" mode.
if (bc->ADF && bc->ADR && bca->edlib) {
int max_ad = 0;
for (int k = 1; k < rec->n_allele; k++) {
if (max_ad < bc->ADF[k] + bc->ADR[k])
max_ad = bc->ADF[k] + bc->ADR[k];
}
max_frac = (double)(max_ad) / bc->ori_depth;
idv = max_ad;
} else {
max_frac = bca->max_frac;
}
// Copied here to maintain order for consistency of "make check"
if ( fmt_flag&B2B_INFO_IDV )
bcf_update_info_int32(hdr, rec, "IDV", &idv, 1);
bcf_update_info_float(hdr, rec, "IMF", &max_frac, 1);
} else {
if ( fmt_flag&B2B_INFO_IDV )
bcf_update_info_int32(hdr, rec, "IDV", &idv, 1);
}
}
bcf_update_info_int32(hdr, rec, "DP", &bc->ori_depth, 1);
if ( fmt_flag&B2B_INFO_ADF )
Expand Down
Loading